force maximum block size option, add block number and size in filename

d1a1079c · BOULLE Olivier · 0a4f3f94 · d1a1079c
Commit d1a1079c authored 2 years ago by BOULLE Olivier
--- a/file_to_dna.py
+++ b/file_to_dna.py
@@ -27,8 +27,9 @@ bsaI_size = 7
 extern_extremity_size = primer_size + buffer_size # size of non-payload dna part at the extern extremities (start of first block and end of last block)
 intern_extremity_size = overhang_size + bsaI_size + buffer_size # size of non-payload dna part at the extremities between blocks
  
-max_total_block_size = 1000 # maximum allowed size for a complete block
+max_total_block_size = 500 # maximum allowed size for a complete block
 min_total_block_size = 300 # minimum allowed size for a complete block
+force_size_to_max = True # if True, force every block to be exactly the maximum size; set to False to allow blocks of any size between min and max
 n_block_max = 10 # maximum assembled number of blocks


@@ -160,20 +161,23 @@ def encode_file(input_path: str, output_path: str) -> None:
    # add a non coding A at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case)
    add_A_at_end = False
    
-    rest = total_dna_size % block_number # rest of dna if the sequence is divided equally in the blocks
    
-    if rest > 0: # fill up with some bits of 0 to have a round number of blocks
-        
-        # find the next length the total sequence needs to be to have equal division in blocks
-        next_length_for_round_blocks = block_number * math.ceil(total_dna_size / block_number)
-        #print("next_length_for_round_blocks",str(next_length_for_round_blocks))
+    if force_size_to_max or total_dna_size % block_number > 0: # need to add non payload bits to adjust the size of blocks
        
+        # all blocks must be the maximum size
+        if force_size_to_max: # find the next length the total sequence needs to be to have a division in blocks of maximum size
+            final_assembly_size = block_number * max_total_block_size
+        else: # all blocks still need to have the same round size
+            # find the next length the total sequence needs to be to have equal division in blocks
+            final_assembly_size = block_number * math.ceil(total_dna_size / block_number)
+            #print("final_assembly_size",str(final_assembly_size))
+ 
        # calculate the number of bits to add to get a round number of equal length blocks
-        dna_payload_needed_size = next_length_for_round_blocks - dna_non_payload_size
+        dna_payload_needed_size = final_assembly_size - dna_non_payload_size
        #print(size_binary_from_dna_len(dna_payload_needed_size) )
        filler_length = size_binary_from_dna_len(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE
        #print("dna_payload_needed_size",str(dna_payload_needed_size)," +",filler_length,"bits")
-
+    
        # fill with '0' at the beginning of the binary # not the end because some zip can end with octets of 0, wich makes difficult to remove only the non coding '0'
        binary_string = math.ceil(filler_length) * "0" + binary_string
        
@@ -204,11 +208,13 @@ def encode_file(input_path: str, output_path: str) -> None:
    # test for errors that should never occur (I hope ...)
    # round number of blocks, no blocks too large, no blocks to small
    if round(total_sequence_size/block_number) != total_sequence_size/block_number or total_sequence_size/block_number > max_total_block_size or total_sequence_size/block_number < min_total_block_size:
-        print("error file to dna")
+        print("error file to dna", input_path)
        print("\tseq payload size",str(len(sequence)))
        print("\ttotal estimated seq size",total_sequence_size)
        print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number))
-
+        print("TODO remove me")
+        #TODO
+        return
        exit(1)

    # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1
@@ -233,6 +239,9 @@ def encode_file(input_path: str, output_path: str) -> None:
            
        sub_sequences_dict[str(block_number)] = sequence[index_sequence:]
    
+    # append the number of blocks and the block size to the output path, followed by the ".fasta" extension
+    output_path = output_path + "_" + str(block_number) + "x" + str(total_sequence_size//block_number) + ".fasta"
+    
    dfr.save_dict_to_fasta(sub_sequences_dict, output_path)