reword ending of abaab conversion, greatly simplified

cb2c89ef · BOULLE Olivier · eed3fa8c · cb2c89ef
Commit cb2c89ef authored 2 years ago by BOULLE Olivier
--- a/file_to_dna.py
+++ b/file_to_dna.py
@@ -112,9 +112,13 @@ def encode_file(input_path: str, output_path: str) -> None:
    The objective is to fit the payload in the lowest number of blocks, but all blocks must be the same size after addition of the non-coding parts (primer/buffer/overhang/bsaI)
    """

+    #TODO
    binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content
    #binary_string = input_path #TODO REMOVE

+    if (len(binary_string) + CHECK_SUM_SIZE) % 8 in [1, 4, 6]: # prepend a non-coding 0 when the bit length is incompatible with the dna conversion
+        binary_string = "0" + binary_string
+        
    dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
    #print("payload size",str(dna_payload_size))
    
@@ -132,14 +136,11 @@ def encode_file(input_path: str, output_path: str) -> None:
    # estimate the total dna sequence length after addition of non coding parts
    total_dna_size = dna_payload_size + dna_for_assembly_size
    #print("total_dna_size",str(total_dna_size))
-    
-    # possibility to add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case)
-    add_base_at_end = False
-    
+        
    # round the number of base per blocks, make sure it is above the minimal block size
    final_assembly_size = block_number * max(math.ceil(total_dna_size / block_number), min_total_block_size)
        
-    # case when some bits need to be added to increase the number of bases
+    # case when some bits need to be added to increase the number of bases
    if final_assembly_size != total_dna_size:
    
        # calculate the number of bases to add to the payload to get a round number of equal length blocks
@@ -153,12 +154,6 @@ def encode_file(input_path: str, output_path: str) -> None:
        binary_string = math.ceil(filler_length) * "0" + binary_string
        #print("updated binary size", str(len(binary_string)))
        
-        # rare case where adding 0 can not solve the problem of round blocks
-        if dna_payload_needed_size % 5 == 3:
-            # need a non coding base at the end
-            #print("added non coding base")
-            add_base_at_end = True
-
    
    # apply a filter to the binary string -> shuffle the data to avoid long rows of 0 or 1, and avoid row repetitions
    binary_string = binary_string[::-1] # reverse the binary string, because 2 files can have the same start with zipping methods
@@ -172,7 +167,7 @@ def encode_file(input_path: str, output_path: str) -> None:
    # convert binaries into dna sequence
    sequence = bdc.binary_to_dna_abaab(filtered_binary_string)
    
-    if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
+    #if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
    
    total_sequence_size = len(sequence)+ dna_for_assembly_size
    
@@ -189,7 +184,6 @@ def encode_file(input_path: str, output_path: str) -> None:
    # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1
    sub_sequences_dict = {}
    
-    
    if block_number == 1:
        sub_sequences_dict["1"] = sequence
    elif block_number == 2:
@@ -226,7 +220,7 @@ def decode_file(input_path: str, output_path: str) -> None:
    sub_sequences_dict = dfr.read_fasta(input_path)
    
    sequence = "".join(sub_sequences_dict.values())
-    #sequence = input_path
+    #sequence = input_path #TODO REMOVE
    
    # convert the dna sequence into a binary string
    binary_from_dna_string = bdc.dna_to_binary_abaab(sequence)
@@ -283,8 +277,8 @@ if __name__ == '__main__':
            continue
        
        print("i=",str(i))
-        seq = encode_file("", binary)
-        binary_result = decode_file(seq, "toto")
+        seq = encode_file(binary, "")
+        binary_result = decode_file(seq, "")
        
        if binary != binary_result:
            #print(binary)