From cb2c89efa186adce1270a0f6195ea5f48b471d4c Mon Sep 17 00:00:00 2001
From: oboulle <olivier.boulle@inria.fr>
Date: Fri, 14 Apr 2023 14:31:25 +0200
Subject: [PATCH] reword ending of abaab conversion, greatly simplified

---
 file_to_dna.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/file_to_dna.py b/file_to_dna.py
index baab5a3..30cc834 100755
--- a/file_to_dna.py
+++ b/file_to_dna.py
@@ -112,9 +112,13 @@ def encode_file(input_path: str, output_path: str) -> None:
     objective is to fit the payload in the lowest number of blocks, but all blocks must be the same size after addition of non coding stuff (primer/buffer/overhang/bsaI) 
     """
 
+    #TODO
     binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content
     #binary_string = input_path #TODO REMOVE
 
+    if (len(binary_string) + CHECK_SUM_SIZE) % 8 in [1, 4, 6]: # add a non coding 0 when the total length is incompatible with the dna conversion
+        binary_string = "0" + binary_string
+        
     dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
     #print("payload size",str(dna_payload_size))
     
@@ -132,14 +136,11 @@ def encode_file(input_path: str, output_path: str) -> None:
     # estimate the total dna sequence length after addition of non coding parts
     total_dna_size = dna_payload_size + dna_for_assembly_size
     #print("total_dna_size",str(total_dna_size))
-    
-    # possibility to add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case)
-    add_base_at_end = False
-    
+        
     # round the number of base per blocks, make sure it is above the minimal block size
     final_assembly_size = block_number * max(math.ceil(total_dna_size / block_number), min_total_block_size)
         
-    # case when some bits need to be added to increase the number of bases
+    # case when some bits need to be added to increase the number of bases
     if final_assembly_size != total_dna_size:
     
         # calculate the number of bases to add to the payload to get a round number of equal length blocks
@@ -153,12 +154,6 @@ def encode_file(input_path: str, output_path: str) -> None:
         binary_string = math.ceil(filler_length) * "0" + binary_string
         #print("updated binary size", str(len(binary_string)))
         
-        # rare case where adding 0 can not solve the problem of round blocks
-        if dna_payload_needed_size % 5 == 3:
-            # need a non coding base at the end
-            #print("added non coding base")
-            add_base_at_end = True
-
     
     # apply a filter to the binary string -> shuffle the data to avoid long rows of 0 or 1, and avoid rows repetitions 
     binary_string = binary_string[::-1] # reverse the binary string, because 2 files can have the same start with ziping methods
@@ -172,7 +167,7 @@ def encode_file(input_path: str, output_path: str) -> None:
     # convert binaries into dna sequence
     sequence = bdc.binary_to_dna_abaab(filtered_binary_string)
     
-    if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
+    #if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
     
     total_sequence_size = len(sequence)+ dna_for_assembly_size
     
@@ -189,7 +184,6 @@ def encode_file(input_path: str, output_path: str) -> None:
     # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1
     sub_sequences_dict = {}
     
-    
     if block_number == 1:
         sub_sequences_dict["1"] = sequence
     elif block_number == 2:
@@ -226,7 +220,7 @@ def decode_file(input_path: str, output_path: str) -> None:
     sub_sequences_dict = dfr.read_fasta(input_path)
     
     sequence = "".join(sub_sequences_dict.values())
-    #sequence = input_path
+    #sequence = input_path #TODO REMOVE
     
     # convert the dna sequence into a binary string
     binary_from_dna_string = bdc.dna_to_binary_abaab(sequence)
@@ -283,8 +277,8 @@ if __name__ == '__main__':
             continue
         
         print("i=",str(i))
-        seq = encode_file("", binary)
-        binary_result = decode_file(seq, "toto")
+        seq = encode_file(binary, "")
+        binary_result = decode_file(seq, "")
         
         if binary != binary_result:
             #print(binary)
-- 
GitLab