switch to abaab, decode all sequences in input

f10225b0 · BOULLE Olivier · 3bd22658 · f10225b0
Commit f10225b0 authored 1 year ago by BOULLE Olivier
--- a/file_to_dna.py
+++ b/file_to_dna.py
@@ -28,8 +28,8 @@ bsaI_size = 7
 extern_extremity_size = primer_size + overhang_size + bsaI_size + buffer_size # size of non-payload dna part at the extern extremities (start of first block and end of last block)
 intern_extremity_size = overhang_size + bsaI_size + buffer_size # size of non-payload dna part at the extremities between blocks
  
-max_total_block_size = 1000 # maximum allowed size for a complete block
-min_total_block_size = 1000 # minimum allowed size for a complete block
+max_total_block_size = 500 # maximum allowed size for a complete block
+min_total_block_size = 500 # minimum allowed size for a complete block
 n_block_max = 10 # maximum assembled number of blocks


@@ -97,7 +97,7 @@ def get_max_binary_len() -> int:
    
    max_payload_dna_len = max_total_dna_len - 2*extern_extremity_size - 2*(n_block_max-1)*intern_extremity_size # without the assembly stuff
    
-    max_binary_len = bdc.size_binary_from_dna_len_baa(max_payload_dna_len) - CHECK_SUM_SIZE # convert the remaining bases to binary and remove the checksum bits
+    max_binary_len = bdc.size_binary_from_dna_len_abaab(max_payload_dna_len) - CHECK_SUM_SIZE # convert the remaining bases to binary and remove the checksum bits
    
    return math.floor(max_binary_len) # round down

@@ -115,11 +115,11 @@ def encode_file(input_path: str, output_path: str) -> None:
    binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content
    #binary_string = input_path #TODO REMOVE

-    if bdc.forbidden_rest_baa(len(binary_string) + CHECK_SUM_SIZE):
+    if bdc.forbidden_rest_abaab(len(binary_string) + CHECK_SUM_SIZE):
        # prepend a non-coding 0 when the bit length is incompatible with the dna conversion
        binary_string = "0" + binary_string
        
-    dna_payload_size = bdc.size_dna_from_bit_len_baa(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
+    dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
    #print("payload size",str(dna_payload_size))
    
    # estimate number of blocks, round up to next int
@@ -145,9 +145,9 @@ def encode_file(input_path: str, output_path: str) -> None:
    
        # calculate the number of bases to add to the payload to get a round number of equal length blocks
        dna_payload_needed_size = final_assembly_size - dna_for_assembly_size
-        #print(bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) )
+        #print(bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) )
        # get number of bits to add
-        filler_length = bdc.size_binary_from_dna_len_baa(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE
+        filler_length = bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE
        #print("dna_payload_needed_size",str(dna_payload_needed_size)," +",filler_length,"bits")
    
        # fill with '0' at the beginning of the binary string, not at the end: some zip files can end with octets of 0, which would make it difficult to remove only the non-coding '0'
@@ -166,7 +166,7 @@ def encode_file(input_path: str, output_path: str) -> None:
    filtered_binary_string += binary_check_sum
        
    # convert binaries into dna sequence
-    sequence = bdc.binary_to_dna_baa(filtered_binary_string)
+    sequence = bdc.binary_to_dna_abaab(filtered_binary_string)
        
    total_sequence_size = len(sequence)+ dna_for_assembly_size
    
@@ -216,51 +216,47 @@ def decode_file(input_path: str, output_path: str) -> None:
    input : fasta format file with the payload sequences extracted from the block assembly
    """

-    #TODO
-    sub_sequences_dict = dfr.read_fasta(input_path)
+    sequences_dict = dfr.read_fasta(input_path) # get all the sequences from the fasta file
    
-    #TODO
-    sequence = "".join(sub_sequences_dict.values())
-    #sequence = input_path #TODO REMOVE
+    for seq_name, sequence in sequences_dict.items():
    
-    # convert the dna sequence into a binary string
-    binary_from_dna_string = bdc.dna_to_binary_baa(sequence)
-    
-    if not binary_from_dna_string:
-        print("warning file conversion, decoding an empty file")
-        return
-
-    # test if the check_sum corresponds to the binary string
-    binary_string = binary_from_dna_string[:-CHECK_SUM_SIZE]
-    binary_check_sum = binary_from_dna_string[-CHECK_SUM_SIZE:]
-    
-    if compute_check_sum(binary_string) != binary_check_sum:
-        print(compute_check_sum(binary_string),"!=",binary_check_sum)
-        print("Invalid check sum for",input_path)
-        exit(1)
+        # convert the dna sequence into a binary string
+        binary_from_dna_string = bdc.dna_to_binary_abaab(sequence)
        
-    # apply the same filter used in the encoding to the binary string to remove it  
-    binary_string = apply_binary_filter(binary_string)
-    binary_string = binary_string[::-1] # reverse the binary string to get the original
-
-    # case binaries length is not multiple of 8 -> remove the excess bits at the beginning that have been added in the encoding to get a round number of blocks
-    rest = len(binary_string) % 8
-    if rest != 0:
-        binary_string = binary_string[rest:]
-
-    # remove octets of zeros at the beginning (the start of the sequence can be filled with zeros to get a round number of blocks)
-    while binary_string.startswith(8*"0"): # 1/256 (2**8) chance to remove actual data ! but 8*0 is ascii char NULL
-        binary_string = binary_string[8:]       
-
-    #return binary_string #TODO REMOVE
-
-    # convert binaries into bytes
-    n = int(binary_string, 2)
-    bytes = n.to_bytes((n.bit_length() + 7) // 8, 'big')
+        if not binary_from_dna_string:
+            print("warning file conversion, decoding an empty sequence",seq_name,"in",input_path)
+            continue
    
-    # write the bytes into the file
-    with open(output_path, "wb") as f:
-        f.write(bytes)
+        # test if the check_sum corresponds to the binary string
+        binary_string = binary_from_dna_string[:-CHECK_SUM_SIZE]
+        binary_check_sum = binary_from_dna_string[-CHECK_SUM_SIZE:]
+        
+        if compute_check_sum(binary_string) != binary_check_sum:
+            print(compute_check_sum(binary_string),"!=",binary_check_sum)
+            print("Invalid check sum for",seq_name,"in",input_path)
+            continue
+            
+        # apply the same filter used in the encoding to the binary string to remove it  
+        binary_string = apply_binary_filter(binary_string)
+        binary_string = binary_string[::-1] # reverse the binary string to get the original
+    
+        # if the binary length is not a multiple of 8 -> remove the excess bits at the beginning, which were added during encoding to get a round number of blocks
+        rest = len(binary_string) % 8
+        if rest != 0:
+            binary_string = binary_string[rest:]
+    
+        # remove octets of zeros at the beginning (the start of the sequence can be filled with zeros to get a round number of blocks)
+        while binary_string.startswith(8*"0"): # 1/256 (2**8) chance to remove actual data ! but 8*0 is ascii char NULL
+            binary_string = binary_string[8:]       
+        
+        # convert binaries into bytes
+        n = int(binary_string, 2)
+        bytes = n.to_bytes((n.bit_length() + 7) // 8, 'big')
+        
+        # write the bytes into the file
+        with open(output_path, "wb") as f:
+            f.write(bytes)
+        return # end the decoding, since the sequence passed the checksum


 # =================== main ======================= #
@@ -286,8 +282,8 @@ if __name__ == '__main__':
            print(binary_result)
            exit(0)
    #encode_file("", "test")
-    #seq = binary_to_dna_baa(binary_string)
+    #seq = binary_to_dna_abaab(binary_string)
    #print(seq)
-    #print(dna_to_binary_baa(seq))
+    #print(dna_to_binary_abaab(seq))