refactoring, correct comments, greatly simplify binary division and size adjustments

5712f04e · BOULLE Olivier · d75d2e9f · 5712f04e · 5712f04e
Commit 5712f04e authored 2 years ago by BOULLE Olivier
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
@@ -206,7 +206,7 @@ def size_dna_from_bit_len_abaab(bit_length):
        return base_length + 2
    if rest == 6:
        return base_length + 4
-    print("error size_of_dna_from_bit_len : size not multiple of 2 :",str(bit_length))
+    print("error size_of_dna_from_bit_len_abaab : size not multiple of 2 :",str(bit_length))
    exit(1)

--- a/file_to_dna.py
+++ b/file_to_dna.py
@@ -13,6 +13,7 @@ import dna_file_reader as dfr
 """
 Can be used to convert any type of file
 use the byte encoding of the file to convert to a dna sequence
+the file needs to have a whole number of octets
 a single substitution error in the sequences can lead to a corrupted file (except for .txt files)
 """
@@ -30,7 +31,6 @@ intern_extremity_size = overhang_size + bsaI_size + buffer_size # size of non-pa
 max_total_block_size = 300 # maximum allowed size for a complete block
 min_total_block_size = 300 # minimum allowed size for a complete block
-force_size_to_max = True # set to false if blocks can have any size between min and max, else force the size of all blocks to be the max size
 n_block_max = 10 # maximum assembled number of blocks
@@ -113,6 +113,7 @@ def encode_file(input_path: str, output_path: str) -> None:
    """
    binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content
+    #binary_string = input_path #TODO REMOVE
    dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
    #print("payload size",str(dna_payload_size))
@@ -125,42 +126,31 @@ def encode_file(input_path: str, output_path: str) -> None:
        print("error file to dna : block number too high :",str(block_number),"for",input_path)
        exit(1)
-    # size of non payload dna part that is contained in the blocks
+    # size of non payload dna part that is contained in the blocks and used for the assembly
-    dna_non_payload_size = 2*extern_extremity_size + 2*(block_number-1)*intern_extremity_size
+    dna_for_assembly_size = 2*extern_extremity_size + 2*(block_number-1)*intern_extremity_size
    # estimate the total dna sequence length after addition of non coding parts
-    total_dna_size = dna_payload_size + dna_non_payload_size
+    total_dna_size = dna_payload_size + dna_for_assembly_size
    #print("total_dna_size",str(total_dna_size))
-    if block_number == 1 and total_dna_size < min_total_block_size:
+    # a non-coding base can be added at the end of the sequence to bring the last block to the correct size when this cannot be done by adding '0' bits (very rare case)
-        #TODO fill the block
-        print("error file to dna : 1 block too short", str(total_dna_size))
-        exit(1)
-        pass
-    # add a non coding base at the end of the sequence to make the last block at correct size when it is not possible to do by adding bits of 0 (very rare case)
    add_base_at_end = False
+    # round up the number of bases per block, and make sure it is at least the minimal block size
-    if force_size_to_max or total_dna_size % block_number > 0: # need to add non payload bits to adjust the size of blocks
+    final_assembly_size = block_number * max(math.ceil(total_dna_size / block_number), min_total_block_size)
-        # all blocks must be the maximum size
+    # case when some bits need to be added to increase the number of bases
-        if force_size_to_max: # find the next length the total sequence needs to be to have a division in blocks of maximum size
+    if final_assembly_size != total_dna_size:
-            final_assembly_size = block_number * max_total_block_size
-        else: # all blocks still need to have the same round size
+        # calculate the number of bases to add to the payload to get a round number of equal length blocks
-            # find the next length the total sequence needs to be to have equal division in blocks
+        dna_payload_needed_size = final_assembly_size - dna_for_assembly_size
-            final_assembly_size = block_number * math.ceil(total_dna_size / block_number)
-            #print("final_assembly_size",str(final_assembly_size))
-        # calculate the number of bits to add to get a round number of equal length blocks
-        dna_payload_needed_size = final_assembly_size - dna_non_payload_size
        #print(bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) )
+        # get number of bits to add
        filler_length = bdc.size_binary_from_dna_len_abaab(dna_payload_needed_size) - len(binary_string) - CHECK_SUM_SIZE
        #print("dna_payload_needed_size",str(dna_payload_needed_size)," +",filler_length,"bits")
        # fill with '0' at the beginning of the binary, not the end, because some zip files can end with octets of 0, which makes it difficult to remove only the non-coding '0'
        binary_string = math.ceil(filler_length) * "0" + binary_string
        #print("updated binary size", str(len(binary_string)))
        # rare case where adding 0 can not solve the problem of round blocks
@@ -184,7 +174,7 @@ def encode_file(input_path: str, output_path: str) -> None:
    if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
-    total_sequence_size = len(sequence)+ dna_non_payload_size
+    total_sequence_size = len(sequence)+ dna_for_assembly_size
    # test for errors that should never occur (I hope ...)
    # round number of blocks, no blocks too large, no blocks too small
@@ -193,10 +183,8 @@ def encode_file(input_path: str, output_path: str) -> None:
        print("\tseq payload size",str(len(sequence)))
        print("\ttotal estimated seq size",total_sequence_size)
        print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number))
-        print("TODO remove me")
-        #TODO
-        return
        exit(1)
+    #print("\t",str(block_number),"blocks of",str(total_sequence_size/block_number))
    # split the sequence into blocks of correct size to add the non payload stuff later # start the block count at 1
    sub_sequences_dict = {}
@@ -223,9 +211,11 @@ def encode_file(input_path: str, output_path: str) -> None:
    # add number and blocks size to path, add type
    output_path = output_path + "_" + str(block_number) + "x" + str(total_sequence_size//block_number) + ".fasta"
+    #return sequence #TODO REMOVE
    dfr.save_dict_to_fasta(sub_sequences_dict, output_path)
 def decode_file(input_path: str, output_path: str) -> None:
    """
@@ -236,7 +226,8 @@ def decode_file(input_path: str, output_path: str) -> None:
    sub_sequences_dict = dfr.read_fasta(input_path)
    sequence = "".join(sub_sequences_dict.values())
+    #sequence = input_path
    # convert the dna sequence into a binary string
    binary_from_dna_string = bdc.dna_to_binary_abaab(sequence)
@@ -266,6 +257,8 @@ def decode_file(input_path: str, output_path: str) -> None:
    while binary_string.startswith(8*"0"): # 1/256 (2**8) chance to remove actual data ! but 8*0 is ascii char NULL
        binary_string = binary_string[8:]       
+    #return binary_string #TODO REMOVE
    # convert binaries into bytes
    n = int(binary_string, 2)
    bytes = n.to_bytes((n.bit_length() + 7) // 8, 'big')
@@ -281,18 +274,23 @@ if __name__ == '__main__':
    #binary_string = sys.argv[1]
    #seq = encode_file(doc_path)
-    print(get_max_binary_len())
+    #print(get_max_binary_len())
-    exit(0)
+    #exit(0)
-    for i in range(0,300000):
-        """binary = str(bin(i))[2:]
-        print(compute_check_sum(binary), calculate_check_sum2(binary), compute_check_sum(binary) == calculate_check_sum2(binary))
+    for i in range(400,10000, 8):
-        if not compute_check_sum(binary) == calculate_check_sum2(binary):
+        binary = i*"1"
-            exit(1)"""
+        if len(binary) % 2 != 0:
-    #print("")
+            continue
        print("i=",str(i))
-        seq = encode_file("toto", i)
+        seq = encode_file("", binary)
-        decode_file(seq, "toto")
+        binary_result = decode_file(seq, "toto")
+        if binary != binary_result:
+            #print(binary)
+            #print(seq)
+            #print(binary_result)
+            exit(0)
    #encode_file("", "test")
    #seq = binary_to_dna_abaab(binary_string)
    #print(seq)