function for forbidden rests

5a217f07 · BOULLE Olivier · cb2c89ef · 5a217f07 · 5a217f07
Commit 5a217f07 authored 2 years ago by BOULLE Olivier
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
@@ -29,6 +29,13 @@ def bit_to_dna_balance_GC(bit_char: str, reference_base: str):
        return bit_to_dna_AT[bit_char]


+def forbidden_rest_abaab(binary_string_size: int) -> bool:
+    """
+    return True if binary_string_size (a length in bits) modulo 8 is 1, 4 or 6, i.e. a length that cannot be converted by the abaab method
+    """
+    return binary_string_size % 8 in [1, 4, 6]
+
+
 def binary_to_dna_abaab(binary_string: str) -> str:
    """
    convert binaries into dna sequence with some properties
@@ -49,7 +56,7 @@ def binary_to_dna_abaab(binary_string: str) -> str:
    #TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal

    """
-    if len(binary_string) % 8 in [1, 4, 6]:
+    if forbidden_rest_abaab(len(binary_string)):
        print("error binary dna conversion, need a binary string multiple of 8, or with a rest of 2, 3, 7 ("+str(len(binary_string) % 8)+")")
        exit(0)
    
@@ -231,6 +238,13 @@ def size_binary_from_dna_len_abaab(dna_length):
        return base_length + 7


+def forbidden_rest_baa(binary_string_size: int) -> bool:
+    """
+    return True if binary_string_size (a length in bits) modulo 5 is 2 or 4, i.e. a length that cannot be converted by the baa method
+    """
+    return binary_string_size % 5 in [2, 4]
+
+
 def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    """
    convert binaries into dna sequence with some properties
@@ -249,9 +263,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    only allowing rests multiple of 5 removes ambiguity (also possible 0, 1 or 3 modulo 5)

    """
-    if len(binary_string) % 5 not in [0, 1, 3]:
-            print("error binary dna conversion, need a binary string multiple of 5, or with a rest of 1, 3")
-            return
+    if forbidden_rest_baa(len(binary_string)):
+        print("error binary dna conversion, need a binary string multiple of 5, or with a rest of 1, 3 ("+str(len(binary_string) % 5)+")")
+        return
    
    sequence = ""
    n_quintuplets, rest = divmod(len(binary_string), 5)
@@ -339,6 +353,7 @@ def dna_to_binary_baa(sequence: str) -> str:

    return binary_string

+
 def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
    """
    remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_baa() method

--- a/file_to_dna.py
+++ b/file_to_dna.py
@@ -112,11 +112,12 @@ def encode_file(input_path: str, output_path: str) -> None:
    objective is to fit the payload in the lowest number of blocks, but all blocks must be the same size after addition of non coding stuff (primer/buffer/overhang/bsaI) 
    """

-    #TODO
+    #TODO    
    binary_string = convert_file_to_bits(input_path) # get the binary string representing the file content
    #binary_string = input_path #TODO REMOVE

-    if (len(binary_string) + CHECK_SUM_SIZE) % 8 in [1, 4, 6]: # add a non coding 0 when size of incompatible length for dna conversion
+    if bdc.forbidden_rest_abaab(len(binary_string) + CHECK_SUM_SIZE):
+        # prepend a non-coding 0 when the length is incompatible with the dna conversion
        binary_string = "0" + binary_string
        
    dna_payload_size = bdc.size_dna_from_bit_len_abaab(len(binary_string) + CHECK_SUM_SIZE) # length of the payload after conversion in dna
@@ -155,8 +156,9 @@ def encode_file(input_path: str, output_path: str) -> None:
        #print("updated binary size", str(len(binary_string)))
        
    
-    # apply a filter to the binary string -> shuffle the data to avoid long rows of 0 or 1, and avoid rows repetitions 
    binary_string = binary_string[::-1] # reverse the binary string, because 2 files can have the same start with ziping methods
+    
+    # apply a filter to the binary string -> shuffle the data to avoid long rows of 0 or 1, and avoid rows repetitions 
    filtered_binary_string = apply_binary_filter(binary_string)
    
    
@@ -166,9 +168,7 @@ def encode_file(input_path: str, output_path: str) -> None:
        
    # convert binaries into dna sequence
    sequence = bdc.binary_to_dna_abaab(filtered_binary_string)
-    
-    #if add_base_at_end: sequence += bdc.bit_to_dna_balance_GC("0", sequence[-1]) # add a non coding base different from the preceding one
-    
+        
    total_sequence_size = len(sequence)+ dna_for_assembly_size
    
    # test for errors that should never occur (I hope ...)
@@ -217,8 +217,10 @@ def decode_file(input_path: str, output_path: str) -> None:
    input : fasta format file with the payload sequences extracted from the block assembly
    """

+    #TODO
    sub_sequences_dict = dfr.read_fasta(input_path)
    
+    #TODO
    sequence = "".join(sub_sequences_dict.values())
    #sequence = input_path #TODO REMOVE
    
@@ -268,7 +270,6 @@ if __name__ == '__main__':
    #binary_string = sys.argv[1]
    #seq = encode_file(doc_path)
    
-    #print(get_max_binary_len())
    #exit(0)

    for i in range(400,10000, 8):
@@ -281,13 +282,13 @@ if __name__ == '__main__':
        binary_result = decode_file(seq, "")
        
        if binary != binary_result:
-            #print(binary)
-            #print(seq)
-            #print(binary_result)
+            print(binary)
+            print(seq)
+            print(binary_result)
            exit(0)
    #encode_file("", "test")
-    #seq = binary_to_dna_abaab(binary_string)
+    #seq = binary_to_dna_baa(binary_string)
    #print(seq)
-    #print(dna_to_binary_abaab(seq))
+    #print(dna_to_binary_baa(seq))