rework ending of abaab conversion, simplified and more coherent, fixed rare...

rework ending of abaab conversion, simplified and more coherent, fixed rare error for banword removal, correction of comments

rework ending of abaab conversion, simplified and more coherent, fixed rare...
eed3fa8c · BOULLE Olivier · 5712f04e · eed3fa8c
Commit eed3fa8c authored 2 years ago by BOULLE Olivier
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
 import sys
+import math
 import random
+
 import sequence_control

 """
@@ -47,9 +49,8 @@ def binary_to_dna_abaab(binary_string: str) -> str:
    #TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal

    """
-    
-    if len(binary_string) % 2 != 0:
-        print("error binary dna conversion, need a binary string multiple of 2")
+    if len(binary_string) % 8 in [1, 4, 6]:
+        print("error binary dna conversion, need a binary string multiple of 8, or with a rest of 2, 3, 7 ("+str(len(binary_string) % 8)+")")
        exit(0)
    
    sequence = ""
@@ -68,21 +69,21 @@ def binary_to_dna_abaab(binary_string: str) -> str:
        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5
    # rest should be 0 because all documents contains a round number of octet
    # but some "0" can be added to fill the fragments
-    bit_rest = binary_string[n_octets*8:n_octets*8+rest] # rest is 0-2-4-6 bits
+    bit_rest = binary_string[n_octets*8:n_octets*8+rest] # rest is 0-2-3-5-7 bits

-    # last conversions depends on the length of the rest, 2 bits = 1 base; 4 bits = 2 bases; 6 bits = 4 bases
+    # last conversions depends on the length of the rest, 2 bits = 1 base; 3 bits = 2 bases; 5 bits = 3 bases; 7 bits = 4 bases
    if len(bit_rest) >= 2:
        nucleotide_1 = bit_pair_to_dna[bit_rest[0:2]] # nucleotide_1 from a pair of bits
        sequence += nucleotide_1
-        if len(bit_rest) == 4:
-            sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits
-        elif len(bit_rest) == 6:   
-            
+        if len(bit_rest) >= 3:
            # nucleotide_2 from a single bit and depending on nucleotide 1
            sequence += bit_to_dna_balance_GC(bit_rest[2], nucleotide_1)
-            
-            sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits
-            sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit
+            if len(bit_rest) >= 5:
+                # nucleotide_3 from a pair of bits
+                sequence += bit_pair_to_dna[bit_rest[3:5]] 
+                if len(bit_rest) == 7:
+                    # nucleotide_4 from a pair of bits
+                    sequence += bit_pair_to_dna[bit_rest[5:7]] 

    # remove the banned words
    sequence_wo_banwords = remove_ban_words_abaab_encoding(sequence)
@@ -111,17 +112,14 @@ def dna_to_binary_abaab(sequence: str) -> str:
    if len(sequence_rest) >= 1: # 1 base -> 2 bits
        binary_string += dna_to_bit_pair[sequence_rest[0]]
        
-        if len(sequence_rest) == 2: # 2 bases -> 2 bits + 2 bits
-            binary_string += dna_to_bit_pair[sequence_rest[1]]
+        if len(sequence_rest) >= 2: # 2 bases -> 2 bits + 1 bit
+            binary_string += dna_to_bit_ATGC[sequence_rest[1]]

-        elif len(sequence_rest) == 3:
-            # a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding base can be added to get a round number of blocks
-            binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2
+            if len(sequence_rest) >= 3: # 2 bases -> 2 bits + 1 bit + 2 bits
+                binary_string += dna_to_bit_pair[sequence_rest[2]]

-        elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit
-            binary_string += dna_to_bit_ATGC[sequence_rest[1]] # 1 base -> 1 bit
-            binary_string += dna_to_bit_pair[sequence_rest[2]] # 1 base -> 2 bits
-            binary_string += dna_to_bit_ATGC[sequence_rest[3]] # 1 base -> 1 bit
+                if len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 2 bit
+                    binary_string += dna_to_bit_pair[sequence_rest[3]]
    
    return binary_string

@@ -140,24 +138,24 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
    dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"}

    # check for banned words
-    indexes_ban_words = []
+    banword_indexes_list = [] # list of index start;index_stop of all banwords in the sequence
    for banned_word in sequence_control.get_forbidden_sequences():
        index = sequence.find(banned_word)
        while index != -1:
-            indexes_ban_words.append([index, index+len(banned_word)])
+            banword_indexes_list.append([index, index+len(banned_word)])
            index = sequence.find(banned_word, index+1)
    
-    if indexes_ban_words == []:
-        return sequence
-    
-    # indexes of beta bases in the sequence (2nd and 5th base every 5 base window)
-    beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, len(sequence)//5) for i in [1,4] ]
+    # indexes of beta bases in the sequence (2nd and 5th base every 5 base window) # range start in negative for cases where the offset is > 0
+    beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, math.ceil(len(sequence)/5)) for i in [1,4] ]

    sequence_wo_bans = sequence
+    #print(len(sequence))
+    #print(beta_indexes)
+    #print(banword_indexes_list)

-    for banned_words_indexes in indexes_ban_words:
+    for banword_index in banword_indexes_list:
        # indexes of beta bases that can be changed to remove a banned word
-        potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ]
+        potential_changes_index = [ k for k in beta_indexes if k >= banword_index[0] and k < banword_index[1] ]

        for change_index in potential_changes_index:
            sequence_wo_bans_list = list(sequence_wo_bans) # turn into list because python strings are immutable
@@ -165,8 +163,8 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
            # change a beta_base without changing the meaning of encoded data
            sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
            
-            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
-            index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
+            index_before_ban_word = max(0, banword_index[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
+            index_after_ban_word = min(len(sequence)-1, banword_index[1]+3) # test to 3 bases after
            
            changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]

@@ -174,10 +172,10 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
            if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):   
                # keep the change that removes this ban word if it doesn't create homopolymers or other ban words            
                sequence_wo_bans = ''.join(sequence_wo_bans_list)
-                #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
+                #print("removed ban word", sequence[banword_index[0]:banword_index[1]],"->",changed_sub_sequence)
                break
            else:
-                #print("failed to remove ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"-X>",changed_sub_sequence)
+                #print("failed to remove ban word", sequence[banword_index[0]:banword_index[1]],"-X>",changed_sub_sequence)
                #print("trying again...")
                pass
    
@@ -202,11 +200,13 @@ def size_dna_from_bit_len_abaab(bit_length):
        return base_length
    if rest == 2:
        return base_length + 1
-    if rest == 4:
+    if rest == 3:
        return base_length + 2
-    if rest == 6:
+    if rest == 5:
+        return base_length + 3
+    if rest == 7:
        return base_length + 4
-    print("error size_of_dna_from_bit_len_abaab : size not multiple of 2 :",str(bit_length))
+    print("error size_of_dna_from_bit_len_abaab : invalid size rest :",str(bit_length),"(rest of",str(rest)+")")
    exit(1)


@@ -223,10 +223,12 @@ def size_binary_from_dna_len_abaab(dna_length):
        return base_length
    if rest == 1:
        return base_length + 2
-    if rest == 2 or rest == 3: # if rest of 3, act like a rest of 2 because the third base will be ignored
-        return base_length + 4
+    if rest == 2:
+        return base_length + 3
+    if rest == 3:
+        return base_length + 5
    if rest == 4:
-        return base_length + 6
+        return base_length + 7


 def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
@@ -248,7 +250,7 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:

    """
    if len(binary_string) % 5 not in [0, 1, 3]:
-            print("error binary dna conversion, need a binary string multiple of 5, or 1, 3 modulo 5")
+            print("error binary dna conversion, need a binary string multiple of 5, or with a rest of 1, 3")
            return
    
    sequence = ""
@@ -362,7 +364,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
        return sequence
    
    # indexes of beta bases in the sequence (1st base every 3 base window)
-    beta_indexes = [ baa_method_offset + 3*k for k in range(len(sequence)//3)]
+    beta_indexes = [ baa_method_offset + 3*k for k in range(math.ceil(len(sequence)/3))]

    sequence_wo_bans = sequence

@@ -416,7 +418,7 @@ def size_dna_from_bit_len_baa(bit_length):
    if rest == 3:
        return base_length + 2

-    print("error size_of_dna_from_bit_len : invalid rest size:",str(rest))
+    print("error size_of_dna_from_bit_len_baa : invalid size rest :",str(bit_length),"(rest of",str(rest)+")")
    exit(1)