Commit dbab2db1 authored by BOULLE Olivier

banword removal for baa

parent 8aa73337
@@ -151,20 +151,18 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
     if the original sequence has already been fragmented, the 5 bases windows of the abaab method can be offset, abaab_method_offset is used to correct this
     """
     forbidden_sequences = ["GGTCTC", "GAGACC"]
     # change a beta base but keep the binary meaning for the decoding
     dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"}
     # check for banned words
-    indexs_ban_words = []
+    indexes_ban_words = []
     for banned_word in sequence_control.get_forbidden_sequences():
         index = sequence.find(banned_word)
         while index != -1:
-            indexs_ban_words.append([index, index+len(banned_word)])
+            indexes_ban_words.append([index, index+len(banned_word)])
             index = sequence.find(banned_word, index+1)
-    if indexs_ban_words == []:
+    if indexes_ban_words == []:
         return sequence
     # indexes of beta bases in the sequence (2nd and 5th base every 5 base window)
@@ -172,7 +170,7 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
     sequence_wo_bans = sequence
-    for banned_words_indexes in indexs_ban_words:
+    for banned_words_indexes in indexes_ban_words:
         # indexes of beta bases that can be changed to remove a banned word
         potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ]
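For reference, the banned-word scan shared by both removal functions can be exercised in isolation. The sketch below is not from the repository: find_ban_word_indexes is a hypothetical helper name, and the two hard-coded motifs from the forbidden_sequences list above stand in for sequence_control.get_forbidden_sequences().

forbidden_sequences = ["GGTCTC", "GAGACC"]

def find_ban_word_indexes(sequence: str) -> list:
    """Collect [start, end) index pairs for every occurrence of a forbidden motif."""
    indexes_ban_words = []
    for banned_word in forbidden_sequences:
        index = sequence.find(banned_word)
        while index != -1:
            indexes_ban_words.append([index, index + len(banned_word)])
            # restart the search one base later so overlapping occurrences are not missed
            index = sequence.find(banned_word, index + 1)
    return indexes_ban_words

print(find_ban_word_indexes("ATGGTCTCAAGAGACCT"))  # [[2, 8], [10, 16]]

Restarting at index+1 rather than index+len(banned_word) is what lets the scan report occurrences that overlap a previous hit.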
@@ -288,8 +286,74 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str
         sequence += nucleotide_1 + nucleotide_2
-    return sequence
+    # remove the banned words
+    sequence_wo_banwords = remove_ban_words_baa_encoding(sequence)
+    return sequence_wo_banwords
+
+def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
+    """
+    remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_baa() method
+    the baa method ensures a GC% in [33;66] over every 3-base window, but the real constraint is checked on larger windows,
+    so changing a single base to avoid a banned word should not affect the whole sequence too much
+    the changed base must be a beta base, swapped as A<=>G or T<=>C, so decoding the updated sequence still returns the same result
+    if the original sequence has already been fragmented, the 3-base windows of the baa method can be offset, baa_method_offset is used to correct this
+    """
+    # change a beta base but keep the binary meaning for the decoding
+    dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"}
+    # check for banned words
+    indexes_ban_words = []
+    for banned_word in sequence_control.get_forbidden_sequences():
+        index = sequence.find(banned_word)
+        while index != -1:
+            indexes_ban_words.append([index, index+len(banned_word)])
+            index = sequence.find(banned_word, index+1)
+    if indexes_ban_words == []:
+        return sequence
+    # indexes of beta bases in the sequence (1st base of every 3-base window)
+    beta_indexes = [ baa_method_offset + 3*k for k in range(len(sequence)//3)]
+    sequence_wo_bans = sequence
+    for banned_words_indexes in indexes_ban_words:
+        # indexes of beta bases that can be changed to remove a banned word
+        potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ]
+        for change_index in potential_changes_index:
+            sequence_wo_bans_list = list(sequence_wo_bans) # turn into a list because python strings are immutable
+            # change a beta base without changing the meaning of the encoded data
+            sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
+            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creating homopolymers
+            index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test up to 3 bases after
+            changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
+            # test the sub-sequence only for banned words and homopolymers, ignore the GC%
+            if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
+                # keep the change that removes this ban word if it doesn't create homopolymers or other ban words
+                sequence_wo_bans = ''.join(sequence_wo_bans_list)
+                #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
+                break
+            else:
+                #print("failed to remove ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"-X>",changed_sub_sequence)
+                #print("trying again...")
+                pass
+    # last check for very odd cases (maybe overlapping ban words)
+    for banned_word in sequence_control.get_forbidden_sequences():
+        if banned_word in sequence_wo_bans:
+            print("binary conversion: unable to remove forbidden sequences", sequence_wo_bans)
+            exit(1)
+    return sequence_wo_bans
def test_conversion():
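The substitution step of the new function can also be illustrated in isolation. The sketch below is an assumption-laden stand-in, not the repository's code: try_remove_ban_word and has_homopolymer are hypothetical names, and the simple run-length test (max_run=3) merely approximates what sequence_control.sequence_check does; the A<=>G / T<=>C swap table, the 3-base flanking window, and the beta indexes at every third position do come from the diff above.

FORBIDDEN = ["GGTCTC", "GAGACC"]                       # motifs hard-coded earlier in the file
DNA_CHANGE_BETA = {"A": "G", "T": "C", "G": "A", "C": "T"}

def has_homopolymer(seq: str, max_run: int = 3) -> bool:
    """Return True if any base repeats more than max_run times in a row (assumed threshold)."""
    run, prev = 0, ""
    for base in seq:
        run = run + 1 if base == prev else 1
        prev = base
        if run > max_run:
            return True
    return False

def try_remove_ban_word(sequence: str, start: int, end: int, beta_indexes: list) -> str:
    """Flip one beta base inside sequence[start:end] and keep the first change whose
    local neighbourhood contains neither a forbidden motif nor a homopolymer."""
    for change_index in (k for k in beta_indexes if start <= k < end):
        candidate = list(sequence)                     # strings are immutable, work on a list
        candidate[change_index] = DNA_CHANGE_BETA[candidate[change_index]]
        lo, hi = max(0, start - 3), min(len(sequence), end + 3)
        window = "".join(candidate)[lo:hi]             # re-check 3 bases on each side of the word
        if not has_homopolymer(window) and not any(w in window for w in FORBIDDEN):
            return "".join(candidate)                  # first safe change wins
    return sequence                                    # no safe change found, leave untouched

# GGTCTC sits at [3, 9); with baa windows of 3 bases, beta indexes fall at 0, 3, 6, ...
seq = "ATCGGTCTCAAT"
print(try_remove_ban_word(seq, 3, 9, list(range(0, len(seq), 3))))  # -> ATCAGTCTCAAT

Because the swap stays within the A<=>G and T<=>C pairs, the binary value carried by the beta base is unchanged, which is why the repair can run after encoding without touching the payload.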