remove bias for lower GC% in baa

81585279 · BOULLE Olivier · e09a5ed2 · 81585279
Commit 81585279 authored 2 years ago by BOULLE Olivier
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
@@ -61,7 +61,7 @@ def binary_to_dna_abaab(binary_string: str) -> str:
                          4th 5th bits are normally converted into dna
                          6th 7th bits are normally converted into dna
                          8th bit is converted depending on previous conversion : if previous in {AT}, then (0:G, 1:C), elif in {GC}, then (0:A, 1:T)
-    this ensure that the total sequence cannot contain homopolymeres > 3,
+    this ensure that the total sequence cannot contain homopolymers > 3,
    the GC% is in [40%, 60%] in a window of 5 and the conversion rate is 1.6 bit/base
    
    warning : need the binary string to be a multiple of 2, or rest is inconsistent with decoding 
@@ -190,14 +190,14 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
            # change a beta_base without changing the meaning of encoded data
            sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
            
-            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres
+            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
            index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
            
            changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]

-            # test the sequence only for the banned words and homopolymeres, ignore the GC%
+            # test the sequence only for the banned words and homopolymers, ignore the GC%
            if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):   
-                # keep the change that removes this ban word if it doesn't create homopolymeres or other ban words            
+                # keep the change that removes this ban word if it doesn't create homopolymers or other ban words            
                sequence_wo_bans = ''.join(sequence_wo_bans_list)
                #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
                break
@@ -221,10 +221,10 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    the binaries are divided into parts of 5 bits
    for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C)
                          3rd bit is converted depending on previous converted sequence : 
-                              (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymere, if not then to adjust %GC of the previous sequence          
+                              (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the previous sequence          
                          4th 5th bits are normally converted into dna

-    this ensure that the total sequence cannot contain homopolymeres > 3,
+    this ensure that the total sequence cannot contain homopolymers > 3,
    the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base
    
    warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding 
@@ -245,20 +245,22 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
        nucleotide_2 = bit_pair_to_dna[bits5[1:3]]
        nucleotide_3 = bit_pair_to_dna[bits5[3:5]]
        
-        # make nucleotide 2
+        # make nucleotide 1
        if len(sequence) >= 3:
            # check 3 previous encoded bases
            if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
-                # break homopolymere
+                # break homopolymer
                nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])

            else:
-                # adjust GC% in the window of preceding bases + 2 new alpha bases
+                # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases
                check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
                if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
                    nucleotide_1 = bit_to_dna_GC[bits5[0]]
-                else:
+                elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"):
                    nucleotide_1 = bit_to_dna_AT[bits5[0]]
+                else: # strictly equal, just balance with preceding base
+                    nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
    
        else:
            nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
@@ -271,13 +273,13 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
        if len(bit_rest) == 1:
            nucleotide_2 = ""
-            homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] 
+            homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] 
        else: # rest = 3
            nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
-            homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere
+            homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer
        
-        if homopolymere_check_bool:
-            # break homopolymere
+        if homopolymer_check_bool:
+            # break homopolymer
            nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
            
        else:
@@ -360,14 +362,14 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
            # change a beta_base without changing the meaning of encoded data
            sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
            
-            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres
+            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
            index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
            
            changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]

-            # test the sequence only for the banned words and homopolymeres, ignore the GC%
+            # test the sequence only for the banned words and homopolymers, ignore the GC%
            if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):   
-                # keep the change that removes this ban word if it doesn't create homopolymeres or other ban words            
+                # keep the change that removes this ban word if it doesn't create homopolymers or other ban words            
                sequence_wo_bans = ''.join(sequence_wo_bans_list)
                #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
                break
@@ -387,7 +389,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:

 def test_conversion():
    
-    for binary_size in range(10, 15000, 1):
+    for binary_size in range(100, 15000, 1):
        print("size",str(binary_size))
        random_binary = ""
        for i in range(binary_size):
@@ -404,10 +406,10 @@ def test_conversion():

        #print("check abaab encoding", str(decoded_abaab == random_binary))
        print("check baa encoding", str(decoded_baa == random_binary))
-        if not (decoded_baa == random_binary):
-            print(random_binary)
+        if not (decoded_baa == random_binary and sequence_control.sequence_check(seq_baa, min_GC=40, max_GC=60, verbose=True)):
+            #print(random_binary)
            print(seq_baa)
-            print(decoded_baa)
+            #print(decoded_baa)
            exit(1)
        
    #print("check abaab encoding")