comments improved, refactoring

8201876a · BOULLE Olivier · 5a217f07 · 8201876a
Commit 8201876a authored 1 year ago by BOULLE Olivier
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
@@ -272,46 +272,47 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    for i in range(0, n_quintuplets):
        bits5 = binary_string[i*5:(i+1)*5]
        
-        nucleotide_2 = bit_pair_to_dna[bits5[1:3]]
-        nucleotide_3 = bit_pair_to_dna[bits5[3:5]]
+        nucleotide_2 = bit_pair_to_dna[bits5[1:3]] # first alpha base of the encoded triplet
+        nucleotide_3 = bit_pair_to_dna[bits5[3:5]] # 2nd alpha base
        
        # make nucleotide 1
        if len(sequence) >= 3:
-            # check 3 previous encoded bases
+            # check 3 previous encoded bases and 2 following alpha bases to see if there is a potential homopolymer of 4+
            if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
-                # break homopolymer
+                # break the homopolymer
                nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])

            else:
-                # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases
+                # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 following alpha bases
                check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
                if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
-                    nucleotide_1 = bit_to_dna_GC[bits5[0]]
+                    nucleotide_1 = bit_to_dna_GC[bits5[0]] # more A/T -> add a G/C
                elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"):
-                    nucleotide_1 = bit_to_dna_AT[bits5[0]]
-                else: # strictly equal, just balance with preceding base
+                    nucleotide_1 = bit_to_dna_AT[bits5[0]] # less A/T -> add a A/T
+                else: # case of strictly equal %GC, just balance with following base
                    nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
    
-        else:
+        else: # empty string, just balance the GC with the following base
            nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
        
-        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3
+        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 # add the encoded b a a bases to the sequence

-    bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits
+    # all complete quintuplets have been encoded, time for the rest
+    bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits long

    if len(bit_rest) > 0:
    # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
        if len(bit_rest) == 1:
-            nucleotide_2 = ""
+            nucleotide_2 = "" # no nucleotide 2 to add
            homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] 
        else: # rest = 3
            nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
            homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer
        
        if homopolymer_check_bool:
-            # break homopolymer
+            # need to break homopolymer with the nucleotide 1
            nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
-            
+        
        else:
            # adjust GC%
            check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
@@ -319,9 +320,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
                nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
            else:
                nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
-                        
+        
        sequence += nucleotide_1 + nucleotide_2
-
+    
    # remove the banned words
    sequence_wo_banwords = remove_ban_words_baa_encoding(sequence)
    
@@ -336,10 +337,10 @@ def dna_to_binary_baa(sequence: str) -> str:
    binary_string = ""
    n_triplet, rest = divmod(len(sequence), 3)
    for i in range(0, n_triplet):
-        nucleotides3 = sequence[i*3:(i+1)*3]
-        bit_1 = dna_to_bit_ATGC[nucleotides3[0]]
-        bits_23 = dna_to_bit_pair[nucleotides3[1]]
-        bits_45 = dna_to_bit_pair[nucleotides3[2]]
+        triplet_nucleotide = sequence[i*3:(i+1)*3]
+        bit_1 = dna_to_bit_ATGC[triplet_nucleotide[0]]
+        bits_23 = dna_to_bit_pair[triplet_nucleotide[1]]
+        bits_45 = dna_to_bit_pair[triplet_nucleotide[2]]

        binary_string += bit_1 + bits_23 + bits_45
    
@@ -362,6 +363,8 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
    the changed base must be a beta base, and be A<=>G; T<=>C, so the decoding of the updated sequence will still return the same result
    
    if the original sequence has already been fragmented, the 3 bases windows of the baa method can be offset, baa_method_offset is used to correct this
+    
+    #TODO also remove inverse repeat regions of 10+ bases
    """
    
    # change a beta base but keep the binary meaning for the decoding