refactoring in a function GC balancing

cf4e7e4b · BOULLE Olivier · dbab2db1 · cf4e7e4b
Commit cf4e7e4b authored 2 years ago by BOULLE Olivier
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
@@ -18,6 +18,16 @@ bit_to_dna_GC = {"0": "G", "1": "C"}
 dna_to_bit_ATGC = {"A": "0", "T": "1", "G": "0",  "C": "1", "_": "0"}
+def bit_to_dna_balance_GC(bit_char: str, reference_base: str):
+    """
+    convert a bit to a DNA base that balances GC content against a reference base (a G/C base if the reference is A or T, otherwise an A/T base)
+    """
+    if reference_base in ["A", "T"]:
+        return bit_to_dna_GC[bit_char]
+    else:
+        return bit_to_dna_AT[bit_char]
 def binary_to_dna(binary_string: str) -> str:
    """
    basic conversion of binaries into dna sequence
@@ -69,16 +79,14 @@ def binary_to_dna_abaab(binary_string: str) -> str:
    for i in range(0, n_octets):
        bits8 = binary_string[i*8:(i+1)*8]
        nucleotide_1 = bit_pair_to_dna[bits8[0:2]]
-        if nucleotide_1 in ["A", "T"]:
-            nucleotide_2 = bit_to_dna_GC[bits8[2]]
+        nucleotide_2 = bit_to_dna_balance_GC(bits8[2], nucleotide_1)
-        else:
-            nucleotide_2 = bit_to_dna_AT[bits8[2]]
        nucleotide_3 = bit_pair_to_dna[bits8[3:5]]
        nucleotide_4 = bit_pair_to_dna[bits8[5:7]]
-        if nucleotide_4 in ["A", "T"]:
-            nucleotide_5 = bit_to_dna_GC[bits8[7]]
+        nucleotide_5 = bit_to_dna_balance_GC(bits8[7], nucleotide_4)
-        else:
-            nucleotide_5 = bit_to_dna_AT[bits8[7]]
        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5
    # the rest should be 0 because all documents contain a whole number of octets,
    # but some "0" bits can be added to fill the fragments
@@ -91,10 +99,9 @@ def binary_to_dna_abaab(binary_string: str) -> str:
        if len(bit_rest) == 4:
            sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits
        elif len(bit_rest) == 6:   
-            if nucleotide_1 in ["A", "T"]:
-                sequence += bit_to_dna_GC[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1
+            # nucleotide_2 from a single bit and depending on nucleotide 1
-            else:
+            sequence += bit_to_dna_balance_GC(bit_rest[2], nucleotide_1)
-                sequence += bit_to_dna_AT[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1
            sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits
            sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit
@@ -240,10 +247,8 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
            # check 3 previous encoded bases
            if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
                # break homopolymer
-                if sequence[-1] in ["A", "T"]:
+                nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])
-                    nucleotide_1 = bit_to_dna_GC[bits5[0]]
-                else:
-                    nucleotide_1 = bit_to_dna_AT[bits5[0]]
            else:
                # adjust GC% in the window of preceding bases + 2 new alpha bases
                check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
@@ -253,10 +258,7 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
                    nucleotide_1 = bit_to_dna_AT[bits5[0]]
        else:
-            if nucleotide_2 in ["A", "T"]:
+            nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
-                nucleotide_1 = bit_to_dna_GC[bits5[0]]
-            else:
-                nucleotide_1 = bit_to_dna_AT[bits5[0]]
        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3
@@ -272,10 +274,8 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    if homopolymere_check_bool:
        # break homopolymer
-        if sequence[-1] in ["A", "T"]:
+        nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
-            nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
-        else:
-            nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
    else:
        # adjust GC%
        check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
@@ -292,6 +292,41 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
    return sequence_wo_banwords
+def dna_to_binary_abaab(sequence: str) -> str:
+    """
+    convert a sequence back to binary (inverse of binary_to_dna_abaab)
+    """
+    binary_string = ""
+    n_quintuplet, rest = divmod(len(sequence), 5)
+    for i in range(0, n_quintuplet):
+        nucleotides5 = sequence[i*5:(i+1)*5]
+        bits_12 = dna_to_bit_pair[nucleotides5[0]]
+        bit_3 = dna_to_bit_ATGC[nucleotides5[1]]
+        bits_45 = dna_to_bit_pair[nucleotides5[2]]
+        bits_67 = dna_to_bit_pair[nucleotides5[3]]
+        bit_8 = dna_to_bit_ATGC[nucleotides5[4]]
+        binary_string += bits_12 + bit_3 + bits_45 + bits_67 + bit_8
+    # handle rest < 5
+    sequence_rest = sequence[n_quintuplet*5:n_quintuplet*5+rest]
+    if len(sequence_rest) >= 1: # 1 base -> 2 bits
+        binary_string += dna_to_bit_pair[sequence_rest[0]]
+        if len(sequence_rest) == 2: # 2 bases -> 2 bits + 2 bits
+            binary_string += dna_to_bit_pair[sequence_rest[1]]
+        elif len(sequence_rest) == 3:
+            # a rest of 3 should not result from a binary_to_dna_abaab conversion, but a non-coding A can be added to get a whole number of blocks
+            binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2
+        elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit
+            binary_string += dna_to_bit_ATGC[sequence_rest[1]] # 1 base -> 1 bit
+            binary_string += dna_to_bit_pair[sequence_rest[2]] # 1 base -> 2 bits
+            binary_string += dna_to_bit_ATGC[sequence_rest[3]] # 1 base -> 1 bit
+    return binary_string
 def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
    """
    remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_baa() method