From 8201876a71c296e4e28e96fd13a794865c243440 Mon Sep 17 00:00:00 2001
From: oboulle <olivier.boulle@inria.fr>
Date: Fri, 21 Jul 2023 11:06:16 +0200
Subject: [PATCH] comments improved, refactoring

---
 binary_dna_conversion.py | 43 +++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py
index 4d4d790..77d56bc 100755
--- a/binary_dna_conversion.py
+++ b/binary_dna_conversion.py
@@ -272,46 +272,47 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
     for i in range(0, n_quintuplets):
         bits5 = binary_string[i*5:(i+1)*5]
         
-        nucleotide_2 = bit_pair_to_dna[bits5[1:3]]
-        nucleotide_3 = bit_pair_to_dna[bits5[3:5]]
+        nucleotide_2 = bit_pair_to_dna[bits5[1:3]] # first alpha base of the encoded triplet
+        nucleotide_3 = bit_pair_to_dna[bits5[3:5]] # 2nd alpha base
         
         # make nucleotide 1
         if len(sequence) >= 3:
-            # check 3 previous encoded bases
+            # check 3 previous encoded bases and 2 following alpha bases to see if there is a potential homopolymer of 4+
             if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
-                # break homopolymer
+                # break the homopolymer
                 nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])
 
             else:
-                # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases
+                # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 following alpha bases
                 check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
                 if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
-                    nucleotide_1 = bit_to_dna_GC[bits5[0]]
+                    nucleotide_1 = bit_to_dna_GC[bits5[0]] # more A/T -> add a G/C
                 elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"):
-                    nucleotide_1 = bit_to_dna_AT[bits5[0]]
-                else: # strictly equal, just balance with preceding base
+                    nucleotide_1 = bit_to_dna_AT[bits5[0]] # less A/T -> add a A/T
+                else: # case of strictly equal %GC, just balance with following base
                     nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
     
-        else:
+        else: # empty string, just balance the GC with the following base
             nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
         
-        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3
+        sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 # add the encoded b a a bases to the sequence
 
-    bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits
+    # all complete quintuplets have been encoded, time for the rest
+    bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits long
 
     if len(bit_rest) > 0:
     # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
         if len(bit_rest) == 1:
-            nucleotide_2 = ""
+            nucleotide_2 = "" # no nucleotide 2 to add
             homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] 
         else: # rest = 3
             nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
             homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer
         
         if homopolymer_check_bool:
-            # break homopolymer
+            # need to break homopolymer with the nucleotide 1
             nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
-            
+        
         else:
             # adjust GC%
             check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
@@ -319,9 +320,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
                 nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
             else:
                 nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
-                        
+        
         sequence += nucleotide_1 + nucleotide_2
-
+    
     # remove the banned words
     sequence_wo_banwords = remove_ban_words_baa_encoding(sequence)
     
@@ -336,10 +337,10 @@ def dna_to_binary_baa(sequence: str) -> str:
     binary_string = ""
     n_triplet, rest = divmod(len(sequence), 3)
     for i in range(0, n_triplet):
-        nucleotides3 = sequence[i*3:(i+1)*3]
-        bit_1 = dna_to_bit_ATGC[nucleotides3[0]]
-        bits_23 = dna_to_bit_pair[nucleotides3[1]]
-        bits_45 = dna_to_bit_pair[nucleotides3[2]]
+        triplet_nucleotide = sequence[i*3:(i+1)*3]
+        bit_1 = dna_to_bit_ATGC[triplet_nucleotide[0]]
+        bits_23 = dna_to_bit_pair[triplet_nucleotide[1]]
+        bits_45 = dna_to_bit_pair[triplet_nucleotide[2]]
 
         binary_string += bit_1 + bits_23 + bits_45
     
@@ -362,6 +363,8 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
     the changed base must be a beta base, and be A<=>G; T<=>C, so the decoding of the updated sequence will still return the same result
     
     if the original sequence has already been fragmented, the 3 bases windows of the baa method can be offset, baa_method_offset is used to correct this
+    
+    #TODO also remove inverse repeat regions of 10+ bases
     """
     
     # change a beta base but keep the binary meaning for the decoding
-- 
GitLab