From 8201876a71c296e4e28e96fd13a794865c243440 Mon Sep 17 00:00:00 2001 From: oboulle <olivier.boulle@inria.fr> Date: Fri, 21 Jul 2023 11:06:16 +0200 Subject: [PATCH] comments improved, refactoring --- binary_dna_conversion.py | 43 +++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index 4d4d790..77d56bc 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -272,46 +272,47 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: for i in range(0, n_quintuplets): bits5 = binary_string[i*5:(i+1)*5] - nucleotide_2 = bit_pair_to_dna[bits5[1:3]] - nucleotide_3 = bit_pair_to_dna[bits5[3:5]] + nucleotide_2 = bit_pair_to_dna[bits5[1:3]] # first alpha base of the encoded triplet + nucleotide_3 = bit_pair_to_dna[bits5[3:5]] # 2nd alpha base # make nucleotide 1 if len(sequence) >= 3: - # check 3 previous encoded bases + # check 3 previous encoded bases and 2 following alpha bases to see if there is a potential homopolymer of 4+ if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3: - # break homopolymer + # break the homopolymer nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1]) else: - # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases + # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 following alpha bases check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3 if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"): - nucleotide_1 = bit_to_dna_GC[bits5[0]] + nucleotide_1 = bit_to_dna_GC[bits5[0]] # more A/T -> add a G/C elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"): - nucleotide_1 = bit_to_dna_AT[bits5[0]] - else: # strictly equal, just balance with preceding base + nucleotide_1 = bit_to_dna_AT[bits5[0]] # less A/T -> add a A/T + else: # case of strictly equal %GC, just balance with following base nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2) - else: + else: # empty string, just balance the GC with the following base nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2) - sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 # add the encoded b a a bases to the sequence - bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits + # all complete quintuplets have been encoded, time for the rest + bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits long if len(bit_rest) > 0: # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases if len(bit_rest) == 1: - nucleotide_2 = "" + nucleotide_2 = "" # no nucleotide 2 to add homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] else: # rest = 3 nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer if homopolymer_check_bool: - # break homopolymer + # need to break homopolymer with the nucleotide 1 nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1]) - + else: # adjust GC% check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 @@ -319,9 +320,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: nucleotide_1 = bit_to_dna_GC[bit_rest[0]] else: nucleotide_1 = bit_to_dna_AT[bit_rest[0]] - + sequence += nucleotide_1 + nucleotide_2 - + # remove the banned words sequence_wo_banwords = remove_ban_words_baa_encoding(sequence) @@ -336,10 +337,10 @@ def dna_to_binary_baa(sequence: str) -> str: binary_string = "" n_triplet, rest = divmod(len(sequence), 3) for i in range(0, n_triplet): - nucleotides3 = sequence[i*3:(i+1)*3] - bit_1 = dna_to_bit_ATGC[nucleotides3[0]] - bits_23 = dna_to_bit_pair[nucleotides3[1]] - bits_45 = dna_to_bit_pair[nucleotides3[2]] + triplet_nucleotide = sequence[i*3:(i+1)*3] + bit_1 = dna_to_bit_ATGC[triplet_nucleotide[0]] + bits_23 = dna_to_bit_pair[triplet_nucleotide[1]] + bits_45 = dna_to_bit_pair[triplet_nucleotide[2]] binary_string += bit_1 + bits_23 + bits_45 @@ -362,6 +363,8 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: the changed base must be a beta base, and be A<=>G; T<=>C, so the decoding of the updated sequence will still return the same result if the original sequence has already been fragmented, the 3 bases windows of the baa method can be offset, baa_method_offset is used to correct this + + #TODO also remove inverse repeat regions of 10+ bases """ # change a beta base but keep the binary meaning for the decoding -- GitLab