Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8201876a authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

comments improved, refactoring

parent 5a217f07
Branches
No related tags found
No related merge requests found
......@@ -272,46 +272,47 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
for i in range(0, n_quintuplets):
bits5 = binary_string[i*5:(i+1)*5]
nucleotide_2 = bit_pair_to_dna[bits5[1:3]]
nucleotide_3 = bit_pair_to_dna[bits5[3:5]]
nucleotide_2 = bit_pair_to_dna[bits5[1:3]] # first alpha base of the encoded triplet
nucleotide_3 = bit_pair_to_dna[bits5[3:5]] # 2nd alpha base
# make nucleotide 1
if len(sequence) >= 3:
# check 3 previous encoded bases
# check 3 previous encoded bases and 2 following alpha bases to see if there is a potential homopolymer of 4+
if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
# break homopolymer
# break the homopolymer
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])
else:
# if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases
# if no homopolymer to break, adjust GC% in the window of preceding bases + 2 following alpha bases
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]]
nucleotide_1 = bit_to_dna_GC[bits5[0]] # more A/T -> add a G/C
elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_AT[bits5[0]]
else: # strictly equal, just balance with preceding base
nucleotide_1 = bit_to_dna_AT[bits5[0]] # less A/T -> add a A/T
else: # case of strictly equal %GC, just balance with following base
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
else:
else: # empty string, just balance the GC with the following base
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
sequence += nucleotide_1 + nucleotide_2 + nucleotide_3
sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 # add the encoded b a a bases to the sequence
bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits
# all complete quintuplets have been encoded, time for the rest
bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits long
if len(bit_rest) > 0:
# the last conversion depends on the length of the rest: 1 bit = 1 base; 3 bits = 2 bases
if len(bit_rest) == 1:
nucleotide_2 = ""
nucleotide_2 = "" # no nucleotide 2 to add
homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
else: # rest = 3
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer
if homopolymer_check_bool:
# break homopolymer
# need to break homopolymer with the nucleotide 1
nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
else:
# adjust GC%
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
......@@ -319,9 +320,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
else:
nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
sequence += nucleotide_1 + nucleotide_2
# remove the banned words
sequence_wo_banwords = remove_ban_words_baa_encoding(sequence)
......@@ -336,10 +337,10 @@ def dna_to_binary_baa(sequence: str) -> str:
binary_string = ""
n_triplet, rest = divmod(len(sequence), 3)
for i in range(0, n_triplet):
nucleotides3 = sequence[i*3:(i+1)*3]
bit_1 = dna_to_bit_ATGC[nucleotides3[0]]
bits_23 = dna_to_bit_pair[nucleotides3[1]]
bits_45 = dna_to_bit_pair[nucleotides3[2]]
triplet_nucleotide = sequence[i*3:(i+1)*3]
bit_1 = dna_to_bit_ATGC[triplet_nucleotide[0]]
bits_23 = dna_to_bit_pair[triplet_nucleotide[1]]
bits_45 = dna_to_bit_pair[triplet_nucleotide[2]]
binary_string += bit_1 + bits_23 + bits_45
......@@ -362,6 +363,8 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
the changed base must be a beta base and may only be swapped A<=>G or T<=>C, so that decoding the updated sequence still returns the same result
if the original sequence has already been fragmented, the 3-base windows of the baa method may be offset; baa_method_offset is used to correct for this
#TODO also remove inverse repeat regions of 10+ bases
"""
# change a beta base but keep the binary meaning for the decoding
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment