Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8e077e9c authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

baa decoding

parent 74cc52c1
Branches
No related tags found
No related merge requests found
......@@ -264,27 +264,28 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-1-3 bits
if len(bit_rest) > 0:
# last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
if len(bit_rest) == 3:
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere
else:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
if homopolymere_check_bool:
# break homopolymere
nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
if len(bit_rest) == 3:
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere
else:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
else:
# adjust GC%
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]]
if homopolymere_check_bool:
# break homopolymere
nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
else:
nucleotide_1 = bit_to_dna_AT[bits5[0]]
sequence += nucleotide_1 + nucleotide_2
# adjust GC%
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]]
else:
nucleotide_1 = bit_to_dna_AT[bits5[0]]
sequence += nucleotide_1 + nucleotide_2
# remove the banned words
sequence_wo_banwords = remove_ban_words_baa_encoding(sequence)
......@@ -292,39 +293,29 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
return sequence_wo_banwords
def dna_to_binary_abaab(sequence: str) -> str:
def dna_to_binary_baa(sequence: str) -> str:
"""
convert back a sequence to binaries (opposite of binary_to_dna_abaab)
convert back a sequence to binaries (opposite of binary_to_dna_baa)
"""
binary_string = ""
n_quintuplet, rest = divmod(len(sequence), 5)
for i in range(0, n_quintuplet):
nucleotides5 = sequence[i*5:(i+1)*5]
bits_12 = dna_to_bit_pair[nucleotides5[0]]
bit_3 = dna_to_bit_ATGC[nucleotides5[1]]
bits_45 = dna_to_bit_pair[nucleotides5[2]]
bits_67 = dna_to_bit_pair[nucleotides5[3]]
bit_8 = dna_to_bit_ATGC[nucleotides5[4]]
binary_string += bits_12 + bit_3 + bits_45 + bits_67 + bit_8
# handle rest < 5
sequence_rest = sequence[n_quintuplet*5:n_quintuplet*5+rest]
if len(sequence_rest) >= 1: # 1 base -> 2 bits
binary_string += dna_to_bit_pair[sequence_rest[0]]
n_triplet, rest = divmod(len(sequence), 3)
for i in range(0, n_triplet):
nucleotides3 = sequence[i*3:(i+1)*3]
bit_1 = dna_to_bit_ATGC[nucleotides3[0]]
bits_23 = dna_to_bit_pair[nucleotides3[1]]
bits_45 = dna_to_bit_pair[nucleotides3[2]]
binary_string += bit_1 + bits_23 + bits_45
# handle rest < 3
sequence_rest = sequence[n_triplet*3:n_triplet*3+rest]
if len(sequence_rest) >= 1: # 1 base -> 1 bit
binary_string += dna_to_bit_ATGC[sequence_rest[0]]
if len(sequence_rest) == 2: # 2 bases -> 2 bits + 2 bits
if len(sequence_rest) == 2: # 2 bases -> 1 bit + 2 bits
binary_string += dna_to_bit_pair[sequence_rest[1]]
elif len(sequence_rest) == 3:
# a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding A can be added to get a round number of blocks
binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2
elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit
binary_string += dna_to_bit_ATGC[sequence_rest[1]] # 1 base -> 1 bit
binary_string += dna_to_bit_pair[sequence_rest[2]] # 1 base -> 2 bits
binary_string += dna_to_bit_ATGC[sequence_rest[3]] # 1 base -> 1 bit
return binary_string
def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
......@@ -390,22 +381,31 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
return sequence_wo_bans
def test_conversion():
binary_size = 15000
random_binary = ""
for i in range(binary_size):
random_binary += random.Random().choice(['0', '1'])
for binary_size in range(10,15000, 10):
print(binary_size)
random_binary = ""
for i in range(binary_size):
random_binary += random.Random().choice(['0', '1'])
seq_abaab = binary_to_dna_abaab(random_binary)
seq_baa = binary_to_dna_baa(random_binary)
print("check abaab encoding")
sequence_control.sanity_check(seq_abaab)
seq_abaab = binary_to_dna_abaab(random_binary)
seq_baa = binary_to_dna_baa(random_binary)
decoded_abaab = dna_to_binary_abaab(seq_abaab)
decoded_baa = dna_to_binary_baa(seq_baa)
print("check abaab encoding", str(decoded_abaab == random_binary))
print("check baa encoding", str(decoded_baa == random_binary))
if not (decoded_baa == random_binary and decoded_abaab == random_binary ):
exit(1)
#print("check abaab encoding")
#sequence_control.sanity_check(seq_abaab)
print("check baa encoding")
sequence_control.sanity_check(seq_baa)
#print("check baa encoding")
#sequence_control.sanity_check(seq_baa)
# =================== main ======================= #
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment