Mentions légales du service

Skip to content
Snippets Groups Projects
Commit e09a5ed2 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

correction of encoding error when rest

parent 8e077e9c
No related branches found
No related tags found
No related merge requests found
......@@ -68,6 +68,9 @@ def binary_to_dna_abaab(binary_string: str) -> str:
ex : a rest of 0 -> A, a rest of 00 -> A
so how to decode A ?
only allowing rests multiple of 2 removes ambiguity
#TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal
"""
if len(binary_string) % 2 != 0:
......@@ -227,12 +230,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding
ex : a rest of 0 -> A, a rest of 00 -> A
so how to decode A ?
only allowing rests multiple of 5 removes ambiguity (also possible 0,1 or 3 modulo 5)
only allowing rests multiple of 5 removes ambiguity (also possible 0, 1 or 3 modulo 5)
"""
if len(binary_string) % 5 != 0:
print("error binary dna conversion, need a binary string multiple of 5")
exit(0)
if len(binary_string) % 5 not in [0, 1, 3]:
print("error binary dna conversion, need a binary string multiple of 5, or 1, 3 modulo 5")
return
sequence = ""
n_quintuplets, rest = divmod(len(binary_string), 5)
......@@ -266,12 +269,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
if len(bit_rest) > 0:
# last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
if len(bit_rest) == 3:
if len(bit_rest) == 1:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
else: # rest = 3
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere
else:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
if homopolymere_check_bool:
# break homopolymere
......@@ -281,9 +284,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
# adjust GC%
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]]
nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
else:
nucleotide_1 = bit_to_dna_AT[bits5[0]]
nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
sequence += nucleotide_1 + nucleotide_2
......@@ -384,21 +387,27 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
def test_conversion():
for binary_size in range(10,15000, 10):
print(binary_size)
for binary_size in range(10, 15000, 1):
print("size",str(binary_size))
random_binary = ""
for i in range(binary_size):
random_binary += random.Random().choice(['0', '1'])
seq_abaab = binary_to_dna_abaab(random_binary)
#random_binary = "00011110010"
#seq_abaab = binary_to_dna_abaab(random_binary)
seq_baa = binary_to_dna_baa(random_binary)
if seq_baa is None:
continue
decoded_abaab = dna_to_binary_abaab(seq_abaab)
#decoded_abaab = dna_to_binary_abaab(seq_abaab)
decoded_baa = dna_to_binary_baa(seq_baa)
print("check abaab encoding", str(decoded_abaab == random_binary))
#print("check abaab encoding", str(decoded_abaab == random_binary))
print("check baa encoding", str(decoded_baa == random_binary))
if not (decoded_baa == random_binary and decoded_abaab == random_binary ):
if not (decoded_baa == random_binary):
print(random_binary)
print(seq_baa)
print(decoded_baa)
exit(1)
#print("check abaab encoding")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment