Mentions légales du service

Skip to content
Snippets Groups Projects
Commit e09a5ed2 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

Fix encoding error when the binary string leaves a rest (leftover bits)

parent 8e077e9c
No related branches found
No related tags found
No related merge requests found
...@@ -68,6 +68,9 @@ def binary_to_dna_abaab(binary_string: str) -> str: ...@@ -68,6 +68,9 @@ def binary_to_dna_abaab(binary_string: str) -> str:
ex : a rest of 0 -> A, a rest of 00 -> A ex : a rest of 0 -> A, a rest of 00 -> A
so how to decode A ? so how to decode A ?
only allowing rests multiple of 2 removes ambiguity only allowing rests multiple of 2 removes ambiguity
#TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal
""" """
if len(binary_string) % 2 != 0: if len(binary_string) % 2 != 0:
...@@ -227,12 +230,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -227,12 +230,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding
ex : a rest of 0 -> A, a rest of 00 -> A ex : a rest of 0 -> A, a rest of 00 -> A
so how to decode A ? so how to decode A ?
only allowing rests multiple of 5 removes ambiguity (also possible 0,1 or 3 modulo 5) only allowing rests multiple of 5 removes ambiguity (also possible 0, 1 or 3 modulo 5)
""" """
if len(binary_string) % 5 != 0: if len(binary_string) % 5 not in [0, 1, 3]:
print("error binary dna conversion, need a binary string multiple of 5") print("error binary dna conversion, need a binary string multiple of 5, or 1, 3 modulo 5")
exit(0) return
sequence = "" sequence = ""
n_quintuplets, rest = divmod(len(binary_string), 5) n_quintuplets, rest = divmod(len(binary_string), 5)
...@@ -266,12 +269,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -266,12 +269,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
if len(bit_rest) > 0: if len(bit_rest) > 0:
# last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
if len(bit_rest) == 3: if len(bit_rest) == 1:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
else: # rest = 3
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere
else:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
if homopolymere_check_bool: if homopolymere_check_bool:
# break homopolymere # break homopolymere
...@@ -281,9 +284,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -281,9 +284,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
# adjust GC% # adjust GC%
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"): if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]] nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
else: else:
nucleotide_1 = bit_to_dna_AT[bits5[0]] nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
sequence += nucleotide_1 + nucleotide_2 sequence += nucleotide_1 + nucleotide_2
...@@ -384,21 +387,27 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: ...@@ -384,21 +387,27 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
def test_conversion(): def test_conversion():
for binary_size in range(10,15000, 10): for binary_size in range(10, 15000, 1):
print(binary_size) print("size",str(binary_size))
random_binary = "" random_binary = ""
for i in range(binary_size): for i in range(binary_size):
random_binary += random.Random().choice(['0', '1']) random_binary += random.Random().choice(['0', '1'])
seq_abaab = binary_to_dna_abaab(random_binary) #random_binary = "00011110010"
#seq_abaab = binary_to_dna_abaab(random_binary)
seq_baa = binary_to_dna_baa(random_binary) seq_baa = binary_to_dna_baa(random_binary)
if seq_baa is None:
continue
decoded_abaab = dna_to_binary_abaab(seq_abaab) #decoded_abaab = dna_to_binary_abaab(seq_abaab)
decoded_baa = dna_to_binary_baa(seq_baa) decoded_baa = dna_to_binary_baa(seq_baa)
print("check abaab encoding", str(decoded_abaab == random_binary)) #print("check abaab encoding", str(decoded_abaab == random_binary))
print("check baa encoding", str(decoded_baa == random_binary)) print("check baa encoding", str(decoded_baa == random_binary))
if not (decoded_baa == random_binary and decoded_abaab == random_binary ): if not (decoded_baa == random_binary):
print(random_binary)
print(seq_baa)
print(decoded_baa)
exit(1) exit(1)
#print("check abaab encoding") #print("check abaab encoding")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment