Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 81585279 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

remove bias for lower GC% in baa

parent e09a5ed2
No related branches found
No related tags found
No related merge requests found
......@@ -61,7 +61,7 @@ def binary_to_dna_abaab(binary_string: str) -> str:
4th 5th bits are normally converted into dna
6th 7th bits are normally converted into dna
8th bit is converted depending on previous conversion : if previous in {AT}, then (0:G, 1:C), elif in {GC}, then (0:A, 1:T)
this ensure that the total sequence cannot contain homopolymeres > 3,
this ensure that the total sequence cannot contain homopolymers > 3,
the GC% is in [40%, 60%] in a window of 5 and the conversion rate is 1.6 bit/base
warning : need the binary string to be a multiple of 2, or rest is inconsistent with decoding
......@@ -190,14 +190,14 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
# change a beta_base without changing the meaning of encoded data
sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
# test the sequence only for the banned words and homopolymeres, ignore the GC%
# test the sequence only for the banned words and homopolymers, ignore the GC%
if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
# keep the change that removes this ban word if it doesn't create homopolymeres or other ban words
# keep the change that removes this ban word if it doesn't create homopolymers or other ban words
sequence_wo_bans = ''.join(sequence_wo_bans_list)
#print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
break
......@@ -221,10 +221,10 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
the binaries are divided into parts of 5 bits
for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C)
3rd bit is converted depending on previous converted sequence :
(0:G, 1:C) or (0:A, 1:T), first to break potential homopolymere, if not then to adjust %GC of the previous sequence
(0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the previous sequence
4th 5th bits are normally converted into dna
this ensure that the total sequence cannot contain homopolymeres > 3,
this ensure that the total sequence cannot contain homopolymers > 3,
the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base
warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding
......@@ -245,20 +245,22 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
nucleotide_2 = bit_pair_to_dna[bits5[1:3]]
nucleotide_3 = bit_pair_to_dna[bits5[3:5]]
# make nucleotide 2
# make nucleotide 1
if len(sequence) >= 3:
# check 3 previous encoded bases
if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
# break homopolymere
# break homopolymer
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])
else:
# adjust GC% in the window of preceding bases + 2 new alpha bases
# if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]]
else:
elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_AT[bits5[0]]
else: # strictly equal, just balance with preceding base
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
else:
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
......@@ -271,13 +273,13 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
# last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
if len(bit_rest) == 1:
nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
else: # rest = 3
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere
homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer
if homopolymere_check_bool:
# break homopolymere
if homopolymer_check_bool:
# break homopolymer
nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
else:
......@@ -360,14 +362,14 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
# change a beta_base without changing the meaning of encoded data
sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
# test the sequence only for the banned words and homopolymeres, ignore the GC%
# test the sequence only for the banned words and homopolymers, ignore the GC%
if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
# keep the change that removes this ban word if it doesn't create homopolymeres or other ban words
# keep the change that removes this ban word if it doesn't create homopolymers or other ban words
sequence_wo_bans = ''.join(sequence_wo_bans_list)
#print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
break
......@@ -387,7 +389,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
def test_conversion():
for binary_size in range(10, 15000, 1):
for binary_size in range(100, 15000, 1):
print("size",str(binary_size))
random_binary = ""
for i in range(binary_size):
......@@ -404,10 +406,10 @@ def test_conversion():
#print("check abaab encoding", str(decoded_abaab == random_binary))
print("check baa encoding", str(decoded_baa == random_binary))
if not (decoded_baa == random_binary):
print(random_binary)
if not (decoded_baa == random_binary and sequence_control.sequence_check(seq_baa, min_GC=40, max_GC=60, verbose=True)):
#print(random_binary)
print(seq_baa)
print(decoded_baa)
#print(decoded_baa)
exit(1)
#print("check abaab encoding")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment