Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 81585279 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

remove bias for lower GC% in baa

parent e09a5ed2
Branches
No related tags found
No related merge requests found
...@@ -61,7 +61,7 @@ def binary_to_dna_abaab(binary_string: str) -> str: ...@@ -61,7 +61,7 @@ def binary_to_dna_abaab(binary_string: str) -> str:
4th 5th bits are normally converted into dna 4th 5th bits are normally converted into dna
6th 7th bits are normally converted into dna 6th 7th bits are normally converted into dna
8th bit is converted depending on previous conversion : if previous in {AT}, then (0:G, 1:C), elif in {GC}, then (0:A, 1:T) 8th bit is converted depending on previous conversion : if previous in {AT}, then (0:G, 1:C), elif in {GC}, then (0:A, 1:T)
this ensure that the total sequence cannot contain homopolymeres > 3, this ensure that the total sequence cannot contain homopolymers > 3,
the GC% is in [40%, 60%] in a window of 5 and the conversion rate is 1.6 bit/base the GC% is in [40%, 60%] in a window of 5 and the conversion rate is 1.6 bit/base
warning : need the binary string to be a multiple of 2, or rest is inconsistent with decoding warning : need the binary string to be a multiple of 2, or rest is inconsistent with decoding
...@@ -190,14 +190,14 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str ...@@ -190,14 +190,14 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
# change a beta_base without changing the meaning of encoded data # change a beta_base without changing the meaning of encoded data
sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]] sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word] changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
# test the sequence only for the banned words and homopolymeres, ignore the GC% # test the sequence only for the banned words and homopolymers, ignore the GC%
if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False): if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
# keep the change that removes this ban word if it doesn't create homopolymeres or other ban words # keep the change that removes this ban word if it doesn't create homopolymers or other ban words
sequence_wo_bans = ''.join(sequence_wo_bans_list) sequence_wo_bans = ''.join(sequence_wo_bans_list)
#print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence) #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
break break
...@@ -221,10 +221,10 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -221,10 +221,10 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
the binaries are divided into parts of 5 bits the binaries are divided into parts of 5 bits
for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C)
3rd bit is converted depending on previous converted sequence : 3rd bit is converted depending on previous converted sequence :
(0:G, 1:C) or (0:A, 1:T), first to break potential homopolymere, if not then to adjust %GC of the previous sequence (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the previous sequence
4th 5th bits are normally converted into dna 4th 5th bits are normally converted into dna
this ensure that the total sequence cannot contain homopolymeres > 3, this ensure that the total sequence cannot contain homopolymers > 3,
the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base
warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding
...@@ -245,20 +245,22 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -245,20 +245,22 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
nucleotide_2 = bit_pair_to_dna[bits5[1:3]] nucleotide_2 = bit_pair_to_dna[bits5[1:3]]
nucleotide_3 = bit_pair_to_dna[bits5[3:5]] nucleotide_3 = bit_pair_to_dna[bits5[3:5]]
# make nucleotide 2 # make nucleotide 1
if len(sequence) >= 3: if len(sequence) >= 3:
# check 3 previous encoded bases # check 3 previous encoded bases
if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3: if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
# break homopolymere # break homopolymer
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1]) nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])
else: else:
# adjust GC% in the window of preceding bases + 2 new alpha bases # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3 check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"): if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_GC[bits5[0]] nucleotide_1 = bit_to_dna_GC[bits5[0]]
else: elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"):
nucleotide_1 = bit_to_dna_AT[bits5[0]] nucleotide_1 = bit_to_dna_AT[bits5[0]]
else: # strictly equal, just balance with preceding base
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
else: else:
nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2) nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
...@@ -271,13 +273,13 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -271,13 +273,13 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
# last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
if len(bit_rest) == 1: if len(bit_rest) == 1:
nucleotide_2 = "" nucleotide_2 = ""
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1]
else: # rest = 3 else: # rest = 3
nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits
homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer
if homopolymere_check_bool: if homopolymer_check_bool:
# break homopolymere # break homopolymer
nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1]) nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
else: else:
...@@ -360,14 +362,14 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: ...@@ -360,14 +362,14 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
# change a beta_base without changing the meaning of encoded data # change a beta_base without changing the meaning of encoded data
sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]] sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word] changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
# test the sequence only for the banned words and homopolymeres, ignore the GC% # test the sequence only for the banned words and homopolymers, ignore the GC%
if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False): if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
# keep the change that removes this ban word if it doesn't create homopolymeres or other ban words # keep the change that removes this ban word if it doesn't create homopolymers or other ban words
sequence_wo_bans = ''.join(sequence_wo_bans_list) sequence_wo_bans = ''.join(sequence_wo_bans_list)
#print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence) #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
break break
...@@ -387,7 +389,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: ...@@ -387,7 +389,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
def test_conversion(): def test_conversion():
for binary_size in range(10, 15000, 1): for binary_size in range(100, 15000, 1):
print("size",str(binary_size)) print("size",str(binary_size))
random_binary = "" random_binary = ""
for i in range(binary_size): for i in range(binary_size):
...@@ -404,10 +406,10 @@ def test_conversion(): ...@@ -404,10 +406,10 @@ def test_conversion():
#print("check abaab encoding", str(decoded_abaab == random_binary)) #print("check abaab encoding", str(decoded_abaab == random_binary))
print("check baa encoding", str(decoded_baa == random_binary)) print("check baa encoding", str(decoded_baa == random_binary))
if not (decoded_baa == random_binary): if not (decoded_baa == random_binary and sequence_control.sequence_check(seq_baa, min_GC=40, max_GC=60, verbose=True)):
print(random_binary) #print(random_binary)
print(seq_baa) print(seq_baa)
print(decoded_baa) #print(decoded_baa)
exit(1) exit(1)
#print("check abaab encoding") #print("check abaab encoding")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment