Commit dbab2db1 authored by BOULLE Olivier

banword removal for baa

parent 8aa73337
@@ -151,20 +151,18 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
     if the original sequence has already been fragmented, the 5 bases windows of the abaab method can be offset, abaab_method_offset is used to correct this
     """
     forbidden_sequences = ["GGTCTC", "GAGACC"]
     # change a beta base but keep the binary meaning for the decoding
     dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"}
     # check for banned words
-    indexs_ban_words = []
+    indexes_ban_words = []
     for banned_word in sequence_control.get_forbidden_sequences():
         index = sequence.find(banned_word)
         while index != -1:
-            indexs_ban_words.append([index, index+len(banned_word)])
+            indexes_ban_words.append([index, index+len(banned_word)])
             index = sequence.find(banned_word, index+1)
-    if indexs_ban_words == []:
+    if indexes_ban_words == []:
         return sequence
     # indexes of beta bases in the sequence (2nd and 5th base every 5 base window)
@@ -172,7 +170,7 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
     sequence_wo_bans = sequence
-    for banned_words_indexes in indexs_ban_words:
+    for banned_words_indexes in indexes_ban_words:
         # indexes of beta bases that can be changed to remove a banned word
         potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ]
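For reference, the banned-word scan shared by both removal functions can be exercised in isolation. The sketch below is not from the repository: find_ban_word_indexes is a hypothetical helper name, and the two hard-coded motifs from the forbidden_sequences list above stand in for sequence_control.get_forbidden_sequences().

forbidden_sequences = ["GGTCTC", "GAGACC"]

def find_ban_word_indexes(sequence: str) -> list:
    """Collect [start, end) index pairs for every occurrence of a forbidden motif."""
    indexes_ban_words = []
    for banned_word in forbidden_sequences:
        index = sequence.find(banned_word)
        while index != -1:
            indexes_ban_words.append([index, index + len(banned_word)])
            # restart the search one base later so overlapping occurrences are not missed
            index = sequence.find(banned_word, index + 1)
    return indexes_ban_words

print(find_ban_word_indexes("ATGGTCTCAAGAGACCT"))  # [[2, 8], [10, 16]]

Restarting at index+1 rather than index+len(banned_word) is what lets the scan report occurrences that overlap a previous hit.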
@@ -288,8 +286,74 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str
         sequence += nucleotide_1 + nucleotide_2
-    return sequence
+    # remove the banned words
+    sequence_wo_banwords = remove_ban_words_baa_encoding(sequence)
+    return sequence_wo_banwords
+
+def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
+    """
+    remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_baa() method
+    the baa method ensures a GC% in [33;66] over every 3-base window, but the real constraint is checked on larger windows,
+    so changing a single base to avoid a banned word should not affect the whole sequence too much
+    the changed base must be a beta base, swapped as A<=>G or T<=>C, so decoding the updated sequence still returns the same result
+    if the original sequence has already been fragmented, the 3-base windows of the baa method can be offset, baa_method_offset is used to correct this
+    """
+    # change a beta base but keep the binary meaning for the decoding
+    dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"}
+    # check for banned words
+    indexes_ban_words = []
+    for banned_word in sequence_control.get_forbidden_sequences():
+        index = sequence.find(banned_word)
+        while index != -1:
+            indexes_ban_words.append([index, index+len(banned_word)])
+            index = sequence.find(banned_word, index+1)
+    if indexes_ban_words == []:
+        return sequence
+    # indexes of beta bases in the sequence (1st base of every 3-base window)
+    beta_indexes = [ baa_method_offset + 3*k for k in range(len(sequence)//3)]
+    sequence_wo_bans = sequence
+    for banned_words_indexes in indexes_ban_words:
+        # indexes of beta bases that can be changed to remove a banned word
+        potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ]
+        for change_index in potential_changes_index:
+            sequence_wo_bans_list = list(sequence_wo_bans) # turn into a list because python strings are immutable
+            # change a beta base without changing the meaning of the encoded data
+            sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
+            index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creating homopolymers
+            index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test up to 3 bases after
+            changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
+            # test the sub-sequence only for banned words and homopolymers, ignore the GC%
+            if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
+                # keep the change that removes this ban word if it doesn't create homopolymers or other ban words
+                sequence_wo_bans = ''.join(sequence_wo_bans_list)
+                #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
+                break
+            else:
+                #print("failed to remove ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"-X>",changed_sub_sequence)
+                #print("trying again...")
+                pass
+    # last check for very odd cases (maybe overlapping ban words)
+    for banned_word in sequence_control.get_forbidden_sequences():
+        if banned_word in sequence_wo_bans:
+            print("binary conversion: unable to remove forbidden sequences", sequence_wo_bans)
+            exit(1)
+    return sequence_wo_bans
def test_conversion():
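The substitution step of the new function can also be illustrated in isolation. The sketch below is an assumption-laden stand-in, not the repository's code: try_remove_ban_word and has_homopolymer are hypothetical names, and the simple run-length test (max_run=3) merely approximates what sequence_control.sequence_check does; the A<=>G / T<=>C swap table, the 3-base flanking window, and the beta indexes at every third position do come from the diff above.

FORBIDDEN = ["GGTCTC", "GAGACC"]                       # motifs hard-coded earlier in the file
DNA_CHANGE_BETA = {"A": "G", "T": "C", "G": "A", "C": "T"}

def has_homopolymer(seq: str, max_run: int = 3) -> bool:
    """Return True if any base repeats more than max_run times in a row (assumed threshold)."""
    run, prev = 0, ""
    for base in seq:
        run = run + 1 if base == prev else 1
        prev = base
        if run > max_run:
            return True
    return False

def try_remove_ban_word(sequence: str, start: int, end: int, beta_indexes: list) -> str:
    """Flip one beta base inside sequence[start:end] and keep the first change whose
    local neighbourhood contains neither a forbidden motif nor a homopolymer."""
    for change_index in (k for k in beta_indexes if start <= k < end):
        candidate = list(sequence)                     # strings are immutable, work on a list
        candidate[change_index] = DNA_CHANGE_BETA[candidate[change_index]]
        lo, hi = max(0, start - 3), min(len(sequence), end + 3)
        window = "".join(candidate)[lo:hi]             # re-check 3 bases on each side of the word
        if not has_homopolymer(window) and not any(w in window for w in FORBIDDEN):
            return "".join(candidate)                  # first safe change wins
    return sequence                                    # no safe change found, leave untouched

# GGTCTC sits at [3, 9); with baa windows of 3 bases, beta indexes fall at 0, 3, 6, ...
seq = "ATCGGTCTCAAT"
print(try_remove_ban_word(seq, 3, 9, list(range(0, len(seq), 3))))  # -> ATCAGTCTCAAT

Because the swap stays within the A<=>G and T<=>C pairs, the binary value carried by the beta base is unchanged, which is why the repair can run after encoding without touching the payload.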