Mentions légales du service

Skip to content
Snippets Groups Projects
Commit eed3fa8c authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

rework ending of abaab conversion, simplified and more coherent, fixed rare...

rework ending of abaab conversion, simplified and more coherent, fixed rare error for banword removal, correction of comments
parent 5712f04e
No related branches found
No related tags found
No related merge requests found
import sys
import math
import random
import sequence_control
"""
......@@ -47,9 +49,8 @@ def binary_to_dna_abaab(binary_string: str) -> str:
#TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal
"""
if len(binary_string) % 2 != 0:
print("error binary dna conversion, need a binary string multiple of 2")
if len(binary_string) % 8 in [1, 4, 6]:
print("error binary dna conversion, need a binary string multiple of 8, or with a rest of 2, 3, 7 ("+str(len(binary_string) % 8)+")")
exit(0)
sequence = ""
......@@ -68,21 +69,21 @@ def binary_to_dna_abaab(binary_string: str) -> str:
sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5
# rest should be 0 because all documents contains a round number of octet
# but some "0" can be added to fill the fragments
bit_rest = binary_string[n_octets*8:n_octets*8+rest] # rest is 0-2-4-6 bits
bit_rest = binary_string[n_octets*8:n_octets*8+rest] # rest is 0-2-3-5-7 bits
# last conversions depends on the length of the rest, 2 bits = 1 base; 4 bits = 2 bases; 6 bits = 4 bases
# last conversions depends on the length of the rest, 2 bits = 1 base; 3 bits = 2 bases; 5 bits = 3 bases; 7 bits = 4 bases
if len(bit_rest) >= 2:
nucleotide_1 = bit_pair_to_dna[bit_rest[0:2]] # nucleotide_1 from a pair of bits
sequence += nucleotide_1
if len(bit_rest) == 4:
sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits
elif len(bit_rest) == 6:
if len(bit_rest) >= 3:
# nucleotide_2 from a single bit and depending on nucleotide 1
sequence += bit_to_dna_balance_GC(bit_rest[2], nucleotide_1)
sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits
sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit
if len(bit_rest) >= 5:
# nucleotide_3 from a pair of bits
sequence += bit_pair_to_dna[bit_rest[3:5]]
if len(bit_rest) == 7:
# nucleotide_4 from a pair of bits
sequence += bit_pair_to_dna[bit_rest[5:7]]
# remove the banned words
sequence_wo_banwords = remove_ban_words_abaab_encoding(sequence)
......@@ -111,17 +112,14 @@ def dna_to_binary_abaab(sequence: str) -> str:
if len(sequence_rest) >= 1: # 1 base -> 2 bits
binary_string += dna_to_bit_pair[sequence_rest[0]]
if len(sequence_rest) == 2: # 2 bases -> 2 bits + 2 bits
binary_string += dna_to_bit_pair[sequence_rest[1]]
if len(sequence_rest) >= 2: # 2 bases -> 2 bits + 1 bit
binary_string += dna_to_bit_ATGC[sequence_rest[1]]
elif len(sequence_rest) == 3:
# a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding base can be added to get a round number of blocks
binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2
if len(sequence_rest) >= 3: # 2 bases -> 2 bits + 1 bit + 2 bits
binary_string += dna_to_bit_pair[sequence_rest[2]]
elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit
binary_string += dna_to_bit_ATGC[sequence_rest[1]] # 1 base -> 1 bit
binary_string += dna_to_bit_pair[sequence_rest[2]] # 1 base -> 2 bits
binary_string += dna_to_bit_ATGC[sequence_rest[3]] # 1 base -> 1 bit
if len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 2 bit
binary_string += dna_to_bit_pair[sequence_rest[3]]
return binary_string
......@@ -140,24 +138,24 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"}
# check for banned words
indexes_ban_words = []
banword_indexes_list = [] # list of index start;index_stop of all banwords in the sequence
for banned_word in sequence_control.get_forbidden_sequences():
index = sequence.find(banned_word)
while index != -1:
indexes_ban_words.append([index, index+len(banned_word)])
banword_indexes_list.append([index, index+len(banned_word)])
index = sequence.find(banned_word, index+1)
if indexes_ban_words == []:
return sequence
# indexes of beta bases in the sequence (2nd and 5th base every 5 base window)
beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, len(sequence)//5) for i in [1,4] ]
# indexes of beta bases in the sequence (2nd and 5th base every 5 base window) # range start in negative for cases where the offset is > 0
beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, math.ceil(len(sequence)/5)) for i in [1,4] ]
sequence_wo_bans = sequence
#print(len(sequence))
#print(beta_indexes)
#print(banword_indexes_list)
for banned_words_indexes in indexes_ban_words:
for banword_index in banword_indexes_list:
# indexes of beta bases that can be changed to remove a banned word
potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ]
potential_changes_index = [ k for k in beta_indexes if k >= banword_index[0] and k < banword_index[1] ]
for change_index in potential_changes_index:
sequence_wo_bans_list = list(sequence_wo_bans) # turn into list because python strings are immutable
......@@ -165,8 +163,8 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
# change a beta_base without changing the meaning of encoded data
sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]]
index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after
index_before_ban_word = max(0, banword_index[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers
index_after_ban_word = min(len(sequence)-1, banword_index[1]+3) # test to 3 bases after
changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word]
......@@ -174,10 +172,10 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False):
# keep the change that removes this ban word if it doesn't create homopolymers or other ban words
sequence_wo_bans = ''.join(sequence_wo_bans_list)
#print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence)
#print("removed ban word", sequence[banword_index[0]:banword_index[1]],"->",changed_sub_sequence)
break
else:
#print("failed to remove ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"-X>",changed_sub_sequence)
#print("failed to remove ban word", sequence[banword_index[0]:banword_index[1]],"-X>",changed_sub_sequence)
#print("trying again...")
pass
......@@ -202,11 +200,13 @@ def size_dna_from_bit_len_abaab(bit_length):
return base_length
if rest == 2:
return base_length + 1
if rest == 4:
if rest == 3:
return base_length + 2
if rest == 6:
if rest == 5:
return base_length + 3
if rest == 7:
return base_length + 4
print("error size_of_dna_from_bit_len_abaab : size not multiple of 2 :",str(bit_length))
print("error size_of_dna_from_bit_len_abaab : invalid size rest :",str(bit_length),"(rest of",str(rest)+")")
exit(1)
......@@ -223,10 +223,12 @@ def size_binary_from_dna_len_abaab(dna_length):
return base_length
if rest == 1:
return base_length + 2
if rest == 2 or rest == 3: # if rest of 3, act like a rest of 2 because the third base will be ignored
return base_length + 4
if rest == 2:
return base_length + 3
if rest == 3:
return base_length + 5
if rest == 4:
return base_length + 6
return base_length + 7
def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
......@@ -248,7 +250,7 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
"""
if len(binary_string) % 5 not in [0, 1, 3]:
print("error binary dna conversion, need a binary string multiple of 5, or 1, 3 modulo 5")
print("error binary dna conversion, need a binary string multiple of 5, or with a rest of 1, 3")
return
sequence = ""
......@@ -362,7 +364,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
return sequence
# indexes of beta bases in the sequence (1st base every 3 base window)
beta_indexes = [ baa_method_offset + 3*k for k in range(len(sequence)//3)]
beta_indexes = [ baa_method_offset + 3*k for k in range(math.ceil(len(sequence)/3))]
sequence_wo_bans = sequence
......@@ -416,7 +418,7 @@ def size_dna_from_bit_len_baa(bit_length):
if rest == 3:
return base_length + 2
print("error size_of_dna_from_bit_len : invalid rest size:",str(rest))
print("error size_of_dna_from_bit_len_baa : invalid size rest :",str(bit_length),"(rest of",str(rest)+")")
exit(1)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment