diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index b2a9b726b2d50abe6149cfe9916091d1164e8942..d8a7e879f790105fcec432c0b14616b2abd84b53 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -1,5 +1,7 @@ import sys +import math import random + import sequence_control """ @@ -47,9 +49,8 @@ def binary_to_dna_abaab(binary_string: str) -> str: #TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal """ - - if len(binary_string) % 2 != 0: - print("error binary dna conversion, need a binary string multiple of 2") + if len(binary_string) % 8 in [1, 4, 6]: + print("error binary dna conversion, need a binary string multiple of 8, or with a rest of 2, 3, 7 ("+str(len(binary_string) % 8)+")") exit(0) sequence = "" @@ -68,21 +69,21 @@ def binary_to_dna_abaab(binary_string: str) -> str: sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5 # rest should be 0 because all documents contains a round number of octet # but some "0" can be added to fill the fragments - bit_rest = binary_string[n_octets*8:n_octets*8+rest] # rest is 0-2-4-6 bits + bit_rest = binary_string[n_octets*8:n_octets*8+rest] # rest is 0-2-3-5-7 bits - # last conversions depends on the length of the rest, 2 bits = 1 base; 4 bits = 2 bases; 6 bits = 4 bases + # last conversions depends on the length of the rest, 2 bits = 1 base; 3 bits = 2 bases; 5 bits = 3 bases; 7 bits = 4 bases if len(bit_rest) >= 2: nucleotide_1 = bit_pair_to_dna[bit_rest[0:2]] # nucleotide_1 from a pair of bits sequence += nucleotide_1 - if len(bit_rest) == 4: - sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits - elif len(bit_rest) == 6: - + if len(bit_rest) >= 3: # nucleotide_2 from a single bit and depending on nucleotide 1 sequence += bit_to_dna_balance_GC(bit_rest[2], nucleotide_1) - - sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits - sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit + if len(bit_rest) >= 5: + # nucleotide_3 from a pair of bits + sequence += bit_pair_to_dna[bit_rest[3:5]] + if len(bit_rest) == 7: + # nucleotide_4 from a pair of bits + sequence += bit_pair_to_dna[bit_rest[5:7]] # remove the banned words sequence_wo_banwords = remove_ban_words_abaab_encoding(sequence) @@ -111,17 +112,14 @@ def dna_to_binary_abaab(sequence: str) -> str: if len(sequence_rest) >= 1: # 1 base -> 2 bits binary_string += dna_to_bit_pair[sequence_rest[0]] - if len(sequence_rest) == 2: # 2 bases -> 2 bits + 2 bits - binary_string += dna_to_bit_pair[sequence_rest[1]] + if len(sequence_rest) >= 2: # 2 bases -> 2 bits + 1 bit + binary_string += dna_to_bit_ATGC[sequence_rest[1]] - elif len(sequence_rest) == 3: - # a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding base can be added to get a round number of blocks - binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2 + if len(sequence_rest) >= 3: # 2 bases -> 2 bits + 1 bit + 2 bits + binary_string += dna_to_bit_pair[sequence_rest[2]] - elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit - binary_string += dna_to_bit_ATGC[sequence_rest[1]] # 1 base -> 1 bit - binary_string += dna_to_bit_pair[sequence_rest[2]] # 1 base -> 2 bits - binary_string += dna_to_bit_ATGC[sequence_rest[3]] # 1 base -> 1 bit + if len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 2 bit + binary_string += dna_to_bit_pair[sequence_rest[3]] return binary_string @@ -140,24 +138,24 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"} # check for banned words - indexes_ban_words = [] + banword_indexes_list = [] # list of index start;index_stop of all banwords in the sequence for banned_word in sequence_control.get_forbidden_sequences(): index = sequence.find(banned_word) while index != -1: - indexes_ban_words.append([index, index+len(banned_word)]) + banword_indexes_list.append([index, index+len(banned_word)]) index = sequence.find(banned_word, index+1) - if indexes_ban_words == []: - return sequence - - # indexes of beta bases in the sequence (2nd and 5th base every 5 base window) - beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, len(sequence)//5) for i in [1,4] ] + # indexes of beta bases in the sequence (2nd and 5th base every 5 base window) # range start in negative for cases where the offset is > 0 + beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, math.ceil(len(sequence)/5)) for i in [1,4] ] sequence_wo_bans = sequence + #print(len(sequence)) + #print(beta_indexes) + #print(banword_indexes_list) - for banned_words_indexes in indexes_ban_words: + for banword_index in banword_indexes_list: # indexes of beta bases that can be changed to remove a banned word - potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ] + potential_changes_index = [ k for k in beta_indexes if k >= banword_index[0] and k < banword_index[1] ] for change_index in potential_changes_index: sequence_wo_bans_list = list(sequence_wo_bans) # turn into list because python strings are immutable @@ -165,8 +163,8 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str # change a beta_base without changing the meaning of encoded data sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]] - index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers - index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after + index_before_ban_word = max(0, banword_index[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers + index_after_ban_word = min(len(sequence)-1, banword_index[1]+3) # test to 3 bases after changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word] @@ -174,10 +172,10 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False): # keep the change that removes this ban word if it doesn't create homopolymers or other ban words sequence_wo_bans = ''.join(sequence_wo_bans_list) - #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence) + #print("removed ban word", sequence[banword_index[0]:banword_index[1]],"->",changed_sub_sequence) break else: - #print("failed to remove ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"-X>",changed_sub_sequence) + #print("failed to remove ban word", sequence[banword_index[0]:banword_index[1]],"-X>",changed_sub_sequence) #print("trying again...") pass @@ -202,11 +200,13 @@ def size_dna_from_bit_len_abaab(bit_length): return base_length if rest == 2: return base_length + 1 - if rest == 4: + if rest == 3: return base_length + 2 - if rest == 6: + if rest == 5: + return base_length + 3 + if rest == 7: return base_length + 4 - print("error size_of_dna_from_bit_len_abaab : size not multiple of 2 :",str(bit_length)) + print("error size_of_dna_from_bit_len_abaab : invalid size rest :",str(bit_length),"(rest of",str(rest)+")") exit(1) @@ -223,10 +223,12 @@ def size_binary_from_dna_len_abaab(dna_length): return base_length if rest == 1: return base_length + 2 - if rest == 2 or rest == 3: # if rest of 3, act like a rest of 2 because the third base will be ignored - return base_length + 4 + if rest == 2: + return base_length + 3 + if rest == 3: + return base_length + 5 if rest == 4: - return base_length + 6 + return base_length + 7 def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: @@ -248,7 +250,7 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: """ if len(binary_string) % 5 not in [0, 1, 3]: - print("error binary dna conversion, need a binary string multiple of 5, or 1, 3 modulo 5") + print("error binary dna conversion, need a binary string multiple of 5, or with a rest of 1, 3") return sequence = "" @@ -362,7 +364,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: return sequence # indexes of beta bases in the sequence (1st base every 3 base window) - beta_indexes = [ baa_method_offset + 3*k for k in range(len(sequence)//3)] + beta_indexes = [ baa_method_offset + 3*k for k in range(math.ceil(len(sequence)/3))] sequence_wo_bans = sequence @@ -416,7 +418,7 @@ def size_dna_from_bit_len_baa(bit_length): if rest == 3: return base_length + 2 - print("error size_of_dna_from_bit_len : invalid rest size:",str(rest)) + print("error size_of_dna_from_bit_len_baa : invalid size rest :",str(bit_length),"(rest of",str(rest)+")") exit(1)