diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index b122af1fcd57baadf82765d47d961462af4f35c3..65c954f00c1f42702dcf2c7cf5ed9c9c033237d9 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -61,7 +61,7 @@ def binary_to_dna_abaab(binary_string: str) -> str: 4th 5th bits are normally converted into dna 6th 7th bits are normally converted into dna 8th bit is converted depending on previous conversion : if previous in {AT}, then (0:G, 1:C), elif in {GC}, then (0:A, 1:T) - this ensure that the total sequence cannot contain homopolymeres > 3, + this ensure that the total sequence cannot contain homopolymers > 3, the GC% is in [40%, 60%] in a window of 5 and the conversion rate is 1.6 bit/base warning : need the binary string to be a multiple of 2, or rest is inconsistent with decoding @@ -190,14 +190,14 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str # change a beta_base without changing the meaning of encoded data sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]] - index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres + index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word] - # test the sequence only for the banned words and homopolymeres, ignore the GC% + # test the sequence only for the banned words and homopolymers, ignore the GC% if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False): - # keep the change that removes this ban word if it doesn't create homopolymeres or other ban words + # keep the change that removes this ban word if it doesn't create homopolymers or other ban words sequence_wo_bans = ''.join(sequence_wo_bans_list) #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence) break @@ -221,10 +221,10 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: the binaries are divided into parts of 5 bits for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) 3rd bit is converted depending on previous converted sequence : - (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymere, if not then to adjust %GC of the previous sequence + (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the previous sequence 4th 5th bits are normally converted into dna - this ensure that the total sequence cannot contain homopolymeres > 3, + this ensure that the total sequence cannot contain homopolymers > 3, the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding @@ -245,20 +245,22 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: nucleotide_2 = bit_pair_to_dna[bits5[1:3]] nucleotide_3 = bit_pair_to_dna[bits5[3:5]] - # make nucleotide 2 + # make nucleotide 1 if len(sequence) >= 3: # check 3 previous encoded bases if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3: - # break homopolymere + # break homopolymer nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1]) else: - # adjust GC% in the window of preceding bases + 2 new alpha bases + # if no homopolymer to break, adjust GC% in the window of preceding bases + 2 new alpha bases check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3 if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"): nucleotide_1 = bit_to_dna_GC[bits5[0]] - else: + elif check_window.count("A")+check_window.count("T") < check_window.count("G")+check_window.count("C"): nucleotide_1 = bit_to_dna_AT[bits5[0]] + else: # strictly equal, just balance with preceding base + nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2) else: nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2) @@ -271,13 +273,13 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases if len(bit_rest) == 1: nucleotide_2 = "" - homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] + homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] else: # rest = 3 nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits - homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere + homopolymer_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymer - if homopolymere_check_bool: - # break homopolymere + if homopolymer_check_bool: + # break homopolymer nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1]) else: @@ -360,14 +362,14 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: # change a beta_base without changing the meaning of encoded data sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]] - index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres + index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymers index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after changed_sub_sequence = ''.join(sequence_wo_bans_list)[index_before_ban_word:index_after_ban_word] - # test the sequence only for the banned words and homopolymeres, ignore the GC% + # test the sequence only for the banned words and homopolymers, ignore the GC% if sequence_control.sequence_check(changed_sub_sequence, window_size=60, min_GC=0, max_GC=100, verbose=False): - # keep the change that removes this ban word if it doesn't create homopolymeres or other ban words + # keep the change that removes this ban word if it doesn't create homopolymers or other ban words sequence_wo_bans = ''.join(sequence_wo_bans_list) #print("removed ban word", sequence[banned_words_indexes[0]:banned_words_indexes[1]],"->",changed_sub_sequence) break @@ -387,7 +389,7 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: def test_conversion(): - for binary_size in range(10, 15000, 1): + for binary_size in range(100, 15000, 1): print("size",str(binary_size)) random_binary = "" for i in range(binary_size): @@ -404,10 +406,10 @@ def test_conversion(): #print("check abaab encoding", str(decoded_abaab == random_binary)) print("check baa encoding", str(decoded_baa == random_binary)) - if not (decoded_baa == random_binary): - print(random_binary) + if not (decoded_baa == random_binary and sequence_control.sequence_check(seq_baa, min_GC=40, max_GC=60, verbose=True)): + #print(random_binary) print(seq_baa) - print(decoded_baa) + #print(decoded_baa) exit(1) #print("check abaab encoding")