diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index 84576f8522c9e363415cc89b9ab7502d8b71b27c..b122af1fcd57baadf82765d47d961462af4f35c3 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -68,6 +68,9 @@ def binary_to_dna_abaab(binary_string: str) -> str: ex : a rest of 0 -> A, a rest of 00 -> A so how to decode A ? only allowing rests multiple of 2 removes ambiguity + + #TODO warning : if ending with bsaI -> end can currently be aa when a rest of 4 bits -> will break with banword removal + """ if len(binary_string) % 2 != 0: @@ -227,12 +230,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding ex : a rest of 0 -> A, a rest of 00 -> A so how to decode A ? - only allowing rests multiple of 5 removes ambiguity (also possible 0,1 or 3 modulo 5) + only allowing rests multiple of 5 removes ambiguity (also possible 0, 1 or 3 modulo 5) """ - if len(binary_string) % 5 != 0: - print("error binary dna conversion, need a binary string multiple of 5") - exit(0) + if len(binary_string) % 5 not in [0, 1, 3]: + print("error binary dna conversion, need a binary string multiple of 5, or 1, 3 modulo 5") + return sequence = "" n_quintuplets, rest = divmod(len(binary_string), 5) @@ -266,12 +269,12 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: if len(bit_rest) > 0: # last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases - if len(bit_rest) == 3: + if len(bit_rest) == 1: + nucleotide_2 = "" + homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] + else: # rest = 3 nucleotide_2 = bit_pair_to_dna[bit_rest[1:3]] # nucleotide_2 from a pair of bits homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 # true => careful for a potential homopolymere - else: - nucleotide_2 = "" - homopolymere_check_bool = sequence[-3] == sequence[-2] == sequence[-1] if homopolymere_check_bool: # break homopolymere @@ -281,9 +284,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: # adjust GC% check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"): - nucleotide_1 = bit_to_dna_GC[bits5[0]] + nucleotide_1 = bit_to_dna_GC[bit_rest[0]] else: - nucleotide_1 = bit_to_dna_AT[bits5[0]] + nucleotide_1 = bit_to_dna_AT[bit_rest[0]] sequence += nucleotide_1 + nucleotide_2 @@ -384,21 +387,27 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: def test_conversion(): - for binary_size in range(10,15000, 10): - print(binary_size) + for binary_size in range(10, 15000, 1): + print("size",str(binary_size)) random_binary = "" for i in range(binary_size): random_binary += random.Random().choice(['0', '1']) - seq_abaab = binary_to_dna_abaab(random_binary) + #random_binary = "00011110010" + #seq_abaab = binary_to_dna_abaab(random_binary) seq_baa = binary_to_dna_baa(random_binary) + if seq_baa is None: + continue - decoded_abaab = dna_to_binary_abaab(seq_abaab) + #decoded_abaab = dna_to_binary_abaab(seq_abaab) decoded_baa = dna_to_binary_baa(seq_baa) - print("check abaab encoding", str(decoded_abaab == random_binary)) + #print("check abaab encoding", str(decoded_abaab == random_binary)) print("check baa encoding", str(decoded_baa == random_binary)) - if not (decoded_baa == random_binary and decoded_abaab == random_binary ): + if not (decoded_baa == random_binary): + print(random_binary) + print(seq_baa) + print(decoded_baa) exit(1) #print("check abaab encoding")