diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index 98c63b7539c6b9588271108f9123761991ddd340..1510a0f0fc8a6f9587fac9f050918849f38b00e4 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -115,7 +115,7 @@ def dna_to_binary_abaab(sequence: str) -> str: binary_string += dna_to_bit_pair[sequence_rest[1]] elif len(sequence_rest) == 3: - # a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding A can be added to get a round number of blocks + # a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding base can be added to get a round number of blocks binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2 elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit @@ -188,27 +188,9 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str exit(1) return sequence_wo_bans - - -def size_binary_from_dna_len_abaab(dna_length): - """ - assume dna_abaab to binary conversion - get the length the binary string will be from the given dna sequence - conversion rate is 1.6bit/base - """ - base_length = 8 * (dna_length//5) # full conversion of octets - rest = dna_length % 5 - if rest == 0: - return base_length - if rest == 1: - return base_length + 2 - if rest == 2 or rest == 3: # if rest of 3, act like a rest of 2 because the third base will be ignored - return base_length + 4 - if rest == 4: - return base_length + 6 - + -def size_of_dna_from_bit_len_abaab(bit_length): +def size_dna_from_bit_len_abaab(bit_length): """ assume binary to dna_abaab conversion get the size the dna sequence will be from the given binary string length @@ -226,23 +208,42 @@ def size_of_dna_from_bit_len_abaab(bit_length): return base_length + 4 print("error size_of_dna_from_bit_len : size not multiple of 2 :",str(bit_length)) exit(1) - + + + +def size_binary_from_dna_len_abaab(dna_length): + """ + assume dna_abaab to binary conversion + get the 
length the binary string will be from the given dna sequence + conversion rate is 1.6bit/base + """ + base_length = 8 * (dna_length//5) # full conversion of octets + rest = dna_length % 5 + if rest == 0: + return base_length + if rest == 1: + return base_length + 2 + if rest == 2 or rest == 3: # if rest of 3, act like a rest of 2 because the third base will be ignored + return base_length + 4 + if rest == 4: + return base_length + 6 + def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: """ convert binaries into dna sequence with some properties the binaries are divided into parts of 5 bits - for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) - 3rd bit is converted depending on previous converted sequence : - (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the previous sequence - 4th 5th bits are normally converted into dna + for each 5 bits part, 1st bit is converted depending on previous converted sequence : + (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the local sequence in a defined window + 2nd 3rd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) + 4th 5th bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) this ensure that the total sequence cannot contain homopolymers > 3, the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding ex : a rest of 0 -> A, a rest of 00 -> A - so how to decode A ? + so how to decode a base A ? 
def size_dna_from_bit_len_baa(bit_length):
    """
    Return the length of the dna sequence produced by a binary -> dna_baa conversion.

    Every full group of 5 bits is counted as 3 bases (about 1.66 bit/base).
    A remainder of 1 bit adds 1 base and a remainder of 3 bits adds 2 bases;
    any other remainder (2 or 4 bits) is rejected.

    :param bit_length: length of the binary string to convert
    :return: length of the corresponding dna sequence
    :raises SystemExit: with exit code 1, after printing an error message,
        when bit_length modulo 5 is not 0, 1 or 3
    """
    full_blocks, rest = divmod(bit_length, 5)  # 5 bits -> 3 bases
    extra_bases = {0: 0, 1: 1, 3: 2}  # leftover bits -> additional bases
    if rest in extra_bases:
        return 3 * full_blocks + extra_bases[rest]

    print("error size_dna_from_bit_len_baa : invalid rest size:", str(rest))
    # raise SystemExit directly instead of calling exit(): the exit() builtin
    # is injected by the site module and is not guaranteed to exist
    # (python -S, frozen builds); the exit code seen by callers is unchanged
    raise SystemExit(1)


def size_binary_from_dna_len_baa(dna_length):
    """
    Return the length of the binary string produced by a dna_baa -> binary conversion.

    Every full group of 3 bases is counted as 5 bits (about 1.66 bit/base).
    A remainder of 1 base adds 1 bit and a remainder of 2 bases adds 3 bits.

    :param dna_length: length of the dna sequence to convert
    :return: length of the corresponding binary string
    """
    full_blocks, rest = divmod(dna_length, 3)  # 3 bases -> 5 bits
    extra_bits = {0: 0, 1: 1, 2: 3}  # leftover bases -> additional bits
    if rest in extra_bits:
        return 5 * full_blocks + extra_bits[rest]
    return None  # only reachable for a non-integer length
#print(random_binary) print(seq_baa) #print(decoded_baa) - exit(1) + exit(1)""" #print("check abaab encoding") #sequence_control.sanity_check(seq_abaab)