def bit_to_dna_balance_GC(bit_char: str, reference_base: str):
    """
    Encode one bit as a DNA base, picking the alphabet from a reference base.

    When the reference base is "A" or "T" the bit is looked up in
    bit_to_dna_GC; for any other reference base it is looked up in
    bit_to_dna_AT.
    """
    # select the lookup table first, then translate the bit once
    lookup_table = bit_to_dna_GC if reference_base in ["A", "T"] else bit_to_dna_AT
    return lookup_table[bit_char]
def dna_to_binary_abaab(sequence: str) -> str:
    """
    Decode a DNA sequence back into a bit string (opposite of binary_to_dna_abaab).

    Each complete block of 5 bases gives 8 bits: bases 1, 3 and 4 are read
    through dna_to_bit_pair and bases 2 and 5 through dna_to_bit_ATGC.
    A trailing incomplete block of 1 to 4 bases is decoded with the shorter
    layouts commented below.
    """
    decoded_parts = []

    # decode every complete block of 5 bases
    full_length = len(sequence) - len(sequence) % 5
    for start in range(0, full_length, 5):
        base_1, base_2, base_3, base_4, base_5 = sequence[start:start + 5]
        decoded_parts.append(
            dna_to_bit_pair[base_1]      # bits 1-2
            + dna_to_bit_ATGC[base_2]    # bit 3
            + dna_to_bit_pair[base_3]    # bits 4-5
            + dna_to_bit_pair[base_4]    # bits 6-7
            + dna_to_bit_ATGC[base_5]    # bit 8
        )

    # decode the trailing bases that do not fill a block of 5
    tail = sequence[full_length:]
    tail_length = len(tail)

    if tail_length:
        # the first base of the tail always comes from a pair of bits
        decoded_parts.append(dna_to_bit_pair[tail[0]])

    if tail_length in (2, 3):
        # 2 bases -> 2 bits + 2 bits
        # a tail of 3 should not occur from a binary to dna abaab conversion,
        # but a non coding A can be added to get a round number of blocks,
        # so the third base is ignored and the tail is decoded as if it was 2
        decoded_parts.append(dna_to_bit_pair[tail[1]])
    elif tail_length == 4:
        # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit
        decoded_parts.append(dna_to_bit_ATGC[tail[1]])
        decoded_parts.append(dna_to_bit_pair[tail[2]])
        decoded_parts.append(dna_to_bit_ATGC[tail[3]])

    return "".join(decoded_parts)