diff --git a/binary_dna_conversion.py b/binary_dna_conversion.py index d143dac7e851ebcd8aedc426e4ccc99097431282..09e0bfcbf18dd27343f51541fb42451e0b3dced3 100755 --- a/binary_dna_conversion.py +++ b/binary_dna_conversion.py @@ -1,4 +1,5 @@ import sys +import random import hashing import sequence_control @@ -11,10 +12,10 @@ bit_pair_to_dna = {"00": "A", "01": "G", "10": "T", "11": "C"} # "_" is for missing nucleotides dna_to_bit_pair = {"A": "00", "G": "01", "T" :"10", "C": "11", "_" :"00"} -bit_to_dna_x = {"0": "A", "1": "T"} -bit_to_dna_y = {"0": "G", "1": "C"} +bit_to_dna_AT = {"0": "A", "1": "T"} +bit_to_dna_GC = {"0": "G", "1": "C"} -dna_to_bit_xy = {"A": "0", "T": "1", "G": "0", "C": "1", "_": "0"} +dna_to_bit_ATGC = {"A": "0", "T": "1", "G": "0", "C": "1", "_": "0"} def binary_to_dna(binary_string: str) -> str: @@ -41,64 +42,7 @@ def dna_to_binary(sequence: str) -> str: return binary_string -def binary_to_dna_xy(binary_string: str) -> str: - """ - convert binaries into dna sequence with some properties - the binaries are divided into parts of 6 bits - for each 6 bits part, 1st bit : (0:A, 1:T) - 2nd bit : (0:G, 1:C) - 3rd 4th bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) - 5th 6th bits are normally converted into dna - this ensure that the total sequence cannot contain homopolymeres > 3, - the GC% is in [25%, 75%] and the conversion rate is 1.5bit/base - """ - - sequence = "" - n_sextuplet, rest = divmod(len(binary_string), 6) - for i in range(0, n_sextuplet): - bits6 = binary_string[i*6:(i+1)*6] - nucleotide_1 = bit_to_dna_x[bits6[0]] - nucleotide_2 = bit_to_dna_y[bits6[1]] - nucleotide_3 = bit_pair_to_dna[bits6[2:4]] - nucleotide_4 = bit_pair_to_dna[bits6[4:6]] - sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 - - bit_rest = binary_string[n_sextuplet*6:n_sextuplet*6+rest] # rest is 0-2-4 bits - if len(bit_rest) >= 2: - sequence += bit_to_dna_x[bit_rest[0]] + bit_to_dna_y[bit_rest[1]] - if len(bit_rest) == 4: - sequence += bit_pair_to_dna[bit_rest[2:4]] - - return sequence - - -def dna_to_binary_xy(sequence: str) -> str: - """ - convert back a sequence to binaries (opposite of binary_to_dna_xy) - #TODO possibility to detect and solve some errors - """ - - binary_string = "" - n_quadruplet, rest = divmod(len(sequence), 4) - for i in range(0, n_quadruplet): - nucleotides4 = sequence[i*4:(i+1)*4] - bit_1 = dna_to_bit_xy[nucleotides4[0]] # should only be A or T #TODO CG equals error - bit_2 = dna_to_bit_xy[nucleotides4[1]] # should only be G or C #TODO AT equals error - bits_3456 = dna_to_bit_pair[nucleotides4[2]] + dna_to_bit_pair[nucleotides4[3]] - binary_string += bit_1 + bit_2 + bits_3456 - - sequence_rest = sequence[n_quadruplet*4:n_quadruplet*4+rest] - if len(sequence_rest) >= 1: - binary_string += dna_to_bit_xy[sequence_rest[0]] - if len(sequence_rest) >= 2: - binary_string += dna_to_bit_xy[sequence_rest[1]] - if len(sequence_rest) >= 3: - binary_string += dna_to_bit_pair[sequence_rest[2]] - - return binary_string - - -def binary_to_dna_z(binary_string: str) -> str: +def binary_to_dna_abaab(binary_string: str) -> str: """ convert binaries into dna sequence with some properties the binaries are divided into parts of 8 bits @@ -108,9 +52,9 @@ def binary_to_dna_z(binary_string: str) -> str: 6th 7th bits are normally converted into dna 8th bit is converted depending on previous conversion : if previous in {AT}, then (0:G, 1:C), elif in {GC}, then (0:A, 1:T) this ensure that the total sequence cannot contain homopolymeres > 3, - the GC% is in [40%, 60%] and the conversion rate is 1.6bit/base + the GC% is in [40%, 60%] in a window of 5 and the conversion rate is 1.6 bit/base - warning : need the binary string to be a multiple of 2, or rest is inconsitant with decoding + warning : need the binary string to be a multiple of 2, or rest is inconsistent with decoding ex : a rest of 0 -> A, a rest of 00 -> A so how to decode A ? only allowing rests multiple of 2 removes ambiguity @@ -126,15 +70,15 @@ def binary_to_dna_z(binary_string: str) -> str: bits8 = binary_string[i*8:(i+1)*8] nucleotide_1 = bit_pair_to_dna[bits8[0:2]] if nucleotide_1 in ["A", "T"]: - nucleotide_2 = bit_to_dna_y[bits8[2]] + nucleotide_2 = bit_to_dna_GC[bits8[2]] else: - nucleotide_2 = bit_to_dna_x[bits8[2]] + nucleotide_2 = bit_to_dna_AT[bits8[2]] nucleotide_3 = bit_pair_to_dna[bits8[3:5]] nucleotide_4 = bit_pair_to_dna[bits8[5:7]] if nucleotide_4 in ["A", "T"]: - nucleotide_5 = bit_to_dna_y[bits8[7]] + nucleotide_5 = bit_to_dna_GC[bits8[7]] else: - nucleotide_5 = bit_to_dna_x[bits8[7]] + nucleotide_5 = bit_to_dna_AT[bits8[7]] sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5 # rest should be 0 because all documents contains a round number of octet # but some "0" can be added to fill the fragments @@ -148,22 +92,22 @@ def binary_to_dna_z(binary_string: str) -> str: sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits elif len(bit_rest) == 6: if nucleotide_1 in ["A", "T"]: - sequence += bit_to_dna_y[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 + sequence += bit_to_dna_GC[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 else: - sequence += bit_to_dna_x[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 + sequence += bit_to_dna_AT[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits - sequence += bit_to_dna_x[bit_rest[5]] # nucleotide_4 from a single bit + sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit # remove the banned words - sequence_wo_banwords = remove_ban_words_z_encoding(sequence) + sequence_wo_banwords = remove_ban_words_abaab_encoding(sequence) return sequence_wo_banwords -def dna_to_binary_z(sequence: str) -> str: +def dna_to_binary_abaab(sequence: str) -> str: """ - convert back a sequence to binaries (opposite of binary_to_dna_z) + convert back a sequence to binaries (opposite of binary_to_dna_abaab) """ binary_string = "" @@ -171,10 +115,10 @@ def dna_to_binary_z(sequence: str) -> str: for i in range(0, n_quintuplet): nucleotides5 = sequence[i*5:(i+1)*5] bits_12 = dna_to_bit_pair[nucleotides5[0]] - bit_3 = dna_to_bit_xy[nucleotides5[1]] + bit_3 = dna_to_bit_ATGC[nucleotides5[1]] bits_45 = dna_to_bit_pair[nucleotides5[2]] bits_67 = dna_to_bit_pair[nucleotides5[3]] - bit_8 = dna_to_bit_xy[nucleotides5[4]] + bit_8 = dna_to_bit_ATGC[nucleotides5[4]] binary_string += bits_12 + bit_3 + bits_45 + bits_67 + bit_8 # handle rest < 5 @@ -190,29 +134,27 @@ def dna_to_binary_z(sequence: str) -> str: binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2 elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit - binary_string += dna_to_bit_xy[sequence_rest[1]] # 1 base -> 1 bit + binary_string += dna_to_bit_ATGC[sequence_rest[1]] # 1 base -> 1 bit binary_string += dna_to_bit_pair[sequence_rest[2]] # 1 base -> 2 bits - binary_string += dna_to_bit_xy[sequence_rest[3]] # 1 base -> 1 bit + binary_string += dna_to_bit_ATGC[sequence_rest[3]] # 1 base -> 1 bit return binary_string - - -def remove_ban_words_z_encoding(sequence: str, z_method_offset=0) -> str: +def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str: """ - remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_z() method - the z method ensure a GC% in [40;60] on a window of 5 bases (&z&&z), but we the constraint is on larger windows + remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_abaab() method + the abaab method ensure a GC% in [40;60] on a window of 5 bases (&z&&z), but we the constraint is on larger windows changing a base to avoid a banned word should not affect the whole sequence too much - the changed base must be a z base, and be A<=>G; T<=>C, so the decoding of the updated sequence will still return the same result + the changed base must be a beta base, and be A<=>G; T<=>C, so the decoding of the updated sequence will still return the same result - if the original sequence has already been fragmented, the 5 bases windows of the z method can be offset, z_method_offset is used to correct this + if the original sequence has already been fragmented, the 5 bases windows of the abaab method can be offset, abaab_method_offset is used to correct this """ forbidden_sequences = ["GGTCTC", "GAGACC"] - # change a z base but keep the binary meaning for the decoding - dna_change_z = {"A": "G", "T": "C", "G": "A", "C": "T"} + # change a beta base but keep the binary meaning for the decoding + dna_change_beta = {"A": "G", "T": "C", "G": "A", "C": "T"} # check for banned words indexs_ban_words = [] @@ -225,20 +167,20 @@ def remove_ban_words_z_encoding(sequence: str, z_method_offset=0) -> str: if indexs_ban_words == []: return sequence - # indexes of z bases in the sequence (2nd and 5th base every 5 base window) - z_indexes = [ z_method_offset+ 5*k+i for k in range(-1, len(sequence)//5) for i in [1,4] ] + # indexes of beta bases in the sequence (2nd and 5th base every 5 base window) + beta_indexes = [ abaab_method_offset+ 5*k+i for k in range(-1, len(sequence)//5) for i in [1,4] ] sequence_wo_bans = sequence for banned_words_indexes in indexs_ban_words: - # indexes of z bases that can be changed to remove a banned word - potential_changes_index = [ k for k in z_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ] + # indexes of beta bases that can be changed to remove a banned word + potential_changes_index = [ k for k in beta_indexes if k >= banned_words_indexes[0] and k < banned_words_indexes[1] ] for change_index in potential_changes_index: sequence_wo_bans_list = list(sequence_wo_bans) # turn into list because python strings are immutable - # change a z_base without changing the meaning of encoded data - sequence_wo_bans_list[change_index] = dna_change_z[sequence_wo_bans[change_index]] + # change a beta_base without changing the meaning of encoded data + sequence_wo_bans_list[change_index] = dna_change_beta[sequence_wo_bans[change_index]] index_before_ban_word = max(0, banned_words_indexes[0]-3) # test the changed part from 3 bases before to avoid creation of homopolymeres index_after_ban_word = min(len(sequence)-1, banned_words_indexes[1]+3) # test to 3 bases after @@ -265,12 +207,102 @@ def remove_ban_words_z_encoding(sequence: str, z_method_offset=0) -> str: return sequence_wo_bans +def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: + """ + convert binaries into dna sequence with some properties + the binaries are divided into parts of 5 bits + for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C) + 3rd bit is converted depending on previous converted sequence : + (0:G, 1:C) or (0:A, 1:T), first to break potential homopolymere, if not then to adjust %GC of the previous sequence + 4th 5th bits are normally converted into dna + + this ensure that the total sequence cannot contain homopolymeres > 3, + the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base + + warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding + ex : a rest of 0 -> A, a rest of 00 -> A + so how to decode A ? + only allowing rests multiple of 5 removes ambiguity (also possible 0,2 or 3 modulo 5) + + """ + if len(binary_string) % 5 != 0: + print("error binary dna conversion, need a binary string multiple of 5") + exit(0) + + sequence = "" + n_quintuplets, rest = divmod(len(binary_string), 5) + for i in range(0, n_quintuplets): + bits5 = binary_string[i*5:(i+1)*5] + + nucleotide_2 = bit_pair_to_dna[bits5[1:3]] + nucleotide_3 = bit_pair_to_dna[bits5[3:5]] + + # make nucleotide 2 + if len(sequence) >= 3: + # check 3 previous encoded bases + if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3: + # break homopolymere + if sequence[-1] in ["A", "T"]: + nucleotide_1 = bit_to_dna_GC[bits5[0]] + else: + nucleotide_1 = bit_to_dna_AT[bits5[0]] + else: + # adjust GC% + check_window = sequence[-min(len(sequence), GC_window):] + if check_window.count("A")+check_window.count("T") > check_window.count("G")+check_window.count("C"): + nucleotide_1 = bit_to_dna_GC[bits5[0]] + else: + nucleotide_1 = bit_to_dna_AT[bits5[0]] + + else: + if nucleotide_2 in ["A", "T"]: + nucleotide_1 = bit_to_dna_GC[bits5[0]] + else: + nucleotide_1 = bit_to_dna_AT[bits5[0]] + + sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + + bit_rest = binary_string[n_quintuplets*5:n_quintuplets*5+rest] # rest is 0-2-3 bits + + # last conversions depends on the length of the rest, 2 bits = 1 base; 3 bits = 2 bases + if len(bit_rest) >= 2: + nucleotide_1 = bit_pair_to_dna[bit_rest[0:2]] # nucleotide_1 from a pair of bits + sequence += nucleotide_1 + if len(bit_rest) == 3: + if nucleotide_1 in ["A", "T"]: + sequence += bit_to_dna_GC[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 + else: + sequence += bit_to_dna_AT[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 + + return sequence + + +def test_conversion(): + + binary_size = 15000 + + random_binary = "" + for i in range(binary_size): + random_binary += random.Random().choice(['0', '1']) + + seq_z = binary_to_dna_abaab(random_binary) + seq_aba = binary_to_dna_aba(random_binary) + seq_baa = binary_to_dna_baa(random_binary) + + print("check z encoding") + sequence_control.sanity_check(seq_z) + print("check aba encoding") + sequence_control.sanity_check(seq_aba) + print("check baa encoding") + sequence_control.sanity_check(seq_baa) + + # =================== main ======================= # if __name__ == '__main__': #doc_path = sys.argv[1] - binary_string = sys.argv[1] - seq = binary_to_dna_z(binary_string) + #binary_string = sys.argv[1] + #seq = binary_to_dna_abaab(binary_string) #print(seq) - #print(dna_to_binary_z(seq)) - + #print(dna_to_binary_abaab(seq)) + test_conversion() diff --git a/file_to_dna.py b/file_to_dna.py index a84ab59c6ebf058e7c943f89d40547d68c98696a..d5cc0f457f7ce02a32a983e409ec4aa1739f1c13 100755 --- a/file_to_dna.py +++ b/file_to_dna.py @@ -90,7 +90,7 @@ def convert_file_to_bits(input_path: str) -> str: def size_of_dna_from_bit_len(bit_length): """ - assume binary to dna_z conversion + assume binary to dna_abaab conversion get the size the dna sequence will be from the given binary string length conversion rate is 1.6 bit/base """ @@ -124,7 +124,7 @@ def get_max_binary_len() -> int: def size_binary_from_dna_len(dna_length): """ - assume dna_z to binary conversion + assume dna_abaab to binary conversion get the length the binary string will be from the given dna sequence conversion rate is 1.6bit/base """ @@ -216,7 +216,7 @@ def encode_file(input_path: str, output_path: str) -> None: filtered_binary_string += binary_check_sum # convert binaries into dna sequence - sequence = bdc.binary_to_dna_z(filtered_binary_string) + sequence = bdc.binary_to_dna_abaab(filtered_binary_string) if add_A_at_end: sequence += "A" @@ -274,7 +274,7 @@ def decode_file(input_path: str, output_path: str) -> None: sequence = "".join(sub_sequences_dict.values()) # convert the dna sequence into a binary string - binary_from_dna_string = bdc.dna_to_binary_z(sequence) + binary_from_dna_string = bdc.dna_to_binary_abaab(sequence) if not binary_from_dna_string: print("warning file conversion, decoding an empty file") @@ -330,8 +330,8 @@ if __name__ == '__main__': # seq = encode_file("toto", i) #decode_file(seq, "toto") #encode_file("", "test") - #seq = binary_to_dna_z(binary_string) + #seq = binary_to_dna_abaab(binary_string) #print(seq) - #print(dna_to_binary_z(seq)) + #print(dna_to_binary_abaab(seq))