Mentions légales du service

Skip to content
Snippets Groups Projects
Commit cf4e7e4b authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

Refactor GC balancing into a dedicated function

parent dbab2db1
No related branches found
No related tags found
No related merge requests found
...@@ -18,6 +18,16 @@ bit_to_dna_GC = {"0": "G", "1": "C"} ...@@ -18,6 +18,16 @@ bit_to_dna_GC = {"0": "G", "1": "C"}
dna_to_bit_ATGC = {"A": "0", "T": "1", "G": "0", "C": "1", "_": "0"} dna_to_bit_ATGC = {"A": "0", "T": "1", "G": "0", "C": "1", "_": "0"}
def bit_to_dna_balance_GC(bit_char: str, reference_base: str):
    """
    encode a single bit as a dna base picked to balance the GC content against a reference base

    when the reference base is "A" or "T", the bit is looked up in bit_to_dna_GC;
    for any other reference base it is looked up in bit_to_dna_AT

    raises KeyError if bit_char is not a key of the selected table
    """
    # pick the table of the opposite base family, then encode the bit with it
    conversion_table = bit_to_dna_GC if reference_base in ("A", "T") else bit_to_dna_AT
    return conversion_table[bit_char]
def binary_to_dna(binary_string: str) -> str: def binary_to_dna(binary_string: str) -> str:
""" """
basic conversion of binaries into dna sequence basic conversion of binaries into dna sequence
...@@ -69,16 +79,14 @@ def binary_to_dna_abaab(binary_string: str) -> str: ...@@ -69,16 +79,14 @@ def binary_to_dna_abaab(binary_string: str) -> str:
for i in range(0, n_octets): for i in range(0, n_octets):
bits8 = binary_string[i*8:(i+1)*8] bits8 = binary_string[i*8:(i+1)*8]
nucleotide_1 = bit_pair_to_dna[bits8[0:2]] nucleotide_1 = bit_pair_to_dna[bits8[0:2]]
if nucleotide_1 in ["A", "T"]:
nucleotide_2 = bit_to_dna_GC[bits8[2]] nucleotide_2 = bit_to_dna_balance_GC(bits8[2], nucleotide_1)
else:
nucleotide_2 = bit_to_dna_AT[bits8[2]]
nucleotide_3 = bit_pair_to_dna[bits8[3:5]] nucleotide_3 = bit_pair_to_dna[bits8[3:5]]
nucleotide_4 = bit_pair_to_dna[bits8[5:7]] nucleotide_4 = bit_pair_to_dna[bits8[5:7]]
if nucleotide_4 in ["A", "T"]:
nucleotide_5 = bit_to_dna_GC[bits8[7]] nucleotide_5 = bit_to_dna_balance_GC(bits8[7], nucleotide_4)
else:
nucleotide_5 = bit_to_dna_AT[bits8[7]]
sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5 sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 + nucleotide_4 + nucleotide_5
# rest should be 0 because all documents contains a round number of octet # rest should be 0 because all documents contains a round number of octet
# but some "0" can be added to fill the fragments # but some "0" can be added to fill the fragments
...@@ -91,10 +99,9 @@ def binary_to_dna_abaab(binary_string: str) -> str: ...@@ -91,10 +99,9 @@ def binary_to_dna_abaab(binary_string: str) -> str:
if len(bit_rest) == 4: if len(bit_rest) == 4:
sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits sequence += bit_pair_to_dna[bit_rest[2:4]] # nucleotide_2 also from a pair of bits
elif len(bit_rest) == 6: elif len(bit_rest) == 6:
if nucleotide_1 in ["A", "T"]:
sequence += bit_to_dna_GC[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1 # nucleotide_2 from a single bit and depending on nucleotide 1
else: sequence += bit_to_dna_balance_GC(bit_rest[2], nucleotide_1)
sequence += bit_to_dna_AT[bit_rest[2]] # nucleotide_2 from a single bit and depending on nucleotide 1
sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits sequence += bit_pair_to_dna[bit_rest[3:5]] # nucleotide_3 from a pair of bits
sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit sequence += bit_to_dna_AT[bit_rest[5]] # nucleotide_4 from a single bit
...@@ -240,10 +247,8 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -240,10 +247,8 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
# check 3 previous encoded bases # check 3 previous encoded bases
if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3: if sequence[-3] == sequence[-2] == sequence[-1] or sequence[-2] == sequence[-1] == nucleotide_2 or sequence[-1] == nucleotide_2 == nucleotide_3:
# break homopolymere # break homopolymere
if sequence[-1] in ["A", "T"]: nucleotide_1 = bit_to_dna_balance_GC(bits5[0], sequence[-1])
nucleotide_1 = bit_to_dna_GC[bits5[0]]
else:
nucleotide_1 = bit_to_dna_AT[bits5[0]]
else: else:
# adjust GC% in the window of preceding bases + 2 new alpha bases # adjust GC% in the window of preceding bases + 2 new alpha bases
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3 check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 + nucleotide_3
...@@ -253,10 +258,7 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -253,10 +258,7 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
nucleotide_1 = bit_to_dna_AT[bits5[0]] nucleotide_1 = bit_to_dna_AT[bits5[0]]
else: else:
if nucleotide_2 in ["A", "T"]: nucleotide_1 = bit_to_dna_balance_GC(bits5[0], nucleotide_2)
nucleotide_1 = bit_to_dna_GC[bits5[0]]
else:
nucleotide_1 = bit_to_dna_AT[bits5[0]]
sequence += nucleotide_1 + nucleotide_2 + nucleotide_3 sequence += nucleotide_1 + nucleotide_2 + nucleotide_3
...@@ -272,10 +274,8 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -272,10 +274,8 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
if homopolymere_check_bool: if homopolymere_check_bool:
# break homopolymere # break homopolymere
if sequence[-1] in ["A", "T"]: nucleotide_1 = bit_to_dna_balance_GC(bit_rest[0], sequence[-1])
nucleotide_1 = bit_to_dna_GC[bit_rest[0]]
else:
nucleotide_1 = bit_to_dna_AT[bit_rest[0]]
else: else:
# adjust GC% # adjust GC%
check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2 check_window = sequence[-min(len(sequence), GC_window):] + nucleotide_2
...@@ -292,6 +292,41 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str: ...@@ -292,6 +292,41 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
return sequence_wo_banwords return sequence_wo_banwords
def dna_to_binary_abaab(sequence: str) -> str:
    """
    decode a dna sequence back into a binary string (reverse of binary_to_dna_abaab)

    the sequence is read in blocks of 5 bases, each block giving 8 bits:
    bases 1, 3 and 4 are decoded with dna_to_bit_pair (2 bits each),
    bases 2 and 5 are decoded with dna_to_bit_ATGC (1 bit each)

    a trailing incomplete block is decoded depending on its length:
    1 base  -> 2 bits
    2 bases -> 2 + 2 bits
    3 bases -> decoded like 2 bases, the third base is ignored
    4 bases -> 2 + 1 + 2 + 1 bits

    raises KeyError if a decoded base is missing from the lookup table used for its position
    """
    block_size = 5
    full_blocks, _ = divmod(len(sequence), block_size)
    tail_start = full_blocks * block_size

    decoded_parts = []
    for start in range(0, tail_start, block_size):
        base_1, base_2, base_3, base_4, base_5 = sequence[start:start + block_size]
        decoded_parts.append(
            dna_to_bit_pair[base_1]
            + dna_to_bit_ATGC[base_2]
            + dna_to_bit_pair[base_3]
            + dna_to_bit_pair[base_4]
            + dna_to_bit_ATGC[base_5]
        )

    # whatever is left after the last full block (0 to 4 bases)
    tail = sequence[tail_start:]
    tail_length = len(tail)
    if tail_length == 0:
        return "".join(decoded_parts)

    # the first base of the tail always carries a pair of bits
    decoded_parts.append(dna_to_bit_pair[tail[0]])
    if tail_length in (2, 3):
        # a tail of 3 should not come out of the abaab encoding, but a non coding base
        # can be appended to round up the number of blocks: it is decoded like a tail of 2
        decoded_parts.append(dna_to_bit_pair[tail[1]])
    elif tail_length == 4:
        decoded_parts.append(dna_to_bit_ATGC[tail[1]])  # single bit
        decoded_parts.append(dna_to_bit_pair[tail[2]])  # pair of bits
        decoded_parts.append(dna_to_bit_ATGC[tail[3]])  # single bit

    return "".join(decoded_parts)
def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str: def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
""" """
remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_baa() method remove banned words from a sequence encoded with the binary_conversion.binary_to_dna_baa() method
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment