Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 6865c727 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

refactoring, size estimations for baa

parent 94946468
Branches
No related tags found
No related merge requests found
......@@ -115,7 +115,7 @@ def dna_to_binary_abaab(sequence: str) -> str:
binary_string += dna_to_bit_pair[sequence_rest[1]]
elif len(sequence_rest) == 3:
# a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding A can be added to get a round number of blocks
# a rest of 3 should not occur from a binary to dna abaab conversion, but a non coding base can be added to get a round number of blocks
binary_string += dna_to_bit_pair[sequence_rest[1]] # just act like the rest was 2
elif len(sequence_rest) == 4: # 4 bases -> 2 bits + 1 bit + 2 bits + 1 bit
......@@ -188,27 +188,9 @@ def remove_ban_words_abaab_encoding(sequence: str, abaab_method_offset=0) -> str
exit(1)
return sequence_wo_bans
def size_binary_from_dna_len_abaab(dna_length):
    """
    Estimate the length of the binary string produced by a dna_abaab to
    binary conversion of a DNA sequence of the given length.

    Every complete group of 5 bases contributes 8 bits (1.6 bit/base);
    the 0 to 4 bases left over contribute 0, 2, 4, 4 or 6 bits.
    """
    full_blocks, leftover = divmod(dna_length, 5)
    # bits contributed by the trailing partial block, keyed by its base count;
    # 3 leftover bases count like 2 because the third base is ignored
    leftover_bits = {0: 0, 1: 2, 2: 4, 3: 4, 4: 6}
    return 8 * full_blocks + leftover_bits[leftover]
def size_of_dna_from_bit_len_abaab(bit_length):
def size_dna_from_bit_len_abaab(bit_length):
"""
assume binary to dna_abaab conversion
get the size the dna sequence will be from the given binary string length
......@@ -226,23 +208,42 @@ def size_of_dna_from_bit_len_abaab(bit_length):
return base_length + 4
print("error size_of_dna_from_bit_len : size not multiple of 2 :",str(bit_length))
exit(1)
def size_binary_from_dna_len_abaab(dna_length):
    """
    Estimate the length of the binary string produced by a dna_abaab to
    binary conversion of a DNA sequence of the given length.

    Every complete group of 5 bases contributes 8 bits (1.6 bit/base);
    the 0 to 4 bases left over contribute 0, 2, 4, 4 or 6 bits.
    """
    full_blocks, leftover = divmod(dna_length, 5)
    # bits contributed by the trailing partial block, keyed by its base count;
    # 3 leftover bases count like 2 because the third base is ignored
    leftover_bits = {0: 0, 1: 2, 2: 4, 3: 4, 4: 6}
    return 8 * full_blocks + leftover_bits[leftover]
def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
"""
convert binaries into dna sequence with some properties
the binaries are divided into parts of 5 bits
for each 5 bits part, 1st 2nd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C)
3rd bit is converted depending on previous converted sequence :
(0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the previous sequence
4th 5th bits are normally converted into dna
for each 5 bits part, 1st bit is converted depending on previous converted sequence :
(0:G, 1:C) or (0:A, 1:T), first to break potential homopolymer, if not then to adjust %GC of the local sequence in a defined window
2nd 3rd bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C)
4th 5th bits are normally converted into dna : (00:A, 01:G, 10:T, 11:C)
this ensure that the total sequence cannot contain homopolymers > 3,
the GC% is in [33%, 66%] in the worst case in a window of 3, but tends to 50% with the adjustments and the conversion rate is 1.66 bit/base
warning : need the binary string to be a multiple of 5, or rest is inconsistent with decoding
ex : a rest of 0 -> A, a rest of 00 -> A
so how to decode A ?
so how to decode a base A ?
only allowing rests multiple of 5 removes ambiguity (also possible 0, 1 or 3 modulo 5)
"""
......@@ -400,6 +401,41 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
return sequence_wo_bans
def size_dna_from_bit_len_baa(bit_length):
    """
    Estimate the length of the DNA sequence produced by a binary to dna_baa
    conversion of a binary string of the given length.

    Every complete group of 5 bits becomes 3 bases (1.66 bit/base). A
    remainder of 1 bit adds 1 base and a remainder of 3 bits adds 2 bases.

    Any other remainder (2 or 4 bits) is rejected: an error message is
    printed and the process exits with status 1 (SystemExit is raised).
    """
    # sys.exit is used instead of the bare exit() helper, which is injected by
    # the site module for interactive sessions and is not guaranteed to exist
    import sys

    base_length = 3 * (bit_length // 5)  # 5 bits -> 3 bases
    rest = bit_length % 5
    if rest == 0:
        return base_length
    if rest == 1:
        return base_length + 1
    if rest == 3:
        return base_length + 2
    print("error size_of_dna_from_bit_len : invalid rest size:", str(rest))
    sys.exit(1)
def size_binary_from_dna_len_baa(dna_length):
    """
    Estimate the length of the binary string produced by a dna_baa to
    binary conversion of a DNA sequence of the given length.

    Every complete group of 3 bases contributes 5 bits (1.66 bit/base);
    the 0 to 2 bases left over contribute 0, 1 or 3 bits.
    """
    full_blocks, leftover = divmod(dna_length, 3)
    # bits contributed by the trailing partial block, keyed by its base count
    leftover_bits = {0: 0, 1: 1, 2: 3}
    return 5 * full_blocks + leftover_bits[leftover]
def test_conversion():
for binary_size in range(100, 15000, 1):
......@@ -417,13 +453,20 @@ def test_conversion():
#decoded_abaab = dna_to_binary_abaab(seq_abaab)
decoded_baa = dna_to_binary_baa(seq_baa)
if size_binary_from_dna_len_baa(len(seq_baa)) != len(random_binary):
print("dna",str(len(seq_baa)))
exit(0)
if size_dna_from_bit_len_baa(len(random_binary)) != len(seq_baa):
print("bin",str(len(random_binary)))
exit(0)
#print("check abaab encoding", str(decoded_abaab == random_binary))
print("check baa encoding", str(decoded_baa == random_binary))
if not (decoded_baa == random_binary and sequence_control.sequence_check(seq_baa, min_GC=40, max_GC=60, verbose=True)):
#print("check baa encoding", str(decoded_baa == random_binary))
"""if not (decoded_baa == random_binary and sequence_control.sequence_check(seq_baa, min_GC=40, max_GC=60, verbose=True)):
#print(random_binary)
print(seq_baa)
#print(decoded_baa)
exit(1)
exit(1)"""
#print("check abaab encoding")
#sequence_control.sanity_check(seq_abaab)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment