Mentions légales du service
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Source_Encoding
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
GitLab upgrade completed. Current version is 17.11.3.
Show more breadcrumbs
dnarXiv
Source_Encoding
Commits
8201876a
Commit
8201876a
authored
1 year ago
by
BOULLE Olivier
Browse files
Options
Downloads
Patches
Plain Diff
comments improved, refactoring
parent
5a217f07
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
binary_dna_conversion.py
+23
-20
23 additions, 20 deletions
binary_dna_conversion.py
with
23 additions
and
20 deletions
binary_dna_conversion.py
+
23
−
20
View file @
8201876a
...
...
@@ -272,46 +272,47 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
for
i
in
range
(
0
,
n_quintuplets
):
bits5
=
binary_string
[
i
*
5
:(
i
+
1
)
*
5
]
nucleotide_2
=
bit_pair_to_dna
[
bits5
[
1
:
3
]]
nucleotide_3
=
bit_pair_to_dna
[
bits5
[
3
:
5
]]
nucleotide_2
=
bit_pair_to_dna
[
bits5
[
1
:
3
]]
# first alpha base of the encoded triplet
nucleotide_3
=
bit_pair_to_dna
[
bits5
[
3
:
5
]]
# 2nd alpha base
# make nucleotide 1
if
len
(
sequence
)
>=
3
:
# check 3 previous encoded bases
# check 3 previous encoded bases
and 2 following alpha bases to see if there is a potential homopolymer of 4+
if
sequence
[
-
3
]
==
sequence
[
-
2
]
==
sequence
[
-
1
]
or
sequence
[
-
2
]
==
sequence
[
-
1
]
==
nucleotide_2
or
sequence
[
-
1
]
==
nucleotide_2
==
nucleotide_3
:
# break homopolymer
# break
the
homopolymer
nucleotide_1
=
bit_to_dna_balance_GC
(
bits5
[
0
],
sequence
[
-
1
])
else
:
# if no homopolymer to break, adjust GC% in the window of preceding bases + 2
new
alpha bases
# if no homopolymer to break, adjust GC% in the window of preceding bases + 2
following
alpha bases
check_window
=
sequence
[
-
min
(
len
(
sequence
),
GC_window
):]
+
nucleotide_2
+
nucleotide_3
if
check_window
.
count
(
"
A
"
)
+
check_window
.
count
(
"
T
"
)
>
check_window
.
count
(
"
G
"
)
+
check_window
.
count
(
"
C
"
):
nucleotide_1
=
bit_to_dna_GC
[
bits5
[
0
]]
nucleotide_1
=
bit_to_dna_GC
[
bits5
[
0
]]
# more A/T -> add a G/C
elif
check_window
.
count
(
"
A
"
)
+
check_window
.
count
(
"
T
"
)
<
check_window
.
count
(
"
G
"
)
+
check_window
.
count
(
"
C
"
):
nucleotide_1
=
bit_to_dna_AT
[
bits5
[
0
]]
else
:
# strictly equal, just balance with
preced
ing base
nucleotide_1
=
bit_to_dna_AT
[
bits5
[
0
]]
# less A/T -> add a A/T
else
:
#
case of
strictly equal
%GC
, just balance with
follow
ing base
nucleotide_1
=
bit_to_dna_balance_GC
(
bits5
[
0
],
nucleotide_2
)
else
:
else
:
# empty string, just balance the GC with the following base
nucleotide_1
=
bit_to_dna_balance_GC
(
bits5
[
0
],
nucleotide_2
)
sequence
+=
nucleotide_1
+
nucleotide_2
+
nucleotide_3
sequence
+=
nucleotide_1
+
nucleotide_2
+
nucleotide_3
# add the encoded b a a bases to the sequence
bit_rest
=
binary_string
[
n_quintuplets
*
5
:
n_quintuplets
*
5
+
rest
]
# rest is 0-1-3 bits
# all complete quintuplets have been encoded, time for the rest
bit_rest
=
binary_string
[
n_quintuplets
*
5
:
n_quintuplets
*
5
+
rest
]
# rest is 0-1-3 bits long
if
len
(
bit_rest
)
>
0
:
# last conversions depends on the length of the rest, 1 bits = 1 base; 3 bits = 2 bases
if
len
(
bit_rest
)
==
1
:
nucleotide_2
=
""
nucleotide_2
=
""
# no nucleotide 2 to add
homopolymer_check_bool
=
sequence
[
-
3
]
==
sequence
[
-
2
]
==
sequence
[
-
1
]
else
:
# rest = 3
nucleotide_2
=
bit_pair_to_dna
[
bit_rest
[
1
:
3
]]
# nucleotide_2 from a pair of bits
homopolymer_check_bool
=
sequence
[
-
3
]
==
sequence
[
-
2
]
==
sequence
[
-
1
]
or
sequence
[
-
2
]
==
sequence
[
-
1
]
==
nucleotide_2
# true => careful for a potential homopolymer
if
homopolymer_check_bool
:
# break homopolymer
#
need to
break homopolymer
with the nucleotide 1
nucleotide_1
=
bit_to_dna_balance_GC
(
bit_rest
[
0
],
sequence
[
-
1
])
else
:
# adjust GC%
check_window
=
sequence
[
-
min
(
len
(
sequence
),
GC_window
):]
+
nucleotide_2
...
...
@@ -319,9 +320,9 @@ def binary_to_dna_baa(binary_string: str, GC_window=20) -> str:
nucleotide_1
=
bit_to_dna_GC
[
bit_rest
[
0
]]
else
:
nucleotide_1
=
bit_to_dna_AT
[
bit_rest
[
0
]]
sequence
+=
nucleotide_1
+
nucleotide_2
# remove the banned words
sequence_wo_banwords
=
remove_ban_words_baa_encoding
(
sequence
)
...
...
@@ -336,10 +337,10 @@ def dna_to_binary_baa(sequence: str) -> str:
binary_string
=
""
n_triplet
,
rest
=
divmod
(
len
(
sequence
),
3
)
for
i
in
range
(
0
,
n_triplet
):
nucleotide
s3
=
sequence
[
i
*
3
:(
i
+
1
)
*
3
]
bit_1
=
dna_to_bit_ATGC
[
nucleotide
s3
[
0
]]
bits_23
=
dna_to_bit_pair
[
nucleotide
s3
[
1
]]
bits_45
=
dna_to_bit_pair
[
nucleotide
s3
[
2
]]
triplet_
nucleotide
=
sequence
[
i
*
3
:(
i
+
1
)
*
3
]
bit_1
=
dna_to_bit_ATGC
[
triplet_
nucleotide
[
0
]]
bits_23
=
dna_to_bit_pair
[
triplet_
nucleotide
[
1
]]
bits_45
=
dna_to_bit_pair
[
triplet_
nucleotide
[
2
]]
binary_string
+=
bit_1
+
bits_23
+
bits_45
...
...
@@ -362,6 +363,8 @@ def remove_ban_words_baa_encoding(sequence: str, baa_method_offset=0) -> str:
the changed base must be a beta base, and be A<=>G; T<=>C, so the decoding of the updated sequence will still return the same result
if the original sequence has already been fragmented, the 3 bases windows of the baa method can be offset, baa_method_offset is used to correct this
#TODO also remove inverse repeat regions of 10+ bases
"""
# change a beta base but keep the binary meaning for the decoding
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment