Mentions légales du service
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Sequencing_Modules
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
dnarXiv
Sequencing_Modules
Commits
cef883e6
Commit
cef883e6
authored
10 months ago
by
BOULLE Olivier
Browse files
Options
Downloads
Patches
Plain Diff
moved functions
parent
45cf69b9
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
partitioning/partitioning.py
+62
-5
62 additions, 5 deletions
partitioning/partitioning.py
partitioning/read_matrix.py
+15
-71
15 additions, 71 deletions
partitioning/read_matrix.py
with
77 additions
and
76 deletions
partitioning/partitioning.py
+
62
−
5
View file @
cef883e6
...
...
@@ -9,7 +9,8 @@ import random
import
read_matrix
as
rm
currentdir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
inspect
.
getfile
(
inspect
.
currentframe
())))
sys
.
path
.
insert
(
0
,
os
.
path
.
dirname
(
currentdir
)
+
"
/synthesis_modules
"
)
sys
.
path
.
insert
(
0
,
os
.
path
.
dirname
(
os
.
path
.
dirname
(
currentdir
))
+
"
/synthesis_modules
"
)
import
dna_file_reader
as
dfr
import
synthesis_simulation
as
ss
...
...
@@ -39,9 +40,9 @@ def get_minimizers(reads_path: str):
"""
get a dict of minimizers from the read file
"""
#TODO minimizers that can also be found in primers should not be used
start
=
time
.
time
()
window_size
=
10
# window to look for the minimizer
minimizer_size
=
6
# length of minimizer kmer
...
...
@@ -151,6 +152,62 @@ def kp_iter_bic_output_to_soluce(cluster_dir_path, output_path):
output_soluce
.
write
(
line
+
"
\n
"
)
def
eval_soluce
(
real_soluce_path
:
str
,
soluce_path
:
str
,
result_output
:
str
):
"""
compare the proposed results to the correct solution
"""
soluce_by_read_dict
=
{}
with
open
(
real_soluce_path
,
'
r
'
)
as
input_real_soluce
:
line
=
input_real_soluce
.
readline
()
while
line
!=
""
:
cluster_name
=
int
(
line
.
split
(
"
:
"
)[
0
])
cluster_columns
=
line
.
split
(
"
:
"
)[
1
].
replace
(
"
\n
"
,
""
).
split
(
"
,
"
)
for
read_name
in
cluster_columns
:
soluce_by_read_dict
[
read_name
]
=
cluster_name
line
=
input_real_soluce
.
readline
()
result_dict
=
{}
with
open
(
soluce_path
,
'
r
'
)
as
input_soluce
:
line
=
input_soluce
.
readline
()
cluster_num
=
0
while
line
!=
""
:
reads_list
=
line
.
replace
(
"
\n
"
,
""
).
replace
(
"
"
,
""
).
split
(
"
,
"
)
result_dict
[
cluster_num
]
=
reads_list
cluster_num
+=
1
line
=
input_soluce
.
readline
()
result_lines
=
[]
# replace each read number by the id of referrence cluster, and factorise
for
cluster_num
,
reads_list
in
result_dict
.
items
():
referrenced_dict
=
{}
for
read
in
reads_list
:
referrence
=
soluce_by_read_dict
[
read
]
referrenced_dict
[
referrence
]
=
referrenced_dict
.
get
(
referrence
,
0
)
+
1
referrenced_result_line
=
str
(
sum
([
v
for
_
,
v
in
referrenced_dict
.
items
()]))
+
"
reads :
"
various_seq_sum
=
0
# sum of sequences from small clusters
# display by decreasing number of occurrences
for
ref
in
sorted
(
referrenced_dict
,
key
=
referrenced_dict
.
get
,
reverse
=
True
):
if
referrenced_dict
[
ref
]
>
5
:
referrenced_result_line
+=
"
c
"
+
str
(
ref
)
+
"
_x
"
+
str
(
referrenced_dict
[
ref
])
+
"
"
else
:
various_seq_sum
+=
referrenced_dict
[
ref
]
if
various_seq_sum
>
0
:
referrenced_result_line
+=
"
(+
"
+
str
(
various_seq_sum
)
+
"
)
"
result_lines
.
append
(
referrenced_result_line
)
with
open
(
result_output
,
'
w
'
)
as
output_file
:
for
line
in
result_lines
:
output_file
.
write
(
line
+
"
\n
"
)
def
graph_generation
(
reads_dir
)
->
None
:
"""
generate a matrix of simulated reads
...
...
@@ -159,8 +216,8 @@ def graph_generation(reads_dir) -> None:
cell x y = 1 if the read y contains the minimizer x, else 0
"""
reads_path
=
reads_dir
+
"
/shuffled_reads.fastq
"
graph_file_path
=
reads_dir
+
"
/reads.graph
"
reads_path
=
reads_dir
+
"
/shuffled_reads.fastq
"
graph_file_path
=
reads_dir
+
"
/reads.graph
"
minimizer_list
,
reads_number
=
get_minimizers
(
reads_path
)
minlist_to_graph
(
reads_number
,
minimizer_list
,
graph_file_path
)
...
...
@@ -172,7 +229,7 @@ if __name__ == "__main__":
print
(
"
generate graph...
"
)
dir_path
=
"
matrix_tests/matrix_10k_2
/
"
dir_path
=
"
matrix_tests/matrix_10k_2
"
graph_generation
(
dir_path
)
# cmd : gpmetis matrix_tests/matrix_10k_2/reads_graph.graph 226 -ufactor 300
...
...
This diff is collapsed.
Click to expand it.
partitioning/read_matrix.py
+
15
−
71
View file @
cef883e6
...
...
@@ -312,33 +312,22 @@ def get_coverage_list(total_sum: int, min_coverage=25) -> list:
return
coverage_list
def
matrix_generation
(
column_number
:
int
,
dir_path
:
str
)
->
None
:
def
init_reads
(
dir_path
:
str
,
read_number
:
int
)
:
"""
generate a matrix of simulated reads
column corresponds to a read
line to a minimizer
cell x y = 1 if the read y contains the minimizer x, else 0
generate random reads, shuffle them and keep the original order in a file
"""
coverage_list
=
get_coverage_list
(
column_number
)
# list of coverage for each
reference
re
f
_path
=
dir_path
+
"
reference
s.fast
a
"
reads_path
=
dir_path
+
"
reads.fastq
"
s
huffled_read
s_path
=
dir_path
+
"
shuffled_reads.fastq
"
solutions_path
=
dir_path
+
"
soluce.txt
"
matrix_path
=
dir_path
+
"
matrix_10k_2.csv
"
ref_path
=
dir_path
+
"
/
reference
s.fasta
"
re
ads
_path
=
dir_path
+
"
/read
s.fast
q
"
shuffled_
reads_path
=
dir_path
+
"
/shuffled_
reads.fastq
"
s
olution
s_path
=
dir_path
+
"
/soluce.txt
"
coverage_list
=
get_coverage_list
(
read_number
)
# list of coverage for each reference
generate_references_sequences
(
len
(
coverage_list
),
ref_path
)
generate_random_reads
(
ref_path
,
coverage_list
,
reads_path
)
shuffle_reads
(
reads_path
,
shuffled_reads_path
,
solutions_path
)
minimizer_dict
=
get_minimizers
(
shuffled_reads_path
)
matrix_test
=
Minimizer_matrix
.
init_from_reads
(
shuffled_reads_path
,
minimizer_dict
)
# init a matrix class object from the reads file & minimizers
# save the matrix
matrix_test
.
print_matrix_to_csv
(
matrix_path
)
matrix_test
.
print_matrix_to_csvbm
(
matrix_path
+
"
bm
"
)
def
display_results
(
input_matrix_csv
,
results_dir
,
result_output
):
...
...
@@ -380,65 +369,20 @@ def display_results(input_matrix_csv, results_dir, result_output):
print
(
sorted
(
ordered_lines
))
def
eval_soluce
(
real_soluce_path
:
str
,
soluce_path
:
str
,
result_output
:
str
):
"""
compare the proposed results to the correct solution
"""
soluce_by_read_dict
=
{}
with
open
(
real_soluce_path
,
'
r
'
)
as
input_real_soluce
:
line
=
input_real_soluce
.
readline
()
while
line
!=
""
:
cluster_name
=
int
(
line
.
split
(
"
:
"
)[
0
])
cluster_columns
=
line
.
split
(
"
:
"
)[
1
].
replace
(
"
\n
"
,
""
).
split
(
"
,
"
)
for
read_name
in
cluster_columns
:
soluce_by_read_dict
[
read_name
]
=
cluster_name
line
=
input_real_soluce
.
readline
()
result_dict
=
{}
with
open
(
soluce_path
,
'
r
'
)
as
input_soluce
:
line
=
input_soluce
.
readline
()
cluster_num
=
0
while
line
!=
""
:
reads_list
=
line
.
replace
(
"
\n
"
,
""
).
replace
(
"
"
,
""
).
split
(
"
,
"
)
result_dict
[
cluster_num
]
=
reads_list
cluster_num
+=
1
line
=
input_soluce
.
readline
()
result_lines
=
[]
# replace each read number by the id of referrence cluster, and factorise
for
cluster_num
,
reads_list
in
result_dict
.
items
():
referrenced_dict
=
{}
for
read
in
reads_list
:
referrence
=
soluce_by_read_dict
[
read
]
referrenced_dict
[
referrence
]
=
referrenced_dict
.
get
(
referrence
,
0
)
+
1
referrenced_result_line
=
str
(
sum
([
v
for
_
,
v
in
referrenced_dict
.
items
()]))
+
"
reads :
"
# display by decreasing number of occurrences
for
ref
in
sorted
(
referrenced_dict
,
key
=
referrenced_dict
.
get
,
reverse
=
True
):
referrenced_result_line
+=
"
\"
c_
"
+
str
(
ref
)
+
"
\"
x
"
+
str
(
referrenced_dict
[
ref
])
+
"
"
result_lines
.
append
(
referrenced_result_line
)
with
open
(
result_output
,
'
w
'
)
as
output_file
:
for
line
in
result_lines
:
output_file
.
write
(
line
+
"
\n
"
)
if
__name__
==
"
__main__
"
:
print
(
"
generate
matrix
...
"
)
print
(
"
generate
reads
...
"
)
dir_path
=
"
matrix_tests/matrix_10k_2/
"
#dir_path = "matrix_tests/matrix/"
dir_path
=
sys
.
argv
[
1
]
read_number
=
int
(
sys
.
argv
[
2
])
#eval_soluce(dir_path+"soluce.txt", dir_path+"metis_soluce_100p.txt", dir_path+"metis_result_100p.txt")
init_reads
(
dir_path
,
read_number
)
#eval_soluce(dir_path+"soluce.txt", dir_path+"metis_soluce_10p.txt", dir_path+"metis_result_10p_2.txt")
#matrix = Minimizer_matrix.init_from_matrix_csv(dir_path+"matrix_10k.csv")
#matrix_generation(10000, dir_path)
#matrix.get_representative()
#display_results(dir_path+"/matrix_100/matrix.csv", dir_path+"/matrix_100_results", dir_path+"/matrix_100/matrix_100_ordered_results.csv")
print
(
"
\t
completed !
"
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment