Mentions légales du service

Skip to content
Snippets Groups Projects
Commit cef883e6 authored by BOULLE Olivier's avatar BOULLE Olivier
Browse files

moved functions

parent 45cf69b9
No related branches found
No related tags found
No related merge requests found
......@@ -9,7 +9,8 @@ import random
import read_matrix as rm
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
sys.path.insert(0, os.path.dirname(currentdir)+"/synthesis_modules")
sys.path.insert(0, os.path.dirname(os.path.dirname(currentdir))+"/synthesis_modules")
import dna_file_reader as dfr
import synthesis_simulation as ss
......@@ -39,9 +40,9 @@ def get_minimizers(reads_path: str):
"""
get a dict of minimizers from the read file
"""
#TODO minimizers that can also be found in primers should not be used
start = time.time()
window_size = 10 # window to look for the minimizer
minimizer_size = 6 # length of minimizer kmer
......@@ -151,6 +152,62 @@ def kp_iter_bic_output_to_soluce(cluster_dir_path, output_path):
output_soluce.write(line+"\n")
def eval_soluce(real_soluce_path: str, soluce_path: str, result_output: str):
    """
    Compare a proposed clustering to the reference solution and write a
    human-readable summary.

    :param real_soluce_path: reference file, one line per cluster in the
        form ``<cluster_id>:<read1>,<read2>,...``
    :param soluce_path: proposed clustering, one line per cluster holding a
        comma-separated list of read names (spaces tolerated)
    :param result_output: path of the summary file to write; each output
        line lists the cluster composition as ``cN_xK`` tokens (clusters
        contributing 5 reads or fewer are folded into a ``(+M)`` tail)
    :raises KeyError: if a proposed read name is absent from the reference
    """
    # map every read name to the id of its reference cluster
    cluster_of_read = {}
    with open(real_soluce_path, 'r') as input_real_soluce:
        for line in input_real_soluce:
            if not line.strip():
                continue  # tolerate blank/trailing empty lines
            cluster_name = int(line.split(":")[0])
            for read_name in line.split(":")[1].replace("\n", "").split(","):
                cluster_of_read[read_name] = cluster_name

    # collect the proposed clusters as lists of read names
    proposed_clusters = []
    with open(soluce_path, 'r') as input_soluce:
        for line in input_soluce:
            if not line.strip():
                continue  # an empty line would otherwise yield [""] and a KeyError
            proposed_clusters.append(line.replace("\n", "").replace(" ", "").split(","))

    result_lines = []
    # replace each read name by the id of its reference cluster, and factorise
    for reads_list in proposed_clusters:
        counts = {}
        for read in reads_list:
            reference = cluster_of_read[read]
            counts[reference] = counts.get(reference, 0) + 1
        result_line = f"{sum(counts.values())} reads : "
        various_seq_sum = 0  # sum of sequences from small clusters
        # display by decreasing number of occurrences
        for ref in sorted(counts, key=counts.get, reverse=True):
            if counts[ref] > 5:
                result_line += f"c{ref}_x{counts[ref]} "
            else:
                various_seq_sum += counts[ref]
        if various_seq_sum > 0:
            result_line += f"(+{various_seq_sum})"
        result_lines.append(result_line)

    with open(result_output, 'w') as output_file:
        for line in result_lines:
            output_file.write(line + "\n")
def graph_generation(reads_dir) -> None:
    """
    Generate the reads graph file for the reads of *reads_dir*.

    Reads <reads_dir>/shuffled_reads.fastq, extracts its minimizers and
    writes the resulting graph to <reads_dir>/reads.graph.
    NOTE(review): the graph presumably links reads sharing minimizers
    (see get_minimizers / minlist_to_graph) — confirm against those helpers.

    :param reads_dir: directory containing shuffled_reads.fastq (no trailing slash)
    """
    # (stale duplicate path assignments left over from a merge were removed)
    reads_path = reads_dir + "/shuffled_reads.fastq"
    graph_file_path = reads_dir + "/reads.graph"
    minimizer_list, reads_number = get_minimizers(reads_path)
    minlist_to_graph(reads_number, minimizer_list, graph_file_path)
if __name__ == "__main__":
    print("generate graph...")
    # directory holding shuffled_reads.fastq; graph is written next to it
    dir_path = "matrix_tests/matrix_10k_2"
    graph_generation(dir_path)
    # cmd : gpmetis matrix_tests/matrix_10k_2/reads_graph.graph 226 -ufactor 300
......
......@@ -312,33 +312,22 @@ def get_coverage_list(total_sum: int, min_coverage=25) -> list:
return coverage_list
def init_reads(dir_path: str, read_number: int):
    """
    Generate random reads, shuffle them and keep the original order in a file.

    Writes into *dir_path*:
      - references.fasta     : the generated reference sequences
      - reads.fastq          : simulated reads in generation order
      - shuffled_reads.fastq : the same reads, shuffled
      - soluce.txt           : the original (reference) clustering of the reads

    :param dir_path: output directory (no trailing slash)
    :param read_number: total number of reads to generate
    """
    ref_path = dir_path + "/references.fasta"
    reads_path = dir_path + "/reads.fastq"
    shuffled_reads_path = dir_path + "/shuffled_reads.fastq"
    solutions_path = dir_path + "/soluce.txt"
    # list of coverage (read count) for each reference sequence
    coverage_list = get_coverage_list(read_number)
    generate_references_sequences(len(coverage_list), ref_path)
    generate_random_reads(ref_path, coverage_list, reads_path)
    shuffle_reads(reads_path, shuffled_reads_path, solutions_path)
def display_results(input_matrix_csv, results_dir, result_output):
......@@ -380,65 +369,20 @@ def display_results(input_matrix_csv, results_dir, result_output):
print(sorted(ordered_lines))
def eval_soluce(real_soluce_path: str, soluce_path: str, result_output: str):
    """
    Compare the proposed results to the correct solution.

    Reads the reference clustering (``<id>:<read>,<read>,...`` per line) and
    the proposed clustering (comma-separated read names per line), then
    writes one summary line per proposed cluster showing how many reads it
    took from each reference cluster, most frequent first.
    """
    # reference cluster id for every read name
    reference_of = {}
    with open(real_soluce_path, 'r') as real_in:
        for raw in real_in:
            cluster_id = int(raw.split(":")[0])
            member_part = raw.split(":")[1].replace("\n", "").split(",")
            for name in member_part:
                reference_of[name] = cluster_id

    # proposed clusters, in file order, as lists of read names
    proposed = []
    with open(soluce_path, 'r') as soluce_in:
        for raw in soluce_in:
            proposed.append(raw.replace("\n", "").replace(" ", "").split(","))

    # replace each read number by the id of referrence cluster, and factorise
    summary = []
    for members in proposed:
        tally = {}
        for name in members:
            ref = reference_of[name]
            tally[ref] = tally.get(ref, 0) + 1
        pieces = [str(sum(tally.values())) + " reads : "]
        # display by decreasing number of occurrences
        for ref in sorted(tally, key=tally.get, reverse=True):
            pieces.append("\"c_" + str(ref) + "\"x" + str(tally[ref]) + " ")
        summary.append("".join(pieces))

    with open(result_output, 'w') as out_file:
        for out_line in summary:
            out_file.write(out_line + "\n")
if __name__ == "__main__":
    print("generate reads...")
    # output directory and total read count come from the command line:
    #   python <script> <dir_path> <read_number>
    dir_path = sys.argv[1]
    read_number = int(sys.argv[2])
    init_reads(dir_path, read_number)
    print("\tcompleted !")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment