diff --git a/read_matrix.py b/read_matrix.py index 00ff35b24c430bbf98184f7f7d30ddb64a7cbbb5..0c43e26a064916bd9b2065d0720cf1c760c73859 100755 --- a/read_matrix.py +++ b/read_matrix.py @@ -7,8 +7,6 @@ import time import random -import reads_consensus_class as rcc - currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) sys.path.insert(0, os.path.dirname(currentdir)+"/synthesis_modules") import dna_file_reader as dfr @@ -63,6 +61,7 @@ def generate_random_reads(references_file_path, coverage_list, reads_path): def get_minimisers(reads_path): + #TODO minimizers that can also be found in primers should not be used kmer_size = 10 minimiser_size = 6 @@ -108,7 +107,7 @@ def generate_matrix(reads_path, minimisers_dict, solutions_path, matrix_path): soluce_dict[read_cluster] = soluce_dict.get(read_cluster, []) + ["r"+str(i)] soluce_dict = dict(sorted(soluce_dict.items())) - print(soluce_dict) + #print(soluce_dict) with open(solutions_path, 'w') as output_file: for cluster_name in soluce_dict.keys(): @@ -136,15 +135,37 @@ def generate_matrix(reads_path, minimisers_dict, solutions_path, matrix_path): print(str(ones_counter)+ "/"+ str(cases_counter) +" = "+ str(round(100*ones_counter/cases_counter, 1))+"%") -def matrix_generation(): +def get_coverage_list(total_sum, min_coverage=30) -> list: + """ + return a list of number for the coverage fo the generated reads + """ + coverage_list = [] + current_sum = 0 + while current_sum <= total_sum: + coverage_i = random.randint(min_coverage, 2*min_coverage) + current_sum += coverage_i + coverage_list.append(coverage_i) + + # adjust the last cluster to remove the excedent + coverage_list[-1] = coverage_list[-1]-(current_sum-total_sum) + # sorted by descending + coverage_list = sorted(coverage_list, reverse=True) + + print(coverage_list) + print(len(coverage_list),"clusters with a sum of",sum(coverage_list)) + + return coverage_list - coverage_list = [13,11,11,10,9,9,8,8,8,7,5,1] - ref_path = "matrix_tests/references.fasta" - reads_path = "matrix_tests/reads.fastq" - solutions_path = "matrix_tests/soluce.txt" - matrix_path = "matrix_tests/matrix.csv" - generate_references_sequences(12, ref_path) +def matrix_generation(column_number, matrix_name, dir_path): + + coverage_list = get_coverage_list(column_number) # list of coverage for each reference + ref_path = dir_path +"references.fasta" + reads_path = dir_path +"reads.fastq" + solutions_path = dir_path +"soluce.txt" + matrix_path = dir_path + matrix_name+".csv" + + generate_references_sequences(len(coverage_list), ref_path) generate_random_reads(ref_path, coverage_list, reads_path) minimisers_dict = get_minimisers(reads_path) @@ -152,19 +173,111 @@ def matrix_generation(): generate_matrix(reads_path, minimisers_dict, solutions_path, matrix_path) +def matrix_to_COO(input_matrix_csv, output_COO): + + matrix_lines = [] + with open(input_matrix_csv, 'r') as input_matrix: + input_matrix.readline() # skip columns names + line = input_matrix.readline() + while line != "": + content = line.split(",") + matrix_lines.append(content[1:]) + line = input_matrix.readline() + + with open(output_COO, 'w') as output_matrix: + output_matrix.write(str(len(matrix_lines))+" "+str(len(matrix_lines[0]))+"\n") + for i, line in enumerate(matrix_lines): + for j, value in enumerate(line): + if value == "1": + output_matrix.write(str(i)+" "+str(j)+"\n") + + +def matrix_to_CSVBM(input_matrix_csv, output_CSVBM): + """convert a csv matrix to the csvbm format + row_br column_nbr ones_nbr\n + distance_to_first_one\n + ... + distance_to_next_one_from_last_one\n + ... + """ + matrix_rows = [] + number_of_ones = 0 + with open(input_matrix_csv, 'r') as input_matrix: + input_matrix.readline() # skip columns names + line = input_matrix.readline() + while line != "": + content = line.replace("\n","").split(",")[1:] # first one is row name + matrix_rows.append(content) + number_of_ones += sum(int(k) for k in content) + line = input_matrix.readline() + + number_rows = len(matrix_rows) + number_columns = len(matrix_rows[0]) + + with open(output_CSVBM, 'w', encoding='utf-8') as csvbm_file: + csvbm_file.write( + f'{number_rows} {number_columns} {number_of_ones}\n', + ) + distance = 0 + for row in matrix_rows: + for column in row: + if column == "1": + csvbm_file.write(f'{distance}\n') + distance = 1 + else: + distance += 1 + + + +def display_results(input_matrix_csv, results_dir, result_output): + + matrix_lines = [] + with open(input_matrix_csv, 'r') as input_matrix: + input_matrix.readline() # skip columns names + line = input_matrix.readline() + while line != "": + content = line.replace("\n","").split(",") + matrix_lines.append(content[1:]) + line = input_matrix.readline() + + # get list of ordered lines and columns from results files + ordered_lines = [] + ordered_columns = [] + + for filename in sorted(os.listdir(results_dir)): + file_path = os.path.join(results_dir, filename) + with open(file_path, 'r') as input_matrix: + ordered_columns += input_matrix.readline().replace("\n","").split(",")[1:] + line = input_matrix.readline() + while line != "": + line_nbr = line.split(",")[0] + ordered_lines.append(line_nbr) + line = input_matrix.readline() + + print(ordered_columns) + with open(result_output, 'w') as output_file: + output_file.write(","+",".join(ordered_columns)+"\n") + for line_name in ordered_lines: + line_number = int(line_name[1:]) + matrix_line = line_name + for column_name in ordered_columns: + column_number = int(column_name[1:]) + matrix_line += ","+matrix_lines[line_number][column_number] + #print(matrix_line) + output_file.write(matrix_line + "\n") + + print(sorted(ordered_lines)) if __name__ == "__main__": print("generate matrix...") - #for i in [1000*k for k in range(1,21)]: - # min_occ_testing(i) - #for i in range(1): - # consensus_testing(3000) - - matrix_generation() - #consensus_testing(50000) - #s = verify_check_sum("test_consensus/consensus_test.fasta") - #print(s) + column_number = 10000 + matrix_name = "matrix_10k_bis" + + dir_path = "matrix_tests/"+matrix_name+"/" + matrix_generation(column_number, matrix_name, dir_path) + matrix_to_CSVBM(dir_path+ matrix_name+".csv", dir_path + matrix_name+".csvbm") + #display_results(dir_path+"/matrix_100/matrix.csv", dir_path+"/matrix_100_results", dir_path+"/matrix_100/matrix_100_ordered_results.csv") print("\tcompleted !")