diff --git a/read_matrix.py b/read_matrix.py index d1ee2703c57ed2c147531cdae167d5c27bd59d2b..ff70e8fcf7eac0748c22041c8cf308120c6052fa 100755 --- a/read_matrix.py +++ b/read_matrix.py @@ -21,7 +21,6 @@ class Minimizer_matrix: def __init__(self) -> None: self.lines = [] # list of lines - self.soluce_dict = {} # cluster_name : list of reads from the cluster self.ones_counter = 0 # number of 1 in the matrix self.line_number = 0 self.column_number = 0 @@ -41,15 +40,6 @@ class Minimizer_matrix: new_matrix.line_number = len(minimizers_dict) new_matrix.column_number = len(reads_dict) - # shuffle the reads and save the original clusters in a dict - random.shuffle(reads_name_list) - - for i, read_name in enumerate(reads_name_list): - read_cluster = int(read_name.split("_")[1]) - new_matrix.soluce_dict[read_cluster] = new_matrix.soluce_dict.get(read_cluster, []) + ["r"+str(i)] - - new_matrix.soluce_dict = dict(sorted(new_matrix.soluce_dict.items())) - # construction of the matrix for minimizer in minimizers_dict.keys(): @@ -73,7 +63,7 @@ class Minimizer_matrix: @classmethod - def init_from_matrix_csv(cls, matrix_csv_path, soluce_path=None): + def init_from_matrix_csv(cls, matrix_csv_path): """ init the matrix object from a csv file #TODO optional soluce file """ @@ -91,16 +81,6 @@ class Minimizer_matrix: new_matrix.line_number = len(new_matrix.lines) new_matrix.column_number = len(new_matrix.lines[0]) - if soluce_path is not None: - # load the soluce dict from a file - with open(soluce_path, 'r') as input_soluce: - line = input_soluce.readline() - while line != "": - cluster_name = int(line.split(":")[0]) - cluster_columns = line.split(":")[1].replace("\n","").split(",") - new_matrix.soluce_dict[cluster_name] = cluster_columns - line = input_soluce.readline() - return new_matrix @@ -108,44 +88,38 @@ class Minimizer_matrix: repr_list = [] def prod_scalaire(line1, line2): + sum = 0 for k in range(self.column_number): if line1[k] == "1" and line2[k] == "1": - return True - return False - - repr_list.append(self.lines[0]) - - for i in range(5): - compatible_new_repr = False - while not compatible_new_repr: - compatible_new_repr = True - repr = random.choice(self.lines) - - for other_repr in repr_list: - if prod_scalaire(repr, other_repr): - compatible_new_repr = False - break - print(i+1,"/5",) - repr_list.append(repr) - - with open("test_repr", 'w') as output_file: - for repr in repr_list: - - output_file.write(str(repr)+"\n") + sum += 1 + print(sum) + return sum - print(repr_list) + base_repr = self.lines[868] + base_sum = sum([int(k) for k in base_repr]) + repr_list.append(["m893", base_repr]) + for i, repr in enumerate(self.lines): + if sum(int(k) for k in repr) < 25 or sum(int(k) for k in repr) > 70: + continue + if prod_scalaire(repr, base_repr) == 0: + print("more",prod_scalaire(repr, base_repr)) + repr_list.append(["m"+str(i),repr]) + break + for name, repr in repr_list: + with open("representatives/repr_"+name+"_"+str(prod_scalaire(repr, base_repr))+".csv", 'w') as output_file: + first_line = "" + ones_number = 0 + for i, case in enumerate(repr): + if case == "1": + first_line += ",r"+str(i) + ones_number += 1 + output_file.write(first_line+"\n") + output_file.write(name+",1"*ones_number) - def print_soluce(self, solutions_path: str) -> None: - """ - save the solution clusters in a file - """ - with open(solutions_path, 'w') as output_file: - for cluster_name in self.soluce_dict.keys(): - output_file.write(str(cluster_name)+":"+",".join(self.soluce_dict[cluster_name])+"\n") def print_matrix_to_csv(self, matrix_csv_path: str) -> None: @@ -153,7 +127,7 @@ class Minimizer_matrix: save the matrix to .csv format """ with open(matrix_csv_path, 'w') as output_file: - output_file.write(","+",".join("r"+str(k) for k in range(self.column_number))+"\n") + output_file.write("min,"+",".join("r"+str(k) for k in range(self.column_number))+"\n") for i, line in enumerate(self.lines): output_file.write("m"+str(i)+",") output_file.write(",".join(line)+"\n") @@ -184,7 +158,7 @@ class Minimizer_matrix: """ with open(matrix_csvbm_path, 'w', encoding='utf-8') as csvbm_file: csvbm_file.write( - f'{self.line_number} {self.column_number} {self.ones_counter}\n', + f'{self.line_number} {self.column_number} {self.ones_counter} 0\n', ) distance = 0 for line in self.lines: @@ -196,6 +170,33 @@ class Minimizer_matrix: distance += 1 +def shuffle_reads(reads_path, shuffled_reads_path, soluce_path): + + soluce_dict = {} # cluster_name : list of reads from the cluster + + # read the .fastq file and get basic info + reads_dict = dfr.read_fastq(reads_path) + reads_name_list = list(reads_dict.keys()) + + # shuffle the reads and save the original clusters in a dict + random.shuffle(reads_name_list) + + # save the shuffled reads + shuffled_reads = {read_name : reads_dict[read_name] for read_name in reads_name_list} + dfr.save_dict_to_fastq(shuffled_reads, shuffled_reads_path) + + # save the original clusters + for i, read_name in enumerate(reads_name_list): + read_cluster = int(read_name.split("_")[1]) + soluce_dict[read_cluster] = soluce_dict.get(read_cluster, []) + ["r"+str(i)] + + soluce_dict = dict(sorted(soluce_dict.items())) + + with open(soluce_path, 'w') as output_file: + for cluster_name in soluce_dict.keys(): + output_file.write(str(cluster_name)+":"+",".join(soluce_dict[cluster_name])+"\n") + + def generate_references_sequences(seq_number: int, references_path: str) -> dict: """ @@ -283,7 +284,7 @@ def get_minimizers(reads_path: str) -> dict: return minimizer_dict -def get_coverage_list(total_sum: int, min_coverage=30) -> list: +def get_coverage_list(total_sum: int, min_coverage=25) -> list: """ return a list of number for a coverage of each generated read total_sum is the required sum of all coverage in the list @@ -302,8 +303,7 @@ def get_coverage_list(total_sum: int, min_coverage=30) -> list: # sorted by highest->lowest coverage_list = sorted(coverage_list, reverse=True) - print(coverage_list) - print(len(coverage_list),"clusters with a sum of",sum(coverage_list)) + #print(len(coverage_list),"clusters with a sum of",sum(coverage_list)) return coverage_list @@ -319,18 +319,19 @@ def matrix_generation(column_number: int, dir_path: str) -> None: coverage_list = get_coverage_list(column_number) # list of coverage for each reference ref_path = dir_path +"references.fasta" reads_path = dir_path +"reads.fastq" + shuffled_reads_path = dir_path +"shuffled_reads.fastq" solutions_path = dir_path +"soluce.txt" - matrix_path = dir_path + "matrix.csv" + matrix_path = dir_path + "matrix_10k_2.csv" generate_references_sequences(len(coverage_list), ref_path) generate_random_reads(ref_path, coverage_list, reads_path) + shuffle_reads(reads_path, shuffled_reads_path, solutions_path) - minimizer_dict = get_minimizers(reads_path) + minimizer_dict = get_minimizers(shuffled_reads_path) - matrix_test = Minimizer_matrix.init_from_reads(reads_path, minimizer_dict) # init a matrix class object from the reads file & minimizers + matrix_test = Minimizer_matrix.init_from_reads(shuffled_reads_path, minimizer_dict) # init a matrix class object from the reads file & minimizers - # save the matrix and solution - matrix_test.print_soluce(solutions_path) + # save the matrix matrix_test.print_matrix_to_csv(matrix_path) matrix_test.print_matrix_to_csvbm(matrix_path+"bm") @@ -374,17 +375,61 @@ def display_results(input_matrix_csv, results_dir, result_output): print(sorted(ordered_lines)) + +def eval_soluce(soluce_path: str, results_path: str): + """ + compare the proposed results to the correct solution + """ + soluce_by_read_dict = {} + + with open(soluce_path, 'r') as input_soluce: + line = input_soluce.readline() + while line != "": + cluster_name = int(line.split(":")[0]) + cluster_columns = line.split(":")[1].replace("\n","").split(",") + for read_name in cluster_columns: + soluce_by_read_dict[read_name] = cluster_name + line = input_soluce.readline() + + result_dict = {} + + with open(results_path, 'r') as input_result: + line = input_result.readline() + cluster_num = 0 + while line != "": + reads_list = line.replace("\n","").replace(" ","").split(",") + result_dict[cluster_num] = reads_list + cluster_num += 1 + line = input_result.readline() + + # replace each read number by the id of referrence cluster, and factorise + for cluster_num, reads_list in result_dict.items(): + referrenced_dict = {} + + for read in reads_list: + referrence = soluce_by_read_dict[read] + referrenced_dict[referrence] = referrenced_dict.get(referrence, 0) + 1 + referrenced_result_line = "" + + # display by decreasing number of occurrences + for ref in sorted(referrenced_dict, key=referrenced_dict.get, reverse=True): + referrenced_result_line += "\"c_"+str(ref)+"\"x"+str(referrenced_dict[ref])+" " + + print(referrenced_result_line) + + if __name__ == "__main__": print("generate matrix...") - column_number = 100 - dir_path = "matrix_tests/matrix_100_test/" - dir_path = "matrix_tests/matrix_10k/" + dir_path = "matrix_tests/matrix_10k_2/" + #dir_path = "matrix_tests/matrix/" - matrix = Minimizer_matrix.init_from_matrix_csv(dir_path+"matrix_10k.csv", dir_path+"soluce.txt") - matrix.get_representative() + eval_soluce(dir_path+"soluce.txt", dir_path+"metis_soluce.txt") + #matrix = Minimizer_matrix.init_from_matrix_csv(dir_path+"matrix_10k.csv") + #matrix_generation(10000, dir_path) + #matrix.get_representative() #display_results(dir_path+"/matrix_100/matrix.csv", dir_path+"/matrix_100_results", dir_path+"/matrix_100/matrix_100_ordered_results.csv") print("\tcompleted !")