From a129253b66eb6e8c5655fe7a458f6f1a33d8bca0 Mon Sep 17 00:00:00 2001 From: oboulle <olivier.boulle@inria.fr> Date: Thu, 2 May 2024 11:38:09 +0200 Subject: [PATCH] init from csv method --- read_matrix.py | 106 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 17 deletions(-) diff --git a/read_matrix.py b/read_matrix.py index 5ec0b0f..d1ee270 100755 --- a/read_matrix.py +++ b/read_matrix.py @@ -18,7 +18,7 @@ class Minimizer_matrix: class to handle matrix stuff """ - def __init__(self, reads_path: str, minimizers_dict: dict) -> None: + def __init__(self) -> None: self.lines = [] # list of lines self.soluce_dict = {} # cluster_name : list of reads from the cluster @@ -26,28 +26,29 @@ class Minimizer_matrix: self.line_number = 0 self.column_number = 0 - self.generate_matrix(reads_path, minimizers_dict) - - - def generate_matrix(self, reads_path: str, minimizers_dict: dict) -> None: + @classmethod + def init_from_reads(cls, reads_path: str, minimizers_dict: dict): """ - fill the matrix from a .fastq of reads and a dict of minimizers from these reads + init the matrix object from a .fastq of reads and a dict of minimizers from these reads """ + # create the object + new_matrix = Minimizer_matrix() + # read the .fastq file and get basic info reads_dict = dfr.read_fastq(reads_path) reads_name_list = list(reads_dict.keys()) - self.line_number = len(minimizers_dict) - self.column_number = len(reads_dict) + new_matrix.line_number = len(minimizers_dict) + new_matrix.column_number = len(reads_dict) # shuffle the reads and save the original clusters in a dict random.shuffle(reads_name_list) for i, read_name in enumerate(reads_name_list): read_cluster = int(read_name.split("_")[1]) - self.soluce_dict[read_cluster] = self.soluce_dict.get(read_cluster, []) + ["r"+str(i)] + new_matrix.soluce_dict[read_cluster] = new_matrix.soluce_dict.get(read_cluster, []) + ["r"+str(i)] - self.soluce_dict = dict(sorted(self.soluce_dict.items())) + new_matrix.soluce_dict = dict(sorted(new_matrix.soluce_dict.items())) # construction of the matrix @@ -59,14 +60,83 @@ class Minimizer_matrix: # test if minimizer is in the read if minimizer in read_seq or minimizer in read_seq_rv: minimizer_line.append("1") - self.ones_counter += 1 + new_matrix.ones_counter += 1 else: minimizer_line.append("0") - self.lines.append(minimizer_line) + new_matrix.lines.append(minimizer_line) # show matrix density - cases_counter = self.line_number*self.column_number - print(str(self.ones_counter)+ "/"+ str(cases_counter) +" = "+ str(round(100*self.ones_counter/cases_counter, 1))+"%") + cases_counter = new_matrix.line_number*new_matrix.column_number + print(str(new_matrix.ones_counter)+ "/"+ str(cases_counter) +" = "+ str(round(100*new_matrix.ones_counter/cases_counter, 1))+"%") + + return new_matrix + + + @classmethod + def init_from_matrix_csv(cls, matrix_csv_path, soluce_path=None): + """ + init the matrix object from a csv file #TODO optional soluce file + """ + # create the object + new_matrix = Minimizer_matrix() + + with open(matrix_csv_path, 'r') as input_csv: + input_csv.readline() # skip columns names + line = input_csv.readline() + while line != "": + matrix_row = line.replace("\n","").split(",")[1:] + new_matrix.lines.append(matrix_row) + new_matrix.ones_counter += sum(int(k) for k in matrix_row) + line = input_csv.readline() + new_matrix.line_number = len(new_matrix.lines) + new_matrix.column_number = len(new_matrix.lines[0]) + + if soluce_path is not None: + # load the soluce dict from a file + with open(soluce_path, 'r') as input_soluce: + line = input_soluce.readline() + while line != "": + cluster_name = int(line.split(":")[0]) + cluster_columns = line.split(":")[1].replace("\n","").split(",") + new_matrix.soluce_dict[cluster_name] = cluster_columns + line = input_soluce.readline() + + return new_matrix + + + def get_representative(self): + repr_list = [] + + def prod_scalaire(line1, line2): + for k in range(self.column_number): + if line1[k] == "1" and line2[k] == "1": + return True + return False + + repr_list.append(self.lines[0]) + + for i in range(5): + compatible_new_repr = False + while not compatible_new_repr: + compatible_new_repr = True + repr = random.choice(self.lines) + + for other_repr in repr_list: + if prod_scalaire(repr, other_repr): + compatible_new_repr = False + break + print(i+1,"/5",) + repr_list.append(repr) + + with open("test_repr", 'w') as output_file: + for repr in repr_list: + + output_file.write(str(repr)+"\n") + + print(repr_list) + + + def print_soluce(self, solutions_path: str) -> None: @@ -257,7 +327,7 @@ def matrix_generation(column_number: int, dir_path: str) -> None: minimizer_dict = get_minimizers(reads_path) - matrix_test = Minimizer_matrix(reads_path, minimizer_dict) # init a matrix class object + matrix_test = Minimizer_matrix.init_from_reads(reads_path, minimizer_dict) # init a matrix class object from the reads file & minimizers # save the matrix and solution matrix_test.print_soluce(solutions_path) @@ -308,11 +378,13 @@ if __name__ == "__main__": print("generate matrix...") - column_number = 1000 + column_number = 100 dir_path = "matrix_tests/matrix_100_test/" - matrix_generation(column_number, dir_path) + dir_path = "matrix_tests/matrix_10k/" + matrix = Minimizer_matrix.init_from_matrix_csv(dir_path+"matrix_10k.csv", dir_path+"soluce.txt") + matrix.get_representative() #display_results(dir_path+"/matrix_100/matrix.csv", dir_path+"/matrix_100_results", dir_path+"/matrix_100/matrix_100_ordered_results.csv") print("\tcompleted !") -- GitLab