From a129253b66eb6e8c5655fe7a458f6f1a33d8bca0 Mon Sep 17 00:00:00 2001
From: oboulle <olivier.boulle@inria.fr>
Date: Thu, 2 May 2024 11:38:09 +0200
Subject: [PATCH] init from csv method

---
 read_matrix.py | 106 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 89 insertions(+), 17 deletions(-)

diff --git a/read_matrix.py b/read_matrix.py
index 5ec0b0f..d1ee270 100755
--- a/read_matrix.py
+++ b/read_matrix.py
@@ -18,7 +18,7 @@ class Minimizer_matrix:
     class to handle matrix stuff
     """
 
-    def __init__(self, reads_path: str, minimizers_dict: dict) -> None:
+    def __init__(self) -> None:
 
         self.lines = [] # list of lines
         self.soluce_dict = {} # cluster_name : list of reads from the cluster
@@ -26,28 +26,29 @@ class Minimizer_matrix:
         self.line_number = 0
         self.column_number = 0
 
-        self.generate_matrix(reads_path, minimizers_dict)
-
-
-    def generate_matrix(self, reads_path: str, minimizers_dict: dict) -> None:
+    @classmethod
+    def init_from_reads(cls, reads_path: str, minimizers_dict: dict):
         """
-        fill the matrix from a .fastq of reads and a dict of minimizers from these reads
+        init the matrix object from a .fastq of reads and a dict of minimizers from these reads
         """
+        # create the object
+        new_matrix = Minimizer_matrix()
+
         # read the .fastq file and get basic info
         reads_dict = dfr.read_fastq(reads_path)
         reads_name_list = list(reads_dict.keys())
 
-        self.line_number = len(minimizers_dict)
-        self.column_number = len(reads_dict)
+        new_matrix.line_number = len(minimizers_dict)
+        new_matrix.column_number = len(reads_dict)
 
         # shuffle the reads and save the original clusters in a dict
         random.shuffle(reads_name_list)
 
         for i, read_name in enumerate(reads_name_list):
             read_cluster = int(read_name.split("_")[1])
-            self.soluce_dict[read_cluster] = self.soluce_dict.get(read_cluster, []) + ["r"+str(i)]
+            new_matrix.soluce_dict[read_cluster] = new_matrix.soluce_dict.get(read_cluster, []) + ["r"+str(i)]
 
-        self.soluce_dict = dict(sorted(self.soluce_dict.items()))
+        new_matrix.soluce_dict = dict(sorted(new_matrix.soluce_dict.items()))
 
         # construction of the matrix
 
@@ -59,14 +60,83 @@ class Minimizer_matrix:
                 # test if minimizer is in the read
                 if minimizer in read_seq or minimizer in read_seq_rv:
                     minimizer_line.append("1")
-                    self.ones_counter += 1
+                    new_matrix.ones_counter += 1
                 else:
                     minimizer_line.append("0")
-            self.lines.append(minimizer_line)
+            new_matrix.lines.append(minimizer_line)
 
         # show matrix density
-        cases_counter = self.line_number*self.column_number
-        print(str(self.ones_counter)+ "/"+ str(cases_counter) +" = "+ str(round(100*self.ones_counter/cases_counter, 1))+"%")
+        cases_counter = new_matrix.line_number*new_matrix.column_number
+        print(str(new_matrix.ones_counter)+ "/"+ str(cases_counter) +" = "+ str(round(100*new_matrix.ones_counter/cases_counter, 1))+"%")
+
+        return new_matrix
+
+
+    @classmethod
+    def init_from_matrix_csv(cls, matrix_csv_path, soluce_path=None):
+        """
+        init the matrix object from a csv file and, optionally, the soluce dict from a soluce file
+        (blank lines are ignored in both files; a csv without data rows gives a 0 column matrix)
+        """
+        # create the object; cls() instead of the class name so that a subclass builds its own type
+        new_matrix = cls()
+
+        with open(matrix_csv_path, 'r') as input_csv:
+            input_csv.readline() # skip columns names
+            for line in input_csv:
+                if line.strip() == "":
+                    continue # e.g. empty line at the end of the file
+                matrix_row = line.replace("\n","").split(",")[1:] # the first field is not a matrix cell
+                new_matrix.lines.append(matrix_row)
+                new_matrix.ones_counter += sum(int(k) for k in matrix_row)
+        new_matrix.line_number = len(new_matrix.lines)
+        new_matrix.column_number = len(new_matrix.lines[0]) if new_matrix.lines else 0
+
+        if soluce_path is not None:
+            # load the soluce dict from a file, one "cluster:column,column,..." entry per line
+            with open(soluce_path, 'r') as input_soluce:
+                for line in input_soluce:
+                    if line.strip() == "":
+                        continue # int("") would raise a ValueError
+                    fields = line.replace("\n","").split(":")
+                    new_matrix.soluce_dict[int(fields[0])] = fields[1].split(",")
+
+        return new_matrix
+
+
+    def get_representative(self, repr_number=5, output_path="test_repr"):
+        """
+        select the first line + up to repr_number random lines that share no "1" column
+        with any line already selected, write them in output_path and print them
+        (fewer lines are selected when no compatible line is left)
+        """
+        def prod_scalaire(line1, line2):
+            # True if both lines have a "1" in the same column
+            return any(line1[k] == "1" and line2[k] == "1" for k in range(self.column_number))
+
+        # the first line of the matrix is always the first representative
+        repr_list = [self.lines[0]]
+
+        for i in range(repr_number):
+            # scan the lines in random order and keep the first compatible one: same uniform
+            # draw as retrying random.choice, but it stops when no compatible line exists
+            new_repr = next((line for line in random.sample(self.lines, len(self.lines))
+                             if not any(prod_scalaire(line, other) for other in repr_list)), None)
+            if new_repr is None:
+                print("no compatible line left")
+                break
+            print(i+1, "/"+str(repr_number))
+            repr_list.append(new_repr)
+
+        # one selected line (str() of the python list) per line of the output file
+        with open(output_path, 'w') as output_file:
+            for selected_line in repr_list:
+                output_file.write(str(selected_line)+"\n")
+
+        print(repr_list)
+
+
+
 
 
     def print_soluce(self, solutions_path: str) -> None:
@@ -257,7 +327,7 @@ def matrix_generation(column_number: int, dir_path: str) -> None:
 
     minimizer_dict = get_minimizers(reads_path)
 
-    matrix_test = Minimizer_matrix(reads_path, minimizer_dict) # init a matrix class object
+    matrix_test = Minimizer_matrix.init_from_reads(reads_path, minimizer_dict) # init a matrix class object from the reads file & minimizers
 
     # save the matrix and solution
     matrix_test.print_soluce(solutions_path)
@@ -308,11 +378,13 @@ if __name__ == "__main__":
 
 
     print("generate matrix...")
-    column_number = 1000
+    column_number = 100
 
     dir_path = "matrix_tests/matrix_100_test/"
-    matrix_generation(column_number, dir_path)
+    dir_path = "matrix_tests/matrix_10k/"
 
+    matrix = Minimizer_matrix.init_from_matrix_csv(dir_path+"matrix_10k.csv", dir_path+"soluce.txt")
+    matrix.get_representative()
     #display_results(dir_path+"/matrix_100/matrix.csv", dir_path+"/matrix_100_results", dir_path+"/matrix_100/matrix_100_ordered_results.csv")
     print("\tcompleted !")
 
-- 
GitLab