From bd140b69c9ba4346838be545abc0241066bad40d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work>
Date: Thu, 15 Feb 2024 10:37:48 +0100
Subject: [PATCH] update run_steps to reuse already generated matrices

---
 examples/experiment_custom_lisn.py | 131 +++++++++++++++++++----------
 1 file changed, 86 insertions(+), 45 deletions(-)

diff --git a/examples/experiment_custom_lisn.py b/examples/experiment_custom_lisn.py
index 1722b260..78a0d9e3 100644
--- a/examples/experiment_custom_lisn.py
+++ b/examples/experiment_custom_lisn.py
@@ -156,6 +156,23 @@ class Experiment(BaseExperiment):
 
         self.persist_current_result(dir_nD)
 
+    def _entity_matrices_exist(self, natures, matrix_type, working_dir=None):
+        """Return True only if every nature has a matching dump under working_dir."""
+        if not natures:
+            print("No entity natures specified!")
+            return False
+        for nature in natures:
+            # Test each nature; any() stops at the first matching file.
+            pattern = f"{nature}_{matrix_type}.*"
+            if working_dir is None or not any(working_dir.rglob(pattern)):
+                return False
+
+        print(
+            f"Matrices of type {matrix_type} for {', '.join(natures)}"
+            " already exist. Skipping creation."
+        )
+        return True
+
     def run_steps(self, next_params):
         print(f"\nRunning experiment for parameters:\n{next_params}")
         #-------------------------------------
@@ -170,11 +187,15 @@ class Experiment(BaseExperiment):
         dir_mat.mkdir(parents=True, exist_ok=True)
             
         #-------------------------------------
-        # create and save entity matrices and scores
-        matrices, scores = dataset.create_matrices_and_scores()
+        # create and save entity matrices
+        if self._entity_matrices_exist(dataset.natures, "mat", dir_mat):
+            matrices = load_matrices_from_dumps(dataset.natures, "mat", dir_mat)
+            scores = load_scores(dataset.natures, dir_mat)
+        else:
+            matrices, scores = dataset.create_matrices_and_scores()
+            dump_scores(dataset.natures, scores, dir_mat)
+            dump_matrices(dataset.natures, matrices, "mat", dir_mat)
 
-        dump_scores(dataset.natures, scores, dir_mat)
-        dump_matrices(dataset.natures, matrices, "mat", dir_mat)
 
         #-------------------------------------
         # do n-dimensional projection
@@ -187,12 +208,15 @@ class Experiment(BaseExperiment):
         dir_nD = dir_mat / projection_nD.params
         # The value of the extra_param will appear in the name of the 
         # directory generated for nD projection.
-        print(dir_nD)
         dir_nD.mkdir(parents=True, exist_ok=True)
-        
-        matrices_nD = projection_nD.execute(matrices, dataset, dir_nD)
-        print(f'{key_nD} matrices generated.')
-        dump_matrices(dataset.natures, matrices_nD, key_nD, dir_nD)
+
+        if self._entity_matrices_exist(dataset.natures, 
+                                        key_nD, dir_nD):
+            matrices_nD = load_matrices_from_dumps(dataset.natures, key_nD, dir_nD)
+        else:
+            matrices_nD = projection_nD.execute(matrices, dataset, dir_nD)
+            print(f'{key_nD} matrices generated.')
+            dump_matrices(dataset.natures, matrices_nD, key_nD, dir_nD)
 
         # add n-dimensional projection scores
         self.add_nD_scores(key_nD, dir_mat, dir_nD,
@@ -205,16 +229,20 @@ class Experiment(BaseExperiment):
         dir_2D = dir_nD / projection_2D.params
         dir_2D.mkdir(parents=True, exist_ok=True)
 
-        matrices_2D = projection_2D.execute(matrices_nD, dir_2D)
-        print(f'{key_2D} matrices generated.')
-        dump_matrices(dataset.natures, matrices_2D, key_2D, dir_2D)
+        if self._entity_matrices_exist(dataset.natures, 
+                                        key_2D, dir_2D):
+            matrices_2D = load_matrices_from_dumps(dataset.natures, key_2D, dir_2D)
+        else:
+            matrices_2D = projection_2D.execute(matrices_nD, dir_2D)
+            print(f'{key_2D} matrices generated.')
+            dump_matrices(dataset.natures, matrices_2D, key_2D, dir_2D)
 
-        # save 2D plots
-        title_parts = [dataset.name,
-                       dataset.version,
-                       projection_nD.params,
-                       projection_2D.params]
-        fig = self.save_plots(dataset.natures, matrices_2D, dir_2D, title_parts)
+            # save 2D plots
+            title_parts = [dataset.name,
+                           dataset.version,
+                           projection_nD.params,
+                           projection_2D.params]
+            fig = self.save_plots(dataset.natures, matrices_2D, dir_2D, title_parts)
         
         # add 2-dimensional projection scores
         self.add_2D_scores(
@@ -229,35 +257,48 @@ class Experiment(BaseExperiment):
         key_clus = clustering.key
         dir_clus = dir_2D / clustering.params
         dir_clus.mkdir(parents=True, exist_ok=True)
-        (clus_nD, clus_2D, clus_scores,
-        cluster_labels, cluster_eval_pos,
-        cluster_eval_neg) = clustering.create_clusters(
-            matrices, matrices_2D, matrices_nD, scores, dataset.corpus_index
-        )
 
-        dump_scores(cluster_natures, clus_scores, dir_clus)
-        dump_matrices(cluster_natures, clus_nD, key_nD, dir_clus)
-        dump_matrices(cluster_natures, clus_2D, key_2D, dir_clus)
-        dump_scores(cluster_natures, cluster_eval_pos, dir_clus, suffix="eval_pos")
-        dump_scores(cluster_natures, cluster_eval_neg, dir_clus, suffix="eval_neg")
-        dump_labels(cluster_natures, cluster_labels, dir_clus)
-
-        # save clustering plots
-        figs = []
-        for i, nature in enumerate(cluster_natures):
-            clus_scores_i = clus_scores[i]
-            clus_mat_i = clus_2D[i]
-            title_parts = [dataset.name,
-                           dataset.version,
-                           projection_nD.params,
-                           projection_2D.params,
-                           clustering.key,
-                           nature]
-            fig = self.save_plots(
-                dataset.natures, matrices_2D, dir_clus, title_parts, 
-                annotations=clus_scores_i.index, annotation_mat=clus_mat_i
+        # Reuse the clustering dumps only when both the nD and 2D matrices exist.
+        if (self._entity_matrices_exist(cluster_natures, key_2D, dir_clus) and
+            self._entity_matrices_exist(cluster_natures, key_nD, dir_clus)):
+            clus_scores = load_scores(cluster_natures, dir_clus)
+            clus_nD = load_matrices_from_dumps(cluster_natures, key_nD, dir_clus)
+            clus_2D = load_matrices_from_dumps(cluster_natures, key_2D, dir_clus)
+            # Bind the same names as the else branch so code after this
+            # if/else sees them whichever path ran.
+            cluster_eval_pos = load_scores(cluster_natures, dir_clus, suffix="eval_pos")
+            cluster_eval_neg = load_scores(cluster_natures, dir_clus, suffix="eval_neg")
+            cluster_labels = load_labels(cluster_natures, dir_clus)
+        else:
+            (clus_nD, clus_2D, clus_scores,
+            cluster_labels, cluster_eval_pos,
+            cluster_eval_neg) = clustering.create_clusters(
+                matrices, matrices_2D, matrices_nD, scores, dataset.corpus_index
             )
-            figs.append(fig)
+    
+            dump_scores(cluster_natures, clus_scores, dir_clus)
+            dump_matrices(cluster_natures, clus_nD, key_nD, dir_clus)
+            dump_matrices(cluster_natures, clus_2D, key_2D, dir_clus)
+            dump_scores(cluster_natures, cluster_eval_pos, dir_clus, suffix="eval_pos")
+            dump_scores(cluster_natures, cluster_eval_neg, dir_clus, suffix="eval_neg")
+            dump_labels(cluster_natures, cluster_labels, dir_clus)
+
+            # save clustering plots
+            figs = []
+            for i, nature in enumerate(cluster_natures):
+                clus_scores_i = clus_scores[i]
+                clus_mat_i = clus_2D[i]
+                title_parts = [dataset.name,
+                               dataset.version,
+                               projection_nD.params,
+                               projection_2D.params,
+                               clustering.key,
+                               nature]
+                fig = self.save_plots(
+                    dataset.natures, matrices_2D, dir_clus, title_parts, 
+                    annotations=clus_scores_i.index, annotation_mat=clus_mat_i
+                )
+                figs.append(fig)
         
         # add clustering scores
         self.add_clustering_scores(
-- 
GitLab