From bd140b69c9ba4346838be545abc0241066bad40d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 10:37:48 +0100 Subject: [PATCH] update run_steps to reuse already generated matrices --- examples/experiment_custom_lisn.py | 131 +++++++++++++++++++---------- 1 file changed, 86 insertions(+), 45 deletions(-) diff --git a/examples/experiment_custom_lisn.py b/examples/experiment_custom_lisn.py index 1722b260..78a0d9e3 100644 --- a/examples/experiment_custom_lisn.py +++ b/examples/experiment_custom_lisn.py @@ -156,6 +156,23 @@ class Experiment(BaseExperiment): self.persist_current_result(dir_nD) + def _entity_matrices_exist(self, natures, matrix_type, working_dir=None): + if natures is None or len(natures) == 0: + print("No entity natures specified!") + + for nature in natures: + entity_files = working_dir.rglob( + f"{nature}_{matrix_type}.*" + ) + if len(list(entity_files)) == 0: + return False + + print( + f"Matrices of type {matrix_type} for {', '.join(natures)}" + " already exist. Skipping creation." + ) + return True + def run_steps(self, next_params): print(f"\nRunning experiment for parameters:\n{next_params}") #------------------------------------- @@ -170,11 +187,15 @@ class Experiment(BaseExperiment): dir_mat.mkdir(parents=True, exist_ok=True) #------------------------------------- - # create and save entity matrices and scores - matrices, scores = dataset.create_matrices_and_scores() + # create and save entity matrices + if self._entity_matrices_exist(dataset.natures, "mat", dir_mat): + matrices = load_matrices_from_dumps(dataset.natures, "mat", dir_mat) + scores = load_scores(dataset.natures, dir_mat) + else: + matrices, scores = dataset.create_matrices_and_scores() + dump_scores(dataset.natures, scores, dir_mat) + dump_matrices(dataset.natures, matrices, "mat", dir_mat) - dump_scores(dataset.natures, scores, dir_mat) - dump_matrices(dataset.natures, matrices, "mat", dir_mat) #------------------------------------- # do n-dimensional projection @@ -187,12 +208,15 @@ class Experiment(BaseExperiment): dir_nD = dir_mat / projection_nD.params # The value of the extra_param will appear in the name of the # directory generated for nD projection. - print(dir_nD) dir_nD.mkdir(parents=True, exist_ok=True) - - matrices_nD = projection_nD.execute(matrices, dataset, dir_nD) - print(f'{key_nD} matrices generated.') - dump_matrices(dataset.natures, matrices_nD, key_nD, dir_nD) + + if self._entity_matrices_exist(dataset.natures, + key_nD, dir_nD): + matrices_nD = load_matrices_from_dumps(dataset.natures, key_nD, dir_nD) + else: + matrices_nD = projection_nD.execute(matrices, dataset, dir_nD) + print(f'{key_nD} matrices generated.') + dump_matrices(dataset.natures, matrices_nD, key_nD, dir_nD) # add n-dimensional projection scores self.add_nD_scores(key_nD, dir_mat, dir_nD, @@ -205,16 +229,20 @@ class Experiment(BaseExperiment): dir_2D = dir_nD / projection_2D.params dir_2D.mkdir(parents=True, exist_ok=True) - matrices_2D = projection_2D.execute(matrices_nD, dir_2D) - print(f'{key_2D} matrices generated.') - dump_matrices(dataset.natures, matrices_2D, key_2D, dir_2D) + if self._entity_matrices_exist(dataset.natures, + key_2D, dir_2D): + matrices_2D = load_matrices_from_dumps(dataset.natures, key_2D, dir_2D) + else: + matrices_2D = projection_2D.execute(matrices_nD, dir_2D) + print(f'{key_2D} matrices generated.') + dump_matrices(dataset.natures, matrices_2D, key_2D, dir_2D) - # save 2D plots - title_parts = [dataset.name, - dataset.version, - projection_nD.params, - projection_2D.params] - fig = self.save_plots(dataset.natures, matrices_2D, dir_2D, title_parts) + # save 2D plots + title_parts = [dataset.name, + dataset.version, + projection_nD.params, + projection_2D.params] + fig = self.save_plots(dataset.natures, matrices_2D, dir_2D, title_parts) # add 2-dimensional projection scores self.add_2D_scores( @@ -229,35 +257,48 @@ class Experiment(BaseExperiment): key_clus = clustering.key dir_clus = dir_2D / clustering.params dir_clus.mkdir(parents=True, exist_ok=True) - (clus_nD, clus_2D, clus_scores, - cluster_labels, cluster_eval_pos, - cluster_eval_neg) = clustering.create_clusters( - matrices, matrices_2D, matrices_nD, scores, dataset.corpus_index - ) - dump_scores(cluster_natures, clus_scores, dir_clus) - dump_matrices(cluster_natures, clus_nD, key_nD, dir_clus) - dump_matrices(cluster_natures, clus_2D, key_2D, dir_clus) - dump_scores(cluster_natures, cluster_eval_pos, dir_clus, suffix="eval_pos") - dump_scores(cluster_natures, cluster_eval_neg, dir_clus, suffix="eval_neg") - dump_labels(cluster_natures, cluster_labels, dir_clus) - - # save clustering plots - figs = [] - for i, nature in enumerate(cluster_natures): - clus_scores_i = clus_scores[i] - clus_mat_i = clus_2D[i] - title_parts = [dataset.name, - dataset.version, - projection_nD.params, - projection_2D.params, - clustering.key, - nature] - fig = self.save_plots( - dataset.natures, matrices_2D, dir_clus, title_parts, - annotations=clus_scores_i.index, annotation_mat=clus_mat_i + if (self._entity_matrices_exist(cluster_natures, key_2D, dir_clus) and + self._entity_matrices_exist(cluster_natures, key_nD, dir_clus)): + clus_scores = load_scores(cluster_natures, dir_clus) + clus_nD = load_matrices(cluster_natures, key_nD, dir_clus) + clus_2D = load_matrices(cluster_natures, key_2D, dir_clus) + + clus_eval_pos = load_scores(cluster_natures, dir_clus, + suffix="eval_pos") + clus_eval_neg = load_scores(cluster_natures, dir_clus, + suffix="eval_neg") + labels = load_labels(cluster_natures, dir_clus) + else: + (clus_nD, clus_2D, clus_scores, + cluster_labels, cluster_eval_pos, + cluster_eval_neg) = clustering.create_clusters( + matrices, matrices_2D, matrices_nD, scores, dataset.corpus_index ) - figs.append(fig) + + dump_scores(cluster_natures, clus_scores, dir_clus) + dump_matrices(cluster_natures, clus_nD, key_nD, dir_clus) + dump_matrices(cluster_natures, clus_2D, key_2D, dir_clus) + dump_scores(cluster_natures, cluster_eval_pos, dir_clus, suffix="eval_pos") + dump_scores(cluster_natures, cluster_eval_neg, dir_clus, suffix="eval_neg") + dump_labels(cluster_natures, cluster_labels, dir_clus) + + # save clustering plots + figs = [] + for i, nature in enumerate(cluster_natures): + clus_scores_i = clus_scores[i] + clus_mat_i = clus_2D[i] + title_parts = [dataset.name, + dataset.version, + projection_nD.params, + projection_2D.params, + clustering.key, + nature] + fig = self.save_plots( + dataset.natures, matrices_2D, dir_clus, title_parts, + annotations=clus_scores_i.index, annotation_mat=clus_mat_i + ) + figs.append(fig) # add clustering scores self.add_clustering_scores( -- GitLab