diff --git a/cartodata/exporting.py b/cartodata/exporting.py index 10ab85eb70925efc87ca0c06812fe02826eb9e4a..a294e58d4a396ea47c205981ba03b12906ee9015 100644 --- a/cartodata/exporting.py +++ b/cartodata/exporting.py @@ -81,7 +81,7 @@ class Exporter(object): self.dump_dir, self.dump_dir, self.key_2D) self.points = pd.DataFrame(points) - self.points['label'].fillna('N/A', inplace=True) + self.points.fillna({'label': 'N/A'}, inplace=True) def init_nature_points(self, points, total, natures, scores_dir, dir_2D, key_2D): @@ -248,7 +248,7 @@ class Exporter(object): references.at[rows[-1], to_nature] = ','.join( str(x) for x in (cols[col:-1] + offset).tolist() ) - references[to_nature].fillna('', inplace=True) + references.fillna({to_nature: ''}, inplace=True) self.add_metadata_values(from_nature, references) @@ -286,7 +286,7 @@ class Exporter(object): """ df = self.metadata[nature] for column in columns: - df[column].fillna('', inplace=True) + df.fillna({column: ''}, inplace=True) df[join_column_name] = df[columns].apply(','.join, axis=1) to_drop = [col for col in columns if col != join_column_name] df.drop(to_drop, axis=1, inplace=True) diff --git a/cartodata/model_selection/scoring.py b/cartodata/model_selection/scoring.py index c0a5ac3957427befacb5cf53e414f2d24d3e685b..b94587406f89445f704115ed0b78b62a77564768 100644 --- a/cartodata/model_selection/scoring.py +++ b/cartodata/model_selection/scoring.py @@ -4,8 +4,8 @@ from pathlib import Path import gzip import numpy as np import pandas as pd +from scipy.spatial import distance from sklearn import metrics - from sklearn.manifold import trustworthiness from umap.validation import trustworthiness_vector @@ -52,11 +52,11 @@ class Neighbors(ScoringBase): KEY = "neighbors" DEFAULTS = { - "min_score": 20, - "recompute": False, - "sample_size": None, - "n_neighbors": 10, - "random_state": 42 + "min_score": 20, + "recompute": False, + "sample_size": None, + "n_neighbors": 10, + "random_state": 42 } @classmethod @@ -478,6 +478,32 @@ class TrustworthinessUmap(Trustworthiness): return result +class Distance(ScoringBase): + + KEY = "distance" + DEFAULTS = {} + + @classmethod + def evaluate(cls, matrices1, matrices2): + + name = "distance" + result = Result() + + matrices1_stacked = np.hstack(matrices1).T + matrices2_stacked = np.hstack(matrices2).T + distances = [ + distance.euclidean( + tuple(matrices1_stacked[i]), tuple(matrices2_stacked[i]) + ) + for i in range(matrices1_stacked.shape[0]) + ] + + result.add_score(f"{name}_mean", np.mean(distances)) + result.add_score(f"{name}_median", np.median(distances)) + + return result + + class Clustering(ScoringBase): KEY = "clustering" diff --git a/cartodata/model_selection/tests/test_scoring.py b/cartodata/model_selection/tests/test_scoring.py index 95c52fa317ce7f80ebd5bb0709ba03d858c79271..ba045493564f273321efdebe1a4ebeb22f328034 100644 --- a/cartodata/model_selection/tests/test_scoring.py +++ b/cartodata/model_selection/tests/test_scoring.py @@ -272,7 +272,6 @@ class TestNeighbors2D(NeighborsBase, TestCase): name_desc = f"{score_name}_det" desc_scores, value = result.desc_scores[name_desc] assert value == approx(0.4442, abs=1e-2) - print(desc_scores) # checks only first 3 values as following these values, sort order # might change for the authors with same score diff --git a/cartodata/neighbors.py b/cartodata/neighbors.py index d1cea916facad2bd6d56406285f97481ce46cb0b..6a2611b16cb5bdce919bdb06a68f6d084de6a28b 100644 --- a/cartodata/neighbors.py +++ b/cartodata/neighbors.py @@ -53,7 +53,7 @@ def 
get_neighbors_indirect(indirect_matrix, natural_space_matrix, matrices, def get_neighbors(neighbors_matrix, neighbors_scores, matrices, power_score=0, dump_dir=None, neighbors_nature=None, natures=None, n_neighbors=10, best_among=100, - space="l2"): + space="l2", num_threads=4): """ Compute the nearest neighbors from the entities represented by the neighbors_matrix for all the matrices in the matrices array. For each @@ -98,11 +98,11 @@ def get_neighbors(neighbors_matrix, neighbors_scores, matrices, power_score=0, logger.debug(f"Calculating for {neighbors_nature}.") if power_score == 0: neighbors = nms.knnQueryBatch(matrix.T, k=n_neighbors, - num_threads=4) + num_threads=num_threads) neighbor_ids = np.vstack([ids for ids, _ in neighbors]).T else: neighbors = nms.knnQueryBatch(matrix.T, k=best_among, - num_threads=4) + num_threads=num_threads) neighbor_ids = get_best_neighbors_by_score( neighbors, neighbors_scores, n_neighbors, power_score ) diff --git a/cartodata/operations.py b/cartodata/operations.py index ee3bf060ac802e7411c3d199cbb006c336d8551d..d927ee0a0515b1fe054c6631980953ef16dddf01 100644 --- a/cartodata/operations.py +++ b/cartodata/operations.py @@ -122,6 +122,8 @@ def load_neighbors_for_nature(point_nature, natures, dump_dir): def dump_matrices(natures, matrices, suffix, dirname): + if not os.path.exists(dirname): + os.makedirs(dirname) for idx, nature in enumerate(natures): filename = save_matrix( matrices[idx], os.path.join(dirname, nature + '_' + suffix) @@ -145,6 +147,8 @@ def save_matrix(matrix, filename): def dump_scores(natures, scores, dirname, suffix=""): + if not os.path.exists(dirname): + os.makedirs(dirname) for idx, nature in enumerate(natures): filename = os.path.join(dirname, nature + f'_{suffix}scores') logger.info( @@ -160,7 +164,7 @@ def dump_scores(natures, scores, dirname, suffix=""): def load_scores(natures, dirname, suffix=""): try: - return deprecated_load_scores(natures, dirname) + return deprecated_load_scores(natures, dirname, suffix) except KeyError: scores = [] for nature in natures: diff --git a/cartodata/pipeline/base.py b/cartodata/pipeline/base.py index a43082965d13d93437c8d81ddf6e52d7f52992b0..dae23875ebff6b1c4052b0db8d4c728c1caec75b 100644 --- a/cartodata/pipeline/base.py +++ b/cartodata/pipeline/base.py @@ -1,5 +1,14 @@ +import logging +from pathlib import Path from yaml import YAMLObject +from cartodata.operations import ( + dump_matrices, load_matrices_from_dumps, load_scores, dump_scores, + load_labels, dump_labels +) + +logger = logging.getLogger(__name__) + class BaseEntity(): @@ -84,3 +93,132 @@ class BaseEstimator(BaseEntity, YAMLObject): def execute(self, matrices, dataset, dump_dir): raise NotImplementedError() + + +class BaseProjection(BaseEstimator): + + def __init__(self, key): + super().__init__(key) + + def matrices_exist(self, natures, working_dir): + """Checks if entity matrices are already generated for this projection + type. + + Parameters + ---------- + natures: list + list of natures to search for corresponding matrix files + working_dir: str, Path + the directory to look for the matrices + + Returns + ------- + bool + True if the entity matrices for this projection is already generated; + False otherwise. + + raises + ------ + Exception + if the specified ``working_dir`` is not valid or ``natures`` is + ``None`` or empty. 
+ """ + if natures is None or len(natures) == 0: + raise ValueError("No entity natures specified!") + + try: + working_dir = Path(working_dir) + for nature in natures: + entity_files = working_dir.rglob( + f"{nature}_{self.key}.*" + ) + if len(list(entity_files)) == 0: + logger.info( + f"Entity matrix for nature {nature} does not exist for" + f" {self.key}." + ) + return False + + logger.info( + f"Matrices of type {self.key} for {', '.join(natures)}" + " already exist." + ) + return True + except Exception as ex: + logger.error(f"Specified directory {working_dir} is not valid.") + raise ex + + def dump_matrices(self, natures, matrices, working_dir, key=None): + if natures is None or len(natures) == 0: + raise ValueError("No entity natures specified!") + if key is None: + key = self.key + logger.info(f'Saving matrices for {key} ...') + + if isinstance(working_dir, Path) or isinstance(working_dir, str): + dump_matrices(natures, matrices, key, working_dir) + + else: + for i, dir_i in enumerate(working_dir): + dump_matrices(natures, matrices[i], key, dir_i) + + def load_matrices(self, natures, working_dir, key=None): + """Looks for the matrices for the specified ``natures`` in the + ``working_dir`` and loads them to a list. + + Parameters + ---------- + natures: list + the list of natures to load + working_dir: str, Path, list + the directory where the matrices are stored, or the list of + directories where the matrices are stored + key: str + the key for the matrices. If empty string, uses the key for this + current object + + Returns + ------- + list + the list of matrices for the specified ``natures``, or list of list + of matrices for the specifed ``natures``; if ``working_dir`` is + specified as a list of directories + raises + ------ + Exception + if the specified ``natures`` is ``None`` or empty. 
+ """ + if key is None: + key = self.key + if natures is None or len(natures) == 0: + raise ValueError("No entity natures specified!") + + logger.info(f'Loading {key} matrices from dumps...') + + if isinstance(working_dir, Path) or isinstance(working_dir, str): + return load_matrices_from_dumps(natures, key, working_dir) + + matrices_2D_all = [] + for dir_sub in working_dir: + matrices_2D_all.append( + load_matrices_from_dumps(natures, key, dir_sub) + ) + return matrices_2D_all + + def load_scores(self, natures, working_dir, suffix=""): + if isinstance(working_dir, Path) or isinstance(working_dir, str): + return load_scores(natures, working_dir, suffix) + + scores_all = [] + for dir_sub in working_dir: + scores_all.append(load_scores(natures, dir_sub, suffix)) + return scores_all + + def dump_scores(self, natures, scores, working_dir, suffix=""): + dump_scores(natures, scores, working_dir, suffix) + + def load_labels(self, natures, working_dir): + return load_labels(natures, working_dir) + + def dump_labels(self, natures, labels, working_dir): + dump_labels(natures, labels, working_dir) diff --git a/cartodata/pipeline/clustering.py b/cartodata/pipeline/clustering.py index f2707081e51a2ec97d40985b150c9d7563901397..b02ccf3136a3a3a7827009e59113414c7167c3f2 100644 --- a/cartodata/pipeline/clustering.py +++ b/cartodata/pipeline/clustering.py @@ -3,7 +3,7 @@ import logging from cartodata.clustering import ( create_kmeans_clusters, create_hdbscan_clusters ) -from cartodata.pipeline.base import BaseEstimator +from cartodata.pipeline.base import BaseProjection logger = logging.getLogger(__name__) @@ -16,7 +16,7 @@ def get_executor_clus(key): return HDBSCANClustering -class Clustering(BaseEstimator): +class Clustering(BaseProjection): def __init__(self, key, n, base_factor, natures, label_length, weight_name_length, random_state=42): @@ -33,6 +33,48 @@ class Clustering(BaseEstimator): # clustering self._add_to_params("base_factor") + def load_cluster_matrices(self, key_nD, key_2D, dir_clus): + clus_scores = self.load_scores(self.natures, dir_clus) + clus_nD = self.load_matrices(self.natures, dir_clus, key_nD) + clus_2D = self.load_matrices(self.natures, dir_clus, key_2D) + eval_pos = self.load_scores(self.natures, dir_clus, suffix="eval_pos") + eval_neg = self.load_scores(self.natures, dir_clus, suffix="eval_neg") + labels = self.load_labels(self.natures, dir_clus) + + return clus_nD, clus_2D, clus_scores, labels, eval_pos, eval_neg + + def load_execute(self, dataset, dir_mat, key_nD, dir_nD, key_2D, dir_2D, + dir_clus, dump=True, force=False, return_mat=False): + """Loads nD and 2D matrices and executes clustering on them. + If ``force`` is False, checks if the clustering matrices are already + generated. Loads and returns them if ``return_mat`` is True. + + Parameters + ---------- + """ + # if not force to regenerate matrices and the matrices already exist + if (not force and self.matrices_exist(self.natures, dir_clus) and + self.matrices_exist(self.natures, dir_clus)): + logger.info( + f"{self.key} matrices already exist, will not regenerate." 
+ ) + if return_mat: + self.load_cluster_matrices(key_nD, key_2D, dir_clus) + else: + return None + + natures = dataset.natures + matrices = dataset.load_matrices(natures, dir_mat) + scores = dataset.load_scores(natures, dir_mat) + matrices_nD = self.load_matrices(natures, dir_nD, key_nD) + matrices_2D = self.load_matrices(natures, dir_2D, key_2D) + + return self.create_clusters( + matrices, matrices_2D, matrices_nD, scores, + dataset.corpus_index, key_nD=key_nD, key_2D=key_2D, + dir_clus=dir_clus, dump=dump + ) + def _create_clusters(self, nb_clusters, clustering_table, naming_table, natural_space_naming_table, naming_scores, previous_cluster_labels, naming_profile_table=None, @@ -41,7 +83,8 @@ class Clustering(BaseEstimator): raise NotImplementedError() def create_clusters(self, matrices, matrices_2D, matrices_nD, scores, - words_index): + words_index, key_nD=None, key_2D=None, dir_clus=None, + dump=False): """ Parameters ---------- @@ -58,23 +101,27 @@ class Clustering(BaseEstimator): words_index: int Index of the entity that corresponds to text corpus in the list of matrices and scores. + dir_clus: str, path + the path to save the matrices and scores for clustering + dump: bool + boolean value to indicate if the matrices and scores generated for + clustering should be saved Returns ------- clus_nD: list of numpy.ndarray clus_2D: list of numpy.ndarray clus_scores: list of pandas.core.series.Series [float] - clus_eval_pos: list of pandas.core.series.Series [float] - clus_eval_neg: list of pandas.core.series.Series [float] - clus_labels: list of list - clus_nb_clusters: list + labels: list of list + eval_pos: list of pandas.core.series.Series [float] + eval_neg: list of pandas.core.series.Series [float] """ cluster_labels = [] clus_nD = [] clus_2D = [] clus_scores = [] - clus_eval_pos = [] - clus_eval_neg = [] + eval_pos = [] + eval_neg = [] clus_labels = [] for idx, nature in enumerate(self.natures): @@ -108,12 +155,21 @@ class Clustering(BaseEstimator): clus_nD.append(c_nD) clus_2D.append(c_2D) clus_scores.append(c_scores) - clus_eval_pos.append(cluster_eval_pos) - clus_eval_neg.append(cluster_eval_neg) + eval_pos.append(cluster_eval_pos) + eval_neg.append(cluster_eval_neg) clus_labels.append(labels_dict) - return (clus_nD, clus_2D, clus_scores, clus_labels, - clus_eval_pos, clus_eval_neg) + if dump: + self.dump_scores(self.natures, clus_scores, dir_clus) + self.dump_matrices(self.natures, clus_nD, dir_clus, key=key_nD) + self.dump_matrices(self.natures, clus_2D, dir_clus, key=key_2D) + self.dump_labels(self.natures, clus_labels, dir_clus) + self.dump_scores(self.natures, eval_pos, dir_clus, + suffix="eval_pos") + self.dump_scores(self.natures, eval_neg, dir_clus, + suffix="eval_neg") + + return clus_nD, clus_2D, clus_scores, clus_labels, eval_pos, eval_neg class KMeansClustering(Clustering): diff --git a/cartodata/pipeline/common.py b/cartodata/pipeline/common.py index 8ba6e25b677518f9b9c8f47ee71913ab8cf63392..3446020660249fdd130c6999dc4e327443e885b1 100644 --- a/cartodata/pipeline/common.py +++ b/cartodata/pipeline/common.py @@ -2,10 +2,6 @@ import logging from pathlib import Path from cartodata.exporting import Exporter -from cartodata.operations import ( - dump_matrices, dump_scores, load_matrices_from_dumps, load_scores, - dump_labels, load_labels -) from cartodata.plotting import plot_map, save_fig, close_plots logger = logging.getLogger(__name__) @@ -35,7 +31,6 @@ def get_pipeline(dataset, top_dir, input_dir, projection_nd=None, class Pipeline(): - yaml_tag = u'!Pipeline' def 
__init__(self, dataset=None, top_dir="./dumps", input_dir="./datas", @@ -69,15 +64,18 @@ class Pipeline(): if self.dataset is not None: self.dataset.input_dir = self.input_dir self.dataset.update_top_dir(self._top_dir) - working_dir = self.dataset.working_dir - if self.optimisation_seed > -1: - working_dir = working_dir / str(self.optimisation_seed) + self._set_workdir_from_dataset() + + def _set_workdir_from_dataset(self): + working_dir = self.dataset.working_dir + if self.optimisation_seed > -1: + working_dir = working_dir / str(self.optimisation_seed) - if self.hierarchical_dirs: - working_dir = ( - working_dir / self.dataset.params - ) - self.working_dir = working_dir + if self.hierarchical_dirs: + working_dir = ( + working_dir / self.dataset.params + ) + self.working_dir = working_dir @property def natures(self): @@ -114,39 +112,7 @@ class Pipeline(): self.dataset = dataset self.dataset.input_dir = self.input_dir dataset.update_top_dir(self._top_dir) - - self.working_dir = dataset.working_dir - - def _entity_matrices_exist(self, natures, matrix_type, working_dir=None, - force=False): - """Checks if entity matrices are already generated for specified - `matrix_type`. - - :param natures: list of natures to check. - :param matrix_type: type of entity matrix. - :param working_dir: the directory to look for the entity matrices - :param force: if True, forces to regenerate entity matrices. - :return: bool - True if matrices exist in the dump directory; False otherwise. - """ - - if natures is None or len(natures) == 0: - logger.warn("No entity natures specified!") - - if not force: - for nature in natures: - entity_files = working_dir.rglob( - f"{nature}_{matrix_type}.*" - ) - if len(list(entity_files)) == 0: - return False - - logger.info( - f"Matrices of type {matrix_type} for {', '.join(self.natures)}" - " already exist. Skipping creation." 
- ) - return True - return False + self._set_workdir_from_dataset() def set_projection_nd(self, projection_nd): self.projection_nd = projection_nd @@ -204,77 +170,6 @@ class Pipeline(): return obj_dir - def load_scores(self, natures, working_dir, suffix=""): - return load_scores(natures, working_dir, suffix) - - def dump_scores(self, natures, scores, working_dir, suffix=""): - dump_scores(natures, scores, working_dir, suffix) - - def load_matrices(self, natures, mat_type, working_dir): - return load_matrices_from_dumps(natures, mat_type, - working_dir) - - def dump_matrices(self, natures, entity_type, matrices, working_dir): - dump_matrices(natures, matrices, entity_type, working_dir) - - def dump_params(self, params, working_dir): - with open(working_dir / "params.txt", "w") as text_file: - text_file.write(params) - - def load_labels(self, natures, working_dir): - return load_labels(natures, working_dir) - - def dump_labels(self, natures, labels, working_dir): - dump_labels(natures, labels, working_dir) - - def load_entity_matrices(self, load_scores=True): - logger.info('Loading entity matrices from dumps...') - key = self.dataset.key - current_dir = self.working_dir - - matrices = self.load_matrices(self.natures, key, current_dir) - - if load_scores: - scores = self.load_scores(self.natures, current_dir) - return matrices, scores - - return matrices - - def load_nD_matrices(self): - key = self.projection_nd.key - current_dir = self.get_nD_dir() - - logger.info(f'Loading {key} matrices from dumps...') - - return self.load_matrices(self.natures, key, current_dir) - - def load_2D_matrices(self): - key = self.projection_2d.key - current_dir = self.get_2D_dir() - - logger.info(f'Loading {key} matrices from dumps...') - - return self.load_matrices(self.natures, key, current_dir) - - def load_cluster_matrices(self): - cluster_natures = self.clustering.natures - current_dir = self.get_clus_dir() - - clus_scores = self.load_scores(cluster_natures, current_dir) - clus_nD = self.load_matrices(cluster_natures, self.projection_nd.key, - current_dir) - clus_2D = self.load_matrices(cluster_natures, self.projection_2d.key, - current_dir) - - clus_eval_pos = self.load_scores(cluster_natures, current_dir, - suffix="eval_pos") - clus_eval_neg = self.load_scores(cluster_natures, current_dir, - suffix="eval_neg") - labels = self.load_labels(cluster_natures, current_dir) - - return (clus_nD, clus_2D, clus_scores, labels, - clus_eval_pos, clus_eval_neg) - def title_parts_2D(self): title_parts = [self.dataset.name, self.dataset.version, @@ -289,7 +184,7 @@ class Pipeline(): title_parts.extend([self.clustering.key, nature]) return title_parts - def run(self, save_plots=False, force=False): + def run(self, save_plots=True, dump=True, force=False): """Generates entity matrices, makes n-dimesional and 2-dimensional projection, creates clusters, neighbors and saves to export.feather file. 
@@ -300,16 +195,16 @@ class Pipeline(): self.generate_entity_matrices(return_mat=False, force=force) # executes projection if not already executed and saves the results - self.do_projection_nD(return_mat=False, force=force) + self.do_projection_nD(dump=dump, return_mat=False, force=force) # find neighbors self.find_neighbors() # executes 2D projection if not already executed and saves the results - self.do_projection_2D(return_mat=False, force=force) + self.do_projection_2D(dump=dump, return_mat=False, force=force) # creates clusters - self.create_clusters(return_mat=False, force=force) + self.do_clustering(dump=dump, return_mat=False, force=force) if save_plots: self.save_plots() @@ -318,160 +213,38 @@ class Pipeline(): self.export() def generate_entity_matrices(self, return_mat=True, force=False): - key = self.dataset.key - - current_dir = self.working_dir - - # check if entity matrices are already generated - if self._entity_matrices_exist(self.natures, key, - working_dir=current_dir, - force=force): - logger.info('Entity matrices already exist. Will not regenerate!') - if return_mat: - return self.load_entity_matrices() - else: - logger.info('Generating entity matrices') - matrices, scores = self.dataset.create_matrices_and_scores() - logger.info('Entity matrices generated.') - - self.dump_scores(self.natures, scores, current_dir) - self.dump_matrices(self.natures, key, matrices, current_dir) - logger.info(f'Entity matrices dumped in {current_dir}.') - - if return_mat: - return matrices, scores - - def do_projection_nD(self, return_mat=True, force=False): - key = self.projection_nd.key - current_dir = self.get_nD_dir() - - # check if nD matrices already exists - if self._entity_matrices_exist(self.natures, key, - working_dir=current_dir, - force=force): - logger.info(f'{key} matrices already exist. Will not regenerate!') - if return_mat: - return self.load_nD_matrices() - else: - matrices = self.load_entity_matrices(load_scores=False) - - logger.info(f'Generating {key} matrices.') - projection_nD_matrices = self.projection_nd.execute( - matrices, self.dataset, current_dir - ) - logger.info(f'{key} matrices generated.') - - self.dump_matrices(self.natures, key, projection_nD_matrices, - current_dir) - logger.info(f'{key} matrices dumped in {current_dir}.') - - if return_mat: - return projection_nD_matrices - - def do_projection_2D(self, return_mat=True, force=False): - key = self.projection_2d.key - current_dir = self.get_2D_dir() - - # check if 2D matrices already exists - if self._entity_matrices_exist(self.natures, key, - working_dir=current_dir, - force=force): - logger.info(f'{key} already exist. Will not regenerate!') - if return_mat: - return self.load_2D_matrices() - else: - matrices_nD = self.load_nD_matrices() - - logger.info(f'Generating {key} matrices') - projection_2d_matrices = self.projection_2d.execute(matrices_nD, - current_dir) - logger.info(f'{key} matrices generated.') - - self.dump_matrices(self.natures, key, projection_2d_matrices, - current_dir) - logger.info(f'{key} matrices dumped in {current_dir}.') - - if return_mat: - return projection_2d_matrices - - def create_clusters(self, return_mat=True, force=False): - """ - Create a correspondance matrix for nb_clusters clusters. 
- - :param matrices: - :param matrices_2D: - :param matrices_nD: - :param scores: - :return: - """ - cluster_natures = self.clustering.natures - key = self.projection_2d.key - current_dir = self.get_clus_dir() - - # check if cus_nD and clus_2D matrices already exists - if (self._entity_matrices_exist(cluster_natures, - self.projection_2d.key, - working_dir=current_dir, - force=force) and - self._entity_matrices_exist(cluster_natures, - self.projection_nd.key, - working_dir=current_dir, - force=force)): - logger.info( - f'{key} clustering matrices already exist. Will not ' - 'regenerate!' - ) - if return_mat: - return self.load_cluster_matrices() - else: - matrices, scores = self.load_entity_matrices(load_scores=True) - matrices_nD = self.load_nD_matrices() - matrices_2D = self.load_2D_matrices() - - (clus_nD, clus_2D, clus_scores, - cluster_labels, cluster_eval_pos, - cluster_eval_neg) = self.clustering.create_clusters( - matrices, matrices_2D, matrices_nD, scores, - self.dataset.corpus_index - ) + return self.dataset.generate_entity_matrices( + dir_mat=self.working_dir, force=force, dump=True, + return_mat=return_mat + ) - self.dump_scores(cluster_natures, clus_scores, current_dir) - self.dump_matrices(cluster_natures, self.projection_nd.key, - clus_nD, current_dir) - self.dump_matrices(cluster_natures, self.projection_2d.key, - clus_2D, current_dir) + def do_projection_nD(self, dump=True, return_mat=True, force=False): + return self.projection_nd.load_execute( + self.dataset, dir_mat=self.working_dir, dir_nD=self.get_nD_dir(), + dump=dump, force=force, return_mat=return_mat + ) - self.dump_scores(cluster_natures, cluster_eval_pos, current_dir, - suffix="eval_pos") - self.dump_scores(cluster_natures, cluster_eval_neg, current_dir, - suffix="eval_neg") + def do_projection_2D(self, dump=True, return_mat=True, force=False): - self.dump_labels(cluster_natures, cluster_labels, current_dir) + return self.projection_2d.load_execute( + self.dataset.natures, self.projection_nd.key, self.get_nD_dir(), + self.get_2D_dir(), dump=dump, force=force, return_mat=return_mat + ) - if return_mat: - return ( - clus_nD, clus_2D, clus_scores, cluster_labels, - cluster_eval_pos, cluster_eval_neg - ) + def do_clustering(self, dump=True, return_mat=True, force=False): + return self.clustering.load_execute( + self.dataset, self.working_dir, self.projection_nd.key, + self.get_nD_dir(), self.projection_2d.key, self.get_2D_dir(), + self.get_clus_dir(), dump=dump, force=force, return_mat=return_mat + ) - def find_neighbors(self, natures=None): + def find_neighbors(self, natures=None, force=False): if natures is None: natures = self.natures - dumps_dir = self.get_neighbors_dir() - - for neighbors_nature in natures: - for nature in natures: - if not self.neighboring.get_neighbors_path( - neighbors_nature, nature, dumps_dir).exists(): - - matrices_nD = self.load_nD_matrices() - scores = self.load_scores(natures, self.working_dir) - - # this already dumps the matrices - self.neighboring.get_neighbors( - natures, matrices_nD, scores, dumps_dir - ) - return + return self.neighboring.load_execute( + natures, self.working_dir, self.projection_nd.key, + self.get_nD_dir(), self.get_neighbors_dir(), force=force + ) def export(self, export_natures=None): if export_natures is not None: @@ -499,7 +272,9 @@ class Pipeline(): annotation_mat, annotation_color) def save_2D_plots(self, name_suffix="", file_ext=".png"): - matrices_2D = self.load_2D_matrices() + dir_2D = self.get_2D_dir() + matrices_2D = 
self.projection_2d.load_matrices(self.natures, dir_2D) + labels = tuple(self.natures) title_parts = self.title_parts_2D() if name_suffix != "": @@ -511,7 +286,7 @@ class Pipeline(): matrices_2D, labels, title=fig_title ) filename = "_".join(title_parts) + file_ext - self.save_plot(fig, filename, working_dir=self.get_2D_dir()) + self.save_plot(fig, filename, working_dir=dir_2D) return (matrices_2D, (fig, filename)) @@ -519,11 +294,15 @@ class Pipeline(): if save_2D: matrices_2D, _ = self.save_2D_plots(name_suffix, file_ext) else: - matrices_2D = self.load_2D_matrices() + matrices_2D = self.projection_2d.load_matrices(self.natures, + self.get_2D_dir()) labels = tuple(self.natures) - _, clus_2D, clus_scores, _, _, _ = self.load_cluster_matrices() - current_dir = self.get_clus_dir() + dir_clus = self.get_clus_dir() + (_, clus_2D, clus_scores, + _, _, _) = self.clustering.load_cluster_matrices( + self.projection_nd.key, self.projection_2d.key, dir_clus + ) figs = [] for i, nature in enumerate(self.clustering.natures): @@ -541,7 +320,7 @@ class Pipeline(): matrices_2D, labels, title=fig_title, annotations=clus_scores_i.index, annotation_mat=clus_mat_i ) - self.save_plot(fig_i, filename, working_dir=current_dir) + self.save_plot(fig_i, filename, working_dir=dir_clus) figs.append((fig_i, filename)) @@ -607,7 +386,7 @@ class AlignedPipeline(Pipeline): for i in range(self.dataset.slice_count): self.set_current_slice(i) matrices_i, scores_i = super().generate_entity_matrices( - force=force + return_mat=return_mat, force=force ) if return_mat: matrices_all.append(matrices_i) @@ -618,14 +397,14 @@ class AlignedPipeline(Pipeline): if return_mat: return matrices_all, scores_all - def do_projection_nD(self, return_mat=True, force=False): + def do_projection_nD(self, dump=True, return_mat=True, force=False): matrices_nD_all = [] for i in range(self.dataset.slice_count): self.set_current_slice(i) matrices_nD_i = super().do_projection_nD( - return_mat=return_mat, force=force + dump=dump, return_mat=return_mat, force=force ) matrices_nD_all.append(matrices_nD_i) @@ -634,106 +413,34 @@ class AlignedPipeline(Pipeline): if return_mat: return matrices_nD_all - def do_projection_2D(self, return_mat=True, prev_version=None, force=False, - **kwargs): + def do_projection_2D(self, prev_version=None, dump=True, return_mat=True, + force=False, **kwargs): """ """ if prev_version is None: prev_version = self.prev_version - key = f"{self.projection_2d.key}" - matrices_2D_all = [] - # check if 2D processing has already been executed - for i in range(self.dataset.slice_count): - self.set_current_slice(i) - current_dir = self.get_2D_dir() - - if self._entity_matrices_exist(self.natures, key, - working_dir=current_dir, - force=force): - logger.info(f'Loading {key} matrices from dumps...') - projection_2D_matrices = self.load_2D_matrices() - matrices_2D_all.append(projection_2D_matrices) - - self.working_dir = self.main_working_dir - - # if 2D processing has already been executed, return results - if len(matrices_2D_all) > 0: - # load reducer from previous run, if 2D projection has already been - # executed - self.projection_2d.load_reducer(self.working_dir) - if return_mat: - self.set_current_slice(len(matrices_2D_all) - 1) - final_scores = super().load_scores(self.natures, - self.working_dir) - return matrices_2D_all, final_scores - - # if there are no dumps of 2D processing - logger.info(f'Generating {key} matrices...') - - matrices_nD_all = [] - scores_all = [] + dirs_mat = [] + dirs_nD = [] + dirs_2D = [] for i in 
range(self.dataset.slice_count): self.set_current_slice(i) - scores_i = super().load_scores(self.natures, self.working_dir) - matrices_nD_i = super().load_nD_matrices() - matrices_nD_all.append(matrices_nD_i) - scores_all.append(scores_i) + dirs_mat.append(self.working_dir) + dirs_nD.append(self.get_nD_dir()) + dirs_2D.append(self.get_2D_dir()) self.working_dir = self.main_working_dir - if prev_version is not None: - # set previous version directory to load reducer from that - # directory. For an update with existing - # reducer prev_version should be set to None - prev_dir = Path(str(self.working_dir).replace(self.dataset.version, - prev_version)) - - scores_prev = self.load_scores(self.natures, prev_dir) - - # this persists the reducer - matrices_2D_all = self.projection_2d.update( - matrices_nD_all, scores_all, scores_prev, self.working_dir, - prev_dir, **kwargs - ) - else: - # this persists the reducer - matrices_2D_all = self.projection_2d.execute( - matrices_nD_all, scores_all, self.working_dir - ) - - logger.info(f'{key} matrices generated.') - - for i in range(self.dataset.slice_count): - self.set_current_slice(i) - current_dir = self.get_2D_dir() - - self.dump_matrices(self.natures, key, matrices_2D_all[i], - working_dir=current_dir) - - if i == self.dataset.slice_count - 1: - final_scores = self.load_scores(self.natures, self.working_dir) - - self.working_dir = self.main_working_dir - - self.dump_scores(self.natures, final_scores, self.working_dir) - - if return_mat: - return matrices_2D_all, final_scores + return self.projection_2d.load_execute( + self.dataset.natures, dirs_mat, self.projection_nd.key, dirs_nD, + dirs_2D, self.main_working_dir, + current_version=self.dataset.version, prev_version=prev_version, + dump=dump, force=force, return_mat=return_mat, **kwargs + ) - def create_clusters(self, return_mat=True, force=False): - """ - Create a correspondance matrix for nb_clusters clusters. Clusters are - created using the KMeans algorithm. 
- - :param matrices: - :param matrices_2D: - :param matrices_nD: - :param scores: - :return: - """ + def do_clustering(self, dump=True, return_mat=True, force=False): clus_nD_all = [] clus_2D_all = [] clus_scores_all = [] @@ -746,8 +453,8 @@ class AlignedPipeline(Pipeline): (clus_nD, clus_2D, clus_scores, clus_labels, clus_eval_pos, - clus_eval_neg) = super().create_clusters( - force=force + clus_eval_neg) = super().do_clustering( + dump=dump, return_mat=return_mat, force=force ) if return_mat: @@ -817,3 +524,191 @@ class AlignedPipeline(Pipeline): export_file_link = (self.working_dir / "export.feather") export_file_link.unlink(missing_ok=True) export_file_link.symlink_to(target) + + +class GuidedAlignedPipeline(AlignedPipeline): + + yaml_tag = u'!GuidedAlignedPipeline' + + def __init__(self, dataset=None, top_dir='dumps', input_dir="datas", + prev_version=None, projection_nd=None, projection_2d=None, + clustering=None, neighboring=None, export_natures=None, + hierarchical_dirs=False, optimisation_seed=-1): + + super().__init__(dataset, top_dir, input_dir, prev_version, + projection_nd, projection_2d, clustering, neighboring, + export_natures, hierarchical_dirs, optimisation_seed) + + def get_guided_dir(self): + dir_guided = self.working_dir / "final" + dir_guided.mkdir(parents=True, exist_ok=True) + return dir_guided + + def get_guided_nD_dir(self): + obj_dir = self.get_guided_dir() + + if self.hierarchical_dirs: + obj_dir = ( + obj_dir / self.projection_nd.params + ) + obj_dir.mkdir(parents=True, exist_ok=True) + + return obj_dir + + def get_guided_neighbors_dir(self): + obj_dir = self.get_guided_nD_dir() + + if self.hierarchical_dirs: + obj_dir = ( + obj_dir / self.neighboring.params + ) + obj_dir.mkdir(parents=True, exist_ok=True) + + return obj_dir + + def get_guided_2D_dir(self): + obj_dir = self.get_guided_nD_dir() + + if self.hierarchical_dirs: + obj_dir = ( + obj_dir / self.projection_2d.params + ) + obj_dir.mkdir(parents=True, exist_ok=True) + + return obj_dir + + def get_guided_clus_dir(self): + obj_dir = self.get_guided_2D_dir() + + if self.hierarchical_dirs: + obj_dir = ( + obj_dir / self.clustering.params + ) + obj_dir.mkdir(parents=True, exist_ok=True) + + return obj_dir + + def generate_entity_matrices(self, return_mat=True, force=False): + + matrices_all, scores_all = super().generate_entity_matrices( + return_mat=True, force=force + ) + self.dataset.dump_matrices(self.dataset.natures, matrices_all[-1], + self.get_guided_dir()) + self.dataset.dump_scores(self.dataset.natures, scores_all[-1], + self.get_guided_dir()) + if return_mat: + return matrices_all, scores_all + + def do_projection_nD(self, dump=True, return_mat=True, force=False): + + matrices_nD_all = super().do_projection_nD(dump=dump, return_mat=True, + force=force) + + self.projection_nd.dump_matrices(self.dataset.natures, + matrices_nD_all[-1], + self.get_guided_nD_dir()) + if return_mat: + return matrices_nD_all + + def find_neighbors(self, natures=None, force=False): + if natures is None: + natures = self.natures + return self.neighboring.load_execute( + natures, self.get_guided_dir(), self.projection_nd.key, + self.get_guided_nD_dir(), self.get_guided_neighbors_dir(), + force=force + ) + + def do_projection_2D(self, prev_version=None, dump=True, return_mat=True, + force=False, **kwargs): + if prev_version is None: + prev_version = self.prev_version + + dirs_mat = [] + dirs_nD = [] + dirs_2D = [] + + for i in range(self.dataset.slice_count): + self.set_current_slice(i) + + dirs_mat.append(self.working_dir) + 
dirs_nD.append(self.get_nD_dir()) + dirs_2D.append(self.get_2D_dir()) + + self.working_dir = self.main_working_dir + + return self.projection_2d.load_execute( + self.dataset.natures, dirs_mat, self.projection_nd.key, dirs_nD, + dirs_2D, self.dataset.working_dir, self.get_guided_nD_dir(), + self.get_guided_2D_dir(), current_version=self.dataset.version, + prev_version=prev_version, dump=dump, force=force, + return_mat=return_mat, **kwargs + ) + + def do_clustering(self, dump=True, return_mat=True, force=False): + return self.clustering.load_execute( + self.dataset, self.get_guided_dir(), self.projection_nd.key, + self.get_guided_nD_dir(), self.projection_2d.key, + self.get_guided_2D_dir(), self.get_guided_clus_dir(), + dump=dump, force=force, return_mat=return_mat + ) + + def save_2D_plots(self, name_suffix="", file_ext=".png"): + dir_2D = self.get_guided_2D_dir() + matrices_2D = self.projection_2d.load_matrices( + self.natures, dir_2D, key=self.projection_2d.key + ) + + labels = tuple(self.natures) + title_parts = self.title_parts_2D() + if name_suffix != "": + title_parts.append(name_suffix) + + fig_title = " ".join(title_parts) + + fig, ax = self.plot_map( + matrices_2D, labels, title=fig_title + ) + filename = "_".join(title_parts) + file_ext + self.save_plot(fig, filename, working_dir=dir_2D) + + return (matrices_2D, (fig, filename)) + + def save_plots(self, name_suffix="", file_ext=".png", save_2D=True): + if save_2D: + matrices_2D, _ = self.save_2D_plots(name_suffix, file_ext) + else: + matrices_2D = self.projection_2d.load_matrices( + self.natures, self.get_guided_2D_dir(), + key=self.projection_2d.key + ) + + labels = tuple(self.natures) + dir_clus = self.get_guided_clus_dir() + (_, clus_2D, clus_scores, + _, _, _) = self.clustering.load_cluster_matrices( + self.projection_nd.key, self.projection_2d.key, dir_clus + ) + + figs = [] + for i, nature in enumerate(self.clustering.natures): + clus_scores_i = clus_scores[i] + clus_mat_i = clus_2D[i] + + title_parts = self.title_parts_clus(nature) + if name_suffix != "": + title_parts.append(name_suffix) + + fig_title = " ".join(title_parts) + filename = "_".join(title_parts) + file_ext + + fig_i, ax_i = self.plot_map( + matrices_2D, labels, title=fig_title, + annotations=clus_scores_i.index, annotation_mat=clus_mat_i + ) + self.save_plot(fig_i, filename, working_dir=dir_clus) + + figs.append((fig_i, filename)) + + return figs diff --git a/cartodata/pipeline/datasets.py b/cartodata/pipeline/datasets.py index ff99569f9e0623adf7416787c588aa157a1e3998..50133b5e62fac1cb1106cacff3a7744a009921a9 100644 --- a/cartodata/pipeline/datasets.py +++ b/cartodata/pipeline/datasets.py @@ -6,12 +6,13 @@ from download import download import pandas as pd from shutil import copy +from cartodata.pipeline.base import BaseProjection from cartodata.pipeline.columns import Columns logger = logging.getLogger(__name__) -class Dataset(): +class Dataset(BaseProjection): """A dataset class that holds the dataset to be processed in a pandas.DataFrame. 
@@ -249,6 +250,30 @@ class Dataset(): elif column.type is Columns.CORPUS: self.corpus_index = i + def generate_entity_matrices(self, dir_mat, force=False, dump=True, + return_mat=True): + logger.info("Generating entity matrices...") + + matrices = None + scores = None + + # if not force to regenerate matrices and the matrices already exist + if not force and self.matrices_exist(self.natures, dir_mat): + logger.info("Matrices already exist, will not regenerate.") + if return_mat: + matrices = self.load_matrices(self.natures, dir_mat) + scores = self.load_scores(self.natures, dir_mat) + + else: + matrices, scores = self.create_matrices_and_scores() + logger.info('Entity matrices generated.') + + if dump: + self.dump_scores(self.natures, scores, dir_mat) + self.dump_matrices(self.natures, matrices, dir_mat) + + return matrices, scores + def create_matrices_and_scores(self): """Creates matrices and scores for each entity specified by the columns of the dataset. @@ -397,7 +422,9 @@ class FileDataset(Dataset): (self.input_dir / self._filename).exists() ) ), ( - "The file does not exist locally, please specify fileurl.") + f"The file {self.working_dir.absolute() / self._filename} " + "does not exist locally, please specify fileurl." + ) class CSVDataset(FileDataset): @@ -672,6 +699,8 @@ class SliceDataset(Dataset): def create_slices(self, slice_count, slice_type="cumulative", overlap=0, sort_asc=None): + if self._index == -1: + self._load() assert slice_count > 0, ( "Please specify a slice_count greater than 0!" ) diff --git a/cartodata/pipeline/experiment.py b/cartodata/pipeline/experiment.py index fbf375826ea312de7137bd54e225bcd71a63c207..cf09b4f1da137861e8e3444ea8e331039f213a17 100644 --- a/cartodata/pipeline/experiment.py +++ b/cartodata/pipeline/experiment.py @@ -107,7 +107,7 @@ class PipelineExperiment(BaseExperiment): logger.info("Finished running 2D scoring.") # run clustering and save scores - pipeline.create_clusters() + pipeline.do_clustering() plots = pipeline.save_plots(save_2D=False) if pipeline.is_aligned: plots = plots[last_slice_index] diff --git a/cartodata/pipeline/exporting.py b/cartodata/pipeline/exporting.py index 3df45961fa2f24fdca48bab8c53467ca11e6f0a2..04ce39e6ce892f77eb117e46406ac39196e2511f 100644 --- a/cartodata/pipeline/exporting.py +++ b/cartodata/pipeline/exporting.py @@ -68,7 +68,7 @@ class EntityMetadataMapColumn(MetadataColumn): df = pipeline.data matrix_dir = pipeline.working_dir - matrix = pipeline.load_matrices([self.entity], 'mat', matrix_dir)[0] + matrix = pipeline.dataset.load_matrices([self.entity], matrix_dir)[0] rows, cols = matrix.T.nonzero() col = 0 row = 0 diff --git a/cartodata/pipeline/interpolator.py b/cartodata/pipeline/interpolator.py new file mode 100644 index 0000000000000000000000000000000000000000..1e0b0581b23c5f31716813e1660a0495b1700263 --- /dev/null +++ b/cartodata/pipeline/interpolator.py @@ -0,0 +1,35 @@ +import numpy as np +from sklearn.neighbors import KNeighborsRegressor + +from cartodata.pipeline.base import BaseProjection +from cartodata.utils import ( + hstack_transpose_matrices, decompose_transpose_matrix +) + + +class KNeighborsInterpolator(BaseProjection): + + def __init__(self, key="kneighbors_int", metric="euclidean", + n_neighbors=2): + super().__init__(key) + + self.metric = metric + self.n_neighbors = n_neighbors + + self.regressor = KNeighborsRegressor(metric=metric, n_neighbors=2) + + self._add_to_params(["metric", "n_neighbors"]) + + def fit(self, X_train, y_train): + X_train_T, row_counts = 
hstack_transpose_matrices(X_train) + y_train_T = np.hstack(y_train).T + + self.regressor.fit(X_train_T, y_train_T) + + def predict(self, X): + X_T, row_counts = hstack_transpose_matrices(X) + embedding = self.regressor.predict(X_T) + + matrices_2D = decompose_transpose_matrix(embedding, row_counts) + + return matrices_2D diff --git a/cartodata/pipeline/neighbors.py b/cartodata/pipeline/neighbors.py index 331c8763b19cf5198ced742318a20f5dc8c28c7a..b139aecaadeb846c8b7823eebe6385b696c89c7e 100644 --- a/cartodata/pipeline/neighbors.py +++ b/cartodata/pipeline/neighbors.py @@ -1,5 +1,5 @@ from cartodata.neighbors import get_neighbors, get_neighbors_path -from cartodata.pipeline.base import BaseEstimator +from cartodata.pipeline.base import BaseProjection def get_executor_nbr(key): @@ -9,7 +9,7 @@ def get_executor_nbr(key): raise NotImplementedError() -class AllNeighbors(BaseEstimator): +class AllNeighbors(BaseProjection): yaml_tag = u'!AllNeighbors' @@ -18,21 +18,39 @@ class AllNeighbors(BaseEstimator): "power_scores" ] - def __init__(self, n_neighbors=10, power_scores=[0, 0.5, 0.5, 0, 0]): + def __init__(self, n_neighbors=10, power_scores=[0, 0.5, 0.5, 0, 0], + num_threads=8): super().__init__("neighbors") self.n_neighbors = n_neighbors self.power_scores = power_scores + self.num_threads = num_threads self._add_to_params(AllNeighbors.ALL_PARAMS) + def load_execute(self, natures, dir_mat, key_nD, dir_nD, dir_neighbors, + force=False): + exists = True + + if not force: + for neighbors_nature in natures: + for nature in natures: + if not self.get_neighbors_path( + neighbors_nature, nature, dir_neighbors).exists(): + exists = False + + if not exists or force: + scores = self.load_scores(natures, dir_mat) + matrices_nD = self.load_matrices(natures, dir_nD, key=key_nD) + self.get_neighbors(natures, matrices_nD, scores, dir_neighbors) + def get_neighbors(self, natures, matrices_nd, scores, dump_dir): for idx in range(len(matrices_nd)): get_neighbors( matrices_nd[idx], scores[idx], matrices_nd, power_score=self.power_scores[idx], dump_dir=dump_dir, neighbors_nature=natures[idx], natures=natures, - n_neighbors=self.n_neighbors + n_neighbors=self.n_neighbors, num_threads=self.num_threads ) def get_neighbors_path(self, neighbors_nature, nature, dump_dir): diff --git a/cartodata/pipeline/projection2d.py b/cartodata/pipeline/projection2d.py index a73bdbafd716a2c94d7ad041a0f3625ea781b219..737a481c278c8035bfa1878bb2f277e10c2086a0 100644 --- a/cartodata/pipeline/projection2d.py +++ b/cartodata/pipeline/projection2d.py @@ -1,20 +1,27 @@ import logging +from pathlib import Path from pickle import dump, load -import numpy as np from numba.typed import List import pandas as pd from sklearn.manifold import TSNE from umap import UMAP from umap.aligned_umap import AlignedUMAP -from cartodata.pipeline.base import BaseEstimator +from cartodata.pipeline.base import BaseProjection +from cartodata.pipeline.interpolator import KNeighborsInterpolator from cartodata.projection import ( - guided_umap_projection, indirect_umap_projection + indirect_umap_projection +) +from cartodata.utils import ( + hstack_transpose_matrices, decompose_transpose_matrix, make_relation, + digest ) logger = logging.getLogger(__name__) +INIT = "random" + def get_executor_2D(key): if key == "umap": @@ -25,9 +32,13 @@ def get_executor_2D(key): return TSNEProjection elif key == "guided_umap": return GuidedUMAPProjection + elif key == "aligned_guided_umap": + return AlignedGuidedUMAPProjection + elif key == "aligned_kneighbors": + return 
AlignedKNeighborsProjection -class Projection2D(BaseEstimator): +class Projection2D(BaseProjection): def __init__(self, key): super().__init__(key) @@ -36,6 +47,126 @@ class Projection2D(BaseEstimator): def is_aligned(self): return False + def digest(self, natures): + initial = [self.params] + initial.extend(natures) + return digest("_".join(initial)) + + def load_execute(self, natures, key_nD, dir_nD, dir_2D, dump=False, + force=False, return_mat=False): + """Loads nD matrices and executes 2-dimensional projection. + If ``force`` is False, checks if the matrices are already + generated. Loads and returns them if ``return_mat`` is True. + + Parameters + ---------- + natures: list of str + the list of natures for the matrices to load + key_nD: str + the key for projection-nD + dir_nD: str, pathlib.Path + the path where the nD matrices are saved + dir_2D: str, pathlib.Path + the path where the generated matrices should be saved + dump: bool + boolean value to indicate if the generated matrices should be saved + force: bool + boolean value to indicate if the matrices should be regenerated or + loaded from existing. + return_mat: bool + boolean value to indicate if the projected matrices should be + returned from the method + + Returns + ------- + list of matrices of shape (2, entity_count) if ``return_mat`` is + True; None otherwise + """ + + # if not force to regenerate matrices and the matrices already exist + if not force and self.matrices_exist(natures, dir_2D): + logger.info( + f"{self.key} matrices already exist, will not regenerate." + ) + if return_mat: + return self.load_matrices(natures, dir_2D) + else: + return None + + # if the matrices do not already exist or force=True + matrices_nD = self.load_matrices(natures, dir_nD, key_nD) + + return self.execute(matrices_nD, dir_2D, natures=natures, dump=dump) + + def execute(self, matrices_nD, dir_2D=None, natures=None, dump=False): + """Executes 2D-projection on the specifed ``matrices`` reducing the + number of features of each matrix to 2. + This works only if the combined number of documents for all matrices + is ~ < 2million. Otherwise, use the indirect_umap_projection method. + + Parameters + ---------- + matrices_nD: list of numpy.ndarray + list of matrices of shape (num_dims, entity_count) to project to 2D + dir_2D: str, Path + the directory path where the generated 2D matrices should be saved + natures: list of str + the list of natures to be used for dump names + dump: bool + the boolean value to indicate if the matrices should be saved + + Returns + ------- + list of numpy.ndarray + list of matrices of shape (2, entity_count) + """ + logger.info(f'Starting {self.key} projection...') + + matrices_2D = self.fit_transform(matrices_nD) + + if dump: + assert natures is not None and dir_2D is not None, ( + "To save generated matrices natures and directory should be " + "specifed!" 
+ ) + self.dump_matrices(natures, matrices_2D, dir_2D) + + logger.info(f'Finished {self.key} projection.') + + return matrices_2D + + def fit(self, matrices_nD, matrices_2D=None): + matrices_nD_T, _ = hstack_transpose_matrices(matrices_nD) + if matrices_2D is not None: + matrices_2D_T, _ = hstack_transpose_matrices(matrices_2D) + else: + matrices_2D_T = None + self.reducer.fit(matrices_nD_T, matrices_2D_T) + + def transform(self, matrices_nD): + matrices_2D = [] + + for matrix in matrices_nD: + projected_matrix = self.reducer.transform(matrix.T) + matrices_2D.append(projected_matrix.T) + + return matrices_2D + + def fit_transform(self, matrices_nD): + global_matrix, row_counts = hstack_transpose_matrices(matrices_nD) + + embedding = self.reducer.fit_transform(global_matrix) + + matrices_2D = decompose_transpose_matrix(embedding, row_counts) + + return matrices_2D + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + class UMAPProjection(Projection2D): @@ -43,54 +174,28 @@ class UMAPProjection(Projection2D): ALL_PARAMS = [ "n_neighbors", - "n_components", + "min_dist", "metric", + "init", "n_epochs", "learning_rate", - "init", - "min_dist", - "repulsion_strength", - "random_state", - "n_jobs", + "random_state" ] - def __init__(self, n_neighbors=15, n_components=2, metric="euclidean", - metric_kwds=None, output_metric='euclidean', - output_metric_kwds=None, n_epochs=None, learning_rate=1.0, - init='spectral', min_dist=0.1, spread=1.0, low_memory=True, - n_jobs=-1, set_op_mix_ratio=1.0, local_connectivity=1.0, - repulsion_strength=1.0, negative_sample_rate=5, - transform_queue_size=4.0, a=None, b=None, random_state=None, - angular_rp_forest=False, target_n_neighbors=-1, - target_metric='categorical', target_metric_kwds=None, - target_weight=0.5, transform_seed=42, - transform_mode='embedding', - force_approximation_algorithm=False, verbose=False, - tqdm_kwds=None, unique=False, densmap=False, dens_lambda=2.0, - dens_frac=0.3, dens_var_shift=0.1, output_dens=False, - precomputed_knn=(None, None, None)): - super().__init__("umap") + def __init__(self, key="umap", n_neighbors=15, min_dist=0.1, + metric="euclidean", n_epochs=None, learning_rate=1.0, + init=INIT, low_memory=True, n_jobs=-1, random_state=None, + target_metric='categorical', transform_mode='embedding', + verbose=False): + super().__init__(key) self.reducer = UMAP( - n_neighbors=n_neighbors, n_components=n_components, metric=metric, - metric_kwds=metric_kwds, output_metric=output_metric, - output_metric_kwds=output_metric_kwds, n_epochs=n_epochs, - learning_rate=learning_rate, init=init, min_dist=min_dist, - spread=spread, low_memory=low_memory, n_jobs=n_jobs, - set_op_mix_ratio=set_op_mix_ratio, - local_connectivity=local_connectivity, - repulsion_strength=repulsion_strength, - negative_sample_rate=negative_sample_rate, - transform_queue_size=transform_queue_size, a=a, b=b, - random_state=random_state, angular_rp_forest=angular_rp_forest, - target_n_neighbors=target_n_neighbors, target_metric=target_metric, - target_metric_kwds=target_metric_kwds, target_weight=target_weight, - transform_seed=transform_seed, transform_mode=transform_mode, - force_approximation_algorithm=force_approximation_algorithm, - verbose=verbose, tqdm_kwds=tqdm_kwds, unique=unique, - densmap=densmap, dens_lambda=dens_lambda, dens_frac=dens_frac, - dens_var_shift=dens_var_shift, output_dens=output_dens, - precomputed_knn=precomputed_knn + n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, + metric=metric, 
n_epochs=n_epochs, learning_rate=learning_rate, + init=init, low_memory=low_memory, n_jobs=n_jobs, + random_state=random_state, target_metric=target_metric, + transform_seed=random_state, transform_mode=transform_mode, + verbose=verbose ) if isinstance(init, list): @@ -101,45 +206,17 @@ class UMAPProjection(Projection2D): self.metric = metric self.init = init self.learning_rate = learning_rate - self.repulsion_strength = repulsion_strength - - # TODO if other parameters are to be used, this should be updated - self._add_to_params(["n_neighbors", "min_dist", "metric", "init", - "learning_rate", "repulsion_strength"]) - - def execute(self, matrices, dump_dir): - """Apply the UMAP fit_transform algorithm on all the matrices, reducing - the number of features of each matrix to 2. - This works only if the combined number of documents for all matrices - is ~ < 2million. Otherwise, use the indirect_umap_projection method. - - :param matrices: - :param n_neighbors: - :param min_dist: - :return: - """ - logger.info('Starting UMAP projection') - - shapes = [matrix.shape for matrix in matrices] - global_matrix = np.asarray(np.hstack(matrices).T) + self.n_epochs = n_epochs + self.random_state = random_state - proj_umap = self.reducer.fit_transform(global_matrix) - - min_coords = np.amin(proj_umap, axis=0) - max_coords = np.amax(proj_umap, axis=0) - logger.debug( - f"UMAP projection done. Max coords: {max_coords}. " - f"Min Coords: {min_coords}." - ) - - start = 0 - projected_matrices = [] - for shape in shapes: - matrix = np.asarray(proj_umap[start:start + shape[1]].T) - projected_matrices.append(matrix) - start += shape[1] - - return projected_matrices + # if other parameters are to be used, this should be updated + self._add_to_params(["metric", + "n_neighbors", + "min_dist", + "init", + "learning_rate", + "n_epochs", + "random_state"]) class IndirectUMAPProjection(UMAPProjection): @@ -148,226 +225,223 @@ class IndirectUMAPProjection(UMAPProjection): ALL_PARAMS = [ "n_neighbors", - "n_components", + "min_dist", "metric", + "init", "n_epochs", "learning_rate", - "init", - "min_dist", - "repulsion_strength", "random_state", - "n_jobs", - "max_size", + "max_size" ] - def __init__(self, n_neighbors=15, n_components=2, metric="euclidean", - metric_kwds=None, output_metric='euclidean', - output_metric_kwds=None, n_epochs=None, learning_rate=1.0, - init='spectral', min_dist=0.1, spread=1.0, low_memory=True, - n_jobs=-1, set_op_mix_ratio=1.0, local_connectivity=1.0, - repulsion_strength=1.0, negative_sample_rate=5, - transform_queue_size=4.0, a=None, b=None, random_state=None, - angular_rp_forest=False, target_n_neighbors=-1, - target_metric='categorical', target_metric_kwds=None, - target_weight=0.5, transform_seed=42, - transform_mode='embedding', - force_approximation_algorithm=False, verbose=False, - tqdm_kwds=None, unique=False, densmap=False, dens_lambda=2.0, - dens_frac=0.3, dens_var_shift=0.1, output_dens=False, - precomputed_knn=(None, None, None), max_size=1500000): + def __init__(self, key="indirect_umap", n_neighbors=15, min_dist=0.1, + metric="euclidean", n_epochs=None, learning_rate=1.0, + init=INIT, low_memory=True, n_jobs=-1, random_state=None, + target_metric='categorical', transform_mode='embedding', + verbose=False, max_size=1500000): super().__init__( - "indirect_umap", - n_neighbors=n_neighbors, n_components=n_components, metric=metric, - metric_kwds=metric_kwds, output_metric=output_metric, - output_metric_kwds=output_metric_kwds, n_epochs=n_epochs, - learning_rate=learning_rate, 
init=init, min_dist=min_dist, - spread=spread, low_memory=low_memory, n_jobs=n_jobs, - set_op_mix_ratio=set_op_mix_ratio, - local_connectivity=local_connectivity, - repulsion_strength=repulsion_strength, - negative_sample_rate=negative_sample_rate, - transform_queue_size=transform_queue_size, a=a, b=b, - random_state=random_state, angular_rp_forest=angular_rp_forest, - target_n_neighbors=target_n_neighbors, target_metric=target_metric, - target_metric_kwds=target_metric_kwds, target_weight=target_weight, - transform_seed=transform_seed, transform_mode=transform_mode, - force_approximation_algorithm=force_approximation_algorithm, - verbose=verbose, tqdm_kwds=tqdm_kwds, unique=unique, - densmap=densmap, dens_lambda=dens_lambda, dens_frac=dens_frac, - dens_var_shift=dens_var_shift, output_dens=output_dens, - precomputed_knn=precomputed_knn + key=key, n_neighbors=n_neighbors, min_dist=min_dist, + metric=metric, n_epochs=n_epochs, learning_rate=learning_rate, + init=init, low_memory=low_memory, n_jobs=n_jobs, + random_state=random_state, target_metric=target_metric, + transform_mode=transform_mode, verbose=verbose ) self.max_size = max_size self._add_to_params("max_size") - def execute(self, matrices, dump_dir): - logger.info('Starting UMAP projection') - + def fit_transform(self, matrices): # TODO this should use reducer of this object - umap_matrices = indirect_umap_projection( + return indirect_umap_projection( matrices, self.n_neighbors, self.min_dist, self.max_size ) - return umap_matrices - -class GuidedUMAPProjection(Projection2D): +class GuidedUMAPProjection(UMAPProjection): yaml_tag = u'!GuidedUMAPProjection' ALL_PARAMS = [ "n_neighbors", - "n_components", + "min_dist", "metric", + "init", "n_epochs", "learning_rate", - "init", - "min_dist", - "repulsion_strength", "random_state", - "n_jobs", "max_size", - "keepsize", + "keepsize" ] - def __init__(self, n_neighbors=15, n_components=2, metric="euclidean", - metric_kwds=None, output_metric='euclidean', - output_metric_kwds=None, n_epochs=None, learning_rate=1.0, - init='spectral', min_dist=0.1, spread=1.0, low_memory=True, - n_jobs=-1, set_op_mix_ratio=1.0, local_connectivity=1.0, - repulsion_strength=1.0, negative_sample_rate=5, - transform_queue_size=4.0, a=None, b=None, random_state=None, - angular_rp_forest=False, target_n_neighbors=-1, - target_metric='categorical', target_metric_kwds=None, - target_weight=0.5, transform_seed=42, - transform_mode='embedding', - force_approximation_algorithm=False, verbose=False, - tqdm_kwds=None, unique=False, densmap=False, dens_lambda=2.0, - dens_frac=0.3, dens_var_shift=0.1, output_dens=False, - precomputed_knn=(None, None, None), max_size=1500000, - keepsize=60000): - super().__init__("guided_umap") - - self.reducer = UMAP( - n_neighbors=n_neighbors, n_components=n_components, metric=metric, - metric_kwds=metric_kwds, output_metric=output_metric, - output_metric_kwds=output_metric_kwds, n_epochs=n_epochs, - learning_rate=learning_rate, init=init, min_dist=min_dist, - spread=spread, low_memory=low_memory, n_jobs=n_jobs, - set_op_mix_ratio=set_op_mix_ratio, - local_connectivity=local_connectivity, - repulsion_strength=repulsion_strength, - negative_sample_rate=negative_sample_rate, - transform_queue_size=transform_queue_size, a=a, b=b, - random_state=random_state, angular_rp_forest=angular_rp_forest, - target_n_neighbors=target_n_neighbors, target_metric=target_metric, - target_metric_kwds=target_metric_kwds, target_weight=target_weight, - transform_seed=transform_seed, 
transform_mode=transform_mode, - force_approximation_algorithm=force_approximation_algorithm, - verbose=verbose, tqdm_kwds=tqdm_kwds, unique=unique, - densmap=densmap, dens_lambda=dens_lambda, dens_frac=dens_frac, - dens_var_shift=dens_var_shift, output_dens=output_dens, - precomputed_knn=precomputed_knn + def __init__(self, key="guided_umap", n_neighbors=15, min_dist=0.1, + metric="euclidean", n_epochs=None, learning_rate=1.0, + init=INIT, low_memory=True, n_jobs=-1, random_state=None, + target_metric='categorical', transform_mode='embedding', + verbose=False, max_size=1500000, keepsize=60000): + super().__init__( + key=key, n_neighbors=n_neighbors, min_dist=min_dist, + metric=metric, n_epochs=n_epochs, learning_rate=learning_rate, + init=init, low_memory=low_memory, n_jobs=n_jobs, + random_state=random_state, target_metric=target_metric, + transform_mode=transform_mode, verbose=verbose ) + self.max_size = max_size + self.keepsize = keepsize - if isinstance(init, list): - init = "list" + self._add_to_params(["max_size", "keepsize"]) - self.n_neighbors = n_neighbors - self.min_dist = min_dist + def fit(self, matrices_nD, matrices_2D=None): + """Fits only with the first matrix of ``matrices_nD`` and + ``matrices_2D`` which is assumed to be the main entity. + + Parameters + ---------- + matrices_nD: list of numpy.ndarray + list of matrices of shape (num_dims, entity_count) + matrices_2D: list of numpy.ndarray + list of matrices of shape (2, entity_count) + """ + matrices_nD_T, _ = hstack_transpose_matrices([matrices_nD[0]]) + if matrices_2D is not None: + matrices_2D_T, _ = hstack_transpose_matrices([matrices_2D[0]]) + else: + matrices_2D_T = None + self.reducer.fit(matrices_nD_T, matrices_2D_T) + + def fit_transform(self, matrices_nD): + """Fits the model with the first matrix of the specified + ``matrices_nD`` and transforms all matrices of ``matrices_nD``. 
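+
+        The fit step only uses ``matrices_nD[0]`` (the main entity); the
+        fitted reducer is then used to transform every matrix in
+        ``matrices_nD``.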
+ + Parameters + ---------- + matrices_nD: list of numpy.ndarray + list of matrices of shape (num_dims, entity_count) to project to 2D + + Returns + ------- + list of numpy.ndarray + list of matrices of shape (2, entity_count) + """ + self.fit(matrices_nD) + return self.transform(matrices_nD) + + +class TSNEProjection(Projection2D): + + yaml_tag = u'!TSNEProjection' + + ALL_PARAMS = [ + "metric", + "perplexity", + "early_exaggeration", + "method", + "init", + "learning_rate", + "angle", + "random_state", + ] + + def __init__(self, key="tsne", metric="euclidean", perplexity=30.0, + early_exaggeration=12.0, method="barnes_hut", + learning_rate=400.0, init=INIT, n_iter=1000, angle=0.5, + n_jobs=None, random_state=None, verbose=0): + super().__init__(key) self.metric = metric + self.perplexity = perplexity + self.early_exaggeration = early_exaggeration + self.method = method self.init = init self.learning_rate = learning_rate - self.repulsion_strength = repulsion_strength - self.max_size = max_size - self.keepsize = keepsize + self.angle = angle + self.random_state = random_state - self._add_to_params(["n_neighbors", "min_dist", "metric", "init", - "learning_rate", "repulsion_strength", - "max_size", "keepsize"]) - - def execute(self, matrices, dump_dir): - matrix = matrices[0] - perm = np.random.permutation(matrix.shape[1] - self.keepsize) - fit_data = matrix[:, perm[:self.max_size]] - fit_data = np.concatenate( - (fit_data, matrix[:, -self.keepsize:]), axis=1) - - # TODO it should use the reducer of this instance - umap_matrices = guided_umap_projection( - matrices[0][:, -self.max_size:], matrices, - self.n_neighbors, self.min_dist, self.max_size - ) + self.reducer = TSNE(metric=metric, perplexity=perplexity, + early_exaggeration=early_exaggeration, + method=method, learning_rate=learning_rate, + init=init, n_iter=n_iter, angle=angle, + n_jobs=n_jobs, random_state=random_state, + verbose=verbose) - return umap_matrices + self._add_to_params(["metric", + "perplexity", + "early_exaggeration", + "method", + "learning_rate", + "init", + "angle", + "random_state"]) class AlignedUMAPProjection(Projection2D): ALL_PARAMS = [ "n_neighbors", - "n_components", + "min_dist", "metric", + "init", + "alignment_regularisation", + "alignment_window_size", "n_epochs", "learning_rate", - "init", - "min_dist", - "repulsion_strength", "random_state", - "n_jobs", ] - def __init__(self, n_neighbors=15, n_components=2, metric="euclidean", - metric_kwds=None, n_epochs=None, learning_rate=1.0, - init='spectral', min_dist=0.1, spread=1.0, low_memory=True, - alignment_regularisation=1.0e-2, alignment_window_size=3, - set_op_mix_ratio=1.0, local_connectivity=1.0, - repulsion_strength=1.0, negative_sample_rate=5, - transform_queue_size=4.0, a=None, b=None, random_state=None, - angular_rp_forest=False, target_n_neighbors=-1, - target_metric='categorical', target_metric_kwds=None, - target_weight=0.5, transform_seed=42, - force_approximation_algorithm=False, verbose=False, - unique=False): - super().__init__("aligned_umap") + def __init__(self, key="aligned_umap", n_neighbors=15, min_dist=0.1, + metric="euclidean", n_epochs=None, learning_rate=1.0, + init=INIT, low_memory=True, alignment_regularisation=1.0e-2, + alignment_window_size=3, random_state=None, + target_metric='categorical', verbose=False): + super().__init__(key) + + if isinstance(init, list): + init = "list" + self.n_neighbors = n_neighbors self.min_dist = min_dist self.metric = metric self.init = init + self.alignment_regularisation = alignment_regularisation 
+ self.alignment_window_size = alignment_window_size + self.n_epochs = n_epochs self.learning_rate = learning_rate - self.repulsion_strength = repulsion_strength + self.random_state = random_state self.reducer = AlignedUMAP( - n_neighbors=n_neighbors, n_components=n_components, metric=metric, - metric_kwds=metric_kwds, n_epochs=n_epochs, + n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, + metric=metric, n_epochs=n_epochs, learning_rate=learning_rate, + init=init, low_memory=low_memory, alignment_regularisation=alignment_regularisation, alignment_window_size=alignment_window_size, - learning_rate=learning_rate, init=init, min_dist=min_dist, - spread=spread, low_memory=low_memory, - set_op_mix_ratio=set_op_mix_ratio, - local_connectivity=local_connectivity, - repulsion_strength=repulsion_strength, - negative_sample_rate=negative_sample_rate, - transform_queue_size=transform_queue_size, a=a, b=b, - random_state=random_state, angular_rp_forest=angular_rp_forest, - target_n_neighbors=target_n_neighbors, target_metric=target_metric, - target_metric_kwds=target_metric_kwds, target_weight=target_weight, - transform_seed=transform_seed, - force_approximation_algorithm=force_approximation_algorithm, - verbose=verbose, unique=unique + random_state=random_state, target_metric=target_metric, + transform_seed=random_state, verbose=verbose ) - self._add_to_params(["n_neighbors", "min_dist", "metric", - "init", "learning_rate", "repulsion_strength"]) + self._add_to_params(["metric", + "n_neighbors", + "min_dist", + "init", + "alignment_regularisation", + "alignment_window_size", + "learning_rate", + "random_state"]) @property def is_aligned(self): return True - def load_reducer(self, reducer_dir): + def load_reducer(self, reducer_dir, suffix): + """Loads the model saved in ``reducer_dir`` with name + reducer_<suffix>.pkl + + Parameters + ---------- + reducer_dir: pathlib.Path + the directory where the model is saved + suffix: str + the model name should be of the form ``reducer_<suffix>.pkl`` + """ # From https://github.com/lmcinnes/umap/issues/672 - params_new = load(open(reducer_dir / 'reducer.pkl', 'rb')) + params_new = load(open(reducer_dir / f'reducer_{suffix}.pkl', 'rb')) # self.reducer.set_params(**params_new.get('umap_params')) for attr, value in params_new.get('umap_attributes').items(): @@ -375,7 +449,19 @@ class AlignedUMAPProjection(Projection2D): self.reducer.__setattr__('embeddings_', List( params_new.get('umap_attributes').get('embeddings_'))) - def persist_reducer(self, reducer, dump_dir): + def persist_reducer(self, reducer, reducer_dir, suffix): + """Saves the model ``reducer`` to ``reducer_dir`` with name + ``reducer_<suffix>.pkl``. + + Parameters + ---------- + reducer: + the model to be saved + reducer_dir: pathlib.Path + the directory where the model is saved + suffix: str + the model name should be of the form ``reducer_<suffix>.pkl`` + """ # From https://github.com/lmcinnes/umap/issues/672 params = reducer.get_params(deep=True) attributes_names = [attr for attr in reducer.__dir__( @@ -396,64 +482,192 @@ class AlignedUMAPProjection(Projection2D): } } - dump(all_params, open(dump_dir / 'reducer.pkl', 'wb')) + dump(all_params, open(reducer_dir / f'reducer_{suffix}.pkl', 'wb')) + + def load_execute(self, natures, dirs_mat, key_nD, dirs_nD, dirs_2D, + dir_reducer, current_version=None, prev_version=None, + dump=False, force=False, return_mat=False): + """Loads nD matrices and executes 2-dimensional projection on all + slices. 
If ``force`` is False, checks if the matrices are already + generated. Loads and returns them if ``return_mat`` is True. + + Parameters + ---------- + natures: list of str + the list of natures for the matrices to load + dirs_mat: list of str or pathlib.Path + the list of directories where entity matrices and scores are + stored for each slice. Scores for entities are necessary to create + relations between consecutive slices. + key_nD: str + the key for projection-nD + dirs_nD: list of str or pathlib.Path + the list of directories where the nD matrices are saved for each + slice + dirs_2D: list of str or pathlib.Path + the list of directories where the generated matrices should be + saved, or are saved in the case where there is already a 2D projection + executed + dir_reducer: str, pathlib.Path + the directory to save the reducer after 2D projection is executed + current_version: str + the version of the dataset processed + prev_version: str + the previous version of dataset processed with + AlignedUMAPProjection, it will be used to find and load the reducer of + the previous processing and update the reducer with the new matrices + dump: bool + boolean value to indicate if the generated matrices should be saved + force: bool + boolean value to indicate if the matrices should be regenerated or + loaded from existing. + return_mat: bool + boolean value to indicate if the projected matrices should be + returned from the method + + Returns + ------- + list of list of matrices, list of scores + a list that contains list of matrices of shape (2, entity_count) for + each slice and the scores of entities for the last slice if + ``return_mat`` is True; None otherwise + """ + exist = True + if not force: + for dir_2D in dirs_2D: + if not self.matrices_exist(natures, dir_2D): + exist = False + if exist: + logger.info( + f"{self.key} matrices already exist, will not regenerate." + ) + digest = self.digest(natures) + self.load_reducer(dir_reducer, digest) + if return_mat: + matrices_2D_AS = self.load_matrices(natures, dirs_2D) + final_scores = self.load_scores(natures, dir_reducer, + digest) + return matrices_2D_AS, final_scores + else: + return + + matrices_nD_AS = self.load_matrices(natures, dirs_nD, key=key_nD) + scores_AS = self.load_scores(natures, dirs_mat) + + return self.execute(matrices_nD_AS, scores_AS, dir_reducer, + dirs_2D, natures, current_version=current_version, + prev_version=prev_version, dump=dump) + + def execute(self, matrices_nD_AS, scores_AS, dir_reducer, dirs_2D=None, + natures=None, current_version=None, prev_version=None, + dump=False): + matrices_2D_AS = None + digest = self.digest(natures) + if prev_version is not None: + # set previous version directory to load reducer from that + # directory. 
For an update with existing + # reducer prev_version should be set to None + dir_prev = Path(str(dir_reducer).replace(current_version, + prev_version)) + scores_prev = self.load_scores(natures, dir_prev, digest) + + matrices_2D_AS = self.update( + matrices_nD_AS, scores_AS, scores_prev, dir_prev, + natures=natures + ) + else: + matrices_2D_AS = self.fit_transform(matrices_nD_AS, scores_AS) - def execute(self, matrices_all, scores_all, dump_dir, **kwargs): + self.persist_reducer(self.reducer, dir_reducer, digest) + scores_LS = scores_AS[-1] + if dump: + self.dump_matrices(natures, matrices_2D_AS, dirs_2D) + self.dump_scores(natures, scores_LS, dir_reducer, digest) - transposed_matrices = self._transpose_matrices(matrices_all) - relations = self._create_relations(scores_all) + return matrices_2D_AS, scores_LS - self.reducer.fit_transform( - transposed_matrices, relations=relations, **kwargs + def fit(self, matrices_nD, matrices_2D=None): + # cannot fit and transform separately for aligned UMAP + raise NotImplementedError( + "With AlignedUMAP, it is only possible to fit_transform" ) - decomposed_embeddings = self._decompose_and_transpose_embeddings( - self.reducer.embeddings_, scores_all + def transform(self, matrices_nD): + # cannot fit and transform separately for aligned UMAP + raise NotImplementedError( + "With AlignedUMAP, it is only possible to fit_transform" ) - self.persist_reducer(self.reducer, dump_dir) - - return decomposed_embeddings + def fit_transform(self, matrices_nD_AS, scores_AS): + transposed_matrices, row_counts_all = self._transpose_matrices( + matrices_nD_AS + ) + relations = self._create_relations(scores_AS) - def update(self, matrices, scores, scores_prev, dump_dir, reducer_dir=None, - **kwargs): + self.reducer.fit_transform( + transposed_matrices, relations=relations + ) - if reducer_dir is not None: - self.load_reducer(reducer_dir) + return self._decompose_and_transpose_embeddings( + self.reducer.embeddings_, row_counts_all + ) - transposed_matrices = self._transpose_matrices(matrices) - relations = self._create_relations([scores_prev, scores[0]]) + def update(self, matrices_nD_AS, scores_AS, scores_prev, dir_prev=None, + natures=None): - self.reducer.update( - transposed_matrices[0], relations=relations[0], **kwargs + transposed_matrices_AS, row_counts_AS = self._transpose_matrices( + matrices_nD_AS ) + scores = [scores_prev] + scores_AS + relations_AS = self._create_relations(scores) + + if dir_prev is not None: + digest = self.digest(natures) + self.load_reducer(dir_prev, digest) + + for i in range(len(transposed_matrices_AS)): + self.reducer.update( + transposed_matrices_AS[i], relations=relations_AS[i] + ) - decomposed_embeddings = self._decompose_and_transpose_embeddings( - [self.reducer.embeddings_[ - len(self.reducer.embeddings_) - 1]], scores + return self._decompose_and_transpose_embeddings( + [self.reducer.embeddings_[-1]], row_counts_AS ) - self.persist_reducer(self.reducer, dump_dir) - return decomposed_embeddings + def _transpose_matrices(self, matrices_nD_AS): + transposed_matrices_AS = [] + row_counts_AS = [] + + for matrices in matrices_nD_AS: + global_matrix, row_counts = hstack_transpose_matrices( + matrices + ) + transposed_matrices_AS.append(global_matrix) + row_counts_AS.append(row_counts) - def _transpose_matrices(self, matrices_all): - transposed_mats = [] + return transposed_matrices_AS, row_counts_AS + + def _decompose_and_transpose_embeddings(self, embeddings, row_counts_AS): + decomposed_embeddings = [] - for matrices in matrices_all: - 
transposed_mats.append(np.hstack(matrices).T) + for i in range(len(row_counts_AS)): + row_counts = row_counts_AS[i] + embedding = embeddings[i] + matrices_i = decompose_transpose_matrix(embedding, row_counts) + decomposed_embeddings.append(matrices_i) - return transposed_mats + return decomposed_embeddings - def _create_relations(self, scores_all): + def _create_relations(self, scores_AS): concatenated_scores = [] - for scores in scores_all: + for scores in scores_AS: concatenated_scores.append(pd.concat(scores)) relations = [] for i in range(len(concatenated_scores) - 1): - relation = self._make_relation( + relation = make_relation( concatenated_scores[i], concatenated_scores[i+1] ) @@ -461,117 +675,172 @@ class AlignedUMAPProjection(Projection2D): return relations - def _make_relation(self, from_df, to_df): - # create a new dataframe with index from from_df, and values as - # integers starting from 0 to the length of from_df - left = pd.DataFrame(data=np.arange( - len(from_df)), index=from_df.index) - - # create a new dataframe with index from to_df, and values as integers - # starting from 0 to the length of to_df - right = pd.DataFrame(data=np.arange(len(to_df)), index=to_df.index) - # merge left and right dataframes on the intersection of keys of both - # dataframes preserving the order of left keys - merge = pd.merge(left, right, how="inner", - left_index=True, right_index=True) +class AlignedInterpolatedProjection(AlignedUMAPProjection): - return dict(merge.values) + def load_execute(self, natures, dirs_mat, key_nD, dirs_nD, dirs_2D, + dir_reducer, current_version=None, prev_version=None, + dump=True, force=False, return_mat=False, dir_guided=None, + dir_guided_2D=None): - def _decompose_and_transpose_embeddings(self, embeddings, scores_all): - decomposed_embeddings = [] + if not force and dir_guided_2D is not None and ( + super().matrices_exist(natures, dir_guided_2D) + ): + logger.info( + f"{self.key} matrices already exist, will not regenerate." + ) + if return_mat: + return super().load_matrices(natures, dir_guided_2D) + else: + return None + + matrices_nD_AS = self.load_matrices(natures, dirs_nD, key=key_nD) + scores_AS = self.load_scores(natures, dirs_mat) + + return self.execute(matrices_nD_AS, scores_AS, dir_reducer, + dir_guided_2D, natures, dirs_2D, + current_version=current_version, + prev_version=prev_version, dump=dump) + + def _aligned_execute(self, matrices_nD_AS, scores_AS, dir_reducer, + dir_guided_2D, natures=None, dirs_2D=None, + current_version=None, prev_version=None, dump=False): + # LSAE stands for last slice, all entities + matrices_nD_LSAE = matrices_nD_AS[-1] + + # FE stands for "first entity" + natures_FE = [natures[0]] + exist = True + if not self.matrices_exist(natures_FE, dirs_2D[-1]): + exist = False + if not exist: + # ASFE stands for "all slices first entity" + scores_ASFE = [] + # gets scores for first entity of each slice + for i in range(len(scores_AS)): + scores_ASFE.append([scores_AS[i][0]]) + + matrices_nD_ASFE = [] + # get nD matrix for first entity of each slice + for i in range(len(matrices_nD_AS)): + matrices_nD_ASFE.append([matrices_nD_AS[i][0]]) + + matrices_2D_ASFE = None + digest = self.digest(natures) + if prev_version is not None: + # set previous version directory to load reducer from that + # directory. 
For an update with existing + # reducer prev_version should be set to None + dir_prev = Path(str(dir_reducer).replace(current_version, + prev_version)) + scores_prev = super().load_scores(natures_FE, dir_prev, digest) + + matrices_2D_ASFE = self.update( + matrices_nD_ASFE, scores_ASFE, scores_prev, dir_prev, + natures=natures + ) + else: + matrices_2D_ASFE = self.fit_transform( + matrices_nD_ASFE, scores_ASFE + ) + + self.persist_reducer(self.reducer, dir_reducer, digest) + if dump: + self.dump_matrices( + natures_FE, matrices_2D_ASFE, dirs_2D + ) + self.dump_scores(natures_FE, scores_AS[-1], dir_reducer, + digest) + + matrices_2D_LSFE = matrices_2D_ASFE[-1] + else: + logger.info("matrices already exist, loading from existing") + matrices_2D_LSFE = self.load_matrices(natures_FE, dirs_2D[-1]) + + return matrices_nD_LSAE, matrices_2D_LSFE + + +class AlignedGuidedUMAPProjection(AlignedInterpolatedProjection): + + def __init__(self, key="aligned_guided_umap", n_neighbors=15, min_dist=0.1, + metric="euclidean", n_epochs=None, learning_rate=1.0, + init=INIT, low_memory=True, alignment_regularisation=1.0e-2, + alignment_window_size=3, random_state=None, + target_metric='categorical', verbose=False): - for i in range(len(scores_all)): - scores = scores_all[i] - embedding = embeddings[i] - print(f"embedding {embedding.shape}") + super().__init__( + n_neighbors=n_neighbors, min_dist=min_dist, + metric=metric, n_epochs=n_epochs, learning_rate=learning_rate, + init=init, low_memory=low_memory, + alignment_regularisation=alignment_regularisation, + alignment_window_size=alignment_window_size, + random_state=random_state, target_metric=target_metric, + verbose=verbose + ) + self.key2 = key - start = 0 - end = 0 - i_matrices = [] - for j in range(len(scores)): - len_j = len(scores[j]) - end += len_j - print(f"start {start} --- end {end}") + def execute(self, matrices_nD_AS, scores_AS, dir_reducer, dir_guided_2D, + natures=None, dirs_2D=None, current_version=None, + prev_version=None, dump=False): - sub_embedding = embedding[start:end] - print(sub_embedding.shape) - i_matrices.append(sub_embedding.T) + matrices_nD_LSAE, matrices_2D_LSFE = self._aligned_execute( + matrices_nD_AS, scores_AS, dir_reducer, dir_guided_2D, + natures=natures, dirs_2D=dirs_2D, current_version=current_version, + prev_version=prev_version, dump=dump + ) + init, _ = hstack_transpose_matrices(matrices_2D_LSFE) + # ------------------------- + # use 2D projection array of last slice for first + # entity as initial points + projection_2D = GuidedUMAPProjection( + n_neighbors=15, min_dist=0.1, init=init + ) - start = end - decomposed_embeddings.append(i_matrices) + # projection_2D.fit([matrices_nD_LSAE[0]], matrices_2D_LSFE) + # do projection 2D for all nD matrices of the last slice + matrices_2D = projection_2D.fit_transform(matrices_nD_LSAE) + if dump: + self.dump_matrices(natures, matrices_2D, dir_guided_2D, + key=self.key2) - return decomposed_embeddings + return matrices_2D -class TSNEProjection(Projection2D): +class AlignedKNeighborsProjection(AlignedInterpolatedProjection): - yaml_tag = u'!TSNEProjection' + def __init__(self, key="aligned_kneighbors", n_neighbors=15, min_dist=0.1, + metric="euclidean", n_epochs=None, learning_rate=1.0, + init=INIT, low_memory=True, alignment_regularisation=1.0e-2, + alignment_window_size=3, random_state=None, + target_metric='categorical', verbose=False): - ALL_PARAMS = [ - "n_components", - "perplexity", - "early_exaggeration", - "metric", - "method", - "learning_rate", - "init", - "angle", - 
"random_state", - ] - - def __init__(self, n_components=2, perplexity=30.0, - early_exaggeration=12.0, learning_rate=400.0, - n_iter=1000, n_iter_without_progress=300, - min_grad_norm=1e-07, metric="euclidean", metric_params=None, - init="random", verbose=0, random_state=None, - method="barnes_hut", angle=0.5, n_jobs=None): - super().__init__("tsne") - self.perplexity = perplexity - self.early_exaggeration = early_exaggeration - self.learning_rate = learning_rate - self.metric = metric - self.angle = angle - self.method = method - - self.reducer = TSNE(n_components=n_components, perplexity=perplexity, - early_exaggeration=early_exaggeration, - learning_rate=learning_rate, n_iter=n_iter, - n_iter_without_progress=n_iter_without_progress, - min_grad_norm=min_grad_norm, metric=metric, - metric_params=metric_params, init=init, - verbose=verbose, random_state=random_state, - method=method, angle=angle) - - self._add_to_params(["perplexity", "early_exaggeration", - "learning_rate", "metric", "angle", "method"]) - - def execute(self, matrices, dump_dir): - """Apply the TSNE fit_transform algorithm on all the matrices, reducing - the number of features of each matrix to 2. - :param matrices: - :param dump_dir: - :return: - """ - logger.info('Starting TSNE projection') - - shapes = [matrix.shape for matrix in matrices] - global_matrix = np.asarray(np.hstack(matrices).T) + super().__init__( + n_neighbors=n_neighbors, min_dist=min_dist, + metric=metric, n_epochs=n_epochs, learning_rate=learning_rate, + init=init, low_memory=low_memory, + alignment_regularisation=alignment_regularisation, + alignment_window_size=alignment_window_size, + random_state=random_state, target_metric=target_metric, + verbose=verbose + ) + self.key2 = key - proj_tsne = self.reducer.fit_transform(global_matrix) + def execute(self, matrices_nD_AS, scores_AS, dir_reducer, dir_guided_2D, + natures=None, dirs_2D=None, current_version=None, + prev_version=None, dump=False): - min_coords = np.amin(proj_tsne, axis=0) - max_coords = np.amax(proj_tsne, axis=0) - logger.debug( - f"TSNE projection done. Max coords: {max_coords}. " - f"Min Coords: {min_coords}." 
+ matrices_nD_LSAE, matrices_2D_LSFE = self._aligned_execute( + matrices_nD_AS, scores_AS, dir_reducer, dir_guided_2D, + natures=natures, dirs_2D=dirs_2D, current_version=current_version, + prev_version=prev_version, dump=dump ) - start = 0 - projected_matrices = [] - for shape in shapes: - matrix = np.asarray(proj_tsne[start:start + shape[1]].T) - projected_matrices.append(matrix) - start += shape[1] + interpolator = KNeighborsInterpolator() + interpolator.fit([matrices_nD_LSAE[0]], matrices_2D_LSFE) + matrices_2D = interpolator.predict(matrices_nD_LSAE) + + if dump: + self.dump_matrices(natures, matrices_2D, dir_guided_2D, + key=self.key2) - return projected_matrices + return matrices_2D diff --git a/cartodata/pipeline/projectionnd.py b/cartodata/pipeline/projectionnd.py index 8b2e0cf240677f9b286560d5d1ec5d581b15f9e3..193148a11698d2e708049df9e4467d810086cad2 100644 --- a/cartodata/pipeline/projectionnd.py +++ b/cartodata/pipeline/projectionnd.py @@ -1,7 +1,6 @@ import logging -from cartodata.operations import normalize_l2 -from cartodata.pipeline.base import BaseEstimator +from cartodata.pipeline.base import BaseProjection from cartodata.projection import ( lsa_projection, lda_projection, doc2vec_projection, bert_projection ) @@ -20,7 +19,7 @@ def get_executor_nD(key): return BertProjection -class ProjectionND(BaseEstimator): +class ProjectionND(BaseProjection): """Base class for n-dimensional projection. N-dimensional projection is used to create topic models for the specified @@ -65,25 +64,83 @@ class ProjectionND(BaseEstimator): self._add_to_params(["num_dims", "normalize"]) - def execute(self, matrices, dataset, dump_dir): + def load_execute(self, dataset, dir_mat, dir_nD, dump=True, + force=False, return_mat=False): + """Loads the entity matrices and executes n-dimensional projection. + If ``force`` is False, checks if the matrices are already + generated. Loads and returns them if ``return_mat`` is True. + + Parameters + ---------- + dataset: cartodata.pipeline.Dataset + the dataset object from which the entity matrices are generated + dir_mat: str, pathlib.Path + the path where the entity matrices are saved + dir_nD: str, pathlib.Path + the path where the generated matrices should be saved + dump: bool + boolean value to indicate if the generated matrices should be saved + under ``dir_nD`` + force: bool + boolean value to indicate if the matrices should be regenerated or + loaded from existing. + return_mat: bool + boolean value to indicate if the projected matrices should be + returned from the method + + Returns + ------- + list of matrices of shape (num_dims, entity_count) if ``return_mat`` is + True; None otherwise + + """ + natures = dataset.natures + + # if not force to regenerate matrices and the matrices already exist + if not force and self.matrices_exist(natures, dir_nD): + logger.info( + f"{self.key} matrices already exist, will not regenerate." + ) + if return_mat: + return self.load_matrices(natures, dir_nD) + else: + return None + + # if the matrices do not already exist or force=True + matrices = dataset.load_matrices(natures, dir_mat) + + return self.execute(matrices, dataset, dir_nD, dump) + + def execute(self, matrices, dataset, dir_nD=None, dump=False): """Executes n-dimensional projection on the specified matrices. Parameters ---------- matrices: list - List of entity matrices - dataset: cartodata.pipeline.datasets.Dataset - The dataset object that contains the data from which the matrices - are generated. 
- dump_dir: pathlib.Path - The dumps directory where the dumps of matrices should be saved. + list of entity matrices + dataset: cartodata.pipeline.Dataset + the dataset object from which the entity matrices are generated + dir_nD: str, pathlib.Path + the path where the generated nD-matrices should be saved + dump: bool + boolean value to indicate if the generated matrices should be saved + under ``dir_nD`` Returns ------- list of matrices of shape (num_dims, entity_count) """ - raise NotImplementedError() + logger.info(f"Starting {self.key} projection...") + + matrices_nD = self._execute(matrices, dataset, dir_nD) + + if dump: + self.dump_matrices(dataset.natures, matrices_nD, dir_nD) + + logger.info(f'Finished {self.key} projection') + + return matrices_nD class LSAProjection(ProjectionND): @@ -106,9 +163,12 @@ class LSAProjection(ProjectionND): Methods ------- - execute(matrices, dataset, dump_dir) - Executes n-dimensional projection on the specified matrices. + load_execute(self, dataset, dir_mat, dir_nD, dump, force, return_mat) + Loads the entity matrices and executes n-dimensional projection on the + entity matrices. + execute(matrices, dataset, dir_nD, dump) + Executes n-dimensional projection on the specified matrices. """ yaml_tag = u'!LSAProjection' @@ -141,26 +201,16 @@ class LSAProjection(ProjectionND): self.randomized_svd_n_iter = randomized_svd_n_iter self.max_size = max_size self.random_state = random_state - # self._add_to_params([("max_size", max_size), - # ("randomized_svd_n_iter", - # randomized_svd_n_iter)]) - def execute(self, matrices, dataset, dump_dir=None): - logger.info('Starting LSA projection') + def _execute(self, matrices, dataset, dir_nD=None): words_index = dataset.corpus_index - - lsa_matrices = lsa_projection( + return lsa_projection( self.num_dims, matrices[words_index], matrices, randomized_svd_n_iter=self.randomized_svd_n_iter, - max_size=self.max_size, random_state=self.random_state, - dump_dir=dump_dir + max_size=self.max_size, normalize=self.normalize, + random_state=self.random_state, dump_dir=dir_nD ) - if self.normalize: - lsa_matrices = list(map(normalize_l2, lsa_matrices)) - - return lsa_matrices - class LDAProjection(ProjectionND): """An n-dimensional projection class that uses Latent Dirichlet Allocation @@ -187,9 +237,12 @@ class LDAProjection(ProjectionND): Methods ------- - execute(matrices, dataset, dump_dir) - Executes n-dimensional projection on the specified matrices. + load_execute(self, dataset, dir_mat, dir_nD, dump, force, return_mat) + Loads the entity matrices and executes n-dimensional projection on the + entity matrices. + execute(matrices, dataset, dir_nD, dump) + Executes n-dimensional projection on the specified matrices. 
""" yaml_tag = u'!LDAProjection' @@ -225,17 +278,13 @@ class LDAProjection(ProjectionND): self._add_to_params(["update_every", "passes"]) - def execute(self, matrices, dataset, dump_dir): - logger.info('Starting LDA projection') + def _execute(self, matrices, dataset, dir_nD=None): words_index = dataset.corpus_index - - lda_matrices = lda_projection(self.num_dims, words_index, matrices, - update_every=self.update_every, - passes=self.passes) - - if self.normalize: - lda_matrices = list(map(normalize_l2, lda_matrices)) - return lda_matrices + return lda_projection( + self.num_dims, words_index, matrices, + update_every=self.update_every, passes=self.passes, + normalize=self.normalize + ) class Doc2VecProjection(ProjectionND): @@ -318,9 +367,12 @@ class Doc2VecProjection(ProjectionND): Methods ------- - execute(matrices, dataset, dump_dir) - Executes n-dimensional projection on the specified matrices. + load_execute(self, dataset, dir_mat, dir_nD, dump, force, return_mat) + Loads the entity matrices and executes n-dimensional projection on the + entity matrices. + execute(matrices, dataset, dir_nD, dump) + Executes n-dimensional projection on the specified matrices. """ yaml_tag = u'!Doc2vecProjection' @@ -437,26 +489,21 @@ class Doc2VecProjection(ProjectionND): # TODO complete this # self._add_to_params([("epochs", epochs)]) - def execute(self, matrices, dataset, dump_dir): - logger.info('Starting Doc2Vec projection') + def _execute(self, matrices, dataset, dir_nD=None): words_index = dataset.corpus_index vocab_df = dataset.corpus - doc2vec_matrices = doc2vec_projection( + return doc2vec_projection( self.num_dims, words_index, matrices, vocab_df, workers=self.workers, epochs=self.epochs, callbacks=(), batch_words=self.batch_words, trim_rule=self.trim_rule, alpha=self.alpha, window=self.window, seed=self.seed, hs=self.hs, min_alpha=self.min_alpha, compute_loss=self.compute_loss, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, - dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count + dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, + normalize=self.normalize ) - if self.normalize: - doc2vec_matrices = list(map(normalize_l2, doc2vec_matrices)) - - return doc2vec_matrices - class BertProjection(ProjectionND): @@ -477,10 +524,10 @@ class BertProjection(ProjectionND): self._add_to_params(["family", "max_length"]) - def execute(self, matrices, dataset, dump_dir): - logger.info('Starting Bert projection') + def _execute(self, matrices, dataset, dir_nD=None): corpus = dataset.corpus - return bert_projection(matrices, corpus, self.family, self.normalize, - self.pt_device, self.batch_size, - self.max_length) + return bert_projection( + matrices, corpus, self.family, self.normalize, self.pt_device, + self.batch_size, self.max_length + ) diff --git a/cartodata/pipeline/tests/test_dataset.py b/cartodata/pipeline/tests/test_dataset.py index 55d1c07b2a8c102aab26fb0bf2c0c1695d2b67da..f704177f8e90fdd3077957512e756b2bbf7cfc2a 100644 --- a/cartodata/pipeline/tests/test_dataset.py +++ b/cartodata/pipeline/tests/test_dataset.py @@ -352,9 +352,7 @@ class TestCSVDataset(TestDataset): dataset = CSVDataset(DATASET_NAME, VERSION, top_dir=TOP_DIR, filename="filename") - with pytest.raises(AssertionError, - match="The file does not exist locally, please " - "specify fileurl."): + with pytest.raises(AssertionError): dataset.df def test_df(self): diff --git a/cartodata/pipeline/tests/test_projection2d.py b/cartodata/pipeline/tests/test_projection2d.py new file mode 100644 index 
0000000000000000000000000000000000000000..112333aacd4e76969c62118d4feb2983e8552635 --- /dev/null +++ b/cartodata/pipeline/tests/test_projection2d.py @@ -0,0 +1,250 @@ +import pandas as pd +from pathlib import Path +from unittest import TestCase + +from cartodata.loading import ( + load_identity_column, load_comma_separated_column, load_text_column +) + +from cartodata.pipeline.projection2d import ( + UMAPProjection, IndirectUMAPProjection, GuidedUMAPProjection, + TSNEProjection, AlignedUMAPProjection, AlignedGuidedUMAPProjection, + AlignedKNeighborsProjection +) + + +class BaseProjection(): + + def setUp(self): + df = pd.DataFrame( + {"authFullName_s": [ + "Philippe Caillou,Samir Aknine,Suzanne Pinson, Samuel Thiriot", + "Bruno Cessac,Hélène Paugam-Moisy,Thierry Viéville", + "Zach Lewkovicz,Philippe Caillou,Jean-Daniel Kant", + "Zach Lewkovicz,Samuel Thiriot" + ] + } + ) + + self.authors_tab, self.authors_scores = load_comma_separated_column( + df, "authFullName_s") + + df = pd.DataFrame( + {"text": [ + ("Many empirical studies emphasize the role of social " + "networks in job search. The social network implicated " + "in this process"), + ("In this short note, we investigate. Responsibility of " + "Pseudoknotted"), + "Exactly Solvable Stochastic Processes for Traffic Modelling", + ("In the ground case, the procedure terminates and provides a " + "decision algorithm for the word problem.")]}) + + self.df = df + self.words_tab, self.words_scores = load_text_column( + df['text'], 4, 1, 1) + + df = pd.DataFrame( + {"en_title_s": [ + "Multi-prover verification of floating-point programs", + "Hardware-independent proofs of numerical programs", + "Viewing a World of Annotations through AnnoVIP", + "Combinatorial identification problems and graph powers"] + } + ) + + self.articles_tab, self.articles_scores = load_identity_column( + df, "en_title_s") + + self.natures = ["articles", "authors", "words"] + self.matrices = [self.articles_tab.toarray(), + self.authors_tab.toarray(), + self.words_tab.toarray()] + + def test_execute(self): + projected_matrices = self.projection.execute( + self.matrices, natures=self.natures + ) + + self.assertEqual(len(projected_matrices), 3) + self.assertEqual( + projected_matrices[0].shape, (2, len(self.articles_scores))) + self.assertEqual( + projected_matrices[1].shape, (2, len(self.authors_scores))) + self.assertEqual( + projected_matrices[2].shape, (2, len(self.words_scores))) + + +class TestUMAPProjection(BaseProjection, TestCase): + + def setUp(self): + super().setUp() + self.projection = UMAPProjection(random_state=42) + + +class TestIndirectUMAPProjection(BaseProjection, TestCase): + + def setUp(self): + super().setUp() + self.projection = IndirectUMAPProjection(random_state=42) + + +class TestGuidedUMAPProjection(BaseProjection, TestCase): + + def setUp(self): + super().setUp() + self.projection = GuidedUMAPProjection(random_state=42) + + +class TestTSNEProjection(BaseProjection, TestCase): + + def setUp(self): + super().setUp() + self.projection = TSNEProjection(random_state=42) + + +class BaseAlignedProjection(): + + def setUp(self, i): + df = pd.DataFrame( + {"authFullName_s": [ + "Philippe Caillou,Samir Aknine,Suzanne Pinson, Samuel Thiriot", + "Bruno Cessac,Hélène Paugam-Moisy,Thierry Viéville", + "Zach Lewkovicz,Philippe Caillou,Jean-Daniel Kant", + "Zach Lewkovicz,Samuel Thiriot", + "Hélène Paugam-Moisy,Samuel Thiriot" + ] + } + ) + + self.authors_tab1, self.authors_scores1 = load_comma_separated_column( + df.loc[:i], "authFullName_s") + 
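+        # the *_tab2/*_scores2 pairs below are built from the full dataframe
+        # and play the role of the second, larger slice (the first slice is a
+        # prefix of it)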
self.authors_tab2, self.authors_scores2 = load_comma_separated_column( + df, "authFullName_s") + + df = pd.DataFrame( + {"text": [ + ("Many empirical studies emphasize the role of social " + "networks in job search. The social network implicated " + "in this process"), + ("In this short note, we investigate. Responsibility of " + "Pseudoknotted"), + ("AD algorithms score the data instances and produce a " + "ranked list of candidate anomalies."), + "Exactly Solvable Stochastic Processes for Traffic Modelling", + ("In the ground case, the procedure terminates and provides a " + "decision algorithm for the word problem.")]}) + + self.df = df + self.words_tab1, self.words_scores1 = load_text_column( + df.loc[:i]['text'], 4, 1, 1) + self.words_tab2, self.words_scores2 = load_text_column( + df['text'], 4, 1, 1) + + df = pd.DataFrame( + {"en_title_s": [ + "Multi-prover verification of floating-point programs", + "Hardware-independent proofs of numerical programs", + "Viewing a World of Annotations through AnnoVIP", + "Effectiveness of Tree-based Ensembles for Anomaly " + "Discovery: Insights, Batch and Streaming Active Learning", + "Combinatorial identification problems and graph powers"] + } + ) + + self.articles_tab1, self.articles_scores1 = load_identity_column( + df.loc[:i], "en_title_s") + self.articles_tab2, self.articles_scores2 = load_identity_column( + df, "en_title_s") + + self.natures = ["articles", "authors", "words"] + self.matrices_all = [[self.articles_tab1.toarray(), + self.authors_tab1.toarray(), + self.words_tab1.toarray()], + [self.articles_tab2.toarray(), + self.authors_tab2.toarray(), + self.words_tab2.toarray()]] + self.scores_all = [[self.articles_scores1, + self.authors_scores1, + self.words_scores1], + [self.articles_scores2, + self.authors_scores2, + self.words_scores2]] + + +class TestAlignedUmapProjection(BaseAlignedProjection, TestCase): + + def setUp(self): + super().setUp(1) + self.projection = AlignedUMAPProjection(random_state=42) + + def test_execute(self): + tmp_dir = Path(__file__).parent + dirs_2D = [tmp_dir, tmp_dir] + projected_matrices_all, scores = self.projection.execute( + self.matrices_all, self.scores_all, dir_reducer=tmp_dir, + dirs_2D=dirs_2D, natures=self.natures + ) + + self.assertEqual(len(projected_matrices_all), 2) + + projected_matrices1 = projected_matrices_all[0] + self.assertEqual(len(projected_matrices1), 3) + self.assertEqual( + projected_matrices1[0].shape, (2, len(self.articles_scores1))) + self.assertEqual( + projected_matrices1[1].shape, (2, len(self.authors_scores1))) + self.assertEqual( + projected_matrices1[2].shape, (2, len(self.words_scores1))) + + projected_matrices2 = projected_matrices_all[1] + self.assertEqual(len(projected_matrices2), 3) + self.assertEqual( + projected_matrices2[0].shape, (2, len(self.articles_scores2))) + self.assertEqual( + projected_matrices2[1].shape, (2, len(self.authors_scores2))) + self.assertEqual( + projected_matrices2[2].shape, (2, len(self.words_scores2))) + + self.assertEqual(len(scores), 3) + self.assertEqual(len(scores[0]), len(self.articles_scores2)) + self.assertEqual(len(scores[1]), len(self.authors_scores2)) + self.assertEqual(len(scores[2]), len(self.words_scores2)) + + +class BaseAlignedInterpolatedProjection(BaseAlignedProjection): + + def setUp(self): + super().setUp(4) + + def test_execute(self): + tmp_dir = Path(__file__).parent + dirs_2D = [tmp_dir, tmp_dir] + projected_matrices = self.projection.execute( + self.matrices_all, self.scores_all, dir_reducer=tmp_dir, + 
dir_guided_2D=tmp_dir, natures=self.natures, dirs_2D=dirs_2D + ) + + self.assertEqual(len(projected_matrices), 3) + self.assertEqual( + projected_matrices[0].shape, (2, len(self.articles_scores2))) + self.assertEqual( + projected_matrices[1].shape, (2, len(self.authors_scores2))) + self.assertEqual( + projected_matrices[2].shape, (2, len(self.words_scores2))) + + +class TestAlignedGuidedUMAPProjection(BaseAlignedInterpolatedProjection, + TestCase): + + def setUp(self): + super().setUp() + self.projection = AlignedGuidedUMAPProjection(random_state=42) + + +class TestAlignedKNeighborsProjection(BaseAlignedInterpolatedProjection, + TestCase): + + def setUp(self): + super().setUp() + self.projection = AlignedKNeighborsProjection(random_state=42) diff --git a/cartodata/pipeline/tests/test_projectionnd.py b/cartodata/pipeline/tests/test_projectionnd.py index afecbc49d2310f3879f933c2a3c31c3adaad646f..63e53e9d82249223b55890b0a52af8f8dcdbf03f 100644 --- a/cartodata/pipeline/tests/test_projectionnd.py +++ b/cartodata/pipeline/tests/test_projectionnd.py @@ -1,3 +1,4 @@ +import pytest from unittest import TestCase from unittest.mock import patch @@ -6,11 +7,10 @@ import pandas as pd from cartodata.loading import ( load_comma_separated_column, load_text_column, load_identity_column ) - - from cartodata.pipeline.projectionnd import ( LSAProjection, LDAProjection, Doc2VecProjection ) +from cartodata.tests.conftest import DATASET_DIR class TestProjectionND(TestCase): @@ -74,9 +74,12 @@ class TestProjectionLSA(TestProjectionND): """Tests LSAProjection.execute method.""" matrices = [self.articles_tab, self.authors_tab, self.words_tab] - dataset.corpus_index = 2 - projected = self.proj.execute(matrices, dataset, "dumps") + dataset.corpus_index = 2 + dataset.natures = ["articles", "authors", "words"] + projected = self.proj.execute( + matrices, dataset, dir_nD="dumps" + ) self.assertEqual(len(projected), 3) self.assertEqual(projected[0].shape, @@ -85,6 +88,48 @@ class TestProjectionLSA(TestProjectionND): (2, len(self.authors_scores))) self.assertEqual(projected[2].shape, (2, len(self.words_scores))) + def test_matrices_exist(self): + natures = ["articles", "authors", "labs", "teams"] + assert self.proj.matrices_exist(natures, DATASET_DIR) + + natures_non_existing = natures + ["not_existing"] + assert not self.proj.matrices_exist(natures_non_existing, DATASET_DIR) + + assert not self.proj.matrices_exist(natures, "trial") + + def test_matrices_exist_error(self): + natures = ["articles", "authors", "labs", "teams"] + + with pytest.raises(ValueError): + self.proj.matrices_exist(None, DATASET_DIR) + + with pytest.raises(ValueError): + self.proj.matrices_exist([], DATASET_DIR) + + with pytest.raises(TypeError): + self.proj.matrices_exist(natures, None) + + def test_load_matrices(self): + natures = ["articles", "authors", "labs", "teams"] + matrices = self.proj.load_matrices(natures, DATASET_DIR) + + assert len(natures) == len(matrices) + + def test_load_matrices_error(self): + natures = ["articles", "authors", "labs", "teams"] + + with pytest.raises(ValueError): + self.proj.load_matrices(None, DATASET_DIR) + + with pytest.raises(ValueError): + self.proj.load_matrices([], DATASET_DIR) + + with pytest.raises(FileNotFoundError): + self.proj.load_matrices(natures, "trial") + + def test_dump_matrices(self): + pass + class TestProjectionLDA(TestProjectionND): @@ -108,6 +153,7 @@ class TestProjectionLDA(TestProjectionND): matrices = [self.articles_tab, self.authors_tab, self.words_tab] dataset.corpus_index = 2 + 
dataset.natures = ["articles", "authors", "words"] projected = self.proj.execute(matrices, dataset, "dumps") @@ -156,6 +202,7 @@ class TestProjectionDoc2Vec(TestProjectionND): matrices = [self.articles_tab, self.authors_tab, self.words_tab] + dataset.natures = ["articles", "authors", "words"] dataset.corpus = self.df[["auth", "text", "title"]].apply( lambda row: ' . '.join(row.values.astype(str)), axis=1) dataset.corpus_index = 2 diff --git a/cartodata/plotting.py b/cartodata/plotting.py index d60437c5bbddf2d7be56d4158b8aa29d8bab32e9..780e9764c00e415a213b218cb95c5937065bbbd0 100644 --- a/cartodata/plotting.py +++ b/cartodata/plotting.py @@ -1,13 +1,18 @@ import matplotlib.pyplot as plt import matplotlib.patheffects as pe +# COLORS = ['b', 'r', 'c', 'y', 'm', 'g'] +COLORS = ['blue', 'red', 'magenta', 'gold', + 'purple', 'slateblue', "orangered", "olive", "grey"] + def plot_map(matrices, labels, colors=None, title=None, annotations=None, annotation_mat=None, annotation_color='black'): if colors is None: - from matplotlib.colors import TABLEAU_COLORS - colors = list(TABLEAU_COLORS)[:len(matrices)] + # from matplotlib.colors import TABLEAU_COLORS + # colors = list(TABLEAU_COLORS)[:len(matrices)] + colors = list(COLORS)[:len(matrices)] fig, ax = plt.subplots(figsize=(12, 8), dpi=80) @@ -17,7 +22,7 @@ def plot_map(matrices, labels, colors=None, title=None, annotations=None, axes.append(ax.scatter(m[0, :], m[1, :], color=colors[i], s=10, - alpha=0.25, + alpha=0.35, label=labels[i])) # set title diff --git a/cartodata/projection.py b/cartodata/projection.py index 1ae9f225ad62f29edca50d5b61b744b3633e14dc..6229aa16d27ef6554c243de00c7e01e396ca8136 100644 --- a/cartodata/projection.py +++ b/cartodata/projection.py @@ -271,13 +271,16 @@ def matrix_dot(U, matrix, max_size=1000): @timeit def lsa_projection(nb_dims, canon_table, matrices, randomized_svd_n_iter=5, - max_size=1000, random_state=None, dump_dir=None): + max_size=1000, normalize=True, random_state=None, + dump_dir=None): U, sigma, VT = randomized_svd(canon_table, n_components=nb_dims, n_iter=randomized_svd_n_iter, random_state=random_state) if dump_dir is not None: + if not os.path.exists(dump_dir): + os.makedirs(dump_dir) save_matrix(VT, os.path.join(dump_dir, 'lsa_components')) projected_matrices = [] @@ -290,12 +293,15 @@ def lsa_projection(nb_dims, canon_table, matrices, randomized_svd_n_iter=5, proj = matrix_dot(U, matrix, max_size=max_size) projected_matrices.append(proj) + if normalize: + projected_matrices = list(map(normalize_l2, projected_matrices)) + return projected_matrices @timeit def lda_projection(nb_dims, canon_table_idx, matrices, update_every=0, - passes=20): + passes=20, normalize=True): corpus = Sparse2Corpus( sparse=matrices[canon_table_idx], documents_columns=False @@ -314,6 +320,9 @@ def lda_projection(nb_dims, canon_table_idx, matrices, update_every=0, proj = matrix_dot(np.transpose(U), matrix) projected_matrices.append(proj) + if normalize: + projected_matrices = list(map(normalize_l2, projected_matrices)) + return projected_matrices @@ -323,7 +332,8 @@ def doc2vec_projection(nb_dims, canon_table_idx, matrices, vocab_df, workers=3, trim_rule=None, alpha=0.025, window=5, seed=1, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, dm_mean=None, - dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1): + dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, + normalize=True): """ Apply the `doc2vec_projection` method. 
Takes at least four arguments: :param nb_dims: the number of dimensions wanted @@ -366,6 +376,9 @@ def doc2vec_projection(nb_dims, canon_table_idx, matrices, vocab_df, workers=3, except TypeError: projected_matrices.append(U) + if normalize: + projected_matrices = list(map(normalize_l2, projected_matrices)) + return projected_matrices @@ -422,9 +435,9 @@ def umap_projection( @timeit def guided_umap_projection(projected_matrice, matrices, n_neighbors=15, - min_dist=0.1, max_size=2000000): + min_dist=0.1, max_size=2000000, model=None): """ - Apply the UMAP fit algorithm to the fprojected_matrice. Then transform + Apply the UMAP fit algorithm to the projected_matrice. Then transform all the matrices using the fitted model to extrapolate the vectors in 2 dimension. @@ -436,7 +449,7 @@ def guided_umap_projection(projected_matrice, matrices, n_neighbors=15, :return: """ model = fit_transform_guided(projected_matrice, n_neighbors, min_dist, - max_size) + max_size, model=model) logger.debug( f"UMAP: Reference matrix projected to size {projected_matrice.shape}" @@ -463,7 +476,8 @@ def guided_umap_projection(projected_matrice, matrices, n_neighbors=15, return projected_matrices -def fit_transform_guided(projected_matrice, n_neighbors, min_dist, max_size): +def fit_transform_guided(projected_matrice, n_neighbors, min_dist, max_size, + model=None): """ Fit a UMAP model with a random sample of max_size documents from the matrix. Then use this model to transform the remaining documents, @@ -482,7 +496,9 @@ def fit_transform_guided(projected_matrice, n_neighbors, min_dist, max_size): "UMAP: Fitting model with sample matrix of size " f"{projected_matrice.shape}" ) - model = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, init='random') + if model is None: + model = UMAP(n_neighbors=n_neighbors, min_dist=min_dist, init='random') + model.fit(projected_matrice.T) # Model's embedding is equal to transformed data for the train data diff --git a/cartodata/utils.py b/cartodata/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..56db1eb6f93ba5d38077cc71358cfd0f02e5b8d2 --- /dev/null +++ b/cartodata/utils.py @@ -0,0 +1,119 @@ +from hashlib import blake2b +import numpy as np +import pandas as pd + + +def digest(data, digest_size=8): + h = blake2b(digest_size=digest_size) + h.update(data.encode()) + return h.hexdigest() + + +def hstack_transpose_matrices(matrices): + """Horizontally stacks each matrix in the matrices list and gets transpose + of the array. Returns also column count of each matrix in a list. + + Parameters + ---------- + matrices: list of numpy.ndarray + a list of arrays that have same row counts, but may differ in column + counts + + Returns: + numpy.ndarray, list of int + + transposed array and list of integers that contains row count for each + matrix in the matrices to be used to decompose the transposed matrix + """ + global_matrix = np.asarray(np.hstack(matrices).T) + row_counts = [matrix.shape[1] for matrix in matrices] + + return global_matrix, row_counts + + +def decompose_transpose_matrix(global_matrix, row_counts): + """ Decomposes the ``global_matrix`` according to the specifed + ``row_counts`` and transposes the obtained matrix. 
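+
+    This is the inverse of ``hstack_transpose_matrices``: for example, two
+    matrices of shape (2, 3) and (2, 2) stacked into a (5, 2) global matrix
+    with ``row_counts=[3, 2]`` are recovered as matrices of shape (2, 3) and
+    (2, 2).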
+ + Parameters + ---------- + global_matrix: numpy.ndarray + matrix that contains multiple arrays + row_counts: list + list of integers that contains the number of rows in each array in the + ``global_matrix`` + + Returns + ------- + list of numpy.ndarray + """ + start = 0 + matrices = [] + for row_count in row_counts: + matrix = np.asarray(global_matrix[start:start + row_count].T) + matrices.append(matrix) + start += row_count + + return matrices + + +def make_relation(from_df, to_df): + """Creates a mapping between ``from_df`` to ``to_df`` for common indices in + both dataframes. + + For each dataframe a new dataframe is generated with the index of the + dataframe and values as integers starting from 0 to number of elements + in the dataframe. The mapping is created from the integer value of each + row in ``from_df`` to the integer value of corresponding row in ``to_df`` + for common indexes in dataframes. + + Parameters + ---------- + from_df: pandas.DataFrame + the dataframe whose integer values are used as key for the mapping + to_df: pandas.DataFrame + the dataframe whose integer values are used as value for mapping + + Returns + ------- + dict + a mapping from ``from_df`` to ``to_df`` + """ + # create a new dataframe with index from from_df, and values as + # integers starting from 0 to the length of from_df + left = pd.DataFrame(data=np.arange(len(from_df)), index=from_df.index) + + # create a new dataframe with index from to_df, and values as integers + # starting from 0 to the length of to_df + right = pd.DataFrame(data=np.arange(len(to_df)), index=to_df.index) + + # merge left and right dataframes on the intersection of keys of both + # dataframes preserving the order of left keys + merge = left.merge(right, how="inner", + left_index=True, right_index=True) + + return dict(merge.values) + + +def get_sub_matrices(scores_s1, scores_s2, matrices, values=True): + """ + Creates a dictionary of mapping of the common entities in scores_s1 and + scores_s2. The keys the dictionary are the indices of the entities from + scores_s1 and the values are the indices of the entities from scores_s2. + + Using this dictionary, it takes the rows corresponding to the entities + specified by the values in the dictionary from the matrices and returns the + list of matrices. 
+ """ + matrices_sub = [] + for i in range(len(matrices)): + relations_i = make_relation(scores_s1[i], scores_s2[i]) + if values: + indices_i = list(relations_i.values()) + else: + indices_i = list(relations_i.keys()) + matrix_i = matrices[i] + matrix_s2_sub_s1 = np.take(matrix_i, indices_i, axis=1) + matrices_sub.append(matrix_s2_sub_s1) + + return matrices_sub diff --git a/conf/lisn/aligned/pipeline.yaml b/conf/lisn/aligned/pipeline.yaml index 8bdb2e7c86415da8fbf5d2a1fac593c7c5783586..841b6079aa4264e0b2a8def34f8915b8af2af71f 100644 --- a/conf/lisn/aligned/pipeline.yaml +++ b/conf/lisn/aligned/pipeline.yaml @@ -3,10 +3,9 @@ top_dir: dumps projection_nd: !LSAProjection num_dims: 100 projection_2d: !AlignedUMAPProjection - n_neighbors: [20,20] - min_dist: [0.1,0.1] + n_neighbors: 20 + min_dist: 0.1 init: random - random_state: 42 n_epochs: 500 clustering: !KMeansClustering n: 8 diff --git a/examples/pipeline_guided_aligned_lisn_lsa_kmeans.py b/examples/pipeline_guided_aligned_lisn_lsa_kmeans.py new file mode 100644 index 0000000000000000000000000000000000000000..96df32c20122d06929e4fdb12a48b756b20bc0b7 --- /dev/null +++ b/examples/pipeline_guided_aligned_lisn_lsa_kmeans.py @@ -0,0 +1,396 @@ +""" +Guided Aligned pipeline +============================= + +In this example we will demonstrate use of `GuidedAlignedUMAP` with pipeline API. + +""" + +############################################################################### +# Create Aligned Dataset +# ------------------------------------------ + +############################################################################### +# For using guided aligned UMAP, we need to use `cartodata.pipeline.datasets.CSVSliceDataset`. `CSVSliceDataset` divides the specified dataset into `slice_count` slices. +# +# Setting the `slice_type` parameter to `cumulative` enables us to make slices cumulative. +# +# Before slicing, we can sort the dataset using `sort_asc_by` parameter specifying the column name to sort in ascending order. + +from cartodata.pipeline.datasets import CSVSliceDataset # noqa +from pathlib import Path # noqa + +ROOT_DIR = Path.cwd().parent +# The directory where files necessary to load dataset columns reside +INPUT_DIR = ROOT_DIR / "datas" +# The directory where the generated dump files will be saved +TOP_DIR = ROOT_DIR / "dumps_interpolate_kneigh" + +slice_count = 3 + +dataset = CSVSliceDataset( + "lisn", input_dir=INPUT_DIR, version="3.0.0", filename="lisn_2000_2022.csv", + fileurl="https://zenodo.org/record/7323538/files/lisn_2000_2022.csv", + columns=None, slice_count=slice_count, slice_type="cumulative", + sort_asc="producedDateY_i", index_col=0 +) + +############################################################################### +# Now we should define our entities and set the column names corresponding to those entities from the data file. We have 4 entities: +# +# | entity | column name in the file | +# ---------|-------------| +# | articles | en_title_s | +# | authors | authFullName_s | +# | labs | structAcronym_s | +# | words | en_abstract_s, en_title_s, en_keyword_s, en_domainAllCodeLabel_fs | +# +# +# Cartolabe provides 4 types of columns: +# +# +# - **IdentityColumn**: The entity of this column represents the main entity of the dataset. The column data corresponding to the entity in the file should contain a single value and this value should be unique among column values. There can only be one `IdentityColumn` in the dataset. 
+# - **CSColumn**: The entity of this column type is related to the main entity, and can contain single or comma separated values.
+# - **CorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file. It uses a modified version of CountVectorizer (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer).
+# - **TfidfCorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file, or can contain a filepath from which to read the text corpus. It uses TfidfVectorizer (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html).
+#
+#
+# In this dataset, **Articles** is our main entity. We will define it as an IdentityColumn:
+
+from cartodata.pipeline.columns import IdentityColumn, CSColumn, CorpusColumn  # noqa
+
+articles_column = IdentityColumn(nature="articles", column_name="en_title_s")
+authors_column = CSColumn(
+    nature="authors", column_name="authFullName_s", filter_min_score=4
+)
+labs_column = CSColumn(
+    nature="labs", column_name="structAcronym_s", filter_min_score=4
+)
+words_column = CorpusColumn(
+    nature="words",
+    column_names=["en_abstract_s", "en_title_s",
+                  "en_keyword_s", "en_domainAllCodeLabel_fs"],
+    stopwords="stopwords.txt", nb_grams=4, min_df=10, max_df=0.05,
+    min_word_length=5, normalize=True
+)
+
+dataset.set_columns(
+    [articles_column, authors_column, labs_column, words_column])
+
+###############################################################################
+# As we are going to use guided aligned UMAP, we need to create an instance of `cartodata.pipeline.common.GuidedAlignedPipeline`.
+
+from cartodata.pipeline.common import GuidedAlignedPipeline  # noqa
+
+pipeline = GuidedAlignedPipeline(dataset, top_dir=TOP_DIR, input_dir=INPUT_DIR)
+
+###############################################################################
+# Creating correspondence matrices for each entity type
+# -------------------------------------------------------------------------------
+#
+# We want to extract matrices that will map the correspondence between the articles and the entities we want to use.
+#
+# `GuidedAlignedPipeline` has a `generate_entity_matrices` function to generate matrices and scores for each entity (nature) specified for the dataset.
+
+matrices_all, scores_all = pipeline.generate_entity_matrices()
+
+###############################################################################
+# We can list the sizes of each entity matrix for each slice.
+
+for i in range(dataset.slice_count):
+    print(f"############## Slice {i + 1} #################")
+    matrices_i = matrices_all[i]
+
+    for nature, matrix in zip(pipeline.natures, matrices_i):
+        print(f"{nature} ------------- {matrix.shape}")
+
+###############################################################################
+# Dimension reduction
+# ------------------------------
+#
+# One way to see the matrices that we created is as coordinates in the space of
+# all articles. What we want to do is to reduce the dimension of this space to
+# make it easier to work with and see.
+#
+# **LSA projection**
+#
+# We'll start by using the LSA (Latent Semantic Analysis) technique to reduce the number of rows in our data.
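+
+###############################################################################
+# For intuition only: LSA boils down to a truncated singular value
+# decomposition of the sparse entity matrices. Below is a minimal,
+# self-contained sketch using scikit-learn's `TruncatedSVD` on a toy random
+# matrix; `LSAProjection` presumably performs an equivalent reduction
+# internally, so this snippet is purely illustrative.
+
+from scipy.sparse import random as sparse_random  # noqa
+from sklearn.decomposition import TruncatedSVD  # noqa
+
+# a toy "articles x words" matrix: 200 articles, 1000 word features
+toy_matrix = sparse_random(200, 1000, density=0.01, random_state=42)
+
+# keep only the 20 strongest latent dimensions
+toy_svd = TruncatedSVD(n_components=20, random_state=42)
+toy_reduced = toy_svd.fit_transform(toy_matrix)
+
+print(toy_matrix.shape, "->", toy_reduced.shape)
+
+###############################################################################
+# Back to the pipeline: we configure the LSA projection with the number of
+# dimensions to keep and attach it to the pipeline.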
+
+from cartodata.pipeline.projectionnd import LSAProjection  # noqa
+
+num_dim = 100
+
+lsa_projection = LSAProjection(num_dim)
+pipeline.set_projection_nd(lsa_projection)
+
+###############################################################################
+# Now we can run the LSA projection on the matrices.
+
+matrices_nD_all = pipeline.do_projection_nD()
+
+""
+for i in range(dataset.slice_count):
+    print(f"############## Slice {i + 1} #################")
+    matrices_nD = matrices_nD_all[i]
+
+    for nature, matrix in zip(pipeline.natures, matrices_nD):
+        print(f"{nature} ------------- {matrix.shape}")
+
+
+###############################################################################
+# This makes it easier to work with them for clustering or nearest neighbors
+# tasks, but we also want to project them onto a 2D space to be able to map them.
+#
+# **Aligned UMAP projection**
+#
+# `UMAP <https://github.com/lmcinnes/umap>`_ (Uniform Manifold Approximation
+# and Projection) is a dimension reduction technique that can be used for
+# visualisation similarly to t-SNE.
+#
+# We use this algorithm to project the matrices of each slice into 2 dimensions, aligned with each other.
+#
+# `cartodata.pipeline.projection2d.GuidedAlignedUMAPProjection` differs from `cartodata.pipeline.projection2d.AlignedUMAPProjection` in that the aligned UMAP step is applied only to the articles. The 2D points obtained for the articles are then used as initial points in `cartodata.pipeline.projection2d.UMAPProjection`, which is fit and transformed with all the nD matrices. Here we configure an `AlignedKNeighborsProjection` instance as the 2D projection.
+
+from cartodata.pipeline.projection2d import AlignedKNeighborsProjection  # noqa
+
+n_neighbors = 20
+min_dist = 0.1
+
+projection_2d = AlignedKNeighborsProjection(
+    n_neighbors=n_neighbors,
+    min_dist=min_dist,
+    init='random',
+    random_state=42,
+    alignment_window_size=1,
+    alignment_regularisation=1e-3,
+    n_epochs=200)
+
+pipeline.set_projection_2d(projection_2d)
+
+###############################################################################
+# Now we can run the 2D projection.
+
+matrices_2D = pipeline.do_projection_2D(dump=True)
+
+""
+from cartodata.pipeline.clustering import KMeansClustering  # noqa
+
+# level of clusters, hl: high level, ml: medium level
+cluster_natures = ["hl_clusters", "ml_clusters"]
+
+kmeans_clustering = KMeansClustering(n=8, base_factor=3, natures=cluster_natures)
+
+pipeline.set_clustering(kmeans_clustering)
+
+""
+clus_nD, clus_2D, clus_scores, cluster_labels, cluster_eval_pos, cluster_eval_neg = pipeline.do_clustering()
+
+""
+pipeline.save_plots()
+
+""
+image_title_parts = pipeline.title_parts_clus("ml_clusters")
+
+image_name_ml_clusters_3_0_0 = "_".join(image_title_parts) + ".png"
+image_name_ml_clusters_3_0_0
+
+###############################################################################
+# Align a new version of the dataset
+# --------------------------------------------
+#
+# We have sliced version 3.0.0 of the LISN dataset into 3 cumulative slices and created 2D projections aligned with each other. It contains data up to and including 2022.
+#
+#
+# Now we will download a more recent version of the LISN data that also contains the publications of 2023, and we will align it with version 3.0.0 of the LISN dataset.
+ +from cartodata.scraping import scrape_hal, process_domain_column # noqa +from cartodata.command_line import STRUCT_MAP # noqa + +filters = {} +struct = "lisn" + +yearfrom = 2000 +yearto = 2024 + +file = dataset.input_dir / f"{struct.lower()}_{yearfrom}_{yearto - 1}.csv" + +if not file.exists(): + filters['structId_i'] = "(" + STRUCT_MAP[struct] + ")" + + years = range(yearfrom, yearto) + + df = scrape_hal(struct, filters, years, cool_down=2) + process_domain_column(df) + + df.to_csv(file) + +############################################################################### +# At this point we have two options to align with version 3.0.0. +# +# - We can use current projection_2d instance and update using that. +# - We can initiate a new GuidedAlignedUMAP instance and align specifying the previous version. +# +# **Using current GuidedAlignedUMAP** +# +# We will create a new dataset for the new dataset file. We will version this as 3.1.0. + +slice_count = 1 + +new_dataset = CSVSliceDataset( + "lisn", input_dir=INPUT_DIR, filename="lisn_2000_2023.csv", + slice_count=slice_count, version="3.1.0", sort_asc="producedDateY_i" +) + +new_dataset.set_columns( + [articles_column, authors_column, labs_column, words_column]) + +############################################################################### +# We will update pipeline's dataset with the new one. + +pipeline.update_dataset(new_dataset) + +############################################################################### +# We will generate entity matrices for the new dataset. + +matrices_all, scores_all = pipeline.generate_entity_matrices() + +############################################################################### +# We will do LSA projection. + +from cartodata.pipeline.projectionnd import LSAProjection # noqa + +num_dim = 100 + +lsa_projection = LSAProjection(num_dim) + +pipeline.set_projection_nd(lsa_projection) + +matrices_nD_all = pipeline.do_projection_nD() + +"" +matrices_nD = matrices_nD_all[0] + +for nature, matrix in zip(pipeline.natures, matrices_nD): + print(f"{nature} ------------- {matrix.shape}") + +############################################################################### +# Now we will run guided aligned UMAP to align the current version 3.1.0 of dataset with the current reducer. + +pipeline.do_projection_2D(prev_version="3.0.0", return_mat=True) + +"" +clus_nD, clus_2D, clus_scores, cluster_labels, cluster_eval_pos, cluster_eval_neg = pipeline.do_clustering() + +"" +pipeline.save_plots() + +"" +image_title_parts = pipeline.title_parts_clus("ml_clusters") + +image_name_ml_clusters_3_1_0 = "_".join(image_title_parts) + ".png" +image_name_ml_clusters_3_1_0 + +############################################################################### +# Let's view the dataset version 3.1.0 with the final slice of the dataset version 3.0.0. + +import matplotlib.pyplot as plt # noqa + +img1 = plt.imread(pipeline.working_dir.parent / "3.0.0" / "guided" / image_name_ml_clusters_3_0_0) +img2 = plt.imread(pipeline.working_dir / "guided" / image_name_ml_clusters_3_1_0) + +f, ax = plt.subplots(2, 1, figsize=(9, 12)) + +ax[0].imshow(img1) +ax[1].imshow(img2) + +ax[0].axis('off') +ax[1].axis('off') + +plt.tight_layout() +plt.show() + +############################################################################### +# **Using new GuidedAlignedUmap instance** +# +# Now we will follow the alternative flow and create a new instance of GuidedAlignedUMAPProjection and load the reducer from version 3.0.0 by reading the `reducer.pkl` file. 
+# +# First we will create new dataset, with version 3.2.0 with the same data. + +new_dataset = CSVSliceDataset( + "lisn", input_dir=INPUT_DIR, filename="lisn_2000_2023.csv", + slice_count=slice_count, version="3.2.0", sort_asc="producedDateY_i" +) + +new_dataset.set_columns( + [articles_column, authors_column, labs_column, words_column]) + +pipeline.update_dataset(new_dataset) + +"" +matrices_all, scores_all = pipeline.generate_entity_matrices() + +"" +from cartodata.pipeline.projectionnd import LSAProjection # noqa + +num_dim = 100 + +lsa_projection = LSAProjection(num_dim) + +pipeline.set_projection_nd(lsa_projection) + +matrices_nD_all = pipeline.do_projection_nD() + +"" +n_neighbors = 20 +min_dists = 0.1 + +projection_2d = AlignedKNeighborsProjection( + n_neighbors=n_neighbors, + min_dist=min_dists, + init='random', + random_state=42, + alignment_window_size = 1, + alignment_regularisation=1e-3, + n_epochs=200 +) + +pipeline.set_projection_2d(projection_2d) + +"" +pipeline.do_projection_2D(prev_version="3.0.0") + +"" +clus_nD, clus_2D, clus_scores, cluster_labels, cluster_eval_pos, cluster_eval_neg = pipeline.do_clustering() + +"" +pipeline.save_plots() + +"" +image_title_parts = pipeline.title_parts_clus("ml_clusters") + +image_name_ml_clusters_3_2_0 = "_".join(image_title_parts) + ".png" +image_name_ml_clusters_3_2_0 + +"" +img1 = plt.imread(pipeline.working_dir.parent / "3.0.0" / "guided" / image_name_ml_clusters_3_0_0) +img2 = plt.imread(pipeline.working_dir.parent/ "3.1.0" / "guided" / image_name_ml_clusters_3_1_0) +img3 = plt.imread(pipeline.working_dir / "guided" / image_name_ml_clusters_3_2_0) + +f, ax = plt.subplots(3, 1, figsize=(20, 15)) + +ax[0].imshow(img1) +ax[1].imshow(img2) +ax[2].imshow(img3) + +ax[0].axis('off') +ax[1].axis('off') +ax[2].axis('off') + +plt.tight_layout() +plt.show() + +"" + diff --git a/examples/pipeline_lisn_bert_kmeans.py b/examples/pipeline_lisn_bert_umap_kmeans.py similarity index 99% rename from examples/pipeline_lisn_bert_kmeans.py rename to examples/pipeline_lisn_bert_umap_kmeans.py index ac7fab9149d037d32d1e60a54570576bcb92e90a..de6b99b480ed0cb95ad5e050efa9a9061292bcb1 100644 --- a/examples/pipeline_lisn_bert_kmeans.py +++ b/examples/pipeline_lisn_bert_umap_kmeans.py @@ -353,7 +353,7 @@ pipeline.set_clustering(kmeans_clustering) # Now we can run clustering on the matrices. (clus_nD, clus_2D, clus_scores, cluster_labels, -cluster_eval_pos, cluster_eval_neg) = pipeline.create_clusters() +cluster_eval_pos, cluster_eval_neg) = pipeline.do_clustering() ############################################################################### # As we have specified two levels of clustering, the returned lists wil have two values. 
diff --git a/examples/pipeline_aligned_lisn_lsa_kmeans.py b/examples/pipeline_lisn_lsa_aligned_umap_kmeans.py
similarity index 98%
rename from examples/pipeline_aligned_lisn_lsa_kmeans.py
rename to examples/pipeline_lisn_lsa_aligned_umap_kmeans.py
index c3395060d995e9ff38c32fbca5b44544264d49fd..394f299b3a83eed10ceb5d7e637ea24b2b49cabc 100644
--- a/examples/pipeline_aligned_lisn_lsa_kmeans.py
+++ b/examples/pipeline_lisn_lsa_aligned_umap_kmeans.py
@@ -151,8 +151,8 @@ for i in range(dataset.slice_count):
 
 from cartodata.pipeline.projection2d import AlignedUMAPProjection # noqa
 
-n_neighbors = [20, 20, 100]
-min_dists = [0.1, 0.1, 0.3]
+n_neighbors = 20
+min_dists = 0.1
 
 
 projection_2d = AlignedUMAPProjection(
@@ -209,7 +209,7 @@ kmeans_clustering = KMeansClustering(
 pipeline.set_clustering(kmeans_clustering)
 
 (clus_nD_all, clus_2D_all, clus_scores_all, clus_labels_all,
- clus_eval_pos_all, clus_eval_neg_all) = pipeline.create_clusters()
+ clus_eval_pos_all, clus_eval_neg_all) = pipeline.do_clustering()
 
 ###############################################################################
 # We will now display high level clusters:
@@ -441,7 +441,7 @@ pipeline.plot_map(matrices_2D, labels, colors,
 ""
 (clus_nD_all, clus_2D_all, clus_scores_all, clus_labels_all,
- clus_eval_pos_all, clus_eval_neg_all) = pipeline.create_clusters()
+ clus_eval_pos_all, clus_eval_neg_all) = pipeline.do_clustering()
 
 ""
 ml_index = 1
@@ -509,8 +509,8 @@ new_dataset.set_columns(
 pipeline.update_dataset(new_dataset)
 
 ""
-n_neighbors = 200
-min_dists = 0.3
+n_neighbors = 20
+min_dists = 0.1
 
 projection_2d = AlignedUMAPProjection(
     n_neighbors=n_neighbors,
@@ -551,7 +551,7 @@ pipeline.plot_map(matrices_2D, labels, colors, title="New Dataset")
 
 ""
 (clus_nD_all, clus_2D_all, clus_scores_all, clus_labels_all,
- clus_eval_pos_all, clus_eval_neg_all) = pipeline.create_clusters()
+ clus_eval_pos_all, clus_eval_neg_all) = pipeline.do_clustering()
 
 ""
 ml_index = 1
diff --git a/examples/pipeline_lisn_lsa_tsne_kmeans.py b/examples/pipeline_lisn_lsa_tsne_kmeans.py
new file mode 100644
index 0000000000000000000000000000000000000000..adafed5a325dfbc1525cffe8f3f0fbd8be8f315c
--- /dev/null
+++ b/examples/pipeline_lisn_lsa_tsne_kmeans.py
@@ -0,0 +1,547 @@
+"""
+Processing LISN data with Pipeline API (LSA projection)
+====================================================================
+
+In this example we will process the LISN (Laboratoire Interdisciplinaire des Sciences du Numérique) dataset using the `Pipeline` API. The LISN dataset contains all articles from HAL (https://hal.archives-ouvertes.fr/) published by authors from LISN between 2000 and 2022.
+
+The pipeline will comprise the following steps:
+
+- extract entities (articles, authors, teams, labs, words) from a collection of
+  scientific articles
+- use Latent Semantic Analysis (LSA) to generate n-dimensional vector
+  representations of the entities
+- use t-distributed Stochastic Neighbor Embedding (t-SNE) to project those
+  entities in 2 dimensions
+- use KMeans clustering to cluster entities
+- find their nearest neighbors.
+
+"""
+
+###############################################################################
+# Create LISN Dataset
+# ====================
+#
+# We will first create the Dataset for LISN.
+#
+# The CSV file containing the data can be downloaded from https://zenodo.org/record/7323538/files/lisn_2000_2022.csv . We will use version 2.0.0 of the dataset. When we specify the URL to `CSVDataset`, it will download the file if it does not exist locally.
+ +from cartodata.pipeline.datasets import CSVDataset # noqa +from pathlib import Path # noqa + +ROOT_DIR = Path.cwd().parent +# The directory where files necessary to load dataset columns reside +INPUT_DIR = ROOT_DIR / "datas" +# The directory where the generated dump files will be saved +TOP_DIR = ROOT_DIR / "dumps" + +dataset = CSVDataset(name="lisn", input_dir=INPUT_DIR, version="2.0.0", filename="lisn_2000_2022.csv", + fileurl="https://zenodo.org/record/7323538/files/lisn_2000_2022.csv", + columns=None, index_col=0) + +############################################################################### +# This will check if the dataset file already exists locally. If it does not, it downloads the file from the specified URL and the loads the file to a pandas Dataframe. +# +# Let's view the dataset. + +df = dataset.df + +df.head(5) + +############################################################################### +# The dataframe that we just read consists of 4262 articles as rows. + +df.shape[0] + +############################################################################### +# And their authors, abstract, keywords, title, research labs and domain as columns. + +print(*df.columns, sep="\n") + +############################################################################### +# Now we should define our entities and set the column names corresponding to those entities from the data file. We have 5 entities: +# +# | entity | column name in the file | +# ---------|-------------| +# | articles | en_title_s | +# | authors | authFullName_s | +# | teams | structAcronym_s | +# | labs | structAcronym_s | +# | words | en_abstract_s, en_title_s, en_keyword_s, en_domainAllCodeLabel_fs | +# +# +# Cartolabe provides 4 types of columns: +# +# +# - **IdentityColumn**: The entity of this column represents the main entity of the dataset. The column data corresponding to the entity in the file should contain a single value and this value should be unique among column values. There can only be one `IdentityColumn` in the dataset. +# - **CSColumn**: The entity of this column type is related to the main entity, and can contain single or comma separated values. +# - **CorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file. It uses a modified version of CountVectorizer(https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer). +# - **TfidfCorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file or can contain filepath from which to read the text corpus. It uses TfidfVectorizer (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). +# +# +# In this dataset, **Articles** is our main entity. We will define it as IdentityColumn: + +from cartodata.pipeline.columns import IdentityColumn, CSColumn, CorpusColumn # noqa + +articles_column = IdentityColumn(nature="articles", column_name="en_title_s") + +############################################################################### +# `authFullName_s` column for entity **authors** in the dataset lists the authors who have authored each article, and has comma separated values. 
We will define a CSColumn:
+
+authors_column = CSColumn(nature="authors", column_name="authFullName_s", filter_min_score=4)
+
+###############################################################################
+# Here we have set `filter_min_score=4` to indicate that, while processing the data, authors who have authored fewer than 4 articles will be filtered out. When it is not set, the default value is `0`, meaning that no entities will be filtered.
+#
+# The **Teams** and **Labs** entities both use the `structAcronym_s` column, which also has comma separated values. The `structAcronym_s` column contains both the teams and the labs of the articles. For the teams entity we will take only teams, and for the labs entity we will take only labs.
+#
+# The file **../datas/inria-teams.csv** contains the list of Inria teams. For the teams entity, we will whitelist the values from inria-teams.csv, and for the labs entity, we will blacklist the values from inria-teams.csv.
+
+teams_column = CSColumn(nature="teams", column_name="structAcronym_s", whitelist="inria-teams.csv",
+                        filter_min_score=4)
+
+labs_column = CSColumn(nature="labs", column_name="structAcronym_s", blacklist="inria-teams.csv",
+                       filter_min_score=4)
+
+###############################################################################
+# For the **words** entity, we are going to use multiple columns to create a text corpus for each article:
+
+words_column = CorpusColumn(nature="words",
+                            column_names=["en_abstract_s", "en_title_s", "en_keyword_s", "en_domainAllCodeLabel_fs"],
+                            stopwords="stopwords.txt", nb_grams=4, min_df=10, max_df=0.05,
+                            min_word_length=5, normalize=True)
+
+###############################################################################
+# Now we are going to set the columns of the dataset:
+
+dataset.set_columns([articles_column, authors_column, teams_column, labs_column, words_column])
+
+###############################################################################
+# We can set the columns in any order that we prefer. Here the first entity is the identity entity and the last entity is the corpus. If we set the entities in a different order, the `Dataset` will put the main entity first.
+#
+# The dataset for the LISN data is ready. Now we will create and run our pipeline. For this pipeline, we will:
+#
+# - run LSA projection -> N-dimensional
+# - run t-SNE projection -> 2D
+# - cluster entities
+# - find nearest neighbors
+
+###############################################################################
+# Create and run pipeline
+# =======================
+
+###############################################################################
+# We will first create a pipeline with the dataset.
+
+from cartodata.pipeline.common import Pipeline  # noqa
+
+pipeline = Pipeline(dataset=dataset, top_dir=TOP_DIR, input_dir=INPUT_DIR, hierarchical_dirs=True)
+
+###############################################################################
+# The workflow generates the `natures` from the dataset columns.
+
+pipeline.natures
+
+###############################################################################
+# Creating correspondence matrices for each entity type
+# -------------------------------------------------------------------------------
+#
+# From this table of articles, we want to extract matrices that will map the
+# correspondence between these articles and the entities we want to use.
+#
+# Pipeline has a `generate_entity_matrices` function to generate matrices and scores for each entity (nature) specified for the dataset.
+ +matrices, scores = pipeline.generate_entity_matrices(force=True) + +############################################################################### +# The order of matrices and scores correspond to the order of dataset columns specified. + +dataset.natures + +############################################################################### +# **Articles** +# +# The first matrix in matrices and Series in scores corresponds to **articles**. +# +# The type for article column is `IdentityColumn`. It generates a matrix that simply maps each article to itself. + +articles_mat = matrices[0] +articles_mat.shape + +############################################################################### +# Having type `IdentityColumn`, each article will have score 1. + +articles_scores = scores[0] +articles_scores.shape + +"" +articles_scores.head() + +############################################################################### +# **Authors** +# +# The second matrix in matrices and score in scores correspond to **authors**. +# +# The type for authors column is `CSColumn`. It generates a sparce matrix where rows correspond to articles and columns corresponds to authors. + +authors_mat = matrices[1] +authors_mat.shape + +############################################################################### +# Here we see that after filtering authors which have less than 4 articles, there are 694 distinct authors. +# +# The series, which we named `authors_scores`, contains the list of authors +# extracted from the column `authFullName_s` with a score that is equal to the +# number of rows (articles) that this value was mapped within the `authors_mat` +# matrix. + +authors_scores = scores[1] +authors_scores.head() + +############################################################################### +# If we look at the *4th* column of the matrix, which corresponds to the author +# **Ralf Treinen**, we can see that it has 5 non-zero rows, each row +# indicating which articles he authored. + +print(authors_mat[:, 3]) + +############################################################################### +# **Teams** +# +# The third matrix in matrices and score in scores correspond to **teams**. +# +# The type for teams column is `CSColumn`. It generates a sparce matrix where rows correspond to articles and columns corresponds to teams. + +teams_mat = matrices[2] +teams_mat.shape + +############################################################################### +# Here we see that after filtering teams which have less than 4 articles, there are 33 distinct teams. +# +# The series, which we named `teams_scores`, contains the list of teams +# extracted from the column `structAcronym_s` with a score that is equal to the +# number of rows (articles) that this value was mapped within the `teams_mat` +# matrix. + +teams_scores = scores[2] +teams_scores.head() + +############################################################################### +# **Labs** +# +# The fourth matrix in matrices and score in scores correspond to **labs**. +# +# The type for labs column is `CSColumn`. It generates a sparce matrix where rows correspond to articles and columns corresponds to labs. + +labs_mat = matrices[3] +labs_mat.shape + +############################################################################### +# Here we see that after filtering labs which have less than 4 articles, there are 549 distinct labs. 
+# +# The series, which we named `labs_scores`, contains the list of labs +# extracted from the column `structAcronym_s` with a score that is equal to the +# number of rows (articles) that this value was mapped within the `labs_mat` +# matrix. + +labs_scores = scores[3] +labs_scores.head() + +############################################################################### +# **Words** +# +# The fifth matrix in matrices and score in scores correspond to **words**. +# +# The type for words column is `CorpusColumn`. It creates a corpus merging multiple text columns in the dataset, and then extracts n-grams from that corpus. Finally it generates a sparce matrix where rows correspond to articles and columns corresponds to n-grams. + +words_mat = matrices[4] +words_mat.shape + +############################################################################### +# Here we see that there are 5226 distinct n-grams. +# +# The series, which we named `words_scores`, contains the list of n-grams +# with a score that is equal to the number of rows (articles) that this value +# was mapped within the `words_mat` matrix. + +words_scores = scores[4] +words_scores.head() + +############################################################################### +# Dimension reduction +# ------------------------------ +# +# One way to see the matrices that we created is as coordinates in the space of +# all articles. What we want to do is to reduce the dimension of this space to +# make it easier to work with and see. +# +# **LSA projection** +# +# We'll start by using the LSA (Latent Semantic Analysis) technique to reduce the number of rows in our data. + +from cartodata.pipeline.projectionnd import LSAProjection # noqa + +num_dim = 80 + +lsa_projection = LSAProjection(num_dim) + +pipeline.set_projection_nd(lsa_projection) + +############################################################################### +# Now we can run LSA projection on the matrices. + +matrices_nD = pipeline.do_projection_nD(force=True) + +"" +for nature, matrix in zip(pipeline.natures, matrices_nD): + print(f"{nature} ------------- {matrix.shape}") + +############################################################################### +# We have 80 rows for each entity. + +############################################################################### +# This makes it easier to work with them for clustering or nearest neighbors +# tasks, but we also want to project them on a 2D space to be able to map them. +# +# **TSNE projection** +# +# We use TSNE to project our matrices in 2 dimensions. + +from cartodata.pipeline.projection2d import TSNEProjection # noqa + + +umap_projection = TSNEProjection() + +pipeline.set_projection_2d(umap_projection) + +############################################################################### +# Now we can run TSNE projection on the LSA matrices. + +matrices_2D = pipeline.do_projection_2D(force=True) + +############################################################################### +# Now that we have 2D coordinates for our points, we can try to plot them to +# get a feel of the data's shape. + +labels = tuple(pipeline.natures) +colors = ['b', 'r', 'c', 'y', 'm'] + +fig, ax = pipeline.plot_map(matrices_2D, labels, colors) + +############################################################################### +# The plot above, as we don't have labels for the points, doesn't make much sense +# as is. But we can see that the data shows some clusters which we could try to identify. 
+# +# Clustering +# --------------- +# +# In order to identify clusters, we use the KMeans clustering technique on the +# articles. We'll also try to label these clusters by selecting the most +# frequent words that appear in each cluster's articles. + +from cartodata.pipeline.clustering import KMeansClustering # noqa + +# level of clusters, hl: high level, ml: medium level +cluster_natures = ["hl_clusters", "ml_clusters"] + +kmeans_clustering = KMeansClustering(n=8, base_factor=3, natures=cluster_natures) + +pipeline.set_clustering(kmeans_clustering) + +############################################################################### +# Now we can run clustering on the matrices. + +(clus_nD, clus_2D, clus_scores, cluster_labels, +cluster_eval_pos, cluster_eval_neg) = pipeline.do_clustering() + +############################################################################### +# As we have specified two levels of clustering, the returned lists wil have two values. + +len(clus_2D) + +############################################################################### +# We will now display two levels of clusters in separate plots, we will start with high level clusters: + +clus_scores_hl = clus_scores[0] +clus_mat_hl = clus_2D[0] + + +fig_hl, ax_hl = pipeline.plot_map(matrices_2D, labels, colors, + title="LISN Dataset High Level Clusters", + annotations=clus_scores_hl.index, annotation_mat=clus_mat_hl) + +############################################################################### +# The 8 high level clusters that we created give us a general idea of what the big +# clusters of data contain. +# +# With medium level clusters we have a finer level of detail: + +clus_scores_ml = clus_scores[1] +clus_mat_ml = clus_2D[1] + +fig_ml, ax_ml = pipeline.plot_map(matrices_2D, labels, colors, + title="LISN Dataset Medium Level Clusters", + annotations=clus_scores_ml.index, annotation_mat=clus_mat_ml, + annotation_color='black') + +############################################################################### +# We have 24 medium level clusters. We can increase the number of clusters to have even finer details to zoom in and focus on smaller areas. +# +# Now we will save the plots in the `working_dir` directory. + +pipeline.save_plots() + +"" +for file in pipeline.working_dir.glob("*.png"): + print(file) + +############################################################################### +# Nearest neighbors +# ---------------------------- +# +# One more thing which could be useful to appreciate the quality of our data +# would be to get each point's nearest neighbors. If our data processing is +# done correctly, we expect the related articles, labs, words and authors to be +# located close to each other. +# +# Finding nearest neighbors is a common task with various algorithms aiming to +# solve it. The `find_neighbors` method uses one of these algorithms to find the +# nearest points of all entities (articles, authors, teams, +# labs, words). It takes an optional weight parameter to tweak +# the distance calculation to select points that have a higher score but are +# maybe a bit farther instead of just selecting the closest neighbors. 
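+
+###############################################################################
+# A toy illustration of that idea (this is not the exact formula used by
+# cartodata, just a sketch of how a score-weighted ranking can differ from a
+# purely distance-based one):

+import numpy as np  # noqa
+
+# distances and scores of 5 candidate neighbors for a single query point
+cand_distances = np.array([0.10, 0.12, 0.15, 0.20, 0.30])
+cand_scores = np.array([2.0, 50.0, 3.0, 40.0, 1.0])
+power = 0.5
+
+# purely distance-based ranking: the closest candidates win
+print(np.argsort(cand_distances)[:3])
+
+# score-weighted ranking: dividing by score**power favours well-scored
+# candidates even when they are slightly farther away
+print(np.argsort(cand_distances / cand_scores ** power)[:3])
+
+###############################################################################
+# With the pipeline, we only configure `AllNeighbors` with the number of
+# neighbors and an optional list of weights (one value per nature, as used
+# below).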
+ +from cartodata.pipeline.neighbors import AllNeighbors + +n_neighbors = 10 +weights = [0, 0.5, 0.5, 0, 0] + +neighboring = AllNeighbors(n_neighbors=n_neighbors, power_scores=weights) + +pipeline.set_neighboring(neighboring) + +pipeline.find_neighbors() + + +############################################################################### +# Export file using exporter +# ======================= +# +# We can now export the data. To export the data, we need to configure the exporter. +# +# The exported data will be the points extracted from the dataset corresponding to the entities that we have defined. +# +# In the export file, we will have the following columns for each point: +# +# +# | column | value | +# ---------|-------------| +# | nature | one of articles, authors, teams, labs, words | +# | label | point's label | +# | score | point's score | +# | rank | point's rank | +# | x | point's x location on the map | +# | y | point's y location on the map | +# | nn_articles | neighboring articles to this point | +# | nn_teams | neighboring teams to this point | +# | nn_labs | neighboring labs to this point | +# | nn_words | neighboring words to this point | +# +# we will call `pipeline.export` function. It will create `export.feather` file and save under `pipeline.working_dir`. + +pipeline.export() + +############################################################################### +# Let's display the contents of the file. + +import pandas as pd # noqa + +df = pd.read_feather(pipeline.get_clus_dir() / "export.feather") +df.head() + +############################################################################### +# This is a basic export file. For each point, we can add additional columns. +# +# For example, for each author, we can add **labs** and **teams** columns to list the labs and teams that the author belongs to. We can also merge the teams and labs in one column and name it as labs. To do that we have to first create export config for the entity (nature) that we would like to modify. + +from cartodata.pipeline.exporting import ( + ExportNature, MetadataColumn +) # noqa + +ex_author = ExportNature(key="authors", + refs=["labs", "teams"], + merge_metadata=[{"columns": ["teams", "labs"], + "as_column": "labs"}]) + +############################################################################### +# We can do the same for articles. Each article will have **teams** and **labs** data, and additionally **author** of the article. So we can set `refs=["labs", "teams", "authors"]`. +# +# The original dataset contains a column `producedDateY_i` which contains the year that the article is published. We can add this data as metadata for the point but updating column name with a more clear alternative `year`. We can also add a function to apply to the column value. In this example we will convert column value to string. 
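+
+###############################################################################
+# The `func` argument is given as a string expression; presumably it is
+# evaluated with `x` bound to the column values as a pandas Series (this is an
+# assumption made here for illustration). A toy equivalent of `x.astype(str)`:
+
+import pandas as pd  # noqa
+
+toy_years = pd.Series([2020, 2021, 2022], name="producedDateY_i")
+print(toy_years.astype(str).tolist())  # ['2020', '2021', '2022']
+
+###############################################################################
+# The equivalent declaration with `MetadataColumn`: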
+ +meta_year_article = MetadataColumn(column="producedDateY_i", as_column="year", + func="x.astype(str)") + +############################################################################### +# We will also add `halId_s` column as `url` and set empty string if the value does not exist: + +meta_url_article = MetadataColumn(column="halId_s", as_column="url", func="x.fillna('')") + +"" +ex_article = ExportNature(key="articles", refs=["labs", "teams", "authors"], + merge_metadata=[{"columns": ["teams", "labs"], + "as_column": "labs"}], + add_metadata=[meta_year_article, meta_url_article]) + +pipeline.export(export_natures=[ex_article, ex_author]) + +############################################################################### +# Now we can load the new `export.feather` file to see the difference. + +df = pd.read_feather(pipeline.get_clus_dir() / "export.feather") + +df.head() + +############################################################################### +# For the points of nature **articles**, we have additional **labs**, **authors**, **year**, **url** columns. +# +# Let's see the points of nature **authors**: + +df[df["nature"] == "authors"].head() + +############################################################################### +# We have values for labs field, but not for authors, year, or url field. +# +# As we have not defined any relation for points of natures **teams**, **labs** and **words**, these new columns are empty for those points. + +df[df["nature"] == "teams"].head() + +"" +df[df["nature"] == "labs"].head() + +"" +df[df["nature"] == "words"].head() + +"" +df['x'][1] + +############################################################################### +# Export to json file +# ------------------------------- + +############################################################################### +# We can export the data to a **json** file as well. + +export_json_file = pipeline.get_clus_dir() / 'pipeline_lisn_lsa_tsne_kmeans.json' + +pipeline.exporter.export_to_json(export_json_file) + +############################################################################### +# This creates the `pipeline_lisn_lsa_tsne_kmeans.json` file which contains a list of points +# ready to be imported into Cartolabe. Have a look at it to check that it +# contains everything. + +import json # noqa + +with open(export_json_file, 'r') as f: + data = json.load(f) + +data[1] diff --git a/examples/pipeline_lisn_lsa_hdbscan_hierarchical.py b/examples/pipeline_lisn_lsa_umap_hdbscan_hierarchical.py similarity index 99% rename from examples/pipeline_lisn_lsa_hdbscan_hierarchical.py rename to examples/pipeline_lisn_lsa_umap_hdbscan_hierarchical.py index d3d9ac51f259f6283bd2b8faba2cb749625e9c42..e0dd574b53b77ce5c56741693e000add1a6cac53 100644 --- a/examples/pipeline_lisn_lsa_hdbscan_hierarchical.py +++ b/examples/pipeline_lisn_lsa_umap_hdbscan_hierarchical.py @@ -355,7 +355,7 @@ pipeline.set_clustering(hdbscan_clustering) # Now we can run clustering on the matrices. (clus_nD, clus_2D, clus_scores, cluster_labels, -cluster_eval_pos, cluster_eval_neg) = pipeline.create_clusters() +cluster_eval_pos, cluster_eval_neg) = pipeline.do_clustering() ############################################################################### # As we have specified two levels of clustering, the returned lists wil have two values. 
diff --git a/examples/pipeline_lisn_lsa_kmeans.py b/examples/pipeline_lisn_lsa_umap_kmeans.py similarity index 98% rename from examples/pipeline_lisn_lsa_kmeans.py rename to examples/pipeline_lisn_lsa_umap_kmeans.py index e8dcf56fb8c8de1b84e8c0f1a27b1238ca24c0d7..32714db9b01242f5089fc30756b7df6a4a0ae271 100644 --- a/examples/pipeline_lisn_lsa_kmeans.py +++ b/examples/pipeline_lisn_lsa_umap_kmeans.py @@ -355,7 +355,7 @@ pipeline.set_clustering(kmeans_clustering) # Now we can run clustering on the matrices. (clus_nD, clus_2D, clus_scores, cluster_labels, -cluster_eval_pos, cluster_eval_neg) = pipeline.create_clusters() +cluster_eval_pos, cluster_eval_neg) = pipeline.do_clustering() ############################################################################### # As we have specified two levels of clustering, the returned lists wil have two values. @@ -534,12 +534,12 @@ df['x'][1] ############################################################################### # We can export the data to a **json** file as well. -export_json_file = pipeline.get_clus_dir() / 'lisn_workflow_lsa.json' +export_json_file = pipeline.get_clus_dir() / 'pipeline_lisn_lsa_umap_kmeans.json' pipeline.exporter.export_to_json(export_json_file) ############################################################################### -# This creates the `lisn_workflow_lsa.json` file which contains a list of points +# This creates the `pipeline_lisn_lsa_umap_kmeans.json` file which contains a list of points # ready to be imported into Cartolabe. Have a look at it to check that it # contains everything. diff --git a/examples/pipeline_yaml_lisn.py b/examples/pipeline_yaml_lisn.py index 64f8fef2072398a2a4b228f266f6e502ebe40bd2..f0d58e25f914cadd9a43b34099becd68a0c9ebaa 100644 --- a/examples/pipeline_yaml_lisn.py +++ b/examples/pipeline_yaml_lisn.py @@ -298,7 +298,7 @@ pipeline.plot_map(matrices_2D, labels, colors) # First we will create clusters: (clus_nD, clus_2D, clus_scores, cluster_labels, -cluster_eval_pos, cluster_eval_neg) = pipeline.create_clusters(force=True) +cluster_eval_pos, cluster_eval_neg) = pipeline.do_clustering(force=True) ############################################################################### # We will view only medium levels clusters: