diff --git a/cartodata/pipeline/scoring.py b/cartodata/pipeline/scoring.py
new file mode 100644
index 0000000000000000000000000000000000000000..80502cee8817763f5140a948e1186e004eecb629
--- /dev/null
+++ b/cartodata/pipeline/scoring.py
@@ -0,0 +1,353 @@
+import numpy as np
+import pandas as pd
+from sklearn import metrics
+
+
+class ScoreManager:
+
+    def __init__(self, key, params, dump_dir, score_dir):
+        self.key = key
+        self.id = params["id"]
+        self.params = params
+        self.dump_dir = dump_dir
+        self.score_dir = score_dir
+
+    def calculate_score(self):
+        raise NotImplementedError()
+
+    def can_continue(self):
+        raise NotImplementedError()
+
+    def add_score(self, name, value, df_all_scores):
+        """Append `value` to the per-score CSV and to `df_all_scores`."""
+        score_file = self.score_dir / f"{name}.csv"
+        score_df = self.read_file_to_df(score_file)
+
+        new_hor_value = pd.DataFrame([[value]], columns=[name],
+                                     index=[self.params])
+        score_df = pd.concat([score_df, new_hor_value])
+
+        score_df.to_csv(score_file)
+
+        df_all_scores = self.update_all_scores(name, value, df_all_scores)
+
+        return df_all_scores
+
+    def update_all_scores(self, name, value, df_all_scores):
+        new_ver_value = pd.DataFrame([[value]], columns=["value"],
+                                     index=[name])
+        df_all_scores = pd.concat([df_all_scores, new_ver_value])
+
+        return df_all_scores
+
+    def add_desc_scores(self, desc_name, value, valuescore):
+        desc_score_file = self.score_dir / f"{desc_name}.csv"
+        desc_score_file_loc = self.dump_dir / f"{desc_name}.csv"
+
+        desc_score_df = self.read_file_to_df(desc_score_file)
+
+        value = value.to_list()
+
+        new_value = pd.DataFrame(
+            [[valuescore] + value], index=[self.params],
+            columns=["value"] + [str(i) for i in range(len(value))]
+        )
+        desc_score_df = pd.concat([desc_score_df, new_value])
+        desc_score_df.to_csv(desc_score_file)
+
+        new_value.transpose().to_csv(desc_score_file_loc)
+
+    def read_file_to_df(self, filepath):
+        try:
+            df = pd.read_csv(filepath, index_col=0)
+        except FileNotFoundError:
+            df = pd.DataFrame()
+
+        return df
+
+
+class ScoreManagerProjection(ScoreManager):
+
+    def __init__(self, key, dim_key, next_params, dump_dir, score_dir):
+        super().__init__(key, next_params, dump_dir, score_dir)
+        self.dim_key = dim_key
+
+    def calculate_score(self, pipeline, source, target, min_nb=30,
+                        recompute=False):
+        dataset_dir = pipeline.working_dir
+        scores_source = pipeline.load_scores([source], dataset_dir)[0]
+        scores_target = pipeline.load_scores([target], dataset_dir)[0]
+
+        matrix_source = pipeline.load_matrices([source], pipeline.dataset.key,
+                                               dataset_dir)[0]
+
+        if recompute:
+            weight = [0.0]
+            matrix = pipeline.load_matrices([target], self.key,
+                                            self.dump_dir)[0]
+            from cartodata.neighbors import get_neighbors  # noqa
+
+            matrix_nn = get_neighbors(matrix, scores_target, [matrix],
+                                      weight)[0]
+        else:
+            # TODO fix this: fall back to the precomputed neighbors dump
+            import gzip
+            filename = (
+                self.dump_dir / f"nearest_{target}_for_{target}.npy.gz"
+            )
+            with gzip.GzipFile(filename, 'r') as file:
+                matrix_nn = np.load(file)
+
+        # Only score sources that have at least `min_nb` associated items.
+        use = scores_source >= min_nb
+        artauts = matrix_source.T.tolil().rows
+        nntransp = matrix_nn.T
+        numb_nn = nntransp.shape[1]
+
+        # For each source, count the neighbors of its items that belong to
+        # the same source, then normalize by the maximum possible count.
+        isin = [np.sum([
+            art in artauts[aut] for art in nntransp[artauts[aut], :].flatten()
+        ]) for aut in range(len(scores_source))]
+
+        maxsum = [
+            scores_source[aut] * numb_nn for aut in range(len(scores_source))
+        ]
+
+        rate = np.divide(isin, maxsum)
+        filt = use * rate
+        moy = np.sum(filt) / np.sum(use)
+
+        return moy, filt[use]
+
+
+class ScoreND(ScoreManagerProjection):
+
+    def __init__(self, next_params, dump_dir, score_dir):
+        super().__init__(next_params['projectionnD'], "nD", next_params,
+                         dump_dir, score_dir)
+
+
+class Score2D(ScoreManagerProjection):
+
+    def __init__(self, key, next_params, dump_dir, score_dir):
+        super().__init__(key, "2D", next_params, dump_dir, score_dir)
+
+
+class ScoreClustering(ScoreManager):
+
+    def __init__(self, next_params, dump_dir, score_dir):
+        super().__init__("clustering", next_params, dump_dir, score_dir)
+
+    def calculate_score(self, clus_evals, clus_scores, df_all_scores):
+        avg_stab = 0.0
+        avg_couv = 0.0
+        for s in range(len(clus_evals)):
+            st = str(s)
+            clus_size = clus_scores[s]
+            neg_detail = clus_evals[s]["neg_detail"]
+            pos_detail = clus_evals[s]["pos_detail"]
+
+            avg_clus = clus_evals[s]["avg_word_couv"]
+            df_all_scores = self.add_score("avg_word_couv_" + st, avg_clus,
+                                           df_all_scores)
+
+            stab_clus = clus_evals[s]["clus_name_stability"]
+            df_all_scores = self.add_score(
+                "stab_clus_" + st, stab_clus, df_all_scores)
+
+            # Per-cluster description: "<cluster> : s <size> stb <stability>
+            # + <pos_detail> - <neg_detail>", sorted by decreasing size.
+            stabclu_list = clus_evals[s]["clus_name_stab_detail_clu"]
+            clus_sort = clus_size.sort_values(ascending=False)
+            all_clu = pd.DataFrame({"size": clus_sort,
+                                    "pos_detail": pos_detail,
+                                    "neg_detail": neg_detail,
+                                    "stability": stabclu_list})
+            all_clu = all_clu.sort_values(["size"], ascending=False)
+            descclus = (
+                all_clu.index.astype(str)
+                + " : s " + all_clu["size"].values.astype(str)
+                + " stb " + all_clu["stability"].map(
+                    '{0:.2f}'.format).astype(str)
+                + " + " + all_clu["pos_detail"].map(
+                    '{0:.2f}'.format).astype(str)
+                + " - " + all_clu["neg_detail"].map(
+                    '{0:.2f}'.format).astype(str)
+            )
+
+            self.add_desc_scores("clus_eval_posdetail_" + st, descclus,
+                                 avg_clus)
+
+            sil_clus = clus_evals[s]["silhouette"]
+            df_all_scores = self.add_score("silhouette_" + st,
+                                           sil_clus, df_all_scores)
+
+            med_couv = clus_evals[s]["med_word_couv"]
+            df_all_scores = self.add_score("med_word_couv_" + st,
+                                           med_couv, df_all_scores)
+
+            couv_minus = clus_evals[s]["avg_word_couv_minus"]
+            df_all_scores = self.add_score("avg_word_couv_minus_" + st,
+                                           couv_minus, df_all_scores)
+
+            ratio = clus_evals[s]["big_small_ratio"]
+            df_all_scores = self.add_score("big_small_ratio_" + st,
+                                           ratio, df_all_scores)
+
+            avg_stab += stab_clus
+            avg_couv += avg_clus
+
+        # TODO fix avg_stab and avg_couv
+        avg_stab = avg_stab / len(clus_evals)
+        avg_couv = avg_couv / len(clus_evals)
+
+        df_all_scores = self.add_score("avg_stab_avg", avg_stab,
+                                       df_all_scores)
+        df_all_scores = self.add_score("avg_couv_avg", avg_couv,
+                                       df_all_scores)
+        df_all_scores = self.add_score("clu_score", (avg_couv + avg_stab) / 2,
+                                       df_all_scores)
+
+        return df_all_scores
+
+
+class ScoreFinal(ScoreManager):
+
+    def __init__(self, next_params, dump_dir, score_dir):
+        super().__init__("final", next_params, dump_dir, score_dir)
+
+    def calculate_score(self, valuelist, df_all_scores):
+        """Return the mean of the named scores in `df_all_scores`."""
+        initvalue = 0.0
+        for s in valuelist:
+            initvalue += float(df_all_scores.loc[s]["value"])
+        value = initvalue / len(valuelist)
+
+        return value
+
+    @property
+    def final_results(self):
+        final_result_file = self.score_dir / "final_results.csv"
+        try:
+            final_results = pd.read_csv(final_result_file, header=0,
+                                        index_col=0)
+        except FileNotFoundError:
+            paramlistconcat = ["rank", "agscore"]
+            paramlistconcat = (
+                paramlistconcat + list(self.params.keys())
+            )
+            paramlistconcat = paramlistconcat[:-1]
+
+            final_results = pd.DataFrame(columns=paramlistconcat)
+
+            final_results.to_csv(final_result_file)
+
+        return final_results
+
+    def add_final_score(self, name, value, df_all_scores):
+        final_results = self.final_results
+
+        idsc = len(final_results)
+
+        horcols = ["rank", "agscore"] + list(self.params.keys()) + \
+            list(df_all_scores.T.columns) + ["dump", "active"]
+        horlist = (
+            [0] + [value] + list(self.params.values()) +
+            df_all_scores.T.astype(float).values.tolist()[0] +
+            [self.dump_dir] + [True]
+        )
+        new_hor_value = pd.DataFrame([horlist], columns=horcols, index=[idsc])
+
+        if idsc == 0:
+            final_results = new_hor_value
+        else:
+            new_hor_value.columns = final_results.columns
+            final_results.loc[idsc] = new_hor_value.loc[idsc]
+
+        final_results = final_results.sort_values(["agscore"],
+                                                  ascending=False)
+
+        final_results.to_csv(self.score_dir / "final_results.csv")
+
+
+class ClusterEvaluator:
+
+    def __init__(self, iter_stab=2, remove_stab=(0, .01, .03, .1, .25)):
+        self.iter_stab = iter_stab
+        self.remove_stab = remove_stab
+        self.clus_evals = []
+
+    def reset(self):
+        self.clus_evals = []
+
+    def evaluate(self, clustering, nb_clusters, c_o, clustering_table,
+                 naming_table, natural_space_naming_table,
+                 naming_scores, previous_cluster_labels,
+                 naming_profile_table, c_scores,
+                 cluster_eval_pos, cluster_eval_neg):
+
+        # TODO check initial labels
+        initial_labels = previous_cluster_labels.copy()
+
+        labels = c_o.labels_
+        silhouette = metrics.silhouette_score(clustering_table.T, labels,
+                                              metric='euclidean')
+        c_evals = {}
+        c_evals['nbclust'] = nb_clusters
+        c_evals['silhouette'] = silhouette
+        c_evals['avg_word_couv'] = np.mean(cluster_eval_pos.values)
+        c_evals['med_word_couv'] = np.median(cluster_eval_pos.values)
+        c_evals['avg_word_couv_minus'] = np.mean(
+            cluster_eval_pos.values) - np.mean(cluster_eval_neg.values)
+        c_evals['pos_detail'] = cluster_eval_pos
+        c_evals['neg_detail'] = cluster_eval_neg
+
+        # Size ratio between the largest and the smallest cluster.
+        _, counts = np.unique(labels, return_counts=True)
+        big_small_ratio = np.max(counts) / np.min(counts)
+        c_evals['big_small_ratio'] = big_small_ratio
+
+        if self.iter_stab > 0:
+            # Name stability: re-cluster `iter_stab` times on random
+            # subsamples and count how often each cluster name reappears.
+            cluster_scores_stab = pd.Series(0.0, index=c_scores.index)
+            stabdetail = {}
+
+            nbtot = self.iter_stab * len(self.remove_stab)
+            lentot = len(c_scores.index.values) * self.iter_stab
+
+            for samp in self.remove_stab:
+                stabdetail[samp] = 0.0
+
+                for _ in range(self.iter_stab):
+                    subsamp = clustering_table
+                    natural_sample = natural_space_naming_table
+
+                    if samp > 0:
+                        select = np.random.choice(
+                            clustering_table.shape[1],
+                            int(clustering_table.shape[1] * (1.0 - samp)),
+                            replace=False
+                        )
+                        subsamp = clustering_table[:, select]
+                        natural_sample = natural_space_naming_table[select, :]
+
+                    (cs_lsa, cs_umap, cs_scores, cs_km,
+                     cs_wscores, cscluster_eval_pos,
+                     cscluster_eval_neg) = clustering._create_clusters(
+                        nb_clusters, subsamp, naming_table, natural_sample,
+                        naming_scores, initial_labels.copy(),
+                        naming_profile_table)
+                    names = cs_scores.index.values
+
+                    for nn in names:
+                        if nn in c_scores.index:
+                            cluster_scores_stab[nn] += 1 / nbtot
+                            stabdetail[samp] += 1 / lentot
+
+            stabtotal = np.mean(list(stabdetail.values()))
+            c_evals["clus_name_stability"] = stabtotal
+            c_evals["clus_name_stab_detail_samp"] = stabdetail
+            c_evals["clus_name_stab_detail_clu"] = cluster_scores_stab
+
+        self.clus_evals.append(c_evals)
+
+        return c_evals
+
+    def get_evals(self):
+        return self.clus_evals
diff --git a/cartodata/pipeline/tests/test_loader.py b/cartodata/pipeline/tests/test_loader.py
index 6496cbfc046039befa43d12c338209e9c4d80280..b9834e73ee01b31e02e910bfba4ae1dea1c03d63 100644
--- a/cartodata/pipeline/tests/test_loader.py
+++ b/cartodata/pipeline/tests/test_loader.py
@@ -34,7 +34,7 @@ class TestLoader(TestCase):
         assert isinstance(pipeline, Pipeline)
         assert pipeline.dataset.name == PIPELINE_DATASET
         assert pipeline.projection_2d is not None
-        assert pipeline.projection_2d.key == "umap"
+        assert pipeline.projection_2d.key == "tsne"
         assert pipeline.projection_nd is not None
         assert pipeline.clustering is not None
        assert pipeline.neighboring is not None
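
Reviewer note: below is a minimal usage sketch of the new scoring classes, not part of the patch. The `params` dict, paths, and score names are hypothetical; it assumes `dump_dir` and `score_dir` are `pathlib.Path` objects (the classes join paths with the `/` operator) and that `params` carries the `"id"` key required by `ScoreManager.__init__`.

from pathlib import Path

import pandas as pd

from cartodata.pipeline.scoring import ScoreClustering, ScoreFinal

# Hypothetical run parameters; "id" is required by ScoreManager.__init__.
params = {"id": "run-0", "projectionnD": "lsa", "projection2D": "tsne"}
dump_dir = Path("dumps/run-0")
score_dir = Path("scores")
score_dir.mkdir(parents=True, exist_ok=True)

df_all_scores = pd.DataFrame()

# add_score appends one row to scores/<name>.csv (indexed by the params)
# and one (<name>, value) row to df_all_scores.
clustering_scores = ScoreClustering(params, dump_dir, score_dir)
df_all_scores = clustering_scores.add_score("silhouette_0", 0.42,
                                            df_all_scores)

# ScoreFinal.calculate_score averages the named rows of df_all_scores;
# add_final_score persists the aggregate to scores/final_results.csv.
final = ScoreFinal(params, dump_dir, score_dir)
agscore = final.calculate_score(["silhouette_0"], df_all_scores)
final.add_final_score("final", agscore, df_all_scores)

ScoreClustering.calculate_score itself consumes the per-level dictionaries produced by ClusterEvaluator.evaluate, which in turn needs the pipeline's clustering objects, so it is not exercised in this sketch.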