From fc64c4ceee2216296dfbebbbe38de46b60d2e5d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 14:17:11 +0100 Subject: [PATCH 1/8] install transformers for all test --- .gitlab-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7ba95f15..f3fc1fcd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,7 @@ lint: - small before_script: - apt-get update && apt-get install -y git build-essential - - python -m pip install -e .[test] + - python -m pip install -e .[test,transformers] script: - echo "Running tests" - pytest --no-cov @@ -41,7 +41,6 @@ run_tests_3.10: image: python:3.10.13-slim-bookworm script: - echo "Running tests" - - python -m pip install -e .[transformers] - pytest --cov=cartodata --cov-report=html artifacts: paths: -- GitLab From fc3a4e862c7b2af4e67cabaf3219034dabb77f43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 15:30:30 +0100 Subject: [PATCH 2/8] implement bert projection --- cartodata/pipeline/projectionnd.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/cartodata/pipeline/projectionnd.py b/cartodata/pipeline/projectionnd.py index c4d6be3b..33c6392b 100644 --- a/cartodata/pipeline/projectionnd.py +++ b/cartodata/pipeline/projectionnd.py @@ -3,7 +3,7 @@ import logging from cartodata.operations import normalize_l2 from cartodata.pipeline.base import BaseEstimator from cartodata.projection import ( - lsa_projection, lda_projection, doc2vec_projection + lsa_projection, lda_projection, doc2vec_projection, bert_projection ) logger = logging.getLogger(__name__) @@ -16,6 +16,8 @@ def get_executor_nD(key): return LDAProjection elif key == "doc2vec": return Doc2VecProjection + elif key == "bert": + return BertProjection class ProjectionND(BaseEstimator): @@ -456,3 +458,18 @@ class Doc2VecProjection(ProjectionND): doc2vec_matrices = list(map(normalize_l2, doc2vec_matrices)) return doc2vec_matrices + + +class BertProjection(ProjectionND): + + def __init__(self, family="specter2", pt_device=None, normalize=True): + super().__init__("bert", 768, normalize) + self.family = family + self.pt_device = pt_device + + def execute(self, matrices, dataset, dump_dir): + logger.info('Starting Bert projection') + corpus = dataset.corpus + + return bert_projection(matrices, corpus, self.family, self.normalize, + self.pt_device) -- GitLab From 94cccb3111254110825b59f7e3f45adbde6d73dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 15:30:47 +0100 Subject: [PATCH 3/8] add bert notebook --- examples/pipeline_lisn_bert_kmeans.py | 549 ++++++++++++++++++++++++++ 1 file changed, 549 insertions(+) create mode 100644 examples/pipeline_lisn_bert_kmeans.py diff --git a/examples/pipeline_lisn_bert_kmeans.py b/examples/pipeline_lisn_bert_kmeans.py new file mode 100644 index 00000000..ac7fab91 --- /dev/null +++ b/examples/pipeline_lisn_bert_kmeans.py @@ -0,0 +1,549 @@ +""" +Processing LISN data with Pipeline API (BERT projection) +==================================================================== + +In this example we will process LISN (Laboratoire Interdisciplinaire des Sciences du Numérique) dataset using `Pipeline` API. LISN dataset contains all articles from HAL (https://hal.archives-ouvertes.fr/) published by authors from LISN between 2000-2022. 
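Note that running this example requires the optional `transformers` dependencies of the package to be installed (for instance with `pip install -e .[transformers]`).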
+ +The pipeline will comprise of the following steps: + +- extract entities (articles, authors, teams, labs, words) from a collection of + scientific articles +- use Bidirectional Encoder Representations from Transformers (BERT) to generate vector + representation of the entities +- use Uniform Manifold Approximation and Projection (UMAP) to project those + entities in 2 dimensions +- use KMeans clustering to cluster entities +- find their nearest neighbors. + +""" + +############################################################################### +# Create LISN Dataset +# ==================== +# +# We will first create Dataset for LISN. +# +# The CSV file containing the data can be downloaded from https://zenodo.org/record/7323538/files/lisn_2000_2022.csv . We will use version 2.0.0 of the dataset. When we specify the URL to `CSVDataset`, it will download the file if it does not exist locally. + +from cartodata.pipeline.datasets import CSVDataset # noqa +from pathlib import Path # noqa + +ROOT_DIR = Path.cwd().parent +# The directory where files necessary to load dataset columns reside +INPUT_DIR = ROOT_DIR / "datas" +# The directory where the generated dump files will be saved +TOP_DIR = ROOT_DIR / "dumps" + +dataset = CSVDataset(name="lisn", input_dir=INPUT_DIR, version="2.0.0", filename="lisn_2000_2022.csv", + fileurl="https://zenodo.org/record/7323538/files/lisn_2000_2022.csv", + columns=None, index_col=0) + +############################################################################### +# This will check if the dataset file already exists locally. If it does not, it downloads the file from the specified URL and the loads the file to a pandas Dataframe. +# +# Let's view the dataset. + +df = dataset.df + +df.head(5) + +############################################################################### +# The dataframe that we just read consists of 4262 articles as rows. + +df.shape[0] + +############################################################################### +# And their authors, abstract, keywords, title, research labs and domain as columns. + +print(*df.columns, sep="\n") + +############################################################################### +# Now we should define our entities and set the column names corresponding to those entities from the data file. We have 5 entities: +# +# | entity | column name in the file | +# ---------|-------------| +# | articles | en_title_s | +# | authors | authFullName_s | +# | teams | structAcronym_s | +# | labs | structAcronym_s | +# | words | en_abstract_s, en_title_s, en_keyword_s, en_domainAllCodeLabel_fs | +# +# +# Cartolabe provides 4 types of columns: +# +# +# - **IdentityColumn**: The entity of this column represents the main entity of the dataset. The column data corresponding to the entity in the file should contain a single value and this value should be unique among column values. There can only be one `IdentityColumn` in the dataset. +# - **CSColumn**: The entity of this column type is related to the main entity, and can contain single or comma separated values. +# - **CorpusColumn**: The entity of this column type is the corpus related to the main entity. This can be a combination of multiple columns in the file. It uses a modified version of CountVectorizer(https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer). +# - **TfidfCorpusColumn**: The entity of this column type is the corpus related to the main entity. 
This can be a combination of multiple columns in the file or can contain filepath from which to read the text corpus. It uses TfidfVectorizer (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html). +# +# +# In this dataset, **Articles** is our main entity. We will define it as IdentityColumn: + +from cartodata.pipeline.columns import IdentityColumn, CSColumn, CorpusColumn # noqa + +articles_column = IdentityColumn(nature="articles", column_name="en_title_s") + +############################################################################### +# `authFullName_s` column for entity **authors** in the dataset lists the authors who have authored each article, and has comma separated values. We will define a CSColumn: + +authors_column = CSColumn(nature="authors", column_name="authFullName_s", filter_min_score=4) + +############################################################################### +# Here we have set `filter_min_score=4` to indicate that, while processing data, authors who have authored less than 4 articles will be filtered. When it is not set, the default value is `0`, meaning that entities will not be filtered. +# +# **Teams** and **Labs** entities both use `structAcronym_s` column which also has comma separated values. `structAcronym_s` column contains both teams and labs of the articles. For teams entity we will take only teams and for labs entity we will take only labs. +# +# The file **../datas/inria-teams.csv** contains the list of Inria teams. For teams entity, we will whitelist the values from inria-teams.csv and for labs entity, we will blacklist values from inria-teams.csv. + +teams_column = CSColumn(nature="teams", column_name="structAcronym_s", whitelist="inria-teams.csv", + filter_min_score=4) + +labs_column = CSColumn(nature="labs", column_name="structAcronym_s", blacklist="inria-teams.csv", + filter_min_score=4) + +############################################################################### +# For **words** entity, we are going to use multiple columns to create a text corpus for each article: + +words_column = CorpusColumn(nature="words", + column_names=["en_abstract_s", "en_title_s", "en_keyword_s", "en_domainAllCodeLabel_fs"], + stopwords="stopwords.txt", nb_grams=4, min_df=10, max_df=0.05, + min_word_length=5, normalize=True) + +############################################################################### +# Now we are going to set the columns of the dataset: + +dataset.set_columns([articles_column, authors_column, teams_column, labs_column, words_column]) + +############################################################################### +# We can set the columns in any order that we prefer. We will set the first entity as identity entity and the last entity as the corpus. If we set the entities in a different order, the `Dataset` will put the main entity as first. +# +# The dataset for LISN data is ready. Now we will create and run our pipeline. For this pipeline, we will: +# +# - run LSA projection -> N-dimesional +# - run UMAP projection -> 2D +# - cluster entities +# - find nearest neighbors + +############################################################################### +# Create and run pipeline +# ===================== + +############################################################################### +# We will first create a pipeline with the dataset. 
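# `top_dir` is the directory where the generated dump files will be saved,
# and `input_dir` is the directory that holds the auxiliary files used by the
# columns, such as `inria-teams.csv` and `stopwords.txt`.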
+ +from cartodata.pipeline.common import Pipeline # noqa + +pipeline = Pipeline(dataset=dataset, top_dir=TOP_DIR, input_dir=INPUT_DIR, hierarchical_dirs=True) + +############################################################################### +# The workflow generates the `natures` from dataset columns. + +pipeline.natures + +############################################################################### +# Creating correspondance matrices for each entity type +# ------------------------------------------------------------------------------- +# +# From this table of articles, we want to extract matrices that will map the +# correspondance between these articles and the entities we want to use. +# +# Pipeline has `generate_entity_matrices` function to generate matrices and scores for each entity (nature) specified for the dataset. + +matrices, scores = pipeline.generate_entity_matrices(force=True) + +############################################################################### +# The order of matrices and scores correspond to the order of dataset columns specified. + +dataset.natures + +############################################################################### +# **Articles** +# +# The first matrix in matrices and Series in scores corresponds to **articles**. +# +# The type for article column is `IdentityColumn`. It generates a matrix that simply maps each article to itself. + +articles_mat = matrices[0] +articles_mat.shape + +############################################################################### +# Having type `IdentityColumn`, each article will have score 1. + +articles_scores = scores[0] +articles_scores.shape + +"" +articles_scores.head() + +############################################################################### +# **Authors** +# +# The second matrix in matrices and score in scores correspond to **authors**. +# +# The type for authors column is `CSColumn`. It generates a sparce matrix where rows correspond to articles and columns corresponds to authors. + +authors_mat = matrices[1] +authors_mat.shape + +############################################################################### +# Here we see that after filtering authors which have less than 4 articles, there are 694 distinct authors. +# +# The series, which we named `authors_scores`, contains the list of authors +# extracted from the column `authFullName_s` with a score that is equal to the +# number of rows (articles) that this value was mapped within the `authors_mat` +# matrix. + +authors_scores = scores[1] +authors_scores.head() + +############################################################################### +# If we look at the *4th* column of the matrix, which corresponds to the author +# **Ralf Treinen**, we can see that it has 5 non-zero rows, each row +# indicating which articles he authored. + +print(authors_mat[:, 3]) + +############################################################################### +# **Teams** +# +# The third matrix in matrices and score in scores correspond to **teams**. +# +# The type for teams column is `CSColumn`. It generates a sparce matrix where rows correspond to articles and columns corresponds to teams. + +teams_mat = matrices[2] +teams_mat.shape + +############################################################################### +# Here we see that after filtering teams which have less than 4 articles, there are 33 distinct teams. 
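# (Remember that the teams column was whitelisted against `inria-teams.csv`,
# so only Inria teams are counted here.)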
+# +# The series, which we named `teams_scores`, contains the list of teams +# extracted from the column `structAcronym_s` with a score that is equal to the +# number of rows (articles) that this value was mapped within the `teams_mat` +# matrix. + +teams_scores = scores[2] +teams_scores.head() + +############################################################################### +# **Labs** +# +# The fourth matrix in matrices and score in scores correspond to **labs**. +# +# The type for labs column is `CSColumn`. It generates a sparce matrix where rows correspond to articles and columns corresponds to labs. + +labs_mat = matrices[3] +labs_mat.shape + +############################################################################### +# Here we see that after filtering labs which have less than 4 articles, there are 549 distinct labs. +# +# The series, which we named `labs_scores`, contains the list of labs +# extracted from the column `structAcronym_s` with a score that is equal to the +# number of rows (articles) that this value was mapped within the `labs_mat` +# matrix. + +labs_scores = scores[3] +labs_scores.head() + +############################################################################### +# **Words** +# +# The fifth matrix in matrices and score in scores correspond to **words**. +# +# The type for words column is `CorpusColumn`. It creates a corpus merging multiple text columns in the dataset, and then extracts n-grams from that corpus. Finally it generates a sparce matrix where rows correspond to articles and columns corresponds to n-grams. + +words_mat = matrices[4] +words_mat.shape + +############################################################################### +# Here we see that there are 5226 distinct n-grams. +# +# The series, which we named `words_scores`, contains the list of n-grams +# with a score that is equal to the number of rows (articles) that this value +# was mapped within the `words_mat` matrix. + +words_scores = scores[4] +words_scores.head() + +############################################################################### +# Dimension reduction +# ------------------------------ +# +# One way to see the matrices that we created is as coordinates in the space of +# all articles. What we want to do is to reduce the dimension of this space to +# make it easier to work with and see. +# +# **Bert projection** +# +# We'll start by using the Bert technique to reduce the number of rows in our data. + +from cartodata.pipeline.projectionnd import BertProjection # noqa + + +bert_projection = BertProjection() + +pipeline.set_projection_nd(bert_projection) + +############################################################################### +# Now we can run Bert projection on the matrices. + +matrices_nD = pipeline.do_projection_nD(force=True) + +"" +for nature, matrix in zip(pipeline.natures, matrices_nD): + print(f"{nature} ------------- {matrix.shape}") + +############################################################################### +# We have 768 rows for each entity. + +############################################################################### +# We will project them on a 2D space to be able to map them. +# +# **UMAP projection** +# +# The `UMAP <https://github.com/lmcinnes/umap>`_ (Uniform Manifold Approximation +# and Projection) is a dimension reduction technique that can be used for +# visualisation similarly to t-SNE. +# +# We use this algorithm to project our matrices in 2 dimensions. 
+ +from cartodata.pipeline.projection2d import UMAPProjection # noqa + + +umap_projection = UMAPProjection(n_neighbors=15, min_dist=0.1) + +pipeline.set_projection_2d(umap_projection) + +############################################################################### +# Now we can run UMAP projection on the LSA matrices. + +matrices_2D = pipeline.do_projection_2D(force=True) + +############################################################################### +# Now that we have 2D coordinates for our points, we can try to plot them to +# get a feel of the data's shape. + +labels = tuple(pipeline.natures) +colors = ['b', 'r', 'c', 'y', 'm'] + +fig, ax = pipeline.plot_map(matrices_2D, labels, colors) + +############################################################################### +# The plot above, as we don't have labels for the points, doesn't make much sense +# as is. But we can see that the data shows some clusters which we could try to identify. +# +# Clustering +# --------------- +# +# In order to identify clusters, we use the KMeans clustering technique on the +# articles. We'll also try to label these clusters by selecting the most +# frequent words that appear in each cluster's articles. + +from cartodata.pipeline.clustering import KMeansClustering # noqa + +# level of clusters, hl: high level, ml: medium level +cluster_natures = ["hl_clusters", "ml_clusters"] + +kmeans_clustering = KMeansClustering(n=8, base_factor=3, natures=cluster_natures) + +pipeline.set_clustering(kmeans_clustering) + +############################################################################### +# Now we can run clustering on the matrices. + +(clus_nD, clus_2D, clus_scores, cluster_labels, +cluster_eval_pos, cluster_eval_neg) = pipeline.create_clusters() + +############################################################################### +# As we have specified two levels of clustering, the returned lists wil have two values. + +len(clus_2D) + +############################################################################### +# We will now display two levels of clusters in separate plots, we will start with high level clusters: + +clus_scores_hl = clus_scores[0] +clus_mat_hl = clus_2D[0] + + +fig_hl, ax_hl = pipeline.plot_map(matrices_2D, labels, colors, + title="LISN Dataset High Level Clusters", + annotations=clus_scores_hl.index, annotation_mat=clus_mat_hl) + +############################################################################### +# The 8 high level clusters that we created give us a general idea of what the big +# clusters of data contain. +# +# With medium level clusters we have a finer level of detail: + +clus_scores_ml = clus_scores[1] +clus_mat_ml = clus_2D[1] + +fig_ml, ax_ml = pipeline.plot_map(matrices_2D, labels, colors, + title="LISN Dataset Medium Level Clusters", + annotations=clus_scores_ml.index, annotation_mat=clus_mat_ml, + annotation_color='black') + +############################################################################### +# We have 24 medium level clusters. We can increase the number of clusters to have even finer details to zoom in and focus on smaller areas. +# +# Now we will save the plots in the `working_dir` directory. 
+ +pipeline.save_plots() + +"" +for file in pipeline.working_dir.glob("*.png"): + print(file) + +############################################################################### +# Nearest neighbors +# ---------------------------- +# +# One more thing which could be useful to appreciate the quality of our data +# would be to get each point's nearest neighbors. If our data processing is +# done correctly, we expect the related articles, labs, words and authors to be +# located close to each other. +# +# Finding nearest neighbors is a common task with various algorithms aiming to +# solve it. The `find_neighbors` method uses one of these algorithms to find the +# nearest points of all entities (articles, authors, teams, +# labs, words). It takes an optional weight parameter to tweak +# the distance calculation to select points that have a higher score but are +# maybe a bit farther instead of just selecting the closest neighbors. + +from cartodata.pipeline.neighbors import AllNeighbors + +n_neighbors = 10 +weights = [0, 0.5, 0.5, 0, 0] + +neighboring = AllNeighbors(n_neighbors=n_neighbors, power_scores=weights) + +pipeline.set_neighboring(neighboring) + +pipeline.find_neighbors() + + +############################################################################### +# Export file using exporter +# ======================= +# +# We can now export the data. To export the data, we need to configure the exporter. +# +# The exported data will be the points extracted from the dataset corresponding to the entities that we have defined. +# +# In the export file, we will have the following columns for each point: +# +# +# | column | value | +# ---------|-------------| +# | nature | one of articles, authors, teams, labs, words | +# | label | point's label | +# | score | point's score | +# | rank | point's rank | +# | x | point's x location on the map | +# | y | point's y location on the map | +# | nn_articles | neighboring articles to this point | +# | nn_teams | neighboring teams to this point | +# | nn_labs | neighboring labs to this point | +# | nn_words | neighboring words to this point | +# +# we will call `pipeline.export` function. It will create `export.feather` file and save under `pipeline.working_dir`. + +pipeline.export() + +############################################################################### +# Let's display the contents of the file. + +import pandas as pd # noqa + +df = pd.read_feather(pipeline.get_clus_dir() / "export.feather") +df.head() + +############################################################################### +# This is a basic export file. For each point, we can add additional columns. +# +# For example, for each author, we can add **labs** and **teams** columns to list the labs and teams that the author belongs to. We can also merge the teams and labs in one column and name it as labs. To do that we have to first create export config for the entity (nature) that we would like to modify. + +from cartodata.pipeline.exporting import ( + ExportNature, MetadataColumn +) # noqa + +ex_author = ExportNature(key="authors", + refs=["labs", "teams"], + merge_metadata=[{"columns": ["teams", "labs"], + "as_column": "labs"}]) + +############################################################################### +# We can do the same for articles. Each article will have **teams** and **labs** data, and additionally **author** of the article. So we can set `refs=["labs", "teams", "authors"]`. 
+# +# The original dataset contains a column `producedDateY_i` which contains the year that the article is published. We can add this data as metadata for the point but updating column name with a more clear alternative `year`. We can also add a function to apply to the column value. In this example we will convert column value to string. + +meta_year_article = MetadataColumn(column="producedDateY_i", as_column="year", + func="x.astype(str)") + +############################################################################### +# We will also add `halId_s` column as `url` and set empty string if the value does not exist: + +meta_url_article = MetadataColumn(column="halId_s", as_column="url", func="x.fillna('')") + +"" +ex_article = ExportNature(key="articles", refs=["labs", "teams", "authors"], + merge_metadata=[{"columns": ["teams", "labs"], + "as_column": "labs"}], + add_metadata=[meta_year_article, meta_url_article]) + +pipeline.export(export_natures=[ex_article, ex_author]) + +############################################################################### +# Now we can load the new `export.feather` file to see the difference. + +df = pd.read_feather(pipeline.get_clus_dir() / "export.feather") + +df.head() + +############################################################################### +# For the points of nature **articles**, we have additional **labs**, **authors**, **year**, **url** columns. +# +# Let's see the points of nature **authors**: + +df[df["nature"] == "authors"].head() + +############################################################################### +# We have values for labs field, but not for authors, year, or url field. +# +# As we have not defined any relation for points of natures **teams**, **labs** and **words**, these new columns are empty for those points. + +df[df["nature"] == "teams"].head() + +"" +df[df["nature"] == "labs"].head() + +"" +df[df["nature"] == "words"].head() + +"" +df['x'][1] + +############################################################################### +# Export to json file +# ------------------------------- + +############################################################################### +# We can export the data to a **json** file as well. + +export_json_file = pipeline.get_clus_dir() / 'lisn_workflow_lsa.json' + +pipeline.exporter.export_to_json(export_json_file) + +############################################################################### +# This creates the `lisn_workflow_lsa.json` file which contains a list of points +# ready to be imported into Cartolabe. Have a look at it to check that it +# contains everything. 
+ +import json # noqa + +with open(export_json_file, 'r') as f: + data = json.load(f) + +data[1] -- GitLab From d737d8738fa37811644517412b585e91756d9a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 16:19:22 +0100 Subject: [PATCH 4/8] remove duplicate notebook --- examples/compare_projections_vispubdata.ipynb | 1328 ----------------- 1 file changed, 1328 deletions(-) delete mode 100644 examples/compare_projections_vispubdata.ipynb diff --git a/examples/compare_projections_vispubdata.ipynb b/examples/compare_projections_vispubdata.ipynb deleted file mode 100644 index 0d8796cd..00000000 --- a/examples/compare_projections_vispubdata.ipynb +++ /dev/null @@ -1,1328 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0057b3e8-2bd0-45ef-9962-b551dba15256", - "metadata": {}, - "source": [ - "# Extracting and processing VisPubdata data with the Cartolabe API\n", - "Comparing the quality of embeddings using multiple methods" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c56de0ec-6dcb-4ad6-a44b-3a097e639355", - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "%matplotlib widget" - ] - }, - { - "cell_type": "markdown", - "id": "5ba3803b-606c-45ca-ab17-ec565a1c2baa", - "metadata": {}, - "source": [ - "Download data\n", - "================\n", - "\n", - "We will start by downloading the VisPubData dataset from Google Spreadsheet.\n", - "See Petra Isenberg, Florian Heimerl, Steffen Koch, Tobias Isenberg, Panpan Xu, et al.. vispubdata.org: A Metadata Collection about IEEE Visualization (VIS) Publications. IEEE Transactions on Visualization and Computer Graphics, 2017, 23 (9), pp.2199-2206. ⟨[https://dx.doi.org/10.1109/TVCG.2016.2615308](10.1109/TVCG.2016.2615308)⟩. 
⟨[https://dx.doi.org/10.1109/TVCG.2016.2615308](hal-01376597)⟩\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3f84bfe9-99fd-41d2-a7ae-18e12ad9d668", - "metadata": {}, - "outputs": [], - "source": [ - "SHEET_ID = '1xgoOPu28dQSSGPIp_HHQs0uvvcyLNdkMF9XtRajhhxU'\n", - "SHEET_NAME='Main%20dataset'\n", - "url = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={SHEET_NAME}'\n", - "\n", - "min_df = 25\n", - "max_df = 0.1\n", - "max_words = 100000\n", - "vocab_sample = 250000\n", - "num_dims = 50\n", - "filt_min_score = 3\n", - "n_neighbors = 10" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "138ce887-1d97-400a-a020-e35c0fa9a16c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>Conference</th>\n", - " <th>Year</th>\n", - " <th>Title</th>\n", - " <th>DOI</th>\n", - " <th>Link</th>\n", - " <th>FirstPage</th>\n", - " <th>LastPage</th>\n", - " <th>PaperType</th>\n", - " <th>Abstract</th>\n", - " <th>AuthorNames-Deduped</th>\n", - " <th>AuthorNames</th>\n", - " <th>AuthorAffiliation</th>\n", - " <th>InternalReferences</th>\n", - " <th>AuthorKeywords</th>\n", - " <th>AminerCitationCount</th>\n", - " <th>CitationCount_CrossRef</th>\n", - " <th>PubsCited_CrossRef</th>\n", - " <th>Award</th>\n", - " <th>text</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>0</th>\n", - " <td>Vis</td>\n", - " <td>2022</td>\n", - " <td>Photosensitive Accessibility for Interactive D...</td>\n", - " <td>10.1109/TVCG.2022.3209359</td>\n", - " <td>http://dx.doi.org/10.1109/TVCG.2022.3209359</td>\n", - " <td>374.0</td>\n", - " <td>384.0</td>\n", - " <td>J</td>\n", - " <td>Accessibility guidelines place restrictions on...</td>\n", - " <td>Laura South;Michelle Borkin</td>\n", - " <td>Laura South;Michelle A. 
Borkin</td>\n", - " <td>Northeastern University, USA;Northeastern Univ...</td>\n", - " <td>10.1109/TVCG.2011.185;10.1109/TVCG.2021.311482...</td>\n", - " <td>accessibility,photosensitive epilepsy,photosen...</td>\n", - " <td>NaN</td>\n", - " <td>1.0</td>\n", - " <td>63.0</td>\n", - " <td>NaN</td>\n", - " <td>Accessibility guidelines place restrictions on...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1</th>\n", - " <td>Vis</td>\n", - " <td>2022</td>\n", - " <td>HetVis: A Visual Analysis Approach for Identif...</td>\n", - " <td>10.1109/TVCG.2022.3209347</td>\n", - " <td>http://dx.doi.org/10.1109/TVCG.2022.3209347</td>\n", - " <td>310.0</td>\n", - " <td>319.0</td>\n", - " <td>J</td>\n", - " <td>Horizontal federated learning (HFL) enables di...</td>\n", - " <td>Xumeng Wang;Wei Chen 0001;Jiazhi Xia;Zhen Wen;...</td>\n", - " <td>Xumeng Wang;Wei Chen;Jiazhi Xia;Zhen Wen;Rongc...</td>\n", - " <td>TMCC, CS, Nankai University, China;State Key L...</td>\n", - " <td>10.1109/TVCG.2015.2467618;10.1109/TVCG.2019.29...</td>\n", - " <td>Federated learning,data heterogeneity,cluster ...</td>\n", - " <td>NaN</td>\n", - " <td>3.0</td>\n", - " <td>43.0</td>\n", - " <td>NaN</td>\n", - " <td>Horizontal federated learning (HFL) enables di...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>2</th>\n", - " <td>Vis</td>\n", - " <td>2022</td>\n", - " <td>Rigel: Transforming Tabular Data by Declarativ...</td>\n", - " <td>10.1109/TVCG.2022.3209385</td>\n", - " <td>http://dx.doi.org/10.1109/TVCG.2022.3209385</td>\n", - " <td>128.0</td>\n", - " <td>138.0</td>\n", - " <td>J</td>\n", - " <td>We present Rigel, an interactive system for ra...</td>\n", - " <td>Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay...</td>\n", - " <td>Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay...</td>\n", - " <td>State Key Lab of CAD&CG, Zhejiang University, ...</td>\n", - " <td>10.1109/TVCG.2021.3114830;10.1109/VAST47406.20...</td>\n", - " <td>Data transformation,self-service data transfor...</td>\n", - " <td>NaN</td>\n", - " <td>3.0</td>\n", - " <td>68.0</td>\n", - " <td>NaN</td>\n", - " <td>We present Rigel, an interactive system for ra...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3</th>\n", - " <td>Vis</td>\n", - " <td>2022</td>\n", - " <td>BeauVis: A Validated Scale for Measuring the A...</td>\n", - " <td>10.1109/TVCG.2022.3209390</td>\n", - " <td>http://dx.doi.org/10.1109/TVCG.2022.3209390</td>\n", - " <td>363.0</td>\n", - " <td>373.0</td>\n", - " <td>J</td>\n", - " <td>We developed and validated a rating scale to a...</td>\n", - " <td>Tingying He;Petra Isenberg;Raimund Dachselt;To...</td>\n", - " <td>Tingying He;Petra Isenberg;Raimund Dachselt;To...</td>\n", - " <td>Université Paris-Saclay, CNRS, Inria, LISN, Fr...</td>\n", - " <td>10.1109/INFVIS.2005.1532128;10.1109/TVCG.2006....</td>\n", - " <td>Aesthetics,aesthetic pleasure,validated scale,...</td>\n", - " <td>NaN</td>\n", - " <td>1.0</td>\n", - " <td>79.0</td>\n", - " <td>NaN</td>\n", - " <td>We developed and validated a rating scale to a...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>4</th>\n", - " <td>Vis</td>\n", - " <td>2022</td>\n", - " <td>NAS-Navigator: Visual Steering for Explainable...</td>\n", - " <td>10.1109/TVCG.2022.3209361</td>\n", - " <td>http://dx.doi.org/10.1109/TVCG.2022.3209361</td>\n", - " <td>299.0</td>\n", - " <td>309.0</td>\n", - " <td>J</td>\n", - " <td>The success of DL can be attributed to hours o...</td>\n", - " <td>Anjul Tyagi;Cong Xie;Klaus Mueller 0001</td>\n", - " <td>Anjul Tyagi;Cong Xie;Klaus Mueller</td>\n", - " <td>Computer Science Department, Visual 
Analytics ...</td>\n", - " <td>10.1109/VAST.2012.6400490;10.1109/TVCG.2019.29...</td>\n", - " <td>Deep Learning,Neural Network Architecture Sear...</td>\n", - " <td>NaN</td>\n", - " <td>0.0</td>\n", - " <td>63.0</td>\n", - " <td>NaN</td>\n", - " <td>The success of DL can be attributed to hours o...</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "</div>" - ], - "text/plain": [ - " Conference Year Title \\\n", - "0 Vis 2022 Photosensitive Accessibility for Interactive D... \n", - "1 Vis 2022 HetVis: A Visual Analysis Approach for Identif... \n", - "2 Vis 2022 Rigel: Transforming Tabular Data by Declarativ... \n", - "3 Vis 2022 BeauVis: A Validated Scale for Measuring the A... \n", - "4 Vis 2022 NAS-Navigator: Visual Steering for Explainable... \n", - "\n", - " DOI Link \\\n", - "0 10.1109/TVCG.2022.3209359 http://dx.doi.org/10.1109/TVCG.2022.3209359 \n", - "1 10.1109/TVCG.2022.3209347 http://dx.doi.org/10.1109/TVCG.2022.3209347 \n", - "2 10.1109/TVCG.2022.3209385 http://dx.doi.org/10.1109/TVCG.2022.3209385 \n", - "3 10.1109/TVCG.2022.3209390 http://dx.doi.org/10.1109/TVCG.2022.3209390 \n", - "4 10.1109/TVCG.2022.3209361 http://dx.doi.org/10.1109/TVCG.2022.3209361 \n", - "\n", - " FirstPage LastPage PaperType \\\n", - "0 374.0 384.0 J \n", - "1 310.0 319.0 J \n", - "2 128.0 138.0 J \n", - "3 363.0 373.0 J \n", - "4 299.0 309.0 J \n", - "\n", - " Abstract \\\n", - "0 Accessibility guidelines place restrictions on... \n", - "1 Horizontal federated learning (HFL) enables di... \n", - "2 We present Rigel, an interactive system for ra... \n", - "3 We developed and validated a rating scale to a... \n", - "4 The success of DL can be attributed to hours o... \n", - "\n", - " AuthorNames-Deduped \\\n", - "0 Laura South;Michelle Borkin \n", - "1 Xumeng Wang;Wei Chen 0001;Jiazhi Xia;Zhen Wen;... \n", - "2 Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay... \n", - "3 Tingying He;Petra Isenberg;Raimund Dachselt;To... \n", - "4 Anjul Tyagi;Cong Xie;Klaus Mueller 0001 \n", - "\n", - " AuthorNames \\\n", - "0 Laura South;Michelle A. Borkin \n", - "1 Xumeng Wang;Wei Chen;Jiazhi Xia;Zhen Wen;Rongc... \n", - "2 Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay... \n", - "3 Tingying He;Petra Isenberg;Raimund Dachselt;To... \n", - "4 Anjul Tyagi;Cong Xie;Klaus Mueller \n", - "\n", - " AuthorAffiliation \\\n", - "0 Northeastern University, USA;Northeastern Univ... \n", - "1 TMCC, CS, Nankai University, China;State Key L... \n", - "2 State Key Lab of CAD&CG, Zhejiang University, ... \n", - "3 Université Paris-Saclay, CNRS, Inria, LISN, Fr... \n", - "4 Computer Science Department, Visual Analytics ... \n", - "\n", - " InternalReferences \\\n", - "0 10.1109/TVCG.2011.185;10.1109/TVCG.2021.311482... \n", - "1 10.1109/TVCG.2015.2467618;10.1109/TVCG.2019.29... \n", - "2 10.1109/TVCG.2021.3114830;10.1109/VAST47406.20... \n", - "3 10.1109/INFVIS.2005.1532128;10.1109/TVCG.2006.... \n", - "4 10.1109/VAST.2012.6400490;10.1109/TVCG.2019.29... \n", - "\n", - " AuthorKeywords AminerCitationCount \\\n", - "0 accessibility,photosensitive epilepsy,photosen... NaN \n", - "1 Federated learning,data heterogeneity,cluster ... NaN \n", - "2 Data transformation,self-service data transfor... NaN \n", - "3 Aesthetics,aesthetic pleasure,validated scale,... NaN \n", - "4 Deep Learning,Neural Network Architecture Sear... 
NaN \n", - "\n", - " CitationCount_CrossRef PubsCited_CrossRef Award \\\n", - "0 1.0 63.0 NaN \n", - "1 3.0 43.0 NaN \n", - "2 3.0 68.0 NaN \n", - "3 1.0 79.0 NaN \n", - "4 0.0 63.0 NaN \n", - "\n", - " text \n", - "0 Accessibility guidelines place restrictions on... \n", - "1 Horizontal federated learning (HFL) enables di... \n", - "2 We present Rigel, an interactive system for ra... \n", - "3 We developed and validated a rating scale to a... \n", - "4 The success of DL can be attributed to hours o... " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd # noqa\n", - "\n", - "df = pd.read_csv(url)\n", - "df.AuthorKeywords.fillna('', inplace=True)\n", - "df.Abstract.fillna('', inplace=True)\n", - "df.AuthorAffiliation.fillna('', inplace=True)\n", - "df['text'] = df.Abstract + ' ' \\\n", - " + df.AuthorKeywords + ' ' \\\n", - " + df.Title\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "57fe4607-9e4f-4b40-94b3-25296f4b1f0b", - "metadata": {}, - "source": [ - "Creating correspondance matrices for each entity type\n", - "================================================================\n", - "\n", - "From this table of articles, we want to extract matrices that will map the\n", - "correspondance between these articles and the entities we want to use." - ] - }, - { - "cell_type": "markdown", - "id": "9b93b75e-8312-42bf-b0ad-39a91be51f92", - "metadata": {}, - "source": [ - "Authors\n", - "--------------\n", - "\n", - "Let's start with the authors for example. We want to create a matrix where\n", - "the rows represent the articles and the columns represent the authors. Each\n", - "cell (n, m) will have a 1 in it if the *nth* article was written by the *mth*\n", - "author.\n", - "\n", - "As we have multiple dataframes, the results will be arrays corresponding to specified dataframes." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6ffddb28-0b8d-4853-972a-378a2c40c284", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3620, 6608)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from cartodata.loading import load_comma_separated_column # noqa\n", - "\n", - "authors_mat, authors_scores = load_comma_separated_column(df, 'AuthorNames-Deduped', comma=';')\n", - "authors_mat.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "0becb75d-5b08-481d-8cd6-65b7b99c78f8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Laura South 2\n", - "Michelle Borkin 8\n", - "Xumeng Wang 6\n", - "Wei Chen 0001 42\n", - "Jiazhi Xia 13\n", - "dtype: int64" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "authors_scores.head()" - ] - }, - { - "cell_type": "markdown", - "id": "19101047-0c99-46fd-be1a-f33faa94b1de", - "metadata": {}, - "source": [ - "If we look at the *2nd* column of the matrix, which corresponds to the author\n", - "**Michelle Borkin**, we can see that she has 8 non-zero rows, each row\n", - "indicating which articles she authored." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "0e57ff10-8e60-4185-ba89-8ff4345d6da9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (0, 0)\t1\n", - " (79, 0)\t1\n", - " (337, 0)\t1\n", - " (485, 0)\t1\n", - " (871, 0)\t1\n", - " (1164, 0)\t1\n", - " (1203, 0)\t1\n", - " (1411, 0)\t1\n" - ] - } - ], - "source": [ - "print(authors_mat[:, 1])" - ] - }, - { - "cell_type": "markdown", - "id": "8e955285-4929-4f2c-b2bd-28292d02afa4", - "metadata": {}, - "source": [ - "Filtering low score entities\n", - "---------------------------------------\n", - "\n", - "A lot of the authors that we just extracted from the dataframe have\n", - "a very low score, which means they're only linked to one or two articles. To\n", - "improve the quality of our data, we'll filter the authors by\n", - "removing those that appear less than 3 times.\n", - "\n", - "To do this, we'll use the `filter_min_score` function." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "14964472-c58d-45d2-97af-965d1f36d343", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removed 5912 authors with less than 3 articles from a total of 6608 authors.\n", - "Working with 696 authors.\n", - "\n" - ] - } - ], - "source": [ - "from cartodata.operations import filter_min_score # noqa\n", - "\n", - "authors_before = len(authors_scores)\n", - "\n", - "authors_mat, authors_scores = filter_min_score(authors_mat,\n", - " authors_scores,\n", - " filt_min_score)\n", - "\n", - "print(f\"Removed {authors_before - len(authors_scores)} authors with less \"\n", - " f\"than 3 articles from a total of {authors_before} authors.\")\n", - "print(f\"Working with {len(authors_scores)} authors.\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "id": "489fc4e8-40dc-4486-a150-7c9586b3c769", - "metadata": {}, - "source": [ - "Words\n", - "------------\n", - "\n", - "For the words, it's a bit trickier because we want to extract n-grams (groups\n", - "of n terms) instead of just comma separated values. We'll call the\n", - "`load_text_column` which uses scikit-learn's\n", - "`CountVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_\n", - "to create a vocabulary and map the tokens." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "30afcbf1-550d-4f9f-9b04-53e231c4a423", - "metadata": {}, - "outputs": [], - "source": [ - "from cartodata.loading import load_text_column # noqa\n", - "from sklearn.feature_extraction import text as sktxt # noqa\n", - "\n", - "with open('../datas/stopwords.txt', 'r') as stop_file:\n", - " stopwords = sktxt.ENGLISH_STOP_WORDS.union(\n", - " set(stop_file.read().splitlines()))\n", - "\n", - "words_mat, words_scores = load_text_column(df['text'],\n", - " 4,\n", - " min_df,\n", - " max_df,\n", - " stopwords=stopwords)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f8b4ec3e-e2fa-4c68-ad2b-cfeaf90a450c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ability 176\n", - "abstract 107\n", - "abstraction 95\n", - "abstractions 40\n", - "accelerate 36\n", - "dtype: int64" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "words_scores.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "df590c17-53af-4a17-abd2-e09316249ed8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3620, 1902)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "words_mat.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "6501c67c-dba8-41ec-9aa8-2006f91c49f2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3620, 1902)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from cartodata.operations import normalize_tfidf # noqa\n", - "\n", - "words_mat = normalize_tfidf(words_mat)\n", - "words_mat.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "888bfe4e-f081-4013-9d86-d0b259e45f61", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Photosensitive Accessibility for Interactive Data Visualizations 1.0\n", - "HetVis: A Visual Analysis Approach for Identifying Data Heterogeneity in Horizontal Federated Learning 1.0\n", - "Rigel: Transforming Tabular Data by Declarative Mapping 1.0\n", - "BeauVis: A Validated Scale for Measuring the Aesthetic Pleasure of Visual Representations 1.0\n", - "NAS-Navigator: Visual Steering for Explainable One-Shot Deep Neural Network Synthesis 1.0\n", - "dtype: float64" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from cartodata.loading import load_identity_column # noqa\n", - "\n", - "articles_mat, articles_scores = load_identity_column(df, 'Title')\n", - "articles_scores.head()" - ] - }, - { - "cell_type": "markdown", - "id": "090d2a9d-b9a3-411d-8c37-322393f51d42", - "metadata": {}, - "source": [ - "Dimension reduction/Embeddings\n", - "==============================\n", - "\n", - "One way to see the matrices that we created is as coordinates in the space of\n", - "all articles. 
What we want to do is to reduce the dimension of this space to\n", - "make it easier to work with and see.\n", - "\n", - "Validation\n", - "----------\n", - "\n", - "We compute a score that counts the average number of times the 10 nearest neighbors of an article are from the same author as the article.\n", - "For each author, we have a number between 1 (100%) and 0.1 (none of the articles are from the same author, except the initial article itself).\n", - "\n", - "LSA projection\n", - "-------------------------\n", - "\n", - "We'll start by using the LSA (Latent Semantic Analysis) technique to identify\n", - "keywords in our data and thus reduce the number of rows in our matrices. The\n", - "`lsa_projection` method takes three arguments:\n", - "\n", - "- the number of dimensions you want to keep\n", - "- the matrix of documents/words frequency\n", - "- a list of matrices to project\n", - "\n", - "It returns a list of the same length containing the matrices projected in the\n", - "latent space.\n", - "\n", - "We also apply an l2 normalization to each feature of the projected matrices." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d9468b90-b137-46cf-8dd7-0f54ba721647", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 835 ms, sys: 2.7 s, total: 3.54 s\n", - "Wall time: 237 ms\n" - ] - } - ], - "source": [ - "from cartodata.projection import lsa_projection # noqa\n", - "from cartodata.operations import normalize_l2 # noqa\n", - "from cartodata.scoring import score_projection\n", - "\n", - "%time lsa_matrices = lsa_projection(num_dims, words_mat, [articles_mat, authors_mat, words_mat])\n", - "lsa_matrices = list(map(normalize_l2, lsa_matrices))" - ] - }, - { - "cell_type": "markdown", - "id": "d699cda1-ded6-4664-ace1-97bc80397c8f", - "metadata": {}, - "source": [ - "We've reduced the number of rows in each of `articles_mat`, `authors_mat`,\n", - "`words_mat` and `labs_mat` to just 80." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "603cdd83-a86c-4179-b984-e8dbb921aa60", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (50, 3620)\n", - "authors_mat: (50, 696)\n", - "words_mat: (50, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {lsa_matrices[0].shape}\")\n", - "print(f\"authors_mat: {lsa_matrices[1].shape}\")\n", - "print(f\"words_mat: {lsa_matrices[2].shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "06573504-5315-448b-8113-6964021c1ed9", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LSA score: 0.13522117589072855\n" - ] - } - ], - "source": [ - "lsa_score = score_projection(lsa_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"LSA score:\", lsa_score[0])" - ] - }, - { - "cell_type": "markdown", - "id": "8f5f1db6-51d0-4214-bf6a-0e8329d7123d", - "metadata": {}, - "source": [ - "LDA projection\n", - "-------------------------" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "ad19b962-2bc2-43c5-8f53-7b84139ec753", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 8.21 s, sys: 350 ms, total: 8.56 s\n", - "Wall time: 8.12 s\n" - ] - } - ], - "source": [ - "from cartodata.projection import lda_projection # noqa\n", - "\n", - "%time lda_matrices = lda_projection(num_dims, 1, [articles_mat, authors_mat, words_mat])\n", - "lda_matrices = list(map(normalize_l2, lda_matrices))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "70e333b9-f2c3-47ab-8911-5012850a67ec", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (50, 3620)\n", - "authors_mat: (50, 696)\n", - "words_mat: (50, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {lda_matrices[0].shape}\")\n", - "print(f\"authors_mat: {lda_matrices[1].shape}\")\n", - "print(f\"words_mat: {lda_matrices[2].shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "795a2309-1c86-4a04-b476-23293a939415", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LDA score: 0.004340582092816129\n" - ] - } - ], - "source": [ - "lda_score = score_projection(lda_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"LDA score:\", lda_score[0])" - ] - }, - { - "cell_type": "markdown", - "id": "705f785d-1bc3-4cc5-a3f9-c21f54bfad80", - "metadata": {}, - "source": [ - "DOC2Vec projection\n", - "-------------------------" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "2a5c612a-f369-455a-945d-dbe1d8b064fd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 7.02 s, sys: 1.05 s, total: 8.07 s\n", - "Wall time: 4.98 s\n" - ] - } - ], 
- "source": [ - "from cartodata.projection import doc2vec_projection # noqa\n", - "\n", - "%time doc2vec_matrices = doc2vec_projection(num_dims, 1, [articles_mat, authors_mat, words_mat], df['text'])\n", - "doc2vec_matrices = list(map(normalize_l2, doc2vec_matrices))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "73564507-0120-4394-8912-19022fda3c33", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (50, 3620)\n", - "authors_mat: (50, 696)\n", - "words_mat: (50, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {doc2vec_matrices[0].shape}\")\n", - "print(f\"authors_mat: {doc2vec_matrices[1].shape}\")\n", - "print(f\"words_mat: {doc2vec_matrices[2].shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "df7558af-bff9-43cd-b69b-eb93d6cb39b4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Doc2Vec score: 0.12074700936029183\n" - ] - } - ], - "source": [ - "doc2vec_score = score_projection(doc2vec_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"Doc2Vec score:\", doc2vec_score[0])" - ] - }, - { - "cell_type": "markdown", - "id": "a9e272be-4c18-4295-8e30-df0dd2c678a8", - "metadata": {}, - "source": [ - "Specter2 projection\n", - "-------------------------" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "623c35a2-96c4-499c-a4a6-221fd173bf47", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "aa7e2afc90d94614960cd1ac8b00ce3f", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5c03a4ff7e974a8ab9aa8c8f23e2c610", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing batches: 0%| | 0/362 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 23s, sys: 1.76 s, total: 3min 24s\n", - "Wall time: 2min 48s\n" - ] - } - ], - "source": [ - "from cartodata.projection import bert_projection # noqa\n", - "\n", - "%time specter2_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'])\n", - "specter2_matrices = list(map(normalize_l2, specter2_matrices))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "0b0922e3-cc1f-4662-9cb8-22afdb6b0acf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (768, 3620)\n", - "authors_mat: (768, 696)\n", - "words_mat: (768, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {specter2_matrices[0].shape}\")\n", - "print(f\"authors_mat: {specter2_matrices[1].shape}\")\n", - "print(f\"words_mat: {specter2_matrices[2].shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "a02439ec-9023-4081-899a-fd4363a20f35", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 
80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Specter2 score: 0.16743185811171973\n" - ] - } - ], - "source": [ - "specter2_score = score_projection(specter2_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"Specter2 score:\", specter2_score[0])" - ] - }, - { - "cell_type": "markdown", - "id": "39906a7b-841a-4253-ab13-228968d4e948", - "metadata": {}, - "source": [ - "Scincl projection\n", - "-------------------------" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "33eb15e4-873c-4da0-9ac3-c721267030a4", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8ffd14c07e0f44f8846243a165506942", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing batches: 0%| | 0/362 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 22s, sys: 1.48 s, total: 3min 23s\n", - "Wall time: 2min 45s\n" - ] - } - ], - "source": [ - "%time scincl_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family=\"scincl\")\n", - "scincl_matrices = list(map(normalize_l2, scincl_matrices))" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "ba4704cd-8b53-49b7-b192-ad66a5e7b24e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (768, 3620)\n", - "authors_mat: (768, 696)\n", - "words_mat: (768, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {scincl_matrices[0].shape}\")\n", - "print(f\"authors_mat: {scincl_matrices[1].shape}\")\n", - "print(f\"words_mat: {scincl_matrices[2].shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "5466f82a-e9a8-407d-993e-de077a386ae2", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Scincl score: 0.1592984953690405\n" - ] - } - ], - "source": [ - "scincl_score = score_projection(scincl_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"Scincl score:\", scincl_score[0])" - ] - }, - { - "cell_type": "markdown", - "id": "8e50c84e-a0d9-4b06-938f-f8c33317c0e6", - "metadata": {}, - "source": [ - "\"all-MiniLM-L6-v2\" projection\n", - "-------------------------" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "3b8de0fb-ac51-4c54-bdbc-af32113028f8", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5d36d950d1c64ed8ad57a5025638eb67", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing batches: 0%| | 0/362 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1min 17s, sys: 1.35 s, total: 1min 19s\n", - "Wall time: 41.4 s\n" - ] - } - ], - "source": [ - "%time minilm_matrices = bert_projection([articles_mat, authors_mat, words_mat], 
df['text'], family=\"all-MiniLM-L6-v2\")\n", - "minilm_matrices = list(map(normalize_l2, minilm_matrices))" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "9e562af9-9bde-4f25-81cb-faee4ade8794", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (384, 3620)\n", - "authors_mat: (384, 696)\n", - "words_mat: (384, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {minilm_matrices[0].shape}\")\n", - "print(f\"authors_mat: {minilm_matrices[1].shape}\")\n", - "print(f\"words_mat: {minilm_matrices[2].shape}\")" - ] - }, - { - "cell_type": "markdown", - "id": "257b2246-2670-4313-8f07-e06246dafdca", - "metadata": {}, - "source": [ - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "b61cbd38-be29-41f8-9b26-6323074eb924", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "all-MiniLM-L6-v2 score: 0.16517116288290376\n" - ] - } - ], - "source": [ - "minilm_score = score_projection(minilm_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"all-MiniLM-L6-v2 score:\", minilm_score[0])" - ] - }, - { - "cell_type": "markdown", - "id": "b79f09e1-01be-4f9e-aba4-6fb815a32589", - "metadata": {}, - "source": [ - "\"all-mpnet-base-v2\" projection\n", - "-------------------------" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "4002c624-8ad8-42f7-b806-957a10f78e9b", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b8e3b6f7f7054ea3873b4a53c5793bb2", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Processing batches: 0%| | 0/362 [00:00<?, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 4min 48s, sys: 1.63 s, total: 4min 50s\n", - "Wall time: 3min 57s\n" - ] - } - ], - "source": [ - "%time mpnet_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family=\"all-mpnet-base-v2\")\n", - "mpnet_matrices = list(map(normalize_l2, mpnet_matrices))" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "bfd4a538-d81a-4b61-a279-a10dd8a50987", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "articles_mat: (768, 3620)\n", - "authors_mat: (768, 696)\n", - "words_mat: (768, 1902)\n" - ] - } - ], - "source": [ - "print(f\"articles_mat: {mpnet_matrices[0].shape}\")\n", - "print(f\"authors_mat: {mpnet_matrices[1].shape}\")\n", - "print(f\"words_mat: {mpnet_matrices[2].shape}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "cc9c485c-b9e5-49e4-aae9-25b3da16cbaf", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "0% 10 20 30 40 50 60 70 80 90 100%\n", - "|----|----|----|----|----|----|----|----|----|----|\n", - "***************************************************\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "all-mpnet-base-v2 score: 0.1673046071197\n" - ] - } - ], - "source": [ - "mpnet_score = score_projection(mpnet_matrices[0], articles_scores, 
authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors)\n", - "print(\"all-mpnet-base-v2 score:\", mpnet_score[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7012a302-6462-42ac-a9fc-983b3a3679fa", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,auto:percent" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- GitLab From 6fde809a310d3b56ee6529a0a2f6c2c122e45a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 16:22:00 +0100 Subject: [PATCH 5/8] use new scoring API and remove duplicate scoring file --- cartodata/scoring.py | 49 ---------------- examples/compare_projections_vispubdata.py | 65 ++++++++++++++++------ 2 files changed, 48 insertions(+), 66 deletions(-) delete mode 100644 cartodata/scoring.py diff --git a/cartodata/scoring.py b/cartodata/scoring.py deleted file mode 100644 index 7851c1e4..00000000 --- a/cartodata/scoring.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Simple scoring function -""" -import numpy as np - -from cartodata.neighbors import get_neighbors - - -def score_projection( - article_vectors, - article_scores, - author_to_articles, - author_scores, - min_score=20, - n_neighbors=10 -): - matrix_nn = get_neighbors( - article_vectors, - article_scores, - [article_vectors], - n_neighbors=n_neighbors - )[0] - - use = author_scores >= min_score - nature_source = author_to_articles.T.tolil().rows - - nntransp = matrix_nn.T - numb_nn = nntransp.shape[1] - - isin = [np.sum([ - nature in nature_source[source_index] for nature in nntransp[ - nature_source[source_index], :].flatten() - ]) for source_index in range(len(author_scores))] - - # TODO: if the author has less articles than n_neighbors, - # the score is pessimistic - maxsum = [ - author_scores.iloc[ - source_index - ] * numb_nn for source_index in range(len(author_scores)) - ] - - rate = np.divide(isin, maxsum) - filt = use * rate - mean = np.sum(filt) / np.sum(use) - - scores = filt[use] - - return mean, scores diff --git a/examples/compare_projections_vispubdata.py b/examples/compare_projections_vispubdata.py index e6316122..5aa120cf 100644 --- a/examples/compare_projections_vispubdata.py +++ b/examples/compare_projections_vispubdata.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.16.0 +# jupytext_version: 1.16.1 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -206,8 +206,17 @@ print(f"authors_mat: {lsa_matrices[1].shape}") print(f"words_mat: {lsa_matrices[2].shape}") # %% -lsa_score = score_projection(lsa_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("LSA score:", lsa_score[0]) +from cartodata.model_selection.scoring import Neighbors # noqa + +NATURE = "articles" +SOURCE = "authors" + +lsa_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, matrix_nature_xD=lsa_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +lsa_score.print() # 
%% [markdown] # LDA projection @@ -225,8 +234,12 @@ print(f"authors_mat: {lda_matrices[1].shape}") print(f"words_mat: {lda_matrices[2].shape}") # %% -lda_score = score_projection(lda_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("LDA score:", lda_score[0]) +lda_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, matrix_nature_xD=lda_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +lda_score.print() # %% [markdown] # DOC2Vec projection @@ -244,8 +257,12 @@ print(f"authors_mat: {doc2vec_matrices[1].shape}") print(f"words_mat: {doc2vec_matrices[2].shape}") # %% -doc2vec_score = score_projection(doc2vec_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("Doc2Vec score:", doc2vec_score[0]) +doc2vec_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, matrix_nature_xD=doc2vec_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +doc2vec_score2.print() # %% [markdown] # Specter2 projection @@ -263,8 +280,12 @@ print(f"authors_mat: {specter2_matrices[1].shape}") print(f"words_mat: {specter2_matrices[2].shape}") # %% -specter2_score = score_projection(specter2_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("Specter2 score:", specter2_score[0]) +specter2_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, matrix_nature_xD=specter2_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +specter2_score.print() # %% [markdown] # Scincl projection @@ -280,8 +301,12 @@ print(f"authors_mat: {scincl_matrices[1].shape}") print(f"words_mat: {scincl_matrices[2].shape}") # %% -scincl_score = score_projection(scincl_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("Scincl score:", scincl_score[0]) +scincl_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, matrix_nature_xD=scincl_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +scincl_score.print() # %% [markdown] # "all-MiniLM-L6-v2" projection @@ -301,8 +326,12 @@ print(f"words_mat: {minilm_matrices[2].shape}") # # %% -minilm_score = score_projection(minilm_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("all-MiniLM-L6-v2 score:", minilm_score[0]) +minilm_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, matrix_nature_xD=minilm_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +minilm_score.print() # %% [markdown] # "all-mpnet-base-v2" projection @@ -318,7 +347,9 @@ print(f"authors_mat: {mpnet_matrices[1].shape}") print(f"words_mat: {mpnet_matrices[2].shape}") # %% -mpnet_score = score_projection(mpnet_matrices[0], articles_scores, authors_mat, authors_scores, min_score=filt_min_score, n_neighbors=n_neighbors) -print("all-mpnet-base-v2 score:", mpnet_score[0]) - -# %% +mpnet_score = Neighbors.evaluate( + NATURE, SOURCE, authors_mat, authors_scores, dir_xD=".", + scores_nature=articles_scores, 
matrix_nature_xD=mpnet_matrices[0], + min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True +) +mpnet_score.print() -- GitLab From 30f5ab4a9d6685a503a9d89576f7c2d7b1880f2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 16:25:57 +0100 Subject: [PATCH 6/8] convert directories to Path --- cartodata/model_selection/scoring.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/cartodata/model_selection/scoring.py b/cartodata/model_selection/scoring.py index 24c10c14..0bd975bd 100644 --- a/cartodata/model_selection/scoring.py +++ b/cartodata/model_selection/scoring.py @@ -1,4 +1,5 @@ import logging +from pathlib import Path import gzip import numpy as np @@ -52,7 +53,7 @@ class Neighbors(ScoringBase): the base nature of calculation. source: str the source nature to calculate the score for. - dir_mat: Path + dir_mat: str, Path the path of the directory that contains the entity matrices. dumps_dir: Path the directory that contains x-dimensional projection matrices. @@ -67,6 +68,9 @@ class Neighbors(ScoringBase): evals: Evaluation Returns the calculated scores and descriptive scores. """ + dir_mat = Path(dir_mat) + dir_xD = Path(dir_xD) + scores_source = load_scores([source], dir_mat)[0] matrix_source = load_matrices_from_dumps([source], "mat", dir_mat)[0] @@ -123,6 +127,7 @@ class Neighbors(ScoringBase): scores_nature=None, matrix_nature_xD=None, min_score=20, recompute=True, sample_size=None, n_neighbors=10, random_state=42): + dir_xD = Path(dir_xD) logger.info(f"Calculating scores for {cls.KEY}_{nature}_{source}.") @@ -135,7 +140,12 @@ class Neighbors(ScoringBase): "random_state": random_state }) - assert neighbors_filename is not None + if neighbors_filename is None: + neighbors_filename = ( + dir_xD / NEIGHBORS_FILENAME_FORMAT.format( + nature, nature + ) + ) if recompute: assert (dir_xD is not None and scores_nature is not None and @@ -260,6 +270,9 @@ class Trustworthiness(ScoringBase): def load_and_evaluate(cls, natures, key_nD, key_2D, dir_nD, dir_2D, n_neighbors=5, metric="euclidean"): + dir_nD = Path(dir_nD) + dir_2D = Path(dir_2D) + matrices_nD = load_matrices_from_dumps(natures, key_nD, dir_nD) matrices_2D = load_matrices_from_dumps(natures, key_2D, dir_2D) @@ -333,6 +346,11 @@ class Clustering(ScoringBase): dir_clus, iter_stab=2, remove_stab=[0, .01, .03, .1, .25], metric='euclidean', random_state=None): + dir_mat = Path(dir_mat) + dir_nD = Path(dir_nD) + dir_2D = Path(dir_2D) + dir_clus = Path(dir_clus) + # load necessary matrices and scores matrices = load_matrices_from_dumps(natures, "mat", dir_mat) scores = load_scores(natures, dir_mat) -- GitLab From 531bf3adee66830cf57b165561f7d7e695588fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 16:49:45 +0100 Subject: [PATCH 7/8] increase size of ci machine for tests --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f3fc1fcd..191d8a89 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -23,7 +23,7 @@ lint: stage: test tags: - ci.inria.fr - - small + - medium before_script: - apt-get update && apt-get install -y git build-essential - python -m pip install -e .[test,transformers] -- GitLab From 663e56d2fbbb2066944f3b89008b78adef6e68ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hande=20G=C3=B6z=C3=BCkan?= <hande@ginkgo.work> Date: Thu, 15 Feb 2024 17:18:38 +0100 Subject: [PATCH 8/8] install 
transformers for pages as well --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 191d8a89..9d6267e8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -80,7 +80,7 @@ pages: - apt-get update && apt-get install -y make git build-essential script: - echo "Generating Docs" - - pip install -e .[doc] + - pip install -e .[doc,transformers] - mkdir -p dumps/lisn - cd docs && make html - mv build/html/ ../public/ -- GitLab
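The scoring changes in PATCH 5/8 repeat one pattern for every embedding family in examples/compare_projections_vispubdata.py: project the entity matrices, L2-normalize the result, then score the article projection against the author/article links with Neighbors.evaluate. Below is a minimal sketch of that pattern, not part of the patches above. It assumes the objects defined earlier in the example script (articles_mat, authors_mat, words_mat, df, articles_scores, authors_scores, filt_min_score, n_neighbors) are in scope, that bert_projection accepts family= for each model used above (the script passes it for "scincl", "all-MiniLM-L6-v2" and "all-mpnet-base-v2" and relies on the default for Specter2), and that normalize_l2 can be imported from cartodata.operations.

from cartodata.model_selection.scoring import Neighbors
from cartodata.operations import normalize_l2  # assumed import path
from cartodata.projection import bert_projection


def evaluate_bert_family(family, dump_dir="."):
    # Project articles, authors and words with the requested transformer
    # family, as in the per-family cells of the example script.
    matrices = bert_projection(
        [articles_mat, authors_mat, words_mat], df["text"], family=family
    )
    # L2-normalize each dense matrix before scoring.
    matrices = list(map(normalize_l2, matrices))

    # Score the article projection against the author/article link matrix,
    # mirroring the Neighbors.evaluate calls added by PATCH 5/8.
    score = Neighbors.evaluate(
        "articles", "authors", authors_mat, authors_scores, dir_xD=dump_dir,
        scores_nature=articles_scores, matrix_nature_xD=matrices[0],
        min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True
    )
    score.print()
    return matrices, score


# Example: scincl_matrices, scincl_score = evaluate_bert_family("scincl")

The Path(...) coercions added in PATCH 6/8 are presumably what make a plain string such as dir_xD="." (used throughout the example) safe to pass to Neighbors.evaluate here as well.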