diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9d6267e8e647c2c0556af40c78103774886cc0e6..fa6007edbdcf5d587b6b220665f8d734561f054b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -95,5 +95,5 @@ pages: paths: - public needs: ["run_tests_3.10"] - timeout: 3h 30m + timeout: 5h 30m diff --git a/cartodata/pipeline/projectionnd.py b/cartodata/pipeline/projectionnd.py index 33c6392beebc794f040e74d44f806cdbf4af5dda..8117c0f31bbb6c6771e288ffebba4299786ad4a0 100644 --- a/cartodata/pipeline/projectionnd.py +++ b/cartodata/pipeline/projectionnd.py @@ -462,9 +462,11 @@ class Doc2VecProjection(ProjectionND): class BertProjection(ProjectionND): - def __init__(self, family="specter2", pt_device=None, normalize=True): + def __init__(self, family="specter2", pt_device=None, normalize=True, + batch_size=10): super().__init__("bert", 768, normalize) self.family = family + self.batch_size = batch_size self.pt_device = pt_device def execute(self, matrices, dataset, dump_dir): @@ -472,4 +474,4 @@ class BertProjection(ProjectionND): corpus = dataset.corpus return bert_projection(matrices, corpus, self.family, self.normalize, - self.pt_device) + self.pt_device, self.batch_size) diff --git a/cartodata/projection.py b/cartodata/projection.py index 6d2829c5d8cf4bee9f06ef109af820fa3bda9ca1..5bbc79336817f79c4f7ea0edb078e79a7366c45d 100644 --- a/cartodata/projection.py +++ b/cartodata/projection.py @@ -91,7 +91,7 @@ BERT_PROJECTIONS = { def bert_projection(matrices, vocab_df, family="specter2", normalize=True, - pt_device=None): + pt_device=None, batch_size=10): """ Apply a Bert projection method, Specter2 by default. 
:param matrices: a list of matrices to project @@ -130,7 +130,7 @@ def bert_projection(matrices, vocab_df, model.eval() collect_embeddings = torch.Tensor([]) i = 0 - for texts in iterate_in_batches(vocab_df, 10): + for texts in iterate_in_batches(vocab_df, batch_size): # preprocess the input inputs = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False, @@ -176,7 +176,7 @@ def mean_pooling(model_output, attention_mask): def sbert_projection(matrices, vocab_df, family="all-MiniLM-L6-v2", normalize=True, - pt_device=None): + pt_device=None, batch_size=10): """ Apply a Sentence Bert projection method, "all-MiniLM-L6-v2" by default. :param matrices: a list of matrices to project @@ -206,7 +206,7 @@ def sbert_projection(matrices, vocab_df, model.eval() collect_embeddings = torch.Tensor([]) i = 0 - for texts in iterate_in_batches(vocab_df, 10): + for texts in iterate_in_batches(vocab_df, batch_size): # preprocess the input inputs = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False diff --git a/examples/compare_projections_vispubdata.py b/examples/compare_projections_vispubdata.py index ea59912d4f228fffcab42799fd9daf75d7411456..e269b5a098f6db67fc37ed71d4c1da115ee4cad6 100644 --- a/examples/compare_projections_vispubdata.py +++ b/examples/compare_projections_vispubdata.py @@ -1,22 +1,9 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:sphinx -# rst2md: false -# text_representation: -# extension: .py -# format_name: sphinx -# format_version: '1.1' -# jupytext_version: 1.16.0 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - """ -# Extracting and processing VisPubdata data with the Cartolabe API +Extracting and processing VisPubdata data with the Cartolabe API +================================================================== + Comparing the quality of embeddings using multiple methods 
+------------------------------------------------------------- """ # %matplotlib inline @@ -33,7 +20,7 @@ Comparing the quality of embeddings using multiple methods # SHEET_ID = '1xgoOPu28dQSSGPIp_HHQs0uvvcyLNdkMF9XtRajhhxU' -SHEET_NAME='Main%20dataset' +SHEET_NAME = 'Main%20dataset' url = f'https://docs.google.com/spreadsheets/d/{SHEET_ID}/gviz/tq?tqx=out:csv&sheet={SHEET_NAME}' min_df = 25 @@ -186,9 +173,12 @@ articles_scores.head() from cartodata.projection import lsa_projection # noqa from cartodata.operations import normalize_l2 # noqa -from cartodata.scoring import score_projection -# %time lsa_matrices = lsa_projection(num_dims, words_mat, [articles_mat, authors_mat, words_mat]) +"" +# %%time +lsa_matrices = lsa_projection(num_dims, words_mat, [articles_mat, authors_mat, words_mat]) + +"" lsa_matrices = list(map(normalize_l2, lsa_matrices)) ############################################################################### @@ -218,7 +208,11 @@ lsa_score.print() from cartodata.projection import lda_projection # noqa -# %time lda_matrices = lda_projection(num_dims, 1, [articles_mat, authors_mat, words_mat]) +"" +# %%time +lda_matrices = lda_projection(num_dims, 1, [articles_mat, authors_mat, words_mat]) + +"" lda_matrices = list(map(normalize_l2, lda_matrices)) "" @@ -240,7 +234,11 @@ lda_score.print() from cartodata.projection import doc2vec_projection # noqa -# %time doc2vec_matrices = doc2vec_projection(num_dims, 1, [articles_mat, authors_mat, words_mat], df['text']) +"" +# %%time +doc2vec_matrices = doc2vec_projection(num_dims, 1, [articles_mat, authors_mat, words_mat], df['text']) + +"" doc2vec_matrices = list(map(normalize_l2, doc2vec_matrices)) "" @@ -254,7 +252,7 @@ doc2vec_score = Neighbors.evaluate( scores_nature=articles_scores, matrix_nature_xD=doc2vec_matrices[0], min_score=filt_min_score, n_neighbors=n_neighbors, recompute=True ) -doc2vec_score2.print() +doc2vec_score.print() 
############################################################################### # Specter2 projection @@ -262,7 +260,11 @@ doc2vec_score2.print() from cartodata.projection import bert_projection # noqa -# %time specter2_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text']) +"" +# %%time +specter2_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text']) + +"" specter2_matrices = list(map(normalize_l2, specter2_matrices)) "" @@ -282,7 +284,10 @@ specter2_score.print() # Scincl projection # ------------------------- -# %time scincl_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="scincl") +# %%time +scincl_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="scincl") + +"" scincl_matrices = list(map(normalize_l2, scincl_matrices)) "" @@ -302,7 +307,10 @@ scincl_score.print() # "all-MiniLM-L6-v2" projection # ------------------------- -# %time minilm_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="all-MiniLM-L6-v2") +# %%time +minilm_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="all-MiniLM-L6-v2") + +"" minilm_matrices = list(map(normalize_l2, minilm_matrices)) "" @@ -325,7 +333,10 @@ minilm_score.print() # "all-mpnet-base-v2" projection # ------------------------- -# %time mpnet_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="all-mpnet-base-v2") +# %%time +mpnet_matrices = bert_projection([articles_mat, authors_mat, words_mat], df['text'], family="all-mpnet-base-v2") + +"" mpnet_matrices = list(map(normalize_l2, mpnet_matrices)) ""