Commit 7ccd8bb9 authored by UPADHYAY Prajna Devi's avatar UPADHYAY Prajna Devi
Browse files

Updated README.md, moved images under docs/images/

parent 5dfcc71b
......@@ -79,7 +79,7 @@ From the main folder, call the jar in the `core` folder with the following optio
java -jar core/connection-lens-core-full-1.1-SNAPSHOT.jar -DRDBMSDBName=cl_myinstance -i data/poc/2/deputes.json,data/poc/2/fb-etienne-chouard.txt,data/poc/2/medias.txt,data/poc/2/tweet-Ruffin.json,data/poc/2/rt-wikipedia.txt
```
![image_3.png](./image_3.png)
![image_3.png](./docs/images/image_3.png)
For more options, the following command will provide further details about applicable parameters and options:
......@@ -165,7 +165,7 @@ java -jar core/connection-lens-core-full-1.1-SNAPSHOT.jar -DRDBMSDBName=cl_myins
The `query>` indicates that the shell is ready to accept queries.
![image_2.png](./image_2.png)
![image_2.png](./docs/images/image_2.png)
#### Gathering statistics about queries (command-line)
......@@ -199,7 +199,7 @@ Follow the [GUI installation instructions](https://gitlab.inria.fr/cedar/connect
Then, queries can be asked through the GUI and results can be visualized in the GUI. For instance, the screenshot below
corresponds to the query "Briand Halluin Tonolli".
![image_4.png](./image_4.png)
![image_4.png](./docs/images/image_4.png)
# Contributing to ConnectionLens
......
random_seed=0
PYTHONPath =cl_env/bin/python3.6
## Path to python scripts (flair and pdf)
python_script_location=/var/connectionlens/scripts
# Directory where CL will write temporary files.
# These include: temporary files used for loading data in a relational database;
# dot-generated drawings of small CL graphs and answer trees;
# other temporary files created notably by the Flair NER tool.
temp_dir=/tmp
# DATABASE:
# The following are the settings used to access the master database, where the
# system's central information is stored.
#
# For now, this is the only supported vendor
RDBMSType = POSTGRESQL
# hostname where the DB is run
RDBMSHost = localhost
RDBMSPort = 5432
RDBMSUser = kwsearch
RDBMSPassword =
# database name
RDBMSDBName = cl_default
# LANGUAGE
#
# The default locale used through the execution of the system. Possible values: en, fr
default_locale=fr
# DRAWING AND PLOTTING
#
# The path where dot is installed
drawing.dot_installation=/usr/local/bin/dot
drawing.draw=false
# drawing coarse edges
drawing.coarse_edge=false
#
# Parameters for plotting solution numbers as a function of the search time
drawing.solution_times=false
# The same directory as for drawing (parameter above) will be used.
# However, one can plot solutions or not, and draw trees or not, independently.
drawing.gnuplot_installation=/usr/local/bin/gnuplot
# CACHES
#
# The number of nodes to keep in cache
node_cache=50000
# The number of edges to keep in cache
edge_cache=50000
# The number of same as edges to keep in cache
same_as_cache=50000
# The number of string-pair and similarity results to keep in cache
similarity_cache=200000
# The number of nodes for which the adjacency list is kept in cache
adjacency_cache=50000
# The number of (similarity) candidates kept in cache
candidate_pairs_cache=50000
# The number of specificities kept in cache
specificity_cache=5000
# Location on the local disk where caches are stored (mainly extraction outputs)
cache_location=/var/connectionlens/cache
# Number of updates to keep in memory before spilling to the database
# (1 means sequential updates, <= 0 means all updates are kept in memory until spilling)
update_batch_size=100000
# Maximum number of extraction entries that will be cached in memory.
# Note that if value_atomicity=PER_GRAPH, a buffer of size node_cache will cache all nodes
# (including, but not only, value nodes) thus no extraction will be repeated, thus no need for caching.
# -1 means unbounded caching; 0 means no caching.
max_extraction_mem_cache=100000
# Maximum number of extraction entries that will be cached on disk.
# The same comment wrt value_atomicity=PER_GRAPH applies (but disk caching will hurt performance even more).
# You may still want some disk caching when using Flair, in order to speed up future executions.
# Memory cached extractor entities are saved to disk only if you use disk caching.
max_extraction_disk_cache=-1
# INDEXING
#
# The indexing model. Possible value:
# - POSTGRES_FULLTEXT: Postgres' own fulltext index is used
# - LUCENE: Lucene is used.
indexing_model=POSTGRES_FULLTEXT
lucene_base_dir=.lucene
# EXTRACTION
#
# The location where the TreeTagger library is installed
treetagger_home=/opt/treetagger/latest/
## The location where Stanford models are installed
stanford.models=/var/connectionlens/models/
# Named entity extractor. Possible values: SNER (Stanford extractor), FLAIR_NER (Flair extractor), NONE (no extraction)
extractor=SNER
#
# This parameter is used to start the Flask Web service, then it is used by Flair to determine
# how many sentences to process in parallel, which helps e.g. when running on a GPU. A useful value was 128.
extractor_batch_size_flair=1
# This parameter controls the size of a batch of texts ConnectionLens sends to the extractor.
# Currently this is only useful if you use Flair extraction and the Flair batch size is not 1.
# It should be a multiple of extractor_batch_size_flair (for example 10 times larger).
extractor_batch_size_cl=1
#
# Whether to extract generic entities using TreeTagger. If the extractor is NONE, generics will not be extracted, either.
extract_generics=true
#
# Whether to extract from URI nodes (true or false). In some cases we don't want it.
extract_from_uris=true
# Extraction policies (for now available only for XML). Policies appear on the same line and are comma-separated.
# Each policy is of the form "XMLPath Decision" where Decision can be: Person, Organization, Location, NoExtract, or NoExtractAll
# Fields that are not referred to by any extraction policy will be handled by the respective extractor.
extract_policy=
# PubmedArticleSet.PubmedArticle.AuthorList.Author.Name Person, PubmedArticleSet.PubmedArticle.KeywordList NoExtractAll
# Whether to call Ambiverse for disambiguation
entity_disambiguation=false
# Entity disambiguation policies follow the same syntax as extraction policies except that only Person, Organization, Location are supported.
# If the disambiguation policy is non empty, disambiguation is attempted only for the entities explicitly designated in the policy.
# If the disambiguation policy is empty, all entities, regardless of the extractor who produced them, will be disambiguated.
entity_disambig_policy=
# Whether to split long texts into sentences (thus creating more nodes)
split_sentences=false
# Maximum length for a node label if split=true. The current value is 1/3 of the default Postgres page size.
max_label_length=2000
#PDF
# Type of tables in PDF. Possible values: lattice (cell separation lines are drawn in the PDF), stream (cell separation lines are not drawn).
flavor=lattice
# NODE COMPARISON AND SIMILARITY MEASURES
#
# Whether or not to perform node comparisons (true or false)
compare_nodes=false
# Threshold above which same as are considered relevant
# similarity_threshold_hamming=-1
# similarity_threshold_jaro=-1
# similarity between long strings
similarity_threshold_jaccard=.8
# similarity between short strings
similarity_threshold_levenshtein=.8
# similarity metric for uri
similarity_threshold_uri=.8
# similarity metric for entity
similarity_threshold_entity=.95
# similarity metric for organization
similarity_threshold_organization=.95
# similarity metric for location
similarity_threshold_location=.95
# similarity metric for person
similarity_threshold_person=.8
# similarity metric for first name person
similarity_threshold_fname=.8
# similarity metric for last name person
similarity_threshold_lname=.9
#similarity_threshold_email
similarity_threshold_email=1
#similarity_threshold_hashtag
similarity_threshold_hashtag=1
# whether or not to require a short, common, meaningful prefix between *any two* labels compared
# if true, this will be implemented based on the column labelPrefix
compare_stored_label_prefix=true
# mandatory prefix length
stored_prefix_length=3
# "do not link" labels. Must be double quote-enclosed and comma-separated.
do_not_link_labels=
#"[Données non publiées]","true","false","M.","Mme", "Mme.","Paris","France","Net","CREATION","VUE_PDF_DU_RECEPISSE_DU_DEPOT_XML","SCI","néant"
#
# similarity metric for numbers
similarity_threshold_number=-1
# similarity metric for date and time values
similarity_threshold_datetime=1
# some LSH (locality-sensitive hashing) parameters, not currently used...
#minhash_lsh_bands=-1
#minhash_lsh_buckets=-1
#minhash_dictionary_size=1000000
#minhash_lsh_rows_per_bands=2
#minhash_lsh_bands=16
#minhash_lsh_buckets=1000000
# Minimum matching prefix length for prefix-based string matching: needs to be at least
# as large as stored_prefix_length!
matching_prefix_length=3
# How to select pairs of short strings for comparison: having a common prefix (PREFIX); being identical (EQUAL); not at all (NONE)
short_string_comparison=PREFIX
# How to select pairs of long strings for comparison: having a common prefix (PREFIX); being identical (EQUAL); not at all (NONE)
long_string_comparison=PREFIX
# Length above which Jaccard similarity will be used over the other
long_string_threshold=128
# Length over which Jaro/Hamming/Levenshtein similarity will be used over the other
short_string_threshold=32
# Number of keyword allowed to be missing in a match for an answer to be returned.
query_cover_threshold=0
# Possible values:
# - PER_INSTANCE: all value nodes are distinct
# - PER_PATH: all value nodes that share the same label and type within a given data source, are unified
# - PER_DATASET: all value nodes that share the same label within a data source, are unified
# - PER_GRAPH: all value nodes sharing the same label, across all data sources composing a graph, are unified
value_atomicity=PER_INSTANCE
# How many entity nodes to create when loading
# PER_OCCURRENCE, PER_DATASET, PER_LOADING, PER_GRAPH
entity_node_creation_policy=PER_GRAPH
# Whether the Jaccard common-word heuristic should be used.
jaccard_common_word=true
# Used with LENGTH above to determine the length range (must be within [0, 1])
length_difference_ratio=.2
# default storage berkeley folder
dbberkeley_base_dir=.dbberkeley
# IMPLICIT NODES AND EDGES
# - true: nodes and edges deemed implicit are stored in memory (requires reloading upon each run)
# - false: no node or edge is considered implicit and all are stored in the underlying database
keep_implicit_in_memory=false
# SEARCH
#
# Timeout (ms) after which to stop the search
#search_stopper_timeout=20000
#search_stopper_timeout=10000
search_stopper_timeout=15000
# Number of results after which to stop the search
search_stopper_topk=-1
#
# The default search algorithm to use:
# Possible values:
# - GAM: Grow and Aggressive Merge as described in CAiSE 2020 submission
# - GAM_COARSE: like GAM, but operating only on entities and datasets.
# - GAM_COARSE_LOCAL: coarse GAM with local search using short path algorithm within island.
# - GAM_COARSE_LOCAL_ASYNC: same as above, running local searches asynchronously.
# - SQL_PATH: supports only 2-keyword queries and only finds results within a single dataset; computes paths up to some fixed length using SQL directly
global_search_algorithm=GAM
#
# Parameters for the GAM algorithm(s):
# Number of matches to consider for a keyword:
max_matches_per_kwd=-1
# Metric on a (tree, root-adjacent edge) pair that determines the priority of the pair in the queue:
# - NUMBER_OF_NODES: The number of nodes of the tree.
# - ROOT_ADJACENT_EDGES: The number of edges adjacent to the tree root.
# - NUMBER_OF_NODES_ROOT_ADJACENT_EDGES: The number of nodes in the tree, then the number of edges adjacent to the tree root.
# - NODES_MINUS_MATCHES: The number of tree nodes minus the number of tree matches.
# - MATCHES_MINUS_NODES: The number of tree matches minus the number of tree nodes.
# - NODES_MINUS_MATCHES_THEN_FANOUT: The number of tree nodes minus the number of tree matches, then (if tie) the number of root-adjacent edges.
# - MATCHES_MINUS_NODES_THEN_FANOUT: The number of tree matches minus the number of tree nodes, then (if tie) the number of root-adjacent edges.
# - NODES_MINUS_MATCHES_THEN_SPECIFICITY: The number of tree nodes minus the number of tree matches, then (if tie) the specificity of the edge.
# - MATCHES_MINUS_NODES_THEN_SPECIFICITY: The number of tree matches minus the number of tree nodes, then (if tie) specificity of the edge.
# - MATCHES_THEN_NODES_THEN_SPECIFICITY: (-1)* number of tree matches, then the number of nodes, then (if tie) the specificity of the edge.
gam_heuristics_growth=MATCHES_MINUS_NODES_THEN_SPECIFICITY
# whether or not to limit the number of entity nodes extracted from the same text matching the same kwd, to 1
one_entity_match_per_text=false
# bidirectional search: target will be source
gam_bidirectional_search=true
# whether or not to only traverse edge whose specificity is above a minimum threshold
query_only_specific_edge=false
# whether or not to load all the graph in the memory cache before evaluating queries
prefetch_graph_in_memory=false
# threshold for same_as edges
same_as_threshold=.8
# number of threads
gam_number_of_thread=4
# Scoring functions.
# Weight of the matching score in the scoring function
score_alpha=0.2
# Weight of the connection score in the scoring function
score_beta=0.2
# Weight of the specificity in the scoring function is 1 - alpha - beta
# max length of label to use Needleman-Wunsch in score matching/chunk long label for print and visualization
node_label_matching_length=80
# Parameters for the SQL_SEARCH algorithm:
# Maximum path length:
sql_search_max_length=5
# GRAPH STORAGE
#
# The storage model i.e. how the graph is stored. Possible values:
# - COMPACT: The graph is stored in a SQL database, IDs are integer-based
storage_model=COMPACT
# Refresh materialized view in DB.
storage_use_materialized_strong_same_as_edges=false
# Whether or not to remove obsolete tuples from the specificity table
cleanup_specificity=false
# ABSTRACT GRAPH
#
# Whether to create the abstract graph from the basic ConnectionLens graph. Possible values : true, false
create_abstract_graph=false
#
# Which graph to read when accessing the database. Possible values :
# - false : The graph read is the basic ConnectionLens graph which is stored in the default tables (nodes, edges, weak_same_as, specificity, ...)
# - true : The graph read is the abstract graph stored in a distinct set of tables (abs_nodes, abs_edges, ...). Should be set to true only for abstract graphs (i.e. after R&C extraction)
read_abstract_graph=false
# NOTE(review): duplicate of the query_only_specific_edge key already set in the
# SEARCH section above (same value). Duplicate keys are parser-dependent
# (first-wins vs. last-wins); keep only one occurrence.
query_only_specific_edge=false
# To run (or not) the classification method on the abstract graph. Possibles values are: NONE (classification is not run), TUFFY (to run Tuffy classification) or SUMMARIES (to run classification based on summaries). Default is NONE.
classification_method=NONE
tuffy_dbname = tuffydb
tuffy_username = tuffer
# NOTE(review): avoid committing real credentials in plain text; override this
# value in a local, untracked configuration instead.
tuffy_password = strongPasswoRd
tuffy_directory = /var/connectionlens/tuffy
# the path where the models used by WordNet are stored
fxml_file = /var/connectionlens/models/fra-eng.tei
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment