properties 11 KB
Newer Older
MERABTI Tayeb's avatar
MERABTI Tayeb committed
1
2
3
random_seed=0
PYTHONPath =/usr/local/bin/python3.7
#
MERABTI Tayeb's avatar
MERABTI Tayeb committed
4
python_script_location=/var/connectionlens/scripts
MERABTI Tayeb's avatar
MERABTI Tayeb committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# DATABASE:
# The following are the settings to access the master database, where the system's central information
# is stored.
#
# For now, this is the only supported vendor
RDBMSType = POSTGRESQL
# hostname where the DB is run
RDBMSHost = localhost
RDBMSPort = 5432
RDBMSUser = kwsearch
RDBMSPassword =
# database name
RDBMSDBName = cl_default


# LANGUAGE
#
# The default locale used throughout the execution of the system
default_locale=fr
#default_locale=en


# DRAWING AND PLOTTING
#
# The path where dot is installed
drawing.dot_installation=/usr/local/bin/dot
# The main directory where .dot and .pdf will be saved (must be created previously)
drawing.main_directory=/tmp
drawing.draw=true
# drawing coarse edges
drawing.coarse_edge=false
#
# Parameters for plotting solution numbers as a function of the search time
drawing.solution_times=true
# The same directory as for drawing (parameter above) will be used.
# However, one can plot solutions or not, and draw trees or not, independently.
drawing.gnuplot_installation=/usr/local/bin/gnuplot


# CACHES
#
# The number of paths to keep in cache
path_cache=50000
# The number of nodes to keep in cache
node_cache=50000
# The number of edges to keep in cache
edge_cache=50000
# The number of same as edges to keep in cache
same_as_cache=50000
# The number of string-pair and similarity results to keep in cache
similarity_cache=200000
# The number of nodes for which the adjacency list is kept in cache
adjacency_cache=50000
# The number of (similarity) candidates kept in cache
candidate_pairs_cache=50000
# The number of specificities kept in cache
specificity_cache=5000
# Location on the local machine where caches are stored (mainly extraction outputs)
# cache_location=/Users/cedar/Documents/Xin/tmp/cache
cache_location=/var/connectionlens/cache
# Number of updates to keep in memory before spilling to the database
# (1 means sequential updates, <= 0 means all updates are kept in memory until spilling)
update_batch_size=500000
# Maximum number of labels to put in the extraction cache;
# -1 puts all labels in the cache
max_extraction_cache_size=-1

# INDEXING
#
# The indexing model. Possible values:
# - POSTGRES_FULLTEXT: Postgres' own fulltext index is used
# - LUCENE: Lucene is used.
indexing_model=POSTGRES_FULLTEXT
lucene_base_dir=.lucene

# EXTRACTION
#
# The location where the TreeTagger library is installed
treetagger_home=/opt/treetagger/latest/
# Choose one of the extractors: SNER (Stanford extractor), FLAIR_NER (Flair extractor), NONE (no extraction)
extractor=SNER
#extractor=FLAIR_NER
#extractor=NONE
#
# Whether or not to extract generic entities using TreeTagger. If the extractor is NONE, generics will not be extracted, either.
# In rare cases, TreeTagger output varies from a machine to another, and we
# have not so far been able to determine why.
extract_generics=true
#
# Whether or not to extract from URI nodes (true or false). In some cases we don't want it.
MANOLESCU Ioana's avatar
MANOLESCU Ioana committed
95
extract_from_uris=false
MERABTI Tayeb's avatar
MERABTI Tayeb committed
96
97
98
99
100

# if true call Ambiverse for disambiguation
entity_disambiguation=false

# If true, long text will be split.
MANOLESCU Ioana's avatar
MANOLESCU Ioana committed
101
split_sentences=true
MERABTI Tayeb's avatar
MERABTI Tayeb committed
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

# NODE COMPARISON AND SIMILARITY MEASURES
#
# Whether or not to perform node comparisons (true or false)
compare_nodes=true
# Threshold above which same as are considered relevant
# similarity_threshold_hamming=-1
# similarity_threshold_jaro=-1
# similarity between long strings
similarity_threshold_jaccard=.8
# similarity between short strings
similarity_threshold_levenshtein=.8
# similarity metric for uri
similarity_threshold_uri=.8
# similarity metric for entity
similarity_threshold_entity=.95
# similarity metric for organization
similarity_threshold_organization=.95
# similarity metric for location
similarity_threshold_location=.95
# similarity metric for person
similarity_threshold_person=.8
# similarity metric for first name person
similarity_threshold_fname=.8
# similarity metric for last name person
similarity_threshold_lname=.9
#similarity_threshold_email
similarity_threshold_email=1
#similarity_threshold_hashtag
similarity_threshold_hashtag=1
# whether or not to require a short, common, meaningful prefix between *any two* labels compared
# if true, this will be implemented based on the column labelPrefix
compare_stored_label_prefix=true
# mandatory prefix length
stored_prefix_length=3
# "do not link" labels. Must be double quote-enclosed and comma-separated.
do_not_link_labels=
#"[Données non publiées]","true","false","M.","Mme", "Mme.","Paris","France","Net","CREATION","VUE_PDF_DU_RECEPISSE_DU_DEPOT_XML","SCI","néant"
#
# similarity metric for numbers
similarity_threshold_number=-1
# similarity metric for date and time values
similarity_threshold_datetime=1
# some LSH (locality-sensitive hashing) parameters, not currently used...
#minhash_lsh_bands=-1
#minhash_lsh_buckets=-1
#minhash_dictionary_size=1000000
#minhash_lsh_rows_per_bands=2
#minhash_lsh_bands=16
#minhash_lsh_buckets=1000000
# Minimum matching prefix length for prefix-based string matching: needs to be at least
# as large as stored_prefix_length!
matching_prefix_length=3
# How to select pairs of short strings for comparison: having a common prefix (PREFIX); being identical (EQUAL); not at all (NONE)
short_string_comparison=PREFIX
# How to select pairs of long strings for comparison: having a common prefix (PREFIX); being identical (EQUAL); not at all (NONE)
long_string_comparison=PREFIX
# Length above which Jaccard similarity will be used over the other
long_string_threshold=128
# Length over which Jaro/Hamming/Levenshtein similarity will be used over the other
short_string_threshold=32
# If true, similarities are only compared between entities
entities_only=false
# Number of keywords allowed to be missing in a match for an answer to be returned.
query_cover_threshold=0
# Possible values:
# - PER_INSTANCE: all value nodes are distinct
# - PER_PATH:     all value nodes that share the same label and type within a given data source, are unified
# - PER_DATASET:    all value nodes that share the same label within a data source, are unified
# - PER_GRAPH: all value nodes sharing the same label, across all data sources composing a graph, are unified
MANOLESCU Ioana's avatar
MANOLESCU Ioana committed
172
value_atomicity=PER_GRAPH
MERABTI Tayeb's avatar
MERABTI Tayeb committed
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# How many entity nodes to create when loading
# PER_OCCURRENCE, PER_DATASET, PER_LOADING, PER_GRAPH
entity_node_creation_policy=PER_GRAPH
# Whether the Jaccard common-word heuristic should be used.
jaccard_common_word=true
# Used with LENGTH above to determine the length range (must be within [0, 1])
length_difference_ratio=.2
# default storage berkeley folder
dbberkeley_base_dir=.dbberkeley


# IMPLICIT NODES AND EDGES
# - true: nodes and edges deemed implicit are stored in memory (requires reloading upon each run)
# - false: no node or edge is considered implicit and all are stored in the underlying database
keep_implicit_in_memory=false

# SEARCH
#
# Timeout (ms) after which to stop the search
#search_stopper_timeout=20000
#search_stopper_timeout=10000
search_stopper_timeout=15000
# Number of results after which to stop the search
search_stopper_topk=-1
#
# The default search algorithm to use:
# Possible values:
# - GAM: Grow and Aggressive Merge as described in CAiSE 2020 submission
# - GAM_COARSE: like GAM, but operating only on entities and datasets.
# - GAM_COARSE_LOCAL:  coarse GAM with local search using short path algorithm within island.
# - GAM_COARSE_LOCAL_ASYNC:  same as above, running local searches asynchronously.
# - SQL_PATH: supports only 2-keyword queries and only finds results within a single dataset; computes paths up to some fixed length using SQL directly
global_search_algorithm=GAM
#
# Parameters for the GAM algorithm(s):
# Number of matches to consider for a keyword:
max_matches_per_kwd=-1
# Metric on a (tree, root-adjacent edge) pair that determines the priority of the pair in the queue:
# - NUMBER_OF_NODES: The number of nodes of the tree.
# - ROOT_ADJACENT_EDGES: The number of edges adjacent to the tree root.
# - NUMBER_OF_NODES_ROOT_ADJACENT_EDGES: The number of nodes in the tree, then the number of edges adjacent to the tree root.
# - NODES_MINUS_MATCHES: The number of tree nodes minus the number of tree matches.
# - MATCHES_MINUS_NODES: The number of tree matches minus the number of tree nodes.
# - NODES_MINUS_MATCHES_THEN_FANOUT: The number of tree nodes minus the number of tree matches, then (if tie) the number of root-adjacent edges.
# - MATCHES_MINUS_NODES_THEN_FANOUT: The number of tree matches minus the number of tree nodes, then (if tie) the number of root-adjacent edges.
# - NODES_MINUS_MATCHES_THEN_SPECIFICITY: The number of tree nodes minus the number of tree matches, then (if tie) the specificity of the edge.
# - MATCHES_MINUS_NODES_THEN_SPECIFICITY: The number of tree matches minus the number of tree nodes, then (if tie) specificity of the edge.
# - MATCHES_THEN_NODES_THEN_SPECIFICITY: (-1)* number of tree matches, then the number of nodes, then (if tie) the specificity of the edge.
gam_heuristics_growth=MATCHES_MINUS_NODES_THEN_SPECIFICITY
# whether or not to limit the number of entity nodes extracted from the same text matching the same kwd, to 1
one_entity_match_per_text=false
# bidirectional search: target will be source
gam_bidirectional_search=true
# minimize trees resulting from merge
gam_minimize_merge=true
# minimize trees resulting from grow-across.
gam_minimize_growacross=true
#specify the GAM Search Grow To equivalent strategy.
# possible values:
# - growAcross
# - growToRepresentative
# - None (if we do not want to use either growAcross or growToRepresentative). This is used in CoarseGAM
growToEquivalentStrategy = growToRepresentative
# threshold for same_as edges.
same_as_threshold=.8
# number of threads
gam_number_of_thread=4
# Scoring functions.
# Weight of the matching score in the scoring function
score_alpha=0.2
# Weight of the connection score in the scoring function
score_beta=0.2
# Weight of the specificity in the scoring function is 1 - alpha - beta
# max length of label to use Needleman-Wunsch in score matching/chunk long label for print and visualization
node_label_matching_length=80
# Parameters for the SQL_SEARCH algorithm:
# Maximum path length:
sql_search_max_length=5


MERABTI Tayeb's avatar
MERABTI Tayeb committed
253

MERABTI Tayeb's avatar
MERABTI Tayeb committed
254
255
256
257
258
259
260
261
262
263
264
# GRAPH STORAGE
#
# The storage model i.e. how the graph is stored. Possible values:
# - COMPACT: The graph is stored in a SQL database, IDs are integer-based
storage_model=COMPACT
# Refresh materialized view in DB.
storage_use_materialized_strong_same_as_edges=false
# Whether or not to remove obsolete tuples from the specificity table
cleanup_specificity=false


MERABTI Tayeb's avatar
MERABTI Tayeb committed
265

MERABTI Tayeb's avatar
MERABTI Tayeb committed
266
267
268
# ABSTRACT GRAPH
#
# Whether to create the abstract graph from the basic ConnectionLens graph. Possible values : true, false
MERABTI Tayeb's avatar
MERABTI Tayeb committed
269
create_abstract_graph=false
MERABTI Tayeb's avatar
MERABTI Tayeb committed
270
271
272
273
274
275
276
#
# Which graph to read when accessing the database. Possible values :
# - false : The graph read is the basic ConnectionLens graph which is stored in the default tables (nodes, edges, weak_same_as, specificity, ...)
# - true : The graph read is the abstract graph stored in a distinct set of tables (abs_nodes, abs_edges, ...). Should be set to true only for abstract graphs (i.e. after R&C extraction)
read_abstract_graph=false

query_only_specific_edge=false