Commit 5bd5a224 authored by MERABTI Tayeb

update project according to the new develop version: edd7d330 14/09

parent ea22e81b
@@ -6,7 +6,7 @@ A clone of this repository includes:
- the jar file `connection-lens-core-full-1.1-SNAPSHOT.jar`,
- the python `scripts` folder (these implement an entity extraction based on Flair),
-- a `settings/properties` file which allows controlling multiple parameters related to the execution.
+- a `settings/local.settings` file which allows controlling multiple parameters related to the execution.
2. The `gui` folder with the file `gui.war` that allows us to run the web app.
@@ -23,7 +23,7 @@ ConnectionLens is available as a web application or as a command line applicatio
* a beginner-friendly installation through a virtual image (Docker) that gives access only to the web application;
* a full installation in which both the command line and the web application are installed.
-ConnectionLens allows customizing many parameters, illustrated in `core/settings/properties` (for instance, default_locale controls the language, etc.). Each parameter has a default value built into the JAR. You can change parameter values to your liking in the `core/settings/properties` file; **to make sure your settings are used, add `-c core/settings/properties` to the launch command.**
+ConnectionLens allows customizing many parameters, illustrated in `core/settings/local.settings` (for instance, default_locale controls the language, etc.). Each parameter has a default value built into the JAR. You can change parameter values to your liking in the `core/settings/local.settings` file; **to make sure your settings are used, add `-c core/settings/local.settings` to the launch command.**
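For example, assuming the core JAR is launched directly with `java -jar` (the exact subcommand and remaining arguments are not shown here and may differ), the command takes a form such as `java -jar connection-lens-core-full-1.1-SNAPSHOT.jar ... -c core/settings/local.settings`.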
# Installation using Docker
@@ -15,7 +15,7 @@ for opt, arg in options:
location = arg
if location == 'french':
MODEL_PATH = "/var/connectionlens/scripts/Flair_NER_tool/stacked-standard-flair-150-wikiner.pt"
MODEL_PATH = "scripts/Flair_NER_tool/stacked-standard-flair-150-wikiner.pt"
props = {
'processors': 'tokenize',
'lang': 'fr',
@@ -25,11 +25,11 @@ if location == 'french':
'normalizeFractions=false, '
'normalizeAmpersandEntity=false, '
'invertible=true',
-'tokenize_model_path': '/var/connectionlens/scripts/Flair_NER_tool/stanfordnlp_resources/fr_gsd_models/fr_gsd_tokenizer.pt'
+'tokenize_model_path': 'scripts/Flair_NER_tool/stanfordnlp_resources/fr_gsd_models/fr_gsd_tokenizer.pt'
}
tag_type = "label"
else:
MODEL_PATH = "/var/connectionlens/scripts/Flair_NER_tool/en-ner-conll03-v0.4.pt"
MODEL_PATH = "scripts/Flair_NER_tool/en-ner-conll03-v0.4.pt"
props = {
'processors': 'tokenize',
'lang': 'en',
@@ -39,7 +39,7 @@ else:
'normalizeFractions=false, '
'normalizeAmpersandEntity=false, '
'invertible=true',
-'tokenize_model_path': '/var/connectionlens/scripts/Flair_NER_tool/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt'
+'tokenize_model_path': 'scripts/Flair_NER_tool/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt'
}
tag_type = "ner"
@@ -65,4 +65,4 @@ def fner_extract_entities():
if __name__ == '__main__':
-app.run(debug=True,threaded=True)
+app.run(debug=False,use_reloader=False)
import os
import signal
import psutil


def proc_names():
    # map every running process id to its name
    return dict([(proc.pid, proc.name()) for proc in psutil.process_iter()])


def proc_cmdlines():
    # map every running process id to its command line (None if access is denied)
    cmdlines = {}
    for proc in psutil.process_iter():
        try:
            cmdlines[proc.pid] = proc.cmdline()
        except psutil.AccessDenied:
            cmdlines[proc.pid] = None
    return cmdlines


def to_regex(regex):
    # accept either a compiled pattern or a plain string
    if not hasattr(regex, "search"):
        import re
        regex = re.compile(regex)
    return regex


def search_procs_by_name(regex):
    # return the processes whose name matches the regex
    pid_names = {}
    regex = to_regex(regex)
    for pid, name in proc_names().items():
        if regex.search(name):
            pid_names[pid] = name
    return pid_names


def search_procs_by_cmdline(regex):
    # return the processes whose command line matches the regex
    pid_cmdlines = {}
    regex = to_regex(regex)
    for pid, cmdline in proc_cmdlines().items():
        if cmdline is not None:
            for part in cmdline:
                if regex.search(part):
                    pid_cmdlines[pid] = cmdline
                    break
    return pid_cmdlines


# kill any running flaskFlair process (the Flask-based Flair NER service)
for pid, cmdline in search_procs_by_cmdline("flaskFlair").items():
    os.kill(int(pid), signal.SIGKILL)
\ No newline at end of file
import os
from scrapdf import *
from scripts.utils_evaluate import *
def main():
    # choose options for scraping the pdf
    parser = argparse.ArgumentParser()
    parser.add_argument('--path_pdf', type=str)
    parser.add_argument('--flavor', type=str, choices=["stream","lattice"],default="lattice",help="'lattice' = cell separation lines are drawn, 'stream' = cell separation lines are not drawn")
    args = parser.parse_args()
    pdf = PDF(args.path_pdf,"all",args.flavor,0,False)
    pdf.save_text()
    pdf.save_tables()
    errors = []
    try:
        #%% DATA raw (loading files)
        # (RES) Results data
        ## RES structure & functionality
        g=[]
        i=0
        tables_left = True
        prefix = pdf.directory +'/' +'extracted_files_' + pdf.name
        for i in range(pdf.n_tables):
            g.append(0)
            g[i] = rdflib.Graph()
            g[i].parse(prefix + '/' + '{}-table{}.nt'.format(pdf.name,i), format="nt")
        n_tables_res = len(g)
        # (GT) Ground Truth data
        ## GT regions
        reg_gt = ET.parse("evaluation/database/{}-reg.xml".format(pdf.name)).getroot()
        ## GT structure
        struct_gt = ET.parse("evaluation/database/{}-str.xml".format(pdf.name)).getroot()
        for i,region in enumerate(struct_gt.iter('region')):
            region.set('id', str(i))
        ## GT functionality
        func_gt = pd.read_csv('evaluation/database/{}-fnc.csv'.format(pdf.name),header=None,dtype='object')#,sep=';')
        bbox_gt = get_bbox_gt(reg_gt)
        n_tables_gt = len(bbox_gt)
        #%% PRECISION & RECALL
        #%%####### FIRST METRIC
        tokens_gt = [get_tokens(table_gt(struct_gt,i),'xml') for i in range(n_tables_gt)]
        tokens_res = [get_tokens(g[i],'nt') for i in range(n_tables_res)]
        f1_step1, precision_step1, recall_step1 = precision_recall(n_tables_res, n_tables_gt, tokens_res, tokens_gt)
        #%%####### SECOND METRIC
        limits_gt = [get_nm(i,table_gt(struct_gt,i), 'xml') for i in range(n_tables_gt)]
        limits_res = [get_nm(i, pdf, 'nt') for i in range(n_tables_res)]
        adjacencies_gt = [adjacency_relations(i, table_gt(struct_gt,i), limits_gt,'xml') for i in range(n_tables_gt)]
        adjacencies_res = [adjacency_relations(i, g[i], limits_res,'nt') for i in range(n_tables_res)]
        f1_step2, precision_step2, recall_step2 = precision_recall(n_tables_res, n_tables_gt, adjacencies_res, adjacencies_gt)
        #%%####### THIRD METRIC
        n_tables_gt_func = n_func_table(func_gt)
        access_relations_res = [get_access_relations(i, g[i] , 'nt') for i in range(n_tables_res)]
        access_relations_gt = [get_access_relations(i, functional_table(func_gt,i) , 'xml') for i in range(n_tables_gt_func)]
        f1_step3, precision_step3, recall_step3 = precision_recall_func(n_tables_res, n_tables_gt_func, access_relations_res, access_relations_gt)
        # %% FINAL COUNT
        ## precision & recall
        precisions = [precision_step1, precision_step2, precision_step3]
        recalls = [recall_step1, recall_step2, recall_step3]
        f1s = [f1_step1,f1_step2,f1_step3]
        if precisions and recalls and f1s:
            precision = mean(precisions)
            recall = mean(recalls)
            f1 = mean(f1s)
        else:
            precision = 0
            recall = 0
            f1 = 0
        ## saving data
        if not os.path.isfile('evaluation/PR_{}.csv'.format(args.flavor)):
            new_pr = {'doc':pdf.name, 'f1':f1,'precision':precision,'recall':recall, 'f1_step1':f1_step1,'f1_step2':f1_step2,'f1_step3':f1_step3,'precision_step1':precision_step1,'precision_step2':precision_step2,'precision_step3':precision_step3, 'recall_step1':recall_step1,'recall_step2':recall_step2,'recall_step3':recall_step3}
            for key, value in new_pr.items():
                new_pr[key] = [value]
            PR = pd.DataFrame.from_dict(new_pr)
            PR.to_csv('evaluation/PR_{}.csv'.format(args.flavor),index = False)
        else:
            PR = pd.read_csv('evaluation/PR_{}.csv'.format(args.flavor))
            new_pr = {'doc':pdf.name, 'f1':f1,'precision':precision,'recall':recall, 'f1_step1':f1_step1,'f1_step2':f1_step2,'f1_step3':f1_step3,'precision_step1':precision_step1,'precision_step2':precision_step2,'precision_step3':precision_step3, 'recall_step1':recall_step1,'recall_step2':recall_step2,'recall_step3':recall_step3}
            PR = PR.append(new_pr, ignore_index=True)
            PR.to_csv('evaluation/PR_{}.csv'.format(args.flavor),index = False)
    except:
        print("DIDN'T WORK FOR : ",pdf.name)
        errors.append(pdf.name)


if __name__ == '__main__':
    main()
#%% AFTER RUNNING PREVIOUS PROGRAM FOR LATTICE AND STREAM
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)
from pprint import pprint
PR_stream = pd.read_csv('evaluation/PR_stream.csv')
PR_lattice = pd.read_csv('evaluation/PR_lattice.csv')
PR = pd.DataFrame(columns=['doc','f1','precision','recall','f1_step1','f1_step2','f1_step3','precision_step1','precision_step2','precision_step3', 'recall_step1','recall_step2','recall_step3'])
docs = pd.concat([PR_lattice[['doc']],PR_stream[['doc']]]).drop_duplicates().values.tolist()
docs.sort()
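# for each document, keep the run (stream or lattice) with the higher overall f1; if only one run produced a row, fall back to it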
for doc in docs:
    doc = doc[0]
    try:
        if PR_stream[PR_stream['doc'] ==doc].reset_index(drop=True).loc[0,'f1'] > PR_lattice[PR_lattice['doc'] ==doc].reset_index(drop=True).loc[0,'f1']:
            PR = PR.append(PR_stream[PR_stream['doc'] == doc],ignore_index=True)
        else:
            PR = PR.append(PR_lattice[PR_lattice['doc'] == doc],ignore_index=True)
    except:
        try:
            PR = PR.append(PR_stream[PR_stream['doc'] == doc],ignore_index=True)
        except:
            PR = PR.append(PR_lattice[PR_lattice['doc'] == doc], ignore_index=True)
PR.mean()
#%%
PR.to_csv('evaluation/PR.csv',index = False)
# %%
#!/usr/bin/env python
# coding: utf-8
# Main libraries used (maintained)
## https://github.com/pikepdf/pikepdf (for reading or writing PDF)
## https://github.com/atlanhq/camelot (for tables extraction)
## https://github.com/chrismattmann/tika-python (for text extraction)
#%% IMPORTS
import re
import os
import argparse
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)
import camelot
import pikepdf
from tika import parser
import rdflib
from rdflib.namespace import RDF, XSD
PDFTABLEXTR = rdflib.Namespace("http://tableXtr.pdf/")
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import tostring
from scripts.utils import path, decrypt, create_pdf, delete_pdf, page_content, number_of_pages, table_identifier
from scripts.prepare_text import is_empty_lines, is_spaces_lines, allowed, prepare_text
from scripts.prepare_tables import clean_line_break_df, select_accurate_tables, clean_tables, prepare_tables
from scripts.clean_text import fuzzy_extract, update_k, update_cell_in_text, update_text_in_cell, text_without_df, text_without_df_full, text2paragraphs, linking_conjunctions, rdflib_fullstop, merging_fullstop, stopping_fullstop, adding_fullstop, closing_parenthesis, sentence_bounding
from scripts.clean_tables import naive_merge_intra_columns_df, naive_merge_intra_lines_df, naive_merge_intra_df, naive_merge_intra_tables, comparable_cell, same_type, same_length, similarity, last_header, find_i_start, find_j_start, datacell_start, merge_ok, tuple2string, delete_tuple, actions_to_merge, update_s_i, update_s_i_j, df_empty, tuple2URI, merge_inter, merge_intra_df, merge_intra
from scripts.save_rdf import add_cells_rdf, add_closeness_rdf
from scripts.save_json import save_json, load_json, dict_from_text, dict_from_dict, json_from_dict
from scripts.utils_paper import deleting_attributes, finding_last_pages_index, look_for_doi, pmid_from_doi, look_for_pmid, root_from_pmid, xml_from_pmid, data_without_pmid, data_cois_acknowledgment, clean_cois
#%% CLASS PDF
class PDF:
def __init__(self,path_pdf,pages,flavor,threshold,graph_position):
"""
cleans text and tables (but does not save them)
@var path_pdf [string], path to the pdf
@var pages [string], pages of the pdf to be scraped
@var flavor [string], type of tables in the PDF: "lattice" = cell separation lines are drawn, "stream" = cell separation lines are not drawn
@var threshold [int], from 0 to 100, minimum accuracy for a table to be scraped
@var graph_position [boolean], should the graphs' positions (their URIs) be put in the text?
"""
#%% DECRYPT
self.directory = os.path.dirname(path_pdf)
self.name = path_pdf.replace(self.directory,'').replace('/','')[:-4]
decrypt(self)
path_pdf = path(self.directory,self.name)
#%% SELECT TABLES (CAMELOT)
self.tables = camelot.read_pdf(path(self.directory,self.name),pages=pages,flavor=flavor)
self.n_tables = len(self.tables)
#%% SELECT TEXT (TIKA)
create_pdf(self,path_pdf,pages)
parsedPDF = parser.from_file(path(self.directory,self.name))
self.text = parsedPDF["content"]
delete_pdf(self,path_pdf,pages)
#%% PREPARING
prepare_text(self)
prepare_tables(self,threshold)
#%% CLEANING
self.text_without_tables(graph_position)
self.modify_tables_and_text(flavor,graph_position)
def text_without_tables(self,graph_position):
"""
removes the tables from the text and turns the text into a list of paragraphs
@var graph_position [boolean], should the graphs' positions (their URIs) be put in the text?
"""
if self.text :
#%% TEXT WITHOUT TABLES
text_without_df_full(self,graph_position)
#%% TEXT SPLIT IN PARAGRAPHS
text2paragraphs(self)
def modify_tables_and_text(self,flavor,graph_position):
"""
cleans table headers and data cells, adapts the positions of table URIs in the text
cleans the text by joining sentences together
@var flavor [string], type of tables in the PDF: "lattice" = cell separation lines are drawn, "stream" = cell separation lines are not drawn
@var graph_position [boolean], should the graphs' positions (their URIs) be put in the text?
"""
if self.tables:
naive_merge_intra_tables(self,flavor)
merge_inter(self,graph_position)
merge_intra(self,flavor)
if self.text :
self.text = [word if isinstance(word,rdflib.term.URIRef) else re.sub('( )+',' ',word) for word in self.text] # deleting long white spaces in lines
self.text = sentence_bounding(self.text) # uniting separated sentences
def save_text(self):
"""
The text is saved to a JSON file with 2 keys:
one is the extraction property, whose value is the object identifying the original PDF,
the other is "content", whose value is a dictionary mapping line numbers to lines
"""
print('--- FINAL TEXT')
#print(self.text)
text_dict = dict_from_text(self.text)
final_dict = dict_from_dict(self,text_dict)
json_from_dict(self,final_dict)
def save_tables(self):
"""
Tables are saved as RDF graphs
Certain cells are considered headers (for rows or columns), sometimes pivot-table or aggregating cells
Every non-header cell is identified with its value, its type and its closest X and Y header cells
"""
print('--- FINAL TABLES')
for j in range(self.n_tables):
print('Start of datacell i_start : ', getattr(self, 'i{}_start'.format(j)))
print('Start of datacell j_start : ', getattr(self, 'j{}_start'.format(j)))
print(getattr(self,'df_{}'.format(j)))
for index_graph in range(self.n_tables):
# dataframe chosen
df = getattr(self,'df_{}'.format(index_graph))
# creating graph
g = rdflib.Graph()
# RDF prefixes
g.bind("rdf", RDF)
g.bind("xsd", XSD)
g.bind("pdfTableXtr", PDFTABLEXTR)
# triples
## creating cells
add_cells_rdf(self.directory,self.name,index_graph,df,g,getattr(self, 'i{}_start'.format(index_graph)), getattr(self, 'j{}_start'.format(index_graph)))
## adding closest & aggregating links
add_closeness_rdf(index_graph,df,g,getattr(self, 'i{}_start'.format(index_graph)), getattr(self, 'j{}_start'.format(index_graph)))
## identify original PDF
g.add((
PDFTABLEXTR[table_identifier(self.directory,index_graph,self.name)],
PDFTABLEXTR.extractedFrom,
rdflib.URIRef('file://'+ path(self.directory,self.name+'.pdf'))
))
# serialize and save
pdf_name = self.name.replace('-decrypted', '')
prefix = self.directory +'/' +'extracted_files_' + pdf_name
if not os.path.exists(prefix):
os.makedirs(prefix)
g.serialize(destination=prefix + '/' + '{}-table{}.nt'.format(pdf_name,index_graph), format="nt")
#%% CLASS papers
class paper(PDF):
def __init__(self,path_pdf,pages,flavor,threshold,graph_position):
"""
cleans the text and tables of a paper by:
* looking at the first pages and searching for a DOI or PMID to output a PubMed XML file; if none is found, extracting info on authors, keywords... to output JSON
* looking at the final pages and searching for the COI statement and acknowledgements to output JSON
@var path_pdf [string], path to the pdf
@var pages [string], pages of the pdf to be scraped
@var flavor [string], type of tables in the PDF: "lattice" = cell separation lines are drawn, "stream" = cell separation lines are not drawn
@var threshold [int], from 0 to 100, minimum accuracy for a table to be scraped
@var graph_position [boolean], should the graphs' positions (their URIs) be put in the text?
"""
#% SELECT FIRST PAGES
self.directory = os.path.dirname(path_pdf)
self.name = path_pdf.replace(self.directory,'').replace('/','')[:-4]
decrypt(self)
num_pages = number_of_pages(self.directory,self.name)
if num_pages >= 2 :
super().__init__(path_pdf,'0,1',flavor,threshold,graph_position)
else :
super().__init__(path_pdf,'0',flavor,threshold,graph_position)
#% FIRST RESULTS
self.pmid, content_first_pages = self.data_from_first_pages()
try :
print('--- FINAL TABLES')
print(self.tables)
except :
pass
#% SELECT LAST PAGES
self.directory = os.path.dirname(path_pdf)
self.name = path_pdf.replace(self.directory,'').replace('/','')[:-4]
decrypt(self)
num_pages = number_of_pages(self.directory,self.name)
i = finding_last_pages_index(num_pages,self.directory,self.name)
if i < 2 :
super().__init__(path_pdf,'{}'.format(i+1),flavor,threshold,graph_position)
else :
try :
super().__init__(path_pdf,'{},{},{}'.format(i,i+1,i+2),flavor,threshold,graph_position)
except :
super().__init__(path_pdf,'{},{}'.format(i,i+1),flavor,threshold,graph_position)
content_last_pages = self.data_from_last_pages()
#% CLEAN AND SAVE
self.data_from_paper(content_first_pages,content_last_pages)
#% SECOND RESULTS
print('--- FINAL TEXT')
#print(self.text)
def data_from_first_pages(self):
"""
looks at the first pages and searches for a DOI or PMID; if none is found, extracts info on authors, keywords...
"""
deleting_attributes(self)
pmid = look_for_pmid(self.text)
if pmid :
self.tables, pubmed_data = xml_from_pmid(self,pmid)
self.n_tables = 1
if pubmed_data :
return pmid, pubmed_data
else :
return False, data_without_pmid(self.text)
else :
return False, data_without_pmid(self.text)
def data_from_last_pages(self):
"""
looks at the final pages and searches for the COI statement and acknowledgements
"""
deleting_attributes(self)
return data_cois_acknowledgment(self.text)
def data_from_paper(self,content_first,content_last):
"""
saves the results of the extraction:
* XML output for information from PubMed
* JSON output for information from the PDF text
@var content_first [], root element of the XML, or dict with author, year, journal...
@var content_last [dict], COI statement and acknowledgment
"""
pdf_name = self.name.replace('-decrypted', '')
prefix = self.directory +'/' +'extracted_files_' + pdf_name
if not os.path.exists(prefix):
os.makedirs(prefix)
if self.pmid:
content_first.write(prefix + '/' + '{}-table.xml'.format(pdf_name), encoding='utf-8', xml_declaration=True)
content_last = clean_cois(content_first,content_last)
final_dict = dict_from_dict(self,content_last)
json_from_dict(self,final_dict)
self.text = content_last
else :
text_dict = {**content_first, **content_last}
final_dict = dict_from_dict(self,text_dict)
json_from_dict(self,final_dict)
self.text = text_dict
#%% CLASS EFSA
class cois_EFSA(PDF):
def __init__(self,path_pdf,pages,flavor,threshold,graph_position):
"""
cleans and saves
@var path_pdf [string], path to the pdf
@var pages [string], pages of the pdf to be scraped
@var flavor [string], type of tables in the PDF: "lattice" = cell separation lines are drawn, "stream" = cell separation lines are not drawn
@var threshold [int], from 0 to 100, minimum accuracy for a table to be scraped
@var graph_position [boolean], should the graphs' positions (their URIs) be put in the text?
see GitHub for contacts & tips: https://github.com/regardscitoyens/EFSA-DOIs
"""
super().__init__(path_pdf,pages,flavor,threshold,graph_position)
self.modify_text_EFSA()
self.modify_tables_EFSA()
self.save_text_EFSA()
self.save_tables()
def modify_text_EFSA(self):
"""
puts the text into a friendly dictionary
"""
cv = ['Name:','SCCS, SCHER, SCENIHR involvement:', 'Title:', 'Profession:', 'Current EFSA involvements:', 'Date:']
text_dict = { }
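# keep only the lines that contain one of the expected CV fields, index them by that field, and drop the signature marker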
for j,line in enumerate(self.text):
drop_line = True
i,n_pres = 0,len(cv)
while drop_line and i < n_pres:
if cv[i] in line:
drop_line = False
text_dict[cv[i]] = line[len(cv[i])+1:].replace(" Signature: SIGNED", "")
else :
i = i + 1
names = text_dict['Name:'].split(' ')
names.reverse()
names = [name.lower().