From 27a480ff8e4feb353d5764fa0aaa74f3bea0fec0 Mon Sep 17 00:00:00 2001 From: Benoit Sagot <benoit.sagot@inria.fr> Date: Tue, 8 Nov 2016 07:01:26 +0000 Subject: [PATCH] New option -M (no tagging, only those pre/post-processings as specified by other options; this is different from -C which does not tag but does normalise the input) git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/trunk@5698 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4 --- bin/MElt.in | 8 +++++--- bin/MElt_postprocess.pl | 12 ++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/bin/MElt.in b/bin/MElt.in index f44b483..22104c2 100644 --- a/bin/MElt.in +++ b/bin/MElt.in @@ -28,13 +28,14 @@ LOWERCASE_UPPERCASE_SENTENCES= FEATURE_SELECTION_PARAMETERS= LANGUAGE=fr -while getopts :m:f:l:cvthrndeszx:NPLTCZKS o +while getopts :m:f:l:cvthrndeszx:NPLTCZKSM o do case "$o" in v) echo $VERSION && exit 1;; - h) echo "Usage: MElt [ -cvthrndesNPLTCZKS ] [ -m /path/to/model | -l language ] < input > output" \ + h) echo "Usage: MElt [ -cvthrndesNPLTCZKSM ] [ -m /path/to/model | -l language ] < input > output" \ && echo "MODEL SELECTION" \ && echo " -m [/path/to/model] Use the MElt model given instead of the default one (which is the model 'fr' for French, trained on the FTB and the Lefff, installed in @datadir@/fr)" \ && echo " -l [language] Use the default MElt model for language 'language' instead of the default 'fr' (the MElt model folder must be in @datadir@)" \ + && echo " -M Do not tag, only apply tokenisation and other pre-processing steps triggered by other options" \ && echo "TOKENISATION AND 'NAMED ENTITY' DETECTION" \ && echo " -t Use the SxPipe-melt-light tokeniser before tagging (sxpipe-melt-light, included in MElt's distribution, is a very lightweight version of SxPipe, a full-featured pre-parsing processing chain); option -r has no effect if -t is used ; -t is not compatible with -c and -S" \ && echo " -x Provide additional options to SxPipe-melt-light (e.g., for 
processing oral transcripts, '-x -ot' is recommended" \ @@ -48,7 +49,7 @@ do case "$o" in && echo " -n Normalise text before tagging, and restore initial tokens afterwards (tags are redistributed on original tokens, non-standard amalgams are tagged X, components within non-standard compounds are tagged Y)" \ && echo " -N Normalise text before tagging, and restore initial tokens afterwards (tags are redistributed on original tokens, non-standard amalgams are assigned tags of the form T1+...+Tn, components within non-standard compounds receive the tag GW, except for the last one which receives the compound's overall tag)" \ && echo " -K (to be used with -n or -N) Keep both original tokens (in the form of SxPipe-like comments) and normalised tokens (which will be associated with the POS tags)" \ - && echo " -C Do not perform any POS tagging, outputs the result of the normalisation step" \ + && echo " -C Normalise text but do not perform any POS tagging (the output is the result of the normalisation step)" \ && echo "MISC OPTIONS" \ && echo " -c Handle SxPipe-like 'comments' (arbitrary sequence of characters preceeding a token and surrounded by curly brackets; implies -r, incompatible with -T or -t)" \ && echo " -d Downcase sentences which are fully uppercase" \ @@ -78,6 +79,7 @@ do case "$o" in ;; C) CAT_OR_CORRECTOR="${BINDIR}/MElt_normaliser.pl" DO_TAGGING=0;; + M) DO_TAGGING=0;; K) KEEP_COMMENTS="-ktfd";; L) DO_LEMMATISE=1;; t) if [ "z$HANDLE_COMMENTS" = "z-c" ] diff --git a/bin/MElt_postprocess.pl b/bin/MElt_postprocess.pl index e619ef7..badf7f0 100755 --- a/bin/MElt_postprocess.pl +++ b/bin/MElt_postprocess.pl @@ -42,10 +42,14 @@ if ($lang eq "zzz" || $no_post_process) { # s/_ACC_F/\\}/g; } else { # in non-normalising mode, situations with {} require ignoring original tokens and using splitted forms (e.g. "don't > do n't" for English) - s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g; - s/{} *//g; - - s/{ *([^}]*?) 
*} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; + if (/\//) { # did we tag? + s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g; + s/{} *//g; + + s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; + } elsif ($keep_token_form_distinction == 0) { + s/{[^{}]*} //g; + } s/◁/{/g; s/▷/}/g; # s/_ACC_O/{/g; -- GitLab