diff --git a/bin/MElt.in b/bin/MElt.in index f44b483a46ac41806b0e480709d0e239acfd8b46..22104c2a4e86e5b3434e52e6f3d1a117c98a8a33 100644 --- a/bin/MElt.in +++ b/bin/MElt.in @@ -28,13 +28,14 @@ LOWERCASE_UPPERCASE_SENTENCES= FEATURE_SELECTION_PARAMETERS= LANGUAGE=fr -while getopts :m:f:l:cvthrndeszx:NPLTCZKS o +while getopts :m:f:l:cvthrndeszx:NPLTCZKSM o do case "$o" in v) echo $VERSION && exit 1;; - h) echo "Usage: MElt [ -cvthrndesNPLTCZKS ] [ -m /path/to/model | -l language ] < input > output" \ + h) echo "Usage: MElt [ -cvthrndesNPLTCZKSM ] [ -m /path/to/model | -l language ] < input > output" \ && echo "MODEL SELECTION" \ && echo " -m [/path/to/model] Use the MElt model given instead of the default one (which is the model 'fr' for French, trained on the FTB and the Lefff, installed in @datadir@/fr)" \ && echo " -l [language] Use the default MElt model for language 'language' instead of the default 'fr' (the MElt model folder must be in @datadir@)" \ + && echo " -M Do not tag, only apply tokenisation and other pre-processing steps triggered by other options" \ && echo "TOKENISATION AND 'NAMED ENTITY' DETECTION" \ && echo " -t Use the SxPipe-melt-light tokeniser before tagging (sxpipe-melt-light, included in MElt's distribution, is a very lightweight version of SxPipe, a full-featured pre-parsing processing chain); option -r has no effect if -t is used ; -t is not compatible with -c and -S" \ && echo " -x Provide additional options to SxPipe-melt-light (e.g., for processing oral transcripts, '-x -ot' is recommended" \ @@ -48,7 +49,7 @@ do case "$o" in && echo " -n Normalise text before tagging, and restore initial tokens afterwards (tags are redistributed on original tokens, non-standard amalgams are tagged X, components within non-standard compounds are tagged Y)" \ && echo " -N Normalise text before tagging, and restore initial tokens afterwards (tags are redistributed on original tokens, non-standard amalgams are assigned tags of the form T1+...+Tn, components within non-standard compounds receive the tag GW, except for the last one which receives the compound's overall tag)" \ && echo " -K (to be used with -n or -N) Keep both original tokens (in the form of SxPipe-like comments) and normalised tokens (which will be associated with the POS tags)" \ - && echo " -C Do not perform any POS tagging, outputs the result of the normalisation step" \ + && echo " -C Normalise text but do not perform any POS tagging (the output is the result of the normalisation step)" \ && echo "MISC OPTIONS" \ && echo " -c Handle SxPipe-like 'comments' (arbitrary sequence of characters preceeding a token and surrounded by curly brackets; implies -r, incompatible with -T or -t)" \ && echo " -d Downcase sentences which are fully uppercase" \ @@ -78,6 +79,7 @@ do case "$o" in ;; C) CAT_OR_CORRECTOR="${BINDIR}/MElt_normaliser.pl" DO_TAGGING=0;; + M) DO_TAGGING=0;; K) KEEP_COMMENTS="-ktfd";; L) DO_LEMMATISE=1;; t) if [ "z$HANDLE_COMMENTS" = "z-c" ] diff --git a/bin/MElt_postprocess.pl b/bin/MElt_postprocess.pl index e619ef7320c609b75daf3434e2ae425c2e9e6017..badf7f0cff31f060c998460e2d599a58d45596e2 100755 --- a/bin/MElt_postprocess.pl +++ b/bin/MElt_postprocess.pl @@ -42,10 +42,14 @@ if ($lang eq "zzz" || $no_post_process) { # s/_ACC_F/\\}/g; } else { # in non-normalising mode, situations with {} require ignoring original tokens and using splitted forms (e.g. "don't > do n't" for English) - s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g; - s/{} *//g; - - s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; + if (/\//) { # did we tag? + s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g; + s/{} *//g; + + s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; + } elsif ($keep_token_form_distinction == 0) { + s/{[^{}]*} //g; + } s/â—/{/g; s/â–·/}/g; # s/_ACC_O/{/g;