From 27a480ff8e4feb353d5764fa0aaa74f3bea0fec0 Mon Sep 17 00:00:00 2001
From: Benoit Sagot <benoit.sagot@inria.fr>
Date: Tue, 8 Nov 2016 07:01:26 +0000
Subject: [PATCH] New option -M (no tagging; only the pre- and post-processing
 steps specified by other options are applied; this differs from -C, which
 does not tag but does normalise the input)

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/trunk@5698 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
---
 bin/MElt.in             |  8 +++++---
 bin/MElt_postprocess.pl | 12 ++++++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/bin/MElt.in b/bin/MElt.in
index f44b483..22104c2 100644
--- a/bin/MElt.in
+++ b/bin/MElt.in
@@ -28,13 +28,14 @@ LOWERCASE_UPPERCASE_SENTENCES=
 FEATURE_SELECTION_PARAMETERS=
 LANGUAGE=fr
 
-while getopts :m:f:l:cvthrndeszx:NPLTCZKS o
+while getopts :m:f:l:cvthrndeszx:NPLTCZKSM o
 do case "$o" in
 	v)  echo $VERSION && exit 1;;
-	h)  echo "Usage: MElt [ -cvthrndesNPLTCZKS ] [ -m /path/to/model | -l language ] < input > output" \
+	h)  echo "Usage: MElt [ -cvthrndesNPLTCZKSM ] [ -m /path/to/model | -l language ] < input > output" \
 	    && echo "MODEL SELECTION" \
             && echo "  -m	[/path/to/model] Use the MElt model given instead of the default one (which is the model 'fr' for French, trained on the FTB and the Lefff, installed in @datadir@/fr)" \
             && echo "  -l	[language] Use the default MElt model for language 'language' instead of the default 'fr' (the MElt model folder must be in @datadir@)" \
+            && echo "  -M	Do not tag, only apply tokenisation and other pre-processing steps triggered by other options" \
 	    && echo "TOKENISATION AND 'NAMED ENTITY' DETECTION" \
             && echo "  -t	Use the SxPipe-melt-light tokeniser before tagging (sxpipe-melt-light, included in MElt's distribution, is a very lightweight version of SxPipe, a full-featured pre-parsing processing chain); option -r has no effect if -t is used ; -t is not compatible with -c and -S" \
             && echo "  -x	Provide additional options to SxPipe-melt-light (e.g., for processing oral transcripts, '-x -ot' is recommended" \
@@ -48,7 +49,7 @@ do case "$o" in
 	    && echo "  -n       Normalise text before tagging, and restore initial tokens afterwards (tags are redistributed on original tokens, non-standard amalgams are tagged X, components within non-standard compounds are tagged Y)" \
 	    && echo "  -N       Normalise text before tagging, and restore initial tokens afterwards (tags are redistributed on original tokens, non-standard amalgams are assigned tags of the form T1+...+Tn, components within non-standard compounds receive the tag GW, except for the last one which receives the compound's overall tag)" \
 	    && echo "  -K       (to be used with -n or -N) Keep both original tokens (in the form of SxPipe-like comments) and normalised tokens (which will be associated with the POS tags)" \
-	    && echo "  -C       Do not perform any POS tagging, outputs the result of the normalisation step" \
+	    && echo "  -C       Normalise text but do not perform any POS tagging (the output is the result of the normalisation step)" \
 	    && echo "MISC OPTIONS" \
             && echo "  -c	Handle SxPipe-like 'comments' (arbitrary sequence of characters preceeding a token and surrounded by curly brackets; implies -r, incompatible with -T or -t)" \
             && echo "  -d	Downcase sentences which are fully uppercase" \
@@ -78,6 +79,7 @@ do case "$o" in
 	    ;;
 	C)  CAT_OR_CORRECTOR="${BINDIR}/MElt_normaliser.pl"
 	    DO_TAGGING=0;;
+	M)  DO_TAGGING=0;;
 	K)  KEEP_COMMENTS="-ktfd";;
 	L)  DO_LEMMATISE=1;;
 	t)  if [ "z$HANDLE_COMMENTS" = "z-c" ]
diff --git a/bin/MElt_postprocess.pl b/bin/MElt_postprocess.pl
index e619ef7..badf7f0 100755
--- a/bin/MElt_postprocess.pl
+++ b/bin/MElt_postprocess.pl
@@ -42,10 +42,14 @@ if ($lang eq "zzz" || $no_post_process) {
 #      s/_ACC_F/\\}/g;
     } else {
       # in non-normalising mode, situations with {} require ignoring original tokens and using splitted forms (e.g. "don't > do n't" for English)
-      s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g;
-      s/{} *//g;
-      
-      s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge;
+      if (/\//) { # did we tag?
+	s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g;
+	s/{} *//g;
+	
+	s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge;
+      } elsif ($keep_token_form_distinction == 0) {
+	s/{[^{}]*} //g;
+      }
       s/◁/{/g;
       s/▷/}/g;
 #      s/_ACC_O/{/g;
-- 
GitLab