From 9b0a5cd6a92a1243615561da646f049d5bfa21b6 Mon Sep 17 00:00:00 2001 From: Benoit Sagot <benoit.sagot@inria.fr> Date: Wed, 18 Jan 2017 08:28:31 +0000 Subject: [PATCH] git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/trunk@5737 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4 --- bin/MElt_postprocess.pl | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/bin/MElt_postprocess.pl b/bin/MElt_postprocess.pl index ca6b2cf..426fcd5 100755 --- a/bin/MElt_postprocess.pl +++ b/bin/MElt_postprocess.pl @@ -47,21 +47,25 @@ if ($lang eq "zzz" || $no_post_process || $tokeniser_mode) { # s/_ACC_F/\\}/g; } else { # in non-normalising mode, situations with {} require ignoring original tokens and using splitted forms (e.g. "don't > do n't" for English) - if (!/(^| )[^ \/{}]+( |$)/) { # did we tag? + my $tmp = $_; + $tmp =~ s/{.*?} *//g; + if ($tmp =~ /(^| )([^ \/{}]+)( |$)/) { # did we tag? + # no we did not tag + if ($keep_token_form_distinction == 0) { + if (!$tokeniser_mode) { + s/{ *([^}]*?) *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) /process_transition_with_slash($1,$2)." "/ge; + } + s/{[^{}]*} *//g; + if ($tokeniser_mode) { + s/_ACC_O/{/g; + s/_ACC_F/}/g; + } + } + } else { # yes we did tag s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g; s/{} *//g; s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; - } elsif ($keep_token_form_distinction == 0) { - # no we did not tag - if (!$tokeniser_mode) { - s/{ *([^}]*?) *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) /process_transition_with_slash($1,$2)." "/ge; - } - s/{[^{}]*} //g; - if ($tokeniser_mode) { - s/_ACC_O/{/g; - s/_ACC_F/}/g; - } } s/◁/{/g; s/▷/}/g; -- GitLab