diff --git a/bin/MElt_postprocess.pl b/bin/MElt_postprocess.pl index ca6b2cf8e8dfe275ae786b208985b26c0c77e57c..426fcd54bc425f5167437c8f5bc4cc56453b94d8 100755 --- a/bin/MElt_postprocess.pl +++ b/bin/MElt_postprocess.pl @@ -47,21 +47,25 @@ if ($lang eq "zzz" || $no_post_process || $tokeniser_mode) { # s/_ACC_F/\\}/g; } else { # in non-normalising mode, situations with {} require ignoring original tokens and using splitted forms (e.g. "don't > do n't" for English) - if (!/(^| )[^ \/{}]+( |$)/) { # did we tag? + my $tmp = $_; + $tmp =~ s/{.*?} *//g; + if ($tmp =~ /(^| )([^ \/{}]+)( |$)/) { # did we tag? + # no we did not tag + if ($keep_token_form_distinction == 0) { + if (!$tokeniser_mode) { + s/{ *([^}]*?) *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) /process_transition_with_slash($1,$2)." "/ge; + } + s/{[^{}]*} *//g; + if ($tokeniser_mode) { + s/_ACC_O/{/g; + s/_ACC_F/}/g; + } + } + } else { # yes we did tag s/{ *[^}]+? *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) +{} */$1 /g; s/{} *//g; s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; - } elsif ($keep_token_form_distinction == 0) { - # no we did not tag - if (!$tokeniser_mode) { - s/{ *([^}]*?) *} *([^ ]+?\/[^ \/]+(?:\/[0-9\.]+)?) /process_transition_with_slash($1,$2)." "/ge; - } - s/{[^{}]*} //g; - if ($tokeniser_mode) { - s/_ACC_O/{/g; - s/_ACC_F/}/g; - } } s/â—/{/g; s/â–·/}/g;