diff --git a/bin/MElt.in b/bin/MElt.in index 5a8cd6c678e0881ba8d6fa79e0137e1b6be1e916..603e17c1816cc75fa537242e048e911bd52ec39e 100644 --- a/bin/MElt.in +++ b/bin/MElt.in @@ -81,8 +81,7 @@ do case "$o" in C) CAT_OR_CORRECTOR="${BINDIR}/MElt_normaliser.pl" DO_TAGGING=0;; M) DO_TAGGING=0; - SXPIPE_OPTIONS2="-tok -ndd"; - POSTPROCESS="${BINDIR}/MElt_postprocess.pl -t";; # tokeniser mode (replace {<something>} _URL by <something>, etc + SXPIPE_OPTIONS2="-tok -ndd";; # tokeniser mode (replace {<something>} _URL by <something>, etc K) KEEP_COMMENTS="-ktfd";; L) DO_LEMMATISE=1;; t) if [ "z$HANDLE_COMMENTS" = "z-c" ] @@ -173,7 +172,12 @@ done if [ $DO_TAGGING -eq 0 ] then - POSTPROCESS="${BINDIR}/MElt_postprocess.pl -npp" + if [ "x$SXPIPE_OPTIONS2" = "x-tok -ndd" ] + then + POSTPROCESS="${BINDIR}/MElt_postprocess.pl -t" + else + POSTPROCESS="${BINDIR}/MElt_postprocess.pl -npp" + fi fi if [ "$MODEL" = "unspecified_model" ] diff --git a/bin/MElt_postprocess.pl b/bin/MElt_postprocess.pl index 7052f8315cb5a482eb3adfddcca2537172dc98c4..61415a014704ad0bd34c6709d7fd6ffdbd795775 100755 --- a/bin/MElt_postprocess.pl +++ b/bin/MElt_postprocess.pl @@ -51,10 +51,11 @@ if ($lang eq "zzz" || $no_post_process || $tokeniser_mode) { s/{ *([^}]*?) *} *[^ ]+?(\/[^ \/]+(?:\/[0-9\.]+)?) /replace_whitespaces_with_underscores($1).$2." "/ge; } elsif ($keep_token_form_distinction == 0) { + s/{[^{}]*} //g; if ($tokeniser_mode) { - s/{([^{}]*)} _(?:URL|ROMNUM|NUMBER|NUM)/$1/g; + s/_ACC_O/{/g; + s/_ACC_F/}/g; } - s/{[^{}]*} //g; } s/â—/{/g; s/â–·/}/g; diff --git a/normalisation/en.normalisationdata/ngrams b/normalisation/en.normalisationdata/ngrams index a98f49e6510ba9b01bb12da36eefa89d7319dc37..4baf95a5ac163602e3a30d75462cb1bc9959a34b 100644 --- a/normalisation/en.normalisationdata/ngrams +++ b/normalisation/en.normalisationdata/ngrams @@ -221,10 +221,11 @@ There ain't There are n't 0 ain't I am n't I 0 I ain't I am n't 0 i ain't I am n't 0 +you ain't you are n't 0 +we ain't we are n't 0 ain't is n't 0 wanna want to 0 gonna going to 0 -ain't is n't 0 Gonna Going to 0 Wanna Want to 0 I gotta I have got to 0 @@ -412,4 +413,8 @@ yea yeah 48 yess yes 11 yo you 45 yu you 50 -yuh you 11 +([^ ]+in)' $1g 0 +jes' just 0 +cannae can n't 0 +'em them 0 +las' last 0 diff --git a/sxpipe-melt/MElt_finalise_tokenisation.pl b/sxpipe-melt/MElt_finalise_tokenisation.pl index e293cb2ba14f83ba1e84932854e3a11ad958b6e4..d6f1ec1fe25f69d317f28d3536fb73a701b9a571 100644 --- a/sxpipe-melt/MElt_finalise_tokenisation.pl +++ b/sxpipe-melt/MElt_finalise_tokenisation.pl @@ -17,12 +17,12 @@ while (1) { while (<>) { chomp; s/_ACC_([OF])/_ACC\1/g; - s/(} *_[A-Za-z_]+[A-Za-z])_[^_]+_/\1/g; + s/(} *_[A-Za-z_]+[A-Za-z])_[^_]+_( |$)/\1\2/g; if ($tokeniser_mode) { s/{([^{}]*)} *_(?:SENT_BOUND) //g; s/(^| ){([^{}]*)} *_(?:SENT_BOUND)$//g; - s/{([^{}]*)} *_(?:ROMNUM|NUM|URL|EMAIL|META[^ ]*)( |$)/"{".$1."} ".remove_blanks($1).$2/ge; + s/{([^{}]*)} *_(?:ROMNUM|NUM|URL|EMAIL|SMILEY|META[^ ]*)( |$)/"{".$1."} ".remove_blanks($1).$2/ge; s/{([^{}]*)} *_NPREF /$1/g; } else { s/{([0-9 ]+)([^}]*)} *_NUM( |$)/"{$1$2} ".remove_blanks($1).$3/ge; @@ -31,13 +31,16 @@ while (<>) { s/{([^{}]*)} _XML {/{_<_\1_>_/g; s/} *([^ {}]+) *{([^{}]+)} _XML/_<_\2_>_} \1/g; - s/_ACC([OF])/_ACC_\1/g; if ($tokeniser_mode) { - s/_/ /g; - s/(^|[^\\])\\(.)/$1$2/g; s/(^|[^\\])\\(.)/$1$2/g; + s/_(UNDERSCORE|ACC[OF])/ÊƒÉ™Æ™Ê€É›Æ É±ÉœÊ‚Ê‚ÉÊ¥$1/g; + s/_/ /g; + s/ÊƒÉ™Æ™Ê€É›Æ É±ÉœÊ‚Ê‚ÉÊ¥UNDERSCORE/_/g; + s/ÊƒÉ™Æ™Ê€É›Æ É±ÉœÊ‚Ê‚ÉÊ¥/_/g; } + + s/_ACC([OF])/_ACC_\1/g; print "$_\n"; } diff --git a/sxpipe-melt/caponlysentences.pl b/sxpipe-melt/caponlysentences.pl index 2fe2fb0b85e1556f430e867b2f4eb503ff5ea74c..1f3e43a9d83366852004fd16873844787f3a9fa8 100755 --- a/sxpipe-melt/caponlysentences.pl +++ b/sxpipe-melt/caponlysentences.pl @@ -191,6 +191,8 @@ if ($lang =~ /^(de|ja|ko|zh|tw|ar)$/) { unless ($decap =~ /[A-ZÐБВГДЕÐЖЗИЙІÌКЛМÐОПРСТУФХЦЧШЩЪЫЬѢÐЮЯѲѴӘҒҚҢӨҰҮҺİÃÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÃÎĨĬÃĹĽÅŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÃŶÿŹáºÅ»Å½a-zабвгдеёжзийіìклмнопрÑтуфхцчшщъыьѣÑÑŽÑѳѵәғқңөұүһİáà âäąãăåćÄçÄéèêëęěğìÃîĩÄïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżž_0-9-]{4}[\.\?\!] $/o && /^[A-ZÐБВГДЕÐЖЗИЙІÌКЛМÐОПРСТУФХЦЧШЩЪЫЬѢÐЮЯѲѴӘҒҚҢӨҰҮҺİÃÀÂÄĄÃĂÅĆČÇĎÉÈÊËĘĚĞÌÃÎĨĬÃĹĽÅŃÑŇÒÓÔÕÖØŔŘŚŠŞŤŢÙÚÛŨÜǓỲÃŶÿŹáºÅ»Å½a-zабвгдеёжзийіìклмнопрÑтуфхцчшщъыьѣÑÑŽÑѳѵәғқңөұүһİáà âäąãăåćÄçÄéèêëęěğìÃîĩÄïĺľłńñňòóôõöøŕřśšşťţùúûũüǔỳýŷÿźẑżž_0-9-]/o) { if ($lang eq "tr" && $c eq "I") { $c = "ı"; + } elsif ($lang eq "en" && $c eq "I" && $decap =~ /(^| )$/ && /^[ ,]/) { + $c = "I"; } else { $c = ext_lc($c); } diff --git a/sxpipe-melt/gl_format.pl b/sxpipe-melt/gl_format.pl index 028b25a1a34399f7ad79343cdc4c9b234f735eeb..bcb11e43f6deda6f3d85c3b82a0862d380137cd7 100755 --- a/sxpipe-melt/gl_format.pl +++ b/sxpipe-melt/gl_format.pl @@ -21,8 +21,8 @@ while (<>) { s/\s*$/ /o; # reconnaissance - s/(\s)_UNDERSCORE([^\s_]+)_UNDERSCORE(\s)/$1\{_UNDERSCORE$2\_UNDERSCORE\} $2$3/o; - s/(\s)\*([^ _\*]+)\*(\s)/$1\{*$2\*\} $2$3/o; + s/(\s)_UNDERSCORE([^\s_{}]+)_UNDERSCORE(\s)/$1\{_UNDERSCORE$2\_UNDERSCORE\} $2$3/o; + s/(\s)\*([^ _\*{}]+)\*(\s)/$1\{*$2\*\} $2$3/o; # sortie s/^ //o; diff --git a/sxpipe-melt/gl_number.pl b/sxpipe-melt/gl_number.pl index 6cc4defb1450dc71080a2db2075d67a7edac19d0..a5c567a5eede4e5c833fe8536eb415d61ab33104 100755 --- a/sxpipe-melt/gl_number.pl +++ b/sxpipe-melt/gl_number.pl @@ -220,10 +220,10 @@ while (<>) { # remet M. en mode no word segmentation au lieu de _META_TEXTUAL_GN _SENT_BOUND if ($no_sw) { - s/{(Mr\.?|M\.|Mme|Miss|Mrs\.?|Sir|Lady)}_META_TEXTUAL_GN/\1/g; + s/{(Mr\.?|M\.|Mme|Miss|Mrs\.?|Sir|Lady|Sgt\.?)}_META_TEXTUAL_GN/\1/g; } - s/((?:Mr\.?|M\.|Mme|Miss|Mrs\.?|Sir|Lady) )\{([A-ZÉÃÀÂÊÛÎÔÄËÜÃÖÇ])\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT/$1$2$3./g; + s/((?:Mr\.?|M\.|Mme|Miss|Mrs\.?|Sir|Lady|Sgt\.?) )\{([A-ZÉÃÀÂÊÛÎÔÄËÜÃÖÇ])\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT/$1$2$3./g; s/$listnumprefix\{([G-ZÉÃÀÂÊÛÎÔÄËÜÃÖÇ])\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT/$1$2./g; # \1 est dans listnumprefix if ($lang =~ /^(?:fr|en|es|pt|it|ro)$/) { s/ ((?:pp|[pnv])\.?) \{(\d+)\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT/ $1 $2$3./g; @@ -234,10 +234,15 @@ while (<>) { } if ($lang eq "en") { # le mot "a" ne peut terminer une phrase... donc on va dire artificiellement que " a[.\)...]" désigne tjs un _META_TEXTUAL_truc -# s/ ([Aa])([\.\)\/\]\-\Â])(?=[^0-9])/ \{$1\}_META_TEXTUAL_GN\{$2\}_META_TEXTUAL_PONCT/go; # a. TROP GREEDY - s/ ([Aa])([\.\)\]])( *)(?=[^0-9mM ])/ \{$1\}_META_TEXTUAL_GN\{$2\}_META_TEXTUAL_PONCT\3/go; # a. ATTENTION, TRES RISQUÉ + s/ ([Aa])(\.)( *)(?=[^0-9mM ])/ \{$1\}_META_TEXTUAL_GN\{$2\}_META_TEXTUAL_PONCT\3/go; # a. ATTENTION, TRES RISQUÉ + unless ($less_lists) { + s/ ([Aa])([\)\]])( *)(?=[^0-9mM ])/ \{$1\}_META_TEXTUAL_GN\{$2\}_META_TEXTUAL_PONCT\3/go; # a. ATTENTION, TRES RISQUÉ + } # ... sauf dans certains contextes gauches s/((?:Mr\.?|M\.|Mme|Miss|Mrs\.?|Sir|Lady) )\{([Aa])\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT/\1\2\3./g; + # ... et droite + s/\{([Aa])\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT(\.)/\1.\2/g; + s/\{A\}_META_TEXTUAL_GN\{\.\}_META_TEXTUAL_PONCT([A-Z]\.)/A.\1/g; } if ($less_lists) { diff --git a/sxpipe-melt/rebuild_easy_tags.pl b/sxpipe-melt/rebuild_easy_tags.pl index 3c55d18ff489422ce2c2495e74ba394f96604c67..1378f6e0fec7e5fdfd7914dcbd5458029d138fdd 100755 --- a/sxpipe-melt/rebuild_easy_tags.pl +++ b/sxpipe-melt/rebuild_easy_tags.pl @@ -7,8 +7,6 @@ binmode STDERR, ":utf8"; $| = 1; -$e=0; - # Construction des commentaires au format XML issu de Easy, i.e. { <F id=\"E$iF$j\">token</F> } # En entrée, chaque mot peut ne pas avoir de commentaire, ou avoir un commentaire non-XML: {token} mot # Il peut y avoir plusieurs tokens dans un même commentaire : { <F id=\"E$iF$j\">token1</F> <F id=\"E$iF$j+1\">token2</F> <F id=\"E$iF$j+2\">token3</F> } @@ -26,7 +24,12 @@ $e=0; # {a} b _REGLUE_c _UNSPLIT_d ==> { <F id="E1F1">ac</F> } b { <F id="E1F1">ac</F> } c { <F id="E1F1">ac</F> } d # Il n'est pas prévu que _(REGLUE|UNSPLIT)_b ait déjà un commentaire à lui. Si c'est le cas, la sortie est incorrecte. -$no_sf = 0; # no split forms (i.e., EASy sous-mots ou FTB sous-mots) + +my $MElt_tokeniser_mode = 0; # no split forms (i.e., EASy sous-mots ou FTB sous-mots) +my $no_sf = 0; #no replace _UNDERSCORE by _ + +my $e=0; + if ($lang =~ /^(fa|ckb)$/) { $no_sf = 1; } @@ -36,6 +39,7 @@ while (1) { if (/^$/) {last;} elsif (/^-l$/ || /^-lang$/i) {$lang=shift;} elsif (/^-no_sf$/) {$no_sf=1;} + elsif (/^-tok$/) {$MElt_tokeniser_mode=1;} } while (<>) { @@ -87,7 +91,7 @@ while (<>) { s/^ +//g; s/ +$//g; if ($commentaire && $_!~/^(_REGLUE_|_UNSPLIT_)/) { - $tobeprinted =~ s/_UNDERSCORE/_/g; + $tobeprinted =~ s/_UNDERSCORE/_/g unless $MElt_tokeniser_mode; $line .= "$tobeprinted"; $tobeprinted=""; } @@ -133,7 +137,7 @@ while (<>) { $tobeprinted.=" $_ "; } } - $tobeprinted =~ s/_UNDERSCORE/_/g; + $tobeprinted =~ s/_UNDERSCORE/_/g unless $MElt_tokeniser_mode; $line .= $tobeprinted; diff --git a/sxpipe-melt/segmenteur.pl b/sxpipe-melt/segmenteur.pl index 09429b42a6694f8172eb29a96ad8a1d9c6f20917..b88092650b7e669e72d58c81da1de9588dc57fef 100755 --- a/sxpipe-melt/segmenteur.pl +++ b/sxpipe-melt/segmenteur.pl @@ -249,6 +249,7 @@ while (<STDIN>) { my $inputline = $_; my $line = ""; + if ($xml) { while ($inputline =~ s/(<[^>]*){([^}>]+)} _[^ {}<>]+ /$1$2/g) {} while ($inputline =~ s/^(.*?)(<[^>]+>(?:\s*<[^>]+>)*)//) { @@ -458,12 +459,12 @@ sub tokenize_sequence { $sq = s/(?<=[Ss])\'(?=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])/'/g; if ($sq == 0 && $lq == 0 && $rq_no_s == 0) { } elsif ($sq == 0 && $lq == $rq_no_s) { - s/(?<=[ \(\[])([\'\`])([^ \'](?:[^\'])*?[^ \'sS])\'(?=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])/ {\1} ` \2 ' /g; # les apostrophes peuvent servir à quoter... + s/(?<=[ \(\[])([\'\`])(?!em )([^ \'](?:[^\'])*?[^ \'sS])\'(?=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])/ {\1} ` \2 ' /g; # les apostrophes peuvent servir à quoter... } elsif ($sq == 1 && $lq == $rq_no_s) { - s/(?<=[ \(\[])(['\`])([^ '](?:[^']|[sS]')*?[^ 'sS])'(?=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])/ {\1} ` \2 ' /g; # les apostrophes peuvent servir à quoter... + s/(?<=[ \(\[])(['\`])(?!em )([^ '](?:[^']|[sS]')*?[^ 'sS])'(?=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])/ {\1} ` \2 ' /g; # les apostrophes peuvent servir à quoter... } else { $_ = reverse($_); - s/(?<=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])'([^ '](?:[^']|'[sS])*?[^ '])(['\`])(?=[ \(\[])/ ' \1 ` \}\2\{ /g; # les apostrophes peuvent servir à quoter... + s/(?<=[ ,;?\!:\"“â€\)\(\*\#<>\[\]\%\/\\\=\+\«\»—–\Ë\&\`\.])'(?!em )([^ '](?:[^']|'[sS])*?[^ '])(['\`])(?=[ \(\[])/ ' \1 ` \}\2\{ /g; # les apostrophes peuvent servir à quoter... $_ = reverse($_); s/{`} ` /` /g; } @@ -676,40 +677,49 @@ sub tokenize_sequence { if ($lang eq "en") { if ($expand_contractions) { + #TODO: report here improvements made to the !$expand_contractions section s/(?<=[^\}]) ([cC]a)n't / {\1n't} \1n _UNSPLIT_not /goi; s/(?<=[^\}]) ([Ww])on't / {\1on't} \1ill _UNSPLIT_not /goi; s/(?<=[^\}]) ([^ _][^ ]*)n't / {\1n't} \1 _UNSPLIT_not /goi; s/(?<=[^\}]) ([Ii])'m / {\1'm} I _UNSPLIT_am /goi; s/(?<=[^\}]) ([Yy]ou|[Ww]e)'re / {\1're} \1 _UNSPLIT_are /goi; s/(?<=[^\}]) (I|you|we|they|should|would)'(ve) / {\1've} \1 _UNSPLIT_have /goi; - s/(?<=[^\}]) (I|you|he|she|we|they|there)'(d) / {\1'd} \1 _UNSPLIT_would /goi; - s/(?<=[^\}]) (I|you|he|she|we|they|there)'(ll) / {\1'll} \1 _UNSPLIT_will /goi; + s/(?<=[^\}]) (I|you|he|she|we|they|there|this|that|it)'(d) / {\1'd} \1 _UNSPLIT_would /goi; + s/(?<=[^\}]) (I|you|he|she|we|they|there|this|that|it)'(ll) / {\1'll} \1 _UNSPLIT_will /goi; s/(?<=[^\}]) (they)'(re) / {\1're} \1 _UNSPLIT_are /goi; s/(?<=[^\}]) ([^ ]*[^ s_])'s / {\1's} \1 _UNSPLIT_'s /goi; s/(?<=[^\}]) ([^ _][^ ]*s)'(?=[a-z] )/ \1 _REGLUE___APOS__/goi; - s/(?<=[^\}]) ([^ _][^ ]*s)'(?!s |\}.)/ {\1'} \1 _UNSPLIT_'s /goi; + s/(?<=[^\}]) ([^ _][^ ]*s)'(?![SsDd] |[lL][lL]|\}.)/ {\1'} \1 _UNSPLIT_'s /goi; s/__APOS__/'/g; } else { - s/(?<=[^\}]) ([cC]a)n't / {\1n't} \1n _UNSPLIT_n't /goi; + #preliminary corrections + s/(?<=[^\}]) ((?:some|every|any|no)(?:one|body))(s)' / {$1} $1 _REGLUE_'$2 /goi; + #processing + s/(?<=[^\}]) ([cC][aA])([Nn])('[tT]) / {$1$2$3} $1$2 _UNSPLIT_$2$3 /go; + s/(?<=[^\}]) WON'T / {WON'T} WILL _UNSPLIT_N'T /go; s/(?<=[^\}]) ([Ww])on't / {\1on't} \1ill _UNSPLIT_n't /goi; - s/(?<=[^\}]) ([^_ ][^ ])n't / {\1n't} \1 _UNSPLIT_n't /goi; - s/(?<=[^\}]) ([Ii])'m / {\1} I 'm /goi; - s/(?<=[^\}]) ([Yy]ou|[Ww]e)'re / \1 're /goi; - s/(?<=[^\}]) (I|you|we|they|should|would)'(ve) / \1 '\2 /goi; - s/(?<=[^\}]) (I|you|he|she|we|they|there)'(d|ll) / \1 '\2 /goi; + s/(?<=[^\}]) ([^_ ][^ ])N'T / {\1N'T} \1 _UNSPLIT_N'T /go; + s/(?<=[^\}]) ([^_ ][^ ])n't / {\1n't} \1 _UNSPLIT_n't /goi; + s/(?<=[^\}]) ([Ii])'([mMdD]|[lL][lL]) / {$1} I '$2 /go; + s/(?<=[^\}]) (YOU|WE)'RE / \1 'RE /go; + s/(?<=[^\}]) ([Yy][Oo][Uu]|[Ww][Ee])'re / \1 're /goi; + s/(?<=[^\}]) (you|we|they|should|would)'(ve) / \1 '\2 /goi; + s/(?<=[^\}]) (you|he|she|we|they|there|this|that|it)'(d|ll) / \1 '\2 /goi; s/(?<=[^\}]) (they)'(re) / \1 '\2 /goi; - s/(?<=[^\}]) ([^ ]*[^ s_])'s / \1 's /goi; - s/(?<=[^\}]) ([^ ]*[^ s_]){''} " ((?:_REGLUE_)?)s / {\1''s} \1 \2's /goi; + s/(?<=[^\}]) ([^ ]*[^ s_])'([sS]) / $1 '$2 /go; + s/(?<=[^\}]) ([^ ]*[^ s_]){''} " ((?:_REGLUE_)?)([smd]|re|ve|ll) / {\1''s} \1 \2'\3 /goi; s/(?<=[^\}]) ([^ _][^ ]*s)'(?=[a-z] )/ \1 _REGLUE___APOS__/goi; - s/(?<=[^\}]) ([^ _][^ ]*s)'(?!s |\}.)/ \1 {'} 's /goi; + s/(?<=[^\}]) (jes|las)'(?= )/ \1__APOS__/goi; # jes' = jus' = just + s/(?<=[^\}]) ([^ _][^ ]*)(s)'(?![SsDd] |[lL][lL]|\}.)/ \1\2 {'} '\2 /go; s/__APOS__/'/g; + s/([a-z]{2,})'(ll) /$1 '$2 /goi; } } elsif ($lang eq "fr") { s/(?<=[^\}]) ([Ss]) ' (\S+)/ {\1 '} \1' \2/goi; s/(?<=[^\}]) (\') / {$1} " /go; s/ ([ldnmst]) ([aeéiouy]\S+)/ {\1} \1' \2/goi; } - + s/(?<=[^\}]) \'([^ ]+)\' / {'\1'} ' \1 ' /goi; if (!$no_sw) { @@ -792,6 +802,8 @@ sub tokenize_sequence { s/(?<=[^\}])([- ])([Rr])([eé]num[eé]ration) /$1\{$2$3} $2émunération /go; s/(?<=[^\}])([- ])c (est|ets) /$1\{c} c' \{$2} est /go; } elsif ($lang eq "en") { + #dots for frequent abbrevs + s/\b(mrs?|sgt|ms|dr|prof|lt) \. ([A-Z])/$1. $2/goi; #abréviations courantes s/(?<=[^\}])([- ])(acct(?: ?\.)?) /$1\{$2} account /g; s/(?<=[^\}])([- ])(addl(?: ?\.)?) /$1\{$2} additional /g; @@ -826,7 +838,7 @@ sub tokenize_sequence { s/(?<=[^\}])([- ])(co(?: ?\.)?) /$1\{$2} company /g; s/(?<=[^\}])([- ])(hr(?: ?\.)?) /$1\{$2} hour /g; s/(?<=[^\}])([- ])(hrs(?: ?\.)?) /$1\{$2} hours /g; - s/(?<=[^\}])([- ])(mo(?: ?\.)?) /$1\{$2} month /g; + #s/(?<=[^\}])([- ])(mo(?: ?\.)?) /$1\{$2} month /g; #s/(?<=[^\}])([- ])(mon(?: ?\.)?) /$1\{$2} Monday /g; #s/(?<=[^\}])([- ])(tue(?: ?\.)?) /$1\{$2} Tuesday /g; #s/(?<=[^\}])([- ])(wed(?: ?\.)?) /$1\{$2} Wednesday /g; @@ -838,8 +850,8 @@ sub tokenize_sequence { s/(?<=[^\}])([- ])(abt) /$1\{$2} about /g; s/(?<=[^\}])([- ])(jr(?: ?\.)?) /$1\{$2} junior /g; s/(?<=[^\}])([- ])(jnr(?: ?\.)?) /$1\{$2} junior /g; - s/(?<=[^\}])([- ])(mo(?: ?\.)?) /$1\{$2} month /g; - s/(?<=[^\}])([- ])(mos(?: ?\.)?) /$1\{$2} months /g; + #s/(?<=[^\}])([- ])(mo(?: ?\.)?) /$1\{$2} month /g; + #s/(?<=[^\}])([- ])(mos(?: ?\.)?) /$1\{$2} months /g; s/(?<=[^\}])([- ])(sr(?: ?\.)?) /$1\{$2} senior /g; s/(?<=[^\}])([- ])(co-op) /$1\{$2} cooperative /g; s/(?<=[^\}])([- ])(co(?: ?\.)?) /$1\{$2} company /g; diff --git a/sxpipe-melt/sxpipe-melt-light.conf.in b/sxpipe-melt/sxpipe-melt-light.conf.in index 59d40c3b9795aee852929f9820435daf7d32d481..c8483d539d087c42b4398401aad3847da729ee53 100644 --- a/sxpipe-melt/sxpipe-melt-light.conf.in +++ b/sxpipe-melt/sxpipe-melt-light.conf.in @@ -91,7 +91,7 @@ cmd = $melt/remove_inner_ne.pl desc = desencapsulation des entitées nommées [caponly] -cmd = $melt/caponlysentences.pl +cmd = $melt/caponlysentences.pl -l $lang desc = minusculisation des phrases (presque) entièrement en majuscules [segment] @@ -100,7 +100,7 @@ options = -a -ca -af=@pkgdatadir@/pctabr -p=r $* desc = segmentation [rebuild] -cmd = $melt/rebuild_easy_tags.pl -no_sf +cmd = $melt/rebuild_easy_tags.pl -no_sf $* [sxpipe2melt] cmd = $melt/sxpipe2melt.pl diff --git a/sxpipe-melt/sxpipe-melt.conf.in b/sxpipe-melt/sxpipe-melt.conf.in index 1704e481e6b79eb4262328647179b021209aa852..b22ade221ac7a51c828093001457ea2fff88820a 100644 --- a/sxpipe-melt/sxpipe-melt.conf.in +++ b/sxpipe-melt/sxpipe-melt.conf.in @@ -100,7 +100,7 @@ options = -l $lang -ll $* desc = reconnaissance des nombres [caponly] -cmd = $melt/caponlysentences.pl +cmd = $melt/caponlysentences.pl -l $lang desc = minusculisation des phrases (presque) entièrement en majuscules [segment]