diff --git a/sxpipe-melt/segmenteur.pl b/sxpipe-melt/segmenteur.pl index 4c638f92e639c775a613dab34d1dd28671fb2bec..09429b42a6694f8172eb29a96ad8a1d9c6f20917 100755 --- a/sxpipe-melt/segmenteur.pl +++ b/sxpipe-melt/segmenteur.pl @@ -673,7 +673,7 @@ sub tokenize_sequence { } s/(?<=[^\}]) Mr +\. / {Mr .} Mr. /go; s/(?<=[^\}]) (autocad) / {$1} _SPECWORD_AutoCAD /gi; - + if ($lang eq "en") { if ($expand_contractions) { s/(?<=[^\}]) ([cC]a)n't / {\1n't} \1n _UNSPLIT_not /goi; @@ -686,7 +686,9 @@ sub tokenize_sequence { s/(?<=[^\}]) (I|you|he|she|we|they|there)'(ll) / {\1'll} \1 _UNSPLIT_will /goi; s/(?<=[^\}]) (they)'(re) / {\1're} \1 _UNSPLIT_are /goi; s/(?<=[^\}]) ([^ ]*[^ s_])'s / {\1's} \1 _UNSPLIT_'s /goi; + s/(?<=[^\}]) ([^ _][^ ]*s)'(?=[a-z] )/ \1 _REGLUE___APOS__/goi; s/(?<=[^\}]) ([^ _][^ ]*s)'(?!s |\}.)/ {\1'} \1 _UNSPLIT_'s /goi; + s/__APOS__/'/g; } else { s/(?<=[^\}]) ([cC]a)n't / {\1n't} \1n _UNSPLIT_n't /goi; s/(?<=[^\}]) ([Ww])on't / {\1on't} \1ill _UNSPLIT_n't /goi; @@ -698,7 +700,9 @@ sub tokenize_sequence { s/(?<=[^\}]) (they)'(re) / \1 '\2 /goi; s/(?<=[^\}]) ([^ ]*[^ s_])'s / \1 's /goi; s/(?<=[^\}]) ([^ ]*[^ s_]){''} " ((?:_REGLUE_)?)s / {\1''s} \1 \2's /goi; + s/(?<=[^\}]) ([^ _][^ ]*s)'(?=[a-z] )/ \1 _REGLUE___APOS__/goi; s/(?<=[^\}]) ([^ _][^ ]*s)'(?!s |\}.)/ \1 {'} 's /goi; + s/__APOS__/'/g; } } elsif ($lang eq "fr") { s/(?<=[^\}]) ([Ss]) ' (\S+)/ {\1 '} \1' \2/goi;