diff --git a/bin/MElt-train.in b/bin/MElt-train.in index d8b9091ab2109247ed0bc12ab1bee1d8755d2727..c495313acc602ce165158c436304b2c4eac2fcc7 100644 --- a/bin/MElt-train.in +++ b/bin/MElt-train.in @@ -69,7 +69,7 @@ if [ $TRAIN_LEMMATISER -eq 0 ] then if [ $LEXMAPPING_FOR_LEMMATISER -eq 1 ] then - echo "Error: option -M (compute lexical mapping for the lemmatiser) requires option -s (do build lemmatization data)" >&2 + echo "Error: option -M (compute lexical mapping for the lemmatiser) requires option -s (do build lemmatisation data)" >&2 exit 4 fi fi @@ -113,7 +113,7 @@ then echo "-S ${FEATURE_SELECTION_PARAMETERS}" > "$1/options" if [ $TRAIN_LEMMATISER -eq 1 ] then - echo Building the lemmatization database && \ + echo Building the lemmatisation database && \ PYTHONPATH=$PYTHONPATH perl @bindir@/MElt_lemmatiser-train.pl -m "$1" if [ $LEXMAPPING_FOR_LEMMATISER -eq 1 ] then diff --git a/bin/MElt_lemmatiser-train.pl.in b/bin/MElt_lemmatiser-train.pl.in index 8db1539508f8af57866ef079371e9a3242bbf987..d972f4f18e5c6cbb95634fcc3b7527fe0c6d1c8a 100644 --- a/bin/MElt_lemmatiser-train.pl.in +++ b/bin/MElt_lemmatiser-train.pl.in @@ -31,7 +31,7 @@ while (1) { elsif (/^-ml$/) {$multiple_lemmas = 1;} elsif (/^-h$/ || /^--?help^/) { print STDERR <<END; -Usage: MElt_lemmatizer-train.pl -m model +Usage: MElt_lemmatiser-train.pl -m model END exit(0); } @@ -60,12 +60,12 @@ if ($lexfile eq "") { if ($dbfile eq "") { $lexfile =~ /^(.*\/)[^\/]+$/; - $dbfile = $1."lemmatization_data.db"; + $dbfile = $1."lemmatisation_data.db"; } if ($verbose) { print STDERR "Lexicon used: $lexfile\n"; - print STDERR "Output lemmatization DB: $dbfile\n"; + print STDERR "Output lemmatisation DB: $dbfile\n"; if ($flag_unknowns eq "") { print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ; } else { diff --git a/bin/MElt_lemmatiser.pl.in b/bin/MElt_lemmatiser.pl.in index dcf3667fe954fc8c897d11e851bff8120987a97a..d248cf1a32928ea1f958ad28b1e94310795aff6b 100644 --- a/bin/MElt_lemmatiser.pl.in +++ b/bin/MElt_lemmatiser.pl.in @@ -33,8 +33,8 @@ while (1) { elsif (/^-ml$/) {$multiple_lemmas = 1;} elsif (/^-h$/ || /^--?help^/) { print STDERR <<END; -Usage: MElt_lemmatizer.pl [ -l language | -m model ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output -Input: POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatizer is based +Usage: MElt_lemmatiser.pl [ -l language | -m model ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output +Input: POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatiser is based on the (external) lexicon used by a particular MElt model and on the tags assigned by MElt using this model Brown format: word1/pos1 word2/pos2 ... wordn/posn (newline = new sentence) Output: word1/pos1/lemma1 word2/pos2/lemma2 ... wordn/posn/lemman (newline = new sentence; lemmas for words @@ -43,7 +43,7 @@ Options: -l language Use the lexicon of the default MElt model for language 'language' -m model Use the lexicon of the MElt model to be found in the directory 'model' -M mapping Use the tagset mapping file provided - -v Verbose (outputs information about the options used on STDERR before lemmatizing) + -v Verbose (outputs information about the options used on STDERR before lemmatising) -nv Silent (outputs nothing on STDERR) -nfu Do not prefix lemmas for forms unknown to the lexicon with the character '*' -lcl Output all lemmas in lowercase @@ -70,7 +70,7 @@ if ($mapping_file ne "") { } } -print STDERR " LEMMATIZER: Using tagset mapping table from model $model\n"; +print STDERR " LEMMATISER: Using tagset mapping table from model $model\n"; if ($lang eq "it") {$itmapping = 1} @@ -85,7 +85,11 @@ if ($dbfile eq "") { } $model = $datadir."/".$language; } - $dbfile = $model."/lemmatization_data.db"; + if (-r $model."/lemmatisation_data.db") { + $dbfile = $model."/lemmatisation_data.db"; + } else { + $dbfile = $model."/lemmatization_data.db"; + } } else { if ($language ne "" || $model ne "") { die "Error: option -lex can not be used with options -l or -m"; @@ -93,7 +97,7 @@ if ($dbfile eq "") { } if ($verbose) { - print STDERR "Lemmatization database used: $dbfile\n"; + print STDERR "Lemmatisation database used: $dbfile\n"; if ($flag_unknowns eq "") { print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ; } else { @@ -117,7 +121,7 @@ my $sth_cfslsc3=$dbh->prepare('select lemmasuff from cat_formsuff_lemmasuff2coun "--LRB--" => "(", ); -print STDERR " LEMMATIZER: Lemmatizing...\n" unless $silent; +print STDERR " LEMMATISER: Lemmatising...\n" unless $silent; my %get_cat_form2lemma_cache; my %includes_data_for_cat_formsuff_cache; @@ -156,19 +160,19 @@ while (<>) { push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,$equiv{$token}); } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1)) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") { if ($cat ne "PRON") { - push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1)); + push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1)); } elsif ($cat eq "PRON") { push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($2)); } } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1."e")) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") { if ($cat ne "PRON") { - push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1."e")); + push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1."e")); } elsif ($cat eq "PRON") { push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($2)); } } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(.)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1.$2.$2."e")) ne "" && get_cat_form2lemma(PRON,lc($3)) ne "") { if ($cat ne "PRON") { - push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1.$2.$2."e")); + push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1.$2.$2."e")); } elsif ($cat eq "PRON") { push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($3)); } @@ -178,6 +182,8 @@ while (<>) { } else { push @result, "$comment$token/$cat$postcat$proba/$1o"; } + } elsif ($token =~ /^\d+(?:[,\/\.':°\-]\d+)*$/) { + push @result, "$comment$token/$cat$postcat$proba/$token"; } else { if ($token !~ /^[A-ZÉ]/) { $token_suff = $token; @@ -211,11 +217,11 @@ while (<>) { $_ .= $1; } $what_remains =~ s/^\s*//; - die "Format error in lemmatizer input near to: $what_remains" if ($what_remains ne ""); + die "Format error in lemmatiser input near to: $what_remains" if ($what_remains ne ""); print $_.$post."\n"; } -print STDERR " LEMMATIZER: Lemmatizing: done\n" unless $silent; +print STDERR " LEMMATISER: Lemmatising: done\n" unless $silent; sub get_cat_form2lemma { my $cat = shift; diff --git a/models/Makefile.am b/models/Makefile.am index 5469156c17fbae96ddaec4d891176a8e13c47f9e..eaf0c555982b009c4276e5635eb35a656d0e6c44 100644 --- a/models/Makefile.am +++ b/models/Makefile.am @@ -21,7 +21,7 @@ nobase_pkgdata_DATA = \ $(srcdir)/fr-ftbuc/feature_map.json \ $(srcdir)/fr-ftbuc/weights.npy \ $(srcdir)/fr-ftbuc/bias_weights.npy \ - $(srcdir)/fr-ftbuc/lemmatization_data.db + $(srcdir)/fr-ftbuc/lemmatization_data.db \ \ $(srcdir)/fr-perceo/lexicon.json \ $(srcdir)/fr-perceo/tag_dict.json \ @@ -29,7 +29,7 @@ nobase_pkgdata_DATA = \ $(srcdir)/fr-perceo/feature_map.json \ $(srcdir)/fr-perceo/weights.npy \ $(srcdir)/fr-perceo/bias_weights.npy \ - $(srcdir)/fr-perceo/lemmatization_data.db + $(srcdir)/fr-perceo/lemmatization_data.db \ \ $(srcdir)/en/lexicon.json \ $(srcdir)/en/tag_dict.json \