Mentions légales du service

Skip to content
Snippets Groups Projects
Commit a6ce8129 authored by Benoît Sagot
Browse files

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/trunk@5718 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent 4a3afcc2
No related branches found
No related tags found
No related merge requests found
......@@ -69,7 +69,7 @@ if [ $TRAIN_LEMMATISER -eq 0 ]
then
if [ $LEXMAPPING_FOR_LEMMATISER -eq 1 ]
then
echo "Error: option -M (compute lexical mapping for the lemmatiser) requires option -s (do build lemmatization data)" >&2
echo "Error: option -M (compute lexical mapping for the lemmatiser) requires option -s (do build lemmatisation data)" >&2
exit 4
fi
fi
......@@ -113,7 +113,7 @@ then
echo "-S ${FEATURE_SELECTION_PARAMETERS}" > "$1/options"
if [ $TRAIN_LEMMATISER -eq 1 ]
then
echo Building the lemmatization database && \
echo Building the lemmatisation database && \
PYTHONPATH=$PYTHONPATH perl @bindir@/MElt_lemmatiser-train.pl -m "$1"
if [ $LEXMAPPING_FOR_LEMMATISER -eq 1 ]
then
......
......@@ -31,7 +31,7 @@ while (1) {
elsif (/^-ml$/) {$multiple_lemmas = 1;}
elsif (/^-h$/ || /^--?help^/) {
print STDERR <<END;
Usage: MElt_lemmatizer-train.pl -m model
Usage: MElt_lemmatiser-train.pl -m model
END
exit(0);
}
......@@ -60,12 +60,12 @@ if ($lexfile eq "") {
if ($dbfile eq "") {
$lexfile =~ /^(.*\/)[^\/]+$/;
$dbfile = $1."lemmatization_data.db";
$dbfile = $1."lemmatisation_data.db";
}
if ($verbose) {
print STDERR "Lexicon used: $lexfile\n";
print STDERR "Output lemmatization DB: $dbfile\n";
print STDERR "Output lemmatisation DB: $dbfile\n";
if ($flag_unknowns eq "") {
print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ;
} else {
......
......@@ -33,8 +33,8 @@ while (1) {
elsif (/^-ml$/) {$multiple_lemmas = 1;}
elsif (/^-h$/ || /^--?help^/) {
print STDERR <<END;
Usage: MElt_lemmatizer.pl [ -l language | -m model ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output
Input: POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatizer is based
Usage: MElt_lemmatiser.pl [ -l language | -m model ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output
Input: POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatiser is based
on the (external) lexicon used by a particular MElt model and on the tags assigned by MElt using this model
Brown format: word1/pos1 word2/pos2 ... wordn/posn (newline = new sentence)
Output: word1/pos1/lemma1 word2/pos2/lemma2 ... wordn/posn/lemman (newline = new sentence; lemmas for words
......@@ -43,7 +43,7 @@ Options:
-l language Use the lexicon of the default MElt model for language 'language'
-m model Use the lexicon of the MElt model to be found in the directory 'model'
-M mapping Use the tagset mapping file provided
-v Verbose (outputs information about the options used on STDERR before lemmatizing)
-v Verbose (outputs information about the options used on STDERR before lemmatising)
-nv Silent (outputs nothing on STDERR)
-nfu Do not prefix lemmas for forms unknown to the lexicon with the character '*'
-lcl Output all lemmas in lowercase
......@@ -70,7 +70,7 @@ if ($mapping_file ne "") {
}
}
print STDERR " LEMMATIZER: Using tagset mapping table from model $model\n";
print STDERR " LEMMATISER: Using tagset mapping table from model $model\n";
if ($lang eq "it") {$itmapping = 1}
......@@ -85,7 +85,11 @@ if ($dbfile eq "") {
}
$model = $datadir."/".$language;
}
$dbfile = $model."/lemmatization_data.db";
if (-r $model."/lemmatisation_data.db") {
$dbfile = $model."/lemmatisation_data.db";
} else {
$dbfile = $model."/lemmatization_data.db";
}
} else {
if ($language ne "" || $model ne "") {
die "Error: option -lex can not be used with options -l or -m";
......@@ -93,7 +97,7 @@ if ($dbfile eq "") {
}
if ($verbose) {
print STDERR "Lemmatization database used: $dbfile\n";
print STDERR "Lemmatisation database used: $dbfile\n";
if ($flag_unknowns eq "") {
print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ;
} else {
......@@ -117,7 +121,7 @@ my $sth_cfslsc3=$dbh->prepare('select lemmasuff from cat_formsuff_lemmasuff2coun
"--LRB--" => "(",
);
print STDERR " LEMMATIZER: Lemmatizing...\n" unless $silent;
print STDERR " LEMMATISER: Lemmatising...\n" unless $silent;
my %get_cat_form2lemma_cache;
my %includes_data_for_cat_formsuff_cache;
......@@ -156,19 +160,19 @@ while (<>) {
push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,$equiv{$token});
} elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1)) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") {
if ($cat ne "PRON") {
push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1));
push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1));
} elsif ($cat eq "PRON") {
push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($2));
}
} elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1."e")) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") {
if ($cat ne "PRON") {
push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1."e"));
push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1."e"));
} elsif ($cat eq "PRON") {
push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($2));
}
} elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(.)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1.$2.$2."e")) ne "" && get_cat_form2lemma(PRON,lc($3)) ne "") {
if ($cat ne "PRON") {
push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1.$2.$2."e"));
push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1.$2.$2."e"));
} elsif ($cat eq "PRON") {
push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($3));
}
......@@ -178,6 +182,8 @@ while (<>) {
} else {
push @result, "$comment$token/$cat$postcat$proba/$1o";
}
} elsif ($token =~ /^\d+(?:[,\/\.':°\-]\d+)*$/) {
push @result, "$comment$token/$cat$postcat$proba/$token";
} else {
if ($token !~ /^[A-ZÉ]/) {
$token_suff = $token;
......@@ -211,11 +217,11 @@ while (<>) {
$_ .= $1;
}
$what_remains =~ s/^\s*//;
die "Format error in lemmatizer input near to: $what_remains" if ($what_remains ne "");
die "Format error in lemmatiser input near to: $what_remains" if ($what_remains ne "");
print $_.$post."\n";
}
print STDERR " LEMMATIZER: Lemmatizing: done\n" unless $silent;
print STDERR " LEMMATISER: Lemmatising: done\n" unless $silent;
sub get_cat_form2lemma {
my $cat = shift;
......
......@@ -21,7 +21,7 @@ nobase_pkgdata_DATA = \
$(srcdir)/fr-ftbuc/feature_map.json \
$(srcdir)/fr-ftbuc/weights.npy \
$(srcdir)/fr-ftbuc/bias_weights.npy \
$(srcdir)/fr-ftbuc/lemmatization_data.db
$(srcdir)/fr-ftbuc/lemmatization_data.db \
\
$(srcdir)/fr-perceo/lexicon.json \
$(srcdir)/fr-perceo/tag_dict.json \
......@@ -29,7 +29,7 @@ nobase_pkgdata_DATA = \
$(srcdir)/fr-perceo/feature_map.json \
$(srcdir)/fr-perceo/weights.npy \
$(srcdir)/fr-perceo/bias_weights.npy \
$(srcdir)/fr-perceo/lemmatization_data.db
$(srcdir)/fr-perceo/lemmatization_data.db \
\
$(srcdir)/en/lexicon.json \
$(srcdir)/en/tag_dict.json \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment