git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/trunk@5718 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4

a6ce8129 · Benoît Sagot · 4a3afcc2 · a6ce8129 · a6ce8129 · a6ce8129
Commit a6ce8129 authored 8 years ago by Benoît Sagot
--- a/bin/MElt-train.in
+++ b/bin/MElt-train.in
@@ -69,7 +69,7 @@ if [ $TRAIN_LEMMATISER -eq 0 ]
 then
    if [ $LEXMAPPING_FOR_LEMMATISER -eq 1 ]
    then
-	echo "Error: option -M (compute lexical mapping for the lemmatiser) requires option -s (do build lemmatization data)" >&2
+	echo "Error: option -M (compute lexical mapping for the lemmatiser) requires option -s (do build lemmatisation data)" >&2
 	exit 4
    fi
 fi
@@ -113,7 +113,7 @@ then
    echo "-S ${FEATURE_SELECTION_PARAMETERS}" > "$1/options"
    if [ $TRAIN_LEMMATISER -eq 1 ]
    then
-	echo Building the lemmatization database && \
+	echo Building the lemmatisation database && \
 	PYTHONPATH=$PYTHONPATH perl @bindir@/MElt_lemmatiser-train.pl -m "$1"
 	if [ $LEXMAPPING_FOR_LEMMATISER -eq 1 ]
 	then

--- a/bin/MElt_lemmatiser-train.pl.in
+++ b/bin/MElt_lemmatiser-train.pl.in
@@ -31,7 +31,7 @@ while (1) {
  elsif (/^-ml$/) {$multiple_lemmas = 1;}
  elsif (/^-h$/ || /^--?help^/) {
    print STDERR <<END;
-Usage: MElt_lemmatizer-train.pl -m model
+Usage: MElt_lemmatiser-train.pl -m model
 END
    exit(0);
  }
@@ -60,12 +60,12 @@ if ($lexfile eq "") {

 if ($dbfile eq "") {
  $lexfile =~ /^(.*\/)[^\/]+$/;
-  $dbfile = $1."lemmatization_data.db";
+  $dbfile = $1."lemmatisation_data.db";
 }

 if ($verbose) {
  print STDERR "Lexicon used:	$lexfile\n";
-  print STDERR "Output lemmatization DB:	$dbfile\n";
+  print STDERR "Output lemmatisation DB:	$dbfile\n";
  if ($flag_unknowns eq "") {
    print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ;
  } else {

--- a/bin/MElt_lemmatiser.pl.in
+++ b/bin/MElt_lemmatiser.pl.in
@@ -33,8 +33,8 @@ while (1) {
  elsif (/^-ml$/) {$multiple_lemmas = 1;}
  elsif (/^-h$/ || /^--?help^/) {
    print STDERR <<END;
-Usage: MElt_lemmatizer.pl [ -l language | -m model ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output
-Input:	POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatizer is based
+Usage: MElt_lemmatiser.pl [ -l language | -m model ] [ -nfu ] [ -itmapping ] [ -lcl ] < input > output
+Input:	POS-tagged text in Brown format. The text *must* have been tagged using MElt, as this lemmatiser is based
 	on the (external) lexicon used by a particular MElt model and on the tags assigned by MElt using this model
 	Brown format: word1/pos1 word2/pos2 ... wordn/posn		(newline = new sentence)
 Output:	word1/pos1/lemma1 word2/pos2/lemma2 ... wordn/posn/lemman	(newline = new sentence; lemmas for words
@@ -43,7 +43,7 @@ Options:
 	-l language	Use the lexicon of the default MElt model for language 'language'
 	-m model	Use the lexicon of the MElt model to be found in the directory 'model'
 	-M mapping	Use the tagset mapping file provided
-	-v		Verbose (outputs information about the options used on STDERR before lemmatizing)
+	-v		Verbose (outputs information about the options used on STDERR before lemmatising)
 	-nv		Silent (outputs nothing on STDERR)
 	-nfu		Do not prefix lemmas for forms unknown to the lexicon with the character '*'
 	-lcl		Output all lemmas in lowercase
@@ -70,7 +70,7 @@ if ($mapping_file ne "") {
  }
 }

-print STDERR "  LEMMATIZER: Using tagset mapping table from model $model\n";
+print STDERR "  LEMMATISER: Using tagset mapping table from model $model\n";

 if ($lang eq "it") {$itmapping = 1}

@@ -85,7 +85,11 @@ if ($dbfile eq "") {
    }
    $model = $datadir."/".$language;
  }
-  $dbfile = $model."/lemmatization_data.db";
+  if (-r $model."/lemmatisation_data.db") {
+    $dbfile = $model."/lemmatisation_data.db";
+  } else {
+    $dbfile = $model."/lemmatization_data.db";
+  }
 } else {
  if ($language ne "" || $model ne "") {
    die "Error: option -lex can not be used with options -l or -m";
@@ -93,7 +97,7 @@ if ($dbfile eq "") {
 }

 if ($verbose) {
-  print STDERR "Lemmatization database used:	$dbfile\n";
+  print STDERR "Lemmatisation database used:	$dbfile\n";
  if ($flag_unknowns eq "") {
    print STDERR "Lemmas for forms unknown to the lexicon are not prefixed by any special character\n" ;
  } else {
@@ -117,7 +121,7 @@ my $sth_cfslsc3=$dbh->prepare('select lemmasuff from cat_formsuff_lemmasuff2coun
 	  "--LRB--" => "(",
 );

-print STDERR "  LEMMATIZER: Lemmatizing...\n" unless $silent;
+print STDERR "  LEMMATISER: Lemmatising...\n" unless $silent;

 my %get_cat_form2lemma_cache;
 my %includes_data_for_cat_formsuff_cache;
@@ -156,19 +160,19 @@ while (<>) {
      push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,$equiv{$token});
    } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1)) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") {
      if ($cat ne "PRON") {
-	push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1));
+	push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1));
      } elsif ($cat eq "PRON") {
 	push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($2));
      }
    } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1."e")) ne "" && get_cat_form2lemma(PRON,lc($2)) ne "") {
      if ($cat ne "PRON") {
-	push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1."e"));
+	push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1."e"));
      } elsif ($cat eq "PRON") {
 	push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($2));
      }
    } elsif ($it_mapping && $token !~ /^[A-ZÉ]/ && $token =~ /^(.*?)(.)(lo|la|mi|ne|gli|si|li|le)$/ && get_cat_form2lemma(VERB,lc($1.$2.$2."e")) ne "" && get_cat_form2lemma(PRON,lc($3)) ne "") {
      if ($cat ne "PRON") {
-	push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma(VERB,lc($1.$2.$2."e"));
+	push @result, "$comment$token/VERB$postcat/".get_cat_form2lemma("VERB",lc($1.$2.$2."e"));
      } elsif ($cat eq "PRON") {
 	push @result, "$comment$token/$cat$postcat$proba/".get_cat_form2lemma($cat,lc($3));
      }
@@ -178,6 +182,8 @@ while (<>) {
      } else {
 	push @result, "$comment$token/$cat$postcat$proba/$1o";
      }
+    } elsif ($token =~ /^\d+(?:[,\/\.':°\-]\d+)*$/) {
+      push @result, "$comment$token/$cat$postcat$proba/$token";
    } else {
      if ($token !~ /^[A-ZÉ]/) {
 	$token_suff = $token;
@@ -211,11 +217,11 @@ while (<>) {
    $_ .= $1;
  }
  $what_remains =~ s/^\s*//;
-  die "Format error in lemmatizer input near to: $what_remains" if ($what_remains ne "");
+  die "Format error in lemmatiser input near to: $what_remains" if ($what_remains ne "");
  print $_.$post."\n";
 }

-print STDERR "  LEMMATIZER: Lemmatizing: done\n" unless $silent;
+print STDERR "  LEMMATISER: Lemmatising: done\n" unless $silent;

 sub get_cat_form2lemma {
  my $cat = shift;

--- a/models/Makefile.am
+++ b/models/Makefile.am
@@ -21,7 +21,7 @@ nobase_pkgdata_DATA = \
 	$(srcdir)/fr-ftbuc/feature_map.json	\
 	$(srcdir)/fr-ftbuc/weights.npy	\
 	$(srcdir)/fr-ftbuc/bias_weights.npy	\
-	$(srcdir)/fr-ftbuc/lemmatization_data.db	
+	$(srcdir)/fr-ftbuc/lemmatization_data.db	\
 	\
 	$(srcdir)/fr-perceo/lexicon.json	\
 	$(srcdir)/fr-perceo/tag_dict.json	\
@@ -29,7 +29,7 @@ nobase_pkgdata_DATA = \
 	$(srcdir)/fr-perceo/feature_map.json	\
 	$(srcdir)/fr-perceo/weights.npy	\
 	$(srcdir)/fr-perceo/bias_weights.npy	\
-	$(srcdir)/fr-perceo/lemmatization_data.db
+	$(srcdir)/fr-perceo/lemmatization_data.db	\
 	\
 	$(srcdir)/en/lexicon.json	\
 	$(srcdir)/en/tag_dict.json	\