#!/usr/bin/perl
#
# bin/baseline_tagger.pl - most-frequent-category baseline tagger.
#
# Usage: baseline_tagger.pl TRAINING_CORPUS [LEXICON] [INPUT_FILE ...]
#
# TRAINING_CORPUS holds one sentence per line, as space-separated
# "form/category" tokens (the category is whatever follows the last "/").
# LEXICON is optional; pass an empty string to skip it while still naming
# input files.  Each lexicon line must start with "form<TAB>category<TAB>".
# Input is read from the remaining arguments, or from STDIN when there are
# none.  Input tokens may carry a "/category" suffix, which is discarded.
#
# Every input token is tagged with, in order of preference:
#   1. the category it was most often seen with in the training corpus;
#   2. the lexicon category for the exact form;
#   3. the lexicon category for the lower-cased form;
#   4. the most frequent category of the whole training corpus.
# When the lexicon lists several categories for a form, the one that is most
# frequent in the training corpus wins.  Ties between equally frequent
# categories are broken by category name so that output is reproducible.
#
# NOTE(review): files are read as raw bytes, so lc() presumably only folds
# ASCII letters here -- confirm the expected encoding of corpus and lexicon.

use strict;
use warnings;

# Read "form/category" tokens from $fh.
# Returns two hash references: {form}{category} => occurrences, and
# category => occurrences.  Dies on a token that contains no "/".
sub count_training_tags {
    my ($fh) = @_;
    my (%form_cat_count, %cat_count);
    while (my $line = <$fh>) {
        chomp $line;
        $line =~ s/^ +//;
        $line =~ s/ +$//;
        for my $token (split / +/, $line) {
            $token =~ m{^(.*)/(.*)$} or die "Format error: $token";
            $form_cat_count{$1}{$2}++;
            $cat_count{$2}++;
        }
    }
    return (\%form_cat_count, \%cat_count);
}

# Read lexicon entries from $fh.
# Returns a hash reference {form}{category} => 1.
sub load_lexicon {
    my ($fh) = @_;
    my %lexicon;
    while (my $entry = <$fh>) {
        # Rewrite spaces as "_" and a hyphen between two non-space characters
        # as "_-_" (presumably to mirror the corpus tokenisation -- confirm).
        $entry =~ s/ /_/g;
        $entry =~ s/(\S)-(\S)/${1}_-_${2}/g;
        # Skip lines without two tab-terminated fields; using $1/$2 after a
        # failed match would store captures left over from the line above.
        next unless $entry =~ /^(.*?)\t(.*?)\t/;
        $lexicon{$1}{$2} = 1;
    }
    return \%lexicon;
}

# Return the keys of the count hash $count_of, most frequent first; equal
# counts are ordered by name.  Call in list context.
sub rank_categories {
    my ($count_of) = @_;
    my @ranked = sort { $count_of->{$b} <=> $count_of->{$a} || $a cmp $b }
                 keys %$count_of;
    return @ranked;
}

# Map every training form to the category it occurred with most often.
sub most_frequent_category {
    my ($form_cat_count) = @_;
    my %baseline_cat;
    for my $form (keys %$form_cat_count) {
        ($baseline_cat{$form}) = rank_categories($form_cat_count->{$form});
    }
    return \%baseline_cat;
}

# Map every lexicon form to the first of its categories found in
# @$ordered_cats.  Forms whose categories are all absent from that list get
# no entry.
sub disambiguate_lexicon {
    my ($lexicon, $ordered_cats) = @_;
    my %lexicon_cat;
    for my $form (keys %$lexicon) {
        for my $cat (@$ordered_cats) {
            next unless exists $lexicon->{$form}{$cat};
            $lexicon_cat{$form} = $cat;
            last;
        }
    }
    return \%lexicon_cat;
}

# Tag one input line and return it as space-separated "form/category"
# tokens, without a trailing newline.
sub tag_line {
    my ($line, $baseline_cat, $lexicon_cat, $default_cat) = @_;
    chomp $line;
    $line =~ s/^ +//;
    $line =~ s/ +$//;
    my @tagged;
    for my $token (split / +/, $line) {
        # Drop an existing "/category" suffix; a token without one is the
        # form itself.
        my $form = $token =~ m{^(.*)/.*$} ? $1 : $token;
        my $cat;
        if (defined $baseline_cat->{$form}) {
            $cat = $baseline_cat->{$form};
        }
        elsif (defined $lexicon_cat->{$form}) {
            $cat = $lexicon_cat->{$form};
        }
        elsif (defined $lexicon_cat->{ lc $form }) {
            $cat = $lexicon_cat->{ lc $form };
        }
        else {
            $cat = $default_cat;
        }
        push @tagged, "$form/$cat";
    }
    return join ' ', @tagged;
}

# Command-line entry point: train, load the optional lexicon, tag the input.
sub main {
    my $training_corpus = shift(@ARGV) || die "Please provide a training corpus";
    my $lexicon_path    = shift(@ARGV) || "";

    open(my $train_fh, '<', $training_corpus)
        or die "Could not open training corpus $training_corpus: $!";
    my ($form_cat_count, $cat_count) = count_training_tags($train_fh);
    close($train_fh);

    my $lexicon = {};
    if ($lexicon_path ne "") {
        open(my $lexicon_fh, '<', $lexicon_path)
            or die "Could not open $lexicon_path: $!";
        $lexicon = load_lexicon($lexicon_fh);
        close($lexicon_fh);
    }

    my @ordered_cats = rank_categories($cat_count);
    my $baseline_cat = most_frequent_category($form_cat_count);
    my $lexicon_cat  = disambiguate_lexicon($lexicon, \@ordered_cats);
    # An empty training corpus leaves no category to fall back on; an empty
    # string keeps the "form/" output shape.
    my $best_cat = @ordered_cats ? $ordered_cats[0] : "";

    while (my $line = <>) {
        print tag_line($line, $baseline_cat, $lexicon_cat, $best_cat), "\n";
    }
    return;
}

# Run only when executed as a script, so the subs above can also be loaded
# with require without side effects.
main() unless caller;