From 9d9daaf7c7da2c0ff3e48c8934b5522f1628afc3 Mon Sep 17 00:00:00 2001
From: Pascal Denis <pascal.denis@alpage>
Date: Thu, 23 Jul 2009 09:12:23 +0000
Subject: [PATCH] extra option for eval in pos_tag.py

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2727 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
---
 bin/eval.py.in                      |  7 ++++---
 bin/pos_tag.py.in                   | 12 ++++++++----
 src/metagger/megam_classifier.py.in |  6 ++++--
 src/metagger/pos_tagger.py.in       | 12 ++++++++----
 src/metagger/result_sink.py.in      |  8 ++++++--
 5 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/bin/eval.py.in b/bin/eval.py.in
index ce1a966..77df955 100644
--- a/bin/eval.py.in
+++ b/bin/eval.py.in
@@ -13,6 +13,7 @@ parser = optparse.OptionParser(usage=usage)
 parser.add_option("-g", "--gold", action="store", help="gold file")
 parser.add_option("-p", "--predictions", action="store", help="predictions file")
 parser.add_option("-d", "--dictionary", action="store", help="dictionary")
+parser.add_option("-e", "--errors", action="store_true", help="show errors",default=False)
 (options, args) = parser.parse_args()
 
 if not (options.gold and options.predictions):
@@ -33,7 +34,7 @@ unk_sink = AccuracySink() # unkown words
 
 ############ compare gold/pred files ########################################
 
-compare_files( options.gold, options.predictions, sink, unk_sink, known_words)
+compare_files( options.gold, options.predictions, sink, unk_sink, known_words, quiet=not options.errors)
 
 ############ print out scores ########################################
 
@@ -41,8 +42,8 @@ sink.rpf()
 print 
 sink.confusion()
 print             
-print "\nOverall Acc:", sink.score()
-print "Unk wds Acc:", unk_sink.score()
+print "\nOverall Acc: %s (%s/%s)" %(sink.score(),sink.correct,sink.total)
+print "Unk wds Acc: %s (%s/%s)" %(unk_sink.score(),unk_sink.correct,unk_sink.total)
 
     
 
diff --git a/bin/pos_tag.py.in b/bin/pos_tag.py.in
index 4a4e9e9..4786b1e 100644
--- a/bin/pos_tag.py.in
+++ b/bin/pos_tag.py.in
@@ -5,7 +5,7 @@ import sys
 import codecs
 import optparse 
 from metagger.pos_tagger import POSTagger
-
+from metagger.result_sink import AccuracySink, compare_files
 
 # Import Psyco if available
 try:
@@ -25,7 +25,8 @@ parser.add_option("-p", "--prior_prec", action="store", help="set precision of g
 parser.add_option("-w", "--word_list", action="store", help="read in word_list", default='')
 parser.add_option("-d", "--tag_dict", action="store", help="read in tag dictionary", default='')
 parser.add_option("-l", "--lefff", action="store", help="read in Lefff DB", default='')
-parser.add_option("-o", "--output_file", action="store", help="output file", default='')
+parser.add_option("-o", "--output_file", action="store", help="output file", default='pos_tagger.out')
+parser.add_option("-g", "--gold_file", action="store", help="reference file")
 (options, args) = parser.parse_args()
 
 infile = args[0]
@@ -67,5 +68,8 @@ else:
 pos_tagger.apply( infile, beam_size=options.beam_size, outfile=options.output_file )
 
 
-    
-    
+############## eval ##################################
+if options.gold_file:
+    sink = AccuracySink()
+    compare_files( options.gold_file, options.output_file, sink )
+    print "Acc: %s (%s/%s)" %(sink.score(),sink.correct,sink.total)
diff --git a/src/metagger/megam_classifier.py.in b/src/metagger/megam_classifier.py.in
index dfcea61..a2d3f76 100755
--- a/src/metagger/megam_classifier.py.in
+++ b/src/metagger/megam_classifier.py.in
@@ -51,7 +51,7 @@ class MegamClassifier:
 
 
     def train( self, datafile, paramfile=tempfile.mktemp(), \
-               prior_prec=1, repeat=4, maxit=100, bias=True, quiet=True ):
+               prior_prec=1, repeat=2, maxit=100, bias=True, quiet=True ):
         """ simple call to megam executable for multiclass
         classification with some relevant options:
         
@@ -79,7 +79,9 @@ class MegamClassifier:
         #rc = os.spawnv(os.P_WAIT, megam_exec_path, proc)
         #if rc == 127:
         #    raise Exception("Error while trying to execute "+" ".join(proc))
-        os.system( " ".join(proc) )
+        proc_str = " ".join(proc)
+        print >> sys.stderr, proc 
+        os.system( proc_str )
         print >> sys.stderr, "Megam parameters dumped into file %s" %self.paramfile
         # load model from output param file
         self.load_model()
diff --git a/src/metagger/pos_tagger.py.in b/src/metagger/pos_tagger.py.in
index 52b8143..28dc285 100755
--- a/src/metagger/pos_tagger.py.in
+++ b/src/metagger/pos_tagger.py.in
@@ -121,13 +121,15 @@ class POSTagger:
         return best_sequence
 
 
-    def apply(self, infile, beam_size=3, outfile='', encoding='latin-1'):
+    def apply(self, infile, beam_size=3, outfile='pos_tagger.out', encoding='latin-1'):
+        print >> sys.stderr, "Applying tagger on %s" %infile
         # open output file
-        out = sys.stdout
-        if outfile:
-            out = codecs.open( outfile, 'w', encoding )
+        out = codecs.open( outfile, 'w', encoding )
         # process sentences
+        s_ct = 0
         for line in codecs.open( infile, 'r', encoding ):
+            s_ct += 1
+            os.write(1, "%s" %"\b"*len(str(s_ct))+str(s_ct))
             wds = line.strip().split()
             tokens = []
             for wd in wds:
@@ -137,6 +139,8 @@ class POSTagger:
             # print tagged sentence to output file
             tagged_sent = " ".join( [tok.__str__() for tok in tagged_tokens] )
             print >> out, tagged_sent
+        # close file
+        out.close()
         return
 
 
diff --git a/src/metagger/result_sink.py.in b/src/metagger/result_sink.py.in
index 8b2afc7..8b70203 100644
--- a/src/metagger/result_sink.py.in
+++ b/src/metagger/result_sink.py.in
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import sys
 from metagger.corpus_reader import BrownReader
 
 class ResultSink:
@@ -85,7 +86,7 @@ class AccuracySink(ResultSink):
     
 
 
-def compare_files( gold_file, pred_file, sink, unk_sink, known_words={}):
+def compare_files( gold_file, pred_file, sink, unk_sink=None, known_words={}, quiet=True):
     gold = BrownReader( gold_file )
     pred = BrownReader( pred_file )
     s_ct = 0
@@ -102,7 +103,10 @@ def compare_files( gold_file, pred_file, sink, unk_sink, known_words={}):
                 print >> sys.stderr, "Warning: Missing prediction for Sentence #%s" %s_ct
             # update sinks
             sink.update(gtag,ptag)
-            if not gwd in known_words:
+            if not quiet:
+                if ptag<>gtag:
+                    print >> sys.stderr, "%s: %s <==> *%s" %(gwd,gtag,ptag) 
+            if unk_sink and not gwd in known_words:
                 unk_sink.update(gtag,ptag)
 
     return
-- 
GitLab