Commit b606bdb0 authored by Pascal Denis

refactoring; raw text tagging

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2687 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent 00713268
@@ -20,9 +20,9 @@ BUILT_SOURCES = build_word_list.py \
         pos_tag.py \
         eval.py
-EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g;\
+EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g; \
         s|\@datadir\@|\$(pkgdatadir)|g; \
-        s|\@libdir\@|\$(pkglibdir)|g; \
+        s|\@libdir\@|\$(pkglibdir)|g; \
         s|\@bindir\@|\$(bindir)|g; "
 %.py: %.py.in
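
For context, the EDIT rule above rewrites each *.py.in template into the corresponding script by substituting the configure-time paths for the @alexinadir@, @datadir@, @libdir@ and @bindir@ placeholders; the visible change here just adds a space before the backslash continuations. A rough Python sketch of the same substitution, purely for illustration (the edit() function and the example path are hypothetical; the real work is done by perl in the Makefile):

def edit(template_text, paths):
    # paths is a placeholder dict such as {'@alexinadir@': '/usr/share/alexina', ...};
    # in the Makefile the values come from $(alexinadir), $(pkgdatadir), $(pkglibdir), $(bindir)
    for placeholder, value in paths.items():
        template_text = template_text.replace(placeholder, value)
    return template_text
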
@@ -11,21 +11,15 @@ from metagger.pos_tagger import POSTagger
 usage = "usage: %prog [options] <input_file>"
 parser = optparse.OptionParser(usage=usage)
 parser.add_option("-m", "--model", action="store", help="model file")
-parser.add_option("-t", "--training", action="store", help="training data file")
-parser.add_option("-b", "--beam_size", action="store", help="set beam size", type=int, default=5)
+parser.add_option("-t", "--train", action="store", help="training data file")
+parser.add_option("-b", "--beam_size", action="store", help="set beam size", type=int, default=3)
+parser.add_option("-p", "--prior_prec", action="store", help="set precision of gaussian prior", type=int, default=1)
 parser.add_option("-w", "--word_list", action="store", help="read in word_list", default='')
 parser.add_option("-d", "--tag_dict", action="store", help="read in tag dictionary", default='')
 parser.add_option("-l", "--lefff", action="store", help="read in Lefff DB", default='')
 parser.add_option("-o", "--output_file", action="store", help="output file", default='')
 (options, args) = parser.parse_args()
-model_file = options.model
-train_file = options.training
-beam_size = options.beam_size
-tag_dict_file = options.tag_dict
-word_list_file = options.word_list
-lefff_file = options.lefff
-out_file = options.output_file
 infile = args[0]
 print >> sys.stderr, options
@@ -34,34 +28,35 @@ print >> sys.stderr, options
 pos_tagger = POSTagger()
 # read-in tag dictionary from file
-if tag_dict_file:
-    pos_tagger.load_tag_dictionary( tag_dict_file )
+if options.tag_dict:
+    pos_tagger.load_tag_dictionary( options.tag_dict )
 else:
     print >> sys.stderr, "Warning: No tag dictionary provided"
 # read in known words list from file
-if word_list_file:
-    pos_tagger.load_word_list( word_list_file )
+if options.word_list:
+    pos_tagger.load_word_list( options.word_list )
 else:
     print >> sys.stderr, "Warning: No word list provided"
 # read-in Lefff
-if lefff_file:
-    pos_tagger.load_lefff( lefff_file )
+if options.lefff:
+    pos_tagger.load_lefff( options.lefff )
 else:
     print >> sys.stderr, "Warning: Lefff not provided"
 # load model if provided
-if train_file:
-    pos_tagger.train_model( train_file, model_path=model_file )
-elif model_file:
-    pos_tagger.load_model( model_file )
+if options.train:
+    pos_tagger.train_model( options.train, prior_prec=options.prior_prec, \
+                            model_path=options.model )
+elif options.model:
+    pos_tagger.load_model( options.model )
 else:
     sys.exit("Error: please provide model file (-m) or training file (-t)")
 ############## apply tagger ##################################
-pos_tagger.apply( infile, outfile=out_file )
+pos_tagger.apply( infile, beam_size=options.beam_size, outfile=options.output_file )
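
With this refactoring the driver script reads everything directly from options instead of copying values into local variables, passes the new prior precision through to training, and hands the beam size down to apply(). Two hedged example invocations, matching the usage string "%prog [options] <input_file>" (all file names below are made up):

    pos_tag.py -t train.pos -m model.megam -d tags.dict -w words.lst raw.txt     # train a model, then tag raw.txt
    pos_tag.py -m model.megam -b 3 -o tagged.txt raw.txt                         # tag raw.txt with an existing model
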
Source diff could not be displayed: it is too large.
@@ -23,9 +23,9 @@ BUILT_SOURCES = __init__.py \
         megam_classifier.py \
         result_sink.py
-EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g;\
+EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g; \
         s|\@datadir\@|\$(pkgdatadir)|g; \
-        s|\@libdir\@|\$(pkglibdir)|g; \
+        s|\@libdir\@|\$(pkglibdir)|g; \
         s|\@bindir\@|\$(bindir)|g; "
 %.py: %.py.in
@@ -50,11 +50,12 @@ class MegamClassifier:
     def train( self, datafile, paramfile=tempfile.mktemp(), \
-               prior=1, repeat=4, maxit=100, quiet=True ):
+               prior_prec=1, repeat=4, maxit=100, quiet=True ):
         """ simple call to megam executable for multiclass
         classification with some relevant options:
-        -prior: precision of Gaussian prior (megam default:1)
+        -prior_prec: precision of Gaussian prior (megam default:1). It's
+         the inverse variance. See http://www.cs.utah.edu/~hal/docs/daume04cg-bfgs.pdf.
         -repeat: repeat optimization <int> times (megam default:1)
         -maxit: max # of iterations (megam default:100)
@@ -64,7 +65,7 @@ class MegamClassifier:
         self.paramfile = paramfile
         # build process command
         cmd = 'megam.opt -nc -repeat %s -lambda %s -maxi %s multiclass %s 1> %s' \
-              %(repeat,prior,maxit,datafile,self.paramfile)
+              %(repeat,prior_prec,maxit,datafile,self.paramfile)
         if quiet:
             cmd += " 2> /tmp/null"
         # run command
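
The renamed prior_prec argument is forwarded to megam's -lambda flag, i.e. the precision (inverse variance) of the Gaussian prior: the larger the value, the tighter the prior around zero and the stronger the regularization of the feature weights. A minimal sketch of the command string that train() builds, assuming a call like classifier.train('train.feats', paramfile='model.megam') with the defaults prior_prec=1, repeat=4, maxit=100 (file names are hypothetical):

cmd = 'megam.opt -nc -repeat %s -lambda %s -maxi %s multiclass %s 1> %s' \
      % (4, 1, 100, 'train.feats', 'model.megam')
# cmd == "megam.opt -nc -repeat 4 -lambda 1 -maxi 100 multiclass train.feats 1> model.megam"
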
@@ -19,8 +19,7 @@ from metagger.instance import Instance
 class POSTagger:
-    def __init__(self, beam_size=5):
-        self.beam_size = beam_size
+    def __init__(self):
         self.known_words = []
         self.tag_dict = {}
         self.lefff_dict = defaultdict(list)
@@ -44,7 +43,8 @@ class POSTagger:
         train_inst_file = self.generate_training_data(_file)
         print >> sys.stderr, "Data file: %s" %train_inst_file
         print >> sys.stderr, "Training POS model..."
-        self.classifier.train( train_inst_file, quiet=False )
+        self.classifier.train( train_inst_file, paramfile=model_path, \
+                               prior_prec=1, quiet=False )
         return
@@ -78,7 +78,7 @@ class POSTagger:
-    def tag_token_sequence(self, tokens):
+    def tag_token_sequence(self, tokens, beam_size):
         ''' N-best breadth search for the best tag sequence for each sentence'''
         # maintain N-best sequences of tagged tokens
         sequences = [([],0.0)] # log prob.
@@ -111,14 +111,14 @@
             # sort sequences
             n_best_sequences.sort(lambda x,y:cmp(x[1],y[1]))
             # keep N best
-            sequences = n_best_sequences[-self.beam_size:]
+            sequences = n_best_sequences[-beam_size:]
         # return sequence with highest prob.
         best_sequence = sequences[-1][0]
         # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
         return best_sequence
-    def apply(self, _file, outfile=''):
+    def apply(self, _file, beam_size=3, outfile=''):
         # open output file
         out = sys.stdout
         if outfile:
@@ -130,7 +130,7 @@
             for item in line.split():
                 token = Token( string=item )
                 tokens.append( token )
-            tagged_tokens = self.tag_token_sequence( tokens )
+            tagged_tokens = self.tag_token_sequence( tokens, beam_size )
             # print tagged sentence to output file
             tagged_sent = " ".join( [tok.__str__() for tok in tagged_tokens] )
             print >> out, tagged_sent
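
The net effect of the POSTagger changes is that beam_size is no longer fixed at construction time: it travels from apply() down to tag_token_sequence(), where only the beam_size highest-scoring partial tag sequences are kept after each pruning step. A hedged sketch of the new calling convention, mirroring what pos_tag.py now does (resource and file paths are placeholders):

from metagger.pos_tagger import POSTagger

tagger = POSTagger()                       # beam size is no longer a constructor argument
tagger.load_tag_dictionary('tags.dict')    # optional resources, loaded as in pos_tag.py
tagger.load_word_list('words.lst')
tagger.load_model('model.megam')
tagger.apply('raw.txt', beam_size=3, outfile='tagged.txt')
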
@@ -17,10 +17,9 @@ class AccuracySink(ResultSink):
         self.matrix = {}
         self.tags = {}
         self.model_predictions = []
-        self.label_distributions = []
         return
-    def update(self,gold_cl,pred_cl,label_distribution):
+    def update(self,gold_cl,pred_cl):
         self.total += 1
         self.total_by_tag[gold_cl] = self.total_by_tag.get(gold_cl,0) + 1
         self.pred_by_tag[pred_cl] = self.pred_by_tag.get(pred_cl,0) + 1
@@ -30,7 +29,6 @@ class AccuracySink(ResultSink):
             self.tags[gold_cl] = 1
         self.matrix[(gold_cl,pred_cl)] = self.matrix.get((gold_cl,pred_cl),0) + 1
         self.model_predictions.append((gold_cl,pred_cl))
-        self.label_distributions.append(label_distribution)
         return
     def score(self, rounding=3 ):
@@ -62,12 +60,6 @@ class AccuracySink(ResultSink):
         print "\n".join(truth_and_predicted)
         print "============================================"
-    def distributions(self):
-        print "====== Model distributions on test data ======"
-        print "\n".join([repr(item) for item in self.label_distributions])
-        print "=============================================="
-        return
     def rpf(self):
         """Print R-P-F scores by class labels"""
         print "Recall/Precision/F1 by class labels"
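
After this simplification AccuracySink only accumulates gold/predicted label pairs; per-instance label distributions are no longer stored, and the distributions() report goes away with them. A minimal usage sketch under two assumptions not shown in this diff: that the class is importable as metagger.result_sink.AccuracySink with a no-argument constructor, and that score() returns the overall accuracy (the label pairs below are made up):

from metagger.result_sink import AccuracySink   # assumed module path

sink = AccuracySink()
for gold_cl, pred_cl in [('DET', 'DET'), ('NC', 'ADJ'), ('V', 'V')]:
    sink.update(gold_cl, pred_cl)                # new two-argument signature
print sink.score()                               # assumed to return accuracy, rounded to rounding=3 decimals
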