From 17d2e225a390c08d244640d77c90c0ce82cfae6c Mon Sep 17 00:00:00 2001
From: Pascal Denis <pascal.denis@alpage>
Date: Tue, 6 Oct 2009 08:00:47 +0000
Subject: [PATCH] clean up; addition of shell script pos_tag.sh

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2793 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
---
 bin/pos_tag.sh                      |  8 ++++++++
 src/metagger/instance.py.in         | 14 +++++++-------
 src/metagger/megam_classifier.py.in | 15 +++++++--------
 {bin => src/metagger}/pos_tag.py.in |  0
 src/metagger/pos_tagger.py.in       | 10 ----------
 5 files changed, 22 insertions(+), 25 deletions(-)
 create mode 100644 bin/pos_tag.sh
 rename {bin => src/metagger}/pos_tag.py.in (100%)

diff --git a/bin/pos_tag.sh b/bin/pos_tag.sh
new file mode 100644
index 0000000..c82bb4e
--- /dev/null
+++ b/bin/pos_tag.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# Run pos_tag.py with the model, tag dictionary and Lefff lexicon found under $MELT_DIR/data; extra arguments are passed through.
+export PYTHONPATH="${MELT_DIR}/src/melt"
+MODEL="${MELT_DIR}/data/ftb1_model.megam"
+TAGDICO="${MELT_DIR}/data/ftb1_tag_dict"
+LEFFF="${MELT_DIR}/data/lefff.ftb4tags"
+python "${MELT_DIR}/src/pos_tag.py" -m "${MODEL}" -d "${TAGDICO}" -l "${LEFFF}" "$@"
+
diff --git a/src/metagger/instance.py.in b/src/metagger/instance.py.in
index 8e61202..7e8c4c5 100644
--- a/src/metagger/instance.py.in
+++ b/src/metagger/instance.py.in
@@ -33,8 +33,8 @@ class Instance:
         return u'%s\t%s' %(self.label," ".join(self.fv))
 
 
-    def get_features(self,index,tokens,known_words={},lefff={},cache={}):
-        self.get_static_features(index,tokens,known_words=known_words,lefff=lefff,cache=cache)
+    def get_features(self,index,tokens,lefff={},cache={}):
+        self.get_static_features(index,tokens,lefff=lefff,cache=cache)
         self.get_sequential_features(index,tokens)
         return
     
@@ -47,15 +47,15 @@ class Instance:
         return
 
 
-    def get_static_features(self,index,tokens,known_words={},lefff={},cache={}):
+    def get_static_features(self,index,tokens,lefff={},cache={}):
         # current word features
-        self.get_cwd_features(index, tokens, known_words, lefff, cache)
+        self.get_cwd_features(index, tokens, lefff, cache)
         # contextual features
-        self.get_conx_features(index,tokens,known_words, lefff)
+        self.get_conx_features(index,tokens,lefff)
         return
 
 
-    def get_cwd_features(self,index,tokens,known_words,lefff,cache):
+    def get_cwd_features(self,index,tokens,lefff,cache):
         word = tokens[index].string
         #### lexical features
         self.add('wd',word)
@@ -75,7 +75,7 @@ class Instance:
         return
 
 
-    def get_conx_features(self,index,tokens,known_words,lefff,w=2):
+    def get_conx_features(self,index,tokens,lefff,w=2):
         for p in range(-w,w+1):
             if p == 0: continue # skip current wd
             absp = index+p
diff --git a/src/metagger/megam_classifier.py.in b/src/metagger/megam_classifier.py.in
index 559cca4..fe05dfc 100755
--- a/src/metagger/megam_classifier.py.in
+++ b/src/metagger/megam_classifier.py.in
@@ -28,8 +28,7 @@ class MegamClassifier:
         """ the paramfile is a sequence of whitespace-separated lines
         the column column is a string feature label, while subsequent
         columns are the weight for that feature for class. The first
-        line is a map of class *names* to *column positions* for
-        example: ***NAMEDLABELSIDS*** O B I
+        line is a map of class *names* to *column positions*.
         """
         print >> sys.stderr, "Reading parameters file: %s" %self.paramfile
         
@@ -62,11 +61,13 @@ class MegamClassifier:
                     
         """
         print >> sys.stderr, ">>> Training Megam classifier..."
-        megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt"
+        try:
+            megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt"
+        except TypeError:
+            sys.exit("Missing env variable for MEGAM_DIR. You need Megam to train models.")
         self.paramfile = paramfile
         # build process command
-        proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,\
-                "-maxi", maxit]
+        proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,"-maxi", maxit]
         if not bias:
             proc.append("-nobias") 
         proc.append("multiclass") # optimization type
@@ -85,7 +86,6 @@ class MegamClassifier:
         print >> sys.stderr, "Megam parameters dumped into file %s" %self.paramfile
         # load model from output param file
         self.load_model()
-        # self.flush_model_file()
         print >> sys.stderr, "Model classes (%s): %s" %(len(self.classes),self.classes)
         print >> sys.stderr, "# of features: %s" %(len(self.weights))
         return
@@ -137,8 +137,7 @@ class MegamClassifier:
         return zip( self.classes, probs )
 
 
-    def get_uniform_distribution( self, features ):
-        return zip( self.classes, [1.0/len(self.classes) for c in self.classes] )
+
 
     
 
diff --git a/bin/pos_tag.py.in b/src/metagger/pos_tag.py.in
similarity index 100%
rename from bin/pos_tag.py.in
rename to src/metagger/pos_tag.py.in
diff --git a/src/metagger/pos_tagger.py.in b/src/metagger/pos_tagger.py.in
index 3a144eb..50436bb 100755
--- a/src/metagger/pos_tagger.py.in
+++ b/src/metagger/pos_tagger.py.in
@@ -22,7 +22,6 @@ class POSTagger:
 
 
     def __init__(self):
-        self.known_words = {}
         self.tag_dict = {}
         self.lefff_dict = defaultdict(list)
         self.classifier = MegamClassifier()
@@ -71,7 +70,6 @@ class POSTagger:
                 os.write(1, "%s" %"\b"*len(str(inst_ct))+str(inst_ct))
                 inst = Instance(tokens[i].label)
                 inst.get_features(i,tokens,
-                                  known_words=self.known_words,
                                   lefff=self.lefff_dict,
                                   cache=feat_cache)
                 print >> data_file, inst.__str__() 
@@ -91,7 +89,6 @@ class POSTagger:
             # cache static features
             cached_inst = Instance() 
             cached_inst.get_static_features(i,tokens,
-                                            known_words=self.known_words,
                                             lefff=self.lefff_dict)
             # possible tags
             legit_tags = self.tag_dict.get(token.string,[])
@@ -145,13 +142,6 @@ class POSTagger:
         return
 
 
-    def load_word_list(self, _file, encoding='latin-1'):
-        print >> sys.stderr, "Loading know words list..."
-        self.known_words = dict([l.strip().split() for l in codecs.open(_file,'r',encoding)])
-        print >> sys.stderr, "Frequent word list loaded: %s words" %(len( self.known_words))
-        return
-
-
     def load_tag_dictionary(self, _file, encoding='latin-1'):
         print >> sys.stderr, "Loading tag dictionary..."
         for l in codecs.open(_file,'r',encoding):
-- 
GitLab