From 75c212f3d5e6c95ba66f724d1fbcba5ca4b73931 Mon Sep 17 00:00:00 2001
From: Pascal Denis <pascal.denis@alpage>
Date: Fri, 31 Jul 2009 20:15:56 +0000
Subject: [PATCH] feature cleanup

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2732 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
---
 src/metagger/instance.py.in         | 28 +++++++++++-----------------
 src/metagger/megam_classifier.py.in |  2 +-
 src/metagger/pos_tagger.py.in       |  8 ++++----
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/src/metagger/instance.py.in b/src/metagger/instance.py.in
index 4fdc91f..8e61202 100644
--- a/src/metagger/instance.py.in
+++ b/src/metagger/instance.py.in
@@ -30,7 +30,7 @@ class Instance:
 
 
     def __str__(self):                                                       
-        return '%s\t%s' %(self.label," ".join(self.fv))
+        return u'%s\t%s' %(self.label," ".join(self.fv))
 
 
     def get_features(self,index,tokens,known_words={},lefff={},cache={}):
@@ -58,19 +58,15 @@ class Instance:
     def get_cwd_features(self,index,tokens,known_words,lefff,cache):
         word = tokens[index].string
         #### lexical features
-        if word in known_words:
-            # cwd is not rare
-            self.add('wd',word)
-        else:
-            # cwd is rare
-            self.add('pref1',word[0])
-            self.add('pref2',word[:2])
-            self.add('pref3',word[:3])
-            self.add('pref4',word[:4])
-            self.add('suff1',word[-1])
-            self.add('suff2',word[-2:])
-            self.add('suff3',word[-3:])
-            self.add('suff4',word[-4:])
+        self.add('wd',word)
+        self.add('pref1',word[0])
+        self.add('pref2',word[:2])
+        self.add('pref3',word[:3])
+        self.add('pref4',word[:4])
+        self.add('suff1',word[-1])
+        self.add('suff2',word[-2:])
+        self.add('suff3',word[-3:])
+        self.add('suff4',word[-4:])
         #### word form features
         self.get_wf_features(word, index)
         #### lefff tags
@@ -85,9 +81,7 @@ class Instance:
             absp = index+p
             word = self.get_conx_wd(tokens,absp)
             #### lexical features
-            if word in known_words:
-                # cwd is not rare
-                self.add('wd%s' %p,word)
+            self.add('wd%s' %p,word)
             #### lefff tags
             if word in lefff:
                 self.add_lefff_features(word,lefff,feat_suffix=p)
diff --git a/src/metagger/megam_classifier.py.in b/src/metagger/megam_classifier.py.in
index df182eb..559cca4 100755
--- a/src/metagger/megam_classifier.py.in
+++ b/src/metagger/megam_classifier.py.in
@@ -51,7 +51,7 @@ class MegamClassifier:
 
 
     def train( self, datafile, paramfile=tempfile.mktemp(), \
-               prior_prec=1, repeat=2, maxit=100, bias=True, quiet=True ):
+               prior_prec=1, repeat=5, maxit=100, bias=True, quiet=True ):
         """ simple call to megam executable for multiclass
         classification with some relevant options:
         
diff --git a/src/metagger/pos_tagger.py.in b/src/metagger/pos_tagger.py.in
index 1bc056a..3a144eb 100755
--- a/src/metagger/pos_tagger.py.in
+++ b/src/metagger/pos_tagger.py.in
@@ -93,6 +93,8 @@ class POSTagger:
             cached_inst.get_static_features(i,tokens,
                                             known_words=self.known_words,
                                             lefff=self.lefff_dict)
+            # tags listed in tag_dict for this token; an empty list (token not in tag_dict) disables the tag filter below
+            legit_tags = self.tag_dict.get(token.string,[])
             for j in range(len(sequences)):
                 seq_j,log_pr_j = sequences[j]
                 tokens_j = seq_j+tokens[i:]
@@ -104,10 +106,8 @@ class POSTagger:
                 # extend sequence j with current token
                 for (cl,pr) in label_pr_distrib:
                     # make sure that cl is a possible tag
-                    if token.string in self.tag_dict:
-                        legit_tags = self.tag_dict[token.string]
-                        if not cl in legit_tags:
-                            continue
+                    if legit_tags and not cl in legit_tags:
+                        continue
                     labelled_token = Token(string=token.string,pos=token.pos,\
                                            label=cl,label_pr_distrib=label_pr_distrib)
                     n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr)))
-- 
GitLab