From 75c212f3d5e6c95ba66f724d1fbcba5ca4b73931 Mon Sep 17 00:00:00 2001 From: Pascal Denis <pascal.denis@alpage> Date: Fri, 31 Jul 2009 20:15:56 +0000 Subject: [PATCH] feature cleanup git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2732 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4 --- src/metagger/instance.py.in | 28 +++++++++++----------------- src/metagger/megam_classifier.py.in | 2 +- src/metagger/pos_tagger.py.in | 8 ++++---- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/metagger/instance.py.in b/src/metagger/instance.py.in index 4fdc91f..8e61202 100644 --- a/src/metagger/instance.py.in +++ b/src/metagger/instance.py.in @@ -30,7 +30,7 @@ class Instance: def __str__(self): - return '%s\t%s' %(self.label," ".join(self.fv)) + return u'%s\t%s' %(self.label," ".join(self.fv)) def get_features(self,index,tokens,known_words={},lefff={},cache={}): @@ -58,19 +58,15 @@ class Instance: def get_cwd_features(self,index,tokens,known_words,lefff,cache): word = tokens[index].string #### lexical features - if word in known_words: - # cwd is not rare - self.add('wd',word) - else: - # cwd is rare - self.add('pref1',word[0]) - self.add('pref2',word[:2]) - self.add('pref3',word[:3]) - self.add('pref4',word[:4]) - self.add('suff1',word[-1]) - self.add('suff2',word[-2:]) - self.add('suff3',word[-3:]) - self.add('suff4',word[-4:]) + self.add('wd',word) + self.add('pref1',word[0]) + self.add('pref2',word[:2]) + self.add('pref3',word[:3]) + self.add('pref4',word[:4]) + self.add('suff1',word[-1]) + self.add('suff2',word[-2:]) + self.add('suff3',word[-3:]) + self.add('suff4',word[-4:]) #### word form features self.get_wf_features(word, index) #### lefff tags @@ -85,9 +81,7 @@ class Instance: absp = index+p word = self.get_conx_wd(tokens,absp) #### lexical features - if word in known_words: - # cwd is not rare - self.add('wd%s' %p,word) + self.add('wd%s' %p,word) #### lefff tags if word in lefff: self.add_lefff_features(word,lefff,feat_suffix=p) diff --git a/src/metagger/megam_classifier.py.in b/src/metagger/megam_classifier.py.in index df182eb..559cca4 100755 --- a/src/metagger/megam_classifier.py.in +++ b/src/metagger/megam_classifier.py.in @@ -51,7 +51,7 @@ class MegamClassifier: def train( self, datafile, paramfile=tempfile.mktemp(), \ - prior_prec=1, repeat=2, maxit=100, bias=True, quiet=True ): + prior_prec=1, repeat=5, maxit=100, bias=True, quiet=True ): """ simple call to megam executable for multiclass classification with some relevant options: diff --git a/src/metagger/pos_tagger.py.in b/src/metagger/pos_tagger.py.in index 1bc056a..3a144eb 100755 --- a/src/metagger/pos_tagger.py.in +++ b/src/metagger/pos_tagger.py.in @@ -93,6 +93,8 @@ class POSTagger: cached_inst.get_static_features(i,tokens, known_words=self.known_words, lefff=self.lefff_dict) + # possible tags + legit_tags = self.tag_dict.get(token.string,[]) for j in range(len(sequences)): seq_j,log_pr_j = sequences[j] tokens_j = seq_j+tokens[i:] @@ -104,10 +106,8 @@ class POSTagger: # extend sequence j with current token for (cl,pr) in label_pr_distrib: # make sure that cl is a possible tag - if token.string in self.tag_dict: - legit_tags = self.tag_dict[token.string] - if not cl in legit_tags: - continue + if legit_tags and not cl in legit_tags: + continue labelled_token = Token(string=token.string,pos=token.pos,\ label=cl,label_pr_distrib=label_pr_distrib) n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr))) -- GitLab