Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 75c212f3 authored by Pascal Denis's avatar Pascal Denis
Browse files

feature cleanup

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2732 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent c239e5c8
No related branches found
No related tags found
No related merge requests found
......@@ -30,7 +30,7 @@ class Instance:
def __str__(self):
return '%s\t%s' %(self.label," ".join(self.fv))
return u'%s\t%s' %(self.label," ".join(self.fv))
def get_features(self,index,tokens,known_words={},lefff={},cache={}):
......@@ -58,19 +58,15 @@ class Instance:
def get_cwd_features(self,index,tokens,known_words,lefff,cache):
word = tokens[index].string
#### lexical features
if word in known_words:
# cwd is not rare
self.add('wd',word)
else:
# cwd is rare
self.add('pref1',word[0])
self.add('pref2',word[:2])
self.add('pref3',word[:3])
self.add('pref4',word[:4])
self.add('suff1',word[-1])
self.add('suff2',word[-2:])
self.add('suff3',word[-3:])
self.add('suff4',word[-4:])
self.add('wd',word)
self.add('pref1',word[0])
self.add('pref2',word[:2])
self.add('pref3',word[:3])
self.add('pref4',word[:4])
self.add('suff1',word[-1])
self.add('suff2',word[-2:])
self.add('suff3',word[-3:])
self.add('suff4',word[-4:])
#### word form features
self.get_wf_features(word, index)
#### lefff tags
......@@ -85,9 +81,7 @@ class Instance:
absp = index+p
word = self.get_conx_wd(tokens,absp)
#### lexical features
if word in known_words:
# cwd is not rare
self.add('wd%s' %p,word)
self.add('wd%s' %p,word)
#### lefff tags
if word in lefff:
self.add_lefff_features(word,lefff,feat_suffix=p)
......
......@@ -51,7 +51,7 @@ class MegamClassifier:
def train( self, datafile, paramfile=tempfile.mktemp(), \
prior_prec=1, repeat=2, maxit=100, bias=True, quiet=True ):
prior_prec=1, repeat=5, maxit=100, bias=True, quiet=True ):
""" simple call to megam executable for multiclass
classification with some relevant options:
......
......@@ -93,6 +93,8 @@ class POSTagger:
cached_inst.get_static_features(i,tokens,
known_words=self.known_words,
lefff=self.lefff_dict)
# possible tags
legit_tags = self.tag_dict.get(token.string,[])
for j in range(len(sequences)):
seq_j,log_pr_j = sequences[j]
tokens_j = seq_j+tokens[i:]
......@@ -104,10 +106,8 @@ class POSTagger:
# extend sequence j with current token
for (cl,pr) in label_pr_distrib:
# make sure that cl is a possible tag
if token.string in self.tag_dict:
legit_tags = self.tag_dict[token.string]
if not cl in legit_tags:
continue
if legit_tags and not cl in legit_tags:
continue
labelled_token = Token(string=token.string,pos=token.pos,\
label=cl,label_pr_distrib=label_pr_distrib)
n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr)))
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment