Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 17d2e225 authored by Pascal Denis's avatar Pascal Denis
Browse files

clean up; addition of shell script pos_tag.sh

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2793 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent c7d4d0dd
No related branches found
No related tags found
No related merge requests found
#!/bin/sh
# Run pos_tag.py with the model, tag dictionary and Lefff lexicon found
# under $MELT_DIR/data. Any extra command-line arguments are forwarded
# unchanged to pos_tag.py.
#
# Requires: MELT_DIR set to the directory containing src/ and data/.

# Fail early with a clear message; otherwise every path below would
# silently be rooted at "/".
: "${MELT_DIR:?MELT_DIR must be set to the directory containing src/ and data/}"

# A plain assignment is only visible to this shell; export it so the
# python child process actually gets the module search path.
PYTHONPATH="${MELT_DIR}/src/melt"
export PYTHONPATH

MODEL="${MELT_DIR}/data/ftb1_model.megam"
TAGDICO="${MELT_DIR}/data/ftb1_tag_dict"
LEFFF="${MELT_DIR}/data/lefff.ftb4tags"

# Quoted "$@" keeps each forwarded argument intact, even with spaces.
exec python "${MELT_DIR}/src/pos_tag.py" -m "${MODEL}" -d "${TAGDICO}" -l "${LEFFF}" "$@"
......@@ -33,8 +33,8 @@ class Instance:
return u'%s\t%s' %(self.label," ".join(self.fv))
def get_features(self,index,tokens,known_words={},lefff={},cache={}):
self.get_static_features(index,tokens,known_words=known_words,lefff=lefff,cache=cache)
def get_features(self,index,tokens,lefff={},cache={}):
self.get_static_features(index,tokens,lefff=lefff,cache=cache)
self.get_sequential_features(index,tokens)
return
......@@ -47,15 +47,15 @@ class Instance:
return
def get_static_features(self,index,tokens,known_words={},lefff={},cache={}):
def get_static_features(self,index,tokens,lefff={},cache={}):
# current word features
self.get_cwd_features(index, tokens, known_words, lefff, cache)
self.get_cwd_features(index, tokens, lefff, cache)
# contextual features
self.get_conx_features(index,tokens,known_words, lefff)
self.get_conx_features(index,tokens,lefff)
return
def get_cwd_features(self,index,tokens,known_words,lefff,cache):
def get_cwd_features(self,index,tokens,lefff,cache):
word = tokens[index].string
#### lexical features
self.add('wd',word)
......@@ -75,7 +75,7 @@ class Instance:
return
def get_conx_features(self,index,tokens,known_words,lefff,w=2):
def get_conx_features(self,index,tokens,lefff,w=2):
for p in range(-w,w+1):
if p == 0: continue # skip current wd
absp = index+p
......
......@@ -28,8 +28,7 @@ class MegamClassifier:
""" the paramfile is a sequence of whitespace-separated lines
the first column is a string feature label, while subsequent
columns are the weight for that feature for class. The first
line is a map of class *names* to *column positions* for
example: ***NAMEDLABELSIDS*** O B I
line is a map of class *names* to *column positions*
"""
print >> sys.stderr, "Reading parameters file: %s" %self.paramfile
......@@ -62,11 +61,13 @@ class MegamClassifier:
"""
print >> sys.stderr, ">>> Training Megam classifier..."
megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt"
try:
megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt"
except TypeError:
sys.exit("Missing env variable for MEGAM_DIR. You need Megam to train models.")
self.paramfile = paramfile
# build process command
proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,\
"-maxi", maxit]
proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,"-maxi", maxit]
if not bias:
proc.append("-nobias")
proc.append("multiclass") # optimization type
......@@ -85,7 +86,6 @@ class MegamClassifier:
print >> sys.stderr, "Megam parameters dumped into file %s" %self.paramfile
# load model from output param file
self.load_model()
# self.flush_model_file()
print >> sys.stderr, "Model classes (%s): %s" %(len(self.classes),self.classes)
print >> sys.stderr, "# of features: %s" %(len(self.weights))
return
......@@ -137,8 +137,7 @@ class MegamClassifier:
return zip( self.classes, probs )
def get_uniform_distribution( self, features ):
return zip( self.classes, [1.0/len(self.classes) for c in self.classes] )
......
File moved
......@@ -22,7 +22,6 @@ class POSTagger:
def __init__(self):
self.known_words = {}
self.tag_dict = {}
self.lefff_dict = defaultdict(list)
self.classifier = MegamClassifier()
......@@ -71,7 +70,6 @@ class POSTagger:
os.write(1, "%s" %"\b"*len(str(inst_ct))+str(inst_ct))
inst = Instance(tokens[i].label)
inst.get_features(i,tokens,
known_words=self.known_words,
lefff=self.lefff_dict,
cache=feat_cache)
print >> data_file, inst.__str__()
......@@ -91,7 +89,6 @@ class POSTagger:
# cache static features
cached_inst = Instance()
cached_inst.get_static_features(i,tokens,
known_words=self.known_words,
lefff=self.lefff_dict)
# possible tags
legit_tags = self.tag_dict.get(token.string,[])
......@@ -145,13 +142,6 @@ class POSTagger:
return
def load_word_list(self, _file, encoding='latin-1'):
print >> sys.stderr, "Loading know words list..."
self.known_words = dict([l.strip().split() for l in codecs.open(_file,'r',encoding)])
print >> sys.stderr, "Frequent word list loaded: %s words" %(len( self.known_words))
return
def load_tag_dictionary(self, _file, encoding='latin-1'):
print >> sys.stderr, "Loading tag dictionary..."
for l in codecs.open(_file,'r',encoding):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment