From 17d2e225a390c08d244640d77c90c0ce82cfae6c Mon Sep 17 00:00:00 2001 From: Pascal Denis <pascal.denis@alpage> Date: Tue, 6 Oct 2009 08:00:47 +0000 Subject: [PATCH] clean up; addition of shell script pos_tag.sh git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2793 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4 --- bin/pos_tag.sh | 8 ++++++++ src/metagger/instance.py.in | 14 +++++++------- src/metagger/megam_classifier.py.in | 15 +++++++-------- {bin => src/metagger}/pos_tag.py.in | 0 src/metagger/pos_tagger.py.in | 10 ---------- 5 files changed, 22 insertions(+), 25 deletions(-) create mode 100644 bin/pos_tag.sh rename {bin => src/metagger}/pos_tag.py.in (100%) diff --git a/bin/pos_tag.sh b/bin/pos_tag.sh new file mode 100644 index 0000000..c82bb4e --- /dev/null +++ b/bin/pos_tag.sh @@ -0,0 +1,8 @@ +#!/usr/bin/sh + +PYTHONPATH=${MELT_DIR}/src/melt +MODEL=${MELT_DIR}/data/ftb1_model.megam +TAGDICO=${MELT_DIR}/data/ftb1_tag_dict +LEFFF=${MELT_DIR}/data/lefff.ftb4tags +python ${MELT_DIR}/src/pos_tag.py -m ${MODEL} -d ${TAGDICO} -l ${LEFFF} $@ + diff --git a/src/metagger/instance.py.in b/src/metagger/instance.py.in index 8e61202..7e8c4c5 100644 --- a/src/metagger/instance.py.in +++ b/src/metagger/instance.py.in @@ -33,8 +33,8 @@ class Instance: return u'%s\t%s' %(self.label," ".join(self.fv)) - def get_features(self,index,tokens,known_words={},lefff={},cache={}): - self.get_static_features(index,tokens,known_words=known_words,lefff=lefff,cache=cache) + def get_features(self,index,tokens,lefff={},cache={}): + self.get_static_features(index,tokens,lefff=lefff,cache=cache) self.get_sequential_features(index,tokens) return @@ -47,15 +47,15 @@ class Instance: return - def get_static_features(self,index,tokens,known_words={},lefff={},cache={}): + def get_static_features(self,index,tokens,lefff={},cache={}): # current word features - self.get_cwd_features(index, tokens, known_words, lefff, cache) + self.get_cwd_features(index, tokens, lefff, cache) # contextual features - self.get_conx_features(index,tokens,known_words, lefff) + self.get_conx_features(index,tokens,lefff) return - def get_cwd_features(self,index,tokens,known_words,lefff,cache): + def get_cwd_features(self,index,tokens,lefff,cache): word = tokens[index].string #### lexical features self.add('wd',word) @@ -75,7 +75,7 @@ class Instance: return - def get_conx_features(self,index,tokens,known_words,lefff,w=2): + def get_conx_features(self,index,tokens,lefff,w=2): for p in range(-w,w+1): if p == 0: continue # skip current wd absp = index+p diff --git a/src/metagger/megam_classifier.py.in b/src/metagger/megam_classifier.py.in index 559cca4..fe05dfc 100755 --- a/src/metagger/megam_classifier.py.in +++ b/src/metagger/megam_classifier.py.in @@ -28,8 +28,7 @@ class MegamClassifier: """ the paramfile is a sequence of whitespace-separated lines the column column is a string feature label, while subsequent columns are the weight for that feature for class. The first - line is a map of class *names* to *column positions* for - example: ***NAMEDLABELSIDS*** O B I + line is a map of class *names* to *column positions* """ print >> sys.stderr, "Reading parameters file: %s" %self.paramfile @@ -62,11 +61,13 @@ class MegamClassifier: """ print >> sys.stderr, ">>> Training Megam classifier..." - megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt" + try: + megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt" + except TypeError: + sys.exit("Missing env variable for MEGAM_DIR. You need Megam to train models.") self.paramfile = paramfile # build process command - proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,\ - "-maxi", maxit] + proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,"-maxi", maxit] if not bias: proc.append("-nobias") proc.append("multiclass") # optimization type @@ -85,7 +86,6 @@ class MegamClassifier: print >> sys.stderr, "Megam parameters dumped into file %s" %self.paramfile # load model from output param file self.load_model() - # self.flush_model_file() print >> sys.stderr, "Model classes (%s): %s" %(len(self.classes),self.classes) print >> sys.stderr, "# of features: %s" %(len(self.weights)) return @@ -137,8 +137,7 @@ class MegamClassifier: return zip( self.classes, probs ) - def get_uniform_distribution( self, features ): - return zip( self.classes, [1.0/len(self.classes) for c in self.classes] ) + diff --git a/bin/pos_tag.py.in b/src/metagger/pos_tag.py.in similarity index 100% rename from bin/pos_tag.py.in rename to src/metagger/pos_tag.py.in diff --git a/src/metagger/pos_tagger.py.in b/src/metagger/pos_tagger.py.in index 3a144eb..50436bb 100755 --- a/src/metagger/pos_tagger.py.in +++ b/src/metagger/pos_tagger.py.in @@ -22,7 +22,6 @@ class POSTagger: def __init__(self): - self.known_words = {} self.tag_dict = {} self.lefff_dict = defaultdict(list) self.classifier = MegamClassifier() @@ -71,7 +70,6 @@ class POSTagger: os.write(1, "%s" %"\b"*len(str(inst_ct))+str(inst_ct)) inst = Instance(tokens[i].label) inst.get_features(i,tokens, - known_words=self.known_words, lefff=self.lefff_dict, cache=feat_cache) print >> data_file, inst.__str__() @@ -91,7 +89,6 @@ class POSTagger: # cache static features cached_inst = Instance() cached_inst.get_static_features(i,tokens, - known_words=self.known_words, lefff=self.lefff_dict) # possible tags legit_tags = self.tag_dict.get(token.string,[]) @@ -145,13 +142,6 @@ class POSTagger: return - def load_word_list(self, _file, encoding='latin-1'): - print >> sys.stderr, "Loading know words list..." - self.known_words = dict([l.strip().split() for l in codecs.open(_file,'r',encoding)]) - print >> sys.stderr, "Frequent word list loaded: %s words" %(len( self.known_words)) - return - - def load_tag_dictionary(self, _file, encoding='latin-1'): print >> sys.stderr, "Loading tag dictionary..." for l in codecs.open(_file,'r',encoding): -- GitLab