diff --git a/bin/pos_tag.sh b/bin/pos_tag.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c82bb4eaff05e8026d446e2dcaca8ca85a471458
--- /dev/null
+++ b/bin/pos_tag.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+#
+# Run pos_tag.py with the model, tag dictionary and lefff file found under
+# ${MELT_DIR}/data. Any extra arguments are passed through to pos_tag.py.
+
+# Fail early with a clear message instead of building paths like "/data/...".
+: "${MELT_DIR:?MELT_DIR must be set}"
+
+# Exported so that the python child process actually sees it.
+PYTHONPATH="${MELT_DIR}/src/melt"
+export PYTHONPATH
+MODEL="${MELT_DIR}/data/ftb1_model.megam"
+TAGDICO="${MELT_DIR}/data/ftb1_tag_dict"
+LEFFF="${MELT_DIR}/data/lefff.ftb4tags"
+# Quote every expansion so paths and arguments containing spaces survive.
+python "${MELT_DIR}/src/pos_tag.py" -m "${MODEL}" -d "${TAGDICO}" -l "${LEFFF}" "$@"
+
diff --git a/src/metagger/instance.py.in b/src/metagger/instance.py.in index 8e612025eeb6985d452c704ccdc551a9ab88f836..7e8c4c501dc7920ca0df70b5bebf9a093eff8006 100644 --- a/src/metagger/instance.py.in +++ b/src/metagger/instance.py.in @@ -33,8 +33,8 @@ class Instance: return u'%s\t%s' %(self.label," ".join(self.fv)) - def get_features(self,index,tokens,known_words={},lefff={},cache={}): - self.get_static_features(index,tokens,known_words=known_words,lefff=lefff,cache=cache) + def get_features(self,index,tokens,lefff={},cache={}): + self.get_static_features(index,tokens,lefff=lefff,cache=cache) self.get_sequential_features(index,tokens) return @@ -47,15 +47,15 @@ class Instance: return - def get_static_features(self,index,tokens,known_words={},lefff={},cache={}): + def get_static_features(self,index,tokens,lefff={},cache={}): # current word features - self.get_cwd_features(index, tokens, known_words, lefff, cache) + self.get_cwd_features(index, tokens, lefff, cache) # contextual features - self.get_conx_features(index,tokens,known_words, lefff) + self.get_conx_features(index,tokens,lefff) return - def get_cwd_features(self,index,tokens,known_words,lefff,cache): + def get_cwd_features(self,index,tokens,lefff,cache): word = tokens[index].string #### lexical features self.add('wd',word) @@ -75,7 +75,7 @@ class Instance: return - def get_conx_features(self,index,tokens,known_words,lefff,w=2): + def get_conx_features(self,index,tokens,lefff,w=2): for p in range(-w,w+1): if p == 0: continue # skip current wd absp = index+p 
diff --git a/src/metagger/megam_classifier.py.in b/src/metagger/megam_classifier.py.in index 559cca44ee1a8317593d7654ec56d63237fef68f..fe05dfc431cd667b62cb1c84adffad79aa8acac1 100755 --- a/src/metagger/megam_classifier.py.in +++ b/src/metagger/megam_classifier.py.in @@ -28,8 +28,7 @@ class MegamClassifier: """ the paramfile is a sequence of whitespace-separated lines the column column is a string feature label, while subsequent columns are the weight for that feature for class. The first - line is a map of class *names* to *column positions* for - example: ***NAMEDLABELSIDS*** O B I + line is a map of class *names* to *column positions* """ print >> sys.stderr, "Reading parameters file: %s" %self.paramfile @@ -62,11 +61,13 @@ class MegamClassifier: """ print >> sys.stderr, ">>> Training Megam classifier..." - megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt" + try: + megam_exec_path = os.environ.get("MEGAM_DIR",None)+"/megam.opt" + except TypeError: + sys.exit("Missing env variable for MEGAM_DIR. 
You need Megam to train models.") self.paramfile = paramfile # build process command - proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,\ - "-maxi", maxit] + proc = ["megam.opt", "-nc", "-repeat", repeat, "-lambda", prior_prec,"-maxi", maxit] if not bias: proc.append("-nobias") proc.append("multiclass") # optimization type @@ -85,7 +86,6 @@ class MegamClassifier: print >> sys.stderr, "Megam parameters dumped into file %s" %self.paramfile # load model from output param file self.load_model() - # self.flush_model_file() print >> sys.stderr, "Model classes (%s): %s" %(len(self.classes),self.classes) print >> sys.stderr, "# of features: %s" %(len(self.weights)) return @@ -137,8 +137,7 @@ class MegamClassifier: return zip( self.classes, probs ) - def get_uniform_distribution( self, features ): - return zip( self.classes, [1.0/len(self.classes) for c in self.classes] ) + diff --git a/bin/pos_tag.py.in b/src/metagger/pos_tag.py.in similarity index 100% rename from bin/pos_tag.py.in rename to src/metagger/pos_tag.py.in diff --git a/src/metagger/pos_tagger.py.in b/src/metagger/pos_tagger.py.in index 3a144eb80e313be8e73b8d31773ddddf3793985b..50436bbfd2b400c080f983e0970602d4a8f04d03 100755 --- a/src/metagger/pos_tagger.py.in +++ b/src/metagger/pos_tagger.py.in @@ -22,7 +22,6 @@ class POSTagger: def __init__(self): - self.known_words = {} self.tag_dict = {} self.lefff_dict = defaultdict(list) self.classifier = MegamClassifier() @@ -71,7 +70,6 @@ class POSTagger: os.write(1, "%s" %"\b"*len(str(inst_ct))+str(inst_ct)) inst = Instance(tokens[i].label) inst.get_features(i,tokens, - known_words=self.known_words, lefff=self.lefff_dict, cache=feat_cache) print >> data_file, inst.__str__() @@ -91,7 +89,6 @@ class POSTagger: # cache static features cached_inst = Instance() cached_inst.get_static_features(i,tokens, - known_words=self.known_words, lefff=self.lefff_dict) # possible tags legit_tags = self.tag_dict.get(token.string,[]) @@ -145,13 +142,6 @@ class 
POSTagger: return - def load_word_list(self, _file, encoding='latin-1'): - print >> sys.stderr, "Loading know words list..." - self.known_words = dict([l.strip().split() for l in codecs.open(_file,'r',encoding)]) - print >> sys.stderr, "Frequent word list loaded: %s words" %(len( self.known_words)) - return - - def load_tag_dictionary(self, _file, encoding='latin-1'): print >> sys.stderr, "Loading tag dictionary..." for l in codecs.open(_file,'r',encoding):