Commit b606bdb0 authored by Pascal Denis

refactoring; raw text tagging

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2687 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent 00713268
@@ -20,9 +20,9 @@ BUILT_SOURCES = build_word_list.py \
         pos_tag.py \
         eval.py
-EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g;\
+EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g; \
         s|\@datadir\@|\$(pkgdatadir)|g; \
-        s|\@libdir\@|\$(pkglibdir)|g; \
+        s|\@libdir\@|\$(pkglibdir)|g; \
         s|\@bindir\@|\$(bindir)|g; "
 %.py: %.py.in
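
For context, the EDIT rule above rewrites each *.py.in template into the corresponding script by substituting the configure-time paths for the @alexinadir@, @datadir@, @libdir@ and @bindir@ placeholders; the visible change here just adds a space before the backslash continuations. A rough Python sketch of the same substitution, purely for illustration (the edit() function and the example path are hypothetical; the real work is done by perl in the Makefile):

def edit(template_text, paths):
    # paths is a placeholder dict such as {'@alexinadir@': '/usr/share/alexina', ...};
    # in the Makefile the values come from $(alexinadir), $(pkgdatadir), $(pkglibdir), $(bindir)
    for placeholder, value in paths.items():
        template_text = template_text.replace(placeholder, value)
    return template_text
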
@@ -11,21 +11,15 @@ from metagger.pos_tagger import POSTagger
 usage = "usage: %prog [options] <input_file>"
 parser = optparse.OptionParser(usage=usage)
 parser.add_option("-m", "--model", action="store", help="model file")
-parser.add_option("-t", "--training", action="store", help="training data file")
-parser.add_option("-b", "--beam_size", action="store", help="set beam size", type=int, default=5)
+parser.add_option("-t", "--train", action="store", help="training data file")
+parser.add_option("-b", "--beam_size", action="store", help="set beam size", type=int, default=3)
+parser.add_option("-p", "--prior_prec", action="store", help="set precision of gaussian prior", type=int, default=1)
 parser.add_option("-w", "--word_list", action="store", help="read in word_list", default='')
 parser.add_option("-d", "--tag_dict", action="store", help="read in tag dictionary", default='')
 parser.add_option("-l", "--lefff", action="store", help="read in Lefff DB", default='')
 parser.add_option("-o", "--output_file", action="store", help="output file", default='')
 (options, args) = parser.parse_args()
-model_file = options.model
-train_file = options.training
-beam_size = options.beam_size
-tag_dict_file = options.tag_dict
-word_list_file = options.word_list
-lefff_file = options.lefff
-out_file = options.output_file
 infile = args[0]
 print >> sys.stderr, options
@@ -34,34 +28,35 @@ print >> sys.stderr, options
 pos_tagger = POSTagger()
 # read-in tag dictionary from file
-if tag_dict_file:
-    pos_tagger.load_tag_dictionary( tag_dict_file )
+if options.tag_dict:
+    pos_tagger.load_tag_dictionary( options.tag_dict )
 else:
     print >> sys.stderr, "Warning: No tag dictionary provided"
 # read in known words list from file
-if word_list_file:
-    pos_tagger.load_word_list( word_list_file )
+if options.word_list:
+    pos_tagger.load_word_list( options.word_list )
 else:
     print >> sys.stderr, "Warning: No word list provided"
 # read-in Lefff
-if lefff_file:
-    pos_tagger.load_lefff( lefff_file )
+if options.lefff:
+    pos_tagger.load_lefff( options.lefff )
 else:
     print >> sys.stderr, "Warning: Lefff not provided"
 # load model if provided
-if train_file:
-    pos_tagger.train_model( train_file, model_path=model_file )
-elif model_file:
-    pos_tagger.load_model( model_file )
+if options.train:
+    pos_tagger.train_model( options.train, prior_prec=options.prior_prec, \
+                            model_path=options.model )
+elif options.model:
+    pos_tagger.load_model( options.model )
 else:
     sys.exit("Error: please provide model file (-m) or training file (-t)")
 ############## apply tagger ##################################
-pos_tagger.apply( infile, outfile=out_file )
+pos_tagger.apply( infile, beam_size=options.beam_size, outfile=options.output_file )
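
With this refactoring the driver script reads everything directly from options instead of copying values into local variables, passes the new prior precision through to training, and hands the beam size down to apply(). Two hedged example invocations, matching the usage string "%prog [options] <input_file>" (all file names below are made up):

    pos_tag.py -t train.pos -m model.megam -d tags.dict -w words.lst raw.txt     # train a model, then tag raw.txt
    pos_tag.py -m model.megam -b 3 -o tagged.txt raw.txt                         # tag raw.txt with an existing model
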
Source diff could not be displayed: it is too large.
@@ -23,9 +23,9 @@ BUILT_SOURCES = __init__.py \
         megam_classifier.py \
         result_sink.py
-EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g;\
+EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g; \
         s|\@datadir\@|\$(pkgdatadir)|g; \
-        s|\@libdir\@|\$(pkglibdir)|g; \
+        s|\@libdir\@|\$(pkglibdir)|g; \
         s|\@bindir\@|\$(bindir)|g; "
 %.py: %.py.in
@@ -50,11 +50,12 @@ class MegamClassifier:
     def train( self, datafile, paramfile=tempfile.mktemp(), \
-               prior=1, repeat=4, maxit=100, quiet=True ):
+               prior_prec=1, repeat=4, maxit=100, quiet=True ):
         """ simple call to megam executable for multiclass
         classification with some relevant options:
-        -prior: precision of Gaussian prior (megam default:1)
+        -prior_prec: precision of Gaussian prior (megam default:1). It's
+         the inverse variance. See http://www.cs.utah.edu/~hal/docs/daume04cg-bfgs.pdf.
         -repeat: repeat optimization <int> times (megam default:1)
         -maxit: max # of iterations (megam default:100)
@@ -64,7 +65,7 @@ class MegamClassifier:
         self.paramfile = paramfile
         # build process command
         cmd = 'megam.opt -nc -repeat %s -lambda %s -maxi %s multiclass %s 1> %s' \
-              %(repeat,prior,maxit,datafile,self.paramfile)
+              %(repeat,prior_prec,maxit,datafile,self.paramfile)
         if quiet:
             cmd += " 2> /tmp/null"
         # run command
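
The renamed prior_prec argument is forwarded to megam's -lambda flag, i.e. the precision (inverse variance) of the Gaussian prior: the larger the value, the tighter the prior around zero and the stronger the regularization of the feature weights. A minimal sketch of the command string that train() builds, assuming a call like classifier.train('train.feats', paramfile='model.megam') with the defaults prior_prec=1, repeat=4, maxit=100 (file names are hypothetical):

cmd = 'megam.opt -nc -repeat %s -lambda %s -maxi %s multiclass %s 1> %s' \
      % (4, 1, 100, 'train.feats', 'model.megam')
# cmd == "megam.opt -nc -repeat 4 -lambda 1 -maxi 100 multiclass train.feats 1> model.megam"
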
@@ -19,8 +19,7 @@ from metagger.instance import Instance
 class POSTagger:
-    def __init__(self, beam_size=5):
-        self.beam_size = beam_size
+    def __init__(self):
         self.known_words = []
         self.tag_dict = {}
         self.lefff_dict = defaultdict(list)
@@ -44,7 +43,8 @@ class POSTagger:
         train_inst_file = self.generate_training_data(_file)
         print >> sys.stderr, "Data file: %s" %train_inst_file
         print >> sys.stderr, "Training POS model..."
-        self.classifier.train( train_inst_file, quiet=False )
+        self.classifier.train( train_inst_file, paramfile=model_path, \
+                               prior_prec=1, quiet=False )
         return
@@ -78,7 +78,7 @@ class POSTagger:
-    def tag_token_sequence(self, tokens):
+    def tag_token_sequence(self, tokens, beam_size):
         ''' N-best breadth search for the best tag sequence for each sentence'''
         # maintain N-best sequences of tagged tokens
         sequences = [([],0.0)] # log prob.
@@ -111,14 +111,14 @@
             # sort sequences
             n_best_sequences.sort(lambda x,y:cmp(x[1],y[1]))
             # keep N best
-            sequences = n_best_sequences[-self.beam_size:]
+            sequences = n_best_sequences[-beam_size:]
         # return sequence with highest prob.
         best_sequence = sequences[-1][0]
         # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
         return best_sequence
-    def apply(self, _file, outfile=''):
+    def apply(self, _file, beam_size=3, outfile=''):
         # open output file
         out = sys.stdout
         if outfile:
@@ -130,7 +130,7 @@
             for item in line.split():
                 token = Token( string=item )
                 tokens.append( token )
-            tagged_tokens = self.tag_token_sequence( tokens )
+            tagged_tokens = self.tag_token_sequence( tokens, beam_size )
             # print tagged sentence to output file
             tagged_sent = " ".join( [tok.__str__() for tok in tagged_tokens] )
             print >> out, tagged_sent
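
The net effect of the POSTagger changes is that beam_size is no longer fixed at construction time: it travels from apply() down to tag_token_sequence(), where only the beam_size highest-scoring partial tag sequences are kept after each pruning step. A hedged sketch of the new calling convention, mirroring what pos_tag.py now does (resource and file paths are placeholders):

from metagger.pos_tagger import POSTagger

tagger = POSTagger()                       # beam size is no longer a constructor argument
tagger.load_tag_dictionary('tags.dict')    # optional resources, loaded as in pos_tag.py
tagger.load_word_list('words.lst')
tagger.load_model('model.megam')
tagger.apply('raw.txt', beam_size=3, outfile='tagged.txt')
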
@@ -17,10 +17,9 @@ class AccuracySink(ResultSink):
         self.matrix = {}
         self.tags = {}
         self.model_predictions = []
-        self.label_distributions = []
         return
-    def update(self,gold_cl,pred_cl,label_distribution):
+    def update(self,gold_cl,pred_cl):
         self.total += 1
         self.total_by_tag[gold_cl] = self.total_by_tag.get(gold_cl,0) + 1
         self.pred_by_tag[pred_cl] = self.pred_by_tag.get(pred_cl,0) + 1
@@ -30,7 +29,6 @@ class AccuracySink(ResultSink):
             self.tags[gold_cl] = 1
         self.matrix[(gold_cl,pred_cl)] = self.matrix.get((gold_cl,pred_cl),0) + 1
         self.model_predictions.append((gold_cl,pred_cl))
-        self.label_distributions.append(label_distribution)
         return
     def score(self, rounding=3 ):
@@ -62,12 +60,6 @@ class AccuracySink(ResultSink):
         print "\n".join(truth_and_predicted)
         print "============================================"
-    def distributions(self):
-        print "====== Model distributions on test data ======"
-        print "\n".join([repr(item) for item in self.label_distributions])
-        print "=============================================="
-        return
     def rpf(self):
         """Print R-P-F scores by class labels"""
         print "Recall/Precision/F1 by class labels"
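
After this simplification AccuracySink only accumulates gold/predicted label pairs; per-instance label distributions are no longer stored, and the distributions() report goes away with them. A minimal usage sketch under two assumptions not shown in this diff: that the class is importable as metagger.result_sink.AccuracySink with a no-argument constructor, and that score() returns the overall accuracy (the label pairs below are made up):

from metagger.result_sink import AccuracySink   # assumed module path

sink = AccuracySink()
for gold_cl, pred_cl in [('DET', 'DET'), ('NC', 'ADJ'), ('V', 'V')]:
    sink.update(gold_cl, pred_cl)                # new two-argument signature
print sink.score()                               # assumed to return accuracy, rounded to rounding=3 decimals
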