Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 870432ec authored by Pierre Magistry's avatar Pierre Magistry
Browse files

oubli de changer le .in

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/branches/pierre@5504 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent 038cd05c
No related branches found
No related tags found
No related merge requests found
......@@ -215,7 +215,7 @@ class POSTagger:
data_dump_file.close()
os.write(1, "%s" %"\b"*len(str(inst_ct))+"filtering")
features_filter_exec_path = where_is_exec("MElt_features_filtering.pl")
os.system("@bindir@/MElt_features_filtering.pl \"%s\" %d" %(data_file_name,feat_options.get('ffthrsld',2)));
os.system("/home/pierre/local/bin/MElt_features_filtering.pl \"%s\" %d" %(data_file_name,feat_options.get('ffthrsld',2)));
os.write(1,'\n')
return data_file_name
......@@ -255,6 +255,7 @@ class POSTagger:
cache=self.cache )
inst.fv = cached_inst.fv[:]
inst.get_sequential_features()
#print "features (lineaire)", tokens[i], sorted(inst.fv)
label_pr_distrib = self.classifier.class_distribution(inst.fv)
# extend sequence j with current token
for (cl,pr) in label_pr_distrib:
......@@ -277,7 +278,74 @@ class POSTagger:
# print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
return best_sequence
def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=3):
    '''N-best breadth-first search for the best tag sequence over a token DAG
    (word lattice with segmentation ambiguities).

    tokens       -- DAG tokens; tok.position = (start_node, end_node)
    feat_options -- feature-selection options; DAG mode requires win == 2
    beam_size    -- number of candidate sequences kept per DAG node
    Returns the labelled token sequence with the highest total log-probability.
    '''
    # DAG decoding is only implemented for a context window of length 2.
    assert(feat_options.get('win',2) == 2)
    # sequences[p]: beam of (tagged_prefix, log_prob) pairs for every path
    # ending exactly at DAG node p.
    # NOTE(review): assumes every token's start node is 0 or the end node of
    # some other token; otherwise sequences[start] is still the numpy fill
    # value 0 and the iteration below fails -- confirm DAG connectivity.
    end_index = max([tok.position[1] for tok in tokens])
    sequences = np.zeros((end_index+1,),dtype=object)
    sequences[0] = [([],0.0)] # empty prefix, log prob. 0
    for i in range(1, end_index+1):
        n_best_sequences = []
        # get all tokens that reach position i
        reaching_tokens = [tok for tok in tokens if tok.position[1] == i]
        for token in reaching_tokens:
            # hoisted out of the context loops: these depend on token only
            wd = token.string
            wasCap = token.wasCap
            # possible tags: union of tags found in tag_dict and lex_dict
            legit_tags1 = self.tag_dict.get(wd,{})
            legit_tags2 = {} # self.lex_dict.get(wd,{})
            # get sequences that end at the beginning of the token
            previous_paths = sequences[token.position[0]]
            following_paths = next_two_in_dag(tokens,token.position[1])
            for left_context, log_pr_j in previous_paths:
                idx = len(left_context)
                for right_context in following_paths:
                    # BUGFIX: rebuild the linear window for each right
                    # context.  The previous code created linear_tokens once
                    # per left context and kept extend()-ing the same list,
                    # so from the second right context onwards the feature
                    # window also contained every previously tried context.
                    linear_tokens = left_context[:] # copy to a new array
                    linear_tokens.append(token)
                    if right_context is not None:
                        linear_tokens.extend(right_context)
                    cached_inst = Instance( label=token.label,
                                            index=idx, tokens=linear_tokens,
                                            feat_selection=feat_options,
                                            lex_dict=self.lex_dict,
                                            tag_dict=self.tag_dict,
                                            cache=self.cache )
                    cached_inst.get_static_features()
                    # classify token
                    inst = Instance( label=token.label,
                                     index=idx, tokens=linear_tokens,
                                     feat_selection=feat_options,
                                     lex_dict=self.lex_dict,
                                     tag_dict=self.tag_dict,
                                     cache=self.cache )
                    inst.fv = cached_inst.fv[:]
                    inst.get_sequential_features()
                    label_pr_distrib = self.classifier.class_distribution(inst.fv)
                    # extend sequence j with current token
                    for (cl,pr) in label_pr_distrib:
                        # make sure that cl is a legal tag
                        if legit_tags1 or legit_tags2:
                            if (cl not in legit_tags1) and (cl not in legit_tags2):
                                continue
                        labelled_token = Token(string=(token.position[0],token.position[1],token.tokobj),pos=token.pos,\
                                               comment=token.comment,\
                                               wasCap=wasCap,\
                                               label=cl,proba=pr,label_pr_distrib=label_pr_distrib)
                        n_best_sequences.append((left_context + [labelled_token], log_pr_j+math.log(pr)))
        # sort sequences by ascending total log-probability
        # NOTE(review): scores are unnormalized, so shorter paths (fewer
        # tokens) are favored; tag_token_dag (v1) divides by sequence
        # length -- confirm which behavior is intended.
        n_best_sequences.sort( key=operator.itemgetter(1) )
        #debug_n_best_sequence(n_best_sequences)
        # keep N best
        sequences[i] = n_best_sequences[-beam_size:]
    # return sequence with highest prob.
    best_sequence = sequences[-1][-1][0]
    # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
    return best_sequence
def tag_token_dag(self, tokens, feat_options={}, beam_size=3):
''' N-best breath search for the best tag sequence for each sentence with segmentation ambiguities'''
# maintain N-best sequences of tagged tokens
......@@ -329,6 +397,8 @@ class POSTagger:
#inst.fv = cached_inst.fv[:]
inst.get_static_features()
inst.get_sequential_features()
print "tokens", map(str,tokens), map(str,tokens_j)
print "features (dag)", tokens[i], sorted(inst.fv)
label_pr_distrib = self.classifier.class_distribution(inst.fv)
# import IPython
# IPython.embed()
......@@ -347,7 +417,7 @@ class POSTagger:
n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr)))
# sort sequences
n_best_sequences.sort( key=lambda x:x[1]/len(x[0]))# operator.itemgetter(1) )
debug_n_best_sequence(n_best_sequences)
#debug_n_best_sequence(n_best_sequences)
# keep N best
sequences = n_best_sequences[-beam_size:]
# return sequence with highest prob.
......@@ -427,7 +497,7 @@ class POSTagger:
for i,wd in enumerate(dag):
token = Token( string=wd, index=i )
tokens.append( token )
tagged_tokens = self.tag_token_dag( tokens,
tagged_tokens = self.tag_token_dag_v2( tokens,
feat_options=feat_options,
beam_size=beam_size )
if (print_probas):
......@@ -557,7 +627,6 @@ class DAGReader(CorpusReader):
class Token:
def __init__(self, string=None, wasCap=0, pos=None, label=None, proba=None, comment=None, label_pr_distrib=[],index=None,position=None):
if type(string) is tuple and isinstance(string[2],sxp.Token) : #DAG
self.string = string[2].forme
......@@ -1164,6 +1233,33 @@ def suivants_in_dag(tokens,i):
suivants.append(tok)
return suivants
def after_position_in_dag(tokens, i):
    '''Return the tokens whose span starts at DAG position *i* (0 when i is None).'''
    start = 0 if i is None else i
    return [tok for tok in tokens if tok.position[0] == start]
def next_two_in_dag(tokens, i):
    '''Enumerate the right contexts of length 1 or 2 starting at DAG position *i*.

    Each context is a list [first] or [first, second] of tokens; the single
    element [None] is returned when no token starts at position i.
    '''
    contexts = []
    for first in after_position_in_dag(tokens, i):
        seconds = after_position_in_dag(tokens, first.position[1])
        if seconds:
            contexts.extend([first, s] for s in seconds)
        else:
            contexts.append([first])
    return contexts or [None]
def where_is_exec(program):
u''' retourne le chemin d'un executable si il est trouvé dans le PATH'''
import os
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment