From 870432ec89fd16aa37ec0930dcf83d0afc6b1e84 Mon Sep 17 00:00:00 2001
From: Pierre Magistry <pierre.magistry@alpage>
Date: Tue, 14 Oct 2014 13:04:10 +0000
Subject: [PATCH] forgot to update the .in file

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/branches/pierre@5504 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
---
 bin/MElt_tagger.py.in | 106 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 5 deletions(-)

diff --git a/bin/MElt_tagger.py.in b/bin/MElt_tagger.py.in
index 3d082d6..dc5692c 100644
--- a/bin/MElt_tagger.py.in
+++ b/bin/MElt_tagger.py.in
@@ -215,7 +215,7 @@ class POSTagger:
         data_dump_file.close()
         os.write(1, "%s" %"\b"*len(str(inst_ct))+"filtering")
         features_filter_exec_path = where_is_exec("MElt_features_filtering.pl")
-        os.system("@bindir@/MElt_features_filtering.pl \"%s\" %d" %(data_file_name,feat_options.get('ffthrsld',2)));
+        os.system("/home/pierre/local/bin/MElt_features_filtering.pl \"%s\" %d" %(data_file_name,feat_options.get('ffthrsld',2)));
         os.write(1,'\n')
         return data_file_name
 
@@ -255,6 +255,7 @@ class POSTagger:
                                  cache=self.cache )
                 inst.fv = cached_inst.fv[:]
                 inst.get_sequential_features()
+                #print "features (lineaire)", tokens[i], sorted(inst.fv)
                 label_pr_distrib = self.classifier.class_distribution(inst.fv)
                 # extend sequence j with current token
                 for (cl,pr) in label_pr_distrib:
@@ -277,7 +278,74 @@ class POSTagger:
 
         # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
         return best_sequence
-
+    def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=3):
+        ''' N-best breadth-first search for the best tag sequence for each sentence'''
+        # DAG mode is only allowed with a window of length 2
+        assert(feat_options.get('win',2) == 2)
+        # maintain N-best sequences of tagged tokens
+        end_index = max([tok.position[1] for tok in tokens])
+        sequences = np.zeros((end_index+1,),dtype=object)
+        sequences[0] = [([],0.0)] # log prob.
+        for i in range(1, end_index+1):
+            n_best_sequences = []
+            # get all tokens that reach position i
+            reaching_tokens = [tok for tok in tokens if tok.position[1] == i]
+            for token in reaching_tokens:
+                # get sequences that end at the beginning of the token
+                previous_paths = sequences[token.position[0]]
+                following_paths = next_two_in_dag(tokens,token.position[1])
+                for left_context, log_pr_j in previous_paths:
+                    idx = len(left_context)
+                    linear_tokens = left_context[:] # copy to a new list
+                    linear_tokens.append(token)
+                    for right_context in following_paths:
+                        if right_context is not None:
+                            linear_tokens.extend(right_context)
+                        cached_inst = Instance( label=token.label,
+                                                index=idx, tokens=linear_tokens,
+                                                feat_selection=feat_options,
+                                                lex_dict=self.lex_dict,
+                                                tag_dict=self.tag_dict,
+                                                cache=self.cache )
+                        cached_inst.get_static_features()
+                        # get possible tags: union of tags found in tag_dict and
+                        # lex_dict
+                        wd = token.string
+                        wasCap = token.wasCap
+                        legit_tags1 = self.tag_dict.get(wd,{})
+                        legit_tags2 = {} # self.lex_dict.get(wd,{})
+                        # classify token
+                        inst = Instance( label=token.label,
+                                         index=idx, tokens=linear_tokens,
+                                         feat_selection=feat_options,
+                                         lex_dict=self.lex_dict,
+                                         tag_dict=self.tag_dict,
+                                         cache=self.cache )
+                        inst.fv = cached_inst.fv[:]
+                        inst.get_sequential_features()
+                        label_pr_distrib = self.classifier.class_distribution(inst.fv)
+                        # extend sequence j with current token
+                        for (cl,pr) in label_pr_distrib:
+                            # make sure that cl is a legal tag
+                            if legit_tags1 or legit_tags2:
+                                if (cl not in legit_tags1) and (cl not in legit_tags2):
+                                    continue
+                            labelled_token = Token(string=(token.position[0],token.position[1],token.tokobj),pos=token.pos,\
+                                                   comment=token.comment,\
+                                                   wasCap=wasCap,\
+                                                   label=cl,proba=pr,label_pr_distrib=label_pr_distrib)
+                            n_best_sequences.append((left_context + [labelled_token], log_pr_j+math.log(pr)))
+            # sort sequences
+            n_best_sequences.sort( key=operator.itemgetter(1) )
+            #debug_n_best_sequence(n_best_sequences)
+            # keep N best
+            sequences[i] = n_best_sequences[-beam_size:]
+        # return sequence with highest prob.
+        best_sequence = sequences[-1][-1][0]
+        # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
+        return best_sequence
+
+
     def tag_token_dag(self, tokens, feat_options={}, beam_size=3):
         ''' N-best breadth-first search for the best tag sequence for each sentence with segmentation ambiguities'''
         # maintain N-best sequences of tagged tokens
@@ -329,6 +397,8 @@ class POSTagger:
                 #inst.fv = cached_inst.fv[:]
                 inst.get_static_features()
                 inst.get_sequential_features()
+                print "tokens", map(str,tokens), map(str,tokens_j)
+                print "features (dag)", tokens[i], sorted(inst.fv)
                 label_pr_distrib = self.classifier.class_distribution(inst.fv)
                 # import IPython
                 # IPython.embed()
@@ -347,7 +417,7 @@ class POSTagger:
                     n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr)))
             # sort sequences
             n_best_sequences.sort( key=lambda x:x[1]/len(x[0]))# operator.itemgetter(1) )
-            debug_n_best_sequence(n_best_sequences)
+            #debug_n_best_sequence(n_best_sequences)
             # keep N best
             sequences = n_best_sequences[-beam_size:]
         # return sequence with highest prob.
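Note on the new method: tag_token_dag_v2 runs an N-best (beam) search over a token lattice, with `sequences` indexed by DAG position so that competing segmentations are compared at the states they share; the older tag_token_dag keeps a single global beam and length-normalizes scores instead (the `key=lambda x:x[1]/len(x[0])` sort). For reference, below is a minimal standalone sketch of the same search scheme. It is illustrative only: the `(start, end, form)` arc triples and the `tag_distribution` callable are invented stand-ins for MElt's Token objects and MaxEnt classifier, and the lattice is assumed connected and starting at position 0.

import math

def beam_search_dag(arcs, tag_distribution, beam_size=3):
    # arcs: list of (start, end, form) triples; positions index lattice states.
    # tag_distribution(path, form) -> list of (tag, probability) pairs.
    end_index = max(end for (start, end, form) in arcs)
    # sequences[p] holds the N best (tagged_path, log_prob) pairs ending at p
    sequences = [[] for _ in range(end_index + 1)]
    sequences[0] = [([], 0.0)]
    for i in range(1, end_index + 1):
        candidates = []
        for (start, end, form) in arcs:
            if end != i:
                continue
            # extend every surviving path that ends where this arc starts
            for path, log_pr in sequences[start]:
                for tag, pr in tag_distribution(path, form):
                    candidates.append((path + [(form, tag)],
                                       log_pr + math.log(pr)))
        candidates.sort(key=lambda c: c[1])
        sequences[i] = candidates[-beam_size:]   # keep the N best
    return sequences[end_index][-1][0]           # best path to the final state

# Example: a uniform two-tag model over a lattice where "ab" is one token or two.
uniform = lambda path, form: [("X", 0.5), ("Y", 0.5)]
arcs = [(0, 1, "a"), (1, 2, "b"), (0, 2, "ab")]
print(beam_search_dag(arcs, uniform))   # -> [('ab', 'Y')]: the single-arc path wins

Under a uniform model the shorter (single-arc) segmentation accumulates fewer log-probability terms and wins, which is exactly the length bias that the old tag_token_dag's per-token normalization tries to compensate for.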
@@ -427,7 +497,7 @@ class POSTagger:
         for i,wd in enumerate(dag):
             token = Token( string=wd, index=i )
             tokens.append( token )
-        tagged_tokens = self.tag_token_dag( tokens,
+        tagged_tokens = self.tag_token_dag_v2( tokens,
                                             feat_options=feat_options,
                                             beam_size=beam_size )
         if (print_probas):
@@ -557,7 +627,6 @@ class DAGReader(CorpusReader):
 
 
 class Token:
-
     def __init__(self, string=None, wasCap=0, pos=None, label=None, proba=None, comment=None, label_pr_distrib=[],index=None,position=None):
         if type(string) is tuple and isinstance(string[2],sxp.Token) : #DAG
             self.string = string[2].forme
@@ -1164,6 +1233,33 @@ def suivants_in_dag(tokens,i):
             suivants.append(tok)
     return suivants
 
+def after_position_in_dag(tokens,i):
+    if i is None :
+        fin = 0
+    else :
+        fin = i
+    suivants = []
+    for tok in tokens:
+        if tok.position[0] == fin :
+            suivants.append(tok)
+    return suivants
+
+def next_two_in_dag(tokens,i):
+    suivs1 = after_position_in_dag(tokens,i)
+    result = []
+    for suiv in suivs1:
+        suivs2 = after_position_in_dag(tokens, suiv.position[1])
+        if len(suivs2) == 0:
+            result.append([suiv])
+        else:
+            for s2 in suivs2:
+                result.append([suiv, s2])
+    if result == []:
+        result = [None]
+    return result
+
+
+
 def where_is_exec(program):
     u''' returns the path of an executable if it is found in the PATH'''
     import os
-- 
GitLab
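Note on the new helpers: after_position_in_dag returns the tokens starting at a given lattice position, and next_two_in_dag enumerates the right contexts of up to two tokens reachable from it; tag_token_dag_v2 splices such a context after each candidate token to build the pseudo-linear sequence the window-2 feature extractor expects (hence the win == 2 assertion). A small illustration, assuming the two helpers above are in scope; the Tok class is a minimal stand-in for MElt's Token, invented for the example and carrying only a (start, end) span and a surface form.

class Tok(object):
    def __init__(self, start, end, form):
        self.position = (start, end)
        self.form = form
    def __repr__(self):
        return "%s%s" % (self.form, self.position)

# A lattice with a segmentation ambiguity: "ab" as one token or two.
lattice = [Tok(0, 1, "a"), Tok(1, 2, "b"), Tok(0, 2, "ab"), Tok(2, 3, "c")]

print(after_position_in_dag(lattice, 2))  # -> [c(2, 3)]
print(next_two_in_dag(lattice, 0))        # -> [[a(0, 1), b(1, 2)], [ab(0, 2), c(2, 3)]]
print(next_two_in_dag(lattice, 3))        # -> [None] (no outgoing arcs)

The [None] sentinel for dead-end positions is what the `if right_context is not None` guard in tag_token_dag_v2 handles: the last token of a sentence still gets classified, just with an empty right context.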