From 870432ec89fd16aa37ec0930dcf83d0afc6b1e84 Mon Sep 17 00:00:00 2001
From: Pierre Magistry <pierre.magistry@alpage>
Date: Tue, 14 Oct 2014 13:04:10 +0000
Subject: [PATCH] forgot to change the .in file

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/branches/pierre@5504 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
---
 bin/MElt_tagger.py.in | 106 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 5 deletions(-)

diff --git a/bin/MElt_tagger.py.in b/bin/MElt_tagger.py.in
index 3d082d6..dc5692c 100644
--- a/bin/MElt_tagger.py.in
+++ b/bin/MElt_tagger.py.in
@@ -215,7 +215,7 @@ class POSTagger:
              data_dump_file.close()
         os.write(1, "%s" %"\b"*len(str(inst_ct))+"filtering")
         features_filter_exec_path = where_is_exec("MElt_features_filtering.pl")
-        os.system("@bindir@/MElt_features_filtering.pl \"%s\" %d" %(data_file_name,feat_options.get('ffthrsld',2)));
+        os.system("/home/pierre/local/bin/MElt_features_filtering.pl \"%s\" %d" %(data_file_name,feat_options.get('ffthrsld',2)));
         os.write(1,'\n')
         return data_file_name
     
@@ -255,6 +255,7 @@ class POSTagger:
                                  cache=self.cache )
                 inst.fv = cached_inst.fv[:]
                 inst.get_sequential_features()
+                #print "features (lineaire)", tokens[i], sorted(inst.fv)
                 label_pr_distrib = self.classifier.class_distribution(inst.fv)
                 # extend sequence j with current token
                 for (cl,pr) in label_pr_distrib:
@@ -277,7 +278,74 @@ class POSTagger:
         # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
         return best_sequence
 
-        
+    def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=3):
+        ''' N-best breadth search for the best tag sequence for each sentence given as a DAG'''
+        # DAG mode is only allowed with a window of length 2
+        assert(feat_options.get('win',2) == 2)
+        # maintain N-best sequences of tagged tokens
+        end_index = max([tok.position[1] for tok in tokens])
+        sequences = np.zeros((end_index+1,),dtype=object)
+        sequences[0] = [([],0.0)]  # log prob.
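+        # sequences[k] holds the N-best (tagged token list, log probability) pairs
+        # for partial paths ending at DAG position k (assumes numpy is imported as np)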
+        for i in range(1, end_index+1):
+            n_best_sequences = []
+            #get all tokens that reach position i
+            reaching_tokens = [tok for tok in tokens if tok.position[1] == i]
+            for token in reaching_tokens:
+                # get sequences that end at the beginning of the token
+                previous_paths = sequences[token.position[0]]
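+                # enumerate the possible right contexts (one or two following tokens) used for feature extraction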
+                following_paths = next_two_in_dag(tokens,token.position[1])
+                for left_context, log_pr_j in previous_paths:
+                    idx = len(left_context)
+                    for right_context in following_paths:
+                        # rebuild the linearized path for each right context so
+                        # that right contexts do not accumulate across iterations
+                        linear_tokens = left_context[:] # copy to a new list
+                        linear_tokens.append(token)
+                        if right_context is not None:
+                            linear_tokens.extend(right_context)
+                        cached_inst = Instance( label=token.label,
+                                    index=idx, tokens=linear_tokens,
+                                    feat_selection=feat_options,
+                                    lex_dict=self.lex_dict,
+                                    tag_dict=self.tag_dict,
+                                    cache=self.cache )
+                        cached_inst.get_static_features()
+                        # get possible tags: union of tags found in tag_dict and
+                        # lex_dict
+                        wd = token.string
+                        wasCap = token.wasCap
+                        legit_tags1 = self.tag_dict.get(wd,{})
+                        legit_tags2 = {} # self.lex_dict.get(wd,{}) 
+                        # classify token
+                        inst = Instance( label=token.label,
+                                 index=idx, tokens=linear_tokens, 
+                                 feat_selection=feat_options,
+                                 lex_dict=self.lex_dict,
+                                 tag_dict=self.tag_dict,
+                                 cache=self.cache )
+                        inst.fv = cached_inst.fv[:]
+                        inst.get_sequential_features()
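+                        # get the probability distribution over tags for this token from the classifier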
+                        label_pr_distrib = self.classifier.class_distribution(inst.fv)
+                        # extend sequence j with current token
+                        for (cl,pr) in label_pr_distrib:
+                            # make sure that cl is a legal tag
+                            if legit_tags1 or legit_tags2:
+                                if (cl not in legit_tags1) and (cl not in legit_tags2):
+                                    continue
+                            labelled_token = Token(string=(token.position[0],token.position[1],token.tokobj),pos=token.pos,\
+                                           comment=token.comment,\
+                                           wasCap=wasCap,\
+                                           label=cl,proba=pr,label_pr_distrib=label_pr_distrib)
+                            n_best_sequences.append((left_context + [labelled_token], log_pr_j+math.log(pr)))
+            # sort sequences
+            n_best_sequences.sort( key=operator.itemgetter(1) )
+            #debug_n_best_sequence(n_best_sequences)
+            # keep N best
+            sequences[i] = n_best_sequences[-beam_size:]
+        # return sequence with highest prob. 
+        best_sequence = sequences[-1][-1][0]
+        # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
+        return best_sequence
+
+       
     def tag_token_dag(self, tokens, feat_options={}, beam_size=3):
         ''' N-best breath search for the best tag sequence for each sentence with segmentation ambiguities'''
         # maintain N-best sequences of tagged tokens
@@ -329,6 +397,8 @@ class POSTagger:
                     #inst.fv = cached_inst.fv[:]
                     inst.get_static_features()
                     inst.get_sequential_features()
+                    print "tokens", map(str,tokens), map(str,tokens_j)
+                    print "features (dag)", tokens[i], sorted(inst.fv)
                     label_pr_distrib = self.classifier.class_distribution(inst.fv)
 #                    import IPython
 #                    IPython.embed()
@@ -347,7 +417,7 @@ class POSTagger:
                         n_best_sequences.append((seq_j+[labelled_token],log_pr_j+math.log(pr)))
             # sort sequences
             n_best_sequences.sort( key=lambda x:x[1]/len(x[0]))# operator.itemgetter(1) )
-            debug_n_best_sequence(n_best_sequences)
+            #debug_n_best_sequence(n_best_sequences)
             # keep N best
             sequences = n_best_sequences[-beam_size:]
         # return sequence with highest prob. 
@@ -427,7 +497,7 @@ class POSTagger:
             for i,wd in enumerate(dag):
                     token = Token( string=wd, index=i )
                     tokens.append( token )
-            tagged_tokens = self.tag_token_dag( tokens,
+            tagged_tokens = self.tag_token_dag_v2( tokens,
                                                      feat_options=feat_options,
                                                      beam_size=beam_size )
             if (print_probas):
@@ -557,7 +627,6 @@ class DAGReader(CorpusReader):
 
 
 class Token:
-
     def __init__(self, string=None, wasCap=0, pos=None, label=None, proba=None, comment=None, label_pr_distrib=[],index=None,position=None):
         if type(string) is tuple and isinstance(string[2],sxp.Token) : #DAG
             self.string = string[2].forme
@@ -1164,6 +1233,33 @@ def suivants_in_dag(tokens,i):
            suivants.append(tok)
     return suivants
 
+def after_position_in_dag(tokens,i):
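+    '''return all tokens whose span starts at DAG position i (position 0 when i is None)'''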
+    if i is None :
+        fin = 0
+    else :
+        fin = i 
+    suivants = []
+    for tok in tokens:
+        if tok.position[0] == fin :
+           suivants.append(tok)
+    return suivants
+
+def next_two_in_dag(tokens,i):
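+    '''return the possible right contexts after DAG position i: lists of one or two following tokens, or [None] if there is none'''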
+    suivs1 = after_position_in_dag(tokens,i)
+    result = []
+    for suiv in suivs1:
+        suivs2 = after_position_in_dag(tokens, suiv.position[1])
+        if len(suivs2) == 0:
+            result.append([suiv])
+        else:
+            for s2 in suivs2:
+                result.append([suiv, s2])
+    if result == []:
+        result = [None]
+    return result
+
+
+
 def where_is_exec(program):
     u''' retourne le chemin d'un executable si il est trouvé dans le PATH'''
     import os
-- 
GitLab