diff --git a/bin/MElt_tagger.py.in b/bin/MElt_tagger.py.in index dc5692ceeb26ee144c8ea61dcd57696afde98e04..8c9c406552efa615ad0f084864f37a4aca23e34b 100644 --- a/bin/MElt_tagger.py.in +++ b/bin/MElt_tagger.py.in @@ -93,7 +93,6 @@ outfile = codecs.getwriter(options.encoding)(sys.stdout) if options.output_file: outfile = codecs.open( options.output_file, "w", options.encoding ) if options.ZH: - import sxparser as sxp from DagInstance import DagInstance @@ -278,7 +277,7 @@ class POSTagger: # print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence] return best_sequence - def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=3): + def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=5): ''' N-best breath search for the best tag sequence for each sentence''' #Dag mode only allowed with window of length 2 assert(feat_options.get('win',2) == 2) @@ -323,10 +322,12 @@ class POSTagger: cache=self.cache ) inst.fv = cached_inst.fv[:] inst.get_sequential_features() + # print "debug",token.string.encode("utf8")," ".join(sorted(inst.fv)).encode("utf8") label_pr_distrib = self.classifier.class_distribution(inst.fv) # extend sequence j with current token for (cl,pr) in label_pr_distrib: # make sure that cl is a legal tag + #print cl, pr, token.string.encode("utf8") if legit_tags1 or legit_tags2: if (cl not in legit_tags1) and (cl not in legit_tags2): continue @@ -499,7 +500,7 @@ class POSTagger: tokens.append( token ) tagged_tokens = self.tag_token_dag_v2( tokens, feat_options=feat_options, - beam_size=beam_size ) + beam_size=2*beam_size ) if (print_probas): tagged_sent = " ".join( [tok.__pstr__() for tok in tagged_tokens] ) else: @@ -598,9 +599,26 @@ class WeightedReader(CorpusReader): return (w,tokens) ###########################DAGReader####################### -from sxparser import sxyacc +class DagToken(): + def __init__(self,form,pos="UNK",sem=None,com=None): + self.forme = form + self.semantique = sem + self.commentaire = com + self.pos = pos + def to_string(self): + acc = u"" + if self.commentaire: + acc += "{%s} " % (self.commentaire,) + acc += "%s__%s " % (self.forme,self.pos) + if self.semantique: + acc += "[|%s|] " % (self.semantique,) + return acc + class DAGReader(CorpusReader): + """ + reads a corpus written as a sequence of udags + """ def __init__(self,infile, encoding='utf8'): if type(infile) == str or type(infile) == unicode : self.stream = codecs.open(infile,'r',encoding) @@ -608,19 +626,31 @@ class DAGReader(CorpusReader): else : self.allowSeek = False #(peut être stdin) self.stream = infile + self.re_token = rex = re.compile(r"(?P<com>\{[^\}]+})? (?P<tok>[^ ]+) (?P<sem>\[\|[^\|]+\|\])?") return def __iter__(self): return self def next(self): - line = self.stream.readline() - if (line == ''): - if self.allowSeek: - self.stream.seek(0) + dag = [] + line = self.stream.readline().strip() + if(line != "##DAG BEGIN"): raise StopIteration - dag = sxyacc.parse(line.strip()) - l = sxp.fsa_of_dag(dag) - return l + line = self.stream.readline() + while line != "##DAG END": + (source, token_desc, target) = line.split("\t") + match = self.re_token.match(token_desc) + if match is None: + print "problem with", line + raise StopIteration + token = DagToken(match.group('tok'),com=match.group('com'),sem=match.group('sem')) + dag.append((int(source) - 1, int(target) - 1, token)) + line = self.stream.readline().strip() + if (line == ''): + if self.allowSeek: + self.stream.seek(0) + raise StopIteration + return dag ############################ my_token.py ############################ @@ -628,7 +658,7 @@ class DAGReader(CorpusReader): class Token: def __init__(self, string=None, wasCap=0, pos=None, label=None, proba=None, comment=None, label_pr_distrib=[],index=None,position=None): - if type(string) is tuple and isinstance(string[2],sxp.Token) : #DAG + if type(string) is tuple and isinstance(string[2],DagToken) : #DAG self.string = string[2].forme self.position = tuple(string[0:2]) self.tokobj = string[2] @@ -663,7 +693,8 @@ class Token: r += "%s__%s" %(self.string,self.label) if self.tokobj.semantique != "": r += "[|%s|] " %(self.tokobj.semantique,) - return r + #return r + return "%s%s/%s" %(self.comment, self.string, self.label) if (self.wasCap): return "%s%s/%s" %(self.comment,self.string.upper(),self.label) else: @@ -991,20 +1022,22 @@ class Instance: # word string-based features if word in self.cache: # if wd has been seen, use cache - self.add_cached_features(self.cache[word]) + pass + #self.add_cached_features(self.cache[word]) else: + pass # word string - self.add('wd',word) + # NOTE: disabled for PACTE: self.add('wd',word) # suffix/prefix - wd_ln = len(word) - if pln > 0: - for i in range(1,pln+1): - if wd_ln >= i: - self.add('pref%i' %i, word[:i]) - if sln > 0: - for i in range(1,sln+1): - if wd_ln >= i: - self.add('suff%i' %i, word[-i:], val) + #wd_ln = len(word) + #if pln > 0: + # for i in range(1,pln+1): + # if wd_ln >= i: + # self.add('pref%i' %i, word[:i]) + #if sln > 0: + # for i in range(1,sln+1): + # if wd_ln >= i: + # self.add('suff%i' %i, word[-i:], val) # regex-based features self.add( 'nb', number.search(word) != None ) self.add( 'hyph', hyphen.search(word) != None ) diff --git a/pkgpythonlib/sxlexer.py b/pkgpythonlib/sxlexer.py index 3545a923c4d21070eb830f9d04f56c6005758be6..319ba1923aa7224d1690d0620b8b698319ab3de8 100644 --- a/pkgpythonlib/sxlexer.py +++ b/pkgpythonlib/sxlexer.py @@ -6,7 +6,8 @@ tokens = [ 'WORD','SEM_G', 'SEM_D'] t_ignore = ' ' t_SEM_G = r'\[\|' t_SEM_D = r'\|\]' -t_WORD = r'[^_{}[\]\|\(\) ]+' +t_WORD = r'[^{}[\]\|\(\) ]+' +#t_WORD = r'[^_{}[\]\|\(\) ]+' def t_error(t): t.type = t.value[0] @@ -14,7 +15,8 @@ def t_error(t): t.lexer.skip(1) return t -literals = '_{}|()' +literals = '{}|()' +#literals = '_{}|()' lex.lex() diff --git a/pkgpythonlib/sxparser.py b/pkgpythonlib/sxparser.py index b7bc8bb10dbeb993c846d2d27417374e2b1a9818..6acd3ff3fa2ed337d563fd314dc34cf29b060e04 100644 --- a/pkgpythonlib/sxparser.py +++ b/pkgpythonlib/sxparser.py @@ -128,9 +128,13 @@ def p_no_pos(p): p[0] = ('Forme',{"Forme":p[1], "POS":""}) def p_special_char(p): - '''special_char : '_' WORD - | '_' WORD '_' WORD''' - p[0] = ('Forme',{"Forme": "".join(p[1:]), "POS":"escaped"}) + '''special_char : special_char '_' WORD + | '_' WORD''' + if len(p)>3: + p[1][1]["Forme"] += "".join(p[2:]) + p[0] = ('Forme',p[1][1]) + else: + p[0] = ('Forme',{"Forme": "".join(p[1:]), "POS":"escaped"}) def p_wordlist(p): '''wordlist : wordlist WORD