Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 8545fb0a authored by Pierre Magistry's avatar Pierre Magistry
Browse files

MElt -Z prend maintenant des udag en entrée

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/MElt/branches/pierre@5546 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent 870432ec
Branches pierre
No related tags found
No related merge requests found
......@@ -93,7 +93,6 @@ outfile = codecs.getwriter(options.encoding)(sys.stdout)
if options.output_file:
outfile = codecs.open( options.output_file, "w", options.encoding )
if options.ZH:
import sxparser as sxp
from DagInstance import DagInstance
......@@ -278,7 +277,7 @@ class POSTagger:
# print >> sys.stderr, "Best tok seq:", [(t.string,t.label) for t in best_sequence]
return best_sequence
def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=3):
def tag_token_dag_v2(self, tokens, feat_options={}, beam_size=5):
''' N-best breath search for the best tag sequence for each sentence'''
#Dag mode only allowed with window of length 2
assert(feat_options.get('win',2) == 2)
......@@ -323,10 +322,12 @@ class POSTagger:
cache=self.cache )
inst.fv = cached_inst.fv[:]
inst.get_sequential_features()
# print "debug",token.string.encode("utf8")," ".join(sorted(inst.fv)).encode("utf8")
label_pr_distrib = self.classifier.class_distribution(inst.fv)
# extend sequence j with current token
for (cl,pr) in label_pr_distrib:
# make sure that cl is a legal tag
#print cl, pr, token.string.encode("utf8")
if legit_tags1 or legit_tags2:
if (cl not in legit_tags1) and (cl not in legit_tags2):
continue
......@@ -499,7 +500,7 @@ class POSTagger:
tokens.append( token )
tagged_tokens = self.tag_token_dag_v2( tokens,
feat_options=feat_options,
beam_size=beam_size )
beam_size=2*beam_size )
if (print_probas):
tagged_sent = " ".join( [tok.__pstr__() for tok in tagged_tokens] )
else:
......@@ -598,9 +599,26 @@ class WeightedReader(CorpusReader):
return (w,tokens)
###########################DAGReader#######################
from sxparser import sxyacc
class DagToken:
    """A single token carried on a UDAG edge.

    Holds the surface form plus optional POS tag, semantic annotation
    and comment, mirroring the ``{comment} form__POS [|sem|]`` notation
    used in the DAG corpus format.
    """

    def __init__(self, form, pos="UNK", sem=None, com=None):
        # Attribute names are French (forme/semantique/commentaire) and are
        # part of the public interface (read by Token and the readers).
        self.forme = form
        self.semantique = sem
        self.commentaire = com
        self.pos = pos

    def to_string(self):
        """Serialize back to the corpus notation, trailing space included.

        Comment and semantic parts are emitted only when truthy (a present,
        non-empty value); the ``form__POS`` core is always present.
        """
        parts = []
        if self.commentaire:
            parts.append("{%s} " % (self.commentaire,))
        parts.append("%s__%s " % (self.forme, self.pos))
        if self.semantique:
            parts.append("[|%s|] " % (self.semantique,))
        return u"".join(parts)
class DAGReader(CorpusReader):
"""
reads a corpus written as a sequence of udags
"""
def __init__(self,infile, encoding='utf8'):
if type(infile) == str or type(infile) == unicode :
self.stream = codecs.open(infile,'r',encoding)
......@@ -608,19 +626,31 @@ class DAGReader(CorpusReader):
else :
self.allowSeek = False #(peut être stdin)
self.stream = infile
self.re_token = rex = re.compile(r"(?P<com>\{[^\}]+})? (?P<tok>[^ ]+) (?P<sem>\[\|[^\|]+\|\])?")
return
def __iter__(self):
return self
def next(self):
line = self.stream.readline()
if (line == ''):
if self.allowSeek:
self.stream.seek(0)
dag = []
line = self.stream.readline().strip()
if(line != "##DAG BEGIN"):
raise StopIteration
dag = sxyacc.parse(line.strip())
l = sxp.fsa_of_dag(dag)
return l
line = self.stream.readline()
while line != "##DAG END":
(source, token_desc, target) = line.split("\t")
match = self.re_token.match(token_desc)
if match is None:
print "problem with", line
raise StopIteration
token = DagToken(match.group('tok'),com=match.group('com'),sem=match.group('sem'))
dag.append((int(source) - 1, int(target) - 1, token))
line = self.stream.readline().strip()
if (line == ''):
if self.allowSeek:
self.stream.seek(0)
raise StopIteration
return dag
############################ my_token.py ############################
......@@ -628,7 +658,7 @@ class DAGReader(CorpusReader):
class Token:
def __init__(self, string=None, wasCap=0, pos=None, label=None, proba=None, comment=None, label_pr_distrib=[],index=None,position=None):
if type(string) is tuple and isinstance(string[2],sxp.Token) : #DAG
if type(string) is tuple and isinstance(string[2],DagToken) : #DAG
self.string = string[2].forme
self.position = tuple(string[0:2])
self.tokobj = string[2]
......@@ -663,7 +693,8 @@ class Token:
r += "%s__%s" %(self.string,self.label)
if self.tokobj.semantique != "":
r += "[|%s|] " %(self.tokobj.semantique,)
return r
#return r
return "%s%s/%s" %(self.comment, self.string, self.label)
if (self.wasCap):
return "%s%s/%s" %(self.comment,self.string.upper(),self.label)
else:
......@@ -991,20 +1022,22 @@ class Instance:
# word string-based features
if word in self.cache:
# if wd has been seen, use cache
self.add_cached_features(self.cache[word])
pass
#self.add_cached_features(self.cache[word])
else:
pass
# word string
self.add('wd',word)
# NOTE: disabled for PACTE: self.add('wd',word)
# suffix/prefix
wd_ln = len(word)
if pln > 0:
for i in range(1,pln+1):
if wd_ln >= i:
self.add('pref%i' %i, word[:i])
if sln > 0:
for i in range(1,sln+1):
if wd_ln >= i:
self.add('suff%i' %i, word[-i:], val)
#wd_ln = len(word)
#if pln > 0:
# for i in range(1,pln+1):
# if wd_ln >= i:
# self.add('pref%i' %i, word[:i])
#if sln > 0:
# for i in range(1,sln+1):
# if wd_ln >= i:
# self.add('suff%i' %i, word[-i:], val)
# regex-based features
self.add( 'nb', number.search(word) != None )
self.add( 'hyph', hyphen.search(word) != None )
......
......@@ -6,7 +6,8 @@ tokens = [ 'WORD','SEM_G', 'SEM_D']
t_ignore = ' '
t_SEM_G = r'\[\|'
t_SEM_D = r'\|\]'
t_WORD = r'[^_{}[\]\|\(\) ]+'
t_WORD = r'[^{}[\]\|\(\) ]+'
#t_WORD = r'[^_{}[\]\|\(\) ]+'
def t_error(t):
t.type = t.value[0]
......@@ -14,7 +15,8 @@ def t_error(t):
t.lexer.skip(1)
return t
literals = '_{}|()'
literals = '{}|()'
#literals = '_{}|()'
lex.lex()
......
......@@ -128,9 +128,13 @@ def p_no_pos(p):
p[0] = ('Forme',{"Forme":p[1], "POS":""})
def p_special_char(p):
'''special_char : '_' WORD
| '_' WORD '_' WORD'''
p[0] = ('Forme',{"Forme": "".join(p[1:]), "POS":"escaped"})
'''special_char : special_char '_' WORD
| '_' WORD'''
if len(p)>3:
p[1][1]["Forme"] += "".join(p[2:])
p[0] = ('Forme',p[1][1])
else:
p[0] = ('Forme',{"Forme": "".join(p[1:]), "POS":"escaped"})
def p_wordlist(p):
'''wordlist : wordlist WORD
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment