Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 7adc4947 authored by Pascal Denis
Browse files

cleaning up crap left off

git-svn-id: https://scm.gforge.inria.fr/authscm/cfourrie/svn/lingwb/metagger/trunk@2690 dc05b511-7f1d-0410-9f1c-d6f32a2df9e4
parent 40af4fce
No related branches found
No related tags found
No related merge requests found
......@@ -18,7 +18,15 @@ CLEANFILES = __init__.py \
megam_classifier.py \
result_sink.py \
utils.py \
corpus_reader.py
corpus_reader.py \
__init__.pyc \
pos_tagger.pyc \
instance.pyc \
mytoken.pyc \
megam_classifier.pyc \
result_sink.pyc \
utils.pyc \
corpus_reader.pyc
BUILT_SOURCES = __init__.py \
pos_tagger.py \
......
""" Corpus reader """
import sys
import codecs
import re
# Splits one Brown-style ``word/TAG`` item into (word, tag).
# The split happens on the LAST slash: the word group is greedy and the tag
# group may not contain a slash, so a word that itself contains slashes
# (e.g. ``1/2/NC`` or ``km/h/NC``) keeps them instead of leaking into the tag.
WD_TAG_RE = re.compile(r'^(.+)/([^/]+)$')
class CorpusReader:
    """Common base class for corpus readers; defines no behaviour itself."""
class BrownReader(CorpusReader):
    """Iterate over the sentences of a corpus stored in the Brown format.

    Each non-blank line of the file is one sentence written as
    whitespace-separated ``word/TAG`` items, for example::

        Le/DET prix/NC de/P vente/NC du/P+D journal/NC ./PONCT

    Iterating over the reader yields, for every sentence, a list of
    ``(word, tag)`` tuples. When the end of the file is reached the
    underlying stream is rewound, so the reader can be iterated again.
    """

    def __init__(self, infile, encoding='latin-1'):
        """Open *infile* for reading, decoding its content with *encoding*."""
        self.stream = codecs.open(infile, 'r', encoding)

    def __iter__(self):
        """Return the reader itself; it is its own iterator."""
        return self

    def next(self):
        """Return the next sentence as a list of ``(word, tag)`` tuples.

        Blank lines are skipped rather than treated as the end of the corpus.

        Raises:
            StopIteration: at end of file (the stream is rewound first).
            ValueError: if an item on the line is not of the form ``word/TAG``.
        """
        line = ''
        while not line:
            raw = self.stream.readline()
            if not raw:
                # Only a completely empty read means end of file; a blank
                # line still carries its newline. Rewind so that the reader
                # can be iterated over again.
                self.stream.seek(0)
                raise StopIteration
            line = raw.strip()
        token_list = []
        for item in line.split():
            match = WD_TAG_RE.match(item)
            if match is None:
                raise ValueError(
                    'malformed token %r (expected word/TAG)' % (item,))
            token_list.append(match.groups())
        return token_list

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def close(self):
        """Close the underlying file stream."""
        self.stream.close()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment