Add a Brown-format corpus reader to metagger and register utils.py and
corpus_reader.py in the Makefile.

Review notes on this revision of the patch:
  * BrownReader no longer stops (and rewinds) at the first blank line of
    the corpus; blank lines are skipped and only a real end of file ends
    the iteration.
  * A token that is not of the form word/TAG raises ValueError naming the
    token instead of failing with AttributeError on None.
  * WD_TAG_RE splits a token on its last slash, so words that contain a
    slash keep it ("et/ou/CC" gives "et/ou" and "CC").
  * __next__ is defined (with next kept as an alias) so the reader works
    with both the Python 2 and the Python 3 iterator protocol.
  * close() is added so callers can release the file handle.
  * The docstring sample is repaired (accented characters, wrapped line)
    and a source-encoding line is added because of the accented text.
  * The index lines of the two new files are dropped: their blob hash no
    longer matches the new content.
  * NOTE(review): the leading whitespace of the Makefile.am lines was lost
    when this patch was flattened; tabs are assumed - confirm against the
    real file before applying.

diff --git a/src/metagger/Makefile.am b/src/metagger/Makefile.am
index 3f796738f353ab3de31374829b7290da47e4a5e9..7963196eb92e5a0e79f049d907e1df45fbae7805 100644
--- a/src/metagger/Makefile.am
+++ b/src/metagger/Makefile.am
@@ -7,21 +7,27 @@ pkgpython_PYTHON = __init__.py \
 	instance.py \
 	mytoken.py \
 	megam_classifier.py \
-	result_sink.py
+	result_sink.py \
+	utils.py \
+	corpus_reader.py
 
 CLEANFILES = __init__.py \
 	pos_tagger.py \
 	instance.py \
 	mytoken.py \
 	megam_classifier.py \
-	result_sink.py
+	result_sink.py \
+	utils.py \
+	corpus_reader.py
 
 BUILT_SOURCES = __init__.py \
 	pos_tagger.py \
 	instance.py \
 	mytoken.py \
 	megam_classifier.py \
-	result_sink.py
+	result_sink.py \
+	utils.py \
+	corpus_reader.py
 
 EDIT = perl -pe "s|\@alexinadir\@|$(alexinadir)|g; \
 	s|\@datadir\@|\$(pkgdatadir)|g; \
diff --git a/src/metagger/corpus_reader.py b/src/metagger/corpus_reader.py
new file mode 100644
--- /dev/null
+++ b/src/metagger/corpus_reader.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+""" Corpus reader """
+
+import sys
+import codecs
+import re
+
+# A token is written word/TAG. The word part is greedy and the tag cannot
+# contain a slash, so a token is split on its LAST slash: "et/ou/CC" gives
+# ("et/ou", "CC").
+WD_TAG_RE = re.compile(r'^(.+)/([^/]+)$')
+
+
+class CorpusReader(object):
+    """ Base class for corpus readers. """
+    pass
+
+
+class BrownReader(CorpusReader):
+
+    """
+    Data reader for corpus in the Brown format: one sentence per line,
+    tokens separated by whitespace, each token written as word/TAG.
+
+    Le/DET prix/NC de/P vente/NC du/P+D journal/NC n'/ADV a/V pas/ADV été/VPP divulgué/VPP ./PONCT
+    Le/DET cabinet/NC Arthur/NPP Andersen/NPP ,/PONCT administrateur/NC judiciaire/ADJ du/P+D titre/NC depuis/P l'/DET effondrement/NC de/P l'/DET empire/NC Maxwell/NPP en/P décembre/NC 1991/NC ,/PONCT a/V indiqué/VPP que/CS les/DET nouveaux/ADJ propriétaires/NC allaient/V poursuivre/VINF la/DET publication/NC ./PONCT
+
+    Iterating over a reader yields, for each non-blank line, a list of
+    (word, tag) tuples. When the end of the file is reached the stream is
+    rewound, so the same reader can be iterated again.
+    """
+
+    def __init__(self,infile, encoding='latin-1'):
+        """ Open infile for reading, decoding it with encoding. """
+        self.stream = codecs.open(infile, 'r', encoding)
+        return
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Return the next sentence as a list of (word, tag) tuples.
+
+        Blank lines are skipped. Raises StopIteration at the end of the
+        file and ValueError on a token that is not of the form word/TAG.
+        """
+        while True:
+            raw_line = self.stream.readline()
+            if raw_line == '':
+                # readline() returns '' only at end of file (a blank line
+                # still carries its newline): rewind for the next pass.
+                self.stream.seek(0)
+                raise StopIteration
+            line = raw_line.strip()
+            if line:
+                break
+        token_list = []
+        for item in line.split():
+            match = WD_TAG_RE.match(item)
+            if match is None:
+                raise ValueError("malformed token %r in line %r"
+                                 % (item, line))
+            token_list.append(match.groups())
+        return token_list
+
+    # Python 2 iterator protocol looks up next() instead of __next__().
+    next = __next__
+
+    def close(self):
+        """ Close the underlying stream. """
+        self.stream.close()
diff --git a/src/metagger/corpus_reader.py.in b/src/metagger/corpus_reader.py.in
new file mode 100644
--- /dev/null
+++ b/src/metagger/corpus_reader.py.in
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+""" Corpus reader """
+
+import sys
+import codecs
+import re
+
+# A token is written word/TAG. The word part is greedy and the tag cannot
+# contain a slash, so a token is split on its LAST slash: "et/ou/CC" gives
+# ("et/ou", "CC").
+WD_TAG_RE = re.compile(r'^(.+)/([^/]+)$')
+
+
+class CorpusReader(object):
+    """ Base class for corpus readers. """
+    pass
+
+
+class BrownReader(CorpusReader):
+
+    """
+    Data reader for corpus in the Brown format: one sentence per line,
+    tokens separated by whitespace, each token written as word/TAG.
+
+    Le/DET prix/NC de/P vente/NC du/P+D journal/NC n'/ADV a/V pas/ADV été/VPP divulgué/VPP ./PONCT
+    Le/DET cabinet/NC Arthur/NPP Andersen/NPP ,/PONCT administrateur/NC judiciaire/ADJ du/P+D titre/NC depuis/P l'/DET effondrement/NC de/P l'/DET empire/NC Maxwell/NPP en/P décembre/NC 1991/NC ,/PONCT a/V indiqué/VPP que/CS les/DET nouveaux/ADJ propriétaires/NC allaient/V poursuivre/VINF la/DET publication/NC ./PONCT
+
+    Iterating over a reader yields, for each non-blank line, a list of
+    (word, tag) tuples. When the end of the file is reached the stream is
+    rewound, so the same reader can be iterated again.
+    """
+
+    def __init__(self,infile, encoding='latin-1'):
+        """ Open infile for reading, decoding it with encoding. """
+        self.stream = codecs.open(infile, 'r', encoding)
+        return
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Return the next sentence as a list of (word, tag) tuples.
+
+        Blank lines are skipped. Raises StopIteration at the end of the
+        file and ValueError on a token that is not of the form word/TAG.
+        """
+        while True:
+            raw_line = self.stream.readline()
+            if raw_line == '':
+                # readline() returns '' only at end of file (a blank line
+                # still carries its newline): rewind for the next pass.
+                self.stream.seek(0)
+                raise StopIteration
+            line = raw_line.strip()
+            if line:
+                break
+        token_list = []
+        for item in line.split():
+            match = WD_TAG_RE.match(item)
+            if match is None:
+                raise ValueError("malformed token %r in line %r"
+                                 % (item, line))
+            token_list.append(match.groups())
+        return token_list
+
+    # Python 2 iterator protocol looks up next() instead of __next__().
+    next = __next__
+
+    def close(self):
+        """ Close the underlying stream. """
+        self.stream.close()