Commit 724e2a9f authored by Gérard Huet's avatar Gérard Huet

Zen library now external

parent 4d47afd4
# Inter-module build dependencies. Each rule names one compiled unit and the
# compiled units it must be rebuilt after; there are no recipes in this list.
# Most modules appear as a .cmo/.cmx pair, and modules with a separate
# interface also get a .cmi rule.
# NOTE(review): presumably OCaml bytecode (.cmo), native (.cmx) and interface
# (.cmi) objects as emitted by a dependency generator such as ocamldep --
# confirm, and regenerate rather than hand-edit if so.
pidgin.cmo :
pidgin.cmx :
gen.cmo :
gen.cmx :
list2.cmo :
list2.cmx :
ascii.cmo : list2.cmo
ascii.cmx : list2.cmx
latin.cmo : transducer.cmo list2.cmo
latin.cmx : transducer.cmx list2.cmx
word.cmo : list2.cmo
word.cmx : list2.cmx
zen_lexer.cmo :
zen_lexer.cmx :
transducer.cmo : zen_lexer.cmo
transducer.cmx : zen_lexer.cmx
zipper.cmo : word.cmo list2.cmo
zipper.cmx : word.cmx list2.cmx
bintree.cmo :
bintree.cmx :
share.cmi :
share.cmo : share.cmi
share.cmx : share.cmi
sharemod.cmi :
sharemod.cmo : sharemod.cmi
sharemod.cmx : sharemod.cmi
trie.cmo : word.cmo list2.cmo gen.cmo
trie.cmx : word.cmx list2.cmx gen.cmx
mini.cmo : trie.cmo share.cmi gen.cmo
mini.cmx : trie.cmx share.cmx gen.cmx
deco.cmo : word.cmo trie.cmo list2.cmo
deco.cmx : word.cmx trie.cmx list2.cmx
lexmap.cmo : word.cmo list2.cmo deco.cmo
lexmap.cmx : word.cmx list2.cmx deco.cmx
minimap.cmi : lexmap.cmo
minimap.cmo : share.cmi lexmap.cmo deco.cmo minimap.cmi
minimap.cmx : share.cmx lexmap.cmx deco.cmx minimap.cmi
tertree.cmo : trie.cmo gen.cmo
tertree.cmx : trie.cmx gen.cmx
minitertree.cmo : tertree.cmo share.cmi gen.cmo
minitertree.cmx : tertree.cmx share.cmx gen.cmx
lexicon.cmo : trie.cmo ascii.cmo
lexicon.cmx : trie.cmx ascii.cmx
make_lex.cmo : word.cmo trie.cmo
make_lex.cmx : word.cmx trie.cmx
unglue.cmo : word.cmo trie.cmo ascii.cmo
unglue.cmx : word.cmx trie.cmx ascii.cmx
aum0.cmi : word.cmo
react0.cmo : word.cmo aum0.cmi
react0.cmx : word.cmx aum0.cmi
aume.cmi : word.cmo
reacte.cmo : word.cmo list2.cmo aume.cmi
reacte.cmx : word.cmx list2.cmx aume.cmi
aumt.cmi : word.cmo
reactt.cmo : word.cmo list2.cmo aumt.cmi
reactt.cmx : word.cmx list2.cmx aumt.cmi
regular.cmo :
regular.cmx :
unshare.cmo : trie.cmo
unshare.cmx : trie.cmx
terdagify.cmo : tertree.cmo minitertree.cmo
terdagify.cmx : tertree.cmx minitertree.cmx
list_iso.cmo : trie.cmo latin.cmo
list_iso.cmx : trie.cmx latin.cmx
dagify.cmo : trie.cmo mini.cmo
dagify.cmx : trie.cmx mini.cmx
timeshare.cmo : trie.cmo mini.cmo gen.cmo
timeshare.cmx : trie.cmx mini.cmx gen.cmx
test.cmo : word.cmo ascii.cmo
test.cmx : word.cmx ascii.cmx
unglue_test.cmo : unglue.cmo lexicon.cmo ascii.cmo
unglue_test.cmx : unglue.cmx lexicon.cmx ascii.cmx
make_english_lexicon.cmo : make_lex.cmo ascii.cmo
make_english_lexicon.cmx : make_lex.cmx ascii.cmx
make_french_lexicon.cmo : make_lex.cmo latin.cmo
make_french_lexicon.cmx : make_lex.cmx latin.cmx
example_dispatch.cmo :
example_dispatch.cmx :
sanskrit_dispatch.cmo :
sanskrit_dispatch.cmx :
example_engine.cmo : react0.cmo example_dispatch.cmo aum0.cmi
example_engine.cmx : react0.cmx example_dispatch.cmx aum0.cmi
sanskrit_engine.cmo : sanskrit_dispatch.cmo reactt.cmo aumt.cmi
sanskrit_engine.cmx : sanskrit_dispatch.cmx reactt.cmx aumt.cmi
(* Toplevel directives: load the dynlink archive first, then the camlp4r
   archive, into the interactive session. *)
#load "dynlink.cma";;
#load "camlp4/camlp4r.cma";;
(**************************************************************************)
(* *)
(* The Zen/Aum Library *)
(* *)
(* Gérard Huet *)
(* *)
(* ©2005 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
############################################################################
# #
# The Zen Computational Linguistics Toolkit #
# #
# Gérard Huet #
# #
############################################################################
# Makefile of ZEN computational linguistics toolkit documentation #
# V2.3.2 24-12-2010 Copyright INRIA 2010 #
############################################################################
# Literate-programming front-end used to weave the sources into LaTeX/HTML.
# The -p preamble makes the generated document load the url and graphicx
# packages. The commented variant keeps the former --old-fullpage layout.
#NOWEB=ocamlweb --old-fullpage -p "\usepackage{url} \usepackage{graphicx}"
NOWEB=ocamlweb -p "\usepackage{url} \usepackage{graphicx}"

# Directory handed to hevea through -I by the html target.
OCAMLWEB=../../DOC/OCAMLWEB

# NOTE(review): TEX and DOC are not referenced by any rule visible in this
# file; kept unchanged in case something else reads them.
TEX=intro.tex gen.tex list2.tex word.tex zipper.tex bintree.tex trie.tex lexicon.tex make_lex.tex share.tex mini.tex dagify.tex tertree.tex deco.tex lexmap.tex minimap.tex english.tex french.tex sanskrit.tex unglue.tex coroutines.tex biblio.tex supplements.tex zen.tex
DOC=zen.ps zen.pdf zen.html

# Ordered list of prose fragments (.tex) and OCaml sources (../*.ml, ../*.mli)
# woven into the document. The order is the order of appearance in the output.
MAIN=../pidgin.ml gen.tex ../gen.ml list2.tex ../list2.ml word.tex ../word.ml \
     zipper.tex ../zipper.ml bintree.tex ../bintree.ml trie.tex ../trie.ml ascii.tex \
     ../ascii.ml lexicon.tex ../lexicon.ml make_lex.tex ../make_lex.ml share.tex \
     ../share.mli ../share.ml mini.tex ../mini.ml dagify.tex ../dagify.ml \
     ../make_english_lexicon.ml english.tex ../zen_lexer.ml ../transducer.ml \
     ../latin.ml french.tex ../make_french_lexicon.ml sanskrit.tex tertree.tex \
     ../tertree.ml ../minitertree.ml deco.tex ../deco.ml lexmap.tex ../lexmap.ml \
     minimap.tex ../minimap.mli ../minimap.ml unglue.tex ../unglue.ml \
     ../unglue_test.ml coroutines.tex \
     intro_aum0.tex ../aum0.mli react0.tex ../react0.ml aumt.tex ../aumt.mli \
     reactt.tex ../reactt.ml regular.tex ../regular.ml linking.tex \
     ../sanskrit_engine.ml conclusion.tex biblio.tex

# Full input list: the introduction followed by the main body.
ZEN=intro.tex $(MAIN)

# doc, tex and html name actions, not files. Declaring them phony keeps a
# stray file called "doc", "tex" or "html" from disabling the rule.
.PHONY: doc tex html

# Default goal. html is left out on purpose (see the note on that target).
doc: tex # html

# Weave every input into zen.tex, then typeset it to zen.pdf.
# pdflatex runs twice: the first pass records the table of contents and
# citation labels in zen.aux, the second pass resolves them.
tex:
	$(NOWEB) --noweb $(ZEN) -o zen.tex
	pdflatex zen
	pdflatex zen
# Former PostScript route, kept for reference:
# dvips -Pwww -f zen.dvi > zen.ps
# ps2pdf zen.ps

# HTML rendering through hevea; not part of the default build.
html: # fails because includegraphics
	$(NOWEB) --noweb --html --hevea-option "-I $(OCAMLWEB)" $(ZEN) -o zen.html
\subsection{Ascii encoding}
The $Ascii$ module defines coercions
$encode$ from strings to words
and $decode$ from words to strings.
The function {\sl react1} is a recognizer for the rational language which is the
image by the {\sl transducer} morphism of the regular expression over phases.
It stops at the first solution - when the input string is a word in this language
- otherwise it raises the exception {\sl Finished}. However, note that the
general mechanism for managing non-determinism through coroutine resumptions
allows restarting the computation to find other solutions. This mechanism will
be specially important later when our engine
is used for transductions, where we may be interested in the various solutions.
We give above an example of using {\sl continue} as a coroutine by computing
the {\sl multiplicity} function, which counts the number of ways in which
the input string may be a solution to the regular expression. We remark that
standard formal language theory deals with languages as {\sl sets} of words,
whereas here we formalize the finer notion of a {\sl stream} (i.e. a
potentially infinite list) of words recursively enumerating a {\sl multiset}
of words.
% amiable together etc -> 4 TODO
% completeness, finiteness condition : morphism such that epsilon not in languages
\section{Modular aum transducers}
So far our automata were mere deterministic recognizers for finite sets of words
(although a dose of non-determinism arises from the choice, at any accepting node,
between external transition to the next phase and continuing the local search, since
the local language may contain a word and one of its proper prefixes).
We now consider a more general framework where we handle loops in the transition
relation, non-deterministic transitions, and output.
\begin{thebibliography}{10}
\bibitem{asu} Alfred V. Aho, Ravi Sethi and Jeffrey D. Ullman.
``Compilers - Principles, Techniques and Tools.'' Addison-Wesley, 1986.
\bibitem{beeskar} Kenneth R. Beesley and Lauri Karttunen. ``Finite-State
Morphology: Xerox Tools and Techniques.'' Private communication, April 2001.
\bibitem{bentley} Jon L. Bentley and Robert Sedgewick.
``Fast Algorithms for Sorting and Searching Strings.''
Proceedings, 8th Annual ACM-SIAM Symposium on Discrete Algorithms, Jan. 1997.
\bibitem{berrysethi}
G\'erard Berry and Ravi Sethi.
From regular expressions to deterministic automata.
Theoretical Computer Science 48 (1986), pp. 117--126.
\bibitem{berstelpin}
Jean Berstel and Jean-Eric Pin. Local languages and the {Berry}-{Sethi} algorithm.
Theoretical Computer Science 155 (1996), pp. 439--446.
\bibitem{brill} Eric Brill. ``A simple rule-based part of speech tagger.''
In Proceedings, Third Conference on Applied Natural Language Processing, 1992.
Trento, Italy, 152--155.
\bibitem{burge} W. H. Burge. ``Recursive Programming Techniques.''
Addison-Wesley, 1975.
\bibitem{fp} Guy Cousineau and Michel Mauny. ``The Functional Approach to
Programming.'' Cambridge University Press, 1998.
\bibitem{daciuk} Jan Daciuk, Stoyan Mihov, Bruce W. Watson and Richard E.
Watson. ``Incremental Construction of Minimal Acyclic Finite-State Automata.''
Computational Linguistics 26,1 (2000).
\bibitem{eilenberg}
Samuel Eilenberg. Automata, Languages, and Machines, volume A.
Academic Press, 1974.
\bibitem{MLer} Matthias Felleisen and Daniel P. Friedman. ``The Little MLer''.
MIT Press, 1998.
\bibitem{flajsipste} Philippe Flajolet, Paola Sipala and Jean-Marc Steyaert.
``Analytic Variations on the Common Subexpression Problem.'' Proceedings of
17th ICALP Colloquium, Warwick (1990), LNCS 443, Springer-Verlag,
pp. 220--234.
\bibitem{ML-LCF}
M. Gordon, R. Milner, C. Wadsworth.
``A Metalanguage for Interactive Proof in LCF.''
Internal Report CSR-16-77, Department of Computer Science,
University of Edinburgh (Sept. 1977).
\bibitem{zipper} G\'erard Huet. ``The Zipper''. J. Functional Programming 7,5
(Sept. 1997), pp. 549--554.
\bibitem{dico-report}
G\'erard Huet.
``Structure of a Sanskrit dictionary.''
INRIA Technical Report, Sept. 2000.
Available as: \raggedright
\verb!http://pauillac.inria.fr/~huet/PUBLIC/Dicostruct.ps!.
\bibitem{wcre}
G\'erard Huet.
``From an informal textual lexicon to a well-structured lexical database:
An experiment in data reverse engineering.''
IEEE Working Conference on Reverse Engineering (WCRE'2001),
Stuttgart, Oct. 2001.
\bibitem{2003-Huet-3}
G\'erard Huet. Automata Mista. In ``Verification: Theory and Practice: Essays Dedicated
to {Zohar} {Manna} on the Occasion of His 64th Birthday''. Ed. Nachum Dershowitz,
Springer-Verlag LNCS vol. 2772 (2004), pp. 359--372.
\bibitem{2004-Huet-1}
G\'erard Huet. A Functional Toolkit for Morphological
and Phonological Processing, Application to a {Sanskrit} Tagger.
J. Functional Programming, 15,4 (2005), pp. 573--614.
\bibitem{2006-Huet-Razet}
G\'erard Huet and Beno{\^\i}t Razet. The Reactive Engine for Modular Transducers.
In ``Algebra, Meaning and Computation, Essays Dedicated to
Joseph A. Goguen on the Occasion of His 65th Birthday'', Eds.
Kokichi Futatsugi, Jean-Pierre Jouannaud and Jos\'e Meseguer.
Springer-Verlag LNCS vol. 4060 (2006), pp. 355--374.
\bibitem{2008-Huet-Razet}
G\'erard Huet and Beno{\^\i}t Razet. Computing with Relational Machines.
ICON'2008 tutorial. Preliminary version available at URL
\url{http://yquem.inria.fr/~huet/PUBLIC/Pune_tutorial.pdf}.
\bibitem{kk} Ronald M. Kaplan and Martin Kay. ``Regular Models of
Phonological Rule Systems.'' Computational Linguistics (20,3), 1994,
pp. 331--378.
\bibitem{karttunen1} Lauri Karttunen. ``Applications of Finite-State
Transducers in Natural Language Processing.''
In Proceedings of CIAA-2000.
\bibitem{karttunen2} Lauri Karttunen. ``The Replace Operator.''
In Proceedings of ACL'95, Cambridge, MA, 1995. Extended version
in \cite{rs2}.
\bibitem{kosk} K. Koskenniemi. ``A general computational model for word-form
recognition and production.'' In Proceedings, 10th International Conference
on Computational Linguistics, Stanford (1984).
\bibitem{laporte} Eric Laporte.
``Rational Transductions for Phonetic Conversion and Phonology.''
Report IGM 96-14, Institut Gaspard Monge,
Universit\'e de Marne-la-Vall\'ee, Aug. 1995. Also in \cite{rs2}.
\bibitem{ocaml} Xavier Leroy et al. ``Objective Caml.'' See: \raggedright
\verb!http://caml.inria.fr/ocaml/index.html!.
\bibitem{mohri} Mehryar Mohri. ``Finite-State Transducers in Language and
Speech Processing.'' Computational Linguistics 23,2 (1997), pp. 269--311.
\bibitem{paulson} Larry C. Paulson. ``ML for the Working Programmer.''
Cambridge University Press, 1991.
\bibitem{ranta} Aarne Ranta. ``The GF Language: Syntax and Type System.''
See: \raggedright \verb!http://www.cs.chalmers.se/~aarne/GF/!.
\bibitem{camlp4} Daniel de Rauglaudre. ``The Camlp4 preprocessor.''
See: \raggedright \verb!http://caml.inria.fr/camlp4/!.
\bibitem{razet05}
Beno{\^\i}t Razet. Automates modulaires. M\'emoire de Master,
Universit\'e {Denis} {Diderot} (Paris 7), 2005.
\bibitem{Razet08a}
Beno{\^\i}t Razet. Finite {Eilenberg} Machines.
Proceedings of CIAA 2008,
Eds. O.H. Ibarra and B. Ravikumar,
Springer-Verlag LNCS vol. 5148 (2008), pp. 242--251.
\bibitem{Razet08b}
Beno{\^\i}t Razet.
Simulating Finite {Eilenberg} Machines with a Reactive Engine.
In Proceedings of MSFP 2008,
Electronic Notes in Theoretical Computer Science,
\url!http://gallium.inria.fr/~razet/PDF/razet_msfp08.pdf!.
\bibitem{Razet09}
Beno{\^\i}t Razet.
Machines d'{Eilenberg} Effectives. Th\`ese de Doctorat,
Universit\'e {Denis} {Diderot} (Paris 7), 2009.
\bibitem{revuz} Dominique Revuz. ``Dictionnaires et lexiques.'' Th\`ese de
Doctorat, Universit\'e Paris VII, Feb. 1991.
\bibitem{rs1} Emmanuel Roche and Yves Schabes. ``Deterministic Part-of-Speech
Tagging with Finite-State Transducers.''
Computational Linguistics 21,2 (1995), pp. 227--253.
\bibitem{rs2} Emmanuel Roche and Yves Schabes, Eds.
``Finite-State Language Processing.'' MIT Press, 1997.
\bibitem{sproat} Richard Sproat. ``Morphology and Computation.''
MIT Press, 1992.
\bibitem{sproatshih} Richard Sproat, Chilin Shih, William Gale and Nancy Chang.
``A Stochastic Finite-State Word-Segmentation Algorithm for Chinese.''
Computational Linguistics 22,3 (1996), pp. 377--408.
\bibitem{caml} Pierre Weis and Xavier Leroy. ``Le langage Caml.''
2\`eme \'edition, Dunod, Paris, 1999.
\end{thebibliography}
% Extract from ~/TEXT/ESSLLI/DB.tex amended with SL.tex
% Supplement with differentiation reference
% Add example of binary zippers
\subsection{Zippers for binary trees}
We end this section by showing the special case of zippers for
binary trees.
What we just constructed is a simple engine which may recognize a Sanskrit sentence
as a sequence of inflected word forms. Actually such forms are glued together
using a euphony junction
process known as {\sl sandhi}. It is possible to invert the sandhi relation while doing
the recognition, and to use the transducer output to give a trace of the sandhi relation
between the words. Piping this process through a lemmatizer, which itself inverts the
flexional morphology, yields a Sanskrit tagger. This application is described in
\cite{2004-Huet-1}.
We remark that nondeterministic programming is basically
trivial in a functional programming language, provided one identifies well
the search space, states of computation are stored as pure data structures
(which cannot get corrupted by pointer mutation),
and fairness is taken care of by a termination argument
(here this amounts to proving that \verb:react: always terminates).
Nondeterminism is best handled by a generating process which delivers
one solution at a time, and which thus may be used in coroutine fashion with
a solution handler.
The reader will note that the very same state graph which was originally the
state space of the deterministic lexicon lookup is used here for a possibly
non-deterministic transduction. What changes is not the state space, but
the way it is traversed. That is we clearly separate the notion of
finite-state graph, a data structure, from the notion of a reactive process,
which uses this graph as a component of its computation space, other components
being the input and output tapes, possibly a backtrack stack, etc.
We shall continue to investigate transducers which are lexicon mappings,
but now with an explicit non-determinism state component. Such components,
whose structure may vary according to the particular construction, are
decorations on the lexicon structure, which is seen as the basic
deterministic state skeleton of all processes which are
lexicon-driven; we shall say that such processes are
{\sl lexicon morphisms} whenever the decoration of a lexicon trie node is a
function of the sub-trie at that node.
This property entails an important
efficiency consideration, since the sharing of the trie as a dag may be
preserved when constructing the automaton structure:
\noindent
{\bf Fact}. Every lexicon morphism may minimize its state space isomorphically
with the dag maximal sharing of the lexical tree. That is, we may directly
decorate the lexicon dag, since in this case
decorations are invariant by sub-tree sharing.
There are numerous practical applications of this general methodology.
For instance, it is shown in \cite{2004-Huet-1} how to construct
a Sanskrit segmenter as a
decorated inflected forms lexicon, where the decorations express application
of the euphony (sandhi) rules at the juncture between words.
This construction is a
direct extension of the unglueing construction, which is the special case
when there are no euphony rules, or when they are optional.
\subsection{Dagified lexicons}
We now return to our problem of building a lexicon which shares
common suffixes of words as well as common prefixes.
\vspace*{15pt}
Ternary trees are more complex than tries, but use slightly less storage.
Access is potentially faster in balanced trees than tries.
A good methodology seems to be to use tries for editing, and to translate them
to balanced ternary trees for production use with a fixed lexicon.
The ternary version of our English lexicon takes 3.6Mb, a savings of 20\%
over its trie version using 4.5Mb. After dag minimization, it takes 1Mb,
a savings of 10\% over the trie dag version using 1.1Mb.
In the case of our Sanskrit lexicon index, the trie takes 221Kb and the tertree
180Kb, whereas shared as dags the trie takes 103Kb and the tertree 96Kb.
\section{Decorated Tries for Inflected Forms Storage}
\subsection{Decorated Tries}
A set of elements of some type $\tau$ may be identified as its
characteristic predicate in $\tau\rightarrow bool$. A trie with boolean
information may similarly be generalized to a structure
representing a map, or function from words to some target type, by storing
elements of that type in the information slot.
In order to distinguish absence of information, we could use a type
{\sl (option info)} with constructor {\sl None}, presence of value
{\sl v} being indicated by {\sl Some(v)}. We rather
choose here a variant with lists,
which are versatile to represent sets, feature structures, etc. Now we
may associate to a word a non-empty list of information of polymorphic
type $\alpha$, absence of information being encoded by the empty list.
We shall call such associations a {\sl decorated trie}, or
{\sl deco} in short.
\subsection{Some statistics}
\vspace*{15pt}
If we apply this technique to our English lexicon, with command:\\
\verb:dagify <english.rem >small.rem:, we now get an optimal
representation which only needs 1Mb of storage, half of the
original ASCII string representation.
The recursive algorithms given so far are fairly straightforward.
They are easy to debug, maintain and modify,
due to the strong typing safeguard of ML, and even easy to formally certify.
They are nonetheless efficient enough for production use, thanks to the
optimizing native-code compiler of Objective Caml.
In our Sanskrit application, the trie of 11500 entries
is shrunk from 219Kb to 103Kb in 0.1s, whereas the trie of 120000 flexed
forms is shrunk from 1.63Mb to 140Kb in 0.5s on a 864MHz PC.
Our list of 173528 English words, represented as an ASCII file of 1.92
Mbytes, is represented as a trie of 4.5 Mbytes, which shrinks to 1.1
Mbytes by sharing (in 2.7s).
Measurements showed that the time complexity is linear with the size of
the lexicon (within comparable sets of words). This is consistent with
algorithmic analysis, since it is known that tries compress dictionaries
up to a linear entropy factor, and that perfect hashing compresses trees
in dags in linear time \cite{flajsipste}.
Tuning of the hash function parameters leads to many variations.
For instance if we assume an infinite memory we may turn the hash calculation
into a one-to-one G\"odel numbering, and at the opposite end taking
{\sl hash\_max} to 1 we would do list lookup in the unique bucket,
with worse than quadratic performance.
Using hash tables for sharing with bottom-up traversal is a standard
dynamic programming technique, but the usual
way is to delegate computation of the hash function to some hash library,
using a generic low-level package. This is what happens for instance
if one uses the module hashtbl from the Ocaml library. Here the
{\sl Share} module does {\sl not} compute the keys, which are computed
on the client side, avoiding re-exploration of the structures. That is,
{\sl Share} is just an associative memory. Furthermore, key computation may
take advantage of specific statistical distribution of the application domain.
We shall see later another application of the {\sl Share}
functor to the minimization of the state space of (acyclic) finite automata.
Actually, what we just did is minimization of acyclic deterministic
automata represented as lexical dags.
More sophisticated compression techniques are known, which may combine with
array implementations ensuring fast access, and which may extend to possibly
cyclic automata state spaces. Such techniques are used in lexical analysers
for programming languages, for which speed is essential. See for instance
the table-compression method described in section 3.9 of \cite{asu}.
\subsection{ISO-LATIN and French}
The next modules explain how to define the ISO-LATIN encoding, and
how to use it to represent French words.
First we give a simple lexer, which is used to parse raw text with
Camlp4 grammars. Next we give such a grammar, used to define a
transducer from notations such as \verb:e': to ISO-LATIN character
\'e. Finally, we give a module Latin which defines ISO-LATIN encoding.
\subsection{Statistics for French}
We may now instantiate the functor $make\_lex$ with the Latin module.
\section{Basic Utilities}
We present in this section some basic utilities libraries.
\subsection{Miscellaneous primitives}
%\documentclass[11pt]{article}
%\pagestyle{plain}
\def\lbr{\langle} %<
\def\rbr{\rangle} %>
\def\lsq{[} %[
\def\rsq{]} %]
\def\R{{\cal R}} % script R
\def\otrema{\"o} % otrema necessary for balance of "..." in comments.
\def\skip{\vspace{10pt}}
%\def\url{}
%\begin{document}
\begin{center}
\vspace*{24pt}
{\Large The Zen Computational Linguistics Toolkit}\\[10pt]
{\Large Version 3.2}\\[15pt]
{November 24th, 2013}\\[15pt]
{\large G\'erard Huet}\\[10pt]
{\large Copyright \copyright ~2002-2013 INRIA}\\[20pt]
\end{center}
\tableofcontents
\begin{abstract}
We present in this document a few fundamental structures useful for
computational linguistics.
The central structure is that of lexical tree, or {\sl trie}.
A crucial observation is that a trie is isomorphic to the state space
of a deterministic acyclic automaton. More complex finite-state
automata and transducers, deterministic or not, and cyclic or
not, may be represented as tries decorated by extra information. Thus we
obtain a family of structures underlying lexicon-directed linguistic
processes.
First we describe plain tries, which are adequate to represent lexicon indexes.
Then we describe decorated tries, or {\sl decos}, which are appropriate to
represent symbol tables, and dictionaries associating with the lexicon
grammatical or other information. We then describe how to represent
maps and more generally invertible relations between lexicons. We call these
structures lexical maps or {\sl lexmaps}. Lexmaps are appropriate for instance
to associate inflected forms to lexicon stems and roots, using morphological
operations. Such lexmaps are invertible in the sense that we may retrieve
from the lexmap entry of an inflected form the stems and operations from which
it may be obtained. Finally we show how lexicon directed transducers may
be represented using tries decorated with choice points. Such transducers
are useful to describe segmentation and tagging processes.
All data structures and algorithms are described
in a computational metalanguage called \verb:Pidgin ML:. \verb:Pidgin ML: is a
publication language for the ML family of programming languages. All the
algorithms described here could be described as well in Standard ML
or in Objective CAML, to cite two popular ML implementations, or in the
lazy functional language Haskell. They could also be described in a
programming language such as LISP or Scheme, but the strong typing discipline
of ML, supporting polymorphism and modules, is an insurance that computations
cannot corrupt data structures and lead to run-time errors.
An initial chapter of these notes gives a quick overview of \verb:Pidgin ML:.
The resulting design may be considered as the reference implementation of a Free
Computational Linguistics Toolkit. It may prove useful as an ``off the shelf''
toolkit for simple operations on linguistics material. Due to its
lightweight approach we shall talk of the Zen CL Toolkit.
This toolkit was abstracted from the Sanskrit ML Library, which constitutes
its first large-scale application. Thus some of this material already
appeared in the documentation of the Sanskrit Segmenter algorithm,
which solves Sandhi Analysis \cite{2004-Huet-1}.
%The Sanskrit Library Documentation, a companion to this document,
%is available at \url{http://pauillac.inria.fr/~huet/SKT/DOC/doc.ps} under
%format postscript, \verb|doc.pdf| under format pdf,
%and \verb|doc.html| under format html.
This document was automatically generated from the code of the toolkit
using the Ocamlweb package of Jean-Christophe Filli\^atre,
with the Latex package, in the literate programming style pioneered by
Don Knuth.
%The Html version uses the Hevea Tex-to-Html translator of Luc Maranget.
\end{abstract}
\part{Dictionaries}
\section{Pidgin ML}
We shall use as {\sl meta language} for the description of our algorithms
a pidgin version of the functional language ML
\cite{ML-LCF,MLer,paulson,caml}.
Readers familiar with ML
may skip this section, which gives a crash
overview of its syntax and semantics.
%Part II
\part{Reactive Transducers}
\section{Introduction}
This second part gives additional tools for manipulating variants of finite-state machines.
They are a natural extension of the unglueing process presented at the end of Part I.
The general idea is to represent applicatively the state graph of finite-state machines
as a decorated dictionary. The dictionary, used as spanning tree of the state transition graph,
is a deterministic subset of this graph. The rest of the structure of the finite-state machine,
permitting the representation of non-determinism, of loops, and of transducer operations,
is encoded as attributes decorating the dictionary nodes. This general framework
of {\sl mixed automata} or {\sl aums}, is described in reference \cite{2003-Huet-3}.
Its application to the problem of segmentation and tagging of Sanskrit is described
in \cite{2004-Huet-1}.
We provide here various specific examples of this general methodology, and a mechanism
for composing such finite-state descriptions in a modular fashion \cite{2006-Huet-Razet}.
This methodology has been lifted more recently
to a very general paradigm of relational
programming within the framework of Eilenberg machines by Beno{\^\i}t Razet
\cite{2008-Huet-Razet, Razet08a, Razet08b, Razet09}.
\section{A simplistic modular Automaton recognizer}
The simplest aum structure is the one reduced to deterministic acyclic finite-state automata,
where the aum structure is reduced to the underlying dictionary (Trie). Provided all
states are accessible from the initial one, the reduced structure obtained by applying the
Sharing functor yields the minimal deterministic automaton. This framework applies to the
simple but important subcase of finite languages.
We assume known the modules of the first part of the toolkit documentation.
\subsection{Simplistic aums}
\subsection{Implementing a lexicon as a trie}
Now, using the coercion $encode$ from strings to words from the
$Ascii$ module,
we build a lexicon trie from a list of strings by function $make\_lex$,
using Ocaml's $fold\_left$ from the $List$ library
(the terminal recursive list iterator).
\subsection{Lexical maps}
We can easily generalize sharing to decorated tries. However,
substantial savings will result only if the information at a given node
is a function of the subtrie at that node, i.e. if such information is
defined as a {\sl trie morphism}. This will not be generally the case,
since this information is in general a function of the word stored at
that point, and thus of all the accessing path to that node. The way in which
the information is encoded is of course crucial. For instance, encoding
morphological derivation as an operation on the suffix of a flexed form
is likely to be amenable to sharing common suffixes in the flexed trie,
whereas encoding it as an operation on the whole stem will prevent any
such sharing.
In order to facilitate the sharing of mappings which preserve an initial
prefix of a word, we shall use the notion of {\sl differential word} above.
We may now store inverse maps of lexical relations (such as morphology
derivations) using the following structures
(where the type parameter $\alpha$ codes the relation).
\subsection{Lexical maps}
We can easily generalize sharing to decorated tries. However,
substantial savings will result only if the information at a given node
is a function of the subtrie at that node, i.e. if such information is
defined as a {\sl trie morphism}. This will not be generally the case,
since this information is in general a function of the word stored at
that point, and thus of all the accessing path to that node. The way in which
the information is encoded is of course crucial. For instance, encoding
morphological derivation as an operation on the suffix of an inflected form
is likely to be amenable to sharing common suffixes in the inflected trie,
whereas encoding it as an operation on the whole stem will prevent any
such sharing.
In order to facilitate the sharing of mappings which preserve an initial
prefix of a word, we shall use the notion of {\sl differential word} above.
We may now store inverse maps of lexical relations (such as morphology
derivations) using the following structures
(where the type parameter $\alpha$ codes the relation).