open Grammar
module InfiniteArray =
MenhirLib.InfiniteArray
(* ------------------------------------------------------------------------ *)
(* Symbolic lookahead information. *)
(* A symbolic lookahead set consists of an actual concrete set of
terminal symbols and of a number of set variables. Set variables as
encoded as integers. *)
module SymbolicLookahead = struct
type t =
TerminalSet.t * CompressedBitSet.t
let constant toks =
(toks, CompressedBitSet.empty)
let empty =
constant TerminalSet.empty
let union (toks1, vars1) ((toks2, vars2) as s2) =
let toks = TerminalSet.union toks1 toks2
and vars = CompressedBitSet.union vars1 vars2 in
if toks2 == toks && vars2 == vars then
s2
else
(toks, vars)
let variable (var : int) : t =
(TerminalSet.empty, CompressedBitSet.singleton var)
let project (toks, vars) =
assert (CompressedBitSet.is_empty vars);
toks
end
(* We will perform closure operations over symbolic lookahead sets.
This allows us to later represent LR(1) states as pairs of an
LR(0) node number and an array of concrete lookahead sets. *)
module SymbolicClosure =
Item.Closure(SymbolicLookahead)
(* Closure operations over concrete lookahead sets are also used (when
explaining conflicts). One could take another instance of the
functor. The approach below is somewhat less elegant and makes each
call to [closure] somewhat slower, but saves the cost of
instantiating the functor again -- which is linear in the size of
the grammar. *)
type concretelr1state =
TerminalSet.t Item.Map.t
let closure (state : concretelr1state) : concretelr1state =
Item.Map.map SymbolicLookahead.project
(SymbolicClosure.closure
(Item.Map.map SymbolicLookahead.constant state))
(* ------------------------------------------------------------------------ *)
(* Finding which non-epsilon transitions leave a set of items. This
code is parametric in the nature of lookahead sets. *)
let transitions (state : 'a Item.Map.t) : 'a Item.Map.t SymbolMap.t =
Item.Map.fold (fun item toks transitions ->
match Item.classify item with
| Item.Shift (symbol, item') ->
let items : 'a Item.Map.t =
try
SymbolMap.find symbol transitions
with Not_found ->
Item.Map.empty
in
SymbolMap.add symbol (Item.Map.add item' toks items) transitions
| Item.Reduce _ ->
transitions
) state SymbolMap.empty
(* ------------------------------------------------------------------------ *)
(* Determining the reduction opportunities at a (closed) state. They
are represented as a list of pairs of a lookahead set and a
production index. This code is again parametric in the nature of
lookahead sets. *)
let reductions (state : 'a Item.Map.t) : ('a * Production.index) list =
Item.Map.fold (fun item toks accu ->
match Item.classify item with
| Item.Reduce prod ->
(toks, prod) :: accu
| Item.Shift _ ->
accu
) state []
(* ------------------------------------------------------------------------ *)
(* Construction of the the LR(0) automaton. *)
(* Nodes are numbered sequentially. *)
type node =
int
(* A symbolic transition is a pair of the target state number and an
array of symbolic lookahead sets. The variables in these sets are
numbered in [0,g) where g is the number of items in the source
LR(0) state. Items are numbered in the order of presentation by
[Item.Set.fold]. *)
type symbolic_transition_target =
node * SymbolicLookahead.t array
(* The automaton is represented by (growing) arrays of states (sets of
items), symbolic transition information, and symbolic reduction
information, indexed by node numbers. Conversely, a hash table maps
states (sets of items) to node numbers. *)
let n =
ref 0
let states : Item.Set.t InfiniteArray.t =
InfiniteArray.make Item.Set.empty
let _transitions : symbolic_transition_target SymbolMap.t InfiniteArray.t =
InfiniteArray.make SymbolMap.empty
let _reductions : (SymbolicLookahead.t * Production.index) list InfiniteArray.t =
InfiniteArray.make []
let map : (Item.Set.t, node) Hashtbl.t =
Hashtbl.create 50021
let incoming : Symbol.t option InfiniteArray.t =
InfiniteArray.make None
(* The automaton is built depth-first. *)
let rec explore (symbol : Symbol.t option) (state : Item.Set.t) : node =
(* Find out whether this state was already explored. *)
try
Hashtbl.find map state
with Not_found ->
(* If not, create a new node. *)
let k = !n in
n := k + 1;
InfiniteArray.set states k state;
Hashtbl.add map state k;
(* Record its incoming symbol. *)
InfiniteArray.set incoming k symbol;
(* Build a symbolic version of the current state, where each item
is associated with a distinct lookahead set variable, numbered
consecutively. *)
let (_ : int), (symbolic_state : SymbolicClosure.state) =
Item.Set.fold (fun item (i, symbolic_state) ->
i+1, Item.Map.add item (SymbolicLookahead.variable i) symbolic_state
) state (0, Item.Map.empty) in
(* Compute the symbolic closure. *)
let closure = SymbolicClosure.closure symbolic_state in
(* Compute symbolic information about reductions. *)
InfiniteArray.set _reductions k (reductions closure);
(* Compute symbolic information about the transitions, and, by
dropping the symbolic lookahead information, explore the
transitions to further LR(0) states. *)
InfiniteArray.set _transitions k (SymbolMap.mapi (fun symbol symbolic_state ->
let (k : node) = explore (Some symbol) (Item.Map.domain symbolic_state) in
let lookahead : SymbolicLookahead.t array =
Array.make (Item.Map.cardinal symbolic_state) SymbolicLookahead.empty in
let (_ : int) = Item.Map.fold (fun _ s i ->
lookahead.(i) <- s;
i+1
) symbolic_state 0 in
((k, lookahead) : symbolic_transition_target)
) (transitions closure));
k
(* Creating a start state out of a start production. It contains a
single item, consisting of the start production, at position 0. *)
let start prod : Item.Set.t =
Item.Set.singleton (Item.import (prod, 0))
(* This starts the construction of the automaton and records the
entry nodes in an array. *)
let entry : node ProductionMap.t =
ProductionMap.start (fun prod ->
explore None (start prod)
)
let () =
Hashtbl.clear map
let n =
!n
let () =
Error.logA 1 (fun f -> Printf.fprintf f "Built an LR(0) automaton with %d states.\n" n);
Time.tick "Construction of the LR(0) automaton"
(* ------------------------------------------------------------------------ *)
(* Accessors. *)
let items node : Item.Set.t =
InfiniteArray.get states node
let incoming_symbol node : Symbol.t option =
InfiniteArray.get incoming node
(* ------------------------------------------------------------------------ *)
(* Help for building the LR(1) automaton. *)
(* An LR(1) state is represented as a pair of an LR(0) state number
and an array of concrete lookahead sets (whose length depends on
the LR(0) state). *)
type lr1state =
node * TerminalSet.t array
(* An encoded LR(1) state can be turned into a concrete representation,
that is, a mapping of items to concrete lookahead sets. *)
let export (k, toksr) =
let (_ : int), items = Item.Set.fold (fun item (i, items) ->
i+1, Item.Map.add item toksr.(i) items
) (InfiniteArray.get states k) (0, Item.Map.empty) in
items
(* Displaying a concrete state. *)
let print_concrete leading (state : concretelr1state) =
let buffer = Buffer.create 1024 in
Item.Map.iter (fun item toks ->
Printf.bprintf buffer "%s%s[ %s ]\n"
leading
(Item.print item)
(TerminalSet.print toks)
) state;
Buffer.contents buffer
(* Displaying a state. By default, only the kernel is displayed, not
the closure. *)
let print leading state =
print_concrete leading (export state)
let print_closure leading state =
print_concrete leading (closure (export state))
(* The core of an LR(1) state is the underlying LR(0) state. *)
let core (k, _) =
k
(* A sanity check. *)
let well_formed (k, toksr) =
Array.length toksr = Item.Set.cardinal (InfiniteArray.get states k)
(* An LR(1) start state is the combination of an LR(0) start state
(which consists of a single item) with a singleton lookahead set
that consists of the end-of-file pseudo-token. *)
let start k =
let state = (k, [| TerminalSet.singleton Terminal.sharp |]) in
assert (well_formed state);
state
(* Interpreting a symbolic lookahead set with respect to a source
state. The variables in the symbolic lookahead set (which are
integers) are interpreted as indices into the state's array of
concrete lookahead sets. The result is a concrete lookahead set. *)
let interpret
((_, toksr) as state : lr1state)
((toks, vars) : SymbolicLookahead.t)
: TerminalSet.t =
assert (well_formed state);
CompressedBitSet.fold (fun var toks ->
assert (var >= 0 && var < Array.length toksr);
TerminalSet.union toksr.(var) toks
) vars toks
(* Out of an LR(1) state, one produces information about reductions
and transitions. This is done in an efficient way by interpreting
the precomputed symbolic information with respect to that state. *)
let reductions
((k, _) as state : lr1state)
: (TerminalSet.t * Production.index) list =
List.map (fun (s, prod) ->
interpret state s, prod
) (InfiniteArray.get _reductions k)
let transitions
((k, _) as state : lr1state)
: lr1state SymbolMap.t =
SymbolMap.map (fun ((k, sr) : symbolic_transition_target) ->
((k, Array.map (interpret state) sr) : lr1state)
) (InfiniteArray.get _transitions k)
let outgoing_symbols
(k : node)
: Symbol.t list =
SymbolMap.domain (InfiniteArray.get _transitions k)
let transition
symbol
((k, _) as state : lr1state)
: lr1state =
let ((k, sr) : symbolic_transition_target) =
try
SymbolMap.find symbol (InfiniteArray.get _transitions k)
with Not_found ->
assert false (* no transition along this symbol *)
in
(k, Array.map (interpret state) sr)
(* Equality of states. *)
let equal ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
assert (k1 = k2 && well_formed state1 && well_formed state2);
let rec loop i =
if i = 0 then
true
else
let i = i - 1 in
(TerminalSet.equal toksr1.(i) toksr2.(i)) && (loop i)
in
loop (Array.length toksr1)
(* Subsumption between states. *)
let subsume ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
assert (k1 = k2 && well_formed state1 && well_formed state2);
let rec loop i =
if i = 0 then
true
else
let i = i - 1 in
(TerminalSet.subset toksr1.(i) toksr2.(i)) && (loop i)
in
loop (Array.length toksr1)
(* This function determines whether two (core-equivalent) states are
compatible, according to a criterion that is close to Pager's weak
compatibility criterion.
Pager's criterion guarantees that if a merged state has a potential
conflict at [(i, j)] -- that is, some token [t] appears within the
lookahead sets of both item [i] and item [j] -- then there exists a
state in the canonical automaton that also has a potential conflict
at [(i, j)] -- that is, some token [u] appears within the lookahead
sets of both item [i] and item [j]. Note that [t] and [u] can be
distinct.
Pager has shown that his weak compatibility criterion is stable,
that is, preserved by transitions and closure. This means that, if
two states can be merged, then so can their successors. This is
important, because merging two states means committing to merging
their successors, even though we have not even built these
successors yet.
The criterion used here is a slightly more restrictive version of
Pager's criterion, which guarantees equality of the tokens [t] and
[u]. This is done essentially by applying Pager's original
criterion on a token-wise basis. Pager's original criterion states
that two states can be merged if the new state has no conflict or
one of the original states has a conflict. Our more restrictive
criterion states that two states can be merged if, for every token
[t], the new state has no conflict at [t] or one of the original
states has a conflict at [t].
This modified criterion is also stable. My experiments show that it
is almost as effective in practice: out of more than a hundred
real-world sample grammars, only one automaton was affected, and
only one extra state appeared as a result of using the modified
criterion. Its advantage is to potentially make conflict
explanations easier: if there appears to be a conflict at [t], then
some conflict at [t] can be explained. This was not true when using
Pager's original criterion. *)
let compatible (k1, toksr1) (k2, toksr2) =
assert (k1 = k2);
let n = Array.length toksr1 in
(* Two states are compatible if and only if they are compatible
at every pair (i, j), where i and j are distinct. *)
let rec loopi i =
if i = n then
true
else
let toksr1i = toksr1.(i)
and toksr2i = toksr2.(i) in
let rec loopj j =
if j = i then
true
else
let toksr1j = toksr1.(j)
and toksr2j = toksr2.(j) in
(* The two states are compatible at (i, j) if every conflict
token in the merged state already was a conflict token in
one of the two original states. This could be written as
follows:
TerminalSet.subset
(TerminalSet.inter (TerminalSet.union toksr1i toksr2i) (TerminalSet.union toksr1j toksr2j))
(TerminalSet.union (TerminalSet.inter toksr1i toksr1j) (TerminalSet.inter toksr2i toksr2j))
but is easily seen (on paper) to be equivalent to:
*)
TerminalSet.subset
(TerminalSet.inter toksr2i toksr1j)
(TerminalSet.union toksr1i toksr2j)
&&
TerminalSet.subset
(TerminalSet.inter toksr1i toksr2j)
(TerminalSet.union toksr2i toksr1j)
&&
loopj (j+1)
in
loopj 0 && loopi (i+1)
in
loopi 0
(* This function determines whether two (core-equivalent) states can
be merged without creating an end-of-stream conflict, now or in the
future.
The rule is, if an item appears in one state with the singleton "#"
as its lookahead set, then its lookahead set in the other state
must contain "#".
So, either the second lookahead set is also the singleton "#", and
no end-of-stream conflict exists, or it is larger, and the second
state already contains an end-of-stream conflict.
Put another way, we do not want to merge two lookahead sets when one
contains "#" alone and the other does not contain "#".
I invented this rule to complement Pager's criterion. I believe,
but I am not 100% sure, that it does indeed prevent end-of-stream
conflicts and that it is stable.
Thanks to Sébastien Hinderer for reporting the bug caused by the
absence of this extra criterion. *)
let eos_compatible (k1, toksr1) (k2, toksr2) =
assert (k1 = k2);
let n = Array.length toksr1 in
let rec loop i =
if i = n then
true
else
let toks1 = toksr1.(i)
and toks2 = toksr2.(i) in
begin
if TerminalSet.mem Terminal.sharp toks1 && TerminalSet.is_singleton toks1 then
(* "#" is alone in one set: it must be a member of the other set. *)
TerminalSet.mem Terminal.sharp toks2
else if TerminalSet.mem Terminal.sharp toks2 && TerminalSet.is_singleton toks2 then
(* Symmetric condition. *)
TerminalSet.mem Terminal.sharp toks1
else
true
end
&& loop (i+1)
in
loop 0
(* This function determines whether two (core-equivalent) states can
be merged without creating spurious reductions on the [error]
token.
The rule is, we merge two states only if they agree on which
reductions are permitted on the [error] token.
Without this restriction, we might end up in a situation where we
decide to introduce an [error] token into the input stream and
perform a reduction, whereas a canonical LR(1) automaton,
confronted with the same input string, would fail normally -- that
is, it would introduce an [error] token into the input stream, but
it would not be able to perform a reduction right away: the current
state would be discarded.
In the interest of more accurate (or sane, or predictable) error
handling, I decided to introduce this restriction as of 20110124.
This will cause an increase in the size of automata for grammars
that use the [error] token. It might actually make the [error]
token somewhat easier to use.
Note that two sets can be in the subsumption relation and still
be error-incompatible. Error-compatibility requires equality of
the lookahead sets, restricted to [error].
Thanks to Didier Rémy for reporting a bug caused by the absence
of this extra criterion. *)
let error_compatible (k1, toksr1) (k2, toksr2) =
assert (k1 = k2);
let n = Array.length toksr1 in
let rec loop i =
if i = n then
true
else
let toks1 = toksr1.(i)
and toks2 = toksr2.(i) in
begin
if TerminalSet.mem Terminal.error toks1 then
(* [error] is a member of one set: it must be a member of the other set. *)
TerminalSet.mem Terminal.error toks2
else if TerminalSet.mem Terminal.error toks2 then
(* Symmetric condition. *)
TerminalSet.mem Terminal.error toks1
else
true
end
&& loop (i+1)
in
loop 0
(* Union of two states. The two states must have the same core. The
new state is obtained by pointwise union of the lookahead sets. *)
let union (k1, toksr1) (k2, toksr2) =
assert (k1 = k2);
k1, Array.init (Array.length toksr1) (fun i ->
TerminalSet.union toksr1.(i) toksr2.(i)
)
(* Restriction of a state to a set of tokens of interest. Every
lookahead set is intersected with that set. *)
let restrict toks (k, toksr) =
k, Array.map (fun toksri ->
TerminalSet.inter toksri toks
) toksr