lr0.ml 17.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
open Grammar

module InfiniteArray =
  MenhirLib.InfiniteArray

(* ------------------------------------------------------------------------ *)
(* Symbolic lookahead information. *)

(* A symbolic lookahead set consists of an actual concrete set of
   terminal symbols and of a number of set variables. Set variables as
   encoded as integers. *)

module SymbolicLookahead = struct

  type t =
    TerminalSet.t * CompressedBitSet.t

  let constant toks =
    (toks, CompressedBitSet.empty)

  let empty =
    constant TerminalSet.empty

  let union (toks1, vars1) ((toks2, vars2) as s2) =
    let toks = TerminalSet.union toks1 toks2
    and vars = CompressedBitSet.union vars1 vars2 in
    if toks2 == toks && vars2 == vars then
      s2
    else   
      (toks, vars)

  let variable (var : int) : t =
    (TerminalSet.empty, CompressedBitSet.singleton var)

  let project (toks, vars) =
    assert (CompressedBitSet.is_empty vars);
    toks

end

(* We will perform closure operations over symbolic lookahead sets.
   This allows us to later represent LR(1) states as pairs of an
   LR(0) node number and an array of concrete lookahead sets. *)

module SymbolicClosure =
  Item.Closure(SymbolicLookahead)

(* Closure operations over concrete lookahead sets are also used (when
   explaining conflicts). One could take another instance of the
   functor. The approach below is somewhat less elegant and makes each
   call to [closure] somewhat slower, but saves the cost of
   instantiating the functor again -- which is linear in the size of
   the grammar. *)

type concretelr1state =
    TerminalSet.t Item.Map.t

let closure (state : concretelr1state) : concretelr1state =
   Item.Map.map SymbolicLookahead.project
     (SymbolicClosure.closure
       (Item.Map.map SymbolicLookahead.constant state))

(* ------------------------------------------------------------------------ *)
(* Finding which non-epsilon transitions leave a set of items. This
   code is parametric in the nature of lookahead sets. *)

let transitions (state : 'a Item.Map.t) : 'a Item.Map.t SymbolMap.t =

  Item.Map.fold (fun item toks transitions ->
    match Item.classify item with
    | Item.Shift (symbol, item') ->
	let items : 'a Item.Map.t =
	  try
	    SymbolMap.find symbol transitions
	  with Not_found ->
	    Item.Map.empty
	in
	SymbolMap.add symbol (Item.Map.add item' toks items) transitions
    | Item.Reduce _ ->
	transitions
  ) state SymbolMap.empty

(* ------------------------------------------------------------------------ *)
(* Determining the reduction opportunities at a (closed) state. They
   are represented as a list of pairs of a lookahead set and a
   production index. This code is again parametric in the nature of
   lookahead sets. *)

let reductions (state : 'a Item.Map.t) : ('a * Production.index) list =
  Item.Map.fold (fun item toks accu ->
    match Item.classify item with
    | Item.Reduce prod ->
	(toks, prod) :: accu
    | Item.Shift _ ->
	accu
  ) state []

(* ------------------------------------------------------------------------ *)
(* Construction of the the LR(0) automaton. *)

(* Nodes are numbered sequentially. *)

type node =
    int

(* A symbolic transition is a pair of the target state number and an
   array of symbolic lookahead sets. The variables in these sets are
   numbered in [0,g) where g is the number of items in the source
   LR(0) state. Items are numbered in the order of presentation by
   [Item.Set.fold]. *)

type symbolic_transition_target =
    node * SymbolicLookahead.t array

(* The automaton is represented by (growing) arrays of states (sets of
   items), symbolic transition information, and symbolic reduction
   information, indexed by node numbers. Conversely, a hash table maps
   states (sets of items) to node numbers. *)

let n =
  ref 0

let states : Item.Set.t InfiniteArray.t =
  InfiniteArray.make Item.Set.empty

let _transitions : symbolic_transition_target SymbolMap.t InfiniteArray.t =
  InfiniteArray.make SymbolMap.empty

let _reductions : (SymbolicLookahead.t * Production.index) list InfiniteArray.t =
  InfiniteArray.make []

let map : (Item.Set.t, node) Hashtbl.t =
  Hashtbl.create 50021

135 136 137
let incoming : Symbol.t option InfiniteArray.t =
  InfiniteArray.make None

138 139
(* The automaton is built depth-first. *)

140
let rec explore (symbol : Symbol.t option) (state : Item.Set.t) : node =
141 142 143 144

  (* Find out whether this state was already explored. *)

  try
145
    Hashtbl.find map state
146 147 148 149 150 151 152 153 154
  with Not_found ->

    (* If not, create a new node. *)

    let k = !n in
    n := k + 1;
    InfiniteArray.set states k state;
    Hashtbl.add map state k;

155 156 157 158
    (* Record its incoming symbol. *)

    InfiniteArray.set incoming k symbol;

159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
    (* Build a symbolic version of the current state, where each item
       is associated with a distinct lookahead set variable, numbered
       consecutively. *)

    let (_ : int), (symbolic_state : SymbolicClosure.state) =
      Item.Set.fold (fun item (i, symbolic_state) ->
	i+1, Item.Map.add item (SymbolicLookahead.variable i) symbolic_state
      ) state (0, Item.Map.empty) in

    (* Compute the symbolic closure. *)

    let closure = SymbolicClosure.closure symbolic_state in

    (* Compute symbolic information about reductions. *)

    InfiniteArray.set _reductions k (reductions closure);

    (* Compute symbolic information about the transitions, and, by
       dropping the symbolic lookahead information, explore the
       transitions to further LR(0) states. *)

180 181
    InfiniteArray.set _transitions k (SymbolMap.mapi (fun symbol symbolic_state ->
      let (k : node) = explore (Some symbol) (Item.Map.domain symbolic_state) in
182
      let lookahead : SymbolicLookahead.t array =
183
	Array.make (Item.Map.cardinal symbolic_state) SymbolicLookahead.empty in
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
      let (_ : int) = Item.Map.fold (fun _ s i ->
	lookahead.(i) <- s;
	i+1
      ) symbolic_state 0 in
      ((k, lookahead) : symbolic_transition_target)
    ) (transitions closure));

    k

(* Creating a start state out of a start production. It contains a
   single item, consisting of the start production, at position 0. *)

let start prod : Item.Set.t =
   Item.Set.singleton (Item.import (prod, 0))

(* This starts the construction of the automaton and records the
   entry nodes in an array. *)

let entry : node ProductionMap.t =
  ProductionMap.start (fun prod ->
204
    explore None (start prod)
205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
  )

let () =
  Hashtbl.clear map

let n =
  !n

let () =
  Error.logA 1 (fun f -> Printf.fprintf f "Built an LR(0) automaton with %d states.\n" n);
  Time.tick "Construction of the LR(0) automaton"

(* ------------------------------------------------------------------------ *)
(* Accessors. *)

let items node : Item.Set.t =
  InfiniteArray.get states node

223 224 225
let incoming_symbol node : Symbol.t option =
  InfiniteArray.get incoming node

226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246
(* ------------------------------------------------------------------------ *)
(* Help for building the LR(1) automaton. *)

(* An LR(1) state is represented as a pair of an LR(0) state number
   and an array of concrete lookahead sets (whose length depends on
   the LR(0) state). *)

type lr1state =
    node * TerminalSet.t array

(* An encoded LR(1) state can be turned into a concrete representation,
   that is, a mapping of items to concrete lookahead sets. *)

let export (k, toksr) =
  let (_ : int), items = Item.Set.fold (fun item (i, items) ->
    i+1, Item.Map.add item toksr.(i) items
  ) (InfiniteArray.get states k) (0, Item.Map.empty) in
  items

(* Displaying a concrete state. *)

247
let print_concrete leading (state : concretelr1state) =
248 249
  let buffer = Buffer.create 1024 in
  Item.Map.iter (fun item toks ->
250 251 252 253
    Printf.bprintf buffer "%s%s[ %s ]\n"
      leading
      (Item.print item)
      (TerminalSet.print toks)
254 255 256 257 258 259
  ) state;
  Buffer.contents buffer

(* Displaying a state. By default, only the kernel is displayed, not
   the closure. *)

260 261
let print leading state =
  print_concrete leading (export state)
262

263 264
let print_closure leading state =
  print_concrete leading (closure (export state))
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480

(* The core of an LR(1) state is the underlying LR(0) state. *)

let core (k, _) =
  k

(* A sanity check. *)

let well_formed (k, toksr) =
  Array.length toksr = Item.Set.cardinal (InfiniteArray.get states k)

(* An LR(1) start state is the combination of an LR(0) start state
   (which consists of a single item) with a singleton lookahead set
   that consists of the end-of-file pseudo-token. *)

let start k =
  let state = (k, [| TerminalSet.singleton Terminal.sharp |]) in
  assert (well_formed state);
  state

(* Interpreting a symbolic lookahead set with respect to a source
   state. The variables in the symbolic lookahead set (which are
   integers) are interpreted as indices into the state's array of
   concrete lookahead sets. The result is a concrete lookahead set. *)

let interpret
    ((_, toksr) as state : lr1state)
    ((toks, vars) : SymbolicLookahead.t)
    : TerminalSet.t =

  assert (well_formed state);
  CompressedBitSet.fold (fun var toks ->
    assert (var >= 0 && var < Array.length toksr);
    TerminalSet.union toksr.(var) toks
  ) vars toks

(* Out of an LR(1) state, one produces information about reductions
   and transitions. This is done in an efficient way by interpreting
   the precomputed symbolic information with respect to that state. *)

let reductions
    ((k, _) as state : lr1state)
    : (TerminalSet.t * Production.index) list =

  List.map (fun (s, prod) ->
    interpret state s, prod
  ) (InfiniteArray.get _reductions k)

let transitions
    ((k, _) as state : lr1state)
    : lr1state SymbolMap.t =

  SymbolMap.map (fun ((k, sr) : symbolic_transition_target) ->
    ((k, Array.map (interpret state) sr) : lr1state)
  ) (InfiniteArray.get _transitions k)

let outgoing_symbols
    (k : node)
    : Symbol.t list =

  SymbolMap.domain (InfiniteArray.get _transitions k)

let transition
    symbol
    ((k, _) as state : lr1state)
    : lr1state =

  let ((k, sr) : symbolic_transition_target) =
    try
      SymbolMap.find symbol (InfiniteArray.get _transitions k)
    with Not_found ->
      assert false (* no transition along this symbol *)
  in
  (k, Array.map (interpret state) sr)

(* Equality of states. *)

let equal ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
  assert (k1 = k2 && well_formed state1 && well_formed state2);
  let rec loop i =
    if i = 0 then
      true
    else
      let i = i - 1 in
      (TerminalSet.equal toksr1.(i) toksr2.(i)) && (loop i)
  in
  loop (Array.length toksr1)

(* Subsumption between states. *)

let subsume ((k1, toksr1) as state1) ((k2, toksr2) as state2) =
  assert (k1 = k2 && well_formed state1 && well_formed state2);
  let rec loop i =
    if i = 0 then
      true
    else
      let i = i - 1 in
      (TerminalSet.subset toksr1.(i) toksr2.(i)) && (loop i)
  in
  loop (Array.length toksr1)

(* This function determines whether two (core-equivalent) states are
   compatible, according to a criterion that is close to Pager's weak
   compatibility criterion.

   Pager's criterion guarantees that if a merged state has a potential
   conflict at [(i, j)] -- that is, some token [t] appears within the
   lookahead sets of both item [i] and item [j] -- then there exists a
   state in the canonical automaton that also has a potential conflict
   at [(i, j)] -- that is, some token [u] appears within the lookahead
   sets of both item [i] and item [j]. Note that [t] and [u] can be
   distinct.

   Pager has shown that his weak compatibility criterion is stable,
   that is, preserved by transitions and closure. This means that, if
   two states can be merged, then so can their successors. This is
   important, because merging two states means committing to merging
   their successors, even though we have not even built these
   successors yet.

   The criterion used here is a slightly more restrictive version of
   Pager's criterion, which guarantees equality of the tokens [t] and
   [u]. This is done essentially by applying Pager's original
   criterion on a token-wise basis. Pager's original criterion states
   that two states can be merged if the new state has no conflict or
   one of the original states has a conflict. Our more restrictive
   criterion states that two states can be merged if, for every token
   [t], the new state has no conflict at [t] or one of the original
   states has a conflict at [t].

   This modified criterion is also stable. My experiments show that it
   is almost as effective in practice: out of more than a hundred
   real-world sample grammars, only one automaton was affected, and
   only one extra state appeared as a result of using the modified
   criterion. Its advantage is to potentially make conflict
   explanations easier: if there appears to be a conflict at [t], then
   some conflict at [t] can be explained. This was not true when using
   Pager's original criterion. *)

let compatible (k1, toksr1) (k2, toksr2) =
  assert (k1 = k2);
  let n = Array.length toksr1 in
  (* Two states are compatible if and only if they are compatible
     at every pair (i, j), where i and j are distinct. *)
  let rec loopi i =
    if i = n then
      true
    else
      let toksr1i = toksr1.(i)
      and toksr2i = toksr2.(i) in
      let rec loopj j =
	if j = i then
	  true
	else
	  let toksr1j = toksr1.(j)
	  and toksr2j = toksr2.(j) in

	  (* The two states are compatible at (i, j) if every conflict
	     token in the merged state already was a conflict token in
	     one of the two original states. This could be written as
	     follows:

            TerminalSet.subset
	      (TerminalSet.inter (TerminalSet.union toksr1i toksr2i) (TerminalSet.union toksr1j toksr2j))
	      (TerminalSet.union (TerminalSet.inter toksr1i toksr1j) (TerminalSet.inter toksr2i toksr2j))

	     but is easily seen (on paper) to be equivalent to:

          *)

	     TerminalSet.subset
	       (TerminalSet.inter toksr2i toksr1j)
	       (TerminalSet.union toksr1i toksr2j)
	  &&
	     TerminalSet.subset
	       (TerminalSet.inter toksr1i toksr2j)
	       (TerminalSet.union toksr2i toksr1j)
	  &&
	     loopj (j+1)
      in
      loopj 0 && loopi (i+1)
  in
  loopi 0

(* This function determines whether two (core-equivalent) states can
   be merged without creating an end-of-stream conflict, now or in the
   future.

   The rule is, if an item appears in one state with the singleton "#"
   as its lookahead set, then its lookahead set in the other state
   must contain "#".

   So, either the second lookahead set is also the singleton "#", and
   no end-of-stream conflict exists, or it is larger, and the second
   state already contains an end-of-stream conflict.

   Put another way, we do not want to merge two lookahead sets when one
   contains "#" alone and the other does not contain "#".

   I invented this rule to complement Pager's criterion. I believe,
   but I am not 100% sure, that it does indeed prevent end-of-stream
   conflicts and that it is stable.

   Thanks to Sébastien Hinderer for reporting the bug caused by the
   absence of this extra criterion. *)

let eos_compatible  (k1, toksr1) (k2, toksr2) =
  assert (k1 = k2);
  let n = Array.length toksr1 in
  let rec loop i =
    if i = n then
      true
    else
      let toks1 = toksr1.(i)
      and toks2 = toksr2.(i) in
      begin
481
	if TerminalSet.mem Terminal.sharp toks1 && TerminalSet.is_singleton toks1 then
482 483
	  (* "#" is alone in one set: it must be a member of the other set. *)
	  TerminalSet.mem Terminal.sharp toks2
484
	else if TerminalSet.mem Terminal.sharp toks2 && TerminalSet.is_singleton toks2 then
485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561
	  (* Symmetric condition. *)
	  TerminalSet.mem Terminal.sharp toks1
	else
	  true
      end
      && loop (i+1)
  in
  loop 0

(* This function determines whether two (core-equivalent) states can
   be merged without creating spurious reductions on the [error]
   token.

   The rule is, we merge two states only if they agree on which
   reductions are permitted on the [error] token.

   Without this restriction, we might end up in a situation where we
   decide to introduce an [error] token into the input stream and
   perform a reduction, whereas a canonical LR(1) automaton,
   confronted with the same input string, would fail normally -- that
   is, it would introduce an [error] token into the input stream, but
   it would not be able to perform a reduction right away: the current
   state would be discarded.

   In the interest of more accurate (or sane, or predictable) error
   handling, I decided to introduce this restriction as of 20110124.
   This will cause an increase in the size of automata for grammars
   that use the [error] token. It might actually make the [error]
   token somewhat easier to use.

   Note that two sets can be in the subsumption relation and still
   be error-incompatible. Error-compatibility requires equality of
   the lookahead sets, restricted to [error].

   Thanks to Didier Rémy for reporting a bug caused by the absence
   of this extra criterion. *)

let error_compatible  (k1, toksr1) (k2, toksr2) =
  assert (k1 = k2);
  let n = Array.length toksr1 in
  let rec loop i =
    if i = n then
      true
    else
      let toks1 = toksr1.(i)
      and toks2 = toksr2.(i) in
      begin
	if TerminalSet.mem Terminal.error toks1 then
	  (* [error] is a member of one set: it must be a member of the other set. *)
	  TerminalSet.mem Terminal.error toks2
	else if TerminalSet.mem Terminal.error toks2 then
	  (* Symmetric condition. *)
	  TerminalSet.mem Terminal.error toks1
	else
	  true
      end
      && loop (i+1)
  in
  loop 0

(* Union of two states. The two states must have the same core. The
   new state is obtained by pointwise union of the lookahead sets. *)

let union (k1, toksr1) (k2, toksr2) =
  assert (k1 = k2);
  k1, Array.init (Array.length toksr1) (fun i ->
    TerminalSet.union toksr1.(i) toksr2.(i)
  )

(* Restriction of a state to a set of tokens of interest. Every
   lookahead set is intersected with that set. *)

let restrict toks (k, toksr) =
  k, Array.map (fun toksri ->
    TerminalSet.inter toksri toks
  ) toksr