LRijkstra.ml 51.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

14 15 16 17 18 19 20 21 22 23 24 25 26 27
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
POTTIER Francois's avatar
POTTIER Francois committed
28 29 30 31 32 33
   on the actual shortest path to the error at [s, z]. Indeed, several
   difficulties arise, including the fact that reductions are subject to a
   lookahead hypothesis; the fact that some states have a default reduction,
   hence will never trigger an error; the fact that conflict resolution
   removes some (shift or reduce) actions, hence may suppress the shortest
   path. *)
34

35 36 37 38 39 40 41 42 43 44 45
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
46

47
(* NOTE: THIS FILE IS COMPILED WITH -noassert BY DEFAULT. If you would like
48 49
   the assertions to be tested at runtime, change that in the file _tags.
   The performance impact of the assertions is about 10%. *)
50

51 52 53
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
54
   in a big functor. The functor also serves to pass verbosity parameters. *)
55

56 57 58 59 60 61 62
module Run (X : sig
  (* If [verbose] is set, produce various messages on [stderr]. *)
  val verbose: bool
  (* If [statistics] is defined, it is interpreted as the name of
     a file to which one line of statistics is appended. *)
  val statistics: string option
end) = struct
63

64
open Grammar
65

66 67 68 69 70 71 72 73 74 75 76
(* ------------------------------------------------------------------------ *)

(* Record our start time. *)

let now () =
  match X.statistics with
  | Some _ ->
      Unix.((times()).tms_utime)
  | None ->
      0.0

77
let start =
78
  now()
79

80 81
(* ------------------------------------------------------------------------ *)

82 83
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
84 85

let () =
86
  if Terminal.n > 256 then
87
    Error.error []
88
      "--list-errors supports at most 256 terminal symbols.\n\
89 90 91 92
       The grammar has %d terminal symbols." Terminal.n

(* ------------------------------------------------------------------------ *)

93 94 95 96 97
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
98

99 100
(* ------------------------------------------------------------------------ *)

101 102
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
103

104
let non_error z =
105
  not (Terminal.equal z Terminal.error)
106

107
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
108
   later on, in particular in the [lookahead] field of a fact, to encode the
109 110
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

111 112 113 114 115
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
116
  Terminal.sharp
117

118 119
(* ------------------------------------------------------------------------ *)

120 121 122 123 124
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
125 126 127
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
128

POTTIER Francois's avatar
POTTIER Francois committed
129
let reductions_on s z : Production.index list =
130
  assert (Terminal.real z);
131 132 133 134 135
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

136 137 138
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
139 140

let has_reduction s z : Production.index option =
141
  assert (Terminal.real z);
142 143 144 145
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
146
      match reductions_on s z with
147 148 149 150 151 152
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

153 154 155 156 157 158 159 160 161
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
162
      TerminalMap.fold (fun z prods accu ->
163 164
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
165
        accu || non_error z && List.mem prod prods
166 167
      ) (Lr1.reductions s) false

168 169
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
170

171
let causes_an_error s z : bool =
172
  assert (Terminal.real z);
173 174 175 176
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
177
      reductions_on s z = [] &&
178 179
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

180
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
181
   turn, except [error] and [#]. *)
182

183 184
let foreach_terminal =
  Terminal.iter_real
185

186 187 188
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
POTTIER Francois's avatar
POTTIER Francois committed
189
   [causes_an_error]. This implementation is significantly more efficient. *)
190

191 192 193
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
194
      (* There is a default reduction. No symbol causes an error. *)
195 196
      foreach_terminal f
  | None ->
197 198 199
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
200 201 202
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
203
          f z
204
      ) (Lr1.reductions s);
205 206
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
207 208
      SymbolMap.iter (fun sym _ ->
        match sym with
209
        | Symbol.T z ->
210 211
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
212
              f z
213 214 215 216
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

217
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
POTTIER Francois's avatar
POTTIER Francois committed
218 219
   (or if it has no incoming symbol at all, i.e., it is an initial state). It
   is fragile if its incoming symbol is a non-terminal symbol. *)
220 221 222 223 224

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
POTTIER Francois's avatar
POTTIER Francois committed
225
      true
226
  | Some (Symbol.N _) ->
POTTIER Francois's avatar
POTTIER Francois committed
227
      false
228

229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
(* [reduction_path_exists s w prod] tests whether the path determined by the
   sequence of symbols [w] out of the state [s] exists in the automaton and
   leads to a state where [prod] can be reduced. It further requires [w] to
   not contain the [error] token. Finally, it it sees the [error] token, it
   sets the flag [grammar_uses_error]. *)

let grammar_uses_error =
  ref false

let rec reduction_path_exists s (w : Symbol.t list) prod : bool =
  match w with
  | [] ->
      can_reduce s prod
  | (Symbol.T t) :: _ when Terminal.equal t Terminal.error ->
      grammar_uses_error := true;
      false
  | a :: w ->
      match SymbolMap.find a (Lr1.transitions s) with
      | s ->
          reduction_path_exists s w prod
      | exception Not_found ->
          false

252 253
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
254 255 256 257 258 259 260 261 262
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
POTTIER Francois's avatar
Typo.  
POTTIER Francois committed
263
   us where we come from, where we are, and which production(s) we are hoping
POTTIER Francois's avatar
POTTIER Francois committed
264 265
   to reduce in the future. *)

266 267 268
module Trie : sig

  type trie
269 270 271 272

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
273
     edge.) If the star turns out to be trivial then [None] is returned. *)
274 275
  val star: Lr1.node -> trie option

276 277 278 279
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

280
  (* After [star] has been called a number of times, [total_size()]
281
     reports the total size of the tries that have been constructed. *)
282
  val total_size: unit -> int
283

284 285 286 287 288 289 290 291
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
292
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
293 294

  (* [current t] returns the current state of the (sub-)trie [t]. This is
295 296
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
297
  val current: trie -> Lr1.node
298

299 300 301 302 303
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
304

305 306 307 308 309 310 311
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

312 313 314 315 316 317 318
  (* Since every (sub-)trie has a unique identity, its identity can serve
     as a unique integer code for this (sub-)trie. We allow this conversion,
     both ways. This mechanism is used only as a way of saving space in the
     encoding of facts. *)
  val encode: trie -> int
  val decode: int -> trie

319
end = struct
320

321 322
  (* A trie has the following structure. *)

323
  type trie = {
324 325 326
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
327
    identity: int;
328
    (* The root state of this star: "where we come from". *)
329
    source: Lr1.node;
330
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
331
    current: Lr1.node;
332 333 334
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
335
    mutable productions: Production.index list;
336
    (* The children, or sub-tries. *)
337 338 339
    mutable transitions: trie SymbolMap.t
    (* The two fields above are written only during the construction of a
       trie. Once every trie has been constructed, they are frozen. *)
340
  }
341

342
  (* This counter is used by [mktrie] to produce unique identities. *)
343 344
  let c = ref 0

345 346 347 348 349 350 351 352
  (* We keep a mapping of integer identities to tries. Whenever a new
     identity is assigned, this mapping must be updated. *)
  let tries =
    let s : Lr1.node = Obj.magic () in (* yes, this hurts *)
    let dummy = { identity = -1; source = s; current = s;
                  productions = []; transitions = SymbolMap.empty } in
    MenhirLib.InfiniteArray.make dummy

353
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
354
  let mktrie source current productions transitions =
355
    let identity = Misc.postincrement c in
356 357 358
    let t = { identity; source; current; productions; transitions } in
    MenhirLib.InfiniteArray.set tries identity t;
    t
359

360 361 362 363 364
  (* [insert t w prod] updates the trie (in place) by adding a new branch,
     corresponding to the sequence of symbols [w], and ending with a reduction
     of production [prod]. We assume [reduction_path_exists w prod t.current]
     holds, so we need not worry about this being a dead branch, and we can
     use destructive updates without having to set up an undo mechanism. *)
365

366
  let rec insert (t : trie) (w : Symbol.t list) prod : unit =
367 368
    match w with
    | [] ->
369 370
        assert (can_reduce t.current prod);
        t.productions <- prod :: t.productions
371
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
372
        match SymbolMap.find a (Lr1.transitions t.current) with
373 374
        | exception Not_found ->
            assert false
375
        | successor ->
376 377 378 379 380
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
381 382 383
                let t' = mktrie t.source successor [] SymbolMap.empty in
                t.transitions <- SymbolMap.add a t' t.transitions;
                t'
384
            in
385 386
            (* Update our child. *)
            insert t' w prod
387

388 389 390
  (* [insert t prod] inserts a new branch, corresponding to production
     [prod], into the trie [t], which is updated in place. *)
  let insert t prod : unit =
391
    let w = Array.to_list (Production.rhs prod) in
392 393 394 395 396 397 398 399 400
    (* Check whether the path [w] leads to a state where [prod] can be
       reduced. If not, then some transition or reduction action must
       have been suppressed by conflict resolution; or the path [w]
       involves the [error] token. In that case, the branch is dead,
       and is not added. This test is superfluous (i.e., it would
       be OK to add a dead branch) but allows us to build a slightly
       smaller star in some cases. *)
    if reduction_path_exists t.current w prod then
      insert t w prod
401

402 403 404 405
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

406 407 408
  (* The star at [s] is obtained by starting with a fresh empty trie and
     inserting into it every production [prod] whose left-hand side [nt]
     is the label of an outgoing edge at [s]. *)
409
  let star s =
410 411
    let t = fresh s in
    SymbolMap.iter (fun sym _ ->
412 413
      match sym with
      | Symbol.T _ ->
414
          ()
415
      | Symbol.N nt ->
416 417 418
          Production.iternt nt (insert t)
    ) (Lr1.transitions s);
    t
419

420 421 422 423 424 425 426
  (* A trie [t] is nontrivial if it has at least one branch, i.e., contains at
     least one sub-trie whose [productions] field is nonempty. Trivia: a trie
     of size greater than 1 is necessarily nontrivial, but the converse is not
     true: a nontrivial trie can have size 1. (This occurs if all productions
     have zero length.) *)
  let trivial t =
    t.productions = [] && SymbolMap.is_empty t.transitions
427

428 429
  (* Redefine [star] to include a [nontrivial] test and to record the size of
     the newly built trie. *)
430 431 432 433

  let size =
    Array.make Lr1.n (-1)

434
  let star s =
435
    let initial = !c in
436
    let t = star s in
437 438
    let final = !c in
    size.(Lr1.number s) <- final - initial;
439
    if trivial t then None else Some t
440

441 442 443 444
  let size s =
    assert (size.(s) >= 0);
    size.(s)

445
  let total_size () =
446 447
    !c

POTTIER Francois's avatar
POTTIER Francois committed
448
  let compare t1 t2 =
449
    Pervasives.compare t1.identity t2.identity
450

451 452 453
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
454 455
  let current t =
    t.current
456

457 458 459
  let accepts prod t =
    List.mem prod t.productions

460
  let step a t =
461
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
462

463
  let verbose () =
464
    Printf.eprintf "Total star size: %d\n%!" (total_size())
465

466 467 468 469 470 471 472 473 474
  let decode i =
    let t = MenhirLib.InfiniteArray.get tries i in
    assert (t.identity = i); (* ensure we do not get the [dummy] trie *)
    t

  let encode t =
    assert (decode t.identity == t); (* round-trip property *)
    t.identity

475 476
end

POTTIER Francois's avatar
POTTIER Francois committed
477 478
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
479
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
480 481 482 483
   [position] (that is, a sub-trie), a [word], and a [lookahead] assumption.
   Such a fact means that this [position] can be reached, from the source
   state [Trie.source position], by consuming [word], under the assumption
   that the next input symbol is [lookahead]. *)
POTTIER Francois's avatar
POTTIER Francois committed
484

485
(* We allow [lookahead] to be [any] so as to indicate that this fact does
486
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
487

488 489
(*

490
type fact = {
491
  position: Trie.trie;
492
  word: W.word;
493
  lookahead: Terminal.t (* may be [any] *)
494 495
}

496 497 498 499
*)

(* To save memory (and therefore time), we encode a fact in a single OCaml
   integer value. This is made possible by the fact that tries, words, and
500
   terminal symbols are represented as (or can be encoded as) integers.
501 502 503 504 505
   This admittedly horrible hack allows us to save roughly a factor of 2
   in space, and to gain 10% in time. *)

type fact = int

506 507 508
let dummy : fact =
  -1 (* should never be accessed! *)

509 510
(* Encoding and decoding facts. *)

511
(* We encode [position|word|lookahead] in a single word of memory. *)
512

513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537
(* The lookahead symbol fits in 8 bits. *)

(* In the largest grammars that we have seen, the number of unique words is
   about 3.10^5, so a word should fit in about 19 bits (2^19 = 524288). In the
   largest grammars that we have seen, the total star size is about 64000, so a
   trie should fit in about 17 bits (2^17 = 131072). *)

(* On a 64-bit machine, we have ample space in a 63-bit word! We allocate 30
   bits for [word] and the rest (i.e., 25 bits) for [position]. *)

(* On a 32-bit machine, we are a bit more cramped! In Menhir's own fancy-parser,
   the number of terminal symbols is 27, the number of unique words is 566, and
   the total star size is 546. We allocate 12 bits for [word] and 11 bits for
   [position]. This is better than refusing to work altogether, but still not
   great. A more satisfactory approach might be to revert to heap allocation of
   facts when in 32-bit mode, but that would make the code somewhat ugly. *)

let w_lookahead =
  8

let w_word =
  if Sys.word_size < 64 then 12 else 30

let w_position  =
  Sys.word_size - 1 - (w_word + w_lookahead) (* 25, on a 64-bit machine *)
538

539
let identity (fact : fact) : int =
540
  assert (fact <> dummy);
541
  fact lsr (w_word + w_lookahead)
542

543
let position (fact : fact) : Trie.trie =
544
  assert (fact <> dummy);
545
  Trie.decode (identity fact)
546 547

let word (fact : fact) : W.word =
548
  assert (fact <> dummy);
549
  (fact lsr w_lookahead) land (1 lsl w_word - 1)
550 551

let lookahead (fact : fact) : Terminal.t =
552
  Terminal.i2t (fact land (1 lsl w_lookahead - 1))
553 554 555 556 557 558

let mkfact position (word : W.word) lookahead =
  let position : int = Trie.encode position
  and word : int = word
  and lookahead : int = Terminal.t2i lookahead in
  assert (0 <= position && 0 <= word && 0 <= lookahead);
559 560 561 562
  assert (lookahead < 1 lsl w_lookahead);
  if position < 1 lsl w_position && word < 1 lsl w_word then
    (* [lsl] binds tighter than [lor] *)
    (position lsl w_word lor word) lsl w_lookahead lor lookahead
563
  else
564 565 566 567 568 569
    let advice =
       if Sys.word_size < 64 then
         "Please use a 64-bit machine."
       else
         "Please report this error to Menhir's developers."
    in
570 571
    Error.error []
      "an internal limit was exceeded.\n\
572
       Sys.word_size = %d. Position = %d. Word = %d.\n\
573
       %s%!"
574
      Sys.word_size position word advice
575 576 577 578 579 580 581 582

let mkfact p w l =
  let fact = mkfact p w l in
  assert (word fact == w);      (* round-trip property *)
  assert (lookahead fact == l); (* round-trip property *)
  assert (position fact == p);  (* round-trip property *)
  fact

583 584
(* Two invariants reduce the number of facts that we consider:

585 586
   1. If [lookahead] is a real terminal symbol [z] (i.e., not [any]),
      then [z] does not cause an error in the [current] state.
587
      It would be useless to consider a fact that violates this property;
POTTIER Francois's avatar
POTTIER Francois committed
588 589 590
      this cannot possibly lead to a successful reduction. In practice,
      this refinement allows reducing the number of facts that go through
      the queue by a factor of two.
591

592
   2. [lookahead] is [any] iff the [current] state is
593 594
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
595
      and simplifies the implementation of the sub-module [F].
596 597 598

*)

599 600
let invariant1 position _word lookahead =
  let current = Trie.current position in
601
  lookahead = any || not (causes_an_error current lookahead)
602

603 604
let invariant2 position _word lookahead =
  let current = Trie.current position in
605
  (lookahead = any) = is_solid current
606 607 608 609 610

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
611 612
  assert (non_error z);
  assert (Terminal.real a);
613 614
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
615
(* ------------------------------------------------------------------------ *)
616

617
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
618
   examination. The length of [word fact] serves as the priority of a fact.
619 620 621 622 623 624 625 626 627 628 629
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
630
  Q.create dummy
631

632
(* In principle, there is no need to insert the fact into the queue if [F]
633
   already stores a comparable fact. We could perform this test in [enqueue].
POTTIER Francois's avatar
POTTIER Francois committed
634
   However, a few experiments suggests that this is not worthwhile. The run
635
   time augments (because membership in [F] is tested twice, upon inserting
636 637 638
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

639 640 641 642 643
let enqueue position word lookahead =
  (* [lookahead] can be [any], but cannot be [error] *)
  assert (non_error lookahead);
  assert (invariant1 position word lookahead);
  assert (invariant2 position word lookahead);
644
  (* The length of [word] serves as the priority of this fact. *)
645 646 647
  let priority = W.length word in
  (* Encode and enqueue this fact. *)
  Q.add q (mkfact position word lookahead) priority
648

POTTIER Francois's avatar
POTTIER Francois committed
649 650
(* ------------------------------------------------------------------------ *)

651 652 653
(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
POTTIER Francois's avatar
POTTIER Francois committed
654
  (* For every state [s]... *)
655
  Lr1.iter (fun s ->
POTTIER Francois's avatar
POTTIER Francois committed
656
    (* If the trie rooted at [s] is nontrivial...*)
657
    match Trie.star s with
POTTIER Francois's avatar
POTTIER Francois committed
658 659 660 661 662 663 664 665 666 667
    | None ->
        ()
    | Some position ->
        (* ...then insert an initial fact into the priority queue. *)
        (* In order to respect invariants 1 and 2, we must distinguish two
           cases. If [s] is solid, then we insert a single fact, whose
           lookahead assumption is [any]. Otherwise, we must insert one
           initial fact for every terminal symbol [z] that does not cause
           an error in state [s]. *)
        let word = W.epsilon in
668
        if is_solid s then
669
          enqueue position word any
670 671
        else
          foreach_terminal_not_causing_an_error s (fun z ->
672
            enqueue position word z
673
          )
POTTIER Francois's avatar
POTTIER Francois committed
674 675 676
  );
  if X.verbose then
    Trie.verbose()
677

678 679 680 681 682 683 684
(* Produce a warning if the grammar uses the [error] pseudo-token. *)

let () =
  if !grammar_uses_error then
    Error.warning []
      "--list-errors ignores all productions that involve the error token."

685 686
(* ------------------------------------------------------------------------ *)

687
(* The module [F] maintains a set of known facts. *)
688 689

(* Three aspects of a fact are of particular interest:
690 691 692
   - its position [position], given by [position fact];
   - its first symbol [a], given by [W.first (word fact) (lookahead fact)];
   - its lookahead assumption [z], given by [lookahead fact].
693 694 695 696 697

   For every triple of [position], [a], and [z], we store at most one fact,
   (whose word has minimal length). Indeed, we are not interested in keeping
   track of several words that produce the same effect. Only the shortest such
   word is of interest.
698

699 700 701 702 703 704
   Thus, the total number of facts accumulated by the algorithm is at most
   [T.n^2], where [T] is the total size of the tries that we have constructed,
   and [n] is the number of terminal symbols. (This number can be quite large.
   [T] can be in the tens of thousands, and [n] can be over one hundred. These
   figures lead to a theoretical upper bound of 100M. In practice, for T=25K
   and n=108, we observe that the algorithm gathers about 7M facts.) *)
705

706
module F : sig
707 708

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
709 710
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
711 712
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
713
  (* [query current z f] enumerates all known facts whose current state is
714 715
     [current] and whose lookahead assumption is compatible with [z]. The
     symbol [z] must a real terminal symbol, i.e., cannot be [any]. *)
716
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
717

718 719 720
  (* [size()] returns the number of facts currently stored in the set. *)
  val size: unit -> int

721
  (* [verbose()] outputs debugging & performance information. *)
722
  val verbose: unit -> unit
723

724
end = struct
725

726 727 728 729 730 731 732 733 734 735 736
  (* We need to query the set of facts in two ways. In [register], we must test
     whether a proposed triple of [position], [a], [z] already appears in the
     set. In [query], we must find all facts that match a pair [current, z],
     where [current] is a state. (Note that [position] determines [current], but
     the converse is not true: a position contains more information besides the
     current state.)

     To address these needs, we use a two-level table. The first level is a
     matrix indexed by [current] and [z]. At the second level, we find sets of
     facts, where two facts are considered equal if they have the same triple of
     [position], [a], and [z]. In fact, we know at this level that all facts
737 738 739
     have the same [z] component, so only [position] and [a] are compared.

     Because our facts satisfy invariant 2, [z] is [any] if and only if the
740
     state [current] is solid. This means that we are wasting quite a
741 742
     lot of space in the matrix (for a solid state, the whole line is empty,
     except for the [any] column). *)
743

744
  (* The level-2 sets. *)
745 746

  module M =
747
    MySet.Make(struct
748 749
      type t = fact
      let compare fact1 fact2 =
750
        assert (lookahead fact1 = lookahead fact2);
751 752 753 754 755
        (* Compare the two positions first. This can be done without going
           through [Trie.decode], by directly comparing the two integer
           identities. *)
        let c = Pervasives.compare (identity fact1) (identity fact2) in
        assert (c = Trie.compare (position fact1) (position fact2));
756
        if c <> 0 then c else
757 758 759
        let z = lookahead fact1 in
        let a1 = W.first (word fact1) z
        and a2 = W.first (word fact2) z in
760
        (* note: [a1] and [a2] can be [any] here *)
761 762
        Terminal.compare a1 a2
    end)
763

764 765 766
  (* The level-1 matrix. *)

  let table =
767
    Array.make (Lr1.n * Terminal.n) M.empty
768

POTTIER Francois's avatar
POTTIER Francois committed
769
  let index current z =
770
    Terminal.n * (Lr1.number current) + Terminal.t2i z
771

772 773
  let count = ref 0

774
  let register fact =
775
    let current = Trie.current (position fact) in
776
    let z = lookahead fact in
POTTIER Francois's avatar
POTTIER Francois committed
777
    let i = index current z in
778 779 780 781 782 783 784 785 786 787 788 789
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
790
  let query current z f =
791 792 793 794 795
    assert (not (Terminal.equal z any));
    (* If the state [current] is solid then the facts that concern it are
       stored in the column [any], and all of them are compatible with [z].
       Otherwise, they are stored in all columns except [any], and only
       those stored in the column [z] are compatible with [z]. *)
796
    let i = index current (if is_solid current then any else z) in
797 798
    let m = table.(i) in
    M.iter f m
799

800 801 802
  let size () =
    !count

803
  let verbose () =
804
    Printf.eprintf "F stores %d facts.\n%!" (size())
805

806 807
end

POTTIER Francois's avatar
POTTIER Francois committed
808 809
(* ------------------------------------------------------------------------ *)

810 811
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
POTTIER Francois's avatar
POTTIER Francois committed
812
   taken.
813

POTTIER Francois's avatar
POTTIER Francois committed
814 815 816 817 818 819 820 821 822 823
   It maintains a set of quadruples [s, nt, w, z], where such a quadruple means
   that in the state [s], the outgoing edge labeled [nt] can be taken by
   consuming the word [w], under the assumption that the next symbol is [z].

   Again, the terminal symbol [a], given by [W.first w z], plays a role. For
   each quadruple [s, nt, a, z], we store at most one quadruple [s, nt, w, z].
   Thus, internally, we maintain a mapping of [s, nt, a, z] to [w].

   For greater simplicity, we do not allow [z] to be [any] in [register] or
   [query]. Allowing it would complicate things significantly, it seems. *)
824 825 826 827

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
828
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
POTTIER Francois's avatar
POTTIER Francois committed
829 830
     It returns [true] if this information is new, i.e., if the underlying
     quadruple [s, nt, a, z] is new. The symbol [z] cannot be [any]. *)
831
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
832

833
  (* [query s nt a foreach] enumerates all words [w] and all real symbols [z]
834 835 836
     such that, in state [s], the outgoing edge labeled [nt] can be taken by
     consuming the word [w], under the assumption that the next symbol is [z],
     and the first symbol of the word [w.z] is [a]. The symbol [a] can be [any].
837 838 839 840 841
     The function [foreach] can be either [foreach_terminal] or of the form
     [foreach_terminal_not_causing_an_error _]. It limits the symbols [z] that
     are considered. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t ->
             (* foreach: *) ((Terminal.t -> unit) -> unit) ->
842
             (W.word -> Terminal.t -> unit) -> unit
843

844 845 846
  (* [size()] returns the number of edges currently stored in the set. *)
  val size: unit -> int

POTTIER Francois's avatar
POTTIER Francois committed
847
  (* [verbose()] outputs debugging & performance information. *)
848
  val verbose: unit -> unit
849

850 851
end = struct

852 853 854 855 856 857 858 859 860 861
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
862

863
  module H = Hashtbl
864

POTTIER Francois's avatar
POTTIER Francois committed
865 866
  let table =
    Array.init Lr1.n (fun i ->
867 868 869
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
870 871 872

  let index s =
    Lr1.number s
873

874
  let pack nt a z : int =
POTTIER Francois's avatar
POTTIER Francois committed
875
    (* We rely on the fact that we have at most 256 terminal symbols. *)
876 877 878
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
879

880 881
  let count = ref 0

882
  let register s nt w z =
883
    assert (Terminal.real z);
884
    let i = index s in
885
    let m = table.(i) in
886
    let a = W.first w z in
POTTIER Francois's avatar
POTTIER Francois committed
887
    (* Note that looking at [a] in state [s] cannot cause an error. *)
888
    assert (not (causes_an_error s a));
889 890 891 892
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
893
      incr count;
894
      H.add m key w;
895 896
      true
    end
897

898
  let rec query s nt a foreach f =
POTTIER Francois's avatar
POTTIER Francois committed
899 900
    if Terminal.equal a any then begin
      (* If [a] is [any], we query the table for every real symbol [a].
901 902 903 904
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
905
        query s nt a foreach f
906
      )
POTTIER Francois's avatar
POTTIER Francois committed
907
    end
908
    else
909 910
      let i = index s in
      let m = table.(i) in
911 912 913 914 915 916 917
      foreach (fun z ->
        assert (Terminal.real z);
        let key = pack nt a z in
        match H.find m key with
        | w -> f w z
        | exception Not_found -> ()
      )
918

919 920 921
  let size () =
    !count

922
  let verbose () =
923
    Printf.eprintf "E stores %d edges.\n%!" (size())
924

925 926
end

POTTIER Francois's avatar
POTTIER Francois committed
927 928
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
929 930 931 932 933 934 935
(* [new_edge s nt w z] is invoked when we discover that in the state [s], the
   outgoing edge labeled [nt] can be taken by consuming the word [w], under
   the assumption that the next symbol is [z]. We check whether this quadruple
   already exists in the set [E]. If not, then we add it, and we compute its
   consequences, in the form of new facts, which we insert into the priority
   queue for later examination. *)

936
let new_edge s nt w z =
937
  assert (Terminal.real z);
938
  if E.register s nt w z then
939
    let sym = Symbol.N nt in
940
    (* Query [F] for existing facts which could be extended by following
POTTIER Francois's avatar
POTTIER Francois committed
941 942 943
       this newly discovered edge. They must be facts whose current state
       is [s] and whose lookahead assumption is compatible with [a]. For
       each such fact, ... *)
944
    F.query s (W.first w z) (fun fact ->
945
      assert (compatible (lookahead fact) (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
946
      (* ... try to take one step in the trie along an edge labeled [nt]. *)
947
      match Trie.step sym (position fact) with
948
      | position ->
POTTIER Francois's avatar
POTTIER Francois committed
949
          (* This takes us to a new state whose incoming symbol is [nt].
POTTIER Francois's avatar
POTTIER Francois committed
950 951 952 953 954
             Hence, this state is not solid. In order to satisfy invariant 2,
             we must create fact whose lookahead assumption is not [any].
             That's fine, since our lookahead assumption is [z]. In order to
             satisfy invariant 1, we must check that [z] does not cause an
             error in this state. *)
955
          assert (not (is_solid (Trie.current position)));
956
          if not (causes_an_error (Trie.current position) z) then
957
            let word = W.append (word fact) w in
958
            enqueue position word z
959
      | exception Not_found ->
POTTIER Francois's avatar
POTTIER Francois committed
960 961 962
          (* Could not take a step in the trie. This means this branch
             leads nowhere of interest, and was pruned when the trie
             was constructed. *)
963
          ()
964
    )
965

POTTIER Francois's avatar
POTTIER Francois committed
966 967
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
968 969
(* [new_fact fact] is invoked when we discover a new fact (i.e., one that was
   not previously known). It studies the consequences of this fact. These
970 971 972
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
POTTIER Francois's avatar
POTTIER Francois committed
973 974 975 976
     discovered vertex. We study its (currently known) outgoing edges,
     and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge. This is
977 978
     the case when the word that took us from [source] to [current]
     represents a production of the grammar and [current] is willing to
POTTIER Francois's avatar
POTTIER Francois committed
979 980 981
     reduce this production. We record the existence of this edge, and
     re-inspect any previously discovered vertices which are interested in
     this outgoing edge. *)
982

POTTIER Francois's avatar
POTTIER Francois committed
983
let new_fact fact =
984

985 986 987
  (* Throughout this rather long function, there is just one [fact]. Let's
     name its components right now, so as to avoid accessing them several
     times. (That could be costly, as it requires decoding the fact.) *)
988
  let position = position fact
989 990 991 992
  and lookahead = lookahead fact
  and word = word fact in
  let source = Trie.source position
  and current = Trie.current position in
993

POTTIER Francois's avatar
POTTIER Francois committed
994 995 996
  (* 1. View [fact] as a vertex. Examine the transitions out of [current].
     For every transition labeled by a symbol [sym] and into a state
     [target], ... *)
997

POTTIER Francois's avatar
POTTIER Francois committed
998
  Lr1.transitions current |> SymbolMap.iter (fun sym target ->
999 1000 1001
    (* ... try to follow this transition in the trie [position],
       down to a child which we call [child]. *)
    match Trie.step sym position, sym with
POTTIER Francois's avatar
POTTIER Francois committed
1002 1003 1004 1005 1006 1007 1008

    | exception Not_found ->

        (* Could not take a step in the trie. This means this transition
           leads nowhere of interest. *)
        ()

1009
    | child, Symbol.T t ->
1010

POTTIER Francois's avatar
POTTIER Francois committed
1011 1012 1013
        (* 1a. The transition exists in the trie, and [sym] is in fact a
           terminal symbol [t]. We note that [t] cannot be the [error] token,
           because the trie does not have any edges labeled [error]. *)
1014
        assert (Lr1.Node.compare (Trie.current child) target = 0);
POTTIER Francois's avatar
POTTIER Francois committed
1015
        assert (is_solid target);
1016
        assert (non_error t);
1017

1018
        (* If the lookahead assumption [lookahead] is compatible with
POTTIER Francois's avatar
POTTIER Francois committed
1019 1020
           [t], then we derive a new fact, where one more edge has been taken,
           and enqueue this new fact for later examination. *)
1021

POTTIER Francois's avatar
POTTIER Francois committed
1022 1023 1024 1025 1026 1027
        (* The state [target] is solid, i.e., its incoming symbol is terminal.
           This state is always entered without consideration for the next
           lookahead symbol. Thus, we can use [any] as the lookahead assumption
           in the new fact that we produce. If we did not have [any], we would
           have to produce one fact for every possible lookahead symbol. *)

1028 1029 1030
        if compatible lookahead t then
          let word = W.append word (W.singleton t) in
          enqueue child word any
1031

1032
    | child, Symbol.N nt ->
1033

POTTIER Francois's avatar
POTTIER Francois committed
1034 1035
        (* 1b. The transition exists in the trie, and [sym] is in fact a
           nonterminal symbol [nt]. *)
POTTIER Francois's avatar
POTTIER Francois committed
1036 1037
        assert (Lr1.Node.compare (Trie.current child) target = 0);
        assert (not (is_solid target));
POTTIER Francois's avatar
POTTIER Francois committed
1038 1039 1040 1041 1042 1043

        (* We need to know how this nonterminal edge can be taken. We query
           [E] for a word [w] that allows us to take this edge. In general,
           the answer depends on the terminal symbol [z] that comes *after*
           this word: we try all such symbols. We must make sure that the
           first symbol of the word [w.z] satisfies the lookahead assumption
1044
           [lookahead]; this is ensured by passing this information to
POTTIER Francois's avatar
POTTIER Francois committed
1045 1046 1047 1048
           [E.query]. *)

        (* It could be the case that, due to a default reduction, the answer
           to our query does not depend on [z], and we are wasting work.
1049
           However, allowing [z] to be [any] in [E.query], and taking
POTTIER Francois's avatar
POTTIER Francois committed
1050 1051
           advantage of this to increase performance, seems difficult. *)

1052 1053
        let foreach = foreach_terminal_not_causing_an_error target in
        E.query current nt lookahead foreach (fun w z ->
1054 1055