LRijkstra.ml 49.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
POTTIER Francois's avatar
POTTIER Francois committed
15 16 17 18 19 20
   on the actual shortest path to the error at [s, z]. Indeed, several
   difficulties arise, including the fact that reductions are subject to a
   lookahead hypothesis; the fact that some states have a default reduction,
   hence will never trigger an error; the fact that conflict resolution
   removes some (shift or reduce) actions, hence may suppress the shortest
   path. *)
21

22 23 24 25 26 27 28 29 30 31 32
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
33

34
(* NOTE: THIS FILE IS COMPILED WITH -noassert BY DEFAULT. If you would like
35 36
   the assertions to be tested at runtime, change that in the file _tags.
   The performance impact of the assertions is about 10%. *)
37

38 39 40
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
41
   in a big functor. The functor also serves to pass verbosity parameters. *)
42

43 44 45 46 47 48 49
module Run (X : sig
  (* If [verbose] is set, produce various messages on [stderr]. *)
  val verbose: bool
  (* If [statistics] is defined, it is interpreted as the name of
     a file to which one line of statistics is appended. *)
  val statistics: string option
end) = struct
50

51
open Grammar
52

53 54 55 56 57 58 59 60 61 62 63
(* ------------------------------------------------------------------------ *)

(* Record our start time. *)

let now () =
  match X.statistics with
  | Some _ ->
      Unix.((times()).tms_utime)
  | None ->
      0.0

64
let start =
65
  now()
66

67 68
(* ------------------------------------------------------------------------ *)

69 70
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
71 72

let () =
73
  if Terminal.n > 256 then
74
    Error.error [] (Printf.sprintf
75
      "--list-errors supports at most 256 terminal symbols.\n\
76 77 78 79 80
       The grammar has %d terminal symbols." Terminal.n
    )

(* ------------------------------------------------------------------------ *)

81 82 83 84 85
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
86

87 88
(* ------------------------------------------------------------------------ *)

89 90
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
91

92
let non_error z =
93
  not (Terminal.equal z Terminal.error)
94

95 96 97 98
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

99 100 101 102 103
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
104
  Terminal.sharp
105

106 107
(* ------------------------------------------------------------------------ *)

108 109 110 111 112
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
113 114 115
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
116

POTTIER Francois's avatar
POTTIER Francois committed
117
let reductions_on s z : Production.index list =
118
  assert (Terminal.real z);
119 120 121 122 123
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

124 125 126
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
127 128

let has_reduction s z : Production.index option =
129
  assert (Terminal.real z);
130 131 132 133
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
134
      match reductions_on s z with
135 136 137 138 139 140
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

141 142 143 144 145 146 147 148 149
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
150
      TerminalMap.fold (fun z prods accu ->
151 152
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
153
        accu || non_error z && List.mem prod prods
154 155
      ) (Lr1.reductions s) false

156 157
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
158

159
let causes_an_error s z : bool =
160
  assert (Terminal.real z);
161 162 163 164
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
165
      reductions_on s z = [] &&
166 167
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

168
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
169
   turn, except [error] and [#]. *)
170

171 172
let foreach_terminal =
  Terminal.iter_real
173

174 175 176
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
POTTIER Francois's avatar
POTTIER Francois committed
177
   [causes_an_error]. This implementation is significantly more efficient. *)
178

179 180 181
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
182
      (* There is a default reduction. No symbol causes an error. *)
183 184
      foreach_terminal f
  | None ->
185 186 187
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
188 189 190
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
191
          f z
192
      ) (Lr1.reductions s);
193 194
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
195 196
      SymbolMap.iter (fun sym _ ->
        match sym with
197
        | Symbol.T z ->
198 199
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
200
              f z
201 202 203 204
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

205
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
POTTIER Francois's avatar
POTTIER Francois committed
206 207
   (or if it has no incoming symbol at all, i.e., it is an initial state). It
   is fragile if its incoming symbol is a non-terminal symbol. *)
208 209 210 211 212

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
POTTIER Francois's avatar
POTTIER Francois committed
213
    true
214
  | Some (Symbol.N _) ->
POTTIER Francois's avatar
POTTIER Francois committed
215
    false
216

217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
(* [reduction_path_exists s w prod] tests whether the path determined by the
   sequence of symbols [w] out of the state [s] exists in the automaton and
   leads to a state where [prod] can be reduced. It further requires [w] to
   not contain the [error] token. Finally, it it sees the [error] token, it
   sets the flag [grammar_uses_error]. *)

let grammar_uses_error =
  ref false

let rec reduction_path_exists s (w : Symbol.t list) prod : bool =
  match w with
  | [] ->
      can_reduce s prod
  | (Symbol.T t) :: _ when Terminal.equal t Terminal.error ->
      grammar_uses_error := true;
      false
  | a :: w ->
      match SymbolMap.find a (Lr1.transitions s) with
      | s ->
          reduction_path_exists s w prod
      | exception Not_found ->
          false

240 241
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
242 243 244 245 246 247 248 249 250
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
POTTIER Francois's avatar
Typo.  
POTTIER Francois committed
251
   us where we come from, where we are, and which production(s) we are hoping
POTTIER Francois's avatar
POTTIER Francois committed
252 253
   to reduce in the future. *)

254 255 256
module Trie : sig

  type trie
257 258 259 260

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
261
     edge.) If the star turns out to be trivial then [None] is returned. *)
262 263
  val star: Lr1.node -> trie option

264 265 266 267
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

268 269 270 271
  (* After [star] has been called a number of times, [cumulated_size()]
     reports the total size of the tries that have been constructed. *)
  val cumulated_size: unit -> int

272 273 274 275 276 277 278 279
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
280
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
281 282

  (* [current t] returns the current state of the (sub-)trie [t]. This is
283 284
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
285
  val current: trie -> Lr1.node
286

287 288 289 290 291
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
292

293 294 295 296 297 298 299
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

300 301 302 303 304 305 306
  (* Since every (sub-)trie has a unique identity, its identity can serve
     as a unique integer code for this (sub-)trie. We allow this conversion,
     both ways. This mechanism is used only as a way of saving space in the
     encoding of facts. *)
  val encode: trie -> int
  val decode: int -> trie

307
end = struct
308

309 310
  (* A trie has the following structure. *)

311
  type trie = {
312 313 314
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
315
    identity: int;
316
    (* The root state of this star: "where we come from". *)
317
    source: Lr1.node;
318
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
319
    current: Lr1.node;
320 321 322
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
323
    mutable productions: Production.index list;
324
    (* The children, or sub-tries. *)
325 326 327
    mutable transitions: trie SymbolMap.t
    (* The two fields above are written only during the construction of a
       trie. Once every trie has been constructed, they are frozen. *)
328
  }
329

330
  (* This counter is used by [mktrie] to produce unique identities. *)
331 332
  let c = ref 0

333 334 335 336 337 338 339 340
  (* We keep a mapping of integer identities to tries. Whenever a new
     identity is assigned, this mapping must be updated. *)
  let tries =
    let s : Lr1.node = Obj.magic () in (* yes, this hurts *)
    let dummy = { identity = -1; source = s; current = s;
                  productions = []; transitions = SymbolMap.empty } in
    MenhirLib.InfiniteArray.make dummy

341
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
342
  let mktrie source current productions transitions =
343
    let identity = Misc.postincrement c in
344 345 346
    let t = { identity; source; current; productions; transitions } in
    MenhirLib.InfiniteArray.set tries identity t;
    t
347

348 349 350 351 352
  (* [insert t w prod] updates the trie (in place) by adding a new branch,
     corresponding to the sequence of symbols [w], and ending with a reduction
     of production [prod]. We assume [reduction_path_exists w prod t.current]
     holds, so we need not worry about this being a dead branch, and we can
     use destructive updates without having to set up an undo mechanism. *)
353

354
  let rec insert (t : trie) (w : Symbol.t list) prod : unit =
355 356
    match w with
    | [] ->
357 358
        assert (can_reduce t.current prod);
        t.productions <- prod :: t.productions
359
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
360
        match SymbolMap.find a (Lr1.transitions t.current) with
361 362
        | exception Not_found ->
            assert false
363
        | successor ->
364 365 366 367 368
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
369 370 371
                let t' = mktrie t.source successor [] SymbolMap.empty in
                t.transitions <- SymbolMap.add a t' t.transitions;
                t'
372
            in
373 374
            (* Update our child. *)
            insert t' w prod
375

376 377 378
  (* [insert t prod] inserts a new branch, corresponding to production
     [prod], into the trie [t], which is updated in place. *)
  let insert t prod : unit =
379
    let w = Array.to_list (Production.rhs prod) in
380 381 382 383 384 385 386 387 388
    (* Check whether the path [w] leads to a state where [prod] can be
       reduced. If not, then some transition or reduction action must
       have been suppressed by conflict resolution; or the path [w]
       involves the [error] token. In that case, the branch is dead,
       and is not added. This test is superfluous (i.e., it would
       be OK to add a dead branch) but allows us to build a slightly
       smaller star in some cases. *)
    if reduction_path_exists t.current w prod then
      insert t w prod
389

390 391 392 393
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

394 395 396
  (* The star at [s] is obtained by starting with a fresh empty trie and
     inserting into it every production [prod] whose left-hand side [nt]
     is the label of an outgoing edge at [s]. *)
397
  let star s =
398 399
    let t = fresh s in
    SymbolMap.iter (fun sym _ ->
400 401
      match sym with
      | Symbol.T _ ->
402
          ()
403
      | Symbol.N nt ->
404 405 406
          Production.iternt nt (insert t)
    ) (Lr1.transitions s);
    t
407

408 409 410 411 412 413 414
  (* A trie [t] is nontrivial if it has at least one branch, i.e., contains at
     least one sub-trie whose [productions] field is nonempty. Trivia: a trie
     of size greater than 1 is necessarily nontrivial, but the converse is not
     true: a nontrivial trie can have size 1. (This occurs if all productions
     have zero length.) *)
  let trivial t =
    t.productions = [] && SymbolMap.is_empty t.transitions
415

416 417
  (* Redefine [star] to include a [nontrivial] test and to record the size of
     the newly built trie. *)
418 419 420 421

  let size =
    Array.make Lr1.n (-1)

422
  let star s =
423
    let initial = !c in
424
    let t = star s in
425 426
    let final = !c in
    size.(Lr1.number s) <- final - initial;
427
    if trivial t then None else Some t
428

429 430 431 432
  let size s =
    assert (size.(s) >= 0);
    size.(s)

433 434 435
  let cumulated_size () =
    !c

POTTIER Francois's avatar
POTTIER Francois committed
436
  let compare t1 t2 =
437
    Pervasives.compare t1.identity t2.identity
438

439 440 441
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
442 443
  let current t =
    t.current
444

445 446 447
  let accepts prod t =
    List.mem prod t.productions

448
  let step a t =
449
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
450

451
  let verbose () =
452
    Printf.eprintf "Cumulated star size: %d\n%!" (cumulated_size())
453

454 455 456 457 458 459 460 461 462
  let decode i =
    let t = MenhirLib.InfiniteArray.get tries i in
    assert (t.identity = i); (* ensure we do not get the [dummy] trie *)
    t

  let encode t =
    assert (decode t.identity == t); (* round-trip property *)
    t.identity

463 464
end

POTTIER Francois's avatar
POTTIER Francois committed
465 466
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
467 468 469 470 471 472
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

473 474
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
475

476 477
(*

478
type fact = {
479
  position: Trie.trie;
480
  word: W.word;
481
  lookahead: Terminal.t (* may be [any] *)
482 483
}

484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541
*)

(* To save memory (and therefore time), we encode a fact in a single OCaml
   integer value. This is made possible by the fact that tries, words, and
   terminal symbols are represented as (or can be encoded as) integers. 
   This admittedly horrible hack allows us to save roughly a factor of 2
   in space, and to gain 10% in time. *)

type fact = int

(* Encoding and decoding facts. *)

(* The lookahead symbol fits in 8 bits. In the largest grammars that we have
   seen, the number of unique words is about 3.10^5, so a word should fit in
   about 19 bits (2^19 = 524288). In the largest grammars that we have seen,
   the cumulated star size is about 64000, so a trie should fit in about 17
   bits (2^17 = 131072). We have ample space in a 63-bit word! We allocate 8
   bits for [lookahead], 30 bits for [word], and 25 bits for [position]. We
   could support 32-bit machines too, but that is probably pointless. *)

let () =
  if Sys.word_size < 64 then
    Error.error [] (Printf.sprintf
      "--list-errors requires a 64-bit machine.\n\
       You are using a %d-bit machine." Sys.word_size
    )

let position (fact : fact) : Trie.trie =
  Trie.decode (fact lsr 38)

let word (fact : fact) : W.word =
  (fact lsr 8) land (1 lsl 30 - 1)

let lookahead (fact : fact) : Terminal.t =
  Terminal.i2t (fact land (1 lsl 8 - 1))

let mkfact position (word : W.word) lookahead =
  let position : int = Trie.encode position
  and word : int = word
  and lookahead : int = Terminal.t2i lookahead in
  assert (0 <= position && 0 <= word && 0 <= lookahead);
  assert (lookahead < 1 lsl 8);
  if position < 1 lsl 25 && word < 1 lsl 30 then
    (position lsl 38) lor (word lsl 8) lor lookahead
  else
    Error.error [] (Printf.sprintf
      "Internal error: a hardwired limit was exceeded.\n\
       Position = %d. Word = %d.\n\
       Please report this error to Menhir's developers.\n%!"
      position word)

let mkfact p w l =
  let fact = mkfact p w l in
  assert (word fact == w);      (* round-trip property *)
  assert (lookahead fact == l); (* round-trip property *)
  assert (position fact == p);  (* round-trip property *)
  fact

542 543
(* Accessors. *)

544
let source fact =
545
  Trie.source (position fact)
546

POTTIER Francois's avatar
POTTIER Francois committed
547
let current fact =
548
  Trie.current (position fact)
549

550 551
(* Two invariants reduce the number of facts that we consider:

552 553 554
   1. If [fact.lookahead] is a real terminal symbol [z] (i.e., not [any]),
      then [z] does not cause an error in the current state [current fact].
      It would be useless to consider a fact that violates this property;
POTTIER Francois's avatar
POTTIER Francois committed
555 556 557
      this cannot possibly lead to a successful reduction. In practice,
      this refinement allows reducing the number of facts that go through
      the queue by a factor of two.
558 559 560 561

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
562
      and simplifies the implementation of the sub-module [F].
563 564 565 566

*)

let invariant1 fact =
567 568 569
  let current = current fact
  and lookahead = lookahead fact in
  lookahead = any || not (causes_an_error current lookahead)
570 571

let invariant2 fact =
572 573 574
  let current = current fact
  and lookahead = lookahead fact in
  (lookahead = any) = is_solid current
575 576 577 578 579

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
580 581
  assert (non_error z);
  assert (Terminal.real a);
582 583
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
584
(* ------------------------------------------------------------------------ *)
585

586 587 588 589 590 591 592 593 594 595 596 597 598 599 600
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

601
(* In principle, there is no need to insert the fact into the queue if [F]
602 603
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
604
   time augments (because membership in [F] is tested twice, upon inserting
605 606 607 608
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
POTTIER Francois's avatar
POTTIER Francois committed
609
  (* [fact.lookahead] can be [any], but cannot be [error] *)
610
  assert (non_error (lookahead fact));
611 612
  assert (invariant1 fact);
  assert (invariant2 fact);
613
  (* The length of [fact.word] serves as the priority of this fact. *)
614 615
  (* TEMPORARY we are decoding a word that we have just encoded *)
  Q.add q fact (W.length (word fact))
616

POTTIER Francois's avatar
POTTIER Francois committed
617 618
(* ------------------------------------------------------------------------ *)

619 620 621
(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
POTTIER Francois's avatar
POTTIER Francois committed
622
  (* For every state [s]... *)
623
  Lr1.iter (fun s ->
POTTIER Francois's avatar
POTTIER Francois committed
624
    (* If the trie rooted at [s] is nontrivial...*)
625
    match Trie.star s with
POTTIER Francois's avatar
POTTIER Francois committed
626 627 628 629 630 631 632 633 634 635
    | None ->
        ()
    | Some position ->
        (* ...then insert an initial fact into the priority queue. *)
        (* In order to respect invariants 1 and 2, we must distinguish two
           cases. If [s] is solid, then we insert a single fact, whose
           lookahead assumption is [any]. Otherwise, we must insert one
           initial fact for every terminal symbol [z] that does not cause
           an error in state [s]. *)
        let word = W.epsilon in
636
        if is_solid s then
637
          add (mkfact position word any)
638 639
        else
          foreach_terminal_not_causing_an_error s (fun z ->
640
            add (mkfact position word z)
641
          )
POTTIER Francois's avatar
POTTIER Francois committed
642 643 644
  );
  if X.verbose then
    Trie.verbose()
645

646 647 648 649 650 651 652
(* Produce a warning if the grammar uses the [error] pseudo-token. *)

let () =
  if !grammar_uses_error then
    Error.warning []
      "--list-errors ignores all productions that involve the error token."

653 654
(* ------------------------------------------------------------------------ *)

655
(* The module [F] maintains a set of known facts. *)
656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672

(* Three aspects of a fact are of particular interest:
   - its position [position], given by [fact.position];
   - its first symbol [a], given by [W.first fact.word fact.lookahead];
   - its lookahead assumption [z], given by [fact.lookahead].

   For every triple of [position], [a], and [z], we store at most one fact,
   (whose word has minimal length). Indeed, we are not interested in keeping
   track of several words that produce the same effect. Only the shortest such
   word is of interest.
   
   Thus, the total number of facts accumulated by the algorithm is at most
   [T.n^2], where [T] is the total size of the tries that we have constructed,
   and [n] is the number of terminal symbols. (This number can be quite large.
   [T] can be in the tens of thousands, and [n] can be over one hundred. These
   figures lead to a theoretical upper bound of 100M. In practice, for T=25K
   and n=108, we observe that the algorithm gathers about 7M facts.) *)
673

674
module F : sig
675 676

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
677 678
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
679 680
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
681
  (* [query current z f] enumerates all known facts whose current state is
682 683
     [current] and whose lookahead assumption is compatible with [z]. The
     symbol [z] must a real terminal symbol, i.e., cannot be [any]. *)
684
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
685

686 687 688
  (* [size()] returns the number of facts currently stored in the set. *)
  val size: unit -> int

689
  (* [verbose()] outputs debugging & performance information. *)
690
  val verbose: unit -> unit
691

692
end = struct
693

694 695 696 697 698 699 700 701 702 703 704
  (* We need to query the set of facts in two ways. In [register], we must test
     whether a proposed triple of [position], [a], [z] already appears in the
     set. In [query], we must find all facts that match a pair [current, z],
     where [current] is a state. (Note that [position] determines [current], but
     the converse is not true: a position contains more information besides the
     current state.)

     To address these needs, we use a two-level table. The first level is a
     matrix indexed by [current] and [z]. At the second level, we find sets of
     facts, where two facts are considered equal if they have the same triple of
     [position], [a], and [z]. In fact, we know at this level that all facts
705 706 707 708 709 710
     have the same [z] component, so only [position] and [a] are compared.

     Because our facts satisfy invariant 2, [z] is [any] if and only if the
     state [current fact] is solid. This means that we are wasting quite a
     lot of space in the matrix (for a solid state, the whole line is empty,
     except for the [any] column). *)
711

712
  (* The level-2 sets. *)
713 714

  module M =
715
    MySet.Make(struct
716 717
      type t = fact
      let compare fact1 fact2 =
718 719 720
        assert (lookahead fact1 = lookahead fact2);
        let c = Trie.compare (position fact1) (position fact2) in
          (* TEMPORARY could compare positions without decoding *)
721
        if c <> 0 then c else
722 723 724
        let z = lookahead fact1 in
        let a1 = W.first (word fact1) z
        and a2 = W.first (word fact2) z in
725
        (* note: [a1] and [a2] can be [any] here *)
726 727
        Terminal.compare a1 a2
    end)
728

729 730 731
  (* The level-1 matrix. *)

  let table =
732
    Array.make (Lr1.n * Terminal.n) M.empty
733

POTTIER Francois's avatar
POTTIER Francois committed
734
  let index current z =
735
    Terminal.n * (Lr1.number current) + Terminal.t2i z
736

737 738
  let count = ref 0

739
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
740
    let current = current fact in
741
    let z = lookahead fact in
POTTIER Francois's avatar
POTTIER Francois committed
742
    let i = index current z in
743 744 745 746 747 748 749 750 751 752 753 754
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
755
  let query current z f =
756 757 758 759 760
    assert (not (Terminal.equal z any));
    (* If the state [current] is solid then the facts that concern it are
       stored in the column [any], and all of them are compatible with [z].
       Otherwise, they are stored in all columns except [any], and only
       those stored in the column [z] are compatible with [z]. *)
761
    let i = index current (if is_solid current then any else z) in
762 763
    let m = table.(i) in
    M.iter f m
764

765 766 767
  let size () =
    !count

768
  let verbose () =
769
    Printf.eprintf "F stores %d facts.\n%!" (size())
770

771 772
end

POTTIER Francois's avatar
POTTIER Francois committed
773 774
(* ------------------------------------------------------------------------ *)

775 776
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
POTTIER Francois's avatar
POTTIER Francois committed
777 778 779 780 781 782 783 784 785 786 787 788
   taken.
   
   It maintains a set of quadruples [s, nt, w, z], where such a quadruple means
   that in the state [s], the outgoing edge labeled [nt] can be taken by
   consuming the word [w], under the assumption that the next symbol is [z].

   Again, the terminal symbol [a], given by [W.first w z], plays a role. For
   each quadruple [s, nt, a, z], we store at most one quadruple [s, nt, w, z].
   Thus, internally, we maintain a mapping of [s, nt, a, z] to [w].

   For greater simplicity, we do not allow [z] to be [any] in [register] or
   [query]. Allowing it would complicate things significantly, it seems. *)
789 790 791 792

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
793
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
POTTIER Francois's avatar
POTTIER Francois committed
794 795
     It returns [true] if this information is new, i.e., if the underlying
     quadruple [s, nt, a, z] is new. The symbol [z] cannot be [any]. *)
796
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
797

POTTIER Francois's avatar
POTTIER Francois committed
798 799 800 801 802 803 804
  (* [query s nt a z] enumerates all words [w] such that, in state [s], the
     outgoing edge labeled [nt] can be taken by consuming the word [w], under
     the assumption that the next symbol is [z], and the first symbol of the
     word [w.z] is [a]. The symbol [a] can be [any]. The symbol [z] cannot be
     [any]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t ->
             (W.word -> unit) -> unit
805

806 807 808
  (* [size()] returns the number of edges currently stored in the set. *)
  val size: unit -> int

POTTIER Francois's avatar
POTTIER Francois committed
809
  (* [verbose()] outputs debugging & performance information. *)
810
  val verbose: unit -> unit
811

812 813
end = struct

814 815 816 817 818 819 820 821 822 823
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
824

825
  module H = Hashtbl
826

POTTIER Francois's avatar
POTTIER Francois committed
827 828
  let table =
    Array.init Lr1.n (fun i ->
829 830 831
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
832 833 834

  let index s =
    Lr1.number s
835

836
  let pack nt a z : int =
POTTIER Francois's avatar
POTTIER Francois committed
837
    (* We rely on the fact that we have at most 256 terminal symbols. *)
838 839 840
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
841

842 843
  let count = ref 0

844
  let register s nt w z =
845
    assert (Terminal.real z);
846
    let i = index s in
847
    let m = table.(i) in
848
    let a = W.first w z in
POTTIER Francois's avatar
POTTIER Francois committed
849
    (* Note that looking at [a] in state [s] cannot cause an error. *)
850
    assert (not (causes_an_error s a));
851 852 853 854
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
855
      incr count;
856
      H.add m key w;
857 858
      true
    end
859

860
  let rec query s nt a z f =
861
    assert (Terminal.real z);
POTTIER Francois's avatar
POTTIER Francois committed
862 863
    if Terminal.equal a any then begin
      (* If [a] is [any], we query the table for every real symbol [a].
864 865 866 867 868 869
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
POTTIER Francois's avatar
POTTIER Francois committed
870 871 872 873 874 875 876 877
    end
    else begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
878
    end
879

880 881 882
  let size () =
    !count

883
  let verbose () =
884
    Printf.eprintf "E stores %d edges.\n%!" (size())
885

886 887
end

POTTIER Francois's avatar
POTTIER Francois committed
888 889
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
890 891 892 893 894 895 896
(* [new_edge s nt w z] is invoked when we discover that in the state [s], the
   outgoing edge labeled [nt] can be taken by consuming the word [w], under
   the assumption that the next symbol is [z]. We check whether this quadruple
   already exists in the set [E]. If not, then we add it, and we compute its
   consequences, in the form of new facts, which we insert into the priority
   queue for later examination. *)

897
let new_edge s nt w z =
898
  assert (Terminal.real z);
899
  if E.register s nt w z then
900
    let sym = Symbol.N nt in
901
    (* Query [F] for existing facts which could be extended by following
POTTIER Francois's avatar
POTTIER Francois committed
902 903 904
       this newly discovered edge. They must be facts whose current state
       is [s] and whose lookahead assumption is compatible with [a]. For
       each such fact, ... *)
905
    F.query s (W.first w z) (fun fact ->
906
      assert (compatible (lookahead fact) (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
907
      (* ... try to take one step in the trie along an edge labeled [nt]. *)
908
      match Trie.step sym (position fact) with
909
      | position ->
POTTIER Francois's avatar
POTTIER Francois committed
910
          (* This takes us to a new state whose incoming symbol is [nt].
POTTIER Francois's avatar
POTTIER Francois committed
911 912 913 914 915
             Hence, this state is not solid. In order to satisfy invariant 2,
             we must create fact whose lookahead assumption is not [any].
             That's fine, since our lookahead assumption is [z]. In order to
             satisfy invariant 1, we must check that [z] does not cause an
             error in this state. *)
916
          assert (not (is_solid (Trie.current position)));
917
          if not (causes_an_error (Trie.current position) z) then
918 919
            let word = W.append (word fact) w in
            add (mkfact position word z)
920
      | exception Not_found ->
POTTIER Francois's avatar
POTTIER Francois committed
921 922 923
          (* Could not take a step in the trie. This means this branch
             leads nowhere of interest, and was pruned when the trie
             was constructed. *)
924
          ()
925
    )
926

POTTIER Francois's avatar
POTTIER Francois committed
927 928
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
929 930
(* [new_fact fact] is invoked when we discover a new fact (i.e., one that was
   not previously known). It studies the consequences of this fact. These
931 932 933
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
POTTIER Francois's avatar
POTTIER Francois committed
934 935 936 937 938 939 940 941 942
     discovered vertex. We study its (currently known) outgoing edges,
     and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge. This is
     the case when the word that took us from [fact.source] to [fact.current]
     represents a production of the grammar and [fact.current] is willing to
     reduce this production. We record the existence of this edge, and
     re-inspect any previously discovered vertices which are interested in
     this outgoing edge. *)
943

POTTIER Francois's avatar
POTTIER Francois committed
944
let new_fact fact =
945

946
  (* TEMPORARY here and possibly elsewhere, avoid decoding [fact] several times *)
POTTIER Francois's avatar
POTTIER Francois committed
947
  let current = current fact in
948

POTTIER Francois's avatar
POTTIER Francois committed
949 950 951
  (* 1. View [fact] as a vertex. Examine the transitions out of [current].
     For every transition labeled by a symbol [sym] and into a state
     [target], ... *)
952
  
POTTIER Francois's avatar
POTTIER Francois committed
953 954 955
  Lr1.transitions current |> SymbolMap.iter (fun sym target ->
    (* ... try to follow this transition in the trie [fact.position],
       down to a child which we call [position]. *)
956
    match Trie.step sym (position fact), sym with
POTTIER Francois's avatar
POTTIER Francois committed
957 958 959 960 961 962 963

    | exception Not_found ->

        (* Could not take a step in the trie. This means this transition
           leads nowhere of interest. *)
        ()

964
    | position, Symbol.T t ->
POTTIER Francois's avatar
POTTIER Francois committed
965 966 967 968 969 970
          
        (* 1a. The transition exists in the trie, and [sym] is in fact a
           terminal symbol [t]. We note that [t] cannot be the [error] token,
           because the trie does not have any edges labeled [error]. *)
        assert (Lr1.Node.compare (Trie.current position) target = 0);
        assert (is_solid target);
971
        assert (non_error t);
972

POTTIER Francois's avatar
POTTIER Francois committed
973 974 975 976 977 978 979 980 981 982
        (* If the lookahead assumption [fact.lookahead] is compatible with
           [t], then we derive a new fact, where one more edge has been taken,
           and enqueue this new fact for later examination. *)
        
        (* The state [target] is solid, i.e., its incoming symbol is terminal.
           This state is always entered without consideration for the next
           lookahead symbol. Thus, we can use [any] as the lookahead assumption
           in the new fact that we produce. If we did not have [any], we would
           have to produce one fact for every possible lookahead symbol. *)

983 984 985
        if compatible (lookahead fact) t then
          let word = W.append (word fact) (W.singleton t) in
          add (mkfact position word any)
986

987
    | position, Symbol.N nt ->
988

POTTIER Francois's avatar
POTTIER Francois committed
989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
        (* 1b. The transition exists in the trie, and [sym] is in fact a
           nonterminal symbol [nt]. *)
         assert (Lr1.Node.compare (Trie.current position) target = 0);
         assert (not (is_solid target));

        (* We need to know how this nonterminal edge can be taken. We query
           [E] for a word [w] that allows us to take this edge. In general,
           the answer depends on the terminal symbol [z] that comes *after*
           this word: we try all such symbols. We must make sure that the
           first symbol of the word [w.z] satisfies the lookahead assumption
           [fact.lookahead]; this is ensured by passing this information to
           [E.query]. *)

        (* It could be the case that, due to a default reduction, the answer
           to our query does not depend on [z], and we are wasting work.
           However, allowing [z] to be [any] in [E.query], and taking 
           advantage of this to increase performance, seems difficult. *)

        foreach_terminal_not_causing_an_error target (fun z ->
1008 1009 1010 1011
          E.query current nt (lookahead fact) z (fun w ->
            assert (compatible (lookahead fact) (W.first w z));
            let word = W.append (word fact) w in
            add (mkfact position word z)
1012 1013
          )
        )
1014

POTTIER Francois's avatar
POTTIER Francois committed
1015
  );
1016 1017

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
1018 1019
     [fact.source] to the [current] state represents a production [prod] and
     [current] is willing to reduce this production. Then, reducing [prod]
1020 1021
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
POTTIER Francois's avatar
POTTIER Francois committed
1022
     [fact.source]. *)
1023

1024
  let z = lookahead fact in
POTTIER Francois's avatar
POTTIER Francois committed
1025 1026 1027 1028 1029 1030 1031 1032 1033
  if not (Terminal.equal z any) then begin

    (* 2a. The lookahead assumption [z] is a real terminal symbol. We check
       whether [current] is willing to reduce some production [prod] on [z],
       and whether the sub-trie [fact.position] accepts [prod], which means
       that this reduction takes us back to the root of the trie. If so, we
       have discovered a new edge. *)

    match has_reduction current z with
1034 1035
    | Some prod when Trie.accepts prod (position fact) ->
        new_edge (source fact) (Production.nt prod) (word fact) z
1036 1037
    | _ ->
        ()
POTTIER Francois's avatar
POTTIER Francois committed
1038

1039 1040
  end
  else begin
POTTIER Francois's avatar
POTTIER Francois committed
1041 1042 1043 1044 1045

    (* 2b. The lookahead assumption is [any]. We must consider every pair
       [prod, z] such that the [current] state can reduce [prod] on [z]
       and [fact.position] accepts [prod]. *)

1046 1047
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
1048
        if Trie.accepts prod (position fact) then
POTTIER Francois's avatar
POTTIER Francois committed
1049 1050
          (* [new_edge] does not accept [any] as its 4th parameter, so we
             must iterate over all terminal symbols. *)
1051
          foreach_terminal (fun z ->
1052
            new_edge (source fact) (Production.nt prod) (word fact) z
1053 1054 1055
          )
    | None ->
       TerminalMap.iter (fun z prods ->
1056
         if non_error z then
1057
           let prod = Misc.single prods in
1058 1059
           if Trie.accepts prod (position fact) then
             new_edge (source fact) (Production.nt prod) (word fact) z
1060
       ) (Lr1.reductions current)
POTTIER Francois's avatar
POTTIER Francois committed
1061

1062
  end
1063

POTTIER Francois's avatar
POTTIER Francois committed
1064 1065
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
1066
(* The main loop of the algorithm. *)
1067

1068 1069 1070 1071 1072
(* [level] is the length of [fact.word] for the facts that we are examining
   at the moment. [extracted] counts how many facts we have extracted out of
   the priority queue. [considered] counts how many of these were found to
   be new, and subsequently passed to [new_fact]. *)

POTTIER Francois's avatar
POTTIER Francois committed
1073 1074
let level, extracted, considered =
  ref 0, ref 0, ref 0
POTTIER Francois's avatar
POTTIER Francois committed
1075

POTTIER Francois's avatar
POTTIER Francois committed
1076
let done_with_level () =
1077 1078
  Printf.eprintf "Done with level %d.\n" !level;
  W.verbose();
1079
  F.verbose();
1080 1081 1082 1083
  E.verbose();
  Printf.eprintf "Q stores %d facts.\n" (Q.cardinal q);
  Printf.eprintf "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
1084

POTTIER Francois's avatar
POTTIER Francois committed
1085
let () =
POTTIER Francois's avatar
POTTIER Francois committed
1086 1087
  Q.repeat q (fun fact ->
    incr extracted;
1088
    if F.register fact then begin
1089
      if X.verbose && W.length (word fact) > !level then begin
POTTIER Francois's avatar
POTTIER Francois committed
1090
        done_with_level();
1091
        level := W.length (word fact);
POTTIER Francois's avatar
POTTIER Francois committed
1092 1093 1094 1095 1096
      end;
      incr considered;
      new_fact fact
    end
  );
1097 1098
  if X.verbose then
    done_with_level();
POTTIER Francois's avatar
POTTIER Francois committed
1099
  Time.tick "Running LRijkstra"
1100

1101 1102
(* ------------------------------------------------------------------------ *)

1103
(* The following code validates the fact that an error can be triggered in
1104 1105 1106 1107
   state [s'] by beginning at the start symbol [nt] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes.
   Furthermore, this gives us a list of spurious reductions, which we use
   to produce a comment. *)
1108 1109

let fail msg =
1110
  Printf.eprintf "LRijkstra: internal error: %s.\n%!" msg;
1111
  exit 1
1112

1113
let validate nt s' w : ReferenceInterpreter.target =
1114
  let open ReferenceInterpreter in
1115
  match
1116
    check_error_path nt (W.elements w)
1117 1118 1119 1120 1121 1122 1123
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
1124 1125 1126 1127 1128 1129 1130 1131 1132
  | OK ((state, _) as target) ->
      if Lr1.Node.compare state s' <> 0 then
        fail (
          Printf.sprintf "error occurred in state %d instead of %d"
            (Lr1.number state)
            (Lr1.number s')
        )
      else
        target
1133 1134 1135

(* ------------------------------------------------------------------------ *)