LRijkstra.ml 51 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
POTTIER Francois's avatar
POTTIER Francois committed
15 16 17 18 19 20
   on the actual shortest path to the error at [s, z]. Indeed, several
   difficulties arise, including the fact that reductions are subject to a
   lookahead hypothesis; the fact that some states have a default reduction,
   hence will never trigger an error; the fact that conflict resolution
   removes some (shift or reduce) actions, hence may suppress the shortest
   path. *)
21

22 23 24 25 26 27 28 29 30 31 32
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
33

34
(* NOTE: THIS FILE IS COMPILED WITH -noassert BY DEFAULT. If you would like
35 36
   the assertions to be tested at runtime, change that in the file _tags.
   The performance impact of the assertions is about 10%. *)
37

38 39 40
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
41
   in a big functor. The functor also serves to pass verbosity parameters. *)
42

43 44 45 46 47 48 49
module Run (X : sig
  (* If [verbose] is set, produce various messages on [stderr]. *)
  val verbose: bool
  (* If [statistics] is defined, it is interpreted as the name of
     a file to which one line of statistics is appended. *)
  val statistics: string option
end) = struct
50

51
open Grammar
52

53 54 55 56 57 58 59 60 61 62 63
(* ------------------------------------------------------------------------ *)

(* Record our start time. *)

let now () =
  match X.statistics with
  | Some _ ->
      Unix.((times()).tms_utime)
  | None ->
      0.0

64
let start =
65
  now()
66

67 68
(* ------------------------------------------------------------------------ *)

69 70
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
71 72

let () =
73
  if Terminal.n > 256 then
74
    Error.error []
75
      "--list-errors supports at most 256 terminal symbols.\n\
76 77 78 79
       The grammar has %d terminal symbols." Terminal.n

(* ------------------------------------------------------------------------ *)

80 81 82 83 84
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
85

86 87
(* ------------------------------------------------------------------------ *)

88 89
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
90

91
let non_error z =
92
  not (Terminal.equal z Terminal.error)
93

94
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
95
   later on, in particular in the [lookahead] field of a fact, to encode the
96 97
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

98 99 100 101 102
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
103
  Terminal.sharp
104

105 106
(* ------------------------------------------------------------------------ *)

107 108 109 110 111
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
112 113 114
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
115

POTTIER Francois's avatar
POTTIER Francois committed
116
let reductions_on s z : Production.index list =
117
  assert (Terminal.real z);
118 119 120 121 122
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

123 124 125
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
126 127

let has_reduction s z : Production.index option =
128
  assert (Terminal.real z);
129 130 131 132
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
133
      match reductions_on s z with
134 135 136 137 138 139
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

140 141 142 143 144 145 146 147 148
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
149
      TerminalMap.fold (fun z prods accu ->
150 151
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
152
        accu || non_error z && List.mem prod prods
153 154
      ) (Lr1.reductions s) false

155 156
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
157

158
let causes_an_error s z : bool =
159
  assert (Terminal.real z);
160 161 162 163
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
164
      reductions_on s z = [] &&
165 166
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

167
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
168
   turn, except [error] and [#]. *)
169

170 171
let foreach_terminal =
  Terminal.iter_real
172

173 174 175
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
POTTIER Francois's avatar
POTTIER Francois committed
176
   [causes_an_error]. This implementation is significantly more efficient. *)
177

178 179 180
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
181
      (* There is a default reduction. No symbol causes an error. *)
182 183
      foreach_terminal f
  | None ->
184 185 186
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
187 188 189
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
190
          f z
191
      ) (Lr1.reductions s);
192 193
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
194 195
      SymbolMap.iter (fun sym _ ->
        match sym with
196
        | Symbol.T z ->
197 198
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
199
              f z
200 201 202 203
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

204
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
POTTIER Francois's avatar
POTTIER Francois committed
205 206
   (or if it has no incoming symbol at all, i.e., it is an initial state). It
   is fragile if its incoming symbol is a non-terminal symbol. *)
207 208 209 210 211

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
POTTIER Francois's avatar
POTTIER Francois committed
212
    true
213
  | Some (Symbol.N _) ->
POTTIER Francois's avatar
POTTIER Francois committed
214
    false
215

216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
(* [reduction_path_exists s w prod] tests whether the path determined by the
   sequence of symbols [w] out of the state [s] exists in the automaton and
   leads to a state where [prod] can be reduced. It further requires [w] to
   not contain the [error] token. Finally, it it sees the [error] token, it
   sets the flag [grammar_uses_error]. *)

let grammar_uses_error =
  ref false

let rec reduction_path_exists s (w : Symbol.t list) prod : bool =
  match w with
  | [] ->
      can_reduce s prod
  | (Symbol.T t) :: _ when Terminal.equal t Terminal.error ->
      grammar_uses_error := true;
      false
  | a :: w ->
      match SymbolMap.find a (Lr1.transitions s) with
      | s ->
          reduction_path_exists s w prod
      | exception Not_found ->
          false

239 240
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
241 242 243 244 245 246 247 248 249
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
POTTIER Francois's avatar
POTTIER Francois committed
250
   us where we come from, where we are, and which production(s) we are hoping
POTTIER Francois's avatar
POTTIER Francois committed
251 252
   to reduce in the future. *)

253 254 255
module Trie : sig

  type trie
256 257 258 259

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
260
     edge.) If the star turns out to be trivial then [None] is returned. *)
261 262
  val star: Lr1.node -> trie option

263 264 265 266
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

267
  (* After [star] has been called a number of times, [total_size()]
268
     reports the total size of the tries that have been constructed. *)
269
  val total_size: unit -> int
270

271 272 273 274 275 276 277 278
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
279
  val source: trie -> Lr1.node
280 281

  (* [current t] returns the current state of the (sub-)trie [t]. This is
282 283
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
284
  val current: trie -> Lr1.node
285

286 287 288 289 290
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
291

292 293 294 295 296 297 298
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

299 300 301 302 303 304 305
  (* Since every (sub-)trie has a unique identity, its identity can serve
     as a unique integer code for this (sub-)trie. We allow this conversion,
     both ways. This mechanism is used only as a way of saving space in the
     encoding of facts. *)
  val encode: trie -> int
  val decode: int -> trie

306
end = struct
307

308 309
  (* A trie has the following structure. *)

310
  type trie = {
311 312 313
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
314
    identity: int;
315
    (* The root state of this star: "where we come from". *)
316
    source: Lr1.node;
317
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
318
    current: Lr1.node;
319 320 321
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
322
    mutable productions: Production.index list;
323
    (* The children, or sub-tries. *)
324 325 326
    mutable transitions: trie SymbolMap.t
    (* The two fields above are written only during the construction of a
       trie. Once every trie has been constructed, they are frozen. *)
327
  }
328

329
  (* This counter is used by [mktrie] to produce unique identities. *)
330 331
  let c = ref 0

332 333 334 335 336 337 338 339
  (* We keep a mapping of integer identities to tries. Whenever a new
     identity is assigned, this mapping must be updated. *)
  let tries =
    let s : Lr1.node = Obj.magic () in (* yes, this hurts *)
    let dummy = { identity = -1; source = s; current = s;
                  productions = []; transitions = SymbolMap.empty } in
    MenhirLib.InfiniteArray.make dummy

340
  (* This smart constructor creates a new trie with a unique identity. *)
341
  let mktrie source current productions transitions =
342
    let identity = Misc.postincrement c in
343 344 345
    let t = { identity; source; current; productions; transitions } in
    MenhirLib.InfiniteArray.set tries identity t;
    t
346

347 348 349 350 351
  (* [insert t w prod] updates the trie (in place) by adding a new branch,
     corresponding to the sequence of symbols [w], and ending with a reduction
     of production [prod]. We assume [reduction_path_exists w prod t.current]
     holds, so we need not worry about this being a dead branch, and we can
     use destructive updates without having to set up an undo mechanism. *)
352

353
  let rec insert (t : trie) (w : Symbol.t list) prod : unit =
354 355
    match w with
    | [] ->
356 357
        assert (can_reduce t.current prod);
        t.productions <- prod :: t.productions
358
    | a :: w ->
359
        match SymbolMap.find a (Lr1.transitions t.current) with
360 361
        | exception Not_found ->
            assert false
362
        | successor ->
363 364 365 366 367
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
368 369 370
                let t' = mktrie t.source successor [] SymbolMap.empty in
                t.transitions <- SymbolMap.add a t' t.transitions;
                t'
371
            in
372 373
            (* Update our child. *)
            insert t' w prod
374

375 376 377
  (* [insert t prod] inserts a new branch, corresponding to production
     [prod], into the trie [t], which is updated in place. *)
  let insert t prod : unit =
378
    let w = Array.to_list (Production.rhs prod) in
379 380 381 382 383 384 385 386 387
    (* Check whether the path [w] leads to a state where [prod] can be
       reduced. If not, then some transition or reduction action must
       have been suppressed by conflict resolution; or the path [w]
       involves the [error] token. In that case, the branch is dead,
       and is not added. This test is superfluous (i.e., it would
       be OK to add a dead branch) but allows us to build a slightly
       smaller star in some cases. *)
    if reduction_path_exists t.current w prod then
      insert t w prod
388

389 390 391 392
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

393 394 395
  (* The star at [s] is obtained by starting with a fresh empty trie and
     inserting into it every production [prod] whose left-hand side [nt]
     is the label of an outgoing edge at [s]. *)
396
  let star s =
397 398
    let t = fresh s in
    SymbolMap.iter (fun sym _ ->
399 400
      match sym with
      | Symbol.T _ ->
401
          ()
402
      | Symbol.N nt ->
403 404 405
          Production.iternt nt (insert t)
    ) (Lr1.transitions s);
    t
406

407 408 409 410 411 412 413
  (* A trie [t] is nontrivial if it has at least one branch, i.e., contains at
     least one sub-trie whose [productions] field is nonempty. Trivia: a trie
     of size greater than 1 is necessarily nontrivial, but the converse is not
     true: a nontrivial trie can have size 1. (This occurs if all productions
     have zero length.) *)
  let trivial t =
    t.productions = [] && SymbolMap.is_empty t.transitions
414

415 416
  (* Redefine [star] to include a [nontrivial] test and to record the size of
     the newly built trie. *)
417 418 419 420

  let size =
    Array.make Lr1.n (-1)

421
  let star s =
422
    let initial = !c in
423
    let t = star s in
424 425
    let final = !c in
    size.(Lr1.number s) <- final - initial;
426
    if trivial t then None else Some t
427

428 429 430 431
  let size s =
    assert (size.(s) >= 0);
    size.(s)

432
  let total_size () =
433 434
    !c

435
  let compare t1 t2 =
436
    Pervasives.compare t1.identity t2.identity
437

438 439 440
  let source t =
    t.source

441 442
  let current t =
    t.current
443

444 445 446
  let accepts prod t =
    List.mem prod t.productions

447
  let step a t =
448
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
449

450
  let verbose () =
451
    Printf.eprintf "Total star size: %d\n%!" (total_size())
452

453 454 455 456 457 458 459 460 461
  let decode i =
    let t = MenhirLib.InfiniteArray.get tries i in
    assert (t.identity = i); (* ensure we do not get the [dummy] trie *)
    t

  let encode t =
    assert (decode t.identity == t); (* round-trip property *)
    t.identity

462 463
end

464 465
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
466
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
467 468 469 470
   [position] (that is, a sub-trie), a [word], and a [lookahead] assumption.
   Such a fact means that this [position] can be reached, from the source
   state [Trie.source position], by consuming [word], under the assumption
   that the next input symbol is [lookahead]. *)
POTTIER Francois's avatar
POTTIER Francois committed
471

472
(* We allow [lookahead] to be [any] so as to indicate that this fact does
473
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
474

475 476
(*

477
type fact = {
478
  position: Trie.trie;
479
  word: W.word;
480
  lookahead: Terminal.t (* may be [any] *)
481 482
}

483 484 485 486 487 488 489 490 491 492
*)

(* To save memory (and therefore time), we encode a fact in a single OCaml
   integer value. This is made possible by the fact that tries, words, and
   terminal symbols are represented as (or can be encoded as) integers. 
   This admittedly horrible hack allows us to save roughly a factor of 2
   in space, and to gain 10% in time. *)

type fact = int

493 494 495
let dummy : fact =
  -1 (* should never be accessed! *)

496 497
(* Encoding and decoding facts. *)

498
(* We encode [position|word|lookahead] in a single word of memory. *)
499

500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
(* The lookahead symbol fits in 8 bits. *)

(* In the largest grammars that we have seen, the number of unique words is
   about 3.10^5, so a word should fit in about 19 bits (2^19 = 524288). In the
   largest grammars that we have seen, the total star size is about 64000, so a
   trie should fit in about 17 bits (2^17 = 131072). *)

(* On a 64-bit machine, we have ample space in a 63-bit word! We allocate 30
   bits for [word] and the rest (i.e., 25 bits) for [position]. *)

(* On a 32-bit machine, we are a bit more cramped! In Menhir's own fancy-parser,
   the number of terminal symbols is 27, the number of unique words is 566, and
   the total star size is 546. We allocate 12 bits for [word] and 11 bits for
   [position]. This is better than refusing to work altogether, but still not
   great. A more satisfactory approach might be to revert to heap allocation of
   facts when in 32-bit mode, but that would make the code somewhat ugly. *)

let w_lookahead =
  8

let w_word =
  if Sys.word_size < 64 then 12 else 30

let w_position  =
  Sys.word_size - 1 - (w_word + w_lookahead) (* 25, on a 64-bit machine *)
525

526
let identity (fact : fact) : int =
527
  assert (fact <> dummy);
528
  fact lsr (w_word + w_lookahead)
529

530
let position (fact : fact) : Trie.trie =
531
  assert (fact <> dummy);
532
  Trie.decode (identity fact)
533 534

let word (fact : fact) : W.word =
535
  assert (fact <> dummy);
536
  (fact lsr w_lookahead) land (1 lsl w_word - 1)
537 538

let lookahead (fact : fact) : Terminal.t =
539
  Terminal.i2t (fact land (1 lsl w_lookahead - 1))
540 541 542 543 544 545

let mkfact position (word : W.word) lookahead =
  let position : int = Trie.encode position
  and word : int = word
  and lookahead : int = Terminal.t2i lookahead in
  assert (0 <= position && 0 <= word && 0 <= lookahead);
546 547 548 549
  assert (lookahead < 1 lsl w_lookahead);
  if position < 1 lsl w_position && word < 1 lsl w_word then
    (* [lsl] binds tighter than [lor] *)
    (position lsl w_word lor word) lsl w_lookahead lor lookahead
550
  else
551 552 553 554 555 556
    let advice =
       if Sys.word_size < 64 then
         "Please use a 64-bit machine."
       else
         "Please report this error to Menhir's developers."
    in
557 558
    Error.error []
      "an internal limit was exceeded.\n\
559
       Sys.word_size = %d. Position = %d. Word = %d.\n\
560
       %s%!"
561
      Sys.word_size position word advice
562 563 564 565 566 567 568 569

let mkfact p w l =
  let fact = mkfact p w l in
  assert (word fact == w);      (* round-trip property *)
  assert (lookahead fact == l); (* round-trip property *)
  assert (position fact == p);  (* round-trip property *)
  fact

570 571
(* Two invariants reduce the number of facts that we consider:

572 573
   1. If [lookahead] is a real terminal symbol [z] (i.e., not [any]),
      then [z] does not cause an error in the [current] state.
574
      It would be useless to consider a fact that violates this property;
POTTIER Francois's avatar
POTTIER Francois committed
575 576 577
      this cannot possibly lead to a successful reduction. In practice,
      this refinement allows reducing the number of facts that go through
      the queue by a factor of two.
578

579
   2. [lookahead] is [any] iff the [current] state is
580 581
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
582
      and simplifies the implementation of the sub-module [F].
583 584 585

*)

586 587
let invariant1 position _word lookahead =
  let current = Trie.current position in
588
  lookahead = any || not (causes_an_error current lookahead)
589

590 591
let invariant2 position _word lookahead =
  let current = Trie.current position in
592
  (lookahead = any) = is_solid current
593 594 595 596 597

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
598 599
  assert (non_error z);
  assert (Terminal.real a);
600 601
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
602
(* ------------------------------------------------------------------------ *)
603

604
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
605
   examination. The length of [word fact] serves as the priority of a fact.
606 607 608 609 610 611 612 613 614 615 616
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
617
  Q.create dummy
618

619
(* In principle, there is no need to insert the fact into the queue if [F]
620
   already stores a comparable fact. We could perform this test in [enqueue].
POTTIER Francois's avatar
POTTIER Francois committed
621
   However, a few experiments suggests that this is not worthwhile. The run
622
   time augments (because membership in [F] is tested twice, upon inserting
623 624 625
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

626 627 628 629 630
let enqueue position word lookahead =
  (* [lookahead] can be [any], but cannot be [error] *)
  assert (non_error lookahead);
  assert (invariant1 position word lookahead);
  assert (invariant2 position word lookahead);
631
  (* The length of [word] serves as the priority of this fact. *)
632 633 634
  let priority = W.length word in
  (* Encode and enqueue this fact. *)
  Q.add q (mkfact position word lookahead) priority
635

POTTIER Francois's avatar
POTTIER Francois committed
636 637
(* ------------------------------------------------------------------------ *)

638 639 640
(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
POTTIER Francois's avatar
POTTIER Francois committed
641
  (* For every state [s]... *)
642
  Lr1.iter (fun s ->
POTTIER Francois's avatar
POTTIER Francois committed
643
    (* If the trie rooted at [s] is nontrivial...*)
644
    match Trie.star s with
POTTIER Francois's avatar
POTTIER Francois committed
645 646 647 648 649 650 651 652 653 654
    | None ->
        ()
    | Some position ->
        (* ...then insert an initial fact into the priority queue. *)
        (* In order to respect invariants 1 and 2, we must distinguish two
           cases. If [s] is solid, then we insert a single fact, whose
           lookahead assumption is [any]. Otherwise, we must insert one
           initial fact for every terminal symbol [z] that does not cause
           an error in state [s]. *)
        let word = W.epsilon in
655
        if is_solid s then
656
          enqueue position word any
657 658
        else
          foreach_terminal_not_causing_an_error s (fun z ->
659
            enqueue position word z
660
          )
POTTIER Francois's avatar
POTTIER Francois committed
661 662 663
  );
  if X.verbose then
    Trie.verbose()
664

665 666 667 668 669 670 671
(* Produce a warning if the grammar uses the [error] pseudo-token. *)

let () =
  if !grammar_uses_error then
    Error.warning []
      "--list-errors ignores all productions that involve the error token."

672 673
(* ------------------------------------------------------------------------ *)

674
(* The module [F] maintains a set of known facts. *)
675 676

(* Three aspects of a fact are of particular interest:
677 678 679
   - its position [position], given by [position fact];
   - its first symbol [a], given by [W.first (word fact) (lookahead fact)];
   - its lookahead assumption [z], given by [lookahead fact].
680 681 682 683 684 685 686 687 688 689 690 691

   For every triple of [position], [a], and [z], we store at most one fact,
   (whose word has minimal length). Indeed, we are not interested in keeping
   track of several words that produce the same effect. Only the shortest such
   word is of interest.
   
   Thus, the total number of facts accumulated by the algorithm is at most
   [T.n^2], where [T] is the total size of the tries that we have constructed,
   and [n] is the number of terminal symbols. (This number can be quite large.
   [T] can be in the tens of thousands, and [n] can be over one hundred. These
   figures lead to a theoretical upper bound of 100M. In practice, for T=25K
   and n=108, we observe that the algorithm gathers about 7M facts.) *)
692

693
module F : sig
694 695

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
696 697
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
698 699
  val register: fact -> bool

700
  (* [query current z f] enumerates all known facts whose current state is
701 702
     [current] and whose lookahead assumption is compatible with [z]. The
     symbol [z] must a real terminal symbol, i.e., cannot be [any]. *)
703
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
704

705 706 707
  (* [size()] returns the number of facts currently stored in the set. *)
  val size: unit -> int

708
  (* [verbose()] outputs debugging & performance information. *)
709
  val verbose: unit -> unit
710

711
end = struct
712

713 714 715 716 717 718 719 720 721 722 723
  (* We need to query the set of facts in two ways. In [register], we must test
     whether a proposed triple of [position], [a], [z] already appears in the
     set. In [query], we must find all facts that match a pair [current, z],
     where [current] is a state. (Note that [position] determines [current], but
     the converse is not true: a position contains more information besides the
     current state.)

     To address these needs, we use a two-level table. The first level is a
     matrix indexed by [current] and [z]. At the second level, we find sets of
     facts, where two facts are considered equal if they have the same triple of
     [position], [a], and [z]. In fact, we know at this level that all facts
724 725 726
     have the same [z] component, so only [position] and [a] are compared.

     Because our facts satisfy invariant 2, [z] is [any] if and only if the
727
     state [current] is solid. This means that we are wasting quite a
728 729
     lot of space in the matrix (for a solid state, the whole line is empty,
     except for the [any] column). *)
730

731
  (* The level-2 sets. *)
732 733

  module M =
734
    MySet.Make(struct
735 736
      type t = fact
      let compare fact1 fact2 =
737
        assert (lookahead fact1 = lookahead fact2);
738 739 740 741 742
        (* Compare the two positions first. This can be done without going
           through [Trie.decode], by directly comparing the two integer
           identities. *)
        let c = Pervasives.compare (identity fact1) (identity fact2) in
        assert (c = Trie.compare (position fact1) (position fact2));
743
        if c <> 0 then c else
744 745 746
        let z = lookahead fact1 in
        let a1 = W.first (word fact1) z
        and a2 = W.first (word fact2) z in
747
        (* note: [a1] and [a2] can be [any] here *)
748 749
        Terminal.compare a1 a2
    end)
750

751 752 753
  (* The level-1 matrix. *)

  let table =
754
    Array.make (Lr1.n * Terminal.n) M.empty
755

756
  let index current z =
757
    Terminal.n * (Lr1.number current) + Terminal.t2i z
758

759 760
  let count = ref 0

761
  let register fact =
762
    let current = Trie.current (position fact) in
763
    let z = lookahead fact in
764
    let i = index current z in
765 766 767 768 769 770 771 772 773 774 775 776
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

777
  let query current z f =
778 779 780 781 782
    assert (not (Terminal.equal z any));
    (* If the state [current] is solid then the facts that concern it are
       stored in the column [any], and all of them are compatible with [z].
       Otherwise, they are stored in all columns except [any], and only
       those stored in the column [z] are compatible with [z]. *)
783
    let i = index current (if is_solid current then any else z) in
784 785
    let m = table.(i) in
    M.iter f m
786

787 788 789
  let size () =
    !count

790
  let verbose () =
791
    Printf.eprintf "F stores %d facts.\n%!" (size())
792

793 794
end

POTTIER Francois's avatar
POTTIER Francois committed
795 796
(* ------------------------------------------------------------------------ *)

797 798
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
POTTIER Francois's avatar
POTTIER Francois committed
799 800 801 802 803 804 805 806 807 808 809 810
   taken.
   
   It maintains a set of quadruples [s, nt, w, z], where such a quadruple means
   that in the state [s], the outgoing edge labeled [nt] can be taken by
   consuming the word [w], under the assumption that the next symbol is [z].

   Again, the terminal symbol [a], given by [W.first w z], plays a role. For
   each quadruple [s, nt, a, z], we store at most one quadruple [s, nt, w, z].
   Thus, internally, we maintain a mapping of [s, nt, a, z] to [w].

   For greater simplicity, we do not allow [z] to be [any] in [register] or
   [query]. Allowing it would complicate things significantly, it seems. *)
811 812 813 814

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
815
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
POTTIER Francois's avatar
POTTIER Francois committed
816 817
     It returns [true] if this information is new, i.e., if the underlying
     quadruple [s, nt, a, z] is new. The symbol [z] cannot be [any]. *)
818
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
819

POTTIER Francois's avatar
POTTIER Francois committed
820 821 822 823 824 825 826
  (* [query s nt a z] enumerates all words [w] such that, in state [s], the
     outgoing edge labeled [nt] can be taken by consuming the word [w], under
     the assumption that the next symbol is [z], and the first symbol of the
     word [w.z] is [a]. The symbol [a] can be [any]. The symbol [z] cannot be
     [any]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t ->
             (W.word -> unit) -> unit
827

828 829 830
  (* [size()] returns the number of edges currently stored in the set. *)
  val size: unit -> int

POTTIER Francois's avatar
POTTIER Francois committed
831
  (* [verbose()] outputs debugging & performance information. *)
832
  val verbose: unit -> unit
833

834 835
end = struct

836 837 838 839 840 841 842 843 844 845
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
846

847
  module H = Hashtbl
848

POTTIER Francois's avatar
POTTIER Francois committed
849 850
  let table =
    Array.init Lr1.n (fun i ->
851 852 853
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
854 855 856

  let index s =
    Lr1.number s
857

858
  let pack nt a z : int =
POTTIER Francois's avatar
POTTIER Francois committed
859
    (* We rely on the fact that we have at most 256 terminal symbols. *)
860 861 862
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
863

864 865
  let count = ref 0

866
  let register s nt w z =
867
    assert (Terminal.real z);
868
    let i = index s in
869
    let m = table.(i) in
870
    let a = W.first w z in
POTTIER Francois's avatar
POTTIER Francois committed
871
    (* Note that looking at [a] in state [s] cannot cause an error. *)
872
    assert (not (causes_an_error s a));
873 874 875 876
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
877
      incr count;
878
      H.add m key w;
879 880
      true
    end
881

882
  let rec query s nt a z f =
883
    assert (Terminal.real z);
POTTIER Francois's avatar
POTTIER Francois committed
884 885
    if Terminal.equal a any then begin
      (* If [a] is [any], we query the table for every real symbol [a].
886 887 888 889 890 891
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
POTTIER Francois's avatar
POTTIER Francois committed
892 893 894 895 896 897 898 899
    end
    else begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
900
    end
901

902 903 904
  let size () =
    !count

905
  let verbose () =
906
    Printf.eprintf "E stores %d edges.\n%!" (size())
907

908 909
end

POTTIER Francois's avatar
POTTIER Francois committed
910 911
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
912 913 914 915 916 917 918
(* [new_edge s nt w z] is invoked when we discover that in the state [s], the
   outgoing edge labeled [nt] can be taken by consuming the word [w], under
   the assumption that the next symbol is [z]. We check whether this quadruple
   already exists in the set [E]. If not, then we add it, and we compute its
   consequences, in the form of new facts, which we insert into the priority
   queue for later examination. *)

919
let new_edge s nt w z =
920
  assert (Terminal.real z);
921
  if E.register s nt w z then
922
    let sym = Symbol.N nt in
923
    (* Query [F] for existing facts which could be extended by following
POTTIER Francois's avatar
POTTIER Francois committed
924 925 926
       this newly discovered edge. They must be facts whose current state
       is [s] and whose lookahead assumption is compatible with [a]. For
       each such fact, ... *)
927
    F.query s (W.first w z) (fun fact ->
928
      assert (compatible (lookahead fact) (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
929
      (* ... try to take one step in the trie along an edge labeled [nt]. *)
930
      match Trie.step sym (position fact) with
931
      | position ->
POTTIER Francois's avatar
POTTIER Francois committed
932
          (* This takes us to a new state whose incoming symbol is [nt].
POTTIER Francois's avatar
POTTIER Francois committed
933 934 935 936 937
             Hence, this state is not solid. In order to satisfy invariant 2,
             we must create fact whose lookahead assumption is not [any].
             That's fine, since our lookahead assumption is [z]. In order to
             satisfy invariant 1, we must check that [z] does not cause an
             error in this state. *)
938
          assert (not (is_solid (Trie.current position)));
939
          if not (causes_an_error (Trie.current position) z) then
940
            let word = W.append (word fact) w in
941
            enqueue position word z
942
      | exception Not_found ->
POTTIER Francois's avatar
POTTIER Francois committed
943 944 945
          (* Could not take a step in the trie. This means this branch
             leads nowhere of interest, and was pruned when the trie
             was constructed. *)
946
          ()
947
    )
948

POTTIER Francois's avatar
POTTIER Francois committed
949 950
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
951 952
(* [new_fact fact] is invoked when we discover a new fact (i.e., one that was
   not previously known). It studies the consequences of this fact. These
953 954 955
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
POTTIER Francois's avatar
POTTIER Francois committed
956 957 958 959
     discovered vertex. We study its (currently known) outgoing edges,
     and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge. This is
960 961
     the case when the word that took us from [source] to [current]
     represents a production of the grammar and [current] is willing to
POTTIER Francois's avatar
POTTIER Francois committed
962 963 964
     reduce this production. We record the existence of this edge, and
     re-inspect any previously discovered vertices which are interested in
     this outgoing edge. *)
965

POTTIER Francois's avatar
POTTIER Francois committed
966
let new_fact fact =
967

968 969 970 971 972 973 974 975
  (* Throughout this rather long function, there is just one [fact]. Let's
     name its components right now, so as to avoid accessing them several
     times. (That could be costly, as it requires decoding the fact.) *)
  let position = position fact 
  and lookahead = lookahead fact
  and word = word fact in
  let source = Trie.source position
  and current = Trie.current position in
976

POTTIER Francois's avatar
POTTIER Francois committed
977 978 979
  (* 1. View [fact] as a vertex. Examine the transitions out of [current].
     For every transition labeled by a symbol [sym] and into a state
     [target], ... *)
980
  
POTTIER Francois's avatar
POTTIER Francois committed
981
  Lr1.transitions current |> SymbolMap.iter (fun sym target ->
982 983 984
    (* ... try to follow this transition in the trie [position],
       down to a child which we call [child]. *)
    match Trie.step sym position, sym with
POTTIER Francois's avatar
POTTIER Francois committed
985 986 987 988 989 990 991

    | exception Not_found ->

        (* Could not take a step in the trie. This means this transition
           leads nowhere of interest. *)
        ()

992
    | child, Symbol.T t ->
POTTIER Francois's avatar
POTTIER Francois committed
993 994 995 996
          
        (* 1a. The transition exists in the trie, and [sym] is in fact a
           terminal symbol [t]. We note that [t] cannot be the [error] token,
           because the trie does not have any edges labeled [error]. *)
997
        assert (Lr1.Node.compare (Trie.current child) target = 0);
POTTIER Francois's avatar
POTTIER Francois committed
998
        assert (is_solid target);
999
        assert (non_error t);
1000

1001
        (* If the lookahead assumption [lookahead] is compatible with
POTTIER Francois's avatar
POTTIER Francois committed
1002 1003 1004 1005 1006 1007 1008 1009 1010
           [t], then we derive a new fact, where one more edge has been taken,
           and enqueue this new fact for later examination. *)
        
        (* The state [target] is solid, i.e., its incoming symbol is terminal.
           This state is always entered without consideration for the next
           lookahead symbol. Thus, we can use [any] as the lookahead assumption
           in the new fact that we produce. If we did not have [any], we would
           have to produce one fact for every possible lookahead symbol. *)

1011 1012 1013
        if compatible lookahead t then
          let word = W.append word (W.singleton t) in
          enqueue child word any
1014

1015
    | child, Symbol.N nt ->
1016

POTTIER Francois's avatar
POTTIER Francois committed
1017 1018
        (* 1b. The transition exists in the trie, and [sym] is in fact a
           nonterminal symbol [nt]. *)
1019
         assert (Lr1.Node.compare (Trie.current child) target = 0);
POTTIER Francois's avatar
POTTIER Francois committed
1020 1021 1022 1023 1024 1025 1026
         assert (not (is_solid target));

        (* We need to know how this nonterminal edge can be taken. We query
           [E] for a word [w] that allows us to take this edge. In general,
           the answer depends on the terminal symbol [z] that comes *after*
           this word: we try all such symbols. We must make sure that the
           first symbol of the word [w.z] satisfies the lookahead assumption
1027
           [lookahead]; this is ensured by passing this information to
POTTIER Francois's avatar
POTTIER Francois committed
1028 1029 1030 1031 1032 1033 1034
           [E.query]. *)

        (* It could be the case that, due to a default reduction, the answer
           to our query does not depend on [z], and we are wasting work.
           However, allowing [z] to be [any] in [E.query], and taking 
           advantage of this to increase performance, seems difficult. *)

POTTIER Francois's avatar
POTTIER Francois committed
1035 1036 1037 1038 1039
        (* Remark by Jacques-Henri Jourdan: we could remove the outer loop
           on [z], remove the parameter [z] to [E.query], and let [E.query]
           itself enumerate all values of [z]. Potentially this could allow
           a more efficient implementation of the data structure [E]. *)

POTTIER Francois's avatar
POTTIER Francois committed
1040
        foreach_terminal_not_causing_an_error target (fun z ->
1041 1042 1043 1044
          E.query current nt lookahead z (fun w ->
            assert (compatible lookahead (W.first w z));
            let word = W.append word w in
            enqueue child word z
1045 1046
          )
        )
1047

POTTIER Francois's avatar
POTTIER Francois committed
1048
  );
1049 1050

  (* 2. View [fact] as a possible edge. This is possible if the path from
1051
     [source] to the [current] state represents a production [prod] and
POTTIER Francois's avatar
POTTIER Francois committed
1052
     [current] is willing to reduce this production. Then, reducing [prod]
1053
     takes us all the way back to [source]. Thus, this production gives
1054
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
1055
     [source]. *)
1056

1057
  let z = lookahead in
POTTIER Francois's avatar
POTTIER Francois committed
1058 1059 1060 1061
  if not (Terminal.equal z any) then begin

    (* 2a. The lookahead assumption [z] is a real terminal symbol. We check
       whether [current] is willing to reduce some production [prod] on [z],
1062
       and whether the sub-trie [position] accepts [prod], which means
POTTIER Francois's avatar
POTTIER Francois committed
1063 1064 1065 1066
       that this reduction takes us back to the root of the trie. If so, we
       have discovered a new edge. *)

    match has_reduction current z with
1067 1068
    | Some prod when Trie.accepts prod position ->
        new_edge source (Production.nt prod) word z
1069 1070
    | _ ->
        ()
POTTIER Francois's avatar
POTTIER Francois committed
1071

1072 1073
  end
  else begin
POTTIER Francois's avatar
POTTIER Francois committed
1074 1075 1076

    (* 2b. The lookahead assumption is [any]. We must consider every pair
       [prod, z] such that the [current] state can reduce [prod] on [z]
1077
       and [position] accepts [prod]. *)
POTTIER Francois's avatar
POTTIER Francois committed
1078

1079 1080
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
1081
        if Trie.accepts prod position then
POTTIER Francois's avatar
POTTIER Francois committed
1082 1083
          (* [new_edge] does not accept [any] as its 4th parameter, so we
             must iterate over all terminal symbols. *)
1084
          foreach_terminal (fun z ->
1085
            new_edge source (Production.nt prod) word z
1086 1087 1088
          )
    | None ->
       TerminalMap.iter (fun z prods ->
1089
         if non_error z then
1090
           let prod = Misc.single prods in
1091 1092
           if Trie.accepts prod position then
             new_edge source (Production.nt prod) word z
1093
       ) (Lr1.reductions current)
POTTIER Francois's avatar
POTTIER Francois committed
1094

1095
  end
1096

POTTIER Francois's avatar
POTTIER Francois committed
1097 1098
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
1099
(* The main loop of the algorithm. *)
1100

1101
(* [level] is the length of [word fact] for the facts that we are examining
1102 1103 1104 1105
   at the moment. [extracted] counts how many facts we have extracted out of
   the priority queue. [considered] counts how many of these were found to
   be new, and subsequently passed to [new_fact]. *)

POTTIER Francois's avatar
POTTIER Francois committed
1106 1107
let level, extracted, considered =
  ref 0, ref 0, ref 0
1108

POTTIER Francois's avatar
POTTIER Francois committed
1109
let done_with_level () =
1110 1111
  Printf.eprintf "Done with level %d.\n" !level;
  W.verbose();
1112
  F.verbose();
1113 1114 1115 1116
  E.verbose();
  Printf.eprintf "Q stores %d facts.\n" (Q.cardinal q);
  Printf.eprintf "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
1117

POTTIER Francois's avatar
POTTIER Francois committed
1118
let () =
POTTIER Francois's avatar
POTTIER Francois committed
1119 1120
  Q.repeat q (fun fact ->
    incr extracted;
1121
    if F.register fact then begin
1122
      if X.verbose && W.length (word fact) > !level then begin
POTTIER Francois's avatar
POTTIER Francois committed
1123
        done_with_level();
1124
        level := W.length (word fact);
POTTIER Francois's avatar
POTTIER Francois committed
1125 1126 1127 1128 1129
      end;
      incr considered;
      new_fact fact
    end
  );
1130 1131
  if X.verbose then
    done_with_level();
POTTIER Francois's avatar
POTTIER Francois committed
1132
  Time.tick "Running LRijkstra"
1133

1134 1135
(* ------------------------------------------------------------------------ *)

1136
(* The following code validates the fact that an error can be triggered in
1137 1138 1139 1140
   state [s'] by beginning at the start symbol [nt] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes.
   Furthermore, this gives us a list of spurious reductions, which we use
   to produce a comment. *)
1141 1142

let fail msg =
1143
  Printf.eprintf "LRijkstra: internal error: %s.\n%!" msg;
1144
  exit 1
1145

1146
let validate nt s' w : ReferenceInterpreter.target =
1147
  let open ReferenceInterpreter in
1148
  match
1149
    check_error_path false nt (W.elements w)
1150 1151 1152 1153 1154 1155 1156
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
1157 1158 1159 1160 1161 1162 1163 1164 1165
  | OK ((state, _) as target) ->
      if Lr1.Node.compare state s' <> 0 then
        fail (
          Printf.sprintf "error occurred in state %d instead of %d"
            (Lr1.number state)
            (Lr1.number s')
        )
      else
        target
1166 1167 1168

(* ------------------------------------------------------------------------ *)

1169 1170 1171 1172 1173 1174 1175
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
1176 1177 1178
   module [E] above. For this purpose, we use Dijkstra's algorithm,
   unmodified. Experiments show that the running time of this phase is
   typically 10x shorter than the running time of the main loop above. *)
1179

1180
module A = Astar.Make(struct
1181

1182 1183 1184
  (* A vertex is a pair [s, z], where [z] is a real terminal symbol. *)
  type node =
      Lr1.node * Terminal.t
1185

1186 1187
  let equal (s'1, z1) (s'2, z2) =
    Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0
1188

1189 1190
  let hash (s, z) =
    Hashtbl.hash (Lr1.number s, z)
1191

1192 1193 1194
  (* An edge is labeled with a word. *)
  type label =
    W.word
1195

1196 1197 1198 1199 1200 1201 1202
  (* We search forward from every [s, z], where [s] is an initial state. *)
  let sources f =
    foreach_terminal (fun z ->
      ProductionMap.iter (fun _ s ->
        f (s, z)
      ) Lr1.entry
    )
1203

1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230
  (* The successors of [s, z] are defined as follows. *)
  let successors (s, z) edge =
    assert (Terminal.real z);
    (* For every transition out of [s], labeled [sym], leading to [s']... *)
    Lr1.transitions s |> SymbolMap.iter (fun sym s' ->
      match sym with
      | Symbol.T t ->
          if Terminal.equal z t then
            (* If [sym] is the terminal symbol [z], then this transition
               matches our lookahead assumption, so we can take it. For
               every [z'], we have an edge to [s', z'], labeled with the
               singleton word [z]. *)
            let w = W.singleton z in
            foreach_terminal (fun z' ->
              edge w 1 (s', z')
            )
      | Symbol.N nt ->
          (* If [sym] is a nonterminal symbol [nt], then we query [E]
             in order to find out which (minimal) words [w] allow us
             to take this transition. We must again try every [z'],
             and must respect the constraint that the first symbol
             of the word [w.z'] is [z]. For every [z'] and [w] that
             fulfill these requirements, we have an edge to [s', z'],
             labeled with the word [w]. *)
         foreach_terminal (fun z' ->
           E.query s nt z z' (fun w ->
             edge w (W.length w) (s', z')
1231
           )
1232 1233
         )
    )
1234

1235 1236 1237 1238 1239 1240 1241 1242 1243
  (* Algorithm A*, used with a zero estimate, is Dijkstra's algorithm.
     We have experimented with a non-zero estimate, but the performance
     increase was minimal. *)
  let estimate _ =
    0

end)

(* ------------------------------------------------------------------------ *)