LRijkstra.ml 46.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
   on the actual shortest path to the error at [s, z]. Indeed, two
   difficulties arise:

   - Some states have a default reduction. Thus, they will not trigger
     an error, even though they should. The error is triggered in some
     other state, after reduction takes place.

   - If the grammar has conflicts, conflict resolution removes some
     (shift or reduce) actions, hence may suppress the shortest path. *)

25 26 27 28 29 30 31 32 33 34 35
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
36

37
(* NOTE: THIS FILE IS COMPILED WITH -noassert BY DEFAULT. If you would like
38 39
   the assertions to be tested at runtime, change that in the file _tags.
   The performance impact of the assertions is about 10%. *)
40

41 42 43
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
44
   in a big functor. The functor also serves to pass verbosity parameters. *)
45

46 47 48 49 50 51 52
module Run (X : sig
  (* If [verbose] is set, produce various messages on [stderr]. *)
  val verbose: bool
  (* If [statistics] is defined, it is interpreted as the name of
     a file to which one line of statistics is appended. *)
  val statistics: string option
end) = struct
53

54
open Grammar
55

56 57 58 59 60 61 62 63 64 65 66
(* ------------------------------------------------------------------------ *)

(* Record our start time. *)

let now () =
  match X.statistics with
  | Some _ ->
      Unix.((times()).tms_utime)
  | None ->
      0.0

67
let start =
68
  now()
69

70 71
(* ------------------------------------------------------------------------ *)

72 73
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
74 75

let () =
76
  if Terminal.n > 256 then
77
    Error.error [] (Printf.sprintf
78
      "--list-errors supports at most 256 terminal symbols.\n\
79 80 81 82 83
       The grammar has %d terminal symbols." Terminal.n
    )

(* ------------------------------------------------------------------------ *)

84 85 86 87 88
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
89

90 91
(* ------------------------------------------------------------------------ *)

92 93
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
94

95
let non_error z =
96
  not (Terminal.equal z Terminal.error)
97

98 99 100 101
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

102 103 104 105 106
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
107
  Terminal.sharp
108

109 110
(* ------------------------------------------------------------------------ *)

111 112 113 114 115
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
116 117 118
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
119

POTTIER Francois's avatar
POTTIER Francois committed
120
let reductions_on s z : Production.index list =
121
  assert (Terminal.real z);
122 123 124 125 126
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

127 128 129
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
130 131

let has_reduction s z : Production.index option =
132
  assert (Terminal.real z);
133 134 135 136
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
137
      match reductions_on s z with
138 139 140 141 142 143
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

144 145 146 147 148 149 150 151 152
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
153
      TerminalMap.fold (fun z prods accu ->
154 155
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
156
        accu || non_error z && List.mem prod prods
157 158
      ) (Lr1.reductions s) false

159 160
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
161

162
let causes_an_error s z : bool =
163
  assert (Terminal.real z);
164 165 166 167
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
168
      reductions_on s z = [] &&
169 170
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

171
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
172
   turn, except [error] and [#]. *)
173

174 175
let foreach_terminal =
  Terminal.iter_real
176

177 178 179
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
POTTIER Francois's avatar
POTTIER Francois committed
180
   [causes_an_error]. This implementation is significantly more efficient. *)
181

182 183 184
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
185
      (* There is a default reduction. No symbol causes an error. *)
186 187
      foreach_terminal f
  | None ->
188 189 190
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
191 192 193
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
194
          f z
195
      ) (Lr1.reductions s);
196 197
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
198 199
      SymbolMap.iter (fun sym _ ->
        match sym with
200
        | Symbol.T z ->
201 202
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
203
              f z
204 205 206 207
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

208
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
POTTIER Francois's avatar
POTTIER Francois committed
209 210
   (or if it has no incoming symbol at all, i.e., it is an initial state). It
   is fragile if its incoming symbol is a non-terminal symbol. *)
211 212 213 214 215

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
POTTIER Francois's avatar
POTTIER Francois committed
216
    true
217
  | Some (Symbol.N _) ->
POTTIER Francois's avatar
POTTIER Francois committed
218
    false
219

220 221
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
222 223 224 225 226 227 228 229 230
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
POTTIER Francois's avatar
POTTIER Francois committed
231
   us where we come from, where we are, and which production(s) we are hoping
POTTIER Francois's avatar
POTTIER Francois committed
232 233
   to reduce in the future. *)

234 235 236
let grammar_uses_error =
  ref false

237 238 239
module Trie : sig

  type trie
240 241 242 243

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
244
     edge.) If the star turns out to be trivial then [None] is returned. *)
245 246
  val star: Lr1.node -> trie option

247 248 249 250
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

251 252 253 254
  (* After [star] has been called a number of times, [cumulated_size()]
     reports the total size of the tries that have been constructed. *)
  val cumulated_size: unit -> int

255 256 257 258 259 260 261 262
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
263
  val source: trie -> Lr1.node
264 265

  (* [current t] returns the current state of the (sub-)trie [t]. This is
266 267
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
268
  val current: trie -> Lr1.node
269

270 271 272 273 274
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
275

276 277 278 279 280 281 282 283
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
284

285 286
  (* A trie has the following structure. *)

287
  type trie = {
288 289 290
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
291
    identity: int;
292
    (* The root state of this star: "where we come from". *)
293
    source: Lr1.node;
294
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
295
    current: Lr1.node;
296 297 298
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
299
    productions: Production.index list;
300 301
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
302
  }
303

304
  (* This counter is used by [mktrie] to produce unique identities. *)
305 306
  let c = ref 0

307
  (* This smart constructor creates a new trie with a unique identity. *)
308
  let mktrie source current productions transitions =
309
    let identity = Misc.postincrement c in
310
    { identity; source; current; productions; transitions }
311

312 313
  exception DeadBranch

314
  let rec insert w prod t =
315 316
    match w with
    | [] ->
317
        (* We check whether the current state [t.current] is able to reduce
318 319 320 321 322
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
323
        if can_reduce t.current prod then
324 325 326 327 328 329
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
330
    | (Symbol.T t) :: _ when Terminal.equal t Terminal.error ->
331
         grammar_uses_error := true;
332
         raise DeadBranch
333
    | a :: w ->
334
        (* Check if there is a transition labeled [a] out of [t.current]. If
335 336
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
337 338 339
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
340
        match SymbolMap.find a (Lr1.transitions t.current) with
341
        | successor ->
342 343 344 345 346 347 348 349
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
350
            let t' = insert w prod t' in
351 352
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
353
        | exception Not_found ->
354
            raise DeadBranch
355

356 357 358 359 360
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
361 362 363 364 365 366
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
367

368 369 370 371
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

372 373 374
  (* The star at [s] is obtained by starting with a fresh empty trie and
     inserting into it every production [prod] whose left-hand side [nt]
     is the label of an outgoing edge at [s]. *)
375 376 377 378 379 380 381 382 383
  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

384 385 386 387 388 389 390
  (* A trie [t] is nontrivial if it has at least one branch, i.e., contains at
     least one sub-trie whose [productions] field is nonempty. Trivia: a trie
     of size greater than 1 is necessarily nontrivial, but the converse is not
     true: a nontrivial trie can have size 1. (This occurs if all productions
     have zero length.) *)
  let trivial t =
    t.productions = [] && SymbolMap.is_empty t.transitions
391

392 393
  (* Redefine [star] to include a [nontrivial] test and to record the size of
     the newly built trie. *)
394 395 396 397

  let size =
    Array.make Lr1.n (-1)

398
  let star s =
399
    let initial = !c in
400
    let t = star s in
401 402
    let final = !c in
    size.(Lr1.number s) <- final - initial;
403
    if trivial t then None else Some t
404

405 406 407 408
  let size s =
    assert (size.(s) >= 0);
    size.(s)

409 410 411
  let cumulated_size () =
    !c

412
  let compare t1 t2 =
413
    Pervasives.compare t1.identity t2.identity
414

415 416 417
  let source t =
    t.source

418 419
  let current t =
    t.current
420

421 422 423
  let accepts prod t =
    List.mem prod t.productions

424
  let step a t =
425
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
426

427
  let verbose () =
428
    Printf.eprintf "Cumulated star size: %d\n%!" (cumulated_size())
429

430 431
end

432 433
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
434 435 436 437 438 439
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

440 441
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
442

443
type fact = {
444
  position: Trie.trie;
445
  word: W.word;
446
  lookahead: Terminal.t (* may be [any] *)
447 448
}

449 450
(* Accessors. *)

451
let source fact =
452
  Trie.source fact.position
453

454
let current fact =
455
  Trie.current fact.position
456

457 458
(* Two invariants reduce the number of facts that we consider:

459 460 461
   1. If [fact.lookahead] is a real terminal symbol [z] (i.e., not [any]),
      then [z] does not cause an error in the current state [current fact].
      It would be useless to consider a fact that violates this property;
POTTIER Francois's avatar
POTTIER Francois committed
462 463 464
      this cannot possibly lead to a successful reduction. In practice,
      this refinement allows reducing the number of facts that go through
      the queue by a factor of two.
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
483 484
  assert (non_error z);
  assert (Terminal.real a);
485 486
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
487
(* ------------------------------------------------------------------------ *)
488

489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
POTTIER Francois's avatar
POTTIER Francois committed
512
  (* [fact.lookahead] can be [any], but cannot be [error] *)
513
  assert (non_error fact.lookahead);
514 515
  assert (invariant1 fact);
  assert (invariant2 fact);
516 517 518
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

POTTIER Francois's avatar
POTTIER Francois committed
519 520
(* ------------------------------------------------------------------------ *)

521 522 523
(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
POTTIER Francois's avatar
POTTIER Francois committed
524
  (* For every state [s]... *)
525
  Lr1.iter (fun s ->
POTTIER Francois's avatar
POTTIER Francois committed
526
    (* If the trie rooted at [s] is nontrivial...*)
527
    match Trie.star s with
POTTIER Francois's avatar
POTTIER Francois committed
528 529 530 531 532 533 534 535 536 537
    | None ->
        ()
    | Some position ->
        (* ...then insert an initial fact into the priority queue. *)
        (* In order to respect invariants 1 and 2, we must distinguish two
           cases. If [s] is solid, then we insert a single fact, whose
           lookahead assumption is [any]. Otherwise, we must insert one
           initial fact for every terminal symbol [z] that does not cause
           an error in state [s]. *)
        let word = W.epsilon in
538
        if is_solid s then
POTTIER Francois's avatar
POTTIER Francois committed
539
          add { position; word; lookahead = any }
540 541
        else
          foreach_terminal_not_causing_an_error s (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
542
            add { position; word; lookahead = z }
543
          )
POTTIER Francois's avatar
POTTIER Francois committed
544 545 546
  );
  if X.verbose then
    Trie.verbose()
547

548 549 550 551 552 553 554
(* Produce a warning if the grammar uses the [error] pseudo-token. *)

let () =
  if !grammar_uses_error then
    Error.warning []
      "--list-errors ignores all productions that involve the error token."

555 556
(* ------------------------------------------------------------------------ *)

557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574
(* The module [T] maintains a set of known facts. *)

(* Three aspects of a fact are of particular interest:
   - its position [position], given by [fact.position];
   - its first symbol [a], given by [W.first fact.word fact.lookahead];
   - its lookahead assumption [z], given by [fact.lookahead].

   For every triple of [position], [a], and [z], we store at most one fact,
   (whose word has minimal length). Indeed, we are not interested in keeping
   track of several words that produce the same effect. Only the shortest such
   word is of interest.
   
   Thus, the total number of facts accumulated by the algorithm is at most
   [T.n^2], where [T] is the total size of the tries that we have constructed,
   and [n] is the number of terminal symbols. (This number can be quite large.
   [T] can be in the tens of thousands, and [n] can be over one hundred. These
   figures lead to a theoretical upper bound of 100M. In practice, for T=25K
   and n=108, we observe that the algorithm gathers about 7M facts.) *)
575

576
module T : sig
577 578

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
579 580
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
581 582
  val register: fact -> bool

583
  (* [query current z f] enumerates all known facts whose current state is
584 585
     [current] and whose lookahead assumption is compatible with [z]. The
     symbol [z] must a real terminal symbol, i.e., cannot be [any]. *)
586
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
587

588 589 590
  (* [size()] returns the number of facts currently stored in the set. *)
  val size: unit -> int

591
  (* [verbose()] outputs debugging & performance information. *)
592
  val verbose: unit -> unit
593

594
end = struct
595

596 597 598 599 600 601 602 603 604 605 606
  (* We need to query the set of facts in two ways. In [register], we must test
     whether a proposed triple of [position], [a], [z] already appears in the
     set. In [query], we must find all facts that match a pair [current, z],
     where [current] is a state. (Note that [position] determines [current], but
     the converse is not true: a position contains more information besides the
     current state.)

     To address these needs, we use a two-level table. The first level is a
     matrix indexed by [current] and [z]. At the second level, we find sets of
     facts, where two facts are considered equal if they have the same triple of
     [position], [a], and [z]. In fact, we know at this level that all facts
607 608 609 610 611 612
     have the same [z] component, so only [position] and [a] are compared.

     Because our facts satisfy invariant 2, [z] is [any] if and only if the
     state [current fact] is solid. This means that we are wasting quite a
     lot of space in the matrix (for a solid state, the whole line is empty,
     except for the [any] column). *)
613

614
  (* The level-2 sets. *)
615 616

  module M =
617
    MySet.Make(struct
618 619
      type t = fact
      let compare fact1 fact2 =
620
        assert (fact1.lookahead = fact2.lookahead);
621
        let c = Trie.compare fact1.position fact2.position in
622
        if c <> 0 then c else
623 624 625
        let z = fact1.lookahead in
        let a1 = W.first fact1.word z
        and a2 = W.first fact2.word z in
626
        (* note: [a1] and [a2] can be [any] here *)
627 628
        Terminal.compare a1 a2
    end)
629

630 631 632
  (* The level-1 matrix. *)

  let table =
633
    Array.make (Lr1.n * Terminal.n) M.empty
634

635
  let index current z =
636
    Terminal.n * (Lr1.number current) + Terminal.t2i z
637

638 639
  let count = ref 0

640
  let register fact =
641
    let current = current fact in
642
    let z = fact.lookahead in
643
    let i = index current z in
644 645 646 647 648 649 650 651 652 653 654 655
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

656
  let query current z f =
657 658 659 660 661
    assert (not (Terminal.equal z any));
    (* If the state [current] is solid then the facts that concern it are
       stored in the column [any], and all of them are compatible with [z].
       Otherwise, they are stored in all columns except [any], and only
       those stored in the column [z] are compatible with [z]. *)
662
    let i = index current (if is_solid current then any else z) in
663 664
    let m = table.(i) in
    M.iter f m
665

666 667 668
  let size () =
    !count

669
  let verbose () =
670
    Printf.eprintf "T stores %d facts.\n%!" (size())
671

672 673
end

POTTIER Francois's avatar
POTTIER Francois committed
674 675
(* ------------------------------------------------------------------------ *)

676 677
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
POTTIER Francois's avatar
POTTIER Francois committed
678 679 680 681 682 683 684 685 686 687 688 689
   taken.
   
   It maintains a set of quadruples [s, nt, w, z], where such a quadruple means
   that in the state [s], the outgoing edge labeled [nt] can be taken by
   consuming the word [w], under the assumption that the next symbol is [z].

   Again, the terminal symbol [a], given by [W.first w z], plays a role. For
   each quadruple [s, nt, a, z], we store at most one quadruple [s, nt, w, z].
   Thus, internally, we maintain a mapping of [s, nt, a, z] to [w].

   For greater simplicity, we do not allow [z] to be [any] in [register] or
   [query]. Allowing it would complicate things significantly, it seems. *)
690 691 692 693

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
694
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
POTTIER Francois's avatar
POTTIER Francois committed
695 696
     It returns [true] if this information is new, i.e., if the underlying
     quadruple [s, nt, a, z] is new. The symbol [z] cannot be [any]. *)
697
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
698

POTTIER Francois's avatar
POTTIER Francois committed
699 700 701 702 703 704 705
  (* [query s nt a z] enumerates all words [w] such that, in state [s], the
     outgoing edge labeled [nt] can be taken by consuming the word [w], under
     the assumption that the next symbol is [z], and the first symbol of the
     word [w.z] is [a]. The symbol [a] can be [any]. The symbol [z] cannot be
     [any]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t ->
             (W.word -> unit) -> unit
706

707 708 709
  (* [size()] returns the number of edges currently stored in the set. *)
  val size: unit -> int

POTTIER Francois's avatar
POTTIER Francois committed
710
  (* [verbose()] outputs debugging & performance information. *)
711
  val verbose: unit -> unit
712

713 714
end = struct

715 716 717 718 719 720 721 722 723 724
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
725

726
  module H = Hashtbl
727

POTTIER Francois's avatar
POTTIER Francois committed
728 729
  let table =
    Array.init Lr1.n (fun i ->
730 731 732
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
733 734 735

  let index s =
    Lr1.number s
736

737
  let pack nt a z : int =
POTTIER Francois's avatar
POTTIER Francois committed
738
    (* We rely on the fact that we have at most 256 terminal symbols. *)
739 740 741
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
742

743 744
  let count = ref 0

745
  let register s nt w z =
746
    assert (Terminal.real z);
747
    let i = index s in
748
    let m = table.(i) in
749
    let a = W.first w z in
POTTIER Francois's avatar
POTTIER Francois committed
750
    (* Note that looking at [a] in state [s] cannot cause an error. *)
751
    assert (not (causes_an_error s a));
752 753 754 755
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
756
      incr count;
757
      H.add m key w;
758 759
      true
    end
760

761
  let rec query s nt a z f =
762
    assert (Terminal.real z);
POTTIER Francois's avatar
POTTIER Francois committed
763 764
    if Terminal.equal a any then begin
      (* If [a] is [any], we query the table for every real symbol [a].
765 766 767 768 769 770
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
POTTIER Francois's avatar
POTTIER Francois committed
771 772 773 774 775 776 777 778
    end
    else begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
779
    end
780

781 782 783
  let size () =
    !count

784
  let verbose () =
785
    Printf.eprintf "E stores %d edges.\n%!" (size())
786

787 788
end

POTTIER Francois's avatar
POTTIER Francois committed
789 790
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
791 792 793 794 795 796 797
(* [new_edge s nt w z] is invoked when we discover that in the state [s], the
   outgoing edge labeled [nt] can be taken by consuming the word [w], under
   the assumption that the next symbol is [z]. We check whether this quadruple
   already exists in the set [E]. If not, then we add it, and we compute its
   consequences, in the form of new facts, which we insert into the priority
   queue for later examination. *)

798
let new_edge s nt w z =
799
  assert (Terminal.real z);
800
  if E.register s nt w z then
801
    let sym = Symbol.N nt in
POTTIER Francois's avatar
POTTIER Francois committed
802 803 804 805
    (* Query [T] for existing facts which could be extended by following
       this newly discovered edge. They must be facts whose current state
       is [s] and whose lookahead assumption is compatible with [a]. For
       each such fact, ... *)
806
    T.query s (W.first w z) (fun fact ->
807
      assert (compatible fact.lookahead (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
808
      (* ... try to take one step in the trie along an edge labeled [nt]. *)
809 810
      match Trie.step sym fact.position with
      | position ->
POTTIER Francois's avatar
POTTIER Francois committed
811 812 813 814 815 816
          (* This takes up to a new state whose incoming symbol is [nt].
             Hence, this state is not solid. In order to satisfy invariant 2,
             we must create fact whose lookahead assumption is not [any].
             That's fine, since our lookahead assumption is [z]. In order to
             satisfy invariant 1, we must check that [z] does not cause an
             error in this state. *)
817
          assert (not (is_solid (Trie.current position)));
818
          if not (causes_an_error (Trie.current position) z) then
POTTIER Francois's avatar
POTTIER Francois committed
819 820
            let word = W.append fact.word w in
            add { position; word; lookahead = z }
821
      | exception Not_found ->
POTTIER Francois's avatar
POTTIER Francois committed
822 823 824
          (* Could not take a step in the trie. This means this branch
             leads nowhere of interest, and was pruned when the trie
             was constructed. *)
825
          ()
826
    )
827

POTTIER Francois's avatar
POTTIER Francois committed
828 829
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
830 831
(* [new_fact fact] is invoked when we discover a new fact (i.e., one that was
   not previously known). It studies the consequences of this fact. These
832 833 834
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
POTTIER Francois's avatar
POTTIER Francois committed
835 836 837 838 839 840 841 842 843
     discovered vertex. We study its (currently known) outgoing edges,
     and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge. This is
     the case when the word that took us from [fact.source] to [fact.current]
     represents a production of the grammar and [fact.current] is willing to
     reduce this production. We record the existence of this edge, and
     re-inspect any previously discovered vertices which are interested in
     this outgoing edge. *)
844

POTTIER Francois's avatar
POTTIER Francois committed
845
let new_fact fact =
846

847
  let current = current fact in
848

POTTIER Francois's avatar
POTTIER Francois committed
849 850 851
  (* 1. View [fact] as a vertex. Examine the transitions out of [current].
     For every transition labeled by a symbol [sym] and into a state
     [target], ... *)
852
  
POTTIER Francois's avatar
POTTIER Francois committed
853 854 855
  Lr1.transitions current |> SymbolMap.iter (fun sym target ->
    (* ... try to follow this transition in the trie [fact.position],
       down to a child which we call [position]. *)
856
    match Trie.step sym fact.position, sym with
POTTIER Francois's avatar
POTTIER Francois committed
857 858 859 860 861 862 863

    | exception Not_found ->

        (* Could not take a step in the trie. This means this transition
           leads nowhere of interest. *)
        ()

864
    | position, Symbol.T t ->
POTTIER Francois's avatar
POTTIER Francois committed
865 866 867 868 869 870
          
        (* 1a. The transition exists in the trie, and [sym] is in fact a
           terminal symbol [t]. We note that [t] cannot be the [error] token,
           because the trie does not have any edges labeled [error]. *)
        assert (Lr1.Node.compare (Trie.current position) target = 0);
        assert (is_solid target);
871
        assert (non_error t);
872

POTTIER Francois's avatar
POTTIER Francois committed
873 874 875 876 877 878 879 880 881 882 883
        (* If the lookahead assumption [fact.lookahead] is compatible with
           [t], then we derive a new fact, where one more edge has been taken,
           and enqueue this new fact for later examination. *)
        
        (* The state [target] is solid, i.e., its incoming symbol is terminal.
           This state is always entered without consideration for the next
           lookahead symbol. Thus, we can use [any] as the lookahead assumption
           in the new fact that we produce. If we did not have [any], we would
           have to produce one fact for every possible lookahead symbol. *)

        if compatible fact.lookahead t then
884
          let word = W.append fact.word (W.singleton t) in
885
          add { position; word; lookahead = any }
886

887
    | position, Symbol.N nt ->
888

POTTIER Francois's avatar
POTTIER Francois committed
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907
        (* 1b. The transition exists in the trie, and [sym] is in fact a
           nonterminal symbol [nt]. *)
         assert (Lr1.Node.compare (Trie.current position) target = 0);
         assert (not (is_solid target));

        (* We need to know how this nonterminal edge can be taken. We query
           [E] for a word [w] that allows us to take this edge. In general,
           the answer depends on the terminal symbol [z] that comes *after*
           this word: we try all such symbols. We must make sure that the
           first symbol of the word [w.z] satisfies the lookahead assumption
           [fact.lookahead]; this is ensured by passing this information to
           [E.query]. *)

        (* It could be the case that, due to a default reduction, the answer
           to our query does not depend on [z], and we are wasting work.
           However, allowing [z] to be [any] in [E.query], and taking 
           advantage of this to increase performance, seems difficult. *)

        foreach_terminal_not_causing_an_error target (fun z ->
908
          E.query current nt fact.lookahead z (fun w ->
909
            assert (compatible fact.lookahead (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
910 911
            let word = W.append fact.word w in
            add { position; word; lookahead = z }
912 913
          )
        )
914

POTTIER Francois's avatar
POTTIER Francois committed
915
  );
916 917

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
918 919
     [fact.source] to the [current] state represents a production [prod] and
     [current] is willing to reduce this production. Then, reducing [prod]
920 921
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
POTTIER Francois's avatar
POTTIER Francois committed
922
     [fact.source]. *)
923

POTTIER Francois's avatar
POTTIER Francois committed
924 925 926 927 928 929 930 931 932 933
  let z = fact.lookahead in
  if not (Terminal.equal z any) then begin

    (* 2a. The lookahead assumption [z] is a real terminal symbol. We check
       whether [current] is willing to reduce some production [prod] on [z],
       and whether the sub-trie [fact.position] accepts [prod], which means
       that this reduction takes us back to the root of the trie. If so, we
       have discovered a new edge. *)

    match has_reduction current z with
934
    | Some prod when Trie.accepts prod fact.position ->
POTTIER Francois's avatar
POTTIER Francois committed
935
        new_edge (source fact) (Production.nt prod) fact.word z
936 937
    | _ ->
        ()
POTTIER Francois's avatar
POTTIER Francois committed
938

939 940
  end
  else begin
POTTIER Francois's avatar
POTTIER Francois committed
941 942 943 944 945

    (* 2b. The lookahead assumption is [any]. We must consider every pair
       [prod, z] such that the [current] state can reduce [prod] on [z]
       and [fact.position] accepts [prod]. *)

946 947 948
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
POTTIER Francois's avatar
POTTIER Francois committed
949 950
          (* [new_edge] does not accept [any] as its 4th parameter, so we
             must iterate over all terminal symbols. *)
951
          foreach_terminal (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
952
            new_edge (source fact) (Production.nt prod) fact.word z
953 954 955
          )
    | None ->
       TerminalMap.iter (fun z prods ->
956
         if non_error z then
957 958
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
POTTIER Francois's avatar
POTTIER Francois committed
959
             new_edge (source fact) (Production.nt prod) fact.word z
960
       ) (Lr1.reductions current)
POTTIER Francois's avatar
POTTIER Francois committed
961

962
  end
963

POTTIER Francois's avatar
POTTIER Francois committed
964 965
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
966
(* The main loop of the algorithm. *)
967

968 969 970 971 972
(* [level] is the length of [fact.word] for the facts that we are examining
   at the moment. [extracted] counts how many facts we have extracted out of
   the priority queue. [considered] counts how many of these were found to
   be new, and subsequently passed to [new_fact]. *)

POTTIER Francois's avatar
POTTIER Francois committed
973 974
let level, extracted, considered =
  ref 0, ref 0, ref 0
975

POTTIER Francois's avatar
POTTIER Francois committed
976
let done_with_level () =
977 978 979 980 981 982 983
  Printf.eprintf "Done with level %d.\n" !level;
  W.verbose();
  T.verbose();
  E.verbose();
  Printf.eprintf "Q stores %d facts.\n" (Q.cardinal q);
  Printf.eprintf "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
984

POTTIER Francois's avatar
POTTIER Francois committed
985
let () =
POTTIER Francois's avatar
POTTIER Francois committed
986 987 988
  Q.repeat q (fun fact ->
    incr extracted;
    if T.register fact then begin
989
      if X.verbose && W.length fact.word > !level then begin
POTTIER Francois's avatar
POTTIER Francois committed
990 991 992 993 994 995 996
        done_with_level();
        level := W.length fact.word;
      end;
      incr considered;
      new_fact fact
    end
  );
997 998
  if X.verbose then
    done_with_level();
POTTIER Francois's avatar
POTTIER Francois committed
999
  Time.tick "Running LRijkstra"
1000

1001 1002
(* ------------------------------------------------------------------------ *)

1003 1004 1005 1006 1007
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
1008
  Printf.eprintf "LRijkstra: internal error: %s.\n%!" msg;
1009 1010 1011
  false

let validate s s' w : bool =
1012
  let open ReferenceInterpreter in
1013
  match
1014
    check_error_path (Lr1.nt_of_entry s) (W.elements w)
1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

1032 1033 1034 1035 1036 1037 1038
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
1039 1040 1041
   module [E] above. For this purpose, we use Dijkstra's algorithm,
   unmodified. Experiments show that the running time of this phase is
   typically 10x shorter than the running time of the main loop above. *)
1042

1043
module A = Astar.Make(struct
1044

1045 1046 1047
  (* A vertex is a pair [s, z], where [z] is a real terminal symbol. *)
  type node =
      Lr1.node * Terminal.t
1048

1049 1050
  let equal (s'1, z1) (s'2, z2) =
    Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0
1051

1052 1053
  let hash (s, z) =
    Hashtbl.hash (Lr1.number s, z)
1054

1055 1056 1057
  (* An edge is labeled with a word. *)
  type label =
    W.word
1058

1059 1060 1061 1062 1063 1064 1065
  (* We search forward from every [s, z], where [s] is an initial state. *)
  let sources f =
    foreach_terminal (fun z ->
      ProductionMap.iter (fun _ s ->
        f (s, z)
      ) Lr1.entry
    )
1066

1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093
  (* The successors of [s, z] are defined as follows. *)
  let successors (s, z) edge =
    assert (Terminal.real z);
    (* For every transition out of [s], labeled [sym], leading to [s']... *)
    Lr1.transitions s |> SymbolMap.iter (fun sym s' ->
      match sym with
      | Symbol.T t ->
          if Terminal.equal z t then
            (* If [sym] is the terminal symbol [z], then this transition
               matches our lookahead assumption, so we can take it. For
               every [z'], we have an edge to [s', z'], labeled with the
               singleton word [z]. *)
            let w = W.singleton z in
            foreach_terminal (fun z' ->
              edge w 1 (s', z')
            )
      | Symbol.N nt ->
          (* If [sym] is a nonterminal symbol [nt], then we query [E]
             in order to find out which (minimal) words [w] allow us
             to take this transition. We must again try every [z'],
             and must respect the constraint that the first symbol
             of the word [w.z'] is [z]. For every [z'] and [w] that
             fulfill these requirements, we have an edge to [s', z'],
             labeled with the word [w]. *)
         foreach_terminal (fun z' ->
           E.query s nt z z' (fun w ->
             edge w (W.length w) (s', z')
1094
           )
1095 1096
         )
    )
1097

1098 1099 1100 1101 1102 1103 1104 1105 1106
  (* Algorithm A*, used with a zero estimate, is Dijkstra's algorithm.
     We have experimented with a non-zero estimate, but the performance
     increase was minimal. *)
  let estimate _ =
    0

end)

(* ------------------------------------------------------------------------ *)
1107

1108 1109
(* [explored] counts how many graph nodes we have discovered during the
   search. *)
1110

1111 1112
let explored =
  ref 0
1113

1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
(* We wish to store a set of triples [nt, w, s'], meaning that an error can be
   triggered in state [s'] by beginning in the initial state that corresponds
   to [nt] and by reading the sequence of terminal symbols [w]. We wish to
   store at most one such triple for every state [s'], so we organize the data
   as a set [domain] of states [s'] and a list [data] of triples [nt, w, s']. *)

(* We could print this data as we go, which would naturally result in sorting
   the output by increasing word sizes. However, it seems preferable to sort
   the sentences lexicographically, so that similar sentences end up close to
   one another. This is why we store a list of triples and sort it before
   printing it out. *)

let domain =
  ref Lr1.NodeSet.empty
1128

1129 1130
let data : (Nonterminal.t * W.word * Lr1.node) list ref =
  ref []
1131

1132 1133
(* The set [reachable] stores every reachable state (regardless of whether an
   error can be triggered in that state). *)
1134 1135 1136 1137

let reachable =
  ref Lr1.NodeSet.empty

1138 1139
(* [display] displays one data item. The format is that of a [.messages]
   file, which the user can then edit to customize the error messages. *)
1140

1141 1142 1143 1144
let display (nt, w, s') : unit =
  (* Print the sentence, followed with a few comments, followed with a
     blank line, followed with a proposed error message, followed with
     another blank line. *)
1145
  Printf.printf
1146
    "%s: %s\n# Length: %d\n# Leads to an error in state: %d.\n%s\n%s\n"
1147 1148
    (Nonterminal.print false nt)
    (W.print w)
1149 1150
    (W.length w)
    (Lr1.number s')
1151
    (Lr0.print "# " (Lr1.state s')) (* TEMPORARY [print] or [print_closure]? *)
1152
    Interpret.default_message
1153

1154 1155 1156 1157 1158
(* Perform the forward search. *)

let _, _ =
  A.search (fun ((s', z), path) ->
    incr explored;
1159
    reachable := Lr1.NodeSet.add s' !reachable;
1160 1161
    (* If [z] causes an error in state [s'] and this is the first time
       we are able to trigger an error in this state, ... *)
1162
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !domain) then begin
1163 1164
      (* Reconstruct the initial state [s] and the word [w] that lead
         to this error. *)
1165 1166
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
1167 1168 1169 1170
      (* Check that the reference interpreter confirms our finding. *)
      assert (validate s s' w);
      (* Store this new data. *)
      let nt = Lr1.nt_of_entry s in
1171 1172
      domain := Lr1.NodeSet.add s' !domain;
      data := (nt, w, s') :: !data
1173
    end
1174
  )
1175

1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188
(* Sort and output the data. *)

let () =
  let compare (nt1, w1, _) (nt2, w2, _) =
    let c = Nonterminal.compare nt1 nt2 in
    if c <> 0 then c else W.compare w2 w1
  in
  List.iter display (List.fast_sort compare !data)

(* ------------------------------------------------------------------------ *)

(* Verbosity. *)

1189 1190 1191 1192 1193 1194 1195
let max_heap_size =
  if X.verbose || X.statistics <> None then
    let stat = Gc.quick_stat() in
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024)
  else
    0 (* dummy *)

1196
let () =
1197
  Time.tick "Forward search";
1198 1199 1200
  if X.verbose then begin
    Printf.eprintf
      "%d graph nodes explored by forward search.\n\
1201
       %d out of %d states are reachable.\n\
1202 1203
       Found %d states where an error can occur.\n\
       Maximum size reached by the major heap: %dM\n%!"
1204
    !explored
1205
    (Lr1.NodeSet.cardinal !reachable) Lr1.n
1206
    (Lr1.NodeSet.cardinal !domain)
1207
    max_heap_size
1208
  end
1209

1210 1211
(* ------------------------------------------------------------------------ *)

1212
(* If requested by the client, write one line of statistics to a .csv file. *)
1213

1214 1215
let stop =
  now()
1216 1217

let () =
1218 1219
  X.statistics |> Option.iter (fun filename ->
    let c = open_out_gen [ Open_creat; Open_append; Open_text ] 0o644 filename in
1220
    Printf.fprintf c
1221
      "%s,%d,%d,%d,%d,%d,%d,%d,%.2f,%d\n%!"
1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247
      (* Grammar name. *)
      Settings.base
      (* Number of terminal symbols. *)
      Terminal.n
      (* Number of nonterminal symbols. *)
      Nonterminal.n
      (* Grammar size (not counting the error productions). *)
      begin
        Production.foldx (fun prod accu ->
          let rhs = Production.rhs prod in
          if List.mem (Symbol.T Terminal.error) (Array.to_list rhs) then
            accu
          else
            accu + Array.length rhs
        ) 0
      end
      (* Automaton size (i.e., number of states). *)
      Lr1.n
      (* Cumulated trie size. *)
      (Trie.cumulated_size())
      (* Size of [T]. *)
      (T.size())
      (* Size of [E]. *)
      (E.size())
      (* Elapsed user time, in seconds. *)
      (stop -. start)
1248 1249
      (* Max heap size, in megabytes. *)
      max_heap_size
1250 1251
    ;
    close_out c
1252
  )
1253 1254 1255

(* ------------------------------------------------------------------------ *)

1256
end
1257