LRijkstra.ml 34.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
   on the actual shortest path to the error at [s, z]. Indeed, two
   difficulties arise:

   - Some states have a default reduction. Thus, they will not trigger
     an error, even though they should. The error is triggered in some
     other state, after reduction takes place.

   - If the grammar has conflicts, conflict resolution removes some
     (shift or reduce) actions, hence may suppress the shortest path. *)

(* We explicitly choose to ignore the [error] token. If the grammar mentions
   [error] and if some state is reachable only via an [error] transition, too
   bad: we report this state as unreachable. It would be too complicated to
   have to create a first error in order to be able to take certain
   transitions or drop certain parts of the input. *)

31 32 33 34 35 36 37
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
   in a big functor without arguments. *)

module Run (X : sig end) = struct

38
open Grammar
39 40 41 42 43 44 45 46

(* ------------------------------------------------------------------------ *)

(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
47

48 49
(* ------------------------------------------------------------------------ *)

50
(* Throughout, we ignore the [error] pseudo-token completely. We consider that
51 52 53 54 55 56
   it never appears on the input stream. Thus, we disregard any reductions or
   transitions that take place when the lookahead symbol is [error]. As a
   result, any state whose incoming symbol is [error] is found unreachable. *)

let regular z =
  not (Terminal.equal z Terminal.error)
57

58 59 60 61 62 63 64 65
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

let any : Terminal.t =
  assert (Terminal.n < 255); (* TEMPORARY *)
  Obj.magic Terminal.n       (* TEMPORARY *)

66 67
(* ------------------------------------------------------------------------ *)

68 69 70 71 72
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
73 74 75
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
76

POTTIER Francois's avatar
POTTIER Francois committed
77
let reductions_on s z : Production.index list =
78
  assert (regular z && z <> any);
79 80 81 82 83
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

84 85 86
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
87 88

let has_reduction s z : Production.index option =
89
  assert (z <> any);
90 91 92 93
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
94
      match reductions_on s z with
95 96 97 98 99 100
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

101 102 103 104 105 106 107 108 109
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
110 111
      TerminalMap.fold (fun z prods accu ->
        accu || regular z && List.mem prod prods
112 113
      ) (Lr1.reductions s) false

114 115
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
116

117
let causes_an_error s z : bool =
118
  assert (regular z && z <> any);
119 120 121 122
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
123
      reductions_on s z = [] &&
124 125
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

126 127 128
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
   turn, except [error]. *)

129 130
let foreach_terminal f =
  Terminal.iter (fun t ->
131
    if regular t then
132 133 134
      f t
  )

135 136 137 138 139
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

140 141 142
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
143
      (* There is a default reduction. No symbol causes an error. *)
144 145
      foreach_terminal f
  | None ->
146 147 148
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
149
        if regular z then
150
          f z
151
      ) (Lr1.reductions s);
152 153
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
154 155
      SymbolMap.iter (fun sym _ ->
        match sym with
156
        | Symbol.T z ->
157
            if regular z then
158
              f z
159 160 161 162
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

163 164 165 166 167 168 169 170 171 172 173 174 175
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
   (or if it has no incoming symbol at all, i.e., it is an initial state). A
   contrario, a state is fragile if its incoming symbol is a non-terminal
   symbol. *)

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
      true
  | Some (Symbol.N _) ->
      false

176 177
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
178 179 180 181 182 183 184 185 186 187 188 189
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

190 191 192
module Trie : sig

  type trie
193 194 195 196

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
197
     edge.) If the star turns out to be trivial then [None] is returned. *)
198 199
  val star: Lr1.node -> trie option

200 201 202 203
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

204 205 206 207 208 209 210 211
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
212
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
213 214

  (* [current t] returns the current state of the (sub-)trie [t]. This is
215 216
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
217
  val current: trie -> Lr1.node
218

219 220 221 222 223
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
224

225 226 227 228 229 230 231 232
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
233

234 235
  (* A trie has the following structure. *)

236
  type trie = {
237 238 239
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
240
    identity: int;
241
    (* The root state of this star: "where we come from". *)
242
    source: Lr1.node;
243
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
244
    current: Lr1.node;
245 246 247
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
248
    productions: Production.index list;
249 250
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
251
  }
252

253
  (* This counter is used by [mktrie] to produce unique identities. *)
254 255
  let c = ref 0

256
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
257
  let mktrie source current productions transitions =
258
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
259
    { identity; source; current; productions; transitions }
260

261 262
  exception DeadBranch

263
  let rec insert w prod t =
264 265
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
266
        (* We check whether the current state [t.current] is able to reduce
267 268 269 270 271
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
272
        if can_reduce t.current prod then
273 274 275 276 277 278
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
279 280
    | (Symbol.T t) :: _ when not (regular t) ->
         raise DeadBranch
281
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
282
        (* Check if there is a transition labeled [a] out of [t.current]. If
283 284
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
285 286 287
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
288
        match SymbolMap.find a (Lr1.transitions t.current) with
289
        | successor ->
290 291 292 293 294 295 296 297
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
298
            let t' = insert w prod t' in
299 300
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
301
        | exception Not_found ->
302
            raise DeadBranch
303

304 305 306 307 308
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
309 310 311 312 313 314
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
315

316 317 318 319 320 321 322 323 324 325 326 327 328
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

329
  (* [nontrivial t] tests whether the trie [t] has any branches, i.e.,
330 331 332 333
     contains at least one sub-trie whose [productions] field is nonempty.
     Trivia: a trie of size greater than 1 is necessarily nontrivial, but the
     converse is not true: a nontrivial trie can have size 1. (This occurs
     when all productions have zero length.) *)
334 335 336
  let nontrivial t =
    not (t.productions = [] && SymbolMap.is_empty t.transitions)

337 338 339 340 341 342
  (* Redefine [star] to include a [nontrivial] test and to record the size
     of the newly built trie. *)

  let size =
    Array.make Lr1.n (-1)

343
  let star s =
344
    let initial = !c in
345
    let t = star s in
346 347
    let final = !c in
    size.(Lr1.number s) <- final - initial;
348 349 350 351 352
    if nontrivial t then
      Some t
    else
      None

353 354 355 356
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
357 358
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
359

360 361 362
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
363 364
  let current t =
    t.current
365

366 367 368
  let accepts prod t =
    List.mem prod t.productions

369
  let step a t =
370
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
371

372
  let verbose () =
373
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
374

375 376
end

POTTIER Francois's avatar
POTTIER Francois committed
377 378
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
379 380 381 382 383 384
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

385 386
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
387

388
type fact = {
389
  position: Trie.trie;
390
  word: W.word;
391
  lookahead: Terminal.t (* may be [any] *)
392 393
}

394 395
(* Accessors. *)

396
let source fact =
397
  Trie.source fact.position
398

POTTIER Francois's avatar
POTTIER Francois committed
399
let current fact =
400
  Trie.current fact.position
401

402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
(* Two invariants reduce the number of facts that we consider:

   1. If [fact.lookahead] is a terminal symbol [z] (i.e., not [any]), then
      [z] does not cause an error in the current state [current fact]. It
      would be useless to consider a fact that violates this property; it
      cannot possibly lead to a successful reduction.

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
  assert (a <> any);
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
429
(* ------------------------------------------------------------------------ *)
430

431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* We never insert into the queue a fact that immediately causes an error,
   i.e., a fact such that [causes_an_error (current fact) fact.lookahead]
   holds. In practice, this convention allows reducing the number of facts
   that go through the queue by a factor of two. *)

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
459 460
  assert (invariant1 fact);
  assert (invariant2 fact);
461 462 463 464 465 466 467 468 469
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
  Lr1.iter (fun s ->
    match Trie.star s with
    | Some trie ->
470 471
        (* TEMPORARY weird *)
        if is_solid s then
472 473 474
          add {
            position = trie;
            word = W.epsilon;
475
            lookahead = any
476
          }
477 478 479 480 481 482 483 484
        else
          foreach_terminal_not_causing_an_error s (fun z ->
            add {
              position = trie;
              word = W.epsilon;
              lookahead = z
            }
          )
485 486 487 488 489 490
    | None ->
        ()
  )

(* ------------------------------------------------------------------------ *)

491 492 493 494 495 496 497 498 499 500
(* The first symbol of the input, [W.first fact.word fact.lookahead], plays a
   special role. Indeed, for every position, for every first symbol, and for
   every lookahead symbol, we keep track of at most one fact. Thus, the total
   number of facts accumulated by the algorithm is at most [T.n^2], where [T]
   is the total size of the tries that we have constructed, and [n] is the
   number of terminal symbols. (This number can be quite large. [T] can be in
   the tens of thousands, and [n] can be over one hundred. These figures lead
   to a theoretical upper bound of 100M. In practice, for T=25K and n=108, we
   observe that the algorithm gathers about 7M facts.) *)

501
module T : sig
502 503

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
504 505
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
506 507
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
508 509
  (* [query current z f] enumerates all known facts whose current state is
     [current] and whose lookahead assumption is [z]. *)
510
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
511

512
  val verbose: unit -> unit
513

514
end = struct
515

516
  (* This module implements a set of facts. Two facts are considered equal
517
     (for the purposes of this set) if they have the same [position], [a], and
518 519 520 521 522 523
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
POTTIER Francois's avatar
POTTIER Francois committed
524 525
     that match a pair [current, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [current] and [z]. At the second
526
     level, we find sets of facts. *)
527 528 529
(**)

  module M =
530
    MySet.Make(struct
531 532
      type t = fact
      let compare fact1 fact2 =
533
        let c = Trie.compare fact1.position fact2.position in
534
        if c <> 0 then c else
535 536
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
537
        (* note: [a1] and [a2] can be [any] here *)
538 539
        Terminal.compare a1 a2
    end)
540

541
  let table = (* a pretty large table... *)
542 543
    Array.make (Lr1.n * (Terminal.n + 1)) M.empty (* room for [any] *)
  (* TEMPORARY this space is wasted for solid states *)
544

POTTIER Francois's avatar
POTTIER Francois committed
545
  let index current z =
546
    (Terminal.n + 1) * (Lr1.number current) + Terminal.t2i z
547

548 549
  let count = ref 0

550
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
551
    let current = current fact in
552
    let z = fact.lookahead in
553 554
    (* [z] is [any] iff [current] is solid. *)
    assert ((z = any) = is_solid current);
POTTIER Francois's avatar
POTTIER Francois committed
555
    let i = index current z in
556 557 558 559 560 561 562 563 564 565 566 567
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
568
  let query current z f =
569 570 571 572
    assert (z <> any);
    (* if [current] is solid then the facts that concern it are stored
       under any [any], not under [z] *)
    let i = index current (if is_solid current then any else z) in
573 574
    let m = table.(i) in
    M.iter f m
575

576
  let verbose () =
577 578
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

579 580
end

POTTIER Francois's avatar
POTTIER Francois committed
581 582
(* ------------------------------------------------------------------------ *)

583 584 585 586 587 588 589
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
590 591 592
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
593 594 595 596

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
597 598
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
599

600
  val verbose: unit -> unit
601

602 603
end = struct

604 605 606 607 608 609 610 611 612 613
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
614

615
  module H = Hashtbl
616

617
  let table = (* a pretty large table... *)
618 619 620 621
    Array.init (Lr1.n) (fun i ->
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
622 623 624

  let index s =
    Lr1.number s
625

626 627 628 629
  let pack nt a z : int =
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
630

631 632
  let count = ref 0

633
  let register s nt w z =
634
    assert (regular z && z <> any);
635
    let i = index s in
636
    let m = table.(i) in
637
    let a = W.first w z in
638
    assert (not (causes_an_error s a));
639 640 641 642
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
643
      incr count;
644
      H.add m key w;
645 646
      true
    end
647

648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668
  let rec query s nt a z f =
    assert (regular z && z <> any);
    (* [a] can be [any] *)
    if a <> any then begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
    end
    else begin
      (* If [a] is [any], we query the table for every concrete [a].
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
        (* TEMPORARY try a scheme that allows a more efficient iteration? *)
    end
669

670
  let verbose () =
671 672
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

673 674
end

POTTIER Francois's avatar
POTTIER Francois committed
675 676
(* ------------------------------------------------------------------------ *)

677
let new_edge s nt w z =
678
  assert (regular z && z <> any);
679
  if E.register s nt w z then
680
    let sym = Symbol.N nt in
681
    T.query s (W.first w z) (fun fact ->
682
      assert (compatible fact.lookahead (W.first w z));
683 684
      match Trie.step sym fact.position with
      | position ->
685
          assert (not (is_solid (Trie.current position)));
686
          if not (causes_an_error (Trie.current position) z) then
687
            add {
688
              position;
689 690 691 692 693
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
694
    )
695 696 697 698 699 700 701 702 703 704

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
POTTIER Francois's avatar
POTTIER Francois committed
705 706
   This is the case when the word from [fact.source] to [fact.current]
   represents a production of the grammar and [fact.current] is willing
707 708 709 710 711 712 713 714
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

POTTIER Francois's avatar
POTTIER Francois committed
715
  let current = current fact in
716

POTTIER Francois's avatar
POTTIER Francois committed
717
  (* 1. View [fact] as a vertex. Examine the transitions out of [current]. *)
718
  
719
  SymbolMap.iter (fun sym s' ->
720
    match Trie.step sym fact.position, sym with
721
    | exception Not_found -> ()
722
    | position, Symbol.T t ->
723 724 725
        (* [t] cannot be the [error] token, because the trie does not have
           any edges labeled [error]. *)
        assert (regular t);
726

POTTIER Francois's avatar
POTTIER Francois committed
727
        (* 1a. There is a transition labeled [t] out of [current]. If
728 729 730 731 732
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

733
        if compatible fact.lookahead t then begin
734
          let word = W.append fact.word (W.singleton t) in
735 736 737 738 739 740 741 742
          (* assert (Lr1.Node.compare (Trie.current position) s' = 0); *)
          (* [s'] has a terminal incoming symbol. It is always entered
             without consideration for the next lookahead symbol. Thus,
             we use [any] as the lookahead assumption in the new fact
             that we produce. *)
          assert (is_solid (Trie.current position));
          add { position; word; lookahead = any }
        end
743

744
    | position, Symbol.N nt ->
745

POTTIER Francois's avatar
POTTIER Francois committed
746
        (* 1b. There is a transition labeled [nt] out of [current]. We
747 748 749 750 751 752
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
753 754
        (* TEMPORARY it could be that the answer does not depend on [z]...
           (default reduction) *)
755 756 757
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
758
          E.query current nt fact.lookahead z (fun w ->
759 760
            assert (compatible fact.lookahead (W.first w z));
            assert (not (is_solid (Trie.current position)));
761
            add {
762
              position;
763 764 765 766 767
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
768

POTTIER Francois's avatar
POTTIER Francois committed
769
  ) (Lr1.transitions current);
770 771

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
772 773
     [fact.source] to [current] represents a production [prod] and
     [current] is willing to reduce this production. We check that
774
     [fact.position] accepts [epsilon]. This guarantees that reducing [prod]
775 776 777 778
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
779 780
  (**)

781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804
  if fact.lookahead <> any then begin
    match has_reduction current fact.lookahead with
    | Some prod when Trie.accepts prod fact.position ->
        new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
    | _ ->
        ()
  end
  else begin
    (* Every reduction must be considered. *)
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
          (* TEMPORARY for now, avoid sending [any] into [new_edge] *)
          foreach_terminal (fun z ->
            new_edge (source fact) (Production.nt prod) (fact.word) z
          )
    | None ->
       TerminalMap.iter (fun z prods ->
         if regular z then
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
             new_edge (source fact) (Production.nt prod) (fact.word) z
       ) (Lr1.reductions current)
  end
805

806
let level = ref 0
807

POTTIER Francois's avatar
POTTIER Francois committed
808 809 810
let extracted, considered =
  ref 0, ref 0

POTTIER Francois's avatar
POTTIER Francois committed
811 812
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
813
  W.verbose();
814 815
  T.verbose();
  E.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
816 817 818
  Printf.fprintf stderr "Q stores %d facts.\n" (Q.cardinal q);
  Printf.fprintf stderr "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
POTTIER Francois's avatar
POTTIER Francois committed
819

820
let discover fact =
POTTIER Francois's avatar
POTTIER Francois committed
821
  incr extracted;
822
  if T.register fact then begin
823
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
824
      done_with_level();
825 826
      level := W.length fact.word;
    end;
POTTIER Francois's avatar
POTTIER Francois committed
827
    incr considered;
828
    consequences fact
829
  end
830

POTTIER Francois's avatar
POTTIER Francois committed
831
let () =
832
  Trie.verbose();
833
  Q.repeat q discover;
834
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
835
  done_with_level()
836

837 838
(* ------------------------------------------------------------------------ *)

839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

869 870 871 872 873 874 875
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
876
   module [E] above. *)
877

878
let forward () =
879

880
  let module A = Astar.Make(struct
881

882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
903 904
      )

905
    let successors (s, z) edge =
906
      assert (regular z);
907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

930
  Printf.fprintf stderr "Forward search:\n%!";
931
  let seen = ref Lr1.NodeSet.empty in
932
  let _, _ = A.search (fun ((s', z), path) ->
933 934 935 936 937 938 939 940 941 942 943 944 945 946
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
947 948 949
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
950 951

let () =
952 953
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
954 955 956 957
  let stat = Gc.quick_stat() in
  Printf.fprintf stderr
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);
958
  ignore f
959 960

(* TODO:
POTTIER Francois's avatar
POTTIER Francois committed
961
  can we store fewer facts when we hit a default reduction?
962
  remove CompletedNatWitness?, revert Fix
963
  collect performance data, correlated with star size and alphabet size; draw a graph
POTTIER Francois's avatar
POTTIER Francois committed
964
  count the unreachable states and see if they are numerous in practice
POTTIER Francois's avatar
POTTIER Francois committed
965 966
  optionally report several ways of reaching an error in state s
    (with different lookahead tokens) (report all of them?)
967 968 969
  warn if --list-errors is set AND the grammar uses [error]
  remove $syntaxerror?
  how do we maintain the list of error messages when the grammar evolves?
970 971
  implement a naive semi-algorithm that enumerates all input sentences,
    and evaluate how well (or how badly) it scales
972
*)
POTTIER Francois's avatar
POTTIER Francois committed
973

974
end