LRijkstra.ml 35 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
   on the actual shortest path to the error at [s, z]. Indeed, two
   difficulties arise:

   - Some states have a default reduction. Thus, they will not trigger
     an error, even though they should. The error is triggered in some
     other state, after reduction takes place.

   - If the grammar has conflicts, conflict resolution removes some
     (shift or reduce) actions, hence may suppress the shortest path. *)

(* We explicitly choose to ignore the [error] token. If the grammar mentions
   [error] and if some state is reachable only via an [error] transition, too
   bad: we report this state as unreachable. It would be too complicated to
   have to create a first error in order to be able to take certain
   transitions or drop certain parts of the input. *)

31 32 33 34 35 36 37
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
   in a big functor without arguments. *)

module Run (X : sig end) = struct

38
open Grammar
39 40 41

(* ------------------------------------------------------------------------ *)

42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
(* Because of our encoding of terminal symbols as 8-bit characters, and
   because of the need to make room for [any], this algorithm supports
   at most 255 terminal symbols. *)

(* We could perhaps make this limit 256 by encoding [any] as [error] or [#],
   but that sounds dangerous. *)

let () =
  if Terminal.n > 255 then
    Error.error [] (Printf.sprintf
      "--list-errors supports at most 255 terminal symbols.\n\
       The grammar has %d terminal symbols." Terminal.n
    )

(* ------------------------------------------------------------------------ *)

58 59 60 61 62
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
63

64 65
(* ------------------------------------------------------------------------ *)

66
(* Throughout, we ignore the [error] pseudo-token completely. We consider that
67 68 69 70 71 72
   it never appears on the input stream. Thus, we disregard any reductions or
   transitions that take place when the lookahead symbol is [error]. As a
   result, any state whose incoming symbol is [error] is found unreachable. *)

let regular z =
  not (Terminal.equal z Terminal.error)
73

74 75 76 77 78 79 80 81
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

let any : Terminal.t =
  assert (Terminal.n < 255); (* TEMPORARY *)
  Obj.magic Terminal.n       (* TEMPORARY *)

82 83
(* ------------------------------------------------------------------------ *)

84 85 86 87 88
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
89 90 91
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
92

POTTIER Francois's avatar
POTTIER Francois committed
93
let reductions_on s z : Production.index list =
94
  assert (regular z && z <> any);
95 96 97 98 99
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

100 101 102
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
103 104

let has_reduction s z : Production.index option =
105
  assert (z <> any);
106 107 108 109
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
110
      match reductions_on s z with
111 112 113 114 115 116
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

117 118 119 120 121 122 123 124 125
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
126
      TerminalMap.fold (fun z prods accu ->
127 128
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
129
        accu || regular z && List.mem prod prods
130 131
      ) (Lr1.reductions s) false

132 133
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
134

135
let causes_an_error s z : bool =
136
  assert (regular z && z <> any);
137 138 139 140
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
141
      reductions_on s z = [] &&
142 143
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

144 145 146
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
   turn, except [error]. *)

147 148
let foreach_terminal f =
  Terminal.iter (fun t ->
149
    if regular t then
150 151 152
      f t
  )

153 154 155 156 157
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

158 159 160
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
161
      (* There is a default reduction. No symbol causes an error. *)
162 163
      foreach_terminal f
  | None ->
164 165 166
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
167
        if regular z then
168
          f z
169
      ) (Lr1.reductions s);
170 171
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
172 173
      SymbolMap.iter (fun sym _ ->
        match sym with
174
        | Symbol.T z ->
175
            if regular z then
176
              f z
177 178 179 180
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

181 182 183 184 185 186 187 188 189 190 191 192 193
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
   (or if it has no incoming symbol at all, i.e., it is an initial state). A
   contrario, a state is fragile if its incoming symbol is a non-terminal
   symbol. *)

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
      true
  | Some (Symbol.N _) ->
      false

194 195
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
196 197 198 199 200 201 202 203 204 205 206 207
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

208 209 210
module Trie : sig

  type trie
211 212 213 214

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
215
     edge.) If the star turns out to be trivial then [None] is returned. *)
216 217
  val star: Lr1.node -> trie option

218 219 220 221
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

222 223 224 225 226 227 228 229
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
230
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
231 232

  (* [current t] returns the current state of the (sub-)trie [t]. This is
233 234
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
235
  val current: trie -> Lr1.node
236

237 238 239 240 241
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
242

243 244 245 246 247 248 249 250
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
251

252 253
  (* A trie has the following structure. *)

254
  type trie = {
255 256 257
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
258
    identity: int;
259
    (* The root state of this star: "where we come from". *)
260
    source: Lr1.node;
261
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
262
    current: Lr1.node;
263 264 265
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
266
    productions: Production.index list;
267 268
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
269
  }
270

271
  (* This counter is used by [mktrie] to produce unique identities. *)
272 273
  let c = ref 0

274
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
275
  let mktrie source current productions transitions =
276
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
277
    { identity; source; current; productions; transitions }
278

279 280
  exception DeadBranch

281
  let rec insert w prod t =
282 283
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
284
        (* We check whether the current state [t.current] is able to reduce
285 286 287 288 289
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
290
        if can_reduce t.current prod then
291 292 293 294 295 296
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
297 298
    | (Symbol.T t) :: _ when not (regular t) ->
         raise DeadBranch
299
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
300
        (* Check if there is a transition labeled [a] out of [t.current]. If
301 302
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
303 304 305
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
306
        match SymbolMap.find a (Lr1.transitions t.current) with
307
        | successor ->
308 309 310 311 312 313 314 315
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
316
            let t' = insert w prod t' in
317 318
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
319
        | exception Not_found ->
320
            raise DeadBranch
321

322 323 324 325 326
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
327 328 329 330 331 332
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
333

334 335 336 337 338 339 340 341 342 343 344 345 346
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

347
  (* [nontrivial t] tests whether the trie [t] has any branches, i.e.,
348 349 350 351
     contains at least one sub-trie whose [productions] field is nonempty.
     Trivia: a trie of size greater than 1 is necessarily nontrivial, but the
     converse is not true: a nontrivial trie can have size 1. (This occurs
     when all productions have zero length.) *)
352 353 354
  let nontrivial t =
    not (t.productions = [] && SymbolMap.is_empty t.transitions)

355 356 357 358 359 360
  (* Redefine [star] to include a [nontrivial] test and to record the size
     of the newly built trie. *)

  let size =
    Array.make Lr1.n (-1)

361
  let star s =
362
    let initial = !c in
363
    let t = star s in
364 365
    let final = !c in
    size.(Lr1.number s) <- final - initial;
366 367 368 369 370
    if nontrivial t then
      Some t
    else
      None

371 372 373 374
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
375 376
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
377

378 379 380
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
381 382
  let current t =
    t.current
383

384 385 386
  let accepts prod t =
    List.mem prod t.productions

387
  let step a t =
388
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
389

390
  let verbose () =
391
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
392

393 394
end

POTTIER Francois's avatar
POTTIER Francois committed
395 396
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
397 398 399 400 401 402
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

403 404
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
405

406
type fact = {
407
  position: Trie.trie;
408
  word: W.word;
409
  lookahead: Terminal.t (* may be [any] *)
410 411
}

412 413
(* Accessors. *)

414
let source fact =
415
  Trie.source fact.position
416

POTTIER Francois's avatar
POTTIER Francois committed
417
let current fact =
418
  Trie.current fact.position
419

420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446
(* Two invariants reduce the number of facts that we consider:

   1. If [fact.lookahead] is a terminal symbol [z] (i.e., not [any]), then
      [z] does not cause an error in the current state [current fact]. It
      would be useless to consider a fact that violates this property; it
      cannot possibly lead to a successful reduction.

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
  assert (a <> any);
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
447
(* ------------------------------------------------------------------------ *)
448

449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* We never insert into the queue a fact that immediately causes an error,
   i.e., a fact such that [causes_an_error (current fact) fact.lookahead]
   holds. In practice, this convention allows reducing the number of facts
   that go through the queue by a factor of two. *)

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
477 478
  assert (invariant1 fact);
  assert (invariant2 fact);
479 480 481 482 483 484 485 486 487
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
  Lr1.iter (fun s ->
    match Trie.star s with
    | Some trie ->
488 489
        (* TEMPORARY weird *)
        if is_solid s then
490 491 492
          add {
            position = trie;
            word = W.epsilon;
493
            lookahead = any
494
          }
495 496 497 498 499 500 501 502
        else
          foreach_terminal_not_causing_an_error s (fun z ->
            add {
              position = trie;
              word = W.epsilon;
              lookahead = z
            }
          )
503 504 505 506 507 508
    | None ->
        ()
  )

(* ------------------------------------------------------------------------ *)

509 510 511 512 513 514 515 516 517 518
(* The first symbol of the input, [W.first fact.word fact.lookahead], plays a
   special role. Indeed, for every position, for every first symbol, and for
   every lookahead symbol, we keep track of at most one fact. Thus, the total
   number of facts accumulated by the algorithm is at most [T.n^2], where [T]
   is the total size of the tries that we have constructed, and [n] is the
   number of terminal symbols. (This number can be quite large. [T] can be in
   the tens of thousands, and [n] can be over one hundred. These figures lead
   to a theoretical upper bound of 100M. In practice, for T=25K and n=108, we
   observe that the algorithm gathers about 7M facts.) *)

519
module T : sig
520 521

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
522 523
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
524 525
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
526 527
  (* [query current z f] enumerates all known facts whose current state is
     [current] and whose lookahead assumption is [z]. *)
528
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
529

530
  val verbose: unit -> unit
531

532
end = struct
533

534
  (* This module implements a set of facts. Two facts are considered equal
535
     (for the purposes of this set) if they have the same [position], [a], and
536 537 538 539 540 541
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
POTTIER Francois's avatar
POTTIER Francois committed
542 543
     that match a pair [current, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [current] and [z]. At the second
544
     level, we find sets of facts. *)
545 546 547
(**)

  module M =
548
    MySet.Make(struct
549 550
      type t = fact
      let compare fact1 fact2 =
551
        let c = Trie.compare fact1.position fact2.position in
552
        if c <> 0 then c else
553 554
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
555
        (* note: [a1] and [a2] can be [any] here *)
556 557
        Terminal.compare a1 a2
    end)
558

559
  let table = (* a pretty large table... *)
560 561
    Array.make (Lr1.n * (Terminal.n + 1)) M.empty (* room for [any] *)
  (* TEMPORARY this space is wasted for solid states *)
562

POTTIER Francois's avatar
POTTIER Francois committed
563
  let index current z =
564
    (Terminal.n + 1) * (Lr1.number current) + Terminal.t2i z
565

566 567
  let count = ref 0

568
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
569
    let current = current fact in
570
    let z = fact.lookahead in
571 572
    (* [z] is [any] iff [current] is solid. *)
    assert ((z = any) = is_solid current);
POTTIER Francois's avatar
POTTIER Francois committed
573
    let i = index current z in
574 575 576 577 578 579 580 581 582 583 584 585
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
586
  let query current z f =
587 588 589 590
    assert (z <> any);
    (* if [current] is solid then the facts that concern it are stored
       under any [any], not under [z] *)
    let i = index current (if is_solid current then any else z) in
591 592
    let m = table.(i) in
    M.iter f m
593

594
  let verbose () =
595 596
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

597 598
end

POTTIER Francois's avatar
POTTIER Francois committed
599 600
(* ------------------------------------------------------------------------ *)

601 602 603 604 605 606 607
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
608 609 610
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
611 612 613 614

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
615 616
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
617

618
  val verbose: unit -> unit
619

620 621
end = struct

622 623 624 625 626 627 628 629 630 631
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
632

633
  module H = Hashtbl
634

635
  let table = (* a pretty large table... *)
636 637 638 639
    Array.init (Lr1.n) (fun i ->
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
640 641 642

  let index s =
    Lr1.number s
643

644 645 646 647
  let pack nt a z : int =
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
648

649 650
  let count = ref 0

651
  let register s nt w z =
652
    assert (regular z && z <> any);
653
    let i = index s in
654
    let m = table.(i) in
655
    let a = W.first w z in
656
    assert (not (causes_an_error s a));
657 658 659 660
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
661
      incr count;
662
      H.add m key w;
663 664
      true
    end
665

666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
  let rec query s nt a z f =
    assert (regular z && z <> any);
    (* [a] can be [any] *)
    if a <> any then begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
    end
    else begin
      (* If [a] is [any], we query the table for every concrete [a].
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
        (* TEMPORARY try a scheme that allows a more efficient iteration? *)
    end
687

688
  let verbose () =
689 690
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

691 692
end

POTTIER Francois's avatar
POTTIER Francois committed
693 694
(* ------------------------------------------------------------------------ *)

695
let new_edge s nt w z =
696
  assert (regular z && z <> any);
697
  if E.register s nt w z then
698
    let sym = Symbol.N nt in
699
    T.query s (W.first w z) (fun fact ->
700
      assert (compatible fact.lookahead (W.first w z));
701 702
      match Trie.step sym fact.position with
      | position ->
703
          assert (not (is_solid (Trie.current position)));
704
          if not (causes_an_error (Trie.current position) z) then
705
            add {
706
              position;
707 708 709 710 711
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
712
    )
713 714 715 716 717 718 719 720 721 722

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
POTTIER Francois's avatar
POTTIER Francois committed
723 724
   This is the case when the word from [fact.source] to [fact.current]
   represents a production of the grammar and [fact.current] is willing
725 726 727 728 729 730 731 732
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

POTTIER Francois's avatar
POTTIER Francois committed
733
  let current = current fact in
734

POTTIER Francois's avatar
POTTIER Francois committed
735
  (* 1. View [fact] as a vertex. Examine the transitions out of [current]. *)
736
  
737
  SymbolMap.iter (fun sym s' ->
738
    match Trie.step sym fact.position, sym with
739
    | exception Not_found -> ()
740
    | position, Symbol.T t ->
741 742 743
        (* [t] cannot be the [error] token, because the trie does not have
           any edges labeled [error]. *)
        assert (regular t);
744

POTTIER Francois's avatar
POTTIER Francois committed
745
        (* 1a. There is a transition labeled [t] out of [current]. If
746 747 748 749 750
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

751
        if compatible fact.lookahead t then begin
752
          let word = W.append fact.word (W.singleton t) in
753 754 755 756 757 758 759 760
          (* assert (Lr1.Node.compare (Trie.current position) s' = 0); *)
          (* [s'] has a terminal incoming symbol. It is always entered
             without consideration for the next lookahead symbol. Thus,
             we use [any] as the lookahead assumption in the new fact
             that we produce. *)
          assert (is_solid (Trie.current position));
          add { position; word; lookahead = any }
        end
761

762
    | position, Symbol.N nt ->
763

POTTIER Francois's avatar
POTTIER Francois committed
764
        (* 1b. There is a transition labeled [nt] out of [current]. We
765 766 767 768 769 770
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
771 772
        (* TEMPORARY it could be that the answer does not depend on [z]...
           (default reduction) *)
773 774 775
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
776
          E.query current nt fact.lookahead z (fun w ->
777 778
            assert (compatible fact.lookahead (W.first w z));
            assert (not (is_solid (Trie.current position)));
779
            add {
780
              position;
781 782 783 784 785
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
786

POTTIER Francois's avatar
POTTIER Francois committed
787
  ) (Lr1.transitions current);
788 789

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
790 791
     [fact.source] to [current] represents a production [prod] and
     [current] is willing to reduce this production. We check that
792
     [fact.position] accepts [epsilon]. This guarantees that reducing [prod]
793 794 795 796
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
797 798
  (**)

799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822
  if fact.lookahead <> any then begin
    match has_reduction current fact.lookahead with
    | Some prod when Trie.accepts prod fact.position ->
        new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
    | _ ->
        ()
  end
  else begin
    (* Every reduction must be considered. *)
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
          (* TEMPORARY for now, avoid sending [any] into [new_edge] *)
          foreach_terminal (fun z ->
            new_edge (source fact) (Production.nt prod) (fact.word) z
          )
    | None ->
       TerminalMap.iter (fun z prods ->
         if regular z then
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
             new_edge (source fact) (Production.nt prod) (fact.word) z
       ) (Lr1.reductions current)
  end
823

824
let level = ref 0
825

POTTIER Francois's avatar
POTTIER Francois committed
826 827 828
let extracted, considered =
  ref 0, ref 0

POTTIER Francois's avatar
POTTIER Francois committed
829 830
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
831
  W.verbose();
832 833
  T.verbose();
  E.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
834 835 836
  Printf.fprintf stderr "Q stores %d facts.\n" (Q.cardinal q);
  Printf.fprintf stderr "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
POTTIER Francois's avatar
POTTIER Francois committed
837

838
let discover fact =
POTTIER Francois's avatar
POTTIER Francois committed
839
  incr extracted;
840
  if T.register fact then begin
841
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
842
      done_with_level();
843 844
      level := W.length fact.word;
    end;
POTTIER Francois's avatar
POTTIER Francois committed
845
    incr considered;
846
    consequences fact
847
  end
848

POTTIER Francois's avatar
POTTIER Francois committed
849
let () =
850
  Trie.verbose();
851
  Q.repeat q discover;
852
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
853
  done_with_level()
854

855 856
(* ------------------------------------------------------------------------ *)

857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

887 888 889 890 891 892 893
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
894
   module [E] above. *)
895

896
let forward () =
897

898
  let module A = Astar.Make(struct
899

900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
921 922
      )

923
    let successors (s, z) edge =
924
      assert (regular z);
925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

948
  Printf.fprintf stderr "Forward search:\n%!";
949
  let seen = ref Lr1.NodeSet.empty in
950
  let _, _ = A.search (fun ((s', z), path) ->
951 952 953 954 955 956 957 958 959 960 961 962 963 964
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
965 966 967
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
968 969

let () =
970 971
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
972 973 974 975
  let stat = Gc.quick_stat() in
  Printf.fprintf stderr
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);
976
  ignore f
977 978

(* TODO:
POTTIER Francois's avatar
POTTIER Francois committed
979
  can we store fewer facts when we hit a default reduction?
980
  remove CompletedNatWitness?, revert Fix
981
  collect performance data, correlated with star size and alphabet size; draw a graph
POTTIER Francois's avatar
POTTIER Francois committed
982
  count the unreachable states and see if they are numerous in practice
POTTIER Francois's avatar
POTTIER Francois committed
983 984
  optionally report several ways of reaching an error in state s
    (with different lookahead tokens) (report all of them?)
985 986 987
  warn if --list-errors is set AND the grammar uses [error]
  remove $syntaxerror?
  how do we maintain the list of error messages when the grammar evolves?
988 989
  implement a naive semi-algorithm that enumerates all input sentences,
    and evaluate how well (or how badly) it scales
990
*)
POTTIER Francois's avatar
POTTIER Francois committed
991

992
end