LRijkstra.ml 32.9 KB
Newer Older
1
open Grammar
2
module W = Terminal.Word(struct end) (* TEMPORARY wrap side effect in functor *)
3

4 5
(* ------------------------------------------------------------------------ *)

6
(* Throughout, we ignore the [error] pseudo-token completely. We consider that
7 8 9 10 11 12
   it never appears on the input stream. Thus, we disregard any reductions or
   transitions that take place when the lookahead symbol is [error]. As a
   result, any state whose incoming symbol is [error] is found unreachable. *)

let regular z =
  not (Terminal.equal z Terminal.error)
13

14 15 16 17 18 19 20 21
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

let any : Terminal.t =
  assert (Terminal.n < 255); (* TEMPORARY *)
  Obj.magic Terminal.n       (* TEMPORARY *)

22 23
(* ------------------------------------------------------------------------ *)

24 25 26 27 28
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
29 30 31
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
32

POTTIER Francois's avatar
POTTIER Francois committed
33
let reductions_on s z : Production.index list =
34
  assert (regular z && z <> any);
35 36 37 38 39
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

40 41 42
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
43 44

let has_reduction s z : Production.index option =
45
  assert (z <> any);
46 47 48 49
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
50
      match reductions_on s z with
51 52 53 54 55 56
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

57 58 59 60 61 62 63 64 65
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
66 67
      TerminalMap.fold (fun z prods accu ->
        accu || regular z && List.mem prod prods
68 69
      ) (Lr1.reductions s) false

70 71
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
72

73
let causes_an_error s z : bool =
74
  assert (regular z && z <> any);
75 76 77 78
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
79
      reductions_on s z = [] &&
80 81
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

82 83 84
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
   turn, except [error]. *)

85 86
let foreach_terminal f =
  Terminal.iter (fun t ->
87
    if regular t then
88 89 90
      f t
  )

91 92 93 94 95
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

96 97 98
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
99
      (* There is a default reduction. No symbol causes an error. *)
100 101
      foreach_terminal f
  | None ->
102 103 104
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
105
        if regular z then
106
          f z
107
      ) (Lr1.reductions s);
108 109
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
110 111
      SymbolMap.iter (fun sym _ ->
        match sym with
112
        | Symbol.T z ->
113
            if regular z then
114
              f z
115 116 117 118
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

119 120 121 122 123 124 125 126 127 128 129 130 131
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
   (or if it has no incoming symbol at all, i.e., it is an initial state). A
   contrario, a state is fragile if its incoming symbol is a non-terminal
   symbol. *)

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
      true
  | Some (Symbol.N _) ->
      false

132 133
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
134 135 136 137 138 139 140 141 142 143 144 145
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

146 147 148
module Trie : sig

  type trie
149 150 151 152

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
153
     edge.) If the star turns out to be trivial then [None] is returned. *)
154 155
  val star: Lr1.node -> trie option

156 157 158 159
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

160 161 162 163 164 165 166 167
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
168
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
169 170

  (* [current t] returns the current state of the (sub-)trie [t]. This is
171 172
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
173
  val current: trie -> Lr1.node
174

175 176 177 178 179
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
180

181 182 183 184 185 186 187 188
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
189

190 191
  (* A trie has the following structure. *)

192
  type trie = {
193 194 195
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
196
    identity: int;
197
    (* The root state of this star: "where we come from". *)
198
    source: Lr1.node;
199
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
200
    current: Lr1.node;
201 202 203
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
204
    productions: Production.index list;
205 206
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
207
  }
208

209
  (* This counter is used by [mktrie] to produce unique identities. *)
210 211
  let c = ref 0

212
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
213
  let mktrie source current productions transitions =
214
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
215
    { identity; source; current; productions; transitions }
216

217 218
  exception DeadBranch

219
  let rec insert w prod t =
220 221
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
222
        (* We check whether the current state [t.current] is able to reduce
223 224 225 226 227
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
228
        if can_reduce t.current prod then
229 230 231 232 233 234
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
235 236
    | (Symbol.T t) :: _ when not (regular t) ->
         raise DeadBranch
237
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
238
        (* Check if there is a transition labeled [a] out of [t.current]. If
239 240
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
241 242 243
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
244
        match SymbolMap.find a (Lr1.transitions t.current) with
245
        | successor ->
246 247 248 249 250 251 252 253
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
254
            let t' = insert w prod t' in
255 256
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
257
        | exception Not_found ->
258
            raise DeadBranch
259

260 261 262 263 264
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
265 266 267 268 269 270
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
271

272 273 274 275 276 277 278 279 280 281 282 283 284
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

285
  (* [nontrivial t] tests whether the trie [t] has any branches, i.e.,
286 287 288 289
     contains at least one sub-trie whose [productions] field is nonempty.
     Trivia: a trie of size greater than 1 is necessarily nontrivial, but the
     converse is not true: a nontrivial trie can have size 1. (This occurs
     when all productions have zero length.) *)
290 291 292
  let nontrivial t =
    not (t.productions = [] && SymbolMap.is_empty t.transitions)

293 294 295 296 297 298
  (* Redefine [star] to include a [nontrivial] test and to record the size
     of the newly built trie. *)

  let size =
    Array.make Lr1.n (-1)

299
  let star s =
300
    let initial = !c in
301
    let t = star s in
302 303
    let final = !c in
    size.(Lr1.number s) <- final - initial;
304 305 306 307 308
    if nontrivial t then
      Some t
    else
      None

309 310 311 312
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
313 314
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
315

316 317 318
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
319 320
  let current t =
    t.current
321

322 323 324
  let accepts prod t =
    List.mem prod t.productions

325
  let step a t =
326
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
327

328
  let verbose () =
329
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
330

331 332
end

POTTIER Francois's avatar
POTTIER Francois committed
333 334
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
335 336 337 338 339 340
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

341 342
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
343

344
type fact = {
345
  position: Trie.trie;
346
  word: W.word;
347
  lookahead: Terminal.t (* may be [any] *)
348 349
}

350 351
(* Accessors. *)

352
let source fact =
353
  Trie.source fact.position
354

POTTIER Francois's avatar
POTTIER Francois committed
355
let current fact =
356
  Trie.current fact.position
357

358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384
(* Two invariants reduce the number of facts that we consider:

   1. If [fact.lookahead] is a terminal symbol [z] (i.e., not [any]), then
      [z] does not cause an error in the current state [current fact]. It
      would be useless to consider a fact that violates this property; it
      cannot possibly lead to a successful reduction.

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
  assert (a <> any);
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
385
(* ------------------------------------------------------------------------ *)
386

387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* We never insert into the queue a fact that immediately causes an error,
   i.e., a fact such that [causes_an_error (current fact) fact.lookahead]
   holds. In practice, this convention allows reducing the number of facts
   that go through the queue by a factor of two. *)

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
415 416
  assert (invariant1 fact);
  assert (invariant2 fact);
417 418 419 420 421 422 423 424 425
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
  Lr1.iter (fun s ->
    match Trie.star s with
    | Some trie ->
426 427
        (* TEMPORARY weird *)
        if is_solid s then
428 429 430
          add {
            position = trie;
            word = W.epsilon;
431
            lookahead = any
432
          }
433 434 435 436 437 438 439 440
        else
          foreach_terminal_not_causing_an_error s (fun z ->
            add {
              position = trie;
              word = W.epsilon;
              lookahead = z
            }
          )
441 442 443 444 445 446
    | None ->
        ()
  )

(* ------------------------------------------------------------------------ *)

447 448 449 450 451 452 453 454 455 456
(* The first symbol of the input, [W.first fact.word fact.lookahead], plays a
   special role. Indeed, for every position, for every first symbol, and for
   every lookahead symbol, we keep track of at most one fact. Thus, the total
   number of facts accumulated by the algorithm is at most [T.n^2], where [T]
   is the total size of the tries that we have constructed, and [n] is the
   number of terminal symbols. (This number can be quite large. [T] can be in
   the tens of thousands, and [n] can be over one hundred. These figures lead
   to a theoretical upper bound of 100M. In practice, for T=25K and n=108, we
   observe that the algorithm gathers about 7M facts.) *)

457
module T : sig
458 459

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
460 461
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
462 463
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
464 465
  (* [query current z f] enumerates all known facts whose current state is
     [current] and whose lookahead assumption is [z]. *)
466
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
467

468
  val verbose: unit -> unit
469

470
end = struct
471

472
  (* This module implements a set of facts. Two facts are considered equal
473
     (for the purposes of this set) if they have the same [position], [a], and
474 475 476 477 478 479
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
POTTIER Francois's avatar
POTTIER Francois committed
480 481
     that match a pair [current, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [current] and [z]. At the second
482
     level, we find sets of facts. *)
483 484 485
(**)

  module M =
486
    MySet.Make(struct
487 488
      type t = fact
      let compare fact1 fact2 =
489
        let c = Trie.compare fact1.position fact2.position in
490
        if c <> 0 then c else
491 492
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
493
        (* note: [a1] and [a2] can be [any] here *)
494 495
        Terminal.compare a1 a2
    end)
496

497
  let table = (* a pretty large table... *)
498 499
    Array.make (Lr1.n * (Terminal.n + 1)) M.empty (* room for [any] *)
  (* TEMPORARY this space is wasted for solid states *)
500

POTTIER Francois's avatar
POTTIER Francois committed
501
  let index current z =
502
    (Terminal.n + 1) * (Lr1.number current) + Terminal.t2i z
503

504 505
  let count = ref 0

506
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
507
    let current = current fact in
508
    let z = fact.lookahead in
509 510
    (* [z] is [any] iff [current] is solid. *)
    assert ((z = any) = is_solid current);
POTTIER Francois's avatar
POTTIER Francois committed
511
    let i = index current z in
512 513 514 515 516 517 518 519 520 521 522 523
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
524
  let query current z f =
525 526 527 528
    assert (z <> any);
    (* if [current] is solid then the facts that concern it are stored
       under any [any], not under [z] *)
    let i = index current (if is_solid current then any else z) in
529 530
    let m = table.(i) in
    M.iter f m
531

532
  let verbose () =
533 534
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

535 536
end

POTTIER Francois's avatar
POTTIER Francois committed
537 538
(* ------------------------------------------------------------------------ *)

539 540 541 542 543 544 545
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
546 547 548
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
549 550 551 552

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
553 554
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
555

556
  val verbose: unit -> unit
557

558 559
end = struct

560 561 562 563 564 565 566 567 568 569
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
570

571
  module H = Hashtbl
572

573
  let table = (* a pretty large table... *)
574 575 576 577
    Array.init (Lr1.n) (fun i ->
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
578 579 580

  let index s =
    Lr1.number s
581

582 583 584 585
  let pack nt a z : int =
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
586

587 588
  let count = ref 0

589
  let register s nt w z =
590
    assert (regular z && z <> any);
591
    let i = index s in
592
    let m = table.(i) in
593
    let a = W.first w z in
594
    assert (not (causes_an_error s a));
595 596 597 598
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
599
      incr count;
600
      H.add m key w;
601 602
      true
    end
603

604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624
  let rec query s nt a z f =
    assert (regular z && z <> any);
    (* [a] can be [any] *)
    if a <> any then begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
    end
    else begin
      (* If [a] is [any], we query the table for every concrete [a].
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
        (* TEMPORARY try a scheme that allows a more efficient iteration? *)
    end
625

626
  let verbose () =
627 628
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

629 630
end

POTTIER Francois's avatar
POTTIER Francois committed
631 632
(* ------------------------------------------------------------------------ *)

633
let new_edge s nt w z =
634
  assert (regular z && z <> any);
635
  if E.register s nt w z then
636
    let sym = Symbol.N nt in
637
    T.query s (W.first w z) (fun fact ->
638
      assert (compatible fact.lookahead (W.first w z));
639 640
      match Trie.step sym fact.position with
      | position ->
641
          assert (not (is_solid (Trie.current position)));
642
          if not (causes_an_error (Trie.current position) z) then
643
            add {
644
              position;
645 646 647 648 649
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
650
    )
651 652 653 654 655 656 657 658 659 660

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
POTTIER Francois's avatar
POTTIER Francois committed
661 662
   This is the case when the word from [fact.source] to [fact.current]
   represents a production of the grammar and [fact.current] is willing
663 664 665 666 667 668 669 670
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

POTTIER Francois's avatar
POTTIER Francois committed
671
  let current = current fact in
672

POTTIER Francois's avatar
POTTIER Francois committed
673
  (* 1. View [fact] as a vertex. Examine the transitions out of [current]. *)
674
  
675
  SymbolMap.iter (fun sym s' ->
676
    match Trie.step sym fact.position, sym with
677
    | exception Not_found -> ()
678
    | position, Symbol.T t ->
679 680 681
        (* [t] cannot be the [error] token, because the trie does not have
           any edges labeled [error]. *)
        assert (regular t);
682

POTTIER Francois's avatar
POTTIER Francois committed
683
        (* 1a. There is a transition labeled [t] out of [current]. If
684 685 686 687 688
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

689
        if compatible fact.lookahead t then begin
690
          let word = W.append fact.word (W.singleton t) in
691 692 693 694 695 696 697 698
          (* assert (Lr1.Node.compare (Trie.current position) s' = 0); *)
          (* [s'] has a terminal incoming symbol. It is always entered
             without consideration for the next lookahead symbol. Thus,
             we use [any] as the lookahead assumption in the new fact
             that we produce. *)
          assert (is_solid (Trie.current position));
          add { position; word; lookahead = any }
        end
699

700
    | position, Symbol.N nt ->
701

POTTIER Francois's avatar
POTTIER Francois committed
702
        (* 1b. There is a transition labeled [nt] out of [current]. We
703 704 705 706 707 708
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
709 710
        (* TEMPORARY it could be that the answer does not depend on [z]...
           (default reduction) *)
711 712 713
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
714
          E.query current nt fact.lookahead z (fun w ->
715 716
            assert (compatible fact.lookahead (W.first w z));
            assert (not (is_solid (Trie.current position)));
717
            add {
718
              position;
719 720 721 722 723
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
724

POTTIER Francois's avatar
POTTIER Francois committed
725
  ) (Lr1.transitions current);
726 727

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
728 729
     [fact.source] to [current] represents a production [prod] and
     [current] is willing to reduce this production. We check that
730
     [fact.position] accepts [epsilon]. This guarantees that reducing [prod]
731 732 733 734
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
735 736
  (**)

737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760
  if fact.lookahead <> any then begin
    match has_reduction current fact.lookahead with
    | Some prod when Trie.accepts prod fact.position ->
        new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
    | _ ->
        ()
  end
  else begin
    (* Every reduction must be considered. *)
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
          (* TEMPORARY for now, avoid sending [any] into [new_edge] *)
          foreach_terminal (fun z ->
            new_edge (source fact) (Production.nt prod) (fact.word) z
          )
    | None ->
       TerminalMap.iter (fun z prods ->
         if regular z then
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
             new_edge (source fact) (Production.nt prod) (fact.word) z
       ) (Lr1.reductions current)
  end
761

762
let level = ref 0
763

POTTIER Francois's avatar
POTTIER Francois committed
764 765 766
let extracted, considered =
  ref 0, ref 0

POTTIER Francois's avatar
POTTIER Francois committed
767 768
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
769
  W.verbose();
770 771
  T.verbose();
  E.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
772 773 774
  Printf.fprintf stderr "Q stores %d facts.\n" (Q.cardinal q);
  Printf.fprintf stderr "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
POTTIER Francois's avatar
POTTIER Francois committed
775

776
let discover fact =
POTTIER Francois's avatar
POTTIER Francois committed
777
  incr extracted;
778
  if T.register fact then begin
779
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
780
      done_with_level();
781 782
      level := W.length fact.word;
    end;
POTTIER Francois's avatar
POTTIER Francois committed
783
    incr considered;
784
    consequences fact
785
  end
786

POTTIER Francois's avatar
POTTIER Francois committed
787
let () =
788
  Trie.verbose();
789
  Q.repeat q discover;
790
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
791
  done_with_level()
792

793 794
(* ------------------------------------------------------------------------ *)

795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

825 826 827 828 829 830 831
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
832
   module [E] above. *)
833

834
let forward () =
835

836
  let module A = Astar.Make(struct
837

838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
859 860
      )

861
    let successors (s, z) edge =
862
      assert (regular z);
863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

886
  Printf.fprintf stderr "Forward search:\n%!";
887
  let seen = ref Lr1.NodeSet.empty in
888
  let _, _ = A.search (fun ((s', z), path) ->
889 890 891 892 893 894 895 896 897 898 899 900 901 902
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
903 904 905
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
906

907
(* TEMPORARY the code in this module should run only if --coverage is set *)
908 909

let () =
910 911
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
912 913 914 915
  let stat = Gc.quick_stat() in
  Printf.fprintf stderr
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);
916
  ignore f
917 918

(* TODO:
POTTIER Francois's avatar
POTTIER Francois committed
919
  can we store fewer facts when we hit a default reduction?
920 921
  subject to --coverage
  write to .coverage file
POTTIER Francois's avatar
POTTIER Francois committed
922
  remove Coverage, remove CompletedNatWitness?, revert Fix
923
  collect performance data, correlated with star size and alphabet size; draw a graph
POTTIER Francois's avatar
POTTIER Francois committed
924
  count the unreachable states and see if they are numerous in practice
POTTIER Francois's avatar
POTTIER Francois committed
925 926
  optionally report several ways of reaching an error in state s
    (with different lookahead tokens) (report all of them?)
927
*)
POTTIER Francois's avatar
POTTIER Francois committed
928 929 930 931 932 933 934 935 936 937 938

(* One could approach the problem just by exploring the (infinite) graph whose
   vertices are configurations of the LR automaton (i.e., stacks, or perhaps
   pairs of a stack and a lookahead symbol) and transitions are determined by
   feeding one symbol to the automaton. A small-step version of the reference
   interpreter would allow us to set this up easily. One could then run a
   breadth-first exploration of this graph and stop when desired, e.g., as
   soon as all automaton states have been reached. However, this process does
   not necessarily terminate, and could be very costly -- e.g. enumerating all
   sentences of length 10 when the alphabet has size 100 costs 10^20. Also,
   this approach cannot prove that a state is unreachable. *)