LRijkstra.ml 28.6 KB
Newer Older
1
open Grammar
2
module W = Terminal.Word(struct end) (* TEMPORARY wrap side effect in functor *)
3

4
(* Throughout, we ignore the [error] pseudo-token completely. We consider that
5 6 7 8 9 10
   it never appears on the input stream. Thus, we disregard any reductions or
   transitions that take place when the lookahead symbol is [error]. As a
   result, any state whose incoming symbol is [error] is found unreachable. *)

let regular z =
  not (Terminal.equal z Terminal.error)
11

12 13
(* ------------------------------------------------------------------------ *)

14 15 16 17 18
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
19 20 21
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
22

POTTIER Francois's avatar
POTTIER Francois committed
23
let reductions_on s z : Production.index list =
24
  assert (regular z);
25 26 27 28 29
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

30 31 32
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
33 34 35 36 37 38

let has_reduction s z : Production.index option =
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
39
      match reductions_on s z with
40 41 42 43 44 45
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

46 47 48 49 50 51 52 53 54
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
55 56
      TerminalMap.fold (fun z prods accu ->
        accu || regular z && List.mem prod prods
57 58
      ) (Lr1.reductions s) false

59 60
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
61

62
let causes_an_error s z : bool =
63
  assert (regular z);
64 65 66 67
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
68
      reductions_on s z = [] &&
69 70
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

71 72 73
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
   turn, except [error]. *)

74 75
let foreach_terminal f =
  Terminal.iter (fun t ->
76
    if regular t then
77 78 79
      f t
  )

80 81 82 83 84
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

85 86 87
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
88
      (* There is a default reduction. No symbol causes an error. *)
89 90
      foreach_terminal f
  | None ->
91 92 93
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
94
        if regular z then
95
          f z
96
      ) (Lr1.reductions s);
97 98
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
99 100
      SymbolMap.iter (fun sym _ ->
        match sym with
101
        | Symbol.T z ->
102
            if regular z then
103
              f z
104 105 106 107
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

108 109
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
110 111 112 113 114 115 116 117 118 119 120 121
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

122 123 124
module Trie : sig

  type trie
125 126 127 128

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
129
     edge.) If the star turns out to be trivial then [None] is returned. *)
130 131
  val star: Lr1.node -> trie option

132 133 134 135
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

136 137 138 139 140 141 142 143
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
144
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
145 146

  (* [current t] returns the current state of the (sub-)trie [t]. This is
147 148
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
149
  val current: trie -> Lr1.node
150

151 152 153 154 155
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
156

157 158 159 160 161 162 163 164
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
165

166 167
  (* A trie has the following structure. *)

168
  type trie = {
169 170 171
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
172
    identity: int;
173
    (* The root state of this star: "where we come from". *)
174
    source: Lr1.node;
175
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
176
    current: Lr1.node;
177 178 179
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
180
    productions: Production.index list;
181 182
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
183
  }
184

185
  (* This counter is used by [mktrie] to produce unique identities. *)
186 187
  let c = ref 0

188
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
189
  let mktrie source current productions transitions =
190
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
191
    { identity; source; current; productions; transitions }
192

193 194
  exception DeadBranch

195
  let rec insert w prod t =
196 197
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
198
        (* We check whether the current state [t.current] is able to reduce
199 200 201 202 203
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
204
        if can_reduce t.current prod then
205 206 207 208 209 210
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
211 212
    | (Symbol.T t) :: _ when not (regular t) ->
         raise DeadBranch
213
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
214
        (* Check if there is a transition labeled [a] out of [t.current]. If
215 216
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
217 218 219
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
220
        match SymbolMap.find a (Lr1.transitions t.current) with
221
        | successor ->
222 223 224 225 226 227 228 229
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
230
            let t' = insert w prod t' in
231 232
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
233
        | exception Not_found ->
234
            raise DeadBranch
235

236 237 238 239 240
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
241 242 243 244 245 246
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
247

248 249 250 251 252 253 254 255 256 257 258 259 260
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

261
  (* [nontrivial t] tests whether the trie [t] has any branches, i.e.,
262 263 264 265
     contains at least one sub-trie whose [productions] field is nonempty.
     Trivia: a trie of size greater than 1 is necessarily nontrivial, but the
     converse is not true: a nontrivial trie can have size 1. (This occurs
     when all productions have zero length.) *)
266 267 268
  let nontrivial t =
    not (t.productions = [] && SymbolMap.is_empty t.transitions)

269 270 271 272 273 274
  (* Redefine [star] to include a [nontrivial] test and to record the size
     of the newly built trie. *)

  let size =
    Array.make Lr1.n (-1)

275
  let star s =
276
    let initial = !c in
277
    let t = star s in
278 279
    let final = !c in
    size.(Lr1.number s) <- final - initial;
280 281 282 283 284
    if nontrivial t then
      Some t
    else
      None

285 286 287 288
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
289 290
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
291

292 293 294
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
295 296
  let current t =
    t.current
297

298 299 300
  let accepts prod t =
    List.mem prod t.productions

301
  let step a t =
302
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
303

304
  let verbose () =
305
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
306

307 308
end

POTTIER Francois's avatar
POTTIER Francois committed
309 310
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

(* The first symbol of the input, [first fact.word fact.lookahead], plays a
   special role. Indeed, for every position, for every first symbol, and for
   every lookahead symbol, we keep track of at most one fact. Thus, the total
   number of facts accumulated by the algorithm is at most [T.n^2], where [T]
   is the total size of the tries that we have constructed, and [n] is the
   number of terminal symbols. (This number can be quite large. [T] can be in
   the tens of thousands, and [n] can be over one hundred. These figures lead
   to a theoretical upper bound of 100M. In practice, for T=25K and n=108, we
   observe that the algorithm gathers about 7M facts.) *)

327
type fact = {
328
  position: Trie.trie;
329
  word: W.word;
330
  lookahead: Terminal.t
331 332
}

333
let source fact =
334
  Trie.source fact.position
335

POTTIER Francois's avatar
POTTIER Francois committed
336
let current fact =
337
  Trie.current fact.position
338

POTTIER Francois's avatar
POTTIER Francois committed
339
(* ------------------------------------------------------------------------ *)
340

341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* We never insert into the queue a fact that immediately causes an error,
   i.e., a fact such that [causes_an_error (current fact) fact.lookahead]
   holds. In practice, this convention allows reducing the number of facts
   that go through the queue by a factor of two. *)

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
  (* assert (not (causes_an_error (current fact) fact.lookahead)); *)
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
  Lr1.iter (fun s ->
    match Trie.star s with
    | Some trie ->
        foreach_terminal_not_causing_an_error s (fun z ->
          add {
            position = trie;
            word = W.epsilon;
            lookahead = z
          }
        )
    | None ->
        ()
  )

(* ------------------------------------------------------------------------ *)

392
module T : sig
393 394

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
395 396
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
397 398
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
399 400
  (* [query current z f] enumerates all known facts whose current state is
     [current] and whose lookahead assumption is [z]. *)
401
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
402

403
  val verbose: unit -> unit
404

405
end = struct
406

407
  (* This module implements a set of facts. Two facts are considered equal
408
     (for the purposes of this set) if they have the same [position], [a], and
409 410 411 412 413 414
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
POTTIER Francois's avatar
POTTIER Francois committed
415 416
     that match a pair [current, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [current] and [z]. At the second
417
     level, we find sets of facts. *)
418 419 420
(**)

  module M =
421
    MySet.Make(struct
422 423
      type t = fact
      let compare fact1 fact2 =
424
        let c = Trie.compare fact1.position fact2.position in
425
        if c <> 0 then c else
426 427
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
428 429
        Terminal.compare a1 a2
    end)
430

431 432 433
  let table = (* a pretty large table... *)
    Array.make (Lr1.n * Terminal.n) M.empty

POTTIER Francois's avatar
POTTIER Francois committed
434 435
  let index current z =
    Terminal.n * (Lr1.number current) + Terminal.t2i z
436

437 438
  let count = ref 0

439
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
440
    let current = current fact in
441
    let z = fact.lookahead in
POTTIER Francois's avatar
POTTIER Francois committed
442
    let i = index current z in
443 444 445 446 447 448 449 450 451 452 453 454
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
455 456
  let query current z f =
    let i = index current z in
457 458
    let m = table.(i) in
    M.iter f m
459

460
  let verbose () =
461 462
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

463 464
end

POTTIER Francois's avatar
POTTIER Francois committed
465 466
(* ------------------------------------------------------------------------ *)

467 468 469 470 471 472 473
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
474 475 476
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
477 478 479 480

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
481 482
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
483

484
  val verbose: unit -> unit
485

486 487
end = struct

488 489 490 491 492 493 494 495 496 497
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
498

499
  module H = Hashtbl
500

501
  let table = (* a pretty large table... *)
502 503 504 505
    Array.init (Lr1.n) (fun i ->
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
506 507 508

  let index s =
    Lr1.number s
509

510 511 512 513
  let pack nt a z : int =
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
514

515 516
  let count = ref 0

517
  let register s nt w z =
518
    let i = index s in
519
    let m = table.(i) in
520
    let a = W.first w z in
521 522 523 524
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
525
      incr count;
526
      H.add m key w;
527 528
      true
    end
529

530
  let query s nt a z f =
531
    let i = index s in
532
    let m = table.(i) in
533 534 535
    let key = pack nt a z in
    match H.find m key with
    | w -> f w
536
    | exception Not_found -> ()
537

538
  let verbose () =
539 540
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

541 542
end

POTTIER Francois's avatar
POTTIER Francois committed
543 544
(* ------------------------------------------------------------------------ *)

545
let new_edge s nt w z =
POTTIER Francois's avatar
POTTIER Francois committed
546 547 548 549
  (*
  Printf.fprintf stderr "Considering reduction on %s in state %d\n"
    (Terminal.print z) (Lr1.number s);
  *)
550
  if E.register s nt w z then
551
    let sym = Symbol.N nt in
552
    T.query s (W.first w z) (fun fact ->
553
      assert (Terminal.equal fact.lookahead (W.first w z));
554 555 556
      match Trie.step sym fact.position with
      | position ->
          if not (causes_an_error (Trie.current position) z) then
557
            add {
558
              position;
559 560 561 562 563
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
564
    )
565 566 567 568 569 570 571 572 573 574

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
POTTIER Francois's avatar
POTTIER Francois committed
575 576
   This is the case when the word from [fact.source] to [fact.current]
   represents a production of the grammar and [fact.current] is willing
577 578 579 580 581 582 583 584
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

POTTIER Francois's avatar
POTTIER Francois committed
585
  let current = current fact in
586

POTTIER Francois's avatar
POTTIER Francois committed
587
  (* 1. View [fact] as a vertex. Examine the transitions out of [current]. *)
588
  
589
  SymbolMap.iter (fun sym s' ->
590
    match Trie.step sym fact.position, sym with
591
    | exception Not_found -> ()
592
    | position, Symbol.T t ->
593 594 595
        (* [t] cannot be the [error] token, because the trie does not have
           any edges labeled [error]. *)
        assert (regular t);
596

POTTIER Francois's avatar
POTTIER Francois committed
597
        (* 1a. There is a transition labeled [t] out of [current]. If
598 599 600 601 602 603 604
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

        if Terminal.equal fact.lookahead t then
          let word = W.append fact.word (W.singleton t) in
605
          (* assert (Lr1.Node.compare position.Trie.current s' = 0); *)
606
          foreach_terminal_not_causing_an_error s' (fun z ->
607
            add { position; word; lookahead = z }
608
          )
609

610
    | position, Symbol.N nt ->
611

POTTIER Francois's avatar
POTTIER Francois committed
612
        (* 1b. There is a transition labeled [nt] out of [current]. We
613 614 615 616 617 618 619 620 621
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
622
          E.query current nt fact.lookahead z (fun w ->
623 624
            assert (Terminal.equal fact.lookahead (W.first w z));
            add {
625
              position;
626 627 628 629 630
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
631

POTTIER Francois's avatar
POTTIER Francois committed
632
  ) (Lr1.transitions current);
633 634

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
635 636
     [fact.source] to [current] represents a production [prod] and
     [current] is willing to reduce this production. We check that
637
     [fact.position] accepts [epsilon]. This guarantees that reducing [prod]
638 639 640 641
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
642 643
  (**)

POTTIER Francois's avatar
POTTIER Francois committed
644
  match has_reduction current fact.lookahead with
645
  | Some prod when Trie.accepts prod fact.position ->
646
      new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
647 648 649
  | _ ->
      ()

650
let level = ref 0
651

POTTIER Francois's avatar
POTTIER Francois committed
652 653 654
let extracted, considered =
  ref 0, ref 0

POTTIER Francois's avatar
POTTIER Francois committed
655 656
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
657
  W.verbose();
658 659
  T.verbose();
  E.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
660 661 662
  Printf.fprintf stderr "Q stores %d facts.\n" (Q.cardinal q);
  Printf.fprintf stderr "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
POTTIER Francois's avatar
POTTIER Francois committed
663

664
let discover fact =
POTTIER Francois's avatar
POTTIER Francois committed
665
  incr extracted;
666
  if T.register fact then begin
667
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
668
      done_with_level();
669 670
      level := W.length fact.word;
    end;
POTTIER Francois's avatar
POTTIER Francois committed
671
    incr considered;
672
    consequences fact
673
  end
674

POTTIER Francois's avatar
POTTIER Francois committed
675
let () =
676
  Trie.verbose();
677
  Q.repeat q discover;
678
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
679
  done_with_level()
680

681 682
(* ------------------------------------------------------------------------ *)

683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

713 714 715 716 717 718 719
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
720
   module [E] above. *)
721

722
let forward () =
723

724
  let module A = Astar.Make(struct
725

726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
747 748
      )

749
    let successors (s, z) edge =
750
      assert (regular z);
751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

774
  Printf.fprintf stderr "Forward search:\n%!";
775
  let seen = ref Lr1.NodeSet.empty in
776
  let _, _ = A.search (fun ((s', z), path) ->
777 778 779 780 781 782 783 784 785 786 787 788 789 790
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
791 792 793
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
794

795
(* TEMPORARY the code in this module should run only if --coverage is set *)
796 797

let () =
798 799
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
800 801 802 803
  let stat = Gc.quick_stat() in
  Printf.fprintf stderr
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);
804
  ignore f
805 806

(* TODO:
POTTIER Francois's avatar
POTTIER Francois committed
807
  can we store fewer facts when we hit a default reduction?
808 809
  subject to --coverage
  write to .coverage file
POTTIER Francois's avatar
POTTIER Francois committed
810
  remove Coverage, remove CompletedNatWitness?, revert Fix
811
  collect performance data, correlated with star size and alphabet size; draw a graph
POTTIER Francois's avatar
POTTIER Francois committed
812
  count the unreachable states and see if they are numerous in practice
813
*)
POTTIER Francois's avatar
POTTIER Francois committed
814 815 816 817 818 819 820 821 822 823 824

(* One could approach the problem just by exploring the (infinite) graph whose
   vertices are configurations of the LR automaton (i.e., stacks, or perhaps
   pairs of a stack and a lookahead symbol) and transitions are determined by
   feeding one symbol to the automaton. A small-step version of the reference
   interpreter would allow us to set this up easily. One could then run a
   breadth-first exploration of this graph and stop when desired, e.g., as
   soon as all automaton states have been reached. However, this process does
   not necessarily terminate, and could be very costly -- e.g. enumerating all
   sentences of length 10 when the alphabet has size 100 costs 10^20. Also,
   this approach cannot prove that a state is unreachable. *)