LRijkstra.ml 36.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
   on the actual shortest path to the error at [s, z]. Indeed, two
   difficulties arise:

   - Some states have a default reduction. Thus, they will not trigger
     an error, even though they should. The error is triggered in some
     other state, after reduction takes place.

   - If the grammar has conflicts, conflict resolution removes some
     (shift or reduce) actions, hence may suppress the shortest path. *)

25 26 27 28 29 30 31 32 33 34 35
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
36

37 38 39 40 41
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
   in a big functor without arguments. *)

42
module Run (X : sig val verbose: bool end) = struct
43

44
open Grammar
45 46 47

(* ------------------------------------------------------------------------ *)

48 49
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
50 51

let () =
52
  if Terminal.n > 256 then
53
    Error.error [] (Printf.sprintf
54
      "--list-errors supports at most 256 terminal symbols.\n\
55 56 57 58 59
       The grammar has %d terminal symbols." Terminal.n
    )

(* ------------------------------------------------------------------------ *)

60 61 62 63 64
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
65

66 67
(* ------------------------------------------------------------------------ *)

68 69
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
70

71
let non_error z =
72
  not (Terminal.equal z Terminal.error)
73

74 75 76 77
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

78 79 80 81 82
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
83
  Terminal.sharp
84

85 86
(* ------------------------------------------------------------------------ *)

87 88 89 90 91
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
92 93 94
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
95

POTTIER Francois's avatar
POTTIER Francois committed
96
let reductions_on s z : Production.index list =
97
  assert (Terminal.real z);
98 99 100 101 102
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

103 104 105
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
106 107

let has_reduction s z : Production.index option =
108
  assert (Terminal.real z);
109 110 111 112
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
113
      match reductions_on s z with
114 115 116 117 118 119
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

120 121 122 123 124 125 126 127 128
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
129
      TerminalMap.fold (fun z prods accu ->
130 131
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
132
        accu || non_error z && List.mem prod prods
133 134
      ) (Lr1.reductions s) false

135 136
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
137

138
let causes_an_error s z : bool =
139
  assert (Terminal.real z);
140 141 142 143
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
144
      reductions_on s z = [] &&
145 146
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

147
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
148
   turn, except [error] and [#]. *)
149

150 151
let foreach_terminal =
  Terminal.iter_real
152

153 154 155
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
POTTIER Francois's avatar
POTTIER Francois committed
156
   [causes_an_error]. This implementation is significantly more efficient. *)
157

158 159 160
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
161
      (* There is a default reduction. No symbol causes an error. *)
162 163
      foreach_terminal f
  | None ->
164 165 166
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
167 168 169
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
170
          f z
171
      ) (Lr1.reductions s);
172 173
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
174 175
      SymbolMap.iter (fun sym _ ->
        match sym with
176
        | Symbol.T z ->
177 178
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
179
              f z
180 181 182 183
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

184
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
POTTIER Francois's avatar
POTTIER Francois committed
185 186
   (or if it has no incoming symbol at all, i.e., it is an initial state). It
   is fragile if its incoming symbol is a non-terminal symbol. *)
187 188 189 190 191

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
POTTIER Francois's avatar
POTTIER Francois committed
192
    true
193
  | Some (Symbol.N _) ->
POTTIER Francois's avatar
POTTIER Francois committed
194
    false
195

196 197
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
198 199 200 201 202 203 204 205 206
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
POTTIER Francois's avatar
Typo.  
POTTIER Francois committed
207
   us where we come from, where we are, and which production(s) we are hoping
POTTIER Francois's avatar
POTTIER Francois committed
208 209
   to reduce in the future. *)

210 211 212
module Trie : sig

  type trie
213 214 215 216

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
217
     edge.) If the star turns out to be trivial then [None] is returned. *)
218 219
  val star: Lr1.node -> trie option

220 221 222 223
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

224 225 226 227 228 229 230 231
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
232
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
233 234

  (* [current t] returns the current state of the (sub-)trie [t]. This is
235 236
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
237
  val current: trie -> Lr1.node
238

239 240 241 242 243
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
244

245 246 247 248 249 250 251 252
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
253

254 255
  (* A trie has the following structure. *)

256
  type trie = {
257 258 259
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
260
    identity: int;
261
    (* The root state of this star: "where we come from". *)
262
    source: Lr1.node;
263
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
264
    current: Lr1.node;
265 266 267
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
268
    productions: Production.index list;
269 270
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
271
  }
272

273
  (* This counter is used by [mktrie] to produce unique identities. *)
274 275
  let c = ref 0

276
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
277
  let mktrie source current productions transitions =
278
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
279
    { identity; source; current; productions; transitions }
280

281 282
  exception DeadBranch

283
  let rec insert w prod t =
284 285
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
286
        (* We check whether the current state [t.current] is able to reduce
287 288 289 290 291
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
292
        if can_reduce t.current prod then
293 294 295 296 297 298
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
299
    | (Symbol.T t) :: _ when Terminal.equal t Terminal.error ->
300
         raise DeadBranch
301
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
302
        (* Check if there is a transition labeled [a] out of [t.current]. If
303 304
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
305 306 307
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
308
        match SymbolMap.find a (Lr1.transitions t.current) with
309
        | successor ->
310 311 312 313 314 315 316 317
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
318
            let t' = insert w prod t' in
319 320
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
321
        | exception Not_found ->
322
            raise DeadBranch
323

324 325 326 327 328
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
329 330 331 332 333 334
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
335

336 337 338 339
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

340 341 342
  (* The star at [s] is obtained by starting with a fresh empty trie and
     inserting into it every production [prod] whose left-hand side [nt]
     is the label of an outgoing edge at [s]. *)
343 344 345 346 347 348 349 350 351
  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

352 353 354 355 356 357 358
  (* A trie [t] is nontrivial if it has at least one branch, i.e., contains at
     least one sub-trie whose [productions] field is nonempty. Trivia: a trie
     of size greater than 1 is necessarily nontrivial, but the converse is not
     true: a nontrivial trie can have size 1. (This occurs if all productions
     have zero length.) *)
  let trivial t =
    t.productions = [] && SymbolMap.is_empty t.transitions
359

360 361
  (* Redefine [star] to include a [nontrivial] test and to record the size of
     the newly built trie. *)
362 363 364 365

  let size =
    Array.make Lr1.n (-1)

366
  let star s =
367
    let initial = !c in
368
    let t = star s in
369 370
    let final = !c in
    size.(Lr1.number s) <- final - initial;
371
    if trivial t then None else Some t
372

373 374 375 376
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
377
  let compare t1 t2 =
378
    Pervasives.compare t1.identity t2.identity
379

380 381 382
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
383 384
  let current t =
    t.current
385

386 387 388
  let accepts prod t =
    List.mem prod t.productions

389
  let step a t =
390
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
391

392
  let verbose () =
393
    Printf.eprintf "Cumulated star size: %d\n%!" !c
394

395 396
end

POTTIER Francois's avatar
POTTIER Francois committed
397 398
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
399 400 401 402 403 404
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

405 406
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
407

408
type fact = {
409
  position: Trie.trie;
410
  word: W.word;
411
  lookahead: Terminal.t (* may be [any] *)
412 413
}

414 415
(* Accessors. *)

416
let source fact =
417
  Trie.source fact.position
418

POTTIER Francois's avatar
POTTIER Francois committed
419
let current fact =
420
  Trie.current fact.position
421

422 423
(* Two invariants reduce the number of facts that we consider:

424 425 426
   1. If [fact.lookahead] is a real terminal symbol [z] (i.e., not [any]),
      then [z] does not cause an error in the current state [current fact].
      It would be useless to consider a fact that violates this property;
POTTIER Francois's avatar
POTTIER Francois committed
427 428 429
      this cannot possibly lead to a successful reduction. In practice,
      this refinement allows reducing the number of facts that go through
      the queue by a factor of two.
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
448 449
  assert (non_error z);
  assert (Terminal.real a);
450 451
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
452
(* ------------------------------------------------------------------------ *)
453

454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
POTTIER Francois's avatar
POTTIER Francois committed
477
  (* [fact.lookahead] can be [any], but cannot be [error] *)
478
  assert (non_error fact.lookahead);
479 480
  assert (invariant1 fact);
  assert (invariant2 fact);
481 482 483 484 485 486
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
POTTIER Francois's avatar
POTTIER Francois committed
487
  (* For every state [s]... *)
488
  Lr1.iter (fun s ->
POTTIER Francois's avatar
POTTIER Francois committed
489
    (* If the trie rooted at [s] is nontrivial...*)
490
    match Trie.star s with
POTTIER Francois's avatar
POTTIER Francois committed
491 492 493 494 495 496 497 498 499 500
    | None ->
        ()
    | Some position ->
        (* ...then insert an initial fact into the priority queue. *)
        (* In order to respect invariants 1 and 2, we must distinguish two
           cases. If [s] is solid, then we insert a single fact, whose
           lookahead assumption is [any]. Otherwise, we must insert one
           initial fact for every terminal symbol [z] that does not cause
           an error in state [s]. *)
        let word = W.epsilon in
501
        if is_solid s then
POTTIER Francois's avatar
POTTIER Francois committed
502
          add { position; word; lookahead = any }
503 504
        else
          foreach_terminal_not_causing_an_error s (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
505
            add { position; word; lookahead = z }
506
          )
507 508 509 510
  )

(* ------------------------------------------------------------------------ *)

511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528
(* The module [T] maintains a set of known facts. *)

(* Three aspects of a fact are of particular interest:
   - its position [position], given by [fact.position];
   - its first symbol [a], given by [W.first fact.word fact.lookahead];
   - its lookahead assumption [z], given by [fact.lookahead].

   For every triple of [position], [a], and [z], we store at most one fact,
   (whose word has minimal length). Indeed, we are not interested in keeping
   track of several words that produce the same effect. Only the shortest such
   word is of interest.
   
   Thus, the total number of facts accumulated by the algorithm is at most
   [T.n^2], where [T] is the total size of the tries that we have constructed,
   and [n] is the number of terminal symbols. (This number can be quite large.
   [T] can be in the tens of thousands, and [n] can be over one hundred. These
   figures lead to a theoretical upper bound of 100M. In practice, for T=25K
   and n=108, we observe that the algorithm gathers about 7M facts.) *)
529

530
module T : sig
531 532

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
533 534
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
535 536
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
537
  (* [query current z f] enumerates all known facts whose current state is
538 539
     [current] and whose lookahead assumption is compatible with [z]. The
     symbol [z] must a real terminal symbol, i.e., cannot be [any]. *)
540
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
541

542
  (* [verbose()] outputs debugging & performance information. *)
543
  val verbose: unit -> unit
544

545
end = struct
546

547 548 549 550 551 552 553 554 555 556 557
  (* We need to query the set of facts in two ways. In [register], we must test
     whether a proposed triple of [position], [a], [z] already appears in the
     set. In [query], we must find all facts that match a pair [current, z],
     where [current] is a state. (Note that [position] determines [current], but
     the converse is not true: a position contains more information besides the
     current state.)

     To address these needs, we use a two-level table. The first level is a
     matrix indexed by [current] and [z]. At the second level, we find sets of
     facts, where two facts are considered equal if they have the same triple of
     [position], [a], and [z]. In fact, we know at this level that all facts
558 559 560 561 562 563
     have the same [z] component, so only [position] and [a] are compared.

     Because our facts satisfy invariant 2, [z] is [any] if and only if the
     state [current fact] is solid. This means that we are wasting quite a
     lot of space in the matrix (for a solid state, the whole line is empty,
     except for the [any] column). *)
564

565
  (* The level-2 sets. *)
566 567

  module M =
568
    MySet.Make(struct
569 570
      type t = fact
      let compare fact1 fact2 =
571
        (* assert (fact1.lookahead = fact2.lookahead); *)
572
        let c = Trie.compare fact1.position fact2.position in
573
        if c <> 0 then c else
574 575 576
        let z = fact1.lookahead in
        let a1 = W.first fact1.word z
        and a2 = W.first fact2.word z in
577
        (* note: [a1] and [a2] can be [any] here *)
578 579
        Terminal.compare a1 a2
    end)
580

581 582 583
  (* The level-1 matrix. *)

  let table =
584
    Array.make (Lr1.n * Terminal.n) M.empty
585

POTTIER Francois's avatar
POTTIER Francois committed
586
  let index current z =
587
    Terminal.n * (Lr1.number current) + Terminal.t2i z
588

589 590
  let count = ref 0

591
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
592
    let current = current fact in
593
    let z = fact.lookahead in
POTTIER Francois's avatar
POTTIER Francois committed
594
    let i = index current z in
595 596 597 598 599 600 601 602 603 604 605 606
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
607
  let query current z f =
608 609 610 611 612
    assert (not (Terminal.equal z any));
    (* If the state [current] is solid then the facts that concern it are
       stored in the column [any], and all of them are compatible with [z].
       Otherwise, they are stored in all columns except [any], and only
       those stored in the column [z] are compatible with [z]. *)
613
    let i = index current (if is_solid current then any else z) in
614 615
    let m = table.(i) in
    M.iter f m
616

617
  let verbose () =
618
    Printf.eprintf "T stores %d facts.\n%!" !count
619

620 621
end

POTTIER Francois's avatar
POTTIER Francois committed
622 623
(* ------------------------------------------------------------------------ *)

624 625 626 627 628 629 630
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
631 632 633
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
634 635 636 637

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
638 639
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
640

641
  val verbose: unit -> unit
642

643 644
end = struct

645 646 647 648 649 650 651 652 653 654
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
655

656
  module H = Hashtbl
657

658
  let table = (* a pretty large table... *)
659 660 661 662
    Array.init (Lr1.n) (fun i ->
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
663 664 665

  let index s =
    Lr1.number s
666

667 668 669 670
  let pack nt a z : int =
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
671

672 673
  let count = ref 0

674
  let register s nt w z =
675
    assert (Terminal.real z);
676
    let i = index s in
677
    let m = table.(i) in
678
    let a = W.first w z in
679
    assert (not (causes_an_error s a));
680 681 682 683
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
684
      incr count;
685
      H.add m key w;
686 687
      true
    end
688

689
  let rec query s nt a z f =
690
    assert (Terminal.real z);
691
    (* [a] can be [any] *)
692
    if not (Terminal.equal a any) then begin
693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
    end
    else begin
      (* If [a] is [any], we query the table for every concrete [a].
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
        (* TEMPORARY try a scheme that allows a more efficient iteration? *)
    end
710

711
  let verbose () =
712
    Printf.eprintf "E stores %d facts.\n%!" !count
713

714 715
end

POTTIER Francois's avatar
POTTIER Francois committed
716 717
(* ------------------------------------------------------------------------ *)

718
let new_edge s nt w z =
719
  assert (Terminal.real z);
720
  if E.register s nt w z then
721
    let sym = Symbol.N nt in
722
    T.query s (W.first w z) (fun fact ->
723
      assert (compatible fact.lookahead (W.first w z));
724 725
      match Trie.step sym fact.position with
      | position ->
726
          assert (not (is_solid (Trie.current position)));
727
          if not (causes_an_error (Trie.current position) z) then
728
            add {
729
              position;
730 731 732 733 734
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
735
    )
736 737 738 739 740 741 742 743 744 745

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
POTTIER Francois's avatar
POTTIER Francois committed
746 747
   This is the case when the word from [fact.source] to [fact.current]
   represents a production of the grammar and [fact.current] is willing
748 749 750 751 752 753 754 755
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

POTTIER Francois's avatar
POTTIER Francois committed
756
  let current = current fact in
757

POTTIER Francois's avatar
POTTIER Francois committed
758
  (* 1. View [fact] as a vertex. Examine the transitions out of [current]. *)
759
  
760
  SymbolMap.iter (fun sym s' ->
761
    match Trie.step sym fact.position, sym with
762
    | exception Not_found -> ()
763
    | position, Symbol.T t ->
764 765
        (* [t] cannot be the [error] token, because the trie does not have
           any edges labeled [error]. *)
766
        assert (non_error t);
767

POTTIER Francois's avatar
POTTIER Francois committed
768
        (* 1a. There is a transition labeled [t] out of [current]. If
769 770 771 772 773
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

774
        if compatible fact.lookahead t then begin
775
          let word = W.append fact.word (W.singleton t) in
776 777 778 779 780 781 782 783
          (* assert (Lr1.Node.compare (Trie.current position) s' = 0); *)
          (* [s'] has a terminal incoming symbol. It is always entered
             without consideration for the next lookahead symbol. Thus,
             we use [any] as the lookahead assumption in the new fact
             that we produce. *)
          assert (is_solid (Trie.current position));
          add { position; word; lookahead = any }
        end
784

785
    | position, Symbol.N nt ->
786

POTTIER Francois's avatar
POTTIER Francois committed
787
        (* 1b. There is a transition labeled [nt] out of [current]. We
788 789 790 791 792 793
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
794 795
        (* TEMPORARY it could be that the answer does not depend on [z]...
           (default reduction) *)
796 797 798
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
799
          E.query current nt fact.lookahead z (fun w ->
800 801
            assert (compatible fact.lookahead (W.first w z));
            assert (not (is_solid (Trie.current position)));
802
            add {
803
              position;
804 805 806 807 808
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
809

POTTIER Francois's avatar
POTTIER Francois committed
810
  ) (Lr1.transitions current);
811 812

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
813 814
     [fact.source] to [current] represents a production [prod] and
     [current] is willing to reduce this production. We check that
815
     [fact.position] accepts [epsilon]. This guarantees that reducing [prod]
816 817 818 819
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
820 821
  (**)

822
  if not (Terminal.equal fact.lookahead any) then begin
823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839
    match has_reduction current fact.lookahead with
    | Some prod when Trie.accepts prod fact.position ->
        new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
    | _ ->
        ()
  end
  else begin
    (* Every reduction must be considered. *)
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
          (* TEMPORARY for now, avoid sending [any] into [new_edge] *)
          foreach_terminal (fun z ->
            new_edge (source fact) (Production.nt prod) (fact.word) z
          )
    | None ->
       TerminalMap.iter (fun z prods ->
840
         if non_error z then
841 842 843 844 845
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
             new_edge (source fact) (Production.nt prod) (fact.word) z
       ) (Lr1.reductions current)
  end
846

847
let level = ref 0
848

POTTIER Francois's avatar
POTTIER Francois committed
849 850 851
let extracted, considered =
  ref 0, ref 0

POTTIER Francois's avatar
POTTIER Francois committed
852
let done_with_level () =
853 854 855 856 857 858 859 860
  Printf.eprintf "Done with level %d.\n" !level;
  if X.verbose then begin
    W.verbose();
    T.verbose();
    E.verbose()
  end;
  Printf.eprintf "Q stores %d facts.\n" (Q.cardinal q);
  Printf.eprintf "%d facts extracted out of Q, of which %d considered.\n%!"
POTTIER Francois's avatar
POTTIER Francois committed
861
    !extracted !considered
POTTIER Francois's avatar
POTTIER Francois committed
862

863
let discover fact =
POTTIER Francois's avatar
POTTIER Francois committed
864
  incr extracted;
865
  if T.register fact then begin
866
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
867
      done_with_level();
868 869
      level := W.length fact.word;
    end;
POTTIER Francois's avatar
POTTIER Francois committed
870
    incr considered;
871
    consequences fact
872
  end
873

POTTIER Francois's avatar
POTTIER Francois committed
874
let () =
875 876
  if X.verbose then
    Trie.verbose();
877
  Q.repeat q discover;
878
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
879
  done_with_level()
880

881 882
(* ------------------------------------------------------------------------ *)

883 884 885 886 887
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
888
  Printf.eprintf "coverage: internal error: %s.\n%!" msg;
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

913 914 915 916 917 918 919
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
920
   module [E] above. *)
921

922
let forward () =
923

924
  let module A = Astar.Make(struct
925

926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
947 948
      )

949
    let successors (s, z) edge =
950
      assert (Terminal.real z);
951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

974
  Printf.eprintf "Forward search:\n%!";
975
  let seen = ref Lr1.NodeSet.empty in
976
  let _, _ = A.search (fun ((s', z), path) ->
977 978 979 980 981 982
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
983
      Printf.eprintf
984 985 986
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
987
      Printf.eprintf "%s\n%!" (W.print w);
988 989 990
      assert (validate s s' w)
    end
  ) in
991
  Printf.eprintf "Reachable (forward): %d states\n%!"
992 993
    (Lr1.NodeSet.cardinal !seen);
  !seen
994 995

let () =
996 997
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
998
  let stat = Gc.quick_stat() in
999
  Printf.eprintf
POTTIER Francois's avatar
POTTIER Francois committed
1000 1001
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);
1002
  ignore f
1003 1004

(* TODO:
POTTIER Francois's avatar
POTTIER Francois committed
1005
  can we store fewer facts when we hit a default reduction?
1006
  remove CompletedNatWitness?, revert Fix
1007
  collect performance data, correlated with star size and alphabet size; draw a graph
POTTIER Francois's avatar
POTTIER Francois committed
1008
  count the unreachable states and see if they are numerous in practice
POTTIER Francois's avatar
POTTIER Francois committed
1009 1010
  optionally report several ways of reaching an error in state s
    (with different lookahead tokens) (report all of them?)
1011
  warn if --list-errors is set AND the grammar uses [error]
1012
  control verbose output
POTTIER Francois's avatar
POTTIER Francois committed
1013
  measure the cost of assertions
1014 1015
  remove $syntaxerror?
  how do we maintain the list of error messages when the grammar evolves?
1016 1017
  implement a naive semi-algorithm that enumerates all input sentences,
    and evaluate how well (or how badly) it scales
1018
*)
POTTIER Francois's avatar
POTTIER Francois committed
1019

1020
end