LRijkstra.ml 40.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
   on the actual shortest path to the error at [s, z]. Indeed, two
   difficulties arise:

   - Some states have a default reduction. Thus, they will not trigger
     an error, even though they should. The error is triggered in some
     other state, after reduction takes place.

   - If the grammar has conflicts, conflict resolution removes some
     (shift or reduce) actions, hence may suppress the shortest path. *)

25
26
27
28
29
30
31
32
33
34
35
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
36

37
38
39
(* NOTE: THIS FILE IS COMPILED WITH -noassert BY DEFAULT. If you would like
   the assertions to be tested at runtime, change that in the file _tags. *)

40
41
42
43
44
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
   in a big functor without arguments. *)

45
module Run (X : sig val verbose: bool end) = struct
46

47
open Grammar
48
49
50

(* ------------------------------------------------------------------------ *)

51
52
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
53
54

let () =
55
  if Terminal.n > 256 then
56
    Error.error [] (Printf.sprintf
57
      "--list-errors supports at most 256 terminal symbols.\n\
58
59
60
61
62
       The grammar has %d terminal symbols." Terminal.n
    )

(* ------------------------------------------------------------------------ *)

63
64
65
66
67
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
68

69
70
(* ------------------------------------------------------------------------ *)

71
72
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
73

74
let non_error z =
75
  not (Terminal.equal z Terminal.error)
76

77
78
79
80
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

81
82
83
84
85
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
86
  Terminal.sharp
87

88
89
(* ------------------------------------------------------------------------ *)

90
91
92
93
94
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
95
96
97
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
98

POTTIER Francois's avatar
POTTIER Francois committed
99
let reductions_on s z : Production.index list =
100
  assert (Terminal.real z);
101
102
103
104
105
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

106
107
108
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
109
110

let has_reduction s z : Production.index option =
111
  assert (Terminal.real z);
112
113
114
115
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
116
      match reductions_on s z with
117
118
119
120
121
122
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

123
124
125
126
127
128
129
130
131
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
132
      TerminalMap.fold (fun z prods accu ->
133
134
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
135
        accu || non_error z && List.mem prod prods
136
137
      ) (Lr1.reductions s) false

138
139
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
140

141
let causes_an_error s z : bool =
142
  assert (Terminal.real z);
143
144
145
146
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
147
      reductions_on s z = [] &&
148
149
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

150
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
151
   turn, except [error] and [#]. *)
152

153
154
let foreach_terminal =
  Terminal.iter_real
155

156
157
158
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
POTTIER Francois's avatar
POTTIER Francois committed
159
   [causes_an_error]. This implementation is significantly more efficient. *)
160

161
162
163
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
164
      (* There is a default reduction. No symbol causes an error. *)
165
166
      foreach_terminal f
  | None ->
167
168
169
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
170
171
172
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
173
          f z
174
      ) (Lr1.reductions s);
175
176
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
177
178
      SymbolMap.iter (fun sym _ ->
        match sym with
179
        | Symbol.T z ->
180
181
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
182
              f z
183
184
185
186
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

187
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
POTTIER Francois's avatar
POTTIER Francois committed
188
189
   (or if it has no incoming symbol at all, i.e., it is an initial state). It
   is fragile if its incoming symbol is a non-terminal symbol. *)
190
191
192
193
194

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
POTTIER Francois's avatar
POTTIER Francois committed
195
    true
196
  | Some (Symbol.N _) ->
POTTIER Francois's avatar
POTTIER Francois committed
197
    false
198

199
200
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
201
202
203
204
205
206
207
208
209
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
POTTIER Francois's avatar
Typo.    
POTTIER Francois committed
210
   us where we come from, where we are, and which production(s) we are hoping
POTTIER Francois's avatar
POTTIER Francois committed
211
212
   to reduce in the future. *)

213
214
215
module Trie : sig

  type trie
216
217
218
219

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
220
     edge.) If the star turns out to be trivial then [None] is returned. *)
221
222
  val star: Lr1.node -> trie option

223
224
225
226
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

227
228
229
230
231
232
233
234
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
235
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
236
237

  (* [current t] returns the current state of the (sub-)trie [t]. This is
238
239
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
240
  val current: trie -> Lr1.node
241

242
243
244
245
246
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
247

248
249
250
251
252
253
254
255
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
256

257
258
  (* A trie has the following structure. *)

259
  type trie = {
260
261
262
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
263
    identity: int;
264
    (* The root state of this star: "where we come from". *)
265
    source: Lr1.node;
266
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
267
    current: Lr1.node;
268
269
270
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
271
    productions: Production.index list;
272
273
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
274
  }
275

276
  (* This counter is used by [mktrie] to produce unique identities. *)
277
278
  let c = ref 0

279
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
280
  let mktrie source current productions transitions =
281
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
282
    { identity; source; current; productions; transitions }
283

284
285
  exception DeadBranch

286
  let rec insert w prod t =
287
288
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
289
        (* We check whether the current state [t.current] is able to reduce
290
291
292
293
294
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
295
        if can_reduce t.current prod then
296
297
298
299
300
301
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
302
    | (Symbol.T t) :: _ when Terminal.equal t Terminal.error ->
303
         raise DeadBranch
304
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
305
        (* Check if there is a transition labeled [a] out of [t.current]. If
306
307
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
308
309
310
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
311
        match SymbolMap.find a (Lr1.transitions t.current) with
312
        | successor ->
313
314
315
316
317
318
319
320
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
321
            let t' = insert w prod t' in
322
323
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
324
        | exception Not_found ->
325
            raise DeadBranch
326

327
328
329
330
331
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
332
333
334
335
336
337
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
338

339
340
341
342
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

343
344
345
  (* The star at [s] is obtained by starting with a fresh empty trie and
     inserting into it every production [prod] whose left-hand side [nt]
     is the label of an outgoing edge at [s]. *)
346
347
348
349
350
351
352
353
354
  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

355
356
357
358
359
360
361
  (* A trie [t] is nontrivial if it has at least one branch, i.e., contains at
     least one sub-trie whose [productions] field is nonempty. Trivia: a trie
     of size greater than 1 is necessarily nontrivial, but the converse is not
     true: a nontrivial trie can have size 1. (This occurs if all productions
     have zero length.) *)
  let trivial t =
    t.productions = [] && SymbolMap.is_empty t.transitions
362

363
364
  (* Redefine [star] to include a [nontrivial] test and to record the size of
     the newly built trie. *)
365
366
367
368

  let size =
    Array.make Lr1.n (-1)

369
  let star s =
370
    let initial = !c in
371
    let t = star s in
372
373
    let final = !c in
    size.(Lr1.number s) <- final - initial;
374
    if trivial t then None else Some t
375

376
377
378
379
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
380
  let compare t1 t2 =
381
    Pervasives.compare t1.identity t2.identity
382

383
384
385
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
386
387
  let current t =
    t.current
388

389
390
391
  let accepts prod t =
    List.mem prod t.productions

392
  let step a t =
393
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
394

395
  let verbose () =
396
    Printf.eprintf "Cumulated star size: %d\n%!" !c
397

398
399
end

POTTIER Francois's avatar
POTTIER Francois committed
400
401
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
402
403
404
405
406
407
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

408
409
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
410

411
type fact = {
412
  position: Trie.trie;
413
  word: W.word;
414
  lookahead: Terminal.t (* may be [any] *)
415
416
}

417
418
(* Accessors. *)

419
let source fact =
420
  Trie.source fact.position
421

POTTIER Francois's avatar
POTTIER Francois committed
422
let current fact =
423
  Trie.current fact.position
424

425
426
(* Two invariants reduce the number of facts that we consider:

427
428
429
   1. If [fact.lookahead] is a real terminal symbol [z] (i.e., not [any]),
      then [z] does not cause an error in the current state [current fact].
      It would be useless to consider a fact that violates this property;
POTTIER Francois's avatar
POTTIER Francois committed
430
431
432
      this cannot possibly lead to a successful reduction. In practice,
      this refinement allows reducing the number of facts that go through
      the queue by a factor of two.
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
451
452
  assert (non_error z);
  assert (Terminal.real a);
453
454
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
455
(* ------------------------------------------------------------------------ *)
456

457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
POTTIER Francois's avatar
POTTIER Francois committed
480
  (* [fact.lookahead] can be [any], but cannot be [error] *)
481
  assert (non_error fact.lookahead);
482
483
  assert (invariant1 fact);
  assert (invariant2 fact);
484
485
486
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

POTTIER Francois's avatar
POTTIER Francois committed
487
488
(* ------------------------------------------------------------------------ *)

489
490
491
(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
POTTIER Francois's avatar
POTTIER Francois committed
492
  (* For every state [s]... *)
493
  Lr1.iter (fun s ->
POTTIER Francois's avatar
POTTIER Francois committed
494
    (* If the trie rooted at [s] is nontrivial...*)
495
    match Trie.star s with
POTTIER Francois's avatar
POTTIER Francois committed
496
497
498
499
500
501
502
503
504
505
    | None ->
        ()
    | Some position ->
        (* ...then insert an initial fact into the priority queue. *)
        (* In order to respect invariants 1 and 2, we must distinguish two
           cases. If [s] is solid, then we insert a single fact, whose
           lookahead assumption is [any]. Otherwise, we must insert one
           initial fact for every terminal symbol [z] that does not cause
           an error in state [s]. *)
        let word = W.epsilon in
506
        if is_solid s then
POTTIER Francois's avatar
POTTIER Francois committed
507
          add { position; word; lookahead = any }
508
509
        else
          foreach_terminal_not_causing_an_error s (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
510
            add { position; word; lookahead = z }
511
          )
POTTIER Francois's avatar
POTTIER Francois committed
512
513
514
  );
  if X.verbose then
    Trie.verbose()
515
516
517

(* ------------------------------------------------------------------------ *)

518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
(* The module [T] maintains a set of known facts. *)

(* Three aspects of a fact are of particular interest:
   - its position [position], given by [fact.position];
   - its first symbol [a], given by [W.first fact.word fact.lookahead];
   - its lookahead assumption [z], given by [fact.lookahead].

   For every triple of [position], [a], and [z], we store at most one fact,
   (whose word has minimal length). Indeed, we are not interested in keeping
   track of several words that produce the same effect. Only the shortest such
   word is of interest.
   
   Thus, the total number of facts accumulated by the algorithm is at most
   [T.n^2], where [T] is the total size of the tries that we have constructed,
   and [n] is the number of terminal symbols. (This number can be quite large.
   [T] can be in the tens of thousands, and [n] can be over one hundred. These
   figures lead to a theoretical upper bound of 100M. In practice, for T=25K
   and n=108, we observe that the algorithm gathers about 7M facts.) *)
536

537
module T : sig
538
539

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
540
541
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
542
543
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
544
  (* [query current z f] enumerates all known facts whose current state is
545
546
     [current] and whose lookahead assumption is compatible with [z]. The
     symbol [z] must a real terminal symbol, i.e., cannot be [any]. *)
547
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
548

549
  (* [verbose()] outputs debugging & performance information. *)
550
  val verbose: unit -> unit
551

552
end = struct
553

554
555
556
557
558
559
560
561
562
563
564
  (* We need to query the set of facts in two ways. In [register], we must test
     whether a proposed triple of [position], [a], [z] already appears in the
     set. In [query], we must find all facts that match a pair [current, z],
     where [current] is a state. (Note that [position] determines [current], but
     the converse is not true: a position contains more information besides the
     current state.)

     To address these needs, we use a two-level table. The first level is a
     matrix indexed by [current] and [z]. At the second level, we find sets of
     facts, where two facts are considered equal if they have the same triple of
     [position], [a], and [z]. In fact, we know at this level that all facts
565
566
567
568
569
570
     have the same [z] component, so only [position] and [a] are compared.

     Because our facts satisfy invariant 2, [z] is [any] if and only if the
     state [current fact] is solid. This means that we are wasting quite a
     lot of space in the matrix (for a solid state, the whole line is empty,
     except for the [any] column). *)
571

572
  (* The level-2 sets. *)
573
574

  module M =
575
    MySet.Make(struct
576
577
      type t = fact
      let compare fact1 fact2 =
578
        assert (fact1.lookahead = fact2.lookahead);
579
        let c = Trie.compare fact1.position fact2.position in
580
        if c <> 0 then c else
581
582
583
        let z = fact1.lookahead in
        let a1 = W.first fact1.word z
        and a2 = W.first fact2.word z in
584
        (* note: [a1] and [a2] can be [any] here *)
585
586
        Terminal.compare a1 a2
    end)
587

588
589
590
  (* The level-1 matrix. *)

  let table =
591
    Array.make (Lr1.n * Terminal.n) M.empty
592

POTTIER Francois's avatar
POTTIER Francois committed
593
  let index current z =
594
    Terminal.n * (Lr1.number current) + Terminal.t2i z
595

596
597
  let count = ref 0

598
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
599
    let current = current fact in
600
    let z = fact.lookahead in
POTTIER Francois's avatar
POTTIER Francois committed
601
    let i = index current z in
602
603
604
605
606
607
608
609
610
611
612
613
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
614
  let query current z f =
615
616
617
618
619
    assert (not (Terminal.equal z any));
    (* If the state [current] is solid then the facts that concern it are
       stored in the column [any], and all of them are compatible with [z].
       Otherwise, they are stored in all columns except [any], and only
       those stored in the column [z] are compatible with [z]. *)
620
    let i = index current (if is_solid current then any else z) in
621
622
    let m = table.(i) in
    M.iter f m
623

624
  let verbose () =
625
    Printf.eprintf "T stores %d facts.\n%!" !count
626

627
628
end

POTTIER Francois's avatar
POTTIER Francois committed
629
630
(* ------------------------------------------------------------------------ *)

631
632
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
POTTIER Francois's avatar
POTTIER Francois committed
633
634
635
636
637
638
639
640
641
642
643
644
   taken.
   
   It maintains a set of quadruples [s, nt, w, z], where such a quadruple means
   that in the state [s], the outgoing edge labeled [nt] can be taken by
   consuming the word [w], under the assumption that the next symbol is [z].

   Again, the terminal symbol [a], given by [W.first w z], plays a role. For
   each quadruple [s, nt, a, z], we store at most one quadruple [s, nt, w, z].
   Thus, internally, we maintain a mapping of [s, nt, a, z] to [w].

   For greater simplicity, we do not allow [z] to be [any] in [register] or
   [query]. Allowing it would complicate things significantly, it seems. *)
645
646
647
648

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
649
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
POTTIER Francois's avatar
POTTIER Francois committed
650
651
     It returns [true] if this information is new, i.e., if the underlying
     quadruple [s, nt, a, z] is new. The symbol [z] cannot be [any]. *)
652
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
653

POTTIER Francois's avatar
POTTIER Francois committed
654
655
656
657
658
659
660
  (* [query s nt a z] enumerates all words [w] such that, in state [s], the
     outgoing edge labeled [nt] can be taken by consuming the word [w], under
     the assumption that the next symbol is [z], and the first symbol of the
     word [w.z] is [a]. The symbol [a] can be [any]. The symbol [z] cannot be
     [any]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t ->
             (W.word -> unit) -> unit
661

POTTIER Francois's avatar
POTTIER Francois committed
662
  (* [verbose()] outputs debugging & performance information. *)
663
  val verbose: unit -> unit
664

665
666
end = struct

667
668
669
670
671
672
673
674
675
676
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
677

678
  module H = Hashtbl
679

POTTIER Francois's avatar
POTTIER Francois committed
680
681
  let table =
    Array.init Lr1.n (fun i ->
682
683
684
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
685
686
687

  let index s =
    Lr1.number s
688

689
  let pack nt a z : int =
POTTIER Francois's avatar
POTTIER Francois committed
690
    (* We rely on the fact that we have at most 256 terminal symbols. *)
691
692
693
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
694

695
696
  let count = ref 0

697
  let register s nt w z =
698
    assert (Terminal.real z);
699
    let i = index s in
700
    let m = table.(i) in
701
    let a = W.first w z in
POTTIER Francois's avatar
POTTIER Francois committed
702
    (* Note that looking at [a] in state [s] cannot cause an error. *)
703
    assert (not (causes_an_error s a));
704
705
706
707
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
708
      incr count;
709
      H.add m key w;
710
711
      true
    end
712

713
  let rec query s nt a z f =
714
    assert (Terminal.real z);
POTTIER Francois's avatar
POTTIER Francois committed
715
716
    if Terminal.equal a any then begin
      (* If [a] is [any], we query the table for every real symbol [a].
717
718
719
720
721
722
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
POTTIER Francois's avatar
POTTIER Francois committed
723
724
725
726
727
728
729
730
    end
    else begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
731
    end
732

733
  let verbose () =
734
    Printf.eprintf "E stores %d facts.\n%!" !count
735

736
737
end

POTTIER Francois's avatar
POTTIER Francois committed
738
739
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
740
741
742
743
744
745
746
(* [new_edge s nt w z] is invoked when we discover that in the state [s], the
   outgoing edge labeled [nt] can be taken by consuming the word [w], under
   the assumption that the next symbol is [z]. We check whether this quadruple
   already exists in the set [E]. If not, then we add it, and we compute its
   consequences, in the form of new facts, which we insert into the priority
   queue for later examination. *)

747
let new_edge s nt w z =
748
  assert (Terminal.real z);
749
  if E.register s nt w z then
750
    let sym = Symbol.N nt in
POTTIER Francois's avatar
POTTIER Francois committed
751
752
753
754
    (* Query [T] for existing facts which could be extended by following
       this newly discovered edge. They must be facts whose current state
       is [s] and whose lookahead assumption is compatible with [a]. For
       each such fact, ... *)
755
    T.query s (W.first w z) (fun fact ->
756
      assert (compatible fact.lookahead (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
757
      (* ... try to take one step in the trie along an edge labeled [nt]. *)
758
759
      match Trie.step sym fact.position with
      | position ->
POTTIER Francois's avatar
POTTIER Francois committed
760
761
762
763
764
765
          (* This takes up to a new state whose incoming symbol is [nt].
             Hence, this state is not solid. In order to satisfy invariant 2,
             we must create fact whose lookahead assumption is not [any].
             That's fine, since our lookahead assumption is [z]. In order to
             satisfy invariant 1, we must check that [z] does not cause an
             error in this state. *)
766
          assert (not (is_solid (Trie.current position)));
767
          if not (causes_an_error (Trie.current position) z) then
POTTIER Francois's avatar
POTTIER Francois committed
768
769
            let word = W.append fact.word w in
            add { position; word; lookahead = z }
770
      | exception Not_found ->
POTTIER Francois's avatar
POTTIER Francois committed
771
772
773
          (* Could not take a step in the trie. This means this branch
             leads nowhere of interest, and was pruned when the trie
             was constructed. *)
774
          ()
775
    )
776

POTTIER Francois's avatar
POTTIER Francois committed
777
778
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
779
780
(* [new_fact fact] is invoked when we discover a new fact (i.e., one that was
   not previously known). It studies the consequences of this fact. These
781
782
783
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
POTTIER Francois's avatar
POTTIER Francois committed
784
785
786
787
788
789
790
791
792
     discovered vertex. We study its (currently known) outgoing edges,
     and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge. This is
     the case when the word that took us from [fact.source] to [fact.current]
     represents a production of the grammar and [fact.current] is willing to
     reduce this production. We record the existence of this edge, and
     re-inspect any previously discovered vertices which are interested in
     this outgoing edge. *)
793

POTTIER Francois's avatar
POTTIER Francois committed
794
let new_fact fact =
795

POTTIER Francois's avatar
POTTIER Francois committed
796
  let current = current fact in
797

POTTIER Francois's avatar
POTTIER Francois committed
798
799
800
  (* 1. View [fact] as a vertex. Examine the transitions out of [current].
     For every transition labeled by a symbol [sym] and into a state
     [target], ... *)
801
  
POTTIER Francois's avatar
POTTIER Francois committed
802
803
804
  Lr1.transitions current |> SymbolMap.iter (fun sym target ->
    (* ... try to follow this transition in the trie [fact.position],
       down to a child which we call [position]. *)
805
    match Trie.step sym fact.position, sym with
POTTIER Francois's avatar
POTTIER Francois committed
806
807
808
809
810
811
812

    | exception Not_found ->

        (* Could not take a step in the trie. This means this transition
           leads nowhere of interest. *)
        ()

813
    | position, Symbol.T t ->
POTTIER Francois's avatar
POTTIER Francois committed
814
815
816
817
818
819
          
        (* 1a. The transition exists in the trie, and [sym] is in fact a
           terminal symbol [t]. We note that [t] cannot be the [error] token,
           because the trie does not have any edges labeled [error]. *)
        assert (Lr1.Node.compare (Trie.current position) target = 0);
        assert (is_solid target);
820
        assert (non_error t);
821

POTTIER Francois's avatar
POTTIER Francois committed
822
823
824
825
826
827
828
829
830
831
832
        (* If the lookahead assumption [fact.lookahead] is compatible with
           [t], then we derive a new fact, where one more edge has been taken,
           and enqueue this new fact for later examination. *)
        
        (* The state [target] is solid, i.e., its incoming symbol is terminal.
           This state is always entered without consideration for the next
           lookahead symbol. Thus, we can use [any] as the lookahead assumption
           in the new fact that we produce. If we did not have [any], we would
           have to produce one fact for every possible lookahead symbol. *)

        if compatible fact.lookahead t then
833
          let word = W.append fact.word (W.singleton t) in
834
          add { position; word; lookahead = any }
835

836
    | position, Symbol.N nt ->
837

POTTIER Francois's avatar
POTTIER Francois committed
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
        (* 1b. The transition exists in the trie, and [sym] is in fact a
           nonterminal symbol [nt]. *)
         assert (Lr1.Node.compare (Trie.current position) target = 0);
         assert (not (is_solid target));

        (* We need to know how this nonterminal edge can be taken. We query
           [E] for a word [w] that allows us to take this edge. In general,
           the answer depends on the terminal symbol [z] that comes *after*
           this word: we try all such symbols. We must make sure that the
           first symbol of the word [w.z] satisfies the lookahead assumption
           [fact.lookahead]; this is ensured by passing this information to
           [E.query]. *)

        (* It could be the case that, due to a default reduction, the answer
           to our query does not depend on [z], and we are wasting work.
           However, allowing [z] to be [any] in [E.query], and taking 
           advantage of this to increase performance, seems difficult. *)

        foreach_terminal_not_causing_an_error target (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
857
          E.query current nt fact.lookahead z (fun w ->
858
            assert (compatible fact.lookahead (W.first w z));
POTTIER Francois's avatar
POTTIER Francois committed
859
860
            let word = W.append fact.word w in
            add { position; word; lookahead = z }
861
862
          )
        )
863

POTTIER Francois's avatar
POTTIER Francois committed
864
  );
865
866

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
867
868
     [fact.source] to the [current] state represents a production [prod] and
     [current] is willing to reduce this production. Then, reducing [prod]
869
870
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
POTTIER Francois's avatar
POTTIER Francois committed
871
     [fact.source]. *)
872

POTTIER Francois's avatar
POTTIER Francois committed
873
874
875
876
877
878
879
880
881
882
  let z = fact.lookahead in
  if not (Terminal.equal z any) then begin

    (* 2a. The lookahead assumption [z] is a real terminal symbol. We check
       whether [current] is willing to reduce some production [prod] on [z],
       and whether the sub-trie [fact.position] accepts [prod], which means
       that this reduction takes us back to the root of the trie. If so, we
       have discovered a new edge. *)

    match has_reduction current z with
883
    | Some prod when Trie.accepts prod fact.position ->
POTTIER Francois's avatar
POTTIER Francois committed
884
        new_edge (source fact) (Production.nt prod) fact.word z
885
886
    | _ ->
        ()
POTTIER Francois's avatar
POTTIER Francois committed
887

888
889
  end
  else begin
POTTIER Francois's avatar
POTTIER Francois committed
890
891
892
893
894

    (* 2b. The lookahead assumption is [any]. We must consider every pair
       [prod, z] such that the [current] state can reduce [prod] on [z]
       and [fact.position] accepts [prod]. *)

895
896
897
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
POTTIER Francois's avatar
POTTIER Francois committed
898
899
          (* [new_edge] does not accept [any] as its 4th parameter, so we
             must iterate over all terminal symbols. *)
900
          foreach_terminal (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
901
            new_edge (source fact) (Production.nt prod) fact.word z
902
903
904
          )
    | None ->
       TerminalMap.iter (fun z prods ->
905
         if non_error z then
906
907
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
POTTIER Francois's avatar
POTTIER Francois committed
908
             new_edge (source fact) (Production.nt prod) fact.word z
909
       ) (Lr1.reductions current)
POTTIER Francois's avatar
POTTIER Francois committed
910

911
  end
912

POTTIER Francois's avatar
POTTIER Francois committed
913
914
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
915
(* The main loop of the algorithm. *)
916

POTTIER Francois's avatar
POTTIER Francois committed
917
918
let level, extracted, considered =
  ref 0, ref 0, ref 0
POTTIER Francois's avatar
POTTIER Francois committed
919

POTTIER Francois's avatar
POTTIER Francois committed
920
let done_with_level () =
921
  if X.verbose then begin
POTTIER Francois's avatar
POTTIER Francois committed
922
    Printf.eprintf "Done with level %d.\n" !level;
923
924
    W.verbose();
    T.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
925
926
927
928
    E.verbose();
    Printf.eprintf "Q stores %d facts.\n" (Q.cardinal q);
    Printf.eprintf "%d facts extracted out of Q, of which %d considered.\n%!"
      !extracted !considered
929
  end
930

POTTIER Francois's avatar
POTTIER Francois committed
931
let () =
POTTIER Francois's avatar
POTTIER Francois committed
932
933
934
935
936
937
938
939
940
941
942
943
944
  Q.repeat q (fun fact ->
    incr extracted;
    if T.register fact then begin
      if W.length fact.word > !level then begin
        done_with_level();
        level := W.length fact.word;
      end;
      incr considered;
      new_fact fact
    end
  );
  done_with_level();
  Time.tick "Running LRijkstra"
945

946
947
(* ------------------------------------------------------------------------ *)

948
949
950
951
952
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
953
  Printf.eprintf "coverage: internal error: %s.\n%!" msg;
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

978
979
980
981
982
983
984
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
985
   module [E] above. *)
986

987
let forward () =
988

989
  let module A = Astar.Make(struct
990

991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
1012
1013
      )

1014
    let successors (s, z) edge =
1015
      assert (Terminal.real z);
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

1039
  Printf.eprintf "Forward search:\n%!";
1040
  let seen = ref Lr1.NodeSet.empty in
1041
  let _, _ = A.search (fun ((s', z), path) ->
1042
1043
1044
1045
1046
1047
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
1048
      Printf.eprintf
1049
1050
1051
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
1052
      Printf.eprintf "%s\n%!" (W.print w);
1053
1054
1055
      assert (validate s s' w)
    end
  ) in
1056
  Printf.eprintf "Reachable (forward): %d states\n%!"
1057
1058
    (Lr1.NodeSet.cardinal !seen);
  !seen
1059
1060

let () =
1061
1062
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
1063
  let stat = Gc.quick_stat() in
1064
  Printf.eprintf
POTTIER Francois's avatar
POTTIER Francois committed
1065
1066
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);