LRijkstra.ml 35.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
(* The purpose of this algorithm is to find, for each pair of a state [s]
   and a terminal symbol [z] such that looking at [z] in state [s] causes
   an error, a minimal path (starting in some initial state) that actually
   triggers this error. *)

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
   on the actual shortest path to the error at [s, z]. Indeed, two
   difficulties arise:

   - Some states have a default reduction. Thus, they will not trigger
     an error, even though they should. The error is triggered in some
     other state, after reduction takes place.

   - If the grammar has conflicts, conflict resolution removes some
     (shift or reduce) actions, hence may suppress the shortest path. *)

25
26
27
28
29
30
31
32
33
34
35
(* We explicitly choose to ignore the [error] token. Thus, we disregard any
   reductions or transitions that take place when the lookahead symbol is
   [error]. As a result, any state whose incoming symbol is [error] is found
   unreachable. It would be too complicated to have to create a first error in
   order to be able to take certain transitions or drop certain parts of the
   input. *)

(* We never work with the terminal symbol [#] either. This symbol never
   appears in the maps returned by [Lr1.transitions] and [Lr1.reductions].
   Thus, in principle, we work with ``real'' terminal symbols only. However,
   we encode [any] as [#] -- see below. *)
36

37
38
39
40
41
42
43
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
   in a big functor without arguments. *)

module Run (X : sig end) = struct

44
open Grammar
45
46
47

(* ------------------------------------------------------------------------ *)

48
49
(* Because of our encoding of terminal symbols as 8-bit characters, this
   algorithm supports at most 256 terminal symbols. *)
50
51

let () =
52
  if Terminal.n > 256 then
53
    Error.error [] (Printf.sprintf
54
      "--list-errors supports at most 256 terminal symbols.\n\
55
56
57
58
59
       The grammar has %d terminal symbols." Terminal.n
    )

(* ------------------------------------------------------------------------ *)

60
61
62
63
64
(* Build a module that represents words as (hash-consed) strings. Note:
   this functor application has a side effect (it allocates memory, and
   more importantly, it may fail). *)

module W = Terminal.Word(struct end)
65

66
67
(* ------------------------------------------------------------------------ *)

68
69
(* The [error] token may appear in the maps returned by [Lr1.transitions]
   and [Lr1.reductions], so we sometimes need to explicitly check for it. *)
70

71
let non_error z =
72
  not (Terminal.equal z Terminal.error)
73

74
75
76
77
(* We introduce a pseudo-terminal symbol [any]. It is used in several places
   later on, in particular in the field [fact.lookahead], to encode the
   absence of a lookahead hypothesis -- i.e., any terminal symbol will do. *)

78
79
80
81
82
(* We choose to encode [any] as [#]. There is no risk of confusion, since we
   do not use [#] anywhere. Thus, the assertion [Terminal.real z] implies
   [z <> any]. *)

let any =
83
  Terminal.sharp
84

85
86
(* ------------------------------------------------------------------------ *)

87
88
89
90
91
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

POTTIER Francois's avatar
POTTIER Francois committed
92
93
94
(* [reductions_on s z] is the list of reductions permitted in state [s] when
   the lookahead symbol is [z]. This is a list of zero or one elements. This
   does not take default reductions into account. *)
95

POTTIER Francois's avatar
POTTIER Francois committed
96
let reductions_on s z : Production.index list =
97
  assert (Terminal.real z);
98
99
100
101
102
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

103
104
105
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
106
107

let has_reduction s z : Production.index option =
108
  assert (Terminal.real z);
109
110
111
112
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
113
      match reductions_on s z with
114
115
116
117
118
119
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

120
121
122
123
124
125
126
127
128
(* [can_reduce s prod] indicates whether state [s] is able to reduce
   production [prod] (either as a default reduction, or as a normal
   reduction). *)

let can_reduce s prod =
  match Invariant.has_default_reduction s with
  | Some (prod', _) when prod = prod' ->
      true
  | _ ->
129
      TerminalMap.fold (fun z prods accu ->
130
131
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
132
        accu || non_error z && List.mem prod prods
133
134
      ) (Lr1.reductions s) false

135
136
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
137

138
let causes_an_error s z : bool =
139
  assert (Terminal.real z);
140
141
142
143
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
POTTIER Francois's avatar
POTTIER Francois committed
144
      reductions_on s z = [] &&
145
146
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

147
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
148
   turn, except [error] and [#]. *)
149

150
151
let foreach_terminal =
  Terminal.iter_real
152

153
154
155
156
157
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

158
159
160
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
161
      (* There is a default reduction. No symbol causes an error. *)
162
163
      foreach_terminal f
  | None ->
164
165
166
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
167
168
169
        (* A reduction on [#] is always a default reduction. (See [lr1.ml].) *)
        assert (not (Terminal.equal z Terminal.sharp));
        if non_error z then
170
          f z
171
      ) (Lr1.reductions s);
172
173
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
174
175
      SymbolMap.iter (fun sym _ ->
        match sym with
176
        | Symbol.T z ->
177
178
            assert (not (Terminal.equal z Terminal.sharp));
            if non_error z then
179
              f z
180
181
182
183
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

184
185
186
187
188
189
190
191
192
193
194
195
196
(* Let us say a state [s] is solid if its incoming symbol is a terminal symbol
   (or if it has no incoming symbol at all, i.e., it is an initial state). A
   contrario, a state is fragile if its incoming symbol is a non-terminal
   symbol. *)

let is_solid s =
  match Lr1.incoming_symbol s with
  | None
  | Some (Symbol.T _) ->
      true
  | Some (Symbol.N _) ->
      false

197
198
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
199
200
201
202
203
204
205
206
207
208
209
210
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

211
212
213
module Trie : sig

  type trie
214
215
216
217

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
218
     edge.) If the star turns out to be trivial then [None] is returned. *)
219
220
  val star: Lr1.node -> trie option

221
222
223
224
  (* After [star s] has been called, [size (Lr1.number s)] reports the size
     of the trie that has been constructed for state [s]. *)
  val size: int -> int

225
226
227
228
229
230
231
232
  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
233
  val source: trie -> Lr1.node
POTTIER Francois's avatar
POTTIER Francois committed
234
235

  (* [current t] returns the current state of the (sub-)trie [t]. This is
236
237
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
238
  val current: trie -> Lr1.node
239

240
241
242
243
244
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
245

246
247
248
249
250
251
252
253
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
254

255
256
  (* A trie has the following structure. *)

257
  type trie = {
258
259
260
    (* A unique identity, used by [compare]. The trie construction code
       ensures that these numbers are indeed unique: see [fresh], [insert],
       [star]. *)
261
    identity: int;
262
    (* The root state of this star: "where we come from". *)
263
    source: Lr1.node;
264
    (* The current state, i.e., the root of this sub-trie: "where we are". *)
POTTIER Francois's avatar
POTTIER Francois committed
265
    current: Lr1.node;
266
267
268
    (* The productions that we can reduce in the current state. In other
       words, if this list is nonempty, then the current state is the end
       of one (or several) branches. It can nonetheless have children. *)
269
    productions: Production.index list;
270
271
    (* The children, or sub-tries. *)
    transitions: trie SymbolMap.t
272
  }
273

274
  (* This counter is used by [mktrie] to produce unique identities. *)
275
276
  let c = ref 0

277
  (* This smart constructor creates a new trie with a unique identity. *)
POTTIER Francois's avatar
POTTIER Francois committed
278
  let mktrie source current productions transitions =
279
    let identity = Misc.postincrement c in
POTTIER Francois's avatar
POTTIER Francois committed
280
    { identity; source; current; productions; transitions }
281

282
283
  exception DeadBranch

284
  let rec insert w prod t =
285
286
    match w with
    | [] ->
POTTIER Francois's avatar
POTTIER Francois committed
287
        (* We check whether the current state [t.current] is able to reduce
288
289
290
291
292
           production [prod]. (If [prod] cannot be reduced, the reduction
           action must have been suppressed by conflict resolution.) If not,
           then this branch is dead. This test is superfluous (i.e., it would
           be OK to conservatively assume that [prod] can be reduced) but
           allows us to build a slightly smaller star in some cases. *)
POTTIER Francois's avatar
POTTIER Francois committed
293
        if can_reduce t.current prod then
294
295
296
297
298
299
          (* We consume (update) the trie [t], so there is no need to allocate
             a new stamp. (Of course we could allocate a new stamp, but I prefer
             to be precise.) *)
          { t with productions = prod :: t.productions }
        else
          raise DeadBranch
300
    | (Symbol.T t) :: _ when not (non_error t) ->
301
         raise DeadBranch
302
    | a :: w ->
POTTIER Francois's avatar
POTTIER Francois committed
303
        (* Check if there is a transition labeled [a] out of [t.current]. If
304
305
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
306
307
308
           present in a canonical automaton.) We could in this case return an
           unchanged sub-trie. We can do slightly better: we abort the whole
           insertion, so as to return an unchanged toplevel trie. *)
POTTIER Francois's avatar
POTTIER Francois committed
309
        match SymbolMap.find a (Lr1.transitions t.current) with
310
        | successor ->
311
312
313
314
315
316
317
318
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
319
            let t' = insert w prod t' in
320
321
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
322
        | exception Not_found ->
323
            raise DeadBranch
324

325
326
327
328
329
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
330
331
332
333
334
335
    let save = !c in
    try
      insert w prod t
    with DeadBranch ->
      c := save;
      t
336

337
338
339
340
341
342
343
344
345
346
347
348
349
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

350
  (* [nontrivial t] tests whether the trie [t] has any branches, i.e.,
351
352
353
354
     contains at least one sub-trie whose [productions] field is nonempty.
     Trivia: a trie of size greater than 1 is necessarily nontrivial, but the
     converse is not true: a nontrivial trie can have size 1. (This occurs
     when all productions have zero length.) *)
355
356
357
  let nontrivial t =
    not (t.productions = [] && SymbolMap.is_empty t.transitions)

358
359
360
361
362
363
  (* Redefine [star] to include a [nontrivial] test and to record the size
     of the newly built trie. *)

  let size =
    Array.make Lr1.n (-1)

364
  let star s =
365
    let initial = !c in
366
    let t = star s in
367
368
    let final = !c in
    size.(Lr1.number s) <- final - initial;
369
370
371
372
373
    if nontrivial t then
      Some t
    else
      None

374
375
376
377
  let size s =
    assert (size.(s) >= 0);
    size.(s)

POTTIER Francois's avatar
POTTIER Francois committed
378
379
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
380

381
382
383
  let source t =
    t.source

POTTIER Francois's avatar
POTTIER Francois committed
384
385
  let current t =
    t.current
386

387
388
389
  let accepts prod t =
    List.mem prod t.productions

390
  let step a t =
391
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
392

393
  let verbose () =
394
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
395

396
397
end

POTTIER Francois's avatar
POTTIER Francois committed
398
399
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
400
401
402
403
404
405
(* The main algorithm, [LRijkstra], accumulates facts. A fact is a triple of a
   position (that is, a sub-trie), a word, and a lookahead assumption. Such a
   fact means that this position can be reached, from the source state
   [Trie.source fact.position], by consuming [fact.word], under the assumption
   that the next input symbol is [fact.lookahead]. *)

406
407
(* We allow [fact.lookahead] to be [any] so as to indicate that this fact does
   not have a lookahead assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
408

409
type fact = {
410
  position: Trie.trie;
411
  word: W.word;
412
  lookahead: Terminal.t (* may be [any] *)
413
414
}

415
416
(* Accessors. *)

417
let source fact =
418
  Trie.source fact.position
419

POTTIER Francois's avatar
POTTIER Francois committed
420
let current fact =
421
  Trie.current fact.position
422

423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
(* Two invariants reduce the number of facts that we consider:

   1. If [fact.lookahead] is a terminal symbol [z] (i.e., not [any]), then
      [z] does not cause an error in the current state [current fact]. It
      would be useless to consider a fact that violates this property; it
      cannot possibly lead to a successful reduction.

   2. [fact.lookahead] is [any] iff the current state [current fact] is
      solid. This sounds rather reasonable (when a state is entered
      by shifting, it is entered regardless of which symbol follows)
      and simplifies the implementation of the sub-module [T].

*)

let invariant1 fact =
  fact.lookahead = any || not (causes_an_error (current fact) fact.lookahead)

let invariant2 fact =
  (fact.lookahead = any) = is_solid (current fact)

(* [compatible z a] checks whether the terminal symbol [a] satisfies the
   lookahead assumption [z] -- which can be [any]. *)

let compatible z a =
447
448
  assert (non_error z);
  assert (Terminal.real a);
449
450
  z = any || z = a

POTTIER Francois's avatar
POTTIER Francois committed
451
(* ------------------------------------------------------------------------ *)
452

453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
(* As in Dijkstra's algorithm, a priority queue contains the facts that await
   examination. The length of [fact.word] serves as the priority of a fact.
   This guarantees that we discover shortest paths. (We never insert into the
   queue a fact whose priority is less than the priority of the last fact
   extracted out of the queue.) *)

(* [LowIntegerPriorityQueue] offers very efficient operations (essentially
   constant time, for a small constant). It exploits the fact that priorities
   are low nonnegative integers. *)

module Q = LowIntegerPriorityQueue

let q =
  Q.create()

(* We never insert into the queue a fact that immediately causes an error,
   i.e., a fact such that [causes_an_error (current fact) fact.lookahead]
   holds. In practice, this convention allows reducing the number of facts
   that go through the queue by a factor of two. *)

(* In principle, there is no need to insert the fact into the queue if [T]
   already stores a comparable fact. We could perform this test in [add].
   However, a quick experiment suggests that this is not worthwhile. The run
   time augments (because membership in [T] is tested twice, upon inserting
   and upon extracting) and the memory consumption does not seem to go down
   significantly. *)

let add fact =
481
482
  (* [fact.lookahead] can be [any] *)
  assert (non_error fact.lookahead);
483
484
  assert (invariant1 fact);
  assert (invariant2 fact);
485
486
487
488
489
490
491
492
493
  (* The length of [fact.word] serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)

(* Construct the [star] of every state [s]. Initialize the priority queue. *)

let () =
  Lr1.iter (fun s ->
    match Trie.star s with
    | Some trie ->
494
495
        (* TEMPORARY weird *)
        if is_solid s then
496
497
498
          add {
            position = trie;
            word = W.epsilon;
499
            lookahead = any
500
          }
501
502
503
504
505
506
507
508
        else
          foreach_terminal_not_causing_an_error s (fun z ->
            add {
              position = trie;
              word = W.epsilon;
              lookahead = z
            }
          )
509
510
511
512
513
514
    | None ->
        ()
  )

(* ------------------------------------------------------------------------ *)

515
516
517
518
519
520
521
522
523
524
(* The first symbol of the input, [W.first fact.word fact.lookahead], plays a
   special role. Indeed, for every position, for every first symbol, and for
   every lookahead symbol, we keep track of at most one fact. Thus, the total
   number of facts accumulated by the algorithm is at most [T.n^2], where [T]
   is the total size of the tries that we have constructed, and [n] is the
   number of terminal symbols. (This number can be quite large. [T] can be in
   the tens of thousands, and [n] can be over one hundred. These figures lead
   to a theoretical upper bound of 100M. In practice, for T=25K and n=108, we
   observe that the algorithm gathers about 7M facts.) *)

525
module T : sig
526
527

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
POTTIER Francois's avatar
POTTIER Francois committed
528
529
     is new, i.e., no fact concerning the same triple of [position], [a], and
     [z] was previously known. *)
530
531
  val register: fact -> bool

POTTIER Francois's avatar
POTTIER Francois committed
532
533
  (* [query current z f] enumerates all known facts whose current state is
     [current] and whose lookahead assumption is [z]. *)
534
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
535

536
  val verbose: unit -> unit
537

538
end = struct
539

540
  (* This module implements a set of facts. Two facts are considered equal
541
     (for the purposes of this set) if they have the same [position], [a], and
542
543
544
545
546
547
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
POTTIER Francois's avatar
POTTIER Francois committed
548
549
     that match a pair [current, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [current] and [z]. At the second
550
     level, we find sets of facts. *)
551
552
553
(**)

  module M =
554
    MySet.Make(struct
555
556
      type t = fact
      let compare fact1 fact2 =
557
        let c = Trie.compare fact1.position fact2.position in
558
        if c <> 0 then c else
559
560
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
561
        (* note: [a1] and [a2] can be [any] here *)
562
563
        Terminal.compare a1 a2
    end)
564

565
  let table = (* a pretty large table... *)
566
    Array.make (Lr1.n * Terminal.n) M.empty
567
  (* TEMPORARY this space is wasted for solid states *)
568

POTTIER Francois's avatar
POTTIER Francois committed
569
  let index current z =
570
    Terminal.n * (Lr1.number current) + Terminal.t2i z
571

572
573
  let count = ref 0

574
  let register fact =
POTTIER Francois's avatar
POTTIER Francois committed
575
    let current = current fact in
576
    let z = fact.lookahead in
577
    assert (non_error z);
578
579
    (* [z] is [any] iff [current] is solid. *)
    assert ((z = any) = is_solid current);
POTTIER Francois's avatar
POTTIER Francois committed
580
    let i = index current z in
581
582
583
584
585
586
587
588
589
590
591
592
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

POTTIER Francois's avatar
POTTIER Francois committed
593
  let query current z f =
594
595
596
597
    assert (z <> any);
    (* if [current] is solid then the facts that concern it are stored
       under any [any], not under [z] *)
    let i = index current (if is_solid current then any else z) in
598
599
    let m = table.(i) in
    M.iter f m
600

601
  let verbose () =
602
603
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

604
605
end

POTTIER Francois's avatar
POTTIER Francois committed
606
607
(* ------------------------------------------------------------------------ *)

608
609
610
611
612
613
614
(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
615
616
617
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
618
619
620
621

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
622
623
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
624

625
  val verbose: unit -> unit
626

627
628
end = struct

629
630
631
632
633
634
635
636
637
638
  (* At a high level, we must implement a mapping of [s, nt, a, z] to [w]. In
     practice, we can implement this specification using any combination of
     arrays, hash tables, balanced binary trees, and perfect hashing (i.e.,
     packing several of [s], [nt], [a], [z] in one word.) Here, we choose to
     use an array, indexed by [s], of hash tables, indexed by a key that packs
     [nt], [a], and [z] in one word. According to a quick experiment, the
     final population of the hash table [table.(index s)] seems to be roughly
     [Terminal.n * Trie.size s]. We note that using an initial capacity
     of 0 and relying on the hash table's resizing mechanism has a significant
     cost, which is why we try to guess a good initial capacity. *)
639

640
  module H = Hashtbl
641

642
  let table = (* a pretty large table... *)
643
644
645
646
    Array.init (Lr1.n) (fun i ->
      let size = Trie.size i in
      H.create (if size = 1 then 0 else Terminal.n * size)
    )
647
648
649

  let index s =
    Lr1.number s
650

651
652
653
654
  let pack nt a z : int =
    (Nonterminal.n2i nt lsl 16) lor
    (Terminal.t2i a lsl 8) lor
    (Terminal.t2i z)
655

656
657
  let count = ref 0

658
  let register s nt w z =
659
    assert (Terminal.real z);
660
    let i = index s in
661
    let m = table.(i) in
662
    let a = W.first w z in
663
    assert (not (causes_an_error s a));
664
665
666
667
    let key = pack nt a z in
    if H.mem m key then
      false
    else begin
668
      incr count;
669
      H.add m key w;
670
671
      true
    end
672

673
  let rec query s nt a z f =
674
    assert (Terminal.real z);
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
    (* [a] can be [any] *)
    if a <> any then begin
      let i = index s in
      let m = table.(i) in
      let key = pack nt a z in
      match H.find m key with
      | w -> f w
      | exception Not_found -> ()
    end
    else begin
      (* If [a] is [any], we query the table for every concrete [a].
         We can limit ourselves to symbols that do not cause an error
         in state [s]. Those that do certainly do not have an entry;
         see the assertion in [register] above. *)
      foreach_terminal_not_causing_an_error s (fun a ->
        query s nt a z f
      )
        (* TEMPORARY try a scheme that allows a more efficient iteration? *)
    end
694

695
  let verbose () =
696
697
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

698
699
end

POTTIER Francois's avatar
POTTIER Francois committed
700
701
(* ------------------------------------------------------------------------ *)

702
let new_edge s nt w z =
703
  assert (Terminal.real z);
704
  if E.register s nt w z then
705
    let sym = Symbol.N nt in
706
    T.query s (W.first w z) (fun fact ->
707
      assert (compatible fact.lookahead (W.first w z));
708
709
      match Trie.step sym fact.position with
      | position ->
710
          assert (not (is_solid (Trie.current position)));
711
          if not (causes_an_error (Trie.current position) z) then
712
            add {
713
              position;
714
715
716
717
718
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
719
    )
720
721
722
723
724
725
726
727
728
729

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
POTTIER Francois's avatar
POTTIER Francois committed
730
731
   This is the case when the word from [fact.source] to [fact.current]
   represents a production of the grammar and [fact.current] is willing
732
733
734
735
736
737
738
739
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

POTTIER Francois's avatar
POTTIER Francois committed
740
  let current = current fact in
741

POTTIER Francois's avatar
POTTIER Francois committed
742
  (* 1. View [fact] as a vertex. Examine the transitions out of [current]. *)
743
  
744
  SymbolMap.iter (fun sym s' ->
745
    match Trie.step sym fact.position, sym with
746
    | exception Not_found -> ()
747
    | position, Symbol.T t ->
748
749
        (* [t] cannot be the [error] token, because the trie does not have
           any edges labeled [error]. *)
750
        assert (non_error t);
751

POTTIER Francois's avatar
POTTIER Francois committed
752
        (* 1a. There is a transition labeled [t] out of [current]. If
753
754
755
756
757
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

758
        if compatible fact.lookahead t then begin
759
          let word = W.append fact.word (W.singleton t) in
760
761
762
763
764
765
766
767
          (* assert (Lr1.Node.compare (Trie.current position) s' = 0); *)
          (* [s'] has a terminal incoming symbol. It is always entered
             without consideration for the next lookahead symbol. Thus,
             we use [any] as the lookahead assumption in the new fact
             that we produce. *)
          assert (is_solid (Trie.current position));
          add { position; word; lookahead = any }
        end
768

769
    | position, Symbol.N nt ->
770

POTTIER Francois's avatar
POTTIER Francois committed
771
        (* 1b. There is a transition labeled [nt] out of [current]. We
772
773
774
775
776
777
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
POTTIER Francois's avatar
POTTIER Francois committed
778
779
        (* TEMPORARY it could be that the answer does not depend on [z]...
           (default reduction) *)
780
781
782
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
POTTIER Francois's avatar
POTTIER Francois committed
783
          E.query current nt fact.lookahead z (fun w ->
784
785
            assert (compatible fact.lookahead (W.first w z));
            assert (not (is_solid (Trie.current position)));
786
            add {
787
              position;
788
789
790
791
792
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
793

POTTIER Francois's avatar
POTTIER Francois committed
794
  ) (Lr1.transitions current);
795
796

  (* 2. View [fact] as a possible edge. This is possible if the path from
POTTIER Francois's avatar
POTTIER Francois committed
797
798
     [fact.source] to [current] represents a production [prod] and
     [current] is willing to reduce this production. We check that
799
     [fact.position] accepts [epsilon]. This guarantees that reducing [prod]
800
801
802
803
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
804
805
  (**)

806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
  if fact.lookahead <> any then begin
    match has_reduction current fact.lookahead with
    | Some prod when Trie.accepts prod fact.position ->
        new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
    | _ ->
        ()
  end
  else begin
    (* Every reduction must be considered. *)
    match Invariant.has_default_reduction current with
    | Some (prod, _) ->
        if Trie.accepts prod fact.position then
          (* TEMPORARY for now, avoid sending [any] into [new_edge] *)
          foreach_terminal (fun z ->
            new_edge (source fact) (Production.nt prod) (fact.word) z
          )
    | None ->
       TerminalMap.iter (fun z prods ->
824
         if non_error z then
825
826
827
828
829
           let prod = Misc.single prods in
           if Trie.accepts prod fact.position then
             new_edge (source fact) (Production.nt prod) (fact.word) z
       ) (Lr1.reductions current)
  end
830

831
let level = ref 0
832

POTTIER Francois's avatar
POTTIER Francois committed
833
834
835
let extracted, considered =
  ref 0, ref 0

POTTIER Francois's avatar
POTTIER Francois committed
836
837
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
838
  W.verbose();
839
840
  T.verbose();
  E.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
841
842
843
  Printf.fprintf stderr "Q stores %d facts.\n" (Q.cardinal q);
  Printf.fprintf stderr "%d facts extracted out of Q, of which %d considered.\n%!"
    !extracted !considered
POTTIER Francois's avatar
POTTIER Francois committed
844

845
let discover fact =
POTTIER Francois's avatar
POTTIER Francois committed
846
  incr extracted;
847
  if T.register fact then begin
848
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
849
      done_with_level();
850
851
      level := W.length fact.word;
    end;
POTTIER Francois's avatar
POTTIER Francois committed
852
    incr considered;
853
    consequences fact
854
  end
855

POTTIER Francois's avatar
POTTIER Francois committed
856
let () =
857
  Trie.verbose();
858
  Q.repeat q discover;
859
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
860
  done_with_level()
861

862
863
(* ------------------------------------------------------------------------ *)

864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

894
895
896
897
898
899
900
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
901
   module [E] above. *)
902

903
let forward () =
904

905
  let module A = Astar.Make(struct
906

907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
928
929
      )

930
    let successors (s, z) edge =
931
      assert (Terminal.real z);
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

955
  Printf.fprintf stderr "Forward search:\n%!";
956
  let seen = ref Lr1.NodeSet.empty in
957
  let _, _ = A.search (fun ((s', z), path) ->
958
959
960
961
962
963
964
965
966
967
968
969
970
971
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
972
973
974
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
975
976

let () =
977
978
  let f = forward() in
  Time.tick "Forward search";
POTTIER Francois's avatar
POTTIER Francois committed
979
980
981
982
  let stat = Gc.quick_stat() in
  Printf.fprintf stderr
    "Maximum size reached by the major heap: %dM\n"
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024);
983
  ignore f
984
985

(* TODO:
POTTIER Francois's avatar
POTTIER Francois committed
986
  can we store fewer facts when we hit a default reduction?
987
  remove CompletedNatWitness?, revert Fix
988
  collect performance data, correlated with star size and alphabet size; draw a graph
POTTIER Francois's avatar
POTTIER Francois committed
989
  count the unreachable states and see if they are numerous in practice
POTTIER Francois's avatar
POTTIER Francois committed
990
991
  optionally report several ways of reaching an error in state s
    (with different lookahead tokens) (report all of them?)
POTTIER Francois's avatar
POTTIER Francois committed
992
993
994
  warn if --list-errors is set AND the grammar uses [error]
  remove $syntaxerror?
  how do we maintain the list of error messages when the grammar evolves?
995
996
  implement a naive semi-algorithm that enumerates all input sentences,
    and evaluate how well (or how badly) it scales
997
*)
POTTIER Francois's avatar
POTTIER Francois committed
998

999
end