LRijkstra.ml 19.6 KB
Newer Older
1
open Grammar
2
module Q = LowIntegerPriorityQueue
3
module W = Terminal.Word(struct end) (* TEMPORARY wrap side effect in functor *)
4

5 6 7 8
(* Throughout, we ignore the [error] pseudo-token completely. We consider that
   it never appears on the input stream. Hence, any state whose incoming
   symbol is [error] is considered unreachable. *)

9 10
(* ------------------------------------------------------------------------ *)

11 12 13 14 15 16 17 18
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

(* [reductions s z] is the list of reductions permitted in state [s] when the
   lookahead symbol is [z]. This is a list of zero or one elements. This does
   not take default reductions into account. *)
19

20
let reductions s z : Production.index list =
21 22 23 24 25
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

26 27 28
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
29 30 31 32 33 34 35 36 37 38 39 40 41

let has_reduction s z : Production.index option =
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
      match reductions s z with
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

42 43
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
44

45
let causes_an_error s z : bool =
46 47 48 49 50 51 52
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
      reductions s z = [] &&
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

53 54 55
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
   turn, except [error]. *)

56 57 58 59 60 61
let foreach_terminal f =
  Terminal.iter (fun t ->
    if not (Terminal.equal t Terminal.error) then
      f t
  )

62 63 64 65 66
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

67 68 69
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
70
      (* There is a default reduction. No symbol causes an error. *)
71 72
      foreach_terminal f
  | None ->
73 74 75 76 77
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
        if not (Terminal.equal z Terminal.error) then
          f z
78
      ) (Lr1.reductions s);
79 80
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
81 82
      SymbolMap.iter (fun sym _ ->
        match sym with
83 84 85
        | Symbol.T z ->
            if not (Terminal.equal z Terminal.error) then
              f z
86 87 88 89
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

90 91
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
92 93 94 95 96 97 98 99 100 101 102 103
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

104 105 106 107 108 109 110 111 112 113 114 115 116 117
module Trie : sig

  type trie
  val fresh: Lr1.node -> trie
  val is_empty: trie -> bool
  val accepts: Production.index -> trie -> bool
  val insert: Symbol.t list -> Production.index -> trie -> trie
  val source: trie -> Lr1.node
  val target: trie -> Lr1.node
  val derivative: Symbol.t -> trie -> trie
  val compare: trie -> trie -> int
  val stats: unit -> unit

end = struct
118 119 120

  let c = ref 0

121 122
  type trie = {
    identity: int;
123 124
    source: Lr1.node;
    target: Lr1.node;
125
    productions: Production.index list;
126
    transitions: trie SymbolMap.t;
127
  }
128

129
  let mktrie source target productions transitions =
130
    let identity = Misc.postincrement c in
131
    { identity; source; target; productions; transitions }
132

133
  let fresh source =
134
    mktrie source source [] SymbolMap.empty
135

136 137
  let is_empty t =
    t.productions = [] && SymbolMap.is_empty t.transitions
138

139 140
  let accepts prod t =
    List.mem prod t.productions
141

142 143
  (* [insert] logically consumes its argument [t], which should no
     longer be used. *)
144
  let rec insert target w prod t =
145 146
    match w with
    | [] ->
147 148 149 150
        (* We consume (update) the trie [t], so there is no need to allocate a
           new stamp. (Of course we could allocate a new stamp, but I prefer
           to be precise.) *)
        { t with productions = prod :: t.productions }
151
    | a :: w ->
152 153 154 155 156
        (* Check if there is a transition labeled [a] out of [target]. If
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
           present in a canonical automaton.) The trie remains unchanged in
           this case. *)
157 158
        match SymbolMap.find a (Lr1.transitions target) with
        | successor ->
159 160 161 162 163 164 165 166 167 168 169
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
            let t' = insert successor w prod t' in
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
170 171 172 173 174
        | exception Not_found ->
            t

  let insert w prod t =
    insert t.source w prod t
175

176 177 178 179 180 181
  let source t =
    t.source

  let target t =
    t.target

182
  let derivative a t =
183
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
184

185 186
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
187

188 189
  let stats () =
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
190

191 192
end

193
type fact = {
194
  future: Trie.trie;
195
  word: W.word;
196
  lookahead: Terminal.t
197 198
}

199
let source fact =
200
  Trie.source fact.future
201 202

let target fact =
203
  Trie.target fact.future
204

205 206 207 208 209 210 211 212
let star s : Trie.trie =
  SymbolMap.fold (fun sym _ accu ->
    match sym with
    | Symbol.T _ ->
        accu
    | Symbol.N nt ->
        Production.foldnt nt accu (fun prod accu ->
          let w = Array.to_list (Production.rhs prod) in
213
          (* could insert this branch only if viable -- leads to 12600 instead of 12900 in ocaml.mly --lalr *)
214
          Trie.insert w prod accu
215
        )
216
  ) (Lr1.transitions s) (Trie.fresh s)
217

218 219 220 221
let q =
  Q.create()

let add fact =
222 223
  (* assert (not (causes_an_error (target fact) fact.lookahead)); *)

224 225
  (* The length of the word serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)
POTTIER Francois's avatar
POTTIER Francois committed
226 227
    (* In principle, there is no need to insert the fact into the queue
       if [T] already stores a comparable fact. *)
228 229

let init s =
230
  let trie = star s in
POTTIER Francois's avatar
POTTIER Francois committed
231
  if not (Trie.is_empty trie) then
232 233 234 235 236 237
    foreach_terminal_not_causing_an_error s (fun z ->
      add {
        future = trie;
        word = W.epsilon;
        lookahead = z
      }
238
    )
239 240

module T : sig
241 242

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
243
     is new, i.e., no fact concerning the same quintuple of [source], [future],
244 245 246 247 248
     [target], [a], and [z] was previously known. *)
  val register: fact -> bool

  (* [query target z f] enumerates all known facts whose target state is [target]
     and whose lookahead assumption is [z]. *)
249
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
250

251 252
  val stats: unit -> unit

253
end = struct
254

255 256 257 258 259 260 261 262 263 264 265
  (* This module implements a set of facts. Two facts are considered equal
     (for the purposes of this set) if they have the same [future], [a], and
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
     that match a pair [target, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [target] and [z]. At the second
     level, we find sets of facts. *)
266 267 268
(**)

  module M =
269
    MySet.Make(struct
270 271 272
      type t = fact
      let compare fact1 fact2 =
        let c = Trie.compare fact1.future fact2.future in
273
        if c <> 0 then c else
274 275
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
276 277
        Terminal.compare a1 a2
    end)
278

279 280 281 282 283
  let table = (* a pretty large table... *)
    Array.make (Lr1.n * Terminal.n) M.empty

  let index target z =
    Terminal.n * (Lr1.number target) + Terminal.t2i z
284

285 286
  let count = ref 0

287
  let register fact =
288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
    let target = target fact in
    let z = fact.lookahead in
    let i = index target z in
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

  let query target z f =
    let i = index target z in
    let m = table.(i) in
    M.iter f m
307

308 309 310
  let stats () =
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

311 312 313 314 315 316 317 318 319
end

(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
320 321 322
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
323 324 325 326

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
327 328
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
329

330 331
  val stats: unit -> unit

332 333 334 335 336
end = struct

  (* For now, we implement a mapping of [s, nt, a, z] to [w]. *)

  module M =
337 338 339
    MySet.Make(struct
      type t = Nonterminal.t * Terminal.t * Terminal.t * W.word
      let compare (nt1, a1, z1, _w1) (nt2, a2, z2, _w2) =
340 341 342 343 344 345 346
        let c = Nonterminal.compare nt1 nt2 in
        if c <> 0 then c else
        let c = Terminal.compare a1 a2 in
        if c <> 0 then c else
        Terminal.compare z1 z2
    end)

347 348
  let table =
    Array.make Lr1.n M.empty
349

350 351
  let count = ref 0

352
  let register s nt w z =
353 354
    let i = Lr1.number s in
    let m = table.(i) in
355
    let a = W.first w z in
356
    let m' = M.add (nt, a, z, w) m in
357 358 359 360 361
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end
362

363
  let query s nt a z f =
364 365
    let i = Lr1.number s in
    let m = table.(i) in
366 367 368
    let dummy = W.epsilon in
    match M.find (nt, a, z, dummy) m with
    | (_, _, _, w) -> f w
369
    | exception Not_found -> ()
370

371 372 373
  let stats () =
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

374 375
end

376
let new_edge s nt w z =
POTTIER Francois's avatar
POTTIER Francois committed
377 378 379 380
  (*
  Printf.fprintf stderr "Considering reduction on %s in state %d\n"
    (Terminal.print z) (Lr1.number s);
  *)
381
  if E.register s nt w z then
382
    let sym = Symbol.N nt in
383
    T.query s (W.first w z) (fun fact ->
384 385 386
      assert (Terminal.equal fact.lookahead (W.first w z));
      match Trie.derivative sym fact.future with
      | future ->
387
          if not (causes_an_error (Trie.target future) z) then
388 389 390 391 392 393 394
            add {
              future;
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
395
    )
396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
   This is the case when the word from [fact.source] to [fact.target]
   represents a production of the grammar and [fact.target] is willing
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

416 417 418
  let target = target fact in

  (* 1. View [fact] as a vertex. Examine the transitions out of [target]. *)
419
  
420
  SymbolMap.iter (fun sym s' ->
421 422 423 424
    match Trie.derivative sym fact.future, sym with
    | exception Not_found -> ()
    | future, Symbol.T t ->

425
        (* 1a. There is a transition labeled [t] out of [target]. If
426 427 428 429 430 431 432 433
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

        if Terminal.equal fact.lookahead t then
          let word = W.append fact.word (W.singleton t) in
          (* assert (Lr1.Node.compare future.Trie.target s' = 0); *)
434
          foreach_terminal_not_causing_an_error s' (fun z ->
435
            add { future; word; lookahead = z }
436
          )
437 438 439

    | future, Symbol.N nt ->

440
        (* 1b. There is a transition labeled [nt] out of [target]. We
441 442 443 444 445 446 447 448 449
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
450
          E.query target nt fact.lookahead z (fun w ->
451 452 453 454 455 456 457 458
            assert (Terminal.equal fact.lookahead (W.first w z));
            add {
              future;
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
459

460
  ) (Lr1.transitions target);
461 462

  (* 2. View [fact] as a possible edge. This is possible if the path from
463 464
     [fact.source] to [target] represents a production [prod] and
     [target] is willing to reduce this production. We check that
465 466 467 468 469
     [fact.future] accepts [epsilon]. This guarantees that reducing [prod]
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
470 471
  (**)

472
  match has_reduction target fact.lookahead with
473
  | Some prod when Trie.accepts prod fact.future ->
474
      new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
475 476 477
  | _ ->
      ()

478
let level = ref 0
479

POTTIER Francois's avatar
POTTIER Francois committed
480 481 482 483 484 485
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
  T.stats();
  E.stats();
  Printf.fprintf stderr "Q stores %d facts.\n%!" (Q.cardinal q)

486
let discover fact =
487
  if T.register fact then begin
488
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
489
      done_with_level();
490 491
      level := W.length fact.word;
    end;
492
    consequences fact
493
  end
494

POTTIER Francois's avatar
POTTIER Francois committed
495
let () =
496
  Lr1.iter init;
497
  Trie.stats();
498
  Q.repeat q discover;
499
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
500
  done_with_level()
501

502 503
(* ------------------------------------------------------------------------ *)

504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

534 535 536 537 538 539 540
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
541
   module [E] above. *)
542

543
let forward () =
544

545
  let module A = Astar.Make(struct
546

547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
568 569
      )

570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594
    let successors (s, z) edge =
      assert (not (Terminal.equal z Terminal.error));
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

595
  Printf.fprintf stderr "Forward search:\n%!";
596
  let seen = ref Lr1.NodeSet.empty in
597
  let _, _ = A.search (fun ((s', z), path) ->
598 599 600 601 602 603 604 605 606 607 608 609 610 611
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
612 613 614
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
615

616
(* TEMPORARY the code in this module should run only if --coverage is set *)
617 618

let () =
619 620
  let f = forward() in
  Time.tick "Forward search";
621
  ignore f