LRijkstra.ml 21.3 KB
Newer Older
1
open Grammar
2
module Q = LowIntegerPriorityQueue
3
module W = Terminal.Word(struct end) (* TEMPORARY wrap side effect in functor *)
4

5 6 7 8
(* Throughout, we ignore the [error] pseudo-token completely. We consider that
   it never appears on the input stream. Hence, any state whose incoming
   symbol is [error] is considered unreachable. *)

9 10
(* ------------------------------------------------------------------------ *)

11 12 13 14 15 16 17 18
(* We begin with a number of auxiliary functions that provide information
   about the LR(1) automaton. These functions could perhaps be moved to a
   separate module. We keep them here, for the moment, because they are not
   used anywhere else. *)

(* [reductions s z] is the list of reductions permitted in state [s] when the
   lookahead symbol is [z]. This is a list of zero or one elements. This does
   not take default reductions into account. *)
19

20
let reductions s z : Production.index list =
21 22 23 24 25
  try
    TerminalMap.find z (Lr1.reductions s)
  with Not_found ->
    []

26 27 28
(* [has_reduction s z] tells whether state [s] is willing to reduce some
   production (and if so, which one) when the lookahead symbol is [z]. It
   takes a possible default reduction into account. *)
29 30 31 32 33 34 35 36 37 38 39 40 41

let has_reduction s z : Production.index option =
  match Invariant.has_default_reduction s with
  | Some (prod, _) ->
      Some prod
  | None ->
      match reductions s z with
      | prod :: prods ->
          assert (prods = []);
          Some prod
      | [] ->
          None

42 43
(* [causes_an_error s z] tells whether state [s] will initiate an error on the
   lookahead symbol [z]. *)
44

45
let causes_an_error s z : bool =
46 47 48 49 50 51 52
  match Invariant.has_default_reduction s with
  | Some _ ->
      false
  | None ->
      reductions s z = [] &&
      not (SymbolMap.mem (Symbol.T z) (Lr1.transitions s))

53 54 55
(* [foreach_terminal f] applies the function [f] to every terminal symbol in
   turn, except [error]. *)

56 57 58 59 60 61
let foreach_terminal f =
  Terminal.iter (fun t ->
    if not (Terminal.equal t Terminal.error) then
      f t
  )

62 63 64 65 66
(* [foreach_terminal_not_causing_an_error s f] applies the function [f] to
   every terminal symbol [z] such that [causes_an_error s z] is false. This
   could be implemented in a naive manner using [foreach_terminal] and
   [causes_an_error]. This implementation is slightly more efficient. *)

67 68 69
let foreach_terminal_not_causing_an_error s f =
  match Invariant.has_default_reduction s with
  | Some _ ->
70
      (* There is a default reduction. No symbol causes an error. *)
71 72
      foreach_terminal f
  | None ->
73 74 75 76 77
      (* Enumerate every terminal symbol [z] for which there is a
         reduction. *)
      TerminalMap.iter (fun z _ ->
        if not (Terminal.equal z Terminal.error) then
          f z
78
      ) (Lr1.reductions s);
79 80
      (* Enumerate every terminal symbol [z] for which there is a
         transition. *)
81 82
      SymbolMap.iter (fun sym _ ->
        match sym with
83 84 85
        | Symbol.T z ->
            if not (Terminal.equal z Terminal.error) then
              f z
86 87 88 89
        | Symbol.N _ ->
            ()
      ) (Lr1.transitions s)

90 91
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
92 93 94 95 96 97 98 99 100 101 102 103
(* Suppose [s] is a state that carries an outgoing edge labeled with a
   non-terminal symbol [nt]. We are interested in finding out how this edge
   can be taken. In order to do that, we must determine how, by starting in
   [s], one can follow a path that corresponds to (the right-hand side of) a
   production [prod] associated with [nt]. There are in general several such
   productions. The paths that they determine in the automaton form a "star".
   We represent the star rooted at [s] as a trie. For every state [s], the
   star rooted at [s] is constructed in advance, before the algorithm runs.
   While the algorithm runs, a point in the trie (that is, a sub-trie) tells
   us where we come form, where we are, and which production(s) we are hoping
   to reduce in the future. *)

104 105 106
module Trie : sig

  type trie
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122

  (* [star s] creates a (new) trie whose source is [s], populated with its
     branches. (There is one branch for every production [prod] associated
     with every non-terminal symbol [nt] for which [s] carries an outgoing
     edge.) If the star turns out to be trivial (i.e., without any branches)
     then [None] is returned. *)
  val star: Lr1.node -> trie option

  (* Every (sub-)trie has a unique identity. (One can think of it as its
     address.) [compare] compares the identity of two tries. This can be
     used, e.g., to set up a map whose keys are tries. *)
  val compare: trie -> trie -> int

  (* [source t] returns the source state of the (sub-)trie [t]. This is
     the root of the star of which [t] is a sub-trie. In other words, this
     tells us "where we come from". *)
123
  val source: trie -> Lr1.node
124 125 126 127
    
  (* [target t] returns the current state of the (sub-)trie [t]. This is
     the root of the sub-trie [t]. In other words, this tells us "where
     we are". *)
128 129
  val target: trie -> Lr1.node

130 131 132 133 134
  (* [accepts prod t] tells whether the current state of the trie [t] is
     the end of a branch associated with production [prod]. If so, this
     means that we have successfully followed a path that corresponds to
     the right-hand side of production [prod]. *)
  val accepts: Production.index -> trie -> bool
135

136 137 138 139 140 141 142 143
  (* [step sym t] is the immediate sub-trie of [t] along the symbol [sym].
     This function raises [Not_found] if [t] has no child labeled [sym]. *)
  val step: Symbol.t -> trie -> trie

  (* [verbose()] outputs debugging & performance information. *)
  val verbose: unit -> unit

end = struct
144

145 146
  type trie = {
    identity: int;
147 148
    source: Lr1.node;
    target: Lr1.node;
149
    productions: Production.index list;
150
    transitions: trie SymbolMap.t;
151
  }
152

153 154
  let c = ref 0

155
  let mktrie source target productions transitions =
156
    let identity = Misc.postincrement c in
157
    { identity; source; target; productions; transitions }
158

159 160
  let accepts prod t =
    List.mem prod t.productions
161

162 163
  (* TEMPORARY could insert this branch only if viable -- leads to 12600 instead of 12900 in ocaml.mly --lalr *)

164 165
  (* [insert] logically consumes its argument [t], which should no
     longer be used. *)
166
  let rec insert target w prod t =
167 168
    match w with
    | [] ->
169 170 171 172
        (* We consume (update) the trie [t], so there is no need to allocate a
           new stamp. (Of course we could allocate a new stamp, but I prefer
           to be precise.) *)
        { t with productions = prod :: t.productions }
173
    | a :: w ->
174 175 176 177 178
        (* Check if there is a transition labeled [a] out of [target]. If
           there is, we add a child to the trie [t]. If there isn't, then it
           must have been removed by conflict resolution. (Indeed, it must be
           present in a canonical automaton.) The trie remains unchanged in
           this case. *)
179 180
        match SymbolMap.find a (Lr1.transitions target) with
        | successor ->
181 182 183 184 185 186 187 188 189 190 191
            (* Find our child at [a], or create it. *)
            let t' =
              try
                SymbolMap.find a t.transitions
              with Not_found ->
                mktrie t.source successor [] SymbolMap.empty
            in
            (* Update the child [t']. *)
            let t' = insert successor w prod t' in
            (* Update [t]. Again, no need to allocate a new stamp. *)
            { t with transitions = SymbolMap.add a t' t.transitions }
192 193 194
        | exception Not_found ->
            t

195 196 197 198 199
  (* [insert prod t] inserts a new branch, corresponding to production
     [prod], into the trie [t]. This function consumes its argument,
     which should no longer be used afterwards. *)
  let insert prod t =
    let w = Array.to_list (Production.rhs prod) in
200
    insert t.source w prod t
201

202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
  (* [fresh s] creates a new empty trie whose source is [s]. *)
  let fresh source =
    mktrie source source [] SymbolMap.empty

  let star s =
    SymbolMap.fold (fun sym _ accu ->
      match sym with
      | Symbol.T _ ->
          accu
      | Symbol.N nt ->
          Production.foldnt nt accu insert
    ) (Lr1.transitions s) (fresh s)

  let nontrivial t =
    not (t.productions = [] && SymbolMap.is_empty t.transitions)

  let star s =
    let t = star s in
    if nontrivial t then
      Some t
    else
      None

225 226 227 228 229 230
  let source t =
    t.source

  let target t =
    t.target

231
  let step a t =
232
    SymbolMap.find a t.transitions (* careful: may raise [Not_found] *)
233

234 235
  let compare t1 t2 =
    Pervasives.compare (t1.identity : int) t2.identity
236

237
  let verbose () =
238
    Printf.fprintf stderr "Cumulated star size: %d\n%!" !c
239

240 241
end

242
type fact = {
243
  future: Trie.trie;
244
  word: W.word;
245
  lookahead: Terminal.t
246 247
}

248
let source fact =
249
  Trie.source fact.future
250 251

let target fact =
252
  Trie.target fact.future
253

254 255 256 257
let q =
  Q.create()

let add fact =
258 259
  (* assert (not (causes_an_error (target fact) fact.lookahead)); *)

260 261
  (* The length of the word serves as the priority of this fact. *)
  Q.add q fact (W.length fact.word)
POTTIER Francois's avatar
POTTIER Francois committed
262 263
    (* In principle, there is no need to insert the fact into the queue
       if [T] already stores a comparable fact. *)
264 265

let init s =
266 267 268 269 270 271 272 273 274 275 276
  match Trie.star s with
  | Some trie ->
      foreach_terminal_not_causing_an_error s (fun z ->
        add {
          future = trie;
          word = W.epsilon;
          lookahead = z
        }
      )
  | None ->
      ()
277 278

module T : sig
279 280

  (* [register fact] registers the fact [fact]. It returns [true] if this fact
281
     is new, i.e., no fact concerning the same quintuple of [source], [future],
282 283 284 285 286
     [target], [a], and [z] was previously known. *)
  val register: fact -> bool

  (* [query target z f] enumerates all known facts whose target state is [target]
     and whose lookahead assumption is [z]. *)
287
  val query: Lr1.node -> Terminal.t -> (fact -> unit) -> unit
288

289
  val verbose: unit -> unit
290

291
end = struct
292

293 294 295 296 297 298 299 300 301 302 303
  (* This module implements a set of facts. Two facts are considered equal
     (for the purposes of this set) if they have the same [future], [a], and
     [z] fields. The [word] is not considered. Indeed, we are not interested
     in keeping track of several words that produce the same effect. Only the
     shortest such word is of interest. *)

  (* We need to query the set of facts in two ways. In [register], we need to
     test whether a fact is in the set. In [query], we need to find all facts
     that match a pair [target, z]. For this reason, we use a two-level table.
     The first level is a matrix indexed by [target] and [z]. At the second
     level, we find sets of facts. *)
304 305 306
(**)

  module M =
307
    MySet.Make(struct
308 309 310
      type t = fact
      let compare fact1 fact2 =
        let c = Trie.compare fact1.future fact2.future in
311
        if c <> 0 then c else
312 313
        let a1 = W.first fact1.word fact1.lookahead
        and a2 = W.first fact2.word fact2.lookahead in
314 315
        Terminal.compare a1 a2
    end)
316

317 318 319 320 321
  let table = (* a pretty large table... *)
    Array.make (Lr1.n * Terminal.n) M.empty

  let index target z =
    Terminal.n * (Lr1.number target) + Terminal.t2i z
322

323 324
  let count = ref 0

325
  let register fact =
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
    let target = target fact in
    let z = fact.lookahead in
    let i = index target z in
    let m = table.(i) in
    (* We crucially rely on the fact that [M.add] guarantees not to
       change the set if an ``equal'' fact already exists. Thus, a
       later, longer path is ignored in favor of an earlier, shorter
       path. *)
    let m' = M.add fact m in
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end

  let query target z f =
    let i = index target z in
    let m = table.(i) in
    M.iter f m
345

346
  let verbose () =
347 348
    Printf.fprintf stderr "T stores %d facts.\n%!" !count

349 350 351 352 353 354 355 356 357
end

(* The module [E] is in charge of recording the non-terminal edges that we have
   discovered, or more precisely, the conditions under which these edges can be
   taken. *)

module E : sig

  (* [register s nt w z] records that, in state [s], the outgoing edge labeled
358 359 360
     [nt] can be taken by consuming the word [w], if the next symbol is [z].
     It returns [true] if this information is new. *)
  val register: Lr1.node -> Nonterminal.t -> W.word -> Terminal.t -> bool
361 362 363 364

  (* [query s nt a z] answers whether, in state [s], the outgoing edge labeled
     [nt] can be taken by consuming some word [w], under the assumption that
     the next symbol is [z], and under the constraint that the first symbol of
365 366
     [w.z] is [a]. *)
  val query: Lr1.node -> Nonterminal.t -> Terminal.t -> Terminal.t -> (W.word -> unit) -> unit
367

368
  val verbose: unit -> unit
369

370 371 372 373 374
end = struct

  (* For now, we implement a mapping of [s, nt, a, z] to [w]. *)

  module M =
375 376 377
    MySet.Make(struct
      type t = Nonterminal.t * Terminal.t * Terminal.t * W.word
      let compare (nt1, a1, z1, _w1) (nt2, a2, z2, _w2) =
378 379 380 381 382 383 384
        let c = Nonterminal.compare nt1 nt2 in
        if c <> 0 then c else
        let c = Terminal.compare a1 a2 in
        if c <> 0 then c else
        Terminal.compare z1 z2
    end)

385 386
  let table =
    Array.make Lr1.n M.empty
387

388 389
  let count = ref 0

390
  let register s nt w z =
391 392
    let i = Lr1.number s in
    let m = table.(i) in
393
    let a = W.first w z in
394
    let m' = M.add (nt, a, z, w) m in
395 396 397 398 399
    m != m' && begin
      incr count;
      table.(i) <- m';
      true
    end
400

401
  let query s nt a z f =
402 403
    let i = Lr1.number s in
    let m = table.(i) in
404 405 406
    let dummy = W.epsilon in
    match M.find (nt, a, z, dummy) m with
    | (_, _, _, w) -> f w
407
    | exception Not_found -> ()
408

409
  let verbose () =
410 411
    Printf.fprintf stderr "E stores %d facts.\n%!" !count

412 413
end

414
let new_edge s nt w z =
POTTIER Francois's avatar
POTTIER Francois committed
415 416 417 418
  (*
  Printf.fprintf stderr "Considering reduction on %s in state %d\n"
    (Terminal.print z) (Lr1.number s);
  *)
419
  if E.register s nt w z then
420
    let sym = Symbol.N nt in
421
    T.query s (W.first w z) (fun fact ->
422
      assert (Terminal.equal fact.lookahead (W.first w z));
423
      match Trie.step sym fact.future with
424
      | future ->
425
          if not (causes_an_error (Trie.target future) z) then
426 427 428 429 430 431 432
            add {
              future;
              word = W.append fact.word w;
              lookahead = z
            }
      | exception Not_found ->
          ()
433
    )
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453

(* [consequences fact] is invoked when we discover a new fact (i.e., one that
   was not previously known). It studies the consequences of this fact. These
   consequences are of two kinds:

   - As in Dijkstra's algorithm, the new fact can be viewed as a newly
   discovered vertex. We study its (currently known) outgoing edges,
   and enqueue new facts in the priority queue.

   - Sometimes, a fact can also be viewed as a newly discovered edge.
   This is the case when the word from [fact.source] to [fact.target]
   represents a production of the grammar and [fact.target] is willing
   to reduce this production. We record the existence of this edge,
   and re-inspect any previously discovered vertices which are
   interested in this outgoing edge.
*)
(**)

let consequences fact =

454 455 456
  let target = target fact in

  (* 1. View [fact] as a vertex. Examine the transitions out of [target]. *)
457
  
458
  SymbolMap.iter (fun sym s' ->
459
    match Trie.step sym fact.future, sym with
460 461 462
    | exception Not_found -> ()
    | future, Symbol.T t ->

463
        (* 1a. There is a transition labeled [t] out of [target]. If
464 465 466 467 468 469 470 471
           the lookahead assumption [fact.lookahead] is compatible with [t],
           then we derive a new fact, where one more edge has been taken. We
           enqueue this new fact for later examination. *)
        (**)

        if Terminal.equal fact.lookahead t then
          let word = W.append fact.word (W.singleton t) in
          (* assert (Lr1.Node.compare future.Trie.target s' = 0); *)
472
          foreach_terminal_not_causing_an_error s' (fun z ->
473
            add { future; word; lookahead = z }
474
          )
475 476 477

    | future, Symbol.N nt ->

478
        (* 1b. There is a transition labeled [nt] out of [target]. We
479 480 481 482 483 484 485 486 487
           need to know how this nonterminal edge can be taken. We query for a
           word [w] that allows us to take this edge. The answer depends on
           the terminal symbol [z] that comes *after* this word: we try all
           such symbols. Furthermore, we need the first symbol of [w.z] to
           satisfy the lookahead assumption [fact.lookahead], so the answer
           also depends on this assumption. *)
        (**)

        foreach_terminal_not_causing_an_error s' (fun z ->
488
          E.query target nt fact.lookahead z (fun w ->
489 490 491 492 493 494 495 496
            assert (Terminal.equal fact.lookahead (W.first w z));
            add {
              future;
              word = W.append fact.word w;
              lookahead = z
            }
          )
        )
497

498
  ) (Lr1.transitions target);
499 500

  (* 2. View [fact] as a possible edge. This is possible if the path from
501 502
     [fact.source] to [target] represents a production [prod] and
     [target] is willing to reduce this production. We check that
503 504 505 506 507
     [fact.future] accepts [epsilon]. This guarantees that reducing [prod]
     takes us all the way back to [fact.source]. Thus, this production gives
     rise to an edge labeled [nt] -- the left-hand side of [prod] -- out of
     [fact.source]. This edge is subject to the lookahead assumption
     [fact.lookahead], so we record that. *)
508 509
  (**)

510
  match has_reduction target fact.lookahead with
511
  | Some prod when Trie.accepts prod fact.future ->
512
      new_edge (source fact) (Production.nt prod) fact.word fact.lookahead
513 514 515
  | _ ->
      ()

516
let level = ref 0
517

POTTIER Francois's avatar
POTTIER Francois committed
518 519
let done_with_level () =
  Printf.fprintf stderr "Done with level %d.\n" !level;
520 521
  T.verbose();
  E.verbose();
POTTIER Francois's avatar
POTTIER Francois committed
522 523
  Printf.fprintf stderr "Q stores %d facts.\n%!" (Q.cardinal q)

524
let discover fact =
525
  if T.register fact then begin
526
    if W.length fact.word > ! level then begin
POTTIER Francois's avatar
POTTIER Francois committed
527
      done_with_level();
528 529
      level := W.length fact.word;
    end;
530
    consequences fact
531
  end
532

POTTIER Francois's avatar
POTTIER Francois committed
533
let () =
534
  Lr1.iter init;
535
  Trie.verbose();
536
  Q.repeat q discover;
537
  Time.tick "Running LRijkstra";
POTTIER Francois's avatar
POTTIER Francois committed
538
  done_with_level()
539

540 541
(* ------------------------------------------------------------------------ *)

542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571
(* The following code validates the fact that an error can be triggered in
   state [s'] by beginning in the initial state [s] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes. *)

let fail msg =
  Printf.fprintf stderr "coverage: internal error: %s.\n%!" msg;
  false

open ReferenceInterpreter

let validate s s' w : bool =
  match
    ReferenceInterpreter.check_error_path (Lr1.nt_of_entry s) (W.elements w)
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
  | OK state ->
      Lr1.Node.compare state s' = 0 ||
      fail (
        Printf.sprintf "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
      )

(* ------------------------------------------------------------------------ *)

572 573 574 575 576 577 578
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
579
   module [E] above. *)
580

581
let forward () =
582

583
  let module A = Astar.Make(struct
584

585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605
    (* A vertex is a pair [s, z].
       [z] cannot be the [error] token. *)
    type node =
        Lr1.node * Terminal.t

    let equal (s'1, z1) (s'2, z2) =
      Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0

    let hash (s, z) =
      Hashtbl.hash (Lr1.number s, z)

    (* An edge is labeled with a word. *)
    type label =
      W.word

    (* Forward search from every [s, z], where [s] is an initial state. *)
    let sources f =
      foreach_terminal (fun z ->
        ProductionMap.iter (fun _ s ->
          f (s, z)
        ) Lr1.entry
606 607
      )

608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632
    let successors (s, z) edge =
      assert (not (Terminal.equal z Terminal.error));
      SymbolMap.iter (fun sym s' ->
        match sym with
        | Symbol.T t ->
            if Terminal.equal z t then
              let w = W.singleton t in
              foreach_terminal (fun z ->
                edge w 1 (s', z)
              )
        | Symbol.N nt ->
           foreach_terminal (fun z' ->
             E.query s nt z z' (fun w ->
               edge w (W.length w) (s', z')
             )
           )
      ) (Lr1.transitions s)

    let estimate _ =
      0

  end) in

  (* Search forward. *)

633
  Printf.fprintf stderr "Forward search:\n%!";
634
  let seen = ref Lr1.NodeSet.empty in
635
  let _, _ = A.search (fun ((s', z), path) ->
636 637 638 639 640 641 642 643 644 645 646 647 648 649
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !seen) then begin
      seen := Lr1.NodeSet.add s' !seen;
      (* An error can be triggered in state [s'] by beginning in the initial
         state [s] and reading the sequence of terminal symbols [w]. *)
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
      Printf.fprintf stderr
        "An error can be reached from state %d to state %d:\n%!"
        (Lr1.number s)
        (Lr1.number s');
      Printf.fprintf stderr "%s\n%!" (W.print w);
      assert (validate s s' w)
    end
  ) in
650 651 652
  Printf.fprintf stderr "Reachable (forward): %d states\n%!"
    (Lr1.NodeSet.cardinal !seen);
  !seen
653

654
(* TEMPORARY the code in this module should run only if --coverage is set *)
655 656

let () =
657 658
  let f = forward() in
  Time.tick "Forward search";
659
  ignore f
660 661 662 663 664 665

(* TODO:
  subject to --coverage
  write to .coverage file
  collect performance data, correlated with star size and alphabet size; draw a graph
*)