LRijkstra.ml 11.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

14 15 16 17
(* This module implements [--list-errors]. Its purpose is to find, for each
   pair of a state [s] and a terminal symbol [z] such that looking at [z] in
   state [s] causes an error, a minimal path (starting in some initial state)
   that actually triggers this error. *)
18 19 20 21 22 23 24 25 26 27

(* This is potentially useful for grammar designers who wish to better
   understand the properties of their grammar, or who wish to produce a
   list of all possible syntax errors (or, at least, one syntax error in
   each automaton state where an error may occur). *)

(* The problem seems rather tricky. One might think that it suffices to
   compute shortest paths in the automaton, and to use [Analysis.minimal] to
   replace each non-terminal symbol in a path with a minimal word that this
   symbol generates. One can indeed do so, but this yields only a lower bound
POTTIER Francois's avatar
POTTIER Francois committed
28 29 30 31
   on the actual shortest path to the error at [s, z]. Indeed, several
   difficulties arise, including the fact that reductions are subject to a
   lookahead hypothesis; the fact that some states have a default reduction,
   hence will never trigger an error; the fact that conflict resolution
32
x   removes some (shift or reduce) actions, hence may suppress the shortest
POTTIER Francois's avatar
POTTIER Francois committed
33
   path. *)
34

35
(* NOTE: THIS FILE IS COMPILED WITH -noassert BY DEFAULT. If you would like
36 37
   the assertions to be tested at runtime, change that in the file _tags.
   The performance impact of the assertions is about 10%. *)
38

39 40 41
(* ------------------------------------------------------------------------ *)

(* To delay the side effects performed by this module, we wrap everything in
42
   in a big functor. The functor also serves to pass verbosity parameters. *)
43

44
module Run (X : sig
45

46 47
  (* If [verbose] is set, produce various messages on [stderr]. *)
  val verbose: bool
48

49 50 51
  (* If [statistics] is defined, it is interpreted as the name of
     a file to which one line of statistics is appended. *)
  val statistics: string option
52

53
end) = struct
54

55
open Grammar
56
open Default
57

58 59 60 61 62 63 64 65 66 67 68
(* ------------------------------------------------------------------------ *)

(* Record our start time. *)

let now () =
  match X.statistics with
  | Some _ ->
      Unix.((times()).tms_utime)
  | None ->
      0.0

69
let start =
70
  now()
71

72 73
(* ------------------------------------------------------------------------ *)

74 75 76
(* Run the core reachability analysis, which finds out exactly under
   what conditions each nonterminal transition in the automaton can be
   taken. *)
77

78 79
module Core =
  LRijkstraCore.Run(X)
80

81 82
module W =
  Core.W
83

84 85
(* ------------------------------------------------------------------------ *)

86
(* The following code validates the fact that an error can be triggered in
87 88 89 90
   state [s'] by beginning at the start symbol [nt] and reading the
   sequence of terminal symbols [w]. We use this for debugging purposes.
   Furthermore, this gives us a list of spurious reductions, which we use
   to produce a comment. *)
91 92

let fail msg =
93
  Printf.eprintf "LRijkstra: internal error: %s.\n%!" msg;
94
  exit 1
95

96 97 98
let fail format =
  Printf.ksprintf fail format

99
let validate nt s' w : ReferenceInterpreter.target =
100
  let open ReferenceInterpreter in
101
  match
102
    check_error_path false nt (W.elements w)
103 104 105 106 107 108 109
  with
  | OInputReadPastEnd ->
      fail "input was read past its end"
  | OInputNotFullyConsumed ->
      fail "input was not fully consumed"
  | OUnexpectedAccept ->
      fail "input was unexpectedly accepted"
110 111
  | OK ((state, _) as target) ->
      if Lr1.Node.compare state s' <> 0 then
112 113 114
        fail "error occurred in state %d instead of %d"
          (Lr1.number state)
          (Lr1.number s')
115 116
      else
        target
117 118 119

(* ------------------------------------------------------------------------ *)

120 121 122 123 124 125 126
(* We now wish to determine, given a state [s'] and a terminal symbol [z], a
   minimal path that takes us from some entry state to state [s'] with [z] as
   the next (unconsumed) symbol. *)

(* This can be formulated as a search for a shortest path in a graph. The
   graph is not just the automaton, though. It is a (much) larger graph whose
   vertices are pairs [s, z] and whose edges are obtained by querying the
127 128 129
   module [E] above. For this purpose, we use Dijkstra's algorithm,
   unmodified. Experiments show that the running time of this phase is
   typically 10x shorter than the running time of the main loop above. *)
130

131
module A = Astar.Make(struct
132

133 134 135
  (* A vertex is a pair [s, z], where [z] is a real terminal symbol. *)
  type node =
      Lr1.node * Terminal.t
136

137 138
  let equal (s'1, z1) (s'2, z2) =
    Lr1.Node.compare s'1 s'2 = 0 && Terminal.compare z1 z2 = 0
139

140 141
  let hash (s, z) =
    Hashtbl.hash (Lr1.number s, z)
142

143 144 145
  (* An edge is labeled with a word. *)
  type label =
    W.word
146

147 148
  (* We search forward from every [s, z], where [s] is an initial state. *)
  let sources f =
149
    Terminal.iter_real (fun z ->
150 151 152 153
      ProductionMap.iter (fun _ s ->
        f (s, z)
      ) Lr1.entry
    )
154

155 156 157 158 159 160 161 162 163 164 165 166 167
  (* The successors of [s, z] are defined as follows. *)
  let successors (s, z) edge =
    assert (Terminal.real z);
    (* For every transition out of [s], labeled [sym], leading to [s']... *)
    Lr1.transitions s |> SymbolMap.iter (fun sym s' ->
      match sym with
      | Symbol.T t ->
          if Terminal.equal z t then
            (* If [sym] is the terminal symbol [z], then this transition
               matches our lookahead assumption, so we can take it. For
               every [z'], we have an edge to [s', z'], labeled with the
               singleton word [z]. *)
            let w = W.singleton z in
168
            Terminal.iter_real (fun z' ->
169 170 171 172 173 174 175 176 177 178
              edge w 1 (s', z')
            )
      | Symbol.N nt ->
          (* If [sym] is a nonterminal symbol [nt], then we query [E]
             in order to find out which (minimal) words [w] allow us
             to take this transition. We must again try every [z'],
             and must respect the constraint that the first symbol
             of the word [w.z'] is [z]. For every [z'] and [w] that
             fulfill these requirements, we have an edge to [s', z'],
             labeled with the word [w]. *)
179
          Core.query s nt z (fun w z' ->
180
            edge w (W.length w) (s', z')
POTTIER Francois's avatar
POTTIER Francois committed
181
          )
182
    )
183

184 185 186 187 188 189 190 191 192
  (* Algorithm A*, used with a zero estimate, is Dijkstra's algorithm.
     We have experimented with a non-zero estimate, but the performance
     increase was minimal. *)
  let estimate _ =
    0

end)

(* ------------------------------------------------------------------------ *)
193

194 195
(* [explored] counts how many graph nodes we have discovered during the
   search. *)
196

197 198
let explored =
  ref 0
199

200 201 202 203 204 205 206
(* We wish to store a set of triples [nt, w, (s', spurious)], meaning that an
   error can be triggered in state [s'] by beginning in the initial state that
   corresponds to [nt] and by reading the sequence of terminal symbols [w]. We
   wish to store at most one such triple for every state [s'], so we organize
   the data as a set [domain] of states [s'] and a list [data] of triples [nt,
   w, (s', spurious)]. The list [spurious] documents the spurious reductions
   that are performed by the parser at the end. *)
207 208 209 210

(* We could print this data as we go, which would naturally result in sorting
   the output by increasing word sizes. However, it seems preferable to sort
   the sentences lexicographically, so that similar sentences end up close to
POTTIER Francois's avatar
POTTIER Francois committed
211 212
   one another. (We could also sort them by state number. The result would be
   roughly similar.) This is why we store a list of triples and sort it before
213 214 215 216
   printing it out. *)

let domain =
  ref Lr1.NodeSet.empty
217

218
let data : (Nonterminal.t * W.word * ReferenceInterpreter.target) list ref =
219
  ref []
220

221 222
(* The set [reachable] stores every reachable state (regardless of whether an
   error can be triggered in that state). *)
223 224 225 226

let reachable =
  ref Lr1.NodeSet.empty

227 228 229 230 231
(* Perform the forward search. *)

let _, _ =
  A.search (fun ((s', z), path) ->
    incr explored;
232
    reachable := Lr1.NodeSet.add s' !reachable;
233 234
    (* If [z] causes an error in state [s'] and this is the first time
       we are able to trigger an error in this state, ... *)
235
    if causes_an_error s' z && not (Lr1.NodeSet.mem s' !domain) then begin
236 237
      (* Reconstruct the initial state [s] and the word [w] that lead
         to this error. *)
238 239
      let (s, _), ws = A.reverse path in
      let w = List.fold_right W.append ws (W.singleton z) in
240 241
      (* Check that the reference interpreter confirms our finding.
         At the same time, compute a list of spurious reductions. *)
242
      let nt = Lr1.nt_of_entry s in
243 244
      let target = validate nt s' w in
      (* Store this new data. *)
245
      domain := Lr1.NodeSet.add s' !domain;
246
      data := (nt, w, target) :: !data
247
    end
248
  )
249

250 251 252
(* Sort and output the data. *)

let () =
253 254
  !data
  |> List.fast_sort (fun (nt1, w1, _) (nt2, w2, _) ->
255 256
    let c = Nonterminal.compare nt1 nt2 in
    if c <> 0 then c else W.compare w2 w1
257
  )
258
  |> List.map (fun (nt, w, target) -> (nt, W.elements w, target))
259
  |> List.iter Interpret.print_messages_item
260 261 262 263 264

(* ------------------------------------------------------------------------ *)

(* Verbosity. *)

265 266 267 268 269 270 271
let max_heap_size =
  if X.verbose || X.statistics <> None then
    let stat = Gc.quick_stat() in
    (stat.Gc.top_heap_words * (Sys.word_size / 8) / 1024 / 1024)
  else
    0 (* dummy *)

272
let () =
273
  Time.tick "Forward search";
274 275 276
  if X.verbose then begin
    Printf.eprintf
      "%d graph nodes explored by forward search.\n\
277
       %d out of %d states are reachable.\n\
278 279
       Found %d states where an error can occur.\n\
       Maximum size reached by the major heap: %dM\n%!"
280
    !explored
281
    (Lr1.NodeSet.cardinal !reachable) Lr1.n
282
    (Lr1.NodeSet.cardinal !domain)
283
    max_heap_size
284
  end
285

286 287
(* ------------------------------------------------------------------------ *)

288
(* If requested by the client, write one line of statistics to a .csv file. *)
289

290 291
let stop =
  now()
292 293

let () =
294 295
  X.statistics |> Option.iter (fun filename ->
    let c = open_out_gen [ Open_creat; Open_append; Open_text ] 0o644 filename in
296
    Printf.fprintf c
297
      "%s,%d,%d,%d,%d,%d,%d,%d,%.2f,%d\n%!"
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315
      (* Grammar name. *)
      Settings.base
      (* Number of terminal symbols. *)
      Terminal.n
      (* Number of nonterminal symbols. *)
      Nonterminal.n
      (* Grammar size (not counting the error productions). *)
      begin
        Production.foldx (fun prod accu ->
          let rhs = Production.rhs prod in
          if List.mem (Symbol.T Terminal.error) (Array.to_list rhs) then
            accu
          else
            accu + Array.length rhs
        ) 0
      end
      (* Automaton size (i.e., number of states). *)
      Lr1.n
316
      (* Total trie size. *)
317
      Core.total_trie_size
318
      (* Size of [F]. *)
319
      Core.facts
320
      (* Size of [E]. *)
321
      Core.edge_facts
322 323
      (* Elapsed user time, in seconds. *)
      (stop -. start)
324 325
      (* Max heap size, in megabytes. *)
      max_heap_size
326 327
    ;
    close_out c
328
  )
329 330 331

(* ------------------------------------------------------------------------ *)

332
end