item.ml 12 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

14 15 16 17 18 19 20 21 22 23 24 25 26
open Grammar

(* ------------------------------------------------------------------------ *)
(* Items. *)

(* An LR(0) item encodes a pair of integers, namely the index of the
   production and the index of the bullet in the production's
   right-hand side. *)

(* Both integers are packed into a single integer, using 7 bits for
   the bullet position and the rest (usually 24 bits) for the
   production index. These widths could be adjusted. *)

27 28 29
(* The function [export] is duplicated in [TableInterpreter]. Do not
   modify it; or modify it here and there in a consistent manner. *)

30 31 32 33 34 35 36 37 38
type t = int

let import (prod, pos) =
  assert (pos < 128);
  (Production.p2i prod) lsl 7 + pos

let export t =
  (Production.i2p (t lsr 7), t mod 128)

39 40 41
let marshal (item : t) : int =
  item

42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
(* Comparison. *)

let equal (item1 : t) (item2: t) =
  item1 = item2

(* Position. *)

let positions (item : t) =
  let prod, _ = export item in
  Production.positions prod

(* [def item] looks up the production associated with this item in the
   grammar and returns [prod, nt, rhs, pos, length], where [prod] is
   the production's index, [nt] and [rhs] represent the production,
   [pos] is the position of the bullet in the item, and [length] is
   the length of the production's right-hand side. *)

let def t =
  let prod, pos = export t in
  let nt, rhs = Production.def prod in
  let length = Array.length rhs in
  assert ((pos >= 0) && (pos <= length));
  prod, nt, rhs, pos, length

let startnt t =
  let _, _, rhs, pos, length = def t in
  assert (pos = 0 && length = 1);
  match rhs.(0) with
  | Symbol.N nt ->
      nt
  | Symbol.T _ ->
      assert false

(* Printing. *)

let print item =
78
  let _, nt, rhs, pos, _ = def item in
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118
  Printf.sprintf "%s -> %s" (Nonterminal.print false nt) (Symbol.printaod 0 pos rhs)

(* Classifying items. *)

type kind =
  | Shift of Symbol.t * t
  | Reduce of Production.index

let classify item =
  let prod, _, rhs, pos, length = def item in
  if pos = length then
    Reduce prod
  else
    Shift (rhs.(pos), import (prod, pos + 1))

(* Sets of items and maps over items. Hashing these data structures is
   specifically allowed, so balanced trees (for instance) would not be
   applicable here. *)

module Map = Patricia.Big
module Set = Map.Domain

(* This functor performs precomputation that helps efficiently compute
   the closure of an LR(0) or LR(1) state. The precomputation requires
   time linear in the size of the grammar. The nature of the lookahead
   sets remains abstract. *)

(* The precomputation consists in building the LR(0) nondeterministic
   automaton. This is a graph whose nodes are items and whose edges
   are epsilon transitions. (We do not care about shift transitions
   here.) Lookahead information can be attached to nodes and is
   propagated through the graph during closure computations. *)

module Closure (L : Lookahead.S) = struct

  type state = L.t Map.t

  type node = {

      (* Nodes are sequentially numbered so as to allow applying
119
         Tarjan's algorithm (below). *)
120 121 122 123 124 125 126 127

      num: int;

      (* Each node is associated with an item. *)

      item: t;

      (* All of the epsilon transitions that leave a node have the
128
         same behavior with respect to lookahead information. *)
129 130

      (* The lookahead set transmitted along an epsilon transition is
131 132 133 134
         either a constant, or the union of a constant and the lookahead
         set at the source node. The former case corresponds to a source
         item whose trailer is not nullable, the latter to a source item
         whose trailer is nullable. *)
135 136 137 138 139

      epsilon_constant: L.t;
      epsilon_transmits: bool;

      (* Each node carries pointers to its successors through
140 141
         epsilon transitions. This field is never modified
         once initialization is over. *)
142 143 144 145

      mutable epsilon_transitions: node list;

      (* The following fields are transient, that is, only used
146 147 148 149 150
         temporarily during graph traversals. Marks are used to
         recognize which nodes have been traversed already. Lists
         of predecessors are used to record which edges have been
         traversed. Lookahead information is attached with each
         node. *)
151 152 153 154 155 156 157 158 159 160 161 162 163

      mutable mark: Mark.t;
      mutable predecessors: node list;
      mutable lookahead: L.t;
    }

  (* Allocate one graph node per item and build a mapping of
     items to nodes. *)

  let count =
    ref 0

  let mapping : node array array =
164
    Array.make Production.n [||]
165 166 167 168 169 170 171

  let item2node item =
    let prod, pos = export item in
    mapping.(Production.p2i prod).(pos)

  let () =
    Production.iter (fun prod ->
172
      let _nt, rhs = Production.def prod in
173 174 175
      let length = Array.length rhs in
      mapping.(Production.p2i prod) <- Array.init (length+1) (fun pos ->

176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
        let item = import (prod, pos) in
        let num = !count in
        count := num + 1;

        (* The lookahead set transmitted through an epsilon
           transition is the FIRST set of the remainder of
           the source item, plus, if that is nullable, the
           lookahead set of the source item. *)

        let constant, transmits =
          if pos < length then
            let nullable, first = Analysis.nullable_first_prod prod (pos + 1) in
            L.constant first, nullable
          else
            (* No epsilon transitions leave this item. *)
            L.empty, false
        in
193

194 195 196 197 198 199 200 201 202 203
        {
          num = num;
          item = item;
          epsilon_constant = constant;
          epsilon_transmits = transmits;
          epsilon_transitions = []; (* temporary placeholder *)
          mark = Mark.none;
          predecessors = [];
          lookahead = L.empty;
        }
204 205 206

      )
    )
207

208 209 210 211
  (* At each node, compute transitions. *)

  let () =
    Production.iter (fun prod ->
212
      let _nt, rhs = Production.def prod in
213 214 215
      let length = Array.length rhs in
      Array.iteri (fun pos node ->

216 217 218 219 220 221 222 223 224 225 226
        node.epsilon_transitions <-
          if pos < length then
            match rhs.(pos) with
            | Symbol.N nt ->
                Production.foldnt nt [] (fun prod nodes ->
                  (item2node (import (prod, 0))) :: nodes
                )
            | Symbol.T _ ->
                []
          else
            []
227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264

      ) mapping.(Production.p2i prod)
    )

  (* Detect and reject cycles of transitions that transmit a lookahead
     set.

     We need to ensure that there are no such cycles in order to be
     able to traverse these transitions in topological order.

     Each such cycle corresponds to a set of productions of the form
     A1 -> A2, A2 -> A3, ..., An -> A1 (modulo nullable
     trailers). Such cycles are unlikely to occur in realistic
     grammars, so our current approach is to reject the grammar if
     such a cycle exists. Actually, according to DeRemer and Pennello
     (1982), such a cycle is exactly an includes cycle, and implies
     that the grammar is not LR(k) for any k, unless A1, ..., An are
     in fact uninhabited. In other words, this is a pathological
     case. *)

  (* Yes, indeed, this is called a cycle in Aho & Ullman's book,
     and a loop in Grune & Jacobs' book. It is not difficult to
     see that (provided all symbols are inhabited) the grammar
     is infinitely ambiguous if and only if there is a loop. *)

  module P = struct

    type foo = node
    type node = foo

    let n =
      !count

    let index node =
      node.num

    let iter f =
      Array.iter (fun nodes ->
265
        Array.iter f nodes
266 267 268 269
      ) mapping

    let successors f node =
      if node.epsilon_transmits then
270
        List.iter f node.epsilon_transitions
271 272 273 274 275 276 277 278 279 280 281

  end

  module T = Tarjan.Run (P)

  let cycle scc =
    let items = List.map (fun node -> node.item) scc in
    let positions = List.flatten (List.map positions items) in
    let names = String.concat "\n" (List.map print items) in
    Error.error
      positions
282 283 284
      "the grammar is ambiguous.\n\
       The following items participate in an epsilon-cycle:\n\
       %s" names
285 286 287 288 289 290

  let () =
     P.iter (fun node ->
       let scc = T.scc node in
       match scc with
       | [] ->
291
           ()
292 293
       | [ node ] ->

294 295 296 297
           (* This is a strongly connected component of one node. Check
              whether it carries a self-loop. Forbidding self-loops is not
              strictly required by the code that follows, but is consistent
              with the fact that we forbid cycles of length greater than 1. *)
298

299 300 301
           P.successors (fun successor ->
             if successor.num = node.num then
               cycle scc
302 303 304 305 306 307 308
           ) node

       | _ ->

           (* This is a strongly connected component of at least two
              elements. *)

309
           cycle scc
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
     )

  (* Closure computation. *)

  let closure (items : state) : state =

    (* Explore the graph forwards, starting from these items. Marks
       are used to tell which nodes have been visited. Build a list of
       all visited nodes; this is in fact the list of all items in the
       closure.

       At initial nodes and when reaching a node through a transition,
       record a lookahead set.

       When we reach a node through a transition that transmits the
       lookahead set found at its source, record its source, so as to
       allow re-traversing this transition backwards (below). *)

    let this = Mark.fresh() in
    let nodes = ref [] in

    let rec visit father transmits toks node =
      if Mark.same node.mark this then begin
333 334 335 336
        (* Node has been visited already. *)
        node.lookahead <- L.union toks node.lookahead;
        if transmits then
          node.predecessors <- father :: node.predecessors
337 338
      end
      else begin
339 340 341 342
        (* Node is new. *)
        node.predecessors <- if transmits then [ father ] else [];
        node.lookahead <- toks;
        follow node
343 344 345 346 347 348
      end

    and follow node =
      node.mark <- this;
      nodes := node :: !nodes;
      List.iter
349 350
        (visit node node.epsilon_transmits node.epsilon_constant)
        node.epsilon_transitions
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369

    in

    Map.iter (fun item toks ->
      let node = item2node item in
      visit node (* dummy! *) false toks node
    ) items;

    let nodes =
      !nodes in

    (* Explore the graph of transmitting transitions backwards. By
       hypothesis, it is acyclic, so this is a topological
       walk. Lookahead sets are inherited through transitions. *)

    let this = Mark.fresh() in

    let rec walk node =
      if not (Mark.same node.mark this) then begin
370 371 372 373 374 375 376
        (* Node is new. *)
        node.mark <- this;
        (* Explore all predecessors and merge their lookahead
           sets into the current node's own lookahead set. *)
        List.iter (fun predecessor ->
          walk predecessor;
          node.lookahead <- L.union predecessor.lookahead node.lookahead
377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
        ) node.predecessors
      end
    in

    List.iter walk nodes;

    (* Done. Produce a mapping of items to lookahead sets.
       Clear all transient fields so as to reduce pressure
       on the GC -- this does not make much difference. *)

    List.fold_left (fun closure node ->
      node.predecessors <- [];
      let closure = Map.add node.item node.lookahead closure in
      node.lookahead <- L.empty;
      closure
    ) Map.empty nodes

  (* End of closure computation *)

end