item.ml 10.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
open Grammar

(* ------------------------------------------------------------------------ *)
(* Items. *)

(* An LR(0) item encodes a pair of integers, namely the index of the
   production and the index of the bullet in the production's
   right-hand side. *)

(* Both integers are packed into a single integer, using 7 bits for
   the bullet position and the rest (usually 24 bits) for the
   production index. These widths could be adjusted. *)

type t = int

let import (prod, pos) =
  assert (pos < 128);
  (Production.p2i prod) lsl 7 + pos

let export t =
  (Production.i2p (t lsr 7), t mod 128)

(* Comparison. *)

let equal (item1 : t) (item2: t) =
  item1 = item2

(* Position. *)

let positions (item : t) =
  let prod, _ = export item in
  Production.positions prod

(* [def item] looks up the production associated with this item in the
   grammar and returns [prod, nt, rhs, pos, length], where [prod] is
   the production's index, [nt] and [rhs] represent the production,
   [pos] is the position of the bullet in the item, and [length] is
   the length of the production's right-hand side. *)

let def t =
  let prod, pos = export t in
  let nt, rhs = Production.def prod in
  let length = Array.length rhs in
  assert ((pos >= 0) && (pos <= length));
  prod, nt, rhs, pos, length

let startnt t =
  let _, _, rhs, pos, length = def t in
  assert (pos = 0 && length = 1);
  match rhs.(0) with
  | Symbol.N nt ->
      nt
  | Symbol.T _ ->
      assert false

(* Printing. *)

let print item =
POTTIER Francois's avatar
POTTIER Francois committed
59
  let _, nt, rhs, pos, _ = def item in
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
  Printf.sprintf "%s -> %s" (Nonterminal.print false nt) (Symbol.printaod 0 pos rhs)

(* Classifying items. *)

type kind =
  | Shift of Symbol.t * t
  | Reduce of Production.index

let classify item =
  let prod, _, rhs, pos, length = def item in
  if pos = length then
    Reduce prod
  else
    Shift (rhs.(pos), import (prod, pos + 1))

(* Sets of items and maps over items. Hashing these data structures is
   specifically allowed, so balanced trees (for instance) would not be
   applicable here. *)

module Map = Patricia.Big
module Set = Map.Domain

(* This functor performs precomputation that helps efficiently compute
   the closure of an LR(0) or LR(1) state. The precomputation requires
   time linear in the size of the grammar. The nature of the lookahead
   sets remains abstract. *)

(* The precomputation consists in building the LR(0) nondeterministic
   automaton. This is a graph whose nodes are items and whose edges
   are epsilon transitions. (We do not care about shift transitions
   here.) Lookahead information can be attached to nodes and is
   propagated through the graph during closure computations. *)

module Closure (L : Lookahead.S) = struct

  type state = L.t Map.t

  type node = {

      (* Nodes are sequentially numbered so as to allow applying
	 Tarjan's algorithm (below). *)

      num: int;

      (* Each node is associated with an item. *)

      item: t;

      (* All of the epsilon transitions that leave a node have the
	 same behavior with respect to lookahead information. *)

      (* The lookahead set transmitted along an epsilon transition is
	 either a constant, or the union of a constant and the lookahead
	 set at the source node. The former case corresponds to a source
	 item whose trailer is not nullable, the latter to a source item
	 whose trailer is nullable. *)

      epsilon_constant: L.t;
      epsilon_transmits: bool;

      (* Each node carries pointers to its successors through
	 epsilon transitions. This field is never modified
	 once initialization is over. *)

      mutable epsilon_transitions: node list;

      (* The following fields are transient, that is, only used
	 temporarily during graph traversals. Marks are used to
	 recognize which nodes have been traversed already. Lists
	 of predecessors are used to record which edges have been
	 traversed. Lookahead information is attached with each
	 node. *)

      mutable mark: Mark.t;
      mutable predecessors: node list;
      mutable lookahead: L.t;
    }

  (* Allocate one graph node per item and build a mapping of
     items to nodes. *)

  let count =
    ref 0

  let mapping : node array array =
    Array.create Production.n [||]

  let item2node item =
    let prod, pos = export item in
    mapping.(Production.p2i prod).(pos)

  let () =
    Production.iter (fun prod ->
POTTIER Francois's avatar
POTTIER Francois committed
153
      let _nt, rhs = Production.def prod in
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
      let length = Array.length rhs in
      mapping.(Production.p2i prod) <- Array.init (length+1) (fun pos ->

	let item = import (prod, pos) in
	let num = !count in
	count := num + 1;

	(* The lookahead set transmitted through an epsilon
	   transition is the FIRST set of the remainder of
	   the source item, plus, if that is nullable, the
	   lookahead set of the source item. *)

	let constant, transmits =
	  if pos < length then
	    let nullable, first = Analysis.nullable_first_rhs rhs (pos + 1) in
	    L.constant first, nullable
	  else
	    (* No epsilon transitions leave this item. *)
	    L.empty, false
	in
	  
	{
	  num = num;
	  item = item;
	  epsilon_constant = constant;
  	  epsilon_transmits = transmits;
	  epsilon_transitions = []; (* temporary placeholder *)
	  mark = Mark.none;
	  predecessors = [];
  	  lookahead = L.empty;
	}

      )
    )
  
  (* At each node, compute transitions. *)

  let () =
    Production.iter (fun prod ->
POTTIER Francois's avatar
POTTIER Francois committed
193
      let _nt, rhs = Production.def prod in
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
      let length = Array.length rhs in
      Array.iteri (fun pos node ->

	node.epsilon_transitions <-
	  if pos < length then
	    match rhs.(pos) with
	    | Symbol.N nt ->
		Production.foldnt nt [] (fun prod nodes ->
		  (item2node (import (prod, 0))) :: nodes
		)
	    | Symbol.T _ ->
		[]
	  else
	    []

      ) mapping.(Production.p2i prod)
    )

  (* Detect and reject cycles of transitions that transmit a lookahead
     set.

     We need to ensure that there are no such cycles in order to be
     able to traverse these transitions in topological order.

     Each such cycle corresponds to a set of productions of the form
     A1 -> A2, A2 -> A3, ..., An -> A1 (modulo nullable
     trailers). Such cycles are unlikely to occur in realistic
     grammars, so our current approach is to reject the grammar if
     such a cycle exists. Actually, according to DeRemer and Pennello
     (1982), such a cycle is exactly an includes cycle, and implies
     that the grammar is not LR(k) for any k, unless A1, ..., An are
     in fact uninhabited. In other words, this is a pathological
     case. *)

  (* Yes, indeed, this is called a cycle in Aho & Ullman's book,
     and a loop in Grune & Jacobs' book. It is not difficult to
     see that (provided all symbols are inhabited) the grammar
     is infinitely ambiguous if and only if there is a loop. *)

  module P = struct

    type foo = node
    type node = foo

    let n =
      !count

    let index node =
      node.num

    let iter f =
      Array.iter (fun nodes ->
	Array.iter f nodes
      ) mapping

    let successors f node =
      if node.epsilon_transmits then
	List.iter f node.epsilon_transitions

  end

  module T = Tarjan.Run (P)

  let cycle scc =
    let items = List.map (fun node -> node.item) scc in
    let positions = List.flatten (List.map positions items) in
    let names = String.concat "\n" (List.map print items) in
    Error.error
      positions
      (Printf.sprintf "the grammar is ambiguous.\n\
                       The following items participate in an epsilon-cycle:\n\
                       %s" names)

  let () =
     P.iter (fun node ->
       let scc = T.scc node in
       match scc with
       | [] ->
	   ()
       | [ node ] ->

	   (* This is a strongly connected component of one node. Check
	      whether it carries a self-loop. Forbidding self-loops is not
	      strictly required by the code that follows, but is consistent
	      with the fact that we forbid cycles of length greater than 1. *)

	   P.successors (fun successor ->
	     if successor.num = node.num then
	       cycle scc
           ) node

       | _ ->

           (* This is a strongly connected component of at least two
              elements. *)

	   cycle scc
     )

  (* Closure computation. *)

  let closure (items : state) : state =

    (* Explore the graph forwards, starting from these items. Marks
       are used to tell which nodes have been visited. Build a list of
       all visited nodes; this is in fact the list of all items in the
       closure.

       At initial nodes and when reaching a node through a transition,
       record a lookahead set.

       When we reach a node through a transition that transmits the
       lookahead set found at its source, record its source, so as to
       allow re-traversing this transition backwards (below). *)

    let this = Mark.fresh() in
    let nodes = ref [] in

    let rec visit father transmits toks node =
      if Mark.same node.mark this then begin
	(* Node has been visited already. *)
	node.lookahead <- L.union toks node.lookahead;
	if transmits then
	  node.predecessors <- father :: node.predecessors
      end
      else begin
	(* Node is new. *)
	node.predecessors <- if transmits then [ father ] else [];
	node.lookahead <- toks;
	follow node
      end

    and follow node =
      node.mark <- this;
      nodes := node :: !nodes;
      List.iter
	(visit node node.epsilon_transmits node.epsilon_constant)
	node.epsilon_transitions

    in

    Map.iter (fun item toks ->
      let node = item2node item in
      visit node (* dummy! *) false toks node
    ) items;

    let nodes =
      !nodes in

    (* Explore the graph of transmitting transitions backwards. By
       hypothesis, it is acyclic, so this is a topological
       walk. Lookahead sets are inherited through transitions. *)

    let this = Mark.fresh() in

    let rec walk node =
      if not (Mark.same node.mark this) then begin
	(* Node is new. *)
	node.mark <- this;
	(* Explore all predecessors and merge their lookahead
	   sets into the current node's own lookahead set. *)
	List.iter (fun predecessor ->
	  walk predecessor;
	  node.lookahead <- L.union predecessor.lookahead node.lookahead
        ) node.predecessors
      end
    in

    List.iter walk nodes;

    (* Done. Produce a mapping of items to lookahead sets.
       Clear all transient fields so as to reduce pressure
       on the GC -- this does not make much difference. *)

    List.fold_left (fun closure node ->
      node.predecessors <- [];
      let closure = Map.add node.item node.lookahead closure in
      node.lookahead <- L.empty;
      closure
    ) Map.empty nodes

  (* End of closure computation *)

end