invariant.ml 30.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

14 15 16 17
 (* This module discovers information about the shape and content of the stack
    in each of the automaton's states. *)

open Grammar
18
module C = Conflict (* artificial dependency; ensures that [Conflict] runs first *)
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44

(* ------------------------------------------------------------------------ *)
(* Compute a lower bound on the height of the stack at every state. At the
   same time, compute which symbols are held in this stack prefix. *)

(* In order to compute (a lower bound on) the height of the stack at a state
   [s], we examine the LR(0) items that compose [s]. For each item, if the
   bullet is at position [pos], then we can be assured that the height of the
   stack is at least [pos]. Thus, we compute the maximum of [pos] over all
   items (of which there is at least one). *)

(* The set of items that we use is not closed, but this does not matter; the
   items that would be added by the closure would not add any information
   regarding the height of the stack, since the bullet is at position 0 in
   these items. *)

(* Instead of computing just the stack height, we compute, in the same manner,
   which symbols are on the stack at a state [s]. This is an array of symbols
   whose length is the height of the stack at [s]. By convention, the top of
   the stack is the end of the array. *)

(* We first compute and tabulate this information at the level of the LR(0)
   automaton. *)

let stack_symbols : Lr0.node -> Symbol.t array =
  let dummy =
45
    Array.make 0 (Symbol.T Terminal.sharp)
46 47 48
  in
  Misc.tabulate Lr0.n (fun node ->
    Item.Set.fold (fun item accu ->
49
      let _prod, _nt, rhs, pos, _length = Item.def item in
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
      if pos > Array.length accu then Array.sub rhs 0 pos else accu
    ) (Lr0.items node) dummy
  )

(* Then, it is easy to extend it to the LR(1) automaton. *)

let stack_symbols (node : Lr1.node) : Symbol.t array =
  stack_symbols (Lr0.core (Lr1.state node))

let stack_height (node : Lr1.node) : int =
  Array.length (stack_symbols node)

(* ------------------------------------------------------------------------ *)
(* Above, we have computed a prefix of the stack at every state. We have
   computed the length of this prefix and the symbols that are held in
   this prefix of the stack. Now, compute which states may be held in this
   prefix. *)

(* In order to compute this information, we perform an analysis of the
   automaton, via a least fixed fixed point computation. *)

(* It is worth noting that it would be possible to use an analysis based on a
   least fixed point computation to discover at the same time the length of
   the stack prefix, the symbols that it contains, and the states that it may
   contain. This alternate approach, which was used until 2012/08/25, would
   lead us to discovering a richer invariant, that is, potentially longer
   prefixes. This extra information, however, was useless; computing it was a
   waste of time. Hence, as of 2012/08/25, the height of the stack prefix and
   the symbols that it contains are predicted (see above), and the least fixed
   computation is used only to populate these prefixes of predictable length
   with state information. *)

(* By the way, this least fixed point analysis remains the most costly
   computation throughout this module. *)

(* Vectors of sets of states. *)

module StateVector = struct

  type property =
    Lr1.NodeSet.t list

  let empty =
    []

  let rec equal v1 v2 =
    match v1, v2 with
    | [], [] ->
        true
    | states1 :: v1, states2 :: v2 ->
        Lr1.NodeSet.equal states1 states2 &&
        equal v1 v2
    | _, _ ->
        (* Because all heights are known ahead of time, we are able
104
           to (and careful to) compare only vectors of equal length. *)
105 106 107 108 109
        assert false

  let rec join v1 v2 =
    match v1, v2 with
    | [], [] ->
110
        []
111
    | states1 :: v1, states2 :: v2 ->
112 113
        Lr1.NodeSet.union states1 states2 ::
        join v1 v2
114 115
    | _, _ ->
        (* Because all heights are known ahead of time, we are able
116 117
           to (and careful to) compare only vectors of equal length. *)
        assert false
118 119 120 121

  let push v x =
    x :: v

122 123
  let truncate =
    MenhirLib.General.take
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162

end

(* In order to perform the fixed point computation, we must extend our type of
   vectors with a bottom element. This element will not appear in the least
   fixed point, provided every state of the automaton is reachable. *)

module StateLattice = struct

  type property =
    | Bottom
    | NonBottom of StateVector.property

  let bottom =
    Bottom

  let empty =
    NonBottom StateVector.empty

  let equal v1 v2 =
    match v1, v2 with
    | Bottom, Bottom ->
        true
    | NonBottom v1, NonBottom v2 ->
        StateVector.equal v1 v2
    | _, _ ->
      false

  let join v1 v2 =
    match v1, v2 with
    | Bottom, v
    | v, Bottom ->
        v
    | NonBottom v1, NonBottom v2 ->
        NonBottom (StateVector.join v1 v2)

  let push v x =
    match v with
    | Bottom ->
163
        Bottom
164 165 166 167 168 169 170 171
    | NonBottom v ->
        NonBottom (StateVector.push v x)

  let truncate h v =
    match v with
    | Bottom ->
        Bottom
    | NonBottom v ->
172
        NonBottom (StateVector.truncate h v)
173 174 175 176 177 178 179 180 181 182 183 184 185

  let is_maximal _ =
    false

end

open StateLattice

(* Define the fixed point. *)

let stack_states : Lr1.node -> property =

  let module F =
186 187 188
    Fix.Make
      (Maps.PersistentMapsToImperativeMaps(Lr1.NodeMap))
      (StateLattice)
189 190 191 192 193 194 195 196 197 198
  in

  F.lfp (fun node (get : Lr1.node -> property) ->

    (* We use the fact that a state has incoming transitions if and only if
       it is not a start state. *)

    match Lr1.incoming_symbol node with

    | None ->
199
        assert (Lr1.predecessors node = []);
200 201
        assert (stack_height node = 0);

202 203
        (* If [node] is a start state, then the stack at [node] may be (in
           fact, must be) the empty stack. *)
204

205
        empty
206

207
    | Some _symbol ->
208

209 210 211 212 213 214
        (* If [node] is not a start state, then include the contribution of
           every incoming transition. We compute a join over all predecessors.
           The contribution of one predecessor is the abstract value found at
           this predecessor, extended with a new cell for this transition, and
           truncated to the stack height at [node], so as to avoid obtaining a
           vector that is longer than expected/necessary. *)
215

216
        let height = stack_height node in
217

218 219 220 221 222 223
        List.fold_left (fun v predecessor ->
          join v
            (truncate height
              (push (get predecessor) (Lr1.NodeSet.singleton predecessor))
            )
        ) bottom (Lr1.predecessors node)
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247

  )

(* If every state is reachable, then the least fixed point must be non-bottom
   everywhere, so we may view it as a function that produces a vector of sets
   of states. *)

let stack_states (node : Lr1.node) : StateVector.property =
  match stack_states node with
  | Bottom ->
      (* apparently this node is unreachable *)
      assert false
  | NonBottom v ->
      v

(* ------------------------------------------------------------------------ *)
(* For each production, compute where (that is, in which states) this
   production can be reduced. *)

let production_where : Lr1.NodeSet.t ProductionMap.t =
  Lr1.fold (fun accu node ->
    TerminalMap.fold (fun _ prods accu ->
      let prod = Misc.single prods in
      let nodes =
248 249 250 251
        try
          ProductionMap.lookup prod accu
        with Not_found ->
          Lr1.NodeSet.empty
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
      in
      ProductionMap.add prod (Lr1.NodeSet.add node nodes) accu
    ) (Lr1.reductions node) accu
  ) ProductionMap.empty

let production_where (prod : Production.index) : Lr1.NodeSet.t =
  try
    (* Production [prod] may be reduced at [nodes]. *)
    let nodes = ProductionMap.lookup prod production_where in
    assert (not (Lr1.NodeSet.is_empty nodes));
    nodes
  with Not_found ->
    (* The production [prod] is never reduced. *)
    Lr1.NodeSet.empty

267 268 269
let may_reduce node prod =
  Lr1.NodeSet.mem node (production_where prod)

270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
let ever_reduced prod =
  not (Lr1.NodeSet.is_empty (production_where prod))

let fold_reduced f prod accu =
  Lr1.NodeSet.fold f (production_where prod) accu

(* ------------------------------------------------------------------------ *)
(* Warn about productions that are never reduced. *)

let () =
  let count = ref 0 in
  Production.iter (fun prod ->
    if Lr1.NodeSet.is_empty (production_where prod) then
      match Production.classify prod with
      | Some nt ->
285 286 287 288
          incr count;
          Error.grammar_warning
            (Nonterminal.positions nt)
            "symbol %s is never accepted." (Nonterminal.print false nt)
289
      | None ->
290 291 292 293
          incr count;
          Error.grammar_warning
            (Production.positions prod)
            "production %sis never reduced." (Production.print prod)
294 295 296
  );
  if !count > 0 then
    Error.grammar_warning []
297
      "in total, %d productions are never reduced." !count
298 299 300 301 302 303 304 305 306 307 308 309 310 311

(* ------------------------------------------------------------------------ *)
(* From the above information, deduce, for each production, the states that
   may appear in the stack when this production is reduced. *)

(* We are careful to produce a vector of states whose length is exactly that
   of the production [prod]. *)

let production_states : Production.index -> StateLattice.property =
  Production.tabulate (fun prod ->
    let nodes = production_where prod in
    let height = Production.length prod in
    Lr1.NodeSet.fold (fun node accu ->
      join accu
312
        (truncate height
313 314
           (NonBottom (stack_states node))
        )
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
    ) nodes bottom
  )

(* ------------------------------------------------------------------------ *)
(* We now determine which states must be represented, that is,
   explicitly pushed onto the stack. For simplicity, a state is either
   always represented or never represented. More fine-grained
   strategies, where a single state is sometimes pushed onto the stack
   and sometimes not pushed, depending on which outgoing transition is
   being taken, are conceivable, but quite tricky, and probably not
   worth the trouble.

   (1) If two states are liable to appear within a single stack cell,
   then one is represented if and only if the other is
   represented. This ensures that the structure of stacks is known
   everywhere and that we can propose types for stacks.

   (2) If a state [s] has an outgoing transition along nonterminal
   symbol [nt], and if the [goto] table for symbol [nt] has more than
   one target, then state [s] is represented.

   (3) If a stack cell contains more than one state and if at least
   one of these states is able to handle the [error] token, then these
   states are represented.

   (4) If the semantic action associated with a production mentions
   the [$syntaxerror] keyword, then the state that is being reduced to
   (that is, the state that initiated the recognition of this
   production) is represented. (Indeed, it will be passed as an
   argument to [errorcase].) *)

(* Data. *)

let rep : bool UnionFind.point array =
  Array.init Lr1.n (fun _ -> UnionFind.fresh false)

(* Getter. *)

let represented state =
  rep.(Lr1.number state)

(* Setters. *)

let represent state =
  UnionFind.change (represented state) true

let represents states =
  represent (Lr1.NodeSet.choose states)

(* Enforce condition (1) above. *)

let share (v : StateVector.property) =
  List.iter (fun states ->
    let dummy = UnionFind.fresh false in
    Lr1.NodeSet.iter (fun state ->
      UnionFind.eunion dummy (represented state)
    ) states
  ) v

let () =
  Lr1.iter (fun node ->
    share (stack_states node)
  );
  Production.iter (fun prod ->
    match production_states prod with
    | Bottom ->
381
        ()
382
    | NonBottom v ->
383
        share v
384 385 386 387 388 389
  )

(* Enforce condition (2) above. *)

let () =
  Nonterminal.iter (fun nt ->
390
    let count =
391
      Lr1.targets (fun count _ _ ->
392
        count + 1
393 394 395 396
      ) 0 (Symbol.N nt)
    in
    if count > 1 then
      Lr1.targets (fun () sources _ ->
397
        List.iter represent sources
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
      ) () (Symbol.N nt)
  )

(* Enforce condition (3) above. *)

let handler state =
  try
    let _ = SymbolMap.find (Symbol.T Terminal.error) (Lr1.transitions state) in
    true
  with Not_found ->
    try
      let _ = TerminalMap.lookup Terminal.error (Lr1.reductions state) in
      true
    with Not_found ->
      false

let handlers states =
  Lr1.NodeSet.exists handler states

let () =
  Lr1.iter (fun node ->
    let v = stack_states node in
    List.iter (fun states ->
      if Lr1.NodeSet.cardinal states >= 2 && handlers states then
422
        represents states
423 424 425 426 427 428 429 430 431 432
    ) v
  )

(* Enforce condition (4) above. *)

let () =
  Production.iterx (fun prod ->
    if Action.has_syntaxerror (Production.action prod) then
      match production_states prod with
      | Bottom ->
433
          ()
434
      | NonBottom v ->
435 436 437 438 439 440 441
          let sites = production_where prod in
          let length = Production.length prod in
          if length = 0 then
            Lr1.NodeSet.iter represent sites
          else
            let states = List.nth v (length - 1) in
            represents states
442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
  )

(* Define accessors. *)

let represented state =
  UnionFind.find (represented state)

let representeds states =
  if Lr1.NodeSet.is_empty states then
    assert false
  else
    represented (Lr1.NodeSet.choose states)

(* Statistics. *)

let () =
  Error.logC 1 (fun f ->
    let count =
      Lr1.fold (fun count node ->
        if represented node then count + 1 else count
      ) 0
    in
    Printf.fprintf f "%d out of %d states are represented.\n" count Lr1.n
  )

(* ------------------------------------------------------------------------ *)
(* Accessors for information about the stack. *)

(* We describe a stack prefix as a list of cells, where each cell is a pair
   of a symbol and a set of states. The top of the stack is the head of the
   list. *)

type cell =
    Symbol.t * Lr1.NodeSet.t

type word =
    cell list

(* This auxiliary function converts a stack-as-an-array (top of stack
   at the right end) to a stack-as-a-list (top of stack at list head). *)

let convert a =
  let n = Array.length a in
  let rec loop i accu =
    if i = n then accu else loop (i + 1) (a.(i) :: accu)
  in
  loop 0 []

(* [stack s] describes the stack when the automaton is in state [s]. *)

let stack node : word =
  List.combine
    (convert (stack_symbols node))
    (stack_states node)

(* [prodstack prod] describes the stack when production [prod] is about to be
   reduced. *)

let prodstack prod : word =
  match production_states prod with
  | Bottom ->
      (* This production is never reduced. *)
      assert false
  | NonBottom v ->
      List.combine
507 508
        (convert (Production.rhs prod))
        v
509 510 511 512 513 514 515 516 517

(* [gotostack nt] is the structure of the stack when a shift
   transition over nonterminal [nt] is about to be taken. It
   consists of just one cell. *)

let gotostack : Nonterminal.t -> word =
  Nonterminal.tabulate (fun nt ->
    let sources =
      Lr1.targets (fun accu sources _ ->
518
        List.fold_right Lr1.NodeSet.add sources accu
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
      ) Lr1.NodeSet.empty (Symbol.N nt)
    in
    [ Symbol.N nt, sources ]
  )

let fold f accu w =
  List.fold_right (fun (symbol, states) accu ->
    f accu (representeds states) symbol states
  ) w accu

let fold_top f accu w =
  match w with
  | [] ->
      accu
  | (symbol, states) :: _ ->
      f (representeds states) symbol

536 537 538 539 540 541 542 543
let print (w : word) =
  let b = Buffer.create 64 in
  fold (fun () _represented symbol _states ->
    Buffer.add_string b (Symbol.print symbol);
    Buffer.add_char b ' '
  ) () w;
  Buffer.contents b

544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571
(* ------------------------------------------------------------------------ *)
(* Explain how the stack should be deconstructed when an error is
   found.

   We sometimes have a choice as too how many stack cells should be
   popped. Indeed, several cells in the known suffix of the stack may
   physically hold a state. If neither of these states handles errors,
   then we could jump to either. (Indeed, if we jump to one that's
   nearer, it will in turn pop further stack cells and jump to one
   that's farther.) In the interests of code size, we should pop as
   few stack cells as possible. So, we jump to the topmost represented
   state in the known suffix. *)

type state =
  | Represented
  | UnRepresented of Lr1.node

type instruction =
  | Die
  | DownTo of word * state

let rewind node : instruction =
  let w = stack node in

  let rec rewind w =
    match w with
    | [] ->

572 573 574 575
        (* I believe that every stack description either is definite
           (that is, ends with [TailEmpty]) or contains at least one
           represented state. Thus, if we find an empty [w], this
           means that the stack is definitely empty. *)
576 577 578 579 580

        Die

    | ((_, states) as cell) :: w ->

581
        if representeds states then
582

583 584
          (* Here is a represented state. We will pop this
             cell and no more. *)
585

586
          DownTo ([ cell ], Represented)
587

588
        else if handlers states then begin
589

590 591 592 593
          (* Here is an unrepresented state that can handle
             errors. The cell must hold a singleton set of states, so
             we know which state to jump to, even though it isn't
             represented. *)
594

595 596 597
          assert (Lr1.NodeSet.cardinal states = 1);
          let state = Lr1.NodeSet.choose states in
          DownTo ([ cell ], UnRepresented state)
598

599 600
        end
        else
601

602 603
          (* Here is an unrepresented state that does not handle
             errors. Pop this cell and look further. *)
604

605 606 607 608 609
          match rewind w with
          | Die ->
              Die
          | DownTo (w, st) ->
              DownTo (cell :: w, st)
610 611 612 613 614

  in
  rewind w

(* ------------------------------------------------------------------------ *)
615

616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657
(* Machinery for the computation of which symbols must keep track of their
   start or end positions. *)

open Keyword

type variable =
  Symbol.t * where (* WhereStart or WhereEnd *)

module M : Fix.IMPERATIVE_MAPS with type key = variable = struct
  type key = variable
  type 'data t = {
    mutable startp: 'data SymbolMap.t;
    mutable endp:   'data SymbolMap.t;
  }
  open SymbolMap
  let create() =
    { startp = empty; endp = empty }
  let clear m =
    m.startp <- empty; m.endp <- empty
  let add (sym, where) data m =
    match where with
    | WhereStart ->
        m.startp <- add sym data m.startp
    | WhereEnd ->
        m.endp <- add sym data m.endp
    | WhereSymbolStart ->
        assert false
  let find (sym, where) m =
    match where with
    | WhereStart ->
        find sym m.startp
    | WhereEnd ->
        find sym m.endp
    | WhereSymbolStart ->
        assert false
  let iter f m =
    iter (fun sym -> f (sym, WhereStart)) m.startp;
    iter (fun sym -> f (sym, WhereEnd)) m.endp
end

(* ------------------------------------------------------------------------ *)

658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673
(* We now determine which positions must be kept track of. For simplicity, we
   do this on a per-symbol basis. That is, for each symbol, either we never
   keep track of position information, or we always do. In fact, we do
   distinguish start and end positions. This leads to computing two sets of
   symbols -- those that keep track of their start position and those that
   keep track of their end position.

   A symbol on the right-hand side of a production must keep track of its
   (start or end) position if that position is explicitly requested by a
   semantic action.

   Furthermore, if the left-hand symbol of a production must keep track of its
   start (resp. end) position, then the first (resp. last) symbol of its
   right-hand side (if there is one) must do so as well. That is, unless the
   right-hand side is empty. *)

674 675
(* 2015/11/11. When a production [prod] is reduced, the top stack cell may be
   consulted for its end position. This implies that this cell must exist
676 677
   and must store an end position! Now, when does this happen?

678 679
   1- This happens if [prod] is an epsilon production and the left-hand symbol
      of the production, [nt prod], keeps track of its start or end position.
680 681 682 683 684 685 686 687 688 689 690
   2- This happens if the semantic action explicitly mentions the keyword
      [$endpos($0)].

   Now, if this happens, what should we do?

   a- If this happens in a state [s] whose incoming symbol is [sym], then [sym]
      must keep track of its end position.
   b- If this happens in an initial state, where the stack may be empty, then
      the sentinel cell at the bottom of the stack must contain an end position.

   Point (b) doesn't concern us here, but point (a) does. We must implement the
691 692
   constraint (1) \/ (2) -> (a). Point (b) is taken care of in the code back-end,
   where, for simplicity, we always create a sentinel cell. *)
693

694 695 696 697
(* I will say that this is a lot more sophisticated than I would like. The code
   back-end has been known for its efficiency and I am trying to maintain this
   property -- in particular, I would like to keep track of no positions at all,
   if the user doesn't use any position keyword. But I am suffering. *)
698

699 700
module S =
  FixSolver.Make(M)(Boolean)
701

702 703
let record_ConVar, record_VarVar, solve =
  S.create()
704 705

let () =
706 707 708 709 710

  (* We gather the constraints explained above in two loops. The first loop
     looks at every (non-start) production [prod]. The second loop looks at
     every (non-initial) state [s]. *)

711
  Production.iterx (fun prod ->
712 713

    let nt, rhs = Production.def prod
714 715
    and ids = Production.identifiers prod
    and action = Production.action prod in
716 717 718 719 720 721 722 723 724 725 726
    let length = Array.length rhs in

    if length > 0 then begin
      (* If [nt] keeps track of its start position, then the first symbol
         in the right-hand side must do so as well. *)
      record_VarVar (Symbol.N nt, WhereStart) (rhs.(0), WhereStart);
      (* If [nt] keeps track of its end position, then the last symbol
         in the right-hand side must do so as well. *)
      record_VarVar (Symbol.N nt, WhereEnd) (rhs.(length - 1), WhereEnd)
    end;

727 728
    KeywordSet.iter (function
      | SyntaxError ->
729
          ()
730
      | Position (Before, _, _) ->
731 732
          (* Doing nothing here because [$endpos($0)] is dealt with in
             the second loop. *)
733
          ()
734 735 736 737 738 739
      | Position (Left, _, _) ->
          (* [$startpos] and [$endpos] have been expanded away. *)
          assert false
      | Position (RightNamed _, WhereSymbolStart, _) ->
          (* [$symbolstartpos(x)] does not exist. *)
          assert false
740
      | Position (RightNamed id, where, _) ->
741 742 743
          (* If the semantic action mentions [$startpos($i)], then the
             [i]-th symbol in the right-hand side must keep track of
             its start position. Similarly for end positions. *)
744 745
          Array.iteri (fun i id' ->
            if id = id' then
746
              record_ConVar true (rhs.(i), where)
747
          ) ids
748
    ) (Action.keywords action)
749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775

  ); (* end of loop on productions *)

  Lr1.iterx (fun s ->
    (* Let [sym] be the incoming symbol of state [s]. *)
    let sym = Misc.unSome (Lr1.incoming_symbol s) in

    (* Condition (1) in the long comment above (2015/11/11). If an epsilon
       production [prod] can be reduced in state [s], if its left-hand side
       [nt] keeps track of its start or end position, then [sym] must keep
       track of its end position. *)
    TerminalMap.iter (fun _ prods ->
      let prod = Misc.single prods in
      let nt, rhs = Production.def prod in
      let length = Array.length rhs in
      if length = 0 then begin
        record_VarVar (Symbol.N nt, WhereStart) (sym, WhereEnd);
        record_VarVar (Symbol.N nt, WhereEnd) (sym, WhereEnd)
      end
    ) (Lr1.reductions s);

    (* Condition (2) in the long comment above (2015/11/11). If a production
       can be reduced in state [s] and mentions [$endpos($0)], then [sym]
       must keep track of its end position. *)
    if Lr1.has_beforeend s then
      record_ConVar true (sym, WhereEnd)

776 777
  )

778 779 780 781 782 783 784 785 786 787 788 789
let track : variable -> bool =
  solve()

let startp symbol =
  track (symbol, WhereStart)

let endp symbol =
  track (symbol, WhereEnd)

let for_every_symbol (f : Symbol.t -> unit) : unit =
  Terminal.iter (fun t -> f (Symbol.T t));
  Nonterminal.iter (fun nt -> f (Symbol.N nt))
790

791 792 793 794
let sum_over_every_symbol (f : Symbol.t -> bool) : int =
  let c = ref 0 in
  for_every_symbol (fun sym -> if f sym then c := !c + 1);
  !c
795 796 797 798 799 800

let () =
  Error.logC 1 (fun f ->
    Printf.fprintf f
      "%d out of %d symbols keep track of their start position.\n\
       %d out of %d symbols keep track of their end position.\n"
801 802
        (sum_over_every_symbol startp) (Terminal.n + Nonterminal.n)
        (sum_over_every_symbol endp) (Terminal.n + Nonterminal.n))
803 804 805 806 807 808 809 810 811 812 813

(* ------------------------------------------------------------------------- *)
(* Miscellaneous. *)

let universal symbol =
  Lr1.fold (fun universal s ->
    universal && (if represented s then SymbolMap.mem symbol (Lr1.transitions s) else true)
  ) true

(* ------------------------------------------------------------------------ *)
(* Discover which states can peek at an error. These are the states
814 815
   where an error token may be on the stream. These are the states
   that are targets of a reduce action on [error]. *)
816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832

(* 2012/08/25 I am optimizing this code, whose original version I found had
   quadratic complexity. The problem is as follows. We can easily iterate over
   all states to find which states [s] have a reduce action on error. What we
   must find out, then, is into which state [t] this reduce action takes us.
   This is not easy to predict, as it depends on the contents of the stack.
   The original code used an overapproximation, as follows: if the reduction
   concerns a production whose head symbol is [nt], then all of the states
   that have an incoming transition labeled [nt] are potential targets. The
   new version of the code below relies on the same approximation, but uses
   two successive loops instead of two nested loops. *)

let errorpeekers =
  (* First compute a set of symbols [nt]... *)
  let nts : SymbolSet.t =
    Lr1.fold (fun nts node ->
      try
833 834 835 836
        let prods = TerminalMap.lookup Terminal.error (Lr1.reductions node) in
        let prod = Misc.single prods in
        let nt = Production.nt prod in
        SymbolSet.add (Symbol.N nt) nts
837
      with Not_found ->
838
        nts
839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889
    ) SymbolSet.empty
  in
  (* ... then compute the set of all target states of all transitions
     labeled by some symbol in the set [nt]. *)
  SymbolSet.fold (fun nt errorpeekers ->
    Lr1.targets (fun errorpeekers _ target ->
      Lr1.NodeSet.add target errorpeekers
    ) errorpeekers nt
  ) nts Lr1.NodeSet.empty

let errorpeeker node =
  Lr1.NodeSet.mem node errorpeekers

(* ------------------------------------------------------------------------ *)
(* Here is how we check whether state [s] should have a default
   reduction.

   We check whether [s] has no outgoing shift transitions and only has
   one possible reduction action. In that case, we produce a default
   reduction action, that is, we perform reduction without consulting
   the lookahead token. This saves code, but can alter the parser's
   behavior in the presence of errors.

   The check for default actions subsumes the check for the case where
   [s] admits a reduce action with lookahead symbol "#". In that case,
   it must be the only possible action -- see
   [Lr1.default_conflict_resolution]. That is, we have reached a point
   where we have recognized a well-formed input and are now expecting
   an end-of-stream. In that case, performing reduction without
   looking at the next token is the right thing to do, since there
   should in fact be none. The state that we reduce to will also have
   the same property, and so on, so we will in fact end up rewinding
   the entire stack and accepting the input when the stack becomes
   empty.

   (New as of 2012/01/23.) A state where a shift/reduce conflict was
   solved in favor of neither (due to a use of the %nonassoc
   directive) must not perform a default reduction. Indeed, this would
   effectively mean that the failure that was requested by the user is
   forgotten and replaced with a reduction. This surprising behavior
   is present in ocamlyacc and was present in earlier versions of
   Menhir. See e.g. http://caml.inria.fr/mantis/view.php?id=5462

   There is a chance that we might run into trouble if the ideas
   described in the above two paragraphs collide, that is, if we
   forbid a default reduction (due to a shift/reduce conflict solved
   by %nonassoc) in a node where we would like to have default
   reduction on "#". This situation seems unlikely to arise, so I will
   not do anything about it for the moment. (Furthermore, someone who
   uses precedence declarations is looking for trouble anyway.)

890 891 892 893 894 895 896
   Between 2012/05/25 and 2015/09/25, if [--canonical] has been specified,
   then we disallow default reductions on a normal token, because we do not
   want to introduce any spurious actions into the automaton. We do still
   allow default reductions on "#", since they are needed for the automaton to
   terminate properly. From 2015/09/25 on, we again always allow default
   reductions, as they seem to be beneficial when explaining syntax errors. *)

897 898 899 900 901 902 903
let (has_default_reduction : Lr1.node -> (Production.index * TerminalSet.t) option), hdrcount =
  Misc.tabulateo Lr1.number Lr1.fold Lr1.n (fun s ->

    if Lr1.forbid_default_reduction s then
      None
    else

904 905 906
      let reduction = ProductionMap.is_singleton (Lr1.invert (Lr1.reductions s)) in
      match reduction with
      | Some _ ->
907
          if SymbolMap.purelynonterminal (Lr1.transitions s)
908 909
          then reduction
          else None
910
      | None ->
911
          reduction
912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934

  )

let () =
  Error.logC 1 (fun f ->
    Printf.fprintf f
       "%d out of %d states have a default reduction.\n"
       hdrcount Lr1.n)

(* ------------------------------------------------------------------------ *)

let () =
  Time.tick "Constructing the invariant"

(* ------------------------------------------------------------------------ *)

(* If any fatal error was signaled up to this point, stop now. This may include
   errors signaled in the modules [lr1] and [invariant] by calling the function
   [Error.grammar_warning]. *)

let () =
  if Error.errors() then
    exit 1