engine.ml 22.1 KB
Newer Older
1 2 3 4 5 6 7 8 9
open EngineTypes

(* The LR parsing engine. *)

(* This module is used:

   - at compile time, if so requested by the user, via the --interpret options;
   - at run time, in the table-based back-end. *)

10 11 12 13 14 15 16 17 18 19
(* A tainted dummy position. In principe, it should never be exposed. *)

let dummy_pos =
  let open Lexing in {
    pos_fname = "<MenhirLib.Engine>";
    pos_lnum = 0;
    pos_bol = 0;
    pos_cnum = -1;
  }

20 21 22 23 24 25
module Make (T : TABLE) = struct

  (* This propagates type and exception definitions. *)

  include T

26
  type env =
27
      (state, semantic_value, token) EngineTypes.env
28

29 30
  (* --------------------------------------------------------------------------- *)

31
  (* The type [checkpoint] represents an intermediate or final result of the
32
     parser. See [EngineTypes]. *)
33

34 35
  (* The type [checkpoint] is presented to the user as a private type (see
     [IncrementalEngine]). This prevents the user from manufacturing checkpoints
36 37 38
     (i.e., continuations) that do not make sense. (Such continuations could
     potentially violate the LR invariant and lead to crashes.) *)

39
  type 'a checkpoint =
40
    | InputNeeded of env
41
    | Shifting of env * env * bool
42 43
    | AboutToReduce of env * production
    | HandlingError of env
44
    | Accepted of 'a
45
    | Rejected
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68

  (* --------------------------------------------------------------------------- *)

  (* In the code-based back-end, the [run] function is sometimes responsible
     for pushing a new cell on the stack. This is motivated by code sharing
     concerns. In this interpreter, there is no such concern; [run]'s caller
     is always responsible for updating the stack. *)

  (* In the code-based back-end, there is a [run] function for each state
     [s]. This function can behave in two slightly different ways, depending
     on when it is invoked, or (equivalently) depending on [s].

     If [run] is invoked after shifting a terminal symbol (or, equivalently,
     if [s] has a terminal incoming symbol), then [run] discards a token,
     unless [s] has a default reduction on [#]. (Indeed, in that case,
     requesting the next token might drive the lexer off the end of the input
     stream.)

     If, on the other hand, [run] is invoked after performing a goto transition,
     or invoked directly by an entry point, then there is nothing to discard.

     These two cases are reflected in [CodeBackend.gettoken].

69 70 71 72
     Here, the code is structured in a slightly different way. It is up to the
     caller of [run] to indicate whether to discard a token, via the parameter
     [please_discard]. This flag is set when [s] is being entered by shifting
     a terminal symbol and [s] does not have a default reduction on [#]. *)
73

74
  (* The following recursive group of functions are tail recursive, produce a
75 76 77
     checkpoint of type [semantic_value checkpoint], and cannot raise an
     exception. A semantic action can raise [Error], but this exception is
     immediately caught within [reduce]. *)
78

79
  let rec run env please_discard : semantic_value checkpoint =
80 81 82

    (* Log the fact that we just entered this state. *)
    
83 84
    if log then
      Log.state env.current;
85

86 87 88 89 90
    (* If [please_discard] is set, we discard the current lookahead token and
       fetch the next one. In order to request a token from the user, we
       return an [InputNeeded] continuation, which, when invoked by the user,
       will take us to [discard]. If [please_discard] is not set, we skip this
       step and jump directly to [check_for_default_reduction]. *)
91 92

    if please_discard then
93
      InputNeeded env
94 95 96
    else
      check_for_default_reduction env

97 98
  (* [discard env triple] stores [triple] into [env], overwriting the previous
     token. It is invoked by [offer], which itself is invoked by the user in
99
     response to an [InputNeeded] checkpoint. *)
100

101
  and discard env triple =
102 103 104 105
    if log then begin
      let (token, startp, endp) = triple in
      Log.lookahead_token (T.token2terminal token) startp endp
    end;
106
    let env = { env with error = false; triple } in
107 108 109
    check_for_default_reduction env

  and check_for_default_reduction env =
110 111 112 113 114 115

    (* Examine what situation we are in. This case analysis is analogous to
       that performed in [CodeBackend.gettoken], in the sub-case where we do
       not have a terminal incoming symbol. *)

    T.default_reduction
116
      env.current
117
      announce_reduce       (* there is a default reduction; perform it *)
118
      check_for_error_token (* there is none; continue below *)
119 120
      env

121
  and check_for_error_token env =
122 123 124 125 126

    (* There is no default reduction. Consult the current lookahead token
       so as to determine which action should be taken. *)

    (* Peeking at the first input token, without taking it off the input
127
       stream, is done by reading [env.triple]. We are careful to first
128
       check [env.error]. *)
129

130 131
    (* Note that, if [please_discard] was true, then we have just called
       [discard], so the lookahead token cannot be [error]. *)
132

133 134 135
    (* Returning [HandlingError env] is equivalent to calling [error env]
       directly, except it allows the user to regain control. *)

136
    if env.error then begin
137 138
      if log then
        Log.resuming_error_handling();
139
      HandlingError env
140 141
    end
    else
142
      let (token, _, _) = env.triple in
143

144 145 146
      (* We consult the two-dimensional action table, indexed by the
         current state and the current lookahead token, in order to
         determine which action should be taken. *)
147

148 149 150 151 152
      T.action
        env.current                    (* determines a row *)
        (T.token2terminal token)       (* determines a column *)
        (T.token2value token)
        shift                          (* shift continuation *)
153
        announce_reduce                (* reduce continuation *)
154 155
        initiate                       (* failure continuation *)
        env
156 157 158 159 160 161 162

  (* --------------------------------------------------------------------------- *)

  (* This function takes care of shift transitions along a terminal symbol.
     (Goto transitions are taken care of within [reduce] below.) The symbol
     can be either an actual token or the [error] pseudo-token. *)

163
  (* Here, the lookahead token CAN be [error]. *)
164

165 166 167 168
  and shift env
      (please_discard : bool)
      (terminal : terminal)
      (value : semantic_value)
169
      (s' : state) =
170 171 172

    (* Log the transition. *)

173 174
    if log then
      Log.shift terminal s';
175 176 177 178

    (* Push a new cell onto the stack, containing the identity of the
       state that we are leaving. *)

179
    let (_, startp, endp) = env.triple in
180
    let stack = {
181 182
      state = env.current;
      semv = value;
183 184
      startp;
      endp;
185
      next = env.stack;
186
    } in
187 188 189

    (* Switch to state [s']. *)

190 191 192 193 194 195 196 197 198 199
    let new_env = { env with stack; current = s' } in

    (* Expose the transition to the user. (In principle, we have a choice
       between exposing the transition before we take it, after we take
       it, or at some point in between. This affects the number and type
       of the parameters carried by [Shifting]. Here, we choose to expose
       the transition after we take it; this allows [Shifting] to carry
       only three parameters, whose meaning is simple.) *)

    Shifting (env, new_env, please_discard)
200 201 202

  (* --------------------------------------------------------------------------- *)

203
  (* The function [announce_reduce] stops the parser and returns a checkpoint
204 205
     which allows the parser to be resumed by calling [reduce]. *)

206 207 208 209
  (* Only ordinary productions are exposed to the user. Start productions
     are not exposed to the user. Reducing a start production simply leads
     to the successful termination of the parser. *)

210
  and announce_reduce env (prod : production) =
211 212 213 214
    if T.is_start prod then
      accept env prod
    else
      AboutToReduce (env, prod)
215

216 217
  (* The function [reduce] takes care of reductions. It is invoked by
     [resume] after an [AboutToReduce] event has been produced. *)
218

219
  (* Here, the lookahead token CAN be [error]. *)
220

221 222
  (* The production [prod] CANNOT be a start production. *)

223
  and reduce env (prod : production) =
224 225 226

    (* Log a reduction event. *)

227 228
    if log then
      Log.reduce_or_accept prod;
229 230

    (* Invoke the semantic action. The semantic action is responsible for
231
       truncating the stack and pushing a new cell onto the stack, which
232
       contains a new semantic value. It can raise [Error]. *)
233

234
    (* If the semantic action terminates normally, it returns a new stack,
235
       which becomes the current stack. *)
236

237 238
    (* If the semantic action raises [Error], we catch it and initiate error
       handling. *)
239

240 241 242 243
    (* This [match/with/exception] construct requires OCaml 4.02. *)

    match T.semantic_action prod env with
    | stack ->
244

245 246 247
        (* By our convention, the semantic action has produced an updated
           stack. The state now found in the top stack cell is the return
           state. *)
248

249 250 251
        (* Perform a goto transition. The target state is determined
           by consulting the goto table at the return state and at
           production [prod]. *)
252

253 254 255
        let current = T.goto stack.state prod in
        let env = { env with stack; current } in
        run env false
256

257
    | exception Error ->
258
        initiate env
259

260 261 262 263 264 265 266 267 268
  and accept env prod =
    (* Log an accept event. *)
    if log then
      Log.reduce_or_accept prod;
    (* Extract the semantic value out of the stack. *)
    let v = env.stack.semv in
    (* Finish. *)
    Accepted v

269 270 271 272
  (* --------------------------------------------------------------------------- *)

  (* The following functions deal with errors. *)

273 274
  (* [initiate] initiates or resumes error handling. *)

275
  (* Here, the lookahead token CAN be [error]. *)
276

277
  and initiate env =
278 279
    if log then
      Log.initiating_error_handling();
280
    let env = { env with error = true } in
281
    HandlingError env
282 283 284

  (* [error] handles errors. *)

285
  and error env =
286
    assert env.error;
287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307

    (* Consult the column associated with the [error] pseudo-token in the
       action table. *)

    T.action
      env.current                    (* determines a row *)
      T.error_terminal               (* determines a column *)
      T.error_value
      error_shift                    (* shift continuation *)
      error_reduce                   (* reduce continuation *)
      error_fail                     (* failure continuation *)
      env

  and error_shift env please_discard terminal value s' =

    (* Here, [terminal] is [T.error_terminal], and [value] is [T.error_value]. *)

    assert (terminal = T.error_terminal && value = T.error_value);

    (* This state is capable of shifting the [error] token. *)

308 309
    if log then
      Log.handling_error env.current;
310 311 312 313 314 315
    shift env please_discard terminal value s'

  and error_reduce env prod =

    (* This state is capable of performing a reduction on [error]. *)

316 317
    if log then
      Log.handling_error env.current;
318
    reduce env prod
319 320 321
      (* Intentionally calling [reduce] instead of [announce_reduce].
         It does not seem very useful, and it could be confusing, to
         expose the reduction steps taken during error handling. *)
322 323 324 325 326 327 328 329 330 331 332 333

  and error_fail env =

    (* This state is unable to handle errors. Attempt to pop a stack
       cell. *)

    let cell = env.stack in
    let next = cell.next in
    if next == cell then

      (* The stack is empty. Die. *)

334
      Rejected
335 336 337 338 339 340

    else begin

      (* The stack is nonempty. Pop a cell, updating the current state
	 with that found in the popped cell, and try again. *)

341 342 343 344
      let env = { env with
        stack = next;
        current = cell.state
      } in
345
      HandlingError env
346 347 348

    end

349 350
  (* End of the nest of tail recursive functions. *)

POTTIER Francois's avatar
POTTIER Francois committed
351
  (* --------------------------------------------------------------------------- *)
352 353
  (* --------------------------------------------------------------------------- *)

POTTIER Francois's avatar
POTTIER Francois committed
354
  (* The incremental interface. See [EngineTypes]. *)
355

POTTIER Francois's avatar
POTTIER Francois committed
356
  (* [start s] begins the parsing process. *)
357

358
  let start (s : state) : semantic_value checkpoint =
359
    
360
    (* Build an empty stack. This is a dummy cell, which is its own
POTTIER Francois's avatar
POTTIER Francois committed
361 362 363
       successor. Its fields other than [next] contain dummy values.
       Its [next] field WILL be accessed by [error_fail] if an error
       occurs and is propagated all the way until the stack is empty. *)
364 365 366 367

    let rec empty = {
      state = s;                          (* dummy *)
      semv = T.error_value;               (* dummy *)
368 369
      startp = dummy_pos;                 (* dummy *)
      endp = dummy_pos;                   (* dummy *)
370 371 372 373 374
      next = empty;
    } in

    (* Build an initial environment. *)

POTTIER Francois's avatar
POTTIER Francois committed
375 376 377 378 379 380 381 382
    (* Unfortunately, there is no type-safe way of constructing a
       dummy token. Tokens carry semantic values, which in general
       we cannot manufacture. This instance of [Obj.magic] could
       be avoided by adopting a different representation (e.g., no
       [env.error] field, and an option in the first component of
       [env.triple]), but I like this representation better. *)

    let dummy_token = Obj.magic () in
383
    let env = {
384
      error = false;
385
      triple = (dummy_token, dummy_pos, dummy_pos); (* dummy *)
386 387 388 389
      stack = empty;
      current = s;
    } in

390
    (* Begin parsing. *)
391

392 393 394 395 396
    (* The parameter [please_discard] here is [true], which means we know
       that we must read at least one token. This claim relies on the fact
       that we have ruled out the two special cases where a start symbol
       recognizes the empty language or the singleton language {epsilon}. *)

397
    run env true
398

399 400 401
  (* [offer checkpoint triple] is invoked by the user in response to a
     checkpoint of the form [InputNeeded env]. It checks that [checkpoint] is
     indeed of this form, and invokes [discard]. *)
402

403 404 405 406
  (* [resume checkpoint] is invoked by the user in response to a checkpoint of
     the form [AboutToReduce (env, prod)] or [HandlingError env]. It checks
     that [checkpoint] is indeed of this form, and invokes [reduce] or
     [error], as appropriate. *)
407

408
  (* In reality, [offer] and [resume] accept an argument of type
409 410 411
     [semantic_value checkpoint] and produce a checkpoint of the same type.
     The choice of [semantic_value] is forced by the fact that this is the
     parameter of the checkpoint [Accepted]. *)
412

413
  (* We change this as follows. *)
414

415
  (* We change the argument and result type of [offer] and [resume] from
416 417 418 419 420 421 422 423 424 425 426 427 428 429
     [semantic_value checkpoint] to ['a checkpoint]. This is safe, in this
     case, because we give the user access to values of type [t checkpoint]
     only if [t] is indeed the type of the eventual semantic value for this
     run. (More precisely, by examining the signatures [INCREMENTAL_ENGINE]
     and [INCREMENTAL_ENGINE_START], one finds that the user can build a value
     of type ['a checkpoint] only if ['a] is [semantic_value]. The table
     back-end goes further than this and produces versions of [start] composed
     with a suitable cast, which give the user access to a value of type
     [t checkpoint] where [t] is the type of the start symbol.) *)

  let offer : 'a . 'a checkpoint ->
                   token * Lexing.position * Lexing.position ->
                   'a checkpoint
  = function
430 431 432 433 434
    | InputNeeded env ->
        Obj.magic discard env
    | _ ->
        raise (Invalid_argument "offer expects InputNeeded")

435
  let resume : 'a . 'a checkpoint -> 'a checkpoint = function
436 437
    | HandlingError env ->
        Obj.magic error env
438 439
    | Shifting (_, env, please_discard) ->
        Obj.magic run env please_discard
440 441 442 443
    | AboutToReduce (env, prod) ->
        Obj.magic reduce env prod
    | _ ->
        raise (Invalid_argument "resume expects HandlingError | AboutToReduce")
444

POTTIER Francois's avatar
POTTIER Francois committed
445 446 447 448
  (* --------------------------------------------------------------------------- *)
  (* --------------------------------------------------------------------------- *)

  (* The traditional interface. See [EngineTypes]. *)
449 450 451

  (* --------------------------------------------------------------------------- *)

452
  (* Wrapping a lexer and lexbuf as a token supplier. *)
453

454
  type supplier =
POTTIER Francois's avatar
POTTIER Francois committed
455 456
    unit -> token * Lexing.position * Lexing.position

457
  let lexer_lexbuf_to_supplier
458 459
      (lexer : Lexing.lexbuf -> token)
      (lexbuf : Lexing.lexbuf)
460
  : supplier =
461 462 463 464 465 466 467 468
    fun () ->
      let token = lexer lexbuf in
      let startp = lexbuf.Lexing.lex_start_p
      and endp = lexbuf.Lexing.lex_curr_p in
      token, startp, endp

  (* --------------------------------------------------------------------------- *)

469 470 471 472
  (* The main loop repeatedly handles intermediate checkpoints, until a final
     checkpoint is obtained. This allows implementing the monolithic interface
     ([entry]) in terms of the incremental interface ([start], [offer],
     [handle], [reduce]). *)
473 474 475

  (* By convention, acceptance is reported by returning a semantic value, whereas
     rejection is reported by raising [Error]. *)
POTTIER Francois's avatar
POTTIER Francois committed
476

477 478 479 480
  (* [loop] is polymorphic in ['a]. No cheating is involved in achieving this.
     All of the cheating resides in the types assigned to [offer] and [handle]
     above. *)

481 482 483
  let rec loop : 'a . supplier -> 'a checkpoint -> 'a =
    fun read checkpoint ->
    match checkpoint with
484
    | InputNeeded _ ->
485 486
        (* The parser needs a token. Request one from the lexer,
           and offer it to the parser, which will produce a new
487
           checkpoint. Then, repeat. *)
POTTIER Francois's avatar
POTTIER Francois committed
488
        let triple = read() in
489 490
        let checkpoint = offer checkpoint triple in
        loop read checkpoint
491
    | Shifting _
492 493
    | AboutToReduce _
    | HandlingError _ ->
494 495
        (* The parser has suspended itself, but does not need
           new input. Just resume the parser. Then, repeat. *)
496 497
        let checkpoint = resume checkpoint in
        loop read checkpoint
POTTIER Francois's avatar
POTTIER Francois committed
498
    | Accepted v ->
499 500
        (* The parser has succeeded and produced a semantic value.
           Return this semantic value to the user. *)
POTTIER Francois's avatar
POTTIER Francois committed
501 502
        v
    | Rejected ->
503
        (* The parser rejects this input. Raise an exception. *)
POTTIER Francois's avatar
POTTIER Francois committed
504
        raise Error
505

POTTIER Francois's avatar
POTTIER Francois committed
506
  let entry (s : state) lexer lexbuf : semantic_value =
507
    loop (lexer_lexbuf_to_supplier lexer lexbuf) (start s)
508

509 510
  (* --------------------------------------------------------------------------- *)

511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540
  (* [loop_handle] stops if it encounters an error, and at this point, invokes
     its failure continuation, without letting Menhir do its own traditional
     error-handling (which involves popping the stack, etc.). *)

  let rec loop_handle succeed fail read checkpoint =
    match checkpoint with
    | InputNeeded _ ->
        (* The parser needs a token. Request one from the lexer,
           and offer it to the parser, which will produce a new
           checkpoint. Then, repeat. *)
        let triple = read() in
        let checkpoint = offer checkpoint triple in
        loop_handle succeed fail read checkpoint
    | Shifting _
    | AboutToReduce _ ->
        (* The parser has suspended itself, but does not need
           new input. Just resume the parser. Then, repeat. *)
        let checkpoint = resume checkpoint in
        loop_handle succeed fail read checkpoint
    | HandlingError _
    | Rejected ->
        (* The parser has detected an error. Invoke the failure continuation. *)
        fail checkpoint
    | Accepted v ->
        (* The parser has succeeded and produced a semantic value.
           Return this semantic value to the user. *)
        succeed v

  (* --------------------------------------------------------------------------- *)

541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
  (* The type ['a lr1state] describes the (non-initial) states of the LR(1)
     automaton. The index ['a] represents the type of the semantic value
     associated with the state's incoming symbol. *)

  (* The type ['a lr1state] is defined as an alias for [state], which itself
     is usually defined as [int] (see [TableInterpreter]). So, ['a lr1state]
     is technically a phantom type, but should really be thought of as a GADT
     whose data constructors happen to be represented as integers. It is
     presented to the user as an abstract type (see [IncrementalEngine]). *)

  type 'a lr1state =
      state

  (* --------------------------------------------------------------------------- *)

556 557 558 559 560 561 562
  (* Stack inspection. *)

  (* We offer a read-only view of the parser's state as a stream of elements.
     Each element contains a pair of a (non-initial) state and a semantic
     value associated with (the incoming symbol of) this state. Note that the
     type [element] is an existential type. *)

563
  type element =
564 565
    | Element: 'a lr1state * 'a * Lexing.position * Lexing.position -> element

566 567
  open General

568 569 570
  type stack =
    element stream

571
  (* If [current] is the current state and [cell] is the top stack cell,
572
     then [stack cell current] is a view of the parser's state as a stream
573
     of elements. *)
574

575
  let rec stack cell current : element stream =
576 577 578 579 580 581 582 583 584 585 586 587
    lazy (
      (* The stack is empty iff the top stack cell is its own successor. In
         that case, the current state [current] should be an initial state
         (which has no incoming symbol).
         We do not allow the user to inspect this state. *)
      let next = cell.next in
      if next == cell then
        Nil
      else
        (* Construct an element containing the current state [current] as well
           as the semantic value contained in the top stack cell. This semantic
           value is associated with the incoming symbol of this state, so it
588 589 590 591 592 593
           makes sense to pair them together. The state has type ['a state] and
           the semantic value has type ['a], for some type ['a]. Here, the OCaml
           type-checker thinks ['a] is [semantic_value] and considers this code
           well-typed. Outside, we will use magic to provide the user with a way
           of inspecting states and recovering the value of ['a]. *)
        let element = Element (
594 595 596 597 598
          current,
          cell.semv,
          cell.startp,
          cell.endp
        ) in
599
        Cons (element, stack next cell.state)
600 601
    )

602 603
  let stack env : element stream =
    stack env.stack env.current
604

605 606 607 608 609 610 611 612 613 614 615 616
  (* --------------------------------------------------------------------------- *)

  (* Access to the position of the lookahead token. *)

  let positions { triple = (_, startp, endp); _ } =
    (* In principle, as soon as the lexer has been called at least once,
       [startp] cannot be a dummy position. Our dummy position risks
       exposure only if we are in the very initial state, as produced
       by [start s] above. We declare this situation illegal. *)
    assert (startp != dummy_pos && endp != dummy_pos);
    startp, endp

617 618
end