(* ErrorReporting.ml *)
(* An explanation is a description of what the parser has recognized in the
   recent past and what it expects next. *)

type ('item, 'symbol) explanation = {

  (* An explanation is based on an item. *)
  item: 'item;

  (* A past. This is a non-empty sequence of (terminal and non-terminal)
     symbols, each of which corresponds to a range of the input file. These
     symbols correspond to the first half (up to the bullet) of the item's
     right-hand side. In short, they represent what we have recognized in
     the recent past. *)
  past: ('symbol * Lexing.position * Lexing.position) list;

  (* A future. This is a non-empty sequence of (terminal and non-terminal)
     symbols. These symbols correspond to the second half (after the bullet)
     of the item's right-hand side. In short, they represent what we expect
     to recognize in the future, if this item is a good prediction. *)
  future: 'symbol list;

  (* A goal. This is a non-terminal symbol. It corresponds to the item's
     left-hand side. In short, it represents the reduction that we will
     be able to perform if we successfully recognize this future. *)
  goal: 'symbol

}

module Make
  (I : IncrementalEngine.EVERYTHING)
  (User : sig

    (* In order to submit artificial tokens to the parser, we need a function
       that converts a terminal symbol to a token. Unfortunately, we cannot
       (in general) auto-generate this code, because it requires making up
       semantic values of arbitrary OCaml types. *)

    val terminal2token: _ I.terminal -> I.token

  end)
= struct

  open General
  open I
  open User

  (* [items_current env] assumes that [env] is not an initial state (which
     implies that the stack is non-empty). Under this assumption, it extracts
     the automaton's current state, i.e., the LR(1) state found in the top
     stack cell. It then goes through [items] so as to obtain the LR(0) items
     associated with this state. *)

  let items_current env : item list =
    (* Peek at the stack; its top cell, if any, holds the current state. *)
    match Lazy.force (stack env) with
    | Cons (Element (state, _, _, _), _) ->
        (* Query the LR(0) items of this state. Returning a set of items,
           rather than the state itself, is convenient: the state would
           have to be wrapped in an existential type. *)
        items state
    | Nil ->
        (* An empty stack means the parser is in an initial state, which
           contradicts our assumption. This should not happen. *)
        invalid_arg "items_current"

67 68
  (* [is_shift_item t item] determines whether [item] justifies a shift
     transition along the terminal symbol [t]. *)
69

70
  let is_shift_item (t : _ terminal) (prod, index) : bool =
71 72 73 74 75 76 77 78
    let rhs = rhs prod in
    let length = List.length rhs in
    assert (0 < index && index <= length);
    (* We test that there is one symbol after the bullet and this symbol
       is [t] or can generate a word that begins with [t]. (Note that we
       don't need to worry about the case where this symbol is nullable
       and [t] is generated by the following symbol. In that situation,
       we would have to reduce before we can shift [t].) *)
79
    index < length && xfirst (List.nth rhs index) t
80

81 82 83 84 85 86 87 88 89 90
  let compare_explanations x1 x2 =
    let c = compare_items x1.item x2.item in
    (* TEMPORARY checking that if [c] is 0 then the positions are the same *)
    assert (
      c <> 0 || List.for_all2 (fun (_, start1, end1) (_, start2, end2) ->
        start1.Lexing.pos_cnum = start2.Lexing.pos_cnum &&
        end1.Lexing.pos_cnum = end2.Lexing.pos_cnum
      ) x1.past x2.past
    );
    c
91

92
  (* [marry past stack] TEMPORARY comment *)
93 94 95 96 97 98 99 100 101 102

  let rec marry past stack =
    match past, stack with
    | [], _ ->
        []
    | symbol :: past, lazy (Cons (Element (s, _, startp, endp), stack)) ->
        assert (compare_symbols symbol (X (incoming_symbol s)) = 0);
        (symbol, startp, endp) :: marry past stack
    | _ :: _, lazy Nil ->
        assert false
103 104 105 106 107

  (* [investigate t result] assumes that [result] has been obtained by
     offering the terminal symbol [t] to the parser. It runs the parser,
     through an arbitrary number of reductions, until the parser either
     accepts this token (i.e., shifts) or rejects it (i.e., signals an
     error). If the parser decides to shift, then the shift items found
     in the LR(1) state before the shift are used to produce new
     explanations. *)

  (* It is desirable that the semantic actions be side-effect free, or
     that their side-effects be harmless (replayable). *)

  let rec investigate (t : _ terminal) (result : _ result) explanations =
    match result with
    | Shifting (env, _, _) ->
        (* The parser is about to shift, which means it is willing to
           consume the terminal symbol [t]. In the state before the
           transition, look at the items that justify shifting [t].
           We view these items as explanations: they explain what
           we have read and what we expect to read. *)
        let stack = stack env in
        List.fold_left (fun explanations item ->
          if is_shift_item t item then
            let prod, index = item in
            let rhs = rhs prod in
            {
              item = item;
              (* The first half of the right-hand side, paired with the
                 positions recorded on the stack. *)
              past = List.rev (marry (List.rev (take index rhs)) stack);
              (* The second half of the right-hand side. *)
              future = drop index rhs;
              goal = lhs prod
            } :: explanations
          else
            explanations
        ) explanations (items_current env)
    | AboutToReduce _ ->
        (* The parser wishes to reduce. Just follow. *)
        investigate t (resume result) explanations
    | HandlingError _ ->
        (* The parser fails, which means the terminal symbol [t] does
           not make sense at this point. Thus, no new explanations of
           what the parser expects need be produced. *)
        explanations
    | InputNeeded _
    | Accepted _
    | Rejected ->
        (* None of these cases can arise. Indeed, after a token is submitted
           to it, the parser must shift, reduce, or signal an error, before
           it can request another token or terminate. *)
        assert false
152 153 154 155 156 157 158
  (* [investigate pos result] assumes that [result] is of the form
     [InputNeeded _].  For every terminal symbol [t], it investigates
     how the parser reacts when fed the symbol [t], and returns a list
     of explanations. The position [pos] is where a syntax error was
     detected; it is used when manufacturing dummy tokens. This is
     important because the position of the dummy token may end up in
     the explanations that we produce. *)
159

160
  let investigate pos (result : _ result) =
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
    weed compare_explanations (
      foreach_terminal_but_error (fun symbol explanations ->
        match symbol with
        | X (N _) -> assert false
        | X (T t) ->
            (* Build a dummy token for the terminal symbol [t]. *)
            let token = (terminal2token t, pos, pos) in
            (* Submit it to the parser. Accumulate explanations. *)
            investigate t (offer result token) explanations
      ) []
    )

  (* TEMPORARY copied from engine.ml; move/share with [Convert] *)

  (* A [reader] produces, on demand, a token together with its start and
     end positions. *)
  type reader =
    unit -> token * Lexing.position * Lexing.position

  (* [wrap lexer lexbuf] packages a traditional lexer and lexing buffer as
     a [reader]: each call runs the lexer once and reads the positions out
     of the lexing buffer. *)
  let wrap (lexer : Lexing.lexbuf -> token) (lexbuf : Lexing.lexbuf) : reader =
    fun () ->
      let tok = lexer lexbuf in
      let startp = lexbuf.Lexing.lex_start_p in
      let endp = lexbuf.Lexing.lex_curr_p in
      tok, startp, endp

185 186 187 188 189
  (* The following is a custom version of the loop found in [Engine]. It
     drives the parser in the usual way, but keeps a checkpoint, which is the
     last [InputNeeded] result. If a syntax error is detected, it goes back to
     this state and analyzes it in order to produce a meaningful
     diagnostic. *)
190

191
  exception Error of (Lexing.position * Lexing.position) * (item, xsymbol) explanation list
192 193 194 195 196 197 198 199 200 201

  (* TEMPORARY why loop-style? we should offer a simplified incremental API *)

  (* A pair of parser results, threaded through [loop] below. *)
  type 'a result = {
    (* The last [InputNeeded] result that was observed. We return to it
       when a syntax error is detected. *)
    checkpoint: 'a I.result;
    (* The parser's current result. *)
    current: 'a I.result
  }

  (* [loop read r] drives the parser, obtaining tokens from [read], while
     maintaining in [r.checkpoint] the last [InputNeeded] result that was
     observed. If a syntax error is detected, we go back to this checkpoint,
     [investigate], and raise [Error]. *)

  let rec loop (read : reader) ({ checkpoint; current } : 'a result) : 'a =
    match current with
    | InputNeeded _ ->
        (* Update the checkpoint. *)
        let checkpoint = current in
        let triple = read() in
        let current = offer current triple in
        loop read { checkpoint; current }
    | Shifting _
    | AboutToReduce _ ->
        let current = resume current in
        loop read { checkpoint; current }
    | HandlingError env ->
        (* The parser signals a syntax error. Note the position of the
           problematic token, which is useful. Then, go back to the
           checkpoint and investigate. *)
        let (startp, _) as positions = positions env in
        raise (Error (positions, investigate startp checkpoint))
    | Accepted v ->
        v
    | Rejected ->
        (* The parser rejects this input. This cannot happen, because
           we stop as soon as the parser reports [HandlingError]. *)
        assert false

  (* [entry start lexer lexbuf] runs the parser, beginning in the result
     [start] and drawing tokens from [lexer] and [lexbuf]. *)
  let entry (start : 'a I.result) lexer lexbuf =
    (* The parser cannot accept or reject before it asks for the very first
       character of input. (Indeed, we statically reject a symbol that
       generates the empty language or the singleton language {epsilon}.)
       So, [start] must be [InputNeeded _]. *)
    let is_input_needed =
      match start with InputNeeded _ -> true | _ -> false
    in
    assert is_input_needed;
    loop (wrap lexer lexbuf) { checkpoint = start; current = start }

end