Commit d09e98a3 authored by Frédéric Bour

recover tool: import existing sources

parent d60be303
.PHONY: all clean test

all:
	dune build src/main.bc

clean:
	dune clean

test:
	dune build src/main.bc demo/calc.bc
	_build/default/src/main.bc _build/default/demo/parser.cmly
# Menhir-recover
This tool makes Menhir-generated parsers resilient to errors.
When a parser is successfully processed with Menhir-recover, parsing always
succeeds (a semantic value -- an AST -- is always produced).
This is done by generating input to complete the parse. Doing so requires at
least the ability to generate semantic values (e.g. to finish the parsing of
`1 +`, Menhir-recover needs to be told how to produce integers).
Besides that, the algorithm is grammar-independent: making a grammar
error-resilient, or maintaining an error-resilient one, requires little work.
## Using Menhir-recover
The integration relies on a few features of Menhir:
1. grammar attributes, which are used to pass meta-data to Menhir-recover
(how to produce integers in the case above);
2. Menhir's ability to save the grammar and the LR automaton in a
program-friendly form (`--cmly`);
3. a custom interpretation loop that handles errors at run time.
In practice, 1. means that some changes in the grammar are required, 2. means
that a few more steps are needed in the build system to connect Menhir to
Menhir-recover, and 3. means that the parser is invoked in a slightly different
way during execution.
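For the demo in this repository, step 1 is the `[@recover.expr ...]` attribute
attached to the `INT` token in the demo's `parser.mly`, and step 2 is the
`--cmly` flag passed to Menhir through the demo's `dune` file:
```
%token <int> INT [@recover.expr 0]
```
```
(menhir
 (modules parser)
 (flags "--table" "--cmly"))
```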
This tool only covers parts 1 and 2. The runtime library is provided
separately. (TODO: the runtime library is not there yet.)
The `MenhirSdk` module lets us access Menhir's representation of the grammar
and the automaton.
## The recovery pipeline
The algorithm for recovery splits the work into three main steps.
*Synthesis* answers the local question of producing the symbols necessary to
finish a rule. It does not need to look at the stack, just at the automaton.
For instance, if the user wrote invalid input after a parenthesis `(` in an
arithmetic language, *Synthesis* will find a way to reach the corresponding
`)`.
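As a rough illustration (not literal output of the tool): with the demo
calculator grammar, an error right after `( 1 +` can be repaired by
synthesizing an `INT` -- whose value comes from `[@recover.expr 0]` --
followed by the matching `RPAREN`:
```
input so far:    ( 1 +
synthesized:           INT RPAREN
completed parse: ( 1 + 0 )
```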
*Recovery* establishes plans to finish parsing by combining multiple syntheses.
It does so by enumerating enough prefixes of the stack to know that synthesis
will always be able to progress.
This ensures that recovery is complete: any state the parser can reach can be
recovered from.
These two steps are static. They happen at compile time and can be customized
using _grammar attributes_ (see [Menhir manual, section 13.2](http://gallium.inria.fr/~fpottier/menhir/manual.html#sec78)).
*Dynamic recovery* is the last step and happens during execution.
Using the knowledge produced by synthesis and recovery, it acts on a parser.
Interesting behaviors can be obtained by doing partial recovery and then
resuming the normal Menhir parser, but these decisions do not affect the static
analysis.
In other words (see the sketch after this list):
- *Synthesis* establishes a list of tactics to work with the grammar.
- *Recovery* establishes strategies that apply tactics repeatedly.
- *Dynamic recovery* applies those strategies to a concrete instance of a
parser.
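Below is a minimal sketch of what the dynamic step can look like with
MenhirLib's incremental API and the demo's `Parser`. It only shows the
mechanism for inserting one invented token; choosing *which* symbol to insert
is what the tables computed by synthesis and recovery are for, and the actual
runtime library is not part of this commit.
```ocaml
module I = Parser.MenhirInterpreter

(* Sketch only: feed one invented token to a suspended parser.  In the real
   runtime, the token (here INT, with the value given by [@recover.expr 0])
   would be chosen by consulting the recovery tables, not hard-coded. *)
let insert_int (checkpoint : int I.checkpoint) (pos : Lexing.position) =
  match checkpoint with
  | I.InputNeeded _ ->
      (* [I.offer] behaves as if the lexer had produced this token. *)
      I.offer checkpoint (Parser.INT 0, pos, pos)
  | _ -> checkpoint
```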
**TODO**: Find better names :]
**TODO**: The static and dynamic split in Recovery might be revisited.
The static step is useful to guarantee that recovery is complete.
Different algorithms can deal with the dynamic step; the current one is
greedy.
**TODO**: Implement it as an instance of Dijkstra's shortest-path algorithm.
There are good reasons to believe that in practice, it will be linear in the
size of the stack.
calc.exe
_build
.merlin
.PHONY: all clean test
DUNE := dune
EXECUTABLE := calc.exe

all:
	@ if command -v $(DUNE) > /dev/null ; then \
	  $(DUNE) build $(EXECUTABLE) ; \
	else \
	  echo "Error: $(DUNE) is required." ; \
	fi

clean:
	dune clean

test: all
	@echo "The following commands should print 42:"
	echo "(1 + 2 * 10) * 2" | ../_build/default/demo/$(EXECUTABLE)
	echo "([1 + 2] * 10) * 2" | ../_build/default/demo/$(EXECUTABLE)
open Lexing
(* A short name for the incremental parser API. *)
module I =
Parser.MenhirInterpreter
(* -------------------------------------------------------------------------- *)
(* Instead of a hand-written interpretation loop, this code exploits the
   functions [lexer_lexbuf_to_supplier] and [loop_handle] offered by
   Menhir. *)
let succeed (v : int) =
(* The parser has succeeded and produced a semantic value. Print it. *)
Printf.printf "%d\n%!" v
let fail lexbuf (_ : int I.checkpoint) =
(* The parser has suspended itself because of a syntax error. Stop. *)
Printf.fprintf stderr
"At offset %d: syntax error.\n%!"
(lexeme_start lexbuf)
let loop lexbuf result =
let supplier = I.lexer_lexbuf_to_supplier Lexer.token lexbuf in
I.loop_handle succeed (fail lexbuf) supplier result
(* -------------------------------------------------------------------------- *)
(* Initialize the lexer, and catch any exception raised by the lexer. *)
let process (line : string) =
let lexbuf = from_string line in
try
loop lexbuf (Parser.Incremental.main lexbuf.lex_curr_p)
with
| Lexer.Error msg ->
Printf.fprintf stderr "%s%!" msg
(* -------------------------------------------------------------------------- *)
(* The rest of the code is as in the [calc] demo. *)
let process (optional_line : string option) =
match optional_line with
| None ->
()
| Some line ->
process line
let rec repeat channel =
(* Attempt to read one line. *)
let optional_line, continue = Lexer.line channel in
process optional_line;
if continue then
repeat channel
let () =
repeat (from_channel stdin)
(ocamllex lexer)
(menhir
(modules parser)
(flags "--table" "--cmly"))
(executable
(name calc)
(libraries menhirLib))
{
open Parser
exception Error of string
}
(* This rule looks for a single line, terminated with '\n' or eof.
It returns a pair of an optional string (the line that was found)
and a Boolean flag (false if eof was reached). *)
rule line = parse
| ([^'\n']* '\n') as line
(* Normal case: one line, no eof. *)
{ Some line, true }
| eof
(* Normal case: no data, eof. *)
{ None, false }
| ([^'\n']+ as line) eof
(* Special case: some data but missing '\n', then eof.
Consider this as the last line, and add the missing '\n'. *)
{ Some (line ^ "\n"), false }
(* This rule analyzes a single line and turns it into a stream of
tokens. *)
and token = parse
| [' ' '\t']
{ token lexbuf }
| '\n'
{ EOL }
| ['0'-'9']+ as i
{ INT (int_of_string i) }
| '+'
{ PLUS }
| '-'
{ MINUS }
| '*'
{ TIMES }
| '/'
{ DIV }
| '('
{ LPAREN }
| ')'
{ RPAREN }
| '['
{ LBRACKET }
| ']'
{ RBRACKET }
| _
{ raise (Error (Printf.sprintf "At offset %d: unexpected character.\n" (Lexing.lexeme_start lexbuf))) }
%{
[@@@ocaml.warning "-9"]
%}
%token <int> INT [@recover.expr 0]
%token PLUS MINUS TIMES DIV
%token LPAREN RPAREN LBRACKET RBRACKET
%token EOL
%left PLUS MINUS /* lowest precedence */
%left TIMES DIV /* medium precedence */
%nonassoc UMINUS /* highest precedence */
%start <int> main
%%
main:
| e = expr EOL
{ e }
expr:
| i = INT
{ i }
| LPAREN e = expr RPAREN
{ e }
| LBRACKET e = expr RBRACKET
{ e }
| e1 = expr PLUS e2 = expr
{ e1 + e2 }
| e1 = expr MINUS e2 = expr
{ e1 - e2 }
| e1 = expr TIMES e2 = expr
{ e1 * e2 }
| e1 = expr DIV e2 = expr
{ e1 / e2 }
| MINUS e = expr %prec UMINUS
{ - e }
(lang dune 1.6)
(using menhir 2.0)
open MenhirSdk.Cmly_api
(* Attributes guide the recovery.
Some information can be passed to Menhir-recover via attributes. These are
strings that are ignored by Menhir itself and transmitted to Menhir-recover.
The attributes that are relevant to Menhir-recover are always prefixed with
`recover.`.
An attribute with this prefix that is not understood by Menhir-recover
will produce a warning message (to help detect typos or misplaced
attributes).
*)
(** Specification of attributes that are meaningful for recovery *)
module type ATTRIBUTES = sig
(** The Menhir grammar to which these apply *)
module G : GRAMMAR
(** Recovery cost
When the parser is in an error state, Menhir-recover will invent some input
that recovers from this error. In most grammars, this problem has many
solutions, often infinitely many.
But not all solutions are equally nice. Some will have repetitions, some
will generate undesirable AST nodes or trigger error reductions...
To guide this process, a cost can be associated with each symbol (terminal
or non-terminal), and the cost of the recovery will be the sum of the
costs of all symbols in the generated sentence.
*)
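(** Worked example (illustrative, not tied to a particular grammar): given
      %token PLUS  [@recover.cost 1.0]
      %token TIMES [@recover.cost 2.0]
    a candidate recovery sentence `PLUS INT` costs 1.0 + cost(INT), while
    `TIMES INT` costs 2.0 + cost(INT), so the former is preferred. *)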
(** Symbol cost
The `recover.cost` attribute is attached to the definition of symbols
(terminals and non-terminals) and takes a floating point value.
%token PLUS [@recover.cost 1.0]
expr [@recover.cost 1.0]:
...
;
*)
(** Cost of a grammar symbol *)
val cost_of_symbol : G.symbol -> float
(** Item cost
The cost can be applied to a specific item (an occurrence of a symbol in
a rule).
In this case, the more specific cost will replace the global cost for
this specific occurrence.
expr:
| INT PLUS [@recover.cost 0.0] INT { ... }
| INT TIMES [@recover.cost 10.0] INT { ... }
;
In this example, if an error happens just after an integer in an
expression, the `PLUS` rule will be favored over the `TIMES` rule because
shifting the `TIMES` token is more expensive.
*)
(** Penalty (added cost) for shifting an item *)
val penalty_of_item : G.production * int -> float
(** Reduction cost
The last place where a `recover.cost` is accepted is in a production.
This is convenient to prevent the recovery from triggering some semantic
actions.
expr:
LPAREN expr error { ... } [@recover.cost infinity]
;
It would not make much sense for the recovery to select an error rule.
Associating an infinite cost with the production ensures that this never
happens.
*)
(** Cost of reducing a production *)
val cost_of_prod : G.production -> float
(** Meaning of costs
The cost should be a non-negative floating-point value; +∞ and 0.0 are
accepted.
If not specified, the default cost depends on the presence of a semantic
value:
- for a terminal without semantic value (such as `%token DOT`) it is 0.0.
- for a terminal with a semantic value (such as `%token<int> INT`) or a
non-terminal it is +∞.
If the attribute appears multiple times, the sum of all occurrences is
used.
**TODO**: specify how null values are treated with respect to minimal
cost, can the algorithm diverge?
*)
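(** Example of the defaults: with
      %token DOT
      %token<int> INT
    `DOT` costs 0.0 by default, while `INT` costs +∞ unless it is also given
    a `recover.expr` (see [cost_of_symbol]). *)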
(** Recovery expressions
Symbols with a semantic value cannot be picked by the recovery algorithm
if it does not know how to produce this value.
The `recover.expr` attribute associates an OCaml expression with a symbol.
This expression should evaluate to a semantic value for this symbol.
%token<string> IDENT [@recover.expr "invalid-identifier"]
When applied to non-terminals, it is particularly useful to produce a
value that could not be the result of a normal parse.
expr [@recover.expr Invalid_expression]:
...
;
Here `Invalid_expression` is a node added to the AST for the purpose of
identifying parts that were recovered.
Furthermore, specifying fallback values for non-terminals prevents
Menhir-recover from generating a hard-to-predict sequence of tokens
just to fill holes in the AST.
*)
(** An optional OCaml expression that should evaluate to a
semantic value valid for this terminal. *)
val default_terminal : G.terminal -> string option
(** An optional OCaml expression that should evaluate to a
semantic value valid for this non-terminal. *)
val default_nonterminal : G.nonterminal -> string option
(** The expressions are evaluated every time a new instance of a symbol is
needed, although it is not specified whether every evaluation will be
kept in the final solution (at run time, the algorithm is free to explore
different branches and throw them away as needed).
**TODO**: decide how information can be communicated with recovery
expressions (for instance the current location of the parser)
*)
(** Recovery prelude
The `recover.prelude` attribute is attached to the grammar.
It is an arbitrary piece of OCaml code that will be inserted before the
code of `recover.expr` expressions.
It is useful for providing definitions shared by the recovery expressions,
in the same way as `%{ ... %}` is used to share definitions in semantic
actions of the grammar.
*)
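(** For instance (the payload below is only an illustration; `%[@...]` is
    Menhir's syntax for grammar-level attributes):
      %[@recover.prelude
        let default_pos = Lexing.dummy_pos
      ]
    makes [default_pos] available to every `recover.expr` payload. *)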
(** Output the grammar prelude to this formatter *)
val default_prelude : Format.formatter -> unit
end (* module type ATTRIBUTES *)
module Recover_attributes (G : GRAMMAR)
: ATTRIBUTES with module G = G =
struct
module G = G
open G
let string_starts_with str ~prefix =
let len = String.length prefix in
(String.length str >= len) &&
(try
for i = 0 to len - 1 do
if str.[i] <> prefix.[i] then raise Exit;
done;
true
with Exit -> false)
let prefix = "recover."
let all_attributes = [
"recover.cost";
"recover.expr";
"recover.prelude";
]
let validate_attribute accepted kind attr =
let label = Attribute.label attr in
if string_starts_with ~prefix label &&
not (List.mem label accepted) then (
let split_pos pos =
(pos.Lexing.pos_fname,
pos.Lexing.pos_lnum,
pos.Lexing.pos_cnum - pos.Lexing.pos_bol)
in
let range () range =
let s = Printf.sprintf in
let sf, sl, sc = split_pos (Range.startp range) in
let ef, el, ec = split_pos (Range.endp range) in
if sf <> ef then
s "%s:%d.%d-%s:%d.%d" sf sl sc ef el ec
else if sl <> el then
s "%s:%d.%d-%d.%d" sf sl sc el ec
else if sc <> ec then
s "%s:%d.%d-%d" sf sl sc ec
else
s "%s:%d.%d" sf sl sc
in
let f fmt = Printf.ksprintf prerr_endline fmt in
if List.mem label all_attributes then
f "%a: attribute %S cannot be put in %s"
range (Attribute.position attr) label kind
else
f "%a: attribute %S is not recognized (found in %s)"
range (Attribute.position attr) label kind
)
let validate_attributes accepted kind attrs =
List.iter (validate_attribute accepted kind) attrs
let () =
validate_attributes
["recover.prelude"] "grammar attributes"
Grammar.attributes;
let symbol prj attrs =
validate_attributes
["recover.cost"; "recover.expr"] "symbol attributes"
(prj attrs)
in
Nonterminal.iter (symbol G.Nonterminal.attributes);
Terminal.iter (symbol G.Terminal.attributes);
Production.iter (fun p ->
validate_attributes
["recover.cost"] "production attributes"
(Production.attributes p);
Array.iter
(fun (_,_,attrs) ->
validate_attributes ["recover.cost"] "item attributes" attrs)
(Production.rhs p)
)
let cost_of_attributes prj attrs =
List.fold_left
(fun total attr ->
if Attribute.has_label "recover.cost" attr then
total +. float_of_string (Attribute.payload attr)
else total)
0. (prj attrs)
let cost_of_symbol =
let measure ~has_default prj attrs =
if List.exists (Attribute.has_label "recover.expr") (prj attrs) || has_default
then cost_of_attributes prj attrs
else infinity
in
let ft = Terminal.tabulate
(fun t -> measure ~has_default:(Terminal.typ t = None) Terminal.attributes t)
in
let fn =
Nonterminal.tabulate (measure ~has_default:false Nonterminal.attributes)
in
function
| T t -> ft t
| N n -> fn n
let cost_of_prod =
Production.tabulate (cost_of_attributes Production.attributes)
let penalty_of_item =
let f = Production.tabulate @@ fun p ->
Array.map (cost_of_attributes (fun (_,_,a) -> a))
(Production.rhs p)
in
fun (p,i) ->
let costs = f p in
if i < Array.length costs then costs.(i) else cost_of_prod p
let default_prelude ppf =
List.iter (fun a ->
if Attribute.has_label "recover.prelude" a then
Format.fprintf ppf "%s\n" (Attribute.payload a)
) Grammar.attributes
let default_expr ?(fallback="raise Not_found") attrs =
match List.find (Attribute.has_label "recover.expr") attrs with
| exception Not_found -> fallback
| attr -> Attribute.payload attr
let default_terminal t =
match Terminal.kind t with
| `REGULAR | `ERROR | `EOF ->
let fallback = match Terminal.typ t with
| None -> Some "()"
| Some _ -> None
in
Some (default_expr ?fallback (Terminal.attributes t))
| `PSEUDO -> None
let default_nonterminal n =
match Nonterminal.kind n with
| `REGULAR -> Some (default_expr (Nonterminal.attributes n))
| `START -> None
end
(executable
(name main)
(libraries fix menhirSdk))
; (rule
; (targets static.ml)
; (deps static.mdx)
; (action (with-stdout-to %{targets}
; (run mdx pp %{deps}))))
open MenhirSdk.Cmly_api
open Utils
open Attributes
open Synthesis
open Recovery
let menhir = "MenhirInterpreter"
(* Generation scheme doing checks and failing at runtime, or not ... *)
let safe = false
type var = int
module Codesharing (S : SYNTHESIZER) (R : RECOVERY with module G = S.G) : sig
type instr =
| IRef of var