Commit 956ba5a1 authored by POTTIER Francois's avatar POTTIER Francois

The new rule syntax.

parent e5d97d9e
# Changes
## 2018/11/06
## 2018/11/12
* In `.mly` files, a new syntax for rules has been introduced, which is
slightly more pleasant than the old syntax. (A rule is the definition of a
nonterminal symbol.) The old syntax remains available; the user chooses
between the two syntaxes on a per-rule basis. The new syntax is fully
documented in the manual;
[a brief summary of the differences](doc/new-rule-syntax.md)
with respect to the old syntax is also available.
**The new syntax is considered experimental**
and is subject to change in the near future.
* In the Coq back-end, avoid pattern-matching at type `int31`,
which will disappear in future versions of Coq.
......
* A drawback of point-free semantic actions: if the application of the OCaml
identifier to the tuple is ill-typed, then we get a type error in the mock
file. Can this be fixed? Generate more # directives around the application
node?
* Set up continuous integration (ci.inria.fr)...
Check that Menhir builds (and the test pass)
with multiple versions of OCaml.
......
......@@ -21,6 +21,8 @@ DEMOS := \
calc-inspection \
generate-printers \
calc-dune \
calc-new-syntax-dune \
calc-ast-dune \
calc-param-dune \
calc-two-dune \
calc-incremental-dune \
......
calc.exe
_build
.merlin
.PHONY: all clean test
DUNE := dune
EXECUTABLE := calc.exe
all:
@ if command -v $(DUNE) > /dev/null ; then \
$(DUNE) build $(EXECUTABLE) ; \
else \
echo "Error: $(DUNE) is required." ; \
fi
clean:
rm -rf `cat .gitignore`
rm -f *~
test: all
@echo "The following command should print 42:"
echo "(1 + 2 * 5) * 2 + 10 - 2 * -2 * 1 + 4 + 2" | ./_build/default/$(EXECUTABLE)
This demo illustrates how to build abstract syntax trees.
It uses the new rule syntax.
open Syntax
let rec interpret (e : expr) =
match e.value with
| ELiteral i ->
i
| EBinOp (e1, OpPlus, e2) ->
interpret e1 + interpret e2
| EBinOp (e1, OpMinus, e2) ->
interpret e1 - interpret e2
| EBinOp (e1, OpTimes, e2) ->
interpret e1 * interpret e2
| EBinOp (e1, OpDiv, e2) ->
interpret e1 / interpret e2
| EUnOp (OpNeg, e) ->
- (interpret e)
let process (line : string) =
let linebuf = Lexing.from_string line in
try
(* Run the parser on this line of input. *)
Printf.printf "%d\n%!" (interpret (Parser.main Lexer.token linebuf))
with
| Lexer.Error msg ->
Printf.fprintf stderr "%s%!" msg
| Parser.Error ->
Printf.fprintf stderr "At offset %d: syntax error.\n%!" (Lexing.lexeme_start linebuf)
let process (optional_line : string option) =
match optional_line with
| None ->
()
| Some line ->
process line
let rec repeat channel =
(* Attempt to read one line. *)
let optional_line, continue = Lexer.line channel in
process optional_line;
if continue then
repeat channel
let () =
repeat (Lexing.from_channel stdin)
(ocamllex
(modules lexer)
)
(menhir
(modules parser)
(flags ("-la" "2"))
)
(executable
(name calc)
)
(lang dune 1.4)
(using menhir 2.0)
{
open Parser
exception Error of string
}
(* This rule looks for a single line, terminated with '\n' or eof.
It returns a pair of an optional string (the line that was found)
and a Boolean flag (false if eof was reached). *)
rule line = parse
| ([^'\n']* '\n') as line
(* Normal case: one line, no eof. *)
{ Some line, true }
| eof
(* Normal case: no data, eof. *)
{ None, false }
| ([^'\n']+ as line) eof
(* Special case: some data but missing '\n', then eof.
Consider this as the last line, and add the missing '\n'. *)
{ Some (line ^ "\n"), false }
(* This rule analyzes a single line and turns it into a stream of
tokens. *)
and token = parse
| [' ' '\t']
{ token lexbuf }
| '\n'
{ EOL }
| ['0'-'9']+ as i
{ INT (int_of_string i) }
| '+'
{ PLUS }
| '-'
{ MINUS }
| '*'
{ TIMES }
| '/'
{ DIV }
| '('
{ LPAREN }
| ')'
{ RPAREN }
| _
{ raise (Error (Printf.sprintf "At offset %d: unexpected character.\n" (Lexing.lexeme_start lexbuf))) }
(* In this demo, instead of interpreting arithmetic expressions on the fly
as in the other demos (e.g. demos/calc), we build an AST, which we later
interpret. *)
(* We use a two-level AST, where the types [expr] and [raw_expr] are mutually
recursive; this means that every subexpression is annotated with its
location in the input text. In this file, we use the parameterized symbol
[located] to annotate semantic values with locations without polluting the
grammar. *)
(* We make heavy use of ~ patterns and <...> semantic actions to avoid naming
or numbering the semantic values that we manipulate. *)
%token <int> INT
%token PLUS MINUS TIMES DIV
%token LPAREN RPAREN
%token EOL
%start <Syntax.expr> main
%{ open Syntax %}
%%
(* -------------------------------------------------------------------------- *)
(* We wish to parse an expression followed with an end-of-line. *)
let main :=
~ = expr; EOL; <>
(* An expression is an additive expression. *)
let expr ==
additive_expr
(* An additive expression is
either a multiplicative expression,
or the application of an additive operator to two subexpressions.
In the second case, the left-hand subexpression is additive, while the
right-hand subexpression is multiplicative; this reflects the fact that
the operator is left-associative. The three semantic values of interest
(the left subexpression, the operator, the right subexpression) are
matched against ~ patterns, which means that, in the end, the data
constructor [EBinOp] is applied to a tuple of these three components.
Furthermore, this whole construction is wrapped in [located], so the
result of [EBinOp], a raw expression, is turned into an expression. *)
let additive_expr :=
| multiplicative_expr
| located(
~ = additive_expr; ~ = additive_op; ~ = multiplicative_expr; <EBinOp>
)
(* These are the additive operators and their meaning. *)
let additive_op ==
| PLUS; { OpPlus }
| MINUS; { OpMinus }
(* A multiplicative expression is either an atomic expression or the
application of a multiplicative operator to two subexpressions. *)
let multiplicative_expr :=
| atomic_expr
| located(
~ = multiplicative_expr; ~ = multiplicative_op; ~ = atomic_expr; <EBinOp>
)
(* These are the multiplicative operators and their meaning. *)
let multiplicative_op ==
| TIMES; { OpTimes }
| DIV; { OpDiv }
(* An atomic expression is one of:
an expression between parentheses,
an integer literal,
an application of a unary operator to an atomic expression. *)
(* Only the last two cases are wrapped in [located]; in the first case, this is
not necessary, as the expression already carries a location. Note that, this
way, we get tight locations (i.e., the parentheses are not included). *)
let atomic_expr :=
| LPAREN; ~ = expr; RPAREN; <>
| located(
| ~ = INT; <ELiteral>
| ~ = unary_op; ~ = atomic_expr; <EUnOp>
)
(* These are the unary operators and their meaning. *)
let unary_op ==
| MINUS; { OpNeg }
(* -------------------------------------------------------------------------- *)
(* [located(x)] recognizes the same input fragment as [x] and wraps its
semantic value of type ['a] as a value of type ['a located]. *)
let located(x) ==
~ = x; { { loc = $loc; value = x } }
type unop =
| OpNeg
type binop =
| OpPlus | OpMinus | OpTimes | OpDiv
type 'a located =
{ loc: Lexing.position * Lexing.position; value: 'a }
type expr =
raw_expr located
and raw_expr =
| ELiteral of int
| EUnOp of unop * expr
| EBinOp of expr * binop * expr
.PHONY: all clean test
DUNE := dune
EXECUTABLE := calc.exe
all:
@ if command -v $(DUNE) > /dev/null ; then \
$(DUNE) build $(EXECUTABLE) ; \
else \
echo "Error: $(DUNE) is required." ; \
fi
clean:
rm -rf `cat .gitignore`
rm -f *~
test: all
@echo "The following command should print 42:"
echo "(1 + 2 * 5) * 2 + 10 - 2 * -2 * 1 + 4 + 2" | ./_build/default/$(EXECUTABLE)
This demo illustrates the new rule syntax,
and heavily exploits parameterized symbols.
let process (line : string) =
let linebuf = Lexing.from_string line in
try
(* Run the parser on this line of input. *)
Printf.printf "%d\n%!" (Parser.main Lexer.token linebuf)
with
| Lexer.Error msg ->
Printf.fprintf stderr "%s%!" msg
| Parser.Error ->
Printf.fprintf stderr "At offset %d: syntax error.\n%!" (Lexing.lexeme_start linebuf)
let process (optional_line : string option) =
match optional_line with
| None ->
()
| Some line ->
process line
let rec repeat channel =
(* Attempt to read one line. *)
let optional_line, continue = Lexer.line channel in
process optional_line;
if continue then
repeat channel
let () =
repeat (Lexing.from_channel stdin)
(ocamllex
(modules lexer)
)
(menhir
(modules parser)
(flags ("-la" "2"))
)
(executable
(name calc)
)
(lang dune 1.4)
(using menhir 2.0)
{
open Parser
exception Error of string
}
(* This rule looks for a single line, terminated with '\n' or eof.
It returns a pair of an optional string (the line that was found)
and a Boolean flag (false if eof was reached). *)
rule line = parse
| ([^'\n']* '\n') as line
(* Normal case: one line, no eof. *)
{ Some line, true }
| eof
(* Normal case: no data, eof. *)
{ None, false }
| ([^'\n']+ as line) eof
(* Special case: some data but missing '\n', then eof.
Consider this as the last line, and add the missing '\n'. *)
{ Some (line ^ "\n"), false }
(* This rule analyzes a single line and turns it into a stream of
tokens. *)
and token = parse
| [' ' '\t']
{ token lexbuf }
| '\n'
{ EOL }
| ['0'-'9']+ as i
{ INT (int_of_string i) }
| '+'
{ PLUS }
| '-'
{ MINUS }
| '*'
{ TIMES }
| '/'
{ DIV }
| '('
{ LPAREN }
| ')'
{ RPAREN }
| _
{ raise (Error (Printf.sprintf "At offset %d: unexpected character.\n" (Lexing.lexeme_start lexbuf))) }
%token <int> INT
%token PLUS MINUS TIMES DIV
%token LPAREN RPAREN
%token EOL
%start <int> main
(* In this demo, we do not use precedence declarations (%left, etc.).
Instead, we manually stratify the grammar. It is quite easy and
allows us to demonstrate the use of parameterized definitions. *)
%%
(* -------------------------------------------------------------------------- *)
(* [fold_left(op, elem)] recognizes a nonempty, left-associative list of
elements, separated with operators. The semantic value of the symbol [op]
is expected to be a binary function, which is applied to the left-hand
summary and to the right-hand element. *)
let fold_left(op, elem) :=
| elem
| sum = fold_left(op, elem); ~ = op; ~ = elem; { op sum elem }
(* -------------------------------------------------------------------------- *)
(* [app(f, x)] recognizes the sequence [f; x]. Its semantic value is the
application of the semantic value of [f] to the semantic value of [x]. *)
let app(f, x) ==
~ = f; ~ = x; { f x }
(* -------------------------------------------------------------------------- *)
(* We wish to parse an expression followed with an end-of-line. *)
(* The notation <> is a short-hand for a semantic action {...} that builds
a tuple of the variables that have been introduced. Here, one variable
[expr] has been introduced by [~ = expr], so <> stands for {expr}. *)
let main :=
~ = expr; EOL; <>
(* An expression is an additive expression. *)
let expr ==
additive_expr
(* An additive expression is a left-associative list of multiplicative
expressions, separated with additive operators. *)
let additive_expr ==
fold_left(additive_op, multiplicative_expr)
(* These are the additive operators and their meaning. *)
let additive_op ==
| PLUS; { ( + ) }
| MINUS; { ( - ) }
(* A multiplicative expression is a left-associative list of atomic
expressions, separated with multiplicative operators. *)
let multiplicative_expr ==
fold_left(multiplicative_op, atomic_expr)
(* These are the multiplicative operators and their meaning. *)
let multiplicative_op ==
| TIMES; { ( * ) }
| DIV; { ( / ) }
(* An atomic expression is one of:
an integer literal,
an expression between parentheses,
an application of a unary operator to an atomic expression. *)
let atomic_expr :=
| INT
| delimited(LPAREN, expr, RPAREN)
| app(unary_op, atomic_expr)
(* These are the unary operators and their meaning. *)
let unary_op ==
| MINUS; { (~- ) }
......@@ -2,16 +2,29 @@
\let\nt\textit % Nonterminal.
\newcommand{\is}{& ${} ::= {}$ &}
\newcommand{\newprod}{\\\hspace{1cm}\barre\hspace{2mm}}
\newcommand{\phaprod}{\\\hspace{1cm}\phantom\barre\hspace{2mm}}
% Options and choices.
\newcommand{\optional}[1]{%
\ifmmode [\,#1\,]%
\else $[\,\text{#1}\,]$%
\fi
} % Option.
}
\newcommand{\metachoice}{$\,\mid\,$}
% Lists.
\newcommand{\seplist}[2]{#2#1${}\ldots{}$#1#2}
\newcommand{\sepspacelist}[1]{\seplist{\ }{#1}}
\newcommand{\sepcommalist}[1]{\seplist{,\ }{#1}}
\newcommand{\newprod}{\\\hspace{1cm}\barre\hspace{2mm}}
\newcommand{\phaprod}{\\\hspace{1cm}\phantom\barre\hspace{2mm}}
\newcommand{\precseplist}[2]{%
\optional{#1} \seplist{\ #1}{#2}%
}
% Optional parameters.
\newcommand{\tuple}[1]{\dlpar\sepcommalist{#1}\drpar}
\newcommand{\oparams}[1]{\optional{\tuple{#1}}}
% Some nonterminal symbols used in the grammar.
\newcommand{\expression}{\nt{expression}\xspace}
\newcommand{\expressionsub}[1]{\nt{expression}${}_{#1}$}
\newcommand{\pattern}{\nt{pattern}\xspace}
% Concrete syntax.
......@@ -35,15 +48,23 @@
\newcommand{\dattribute}{\kw{\%attribute}\xspace}
\newcommand{\dpaction}[1]{\kw{\{} #1 \kw{\}}\xspace}
\newcommand{\daction}{\dpaction{\textit{\ocaml code}}\xspace}
\newcommand{\dpfaction}[1]{\kw{<} #1 \kw{>}\xspace}
\newcommand{\dpfidentityaction}{\kw{<>}\xspace}
\newcommand{\dprec}{\kw{\%prec}\xspace}
\newcommand{\dequal}{\kw{=}\xspace}
\newcommand{\dequal}{\kw{\small =}\xspace}
\newcommand{\dquestion}{\kw{?}\xspace}
\newcommand{\dplus}{\raisebox{2pt}{\kw{\small +}}\xspace}
\newcommand{\dcolonequal}{\kw{\small :=}\xspace}
\newcommand{\dequalequal}{\kw{\small ==}\xspace}
\newcommand{\dstar}{\kw{*}\xspace}
\newcommand{\dlpar}{\kw{(}\,\xspace}
\newcommand{\drpar}{\,\kw{)}\xspace}
\newcommand{\eos}{\kw{\#}\xspace}
\newcommand{\dnewline}{\kw{\textbackslash n}\xspace}
\newcommand{\dlet}{\kw{let}\xspace}
\newcommand{\dsemi}{\kw{;}\xspace}
\newcommand{\dunderscore}{\kw{\_}\xspace}
\newcommand{\dtilde}{\raisebox{-4pt}{\kw{\textasciitilde}}\xspace}
% Stylistic conventions.
......@@ -60,6 +81,7 @@
\newcommand{\menhirlibconvert}{\repo{src/Convert.mli}{\texttt{MenhirLib.Convert}}\xspace}
\newcommand{\menhirlibincrementalengine}{\repo{src/IncrementalEngine.ml}{\texttt{MenhirLib.IncrementalEngine}}\xspace}
\newcommand{\standardmly}{\repo{src/standard.mly}{\texttt{standard.mly}}\xspace}
\newcommand{\distrib}[1]{\repo{#1}{\texttt{#1}}}
% Links to CompCert's repository.
......@@ -97,10 +119,6 @@
\newcommand{\conflicts}{\texttt{.conflicts}\xspace}
\newcommand{\dott}{\texttt{.dot}\xspace}
% Files in the distribution.
\newcommand{\distrib}[1]{\texttt{#1}}
% Environments.
\newcommand{\question}[1]{\vspace{3mm}$\diamond$ \textbf{#1}}
......
This diff is collapsed.
This diff is collapsed.
# Differences between the old and new rule syntax
This presentation of the new rule syntax is meant to be useful to a reader who
is familiar with the old rule syntax. For a direct, self-contained
presentation of the new rule syntax, please consult Menhir's manual.
## Rules
A rule begins with `let`.
| | | |
|----------------------------|---------|--------------------------|
| `foo: ...` | becomes | `let foo := ...` |
| `%inline foo: ...` | becomes | `let foo == ...` |
| `%public foo: ...` | becomes | `%public let foo := ...` |
| `%public %inline foo: ...` | becomes | `%public let foo == ...` |
A rule **cannot** be terminated with a semicolon.
## Sums
Productions are separated with `|`. A leading `|` is permitted, and ignored.
For instance, the rule `let a := | A; { () }` has only one production, which
recognizes the symbol `A`. In contrast with the old syntax,two productions
**cannot** share a semantic action.
## Sequences
In a sequence `p1 = e1; e2`, the semicolon is **mandatory**.
The pattern `p1` binds the semantic values produced by `e1`.
| | | |
|-----------------|-------|-------------------------------|
| `x = foo;` | means | the same as in the old syntax |
| `foo;` | means | `_ = foo;` |
| `~ = foo;` | means | `foo = foo;` |
| `(x, y) = foo;` | means | `_xy = foo;` where the following semantic action is wrapped in `let (x, y) = _xy in ...` |
In contrast with the old syntax, when a sequence ends with a semantic action,
the semicolon that precedes the semantic action is still mandatory. For
instance, in `let literal := i = INT; { i }`, the semicolon is required.
In contrast with the old syntax, **a sequence need not end with a semantic
action**. A sequence can also end with a symbol, whose semantic value is
then implicitly returned. For instance,
| | | |
|--------------------------------|-------|------------------|
| `foo` at the end of a sequence | means | `x = foo; { x }` |
This implies that **it becomes easy to define a symbol as a synonym for
another symbol** or for a complex expression. For instance,
| | | |
|---------------------------------|---------|-------------------|
| `%inline foo: xs = bar* { xs }` | becomes | `let foo == bar*` |
## Semantic actions
Traditional semantic actions, such as `{ (x, y) }`, remain available.
In addition, so-called **point-free semantic actions** appear. They take the
form of a single OCaml identifier between angle brackets. This identifier,
which should denote a function or a data constructor, is implicitly
**applied** to a tuple of the variables that have been bound earlier in the
sequence. If this identifier is omitted, the identity function is
assumed. Thus,
| | | |
|-----------------------------------------------------|-------|--------------------------------------------------------|
| `let pair(x, y) := ~ = x; ~ = y; <Pair>` | means | `let pair(x, y) := x = x; y = y; { Pair (x, y) }` |
| `let pair(x, y) := ~ = x; ~ = y; <>` | means | `let pair(x, y) := x = x; y = y; { (x, y) }` |
| `let parenthesized(x) := LPAREN; ~ = x; RPAREN; <>` | means | `let parenthesized(x) := LPAREN; x = x; RPAREN; { x }` |
This often removes the need to invent names for semantic values.
`$1`, `$2`, etc. are forbidden. Semantic values must be named.
A semantic value can be named either explicitly or via a `~` pattern.
......@@ -38,6 +38,12 @@ let from_stretch s = {
keywords = KeywordSet.of_list s.Stretch.stretch_keywords
}
let from_il_expr e = {
expr = e;
filenames = [];
keywords = KeywordSet.empty;
}
(* Defining a keyword in terms of other keywords. *)
let define keyword keywords f action =
......@@ -60,6 +66,15 @@ let compose x a1 a2 =
filenames = a1.filenames @ a2.filenames;
}
(* Binding an OCaml pattern to an OCaml variable in a semantic action. *)
let bind p x a =
{
expr = CodeBits.blet ([ p, IL.EVar x ], a.expr);
keywords = a.keywords;
filenames = a.filenames;
}
(* Substitutions, represented as association lists.
In principle, no name appears twice in the domain. *)
......
......@@ -20,6 +20,10 @@ type t
feature is used during the processing of the %inline keyword. *)
val compose : string -> t -> t -> t
(** [bind p x a] binds the OCaml pattern [p] to the OCaml variable [x] in the
semantic action [a]. Therefore, it builds the action [let p = x in a]. *)
val bind: IL.pattern -> string -> t -> t