syntax.ml 12.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

14 15 16 17 18 19 20 21
(* The type [partial_grammar] describes the abstract syntax that is produced
   by the parsers (yacc-parser and fancy-parser).

   The type [grammar] describes the abstract syntax that is obtained after one
   or more partial grammars are joined (see [PartialGrammar]). It differs in
   that declarations are organized in a more useful way and a number of
   well-formedness checks have been performed. *)

POTTIER Francois's avatar
POTTIER Francois committed
22 23 24
type 'a located =
    'a Positions.located

25 26
(* ------------------------------------------------------------------------ *)

27
(* Terminals and nonterminal symbols are strings. *)
28 29 30 31 32

type terminal =
    string

type nonterminal =
33
    string
34 35

type symbol =
36
    string
37

38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
(* In a somewhat fragile convention, in a partial grammar, a reference to a
   terminal symbol either is a normal identifier [LID], in which case it is
   the name of the terminal symbol, or is a quoted identifier [QID], in which
   case it is a token alias.

   Token aliases are eliminated by replacing them with the corresponding
   terminal symbols very early on during the joining of the partial grammars
   -- see the function [dealias_pg] in [PartialGrammar].

   In a complete grammar, there are no token aliases any longer. *)

type alias =
    string option

(* Identifiers (which are used to refer to a symbol's semantic value) are
   strings. *)

55 56
type identifier =
    string
57

58 59
(* A file name is a string. *)

60
type filename =
61 62
    string

63 64
(* ------------------------------------------------------------------------ *)

65
(* A postlude is a source file fragment. *)
66

67
type postlude =
68
    Stretch.t
69

70 71
(* ------------------------------------------------------------------------ *)

72
(* OCaml semantic actions are represented as stretches. *)
73 74 75 76

type action =
    Action.t

77 78
(* ------------------------------------------------------------------------ *)

79 80 81 82
(* An attribute consists of an attribute name and an attribute payload. The
   payload is an uninterpreted stretch of source text. *)

type attribute =
POTTIER Francois's avatar
POTTIER Francois committed
83
    string located * Stretch.t
84 85 86 87

type attributes =
    attribute list

88 89 90 91 92 93 94 95 96 97 98 99 100 101
(* Attributes allow the user to annotate the grammar with information that is
   ignored by Menhir, but can be exploited by other tools, via the SDK. *)

(* Attributes can be attached in the following places:

   - with the grammar:         %[@bar ...]
   - with a terminal symbol:   %token FOO [@bar ...]
   - with a rule:              foo(X) [@bar ...]: ...
   - with a producer:          e = foo(quux) [@bar ...]
   - with an arbitrary symbol: %attribute FOO foo(quux) [@bar ...]

   After expanding away parameterized nonterminal symbols, things become
   a bit simpler, as %attribute declarations are desugared away. *)

102 103
(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
104
(* Information about tokens. (Only after joining.) *)
105

106 107
type token_associativity =
    LeftAssoc
108 109 110 111
  | RightAssoc
  | NonAssoc
  | UndefinedAssoc

112 113
type precedence_level =
    UndefinedPrecedence
114 115

  (* Items are incomparable when they originate in different files. A
116
     value of type [input_file] is used to record an item's origin. The
117 118
     positions allow locating certain warnings. *)

119
  | PrecedenceLevel of InputFile.input_file * int * Lexing.position * Lexing.position
120

121 122
type token_properties =
    {
123 124 125
               tk_filename      : filename;
               tk_ocamltype     : Stretch.ocamltype option;
               tk_position      : Positions.t;
126
               tk_attributes    : attributes;
127
      mutable  tk_associativity : token_associativity;
128
      mutable  tk_precedence    : precedence_level;
129
      mutable  tk_is_declared   : bool;
130 131
    }

132 133
(* ------------------------------------------------------------------------ *)

134
(* A [%prec] annotation is optional. A production can carry at most one.
135
   If there is one, it is a symbol name. See [ParserAux]. *)
136 137

type branch_prec_annotation =
POTTIER Francois's avatar
POTTIER Francois committed
138
    symbol located option
139

POTTIER Francois's avatar
POTTIER Francois committed
140 141
(* ------------------------------------------------------------------------ *)

142 143 144 145
(* A "production level" is used to solve reduce/reduce conflicts. It reflects
   which production appears first in the grammar. See [ParserAux]. *)

type branch_production_level =
146
  | ProductionLevel of InputFile.input_file * int
147

POTTIER Francois's avatar
POTTIER Francois committed
148 149
(* ------------------------------------------------------------------------ *)

150 151 152 153 154 155 156 157
(* A level is attached to every [%on_error_reduce] declaration. It is used
   to decide what to do when several such declarations are applicable in a
   single state. *)

type on_error_reduce_level =
  branch_production_level (* we re-use the above type, to save code *)

(* ------------------------------------------------------------------------ *)
POTTIER Francois's avatar
POTTIER Francois committed
158 159 160 161
(* ------------------------------------------------------------------------ *)

(* The old rule syntax. Although old, still used internally. The new syntax
   is translated down to it. *)
162

163 164 165 166 167
(* A parameter is either just a symbol or an application of a symbol to a
   nonempty tuple of parameters. Before anonymous rules have been eliminated,
   it can also be an anonymous rule, represented as a list of branches. *)

type parameter =
POTTIER Francois's avatar
POTTIER Francois committed
168 169 170
  | ParameterVar of symbol located
  | ParameterApp of symbol located * parameters
  | ParameterAnonymous of parameterized_branch list located
171 172 173 174 175 176

and parameters =
    parameter list

(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
177
(* A producer is a pair of identifier and a parameter. In concrete syntax,
POTTIER Francois's avatar
POTTIER Francois committed
178
   it could be [e = expr], for instance. It carries a number of attributes. *)
POTTIER Francois's avatar
POTTIER Francois committed
179

180
and producer =
POTTIER Francois's avatar
POTTIER Francois committed
181
    identifier located * parameter * attributes
182

POTTIER Francois's avatar
POTTIER Francois committed
183 184 185 186
(* ------------------------------------------------------------------------ *)

(* A branch contains a series of producers and a semantic action. *)

187
and parameterized_branch =
188
    {
189 190
      pr_branch_position           : Positions.t;
      pr_producers                 : producer list;
191
      pr_action                    : action;
192
      pr_branch_prec_annotation    : branch_prec_annotation;
193
      pr_branch_production_level   : branch_production_level
194 195
    }

POTTIER Francois's avatar
POTTIER Francois committed
196 197 198 199
(* ------------------------------------------------------------------------ *)

(* A rule has a header and several branches. *)

200 201
type parameterized_rule =
    {
202 203 204 205
      pr_public_flag       : bool;
      pr_inline_flag       : bool;
      pr_nt                : nonterminal;
      pr_positions         : Positions.t list;
206
      pr_attributes        : attributes;
207 208
      pr_parameters        : symbol list;
      pr_branches          : parameterized_branch list;
209 210
    }

POTTIER Francois's avatar
POTTIER Francois committed
211
(* ------------------------------------------------------------------------ *)
POTTIER Francois's avatar
POTTIER Francois committed
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
(* ------------------------------------------------------------------------ *)

(* The new rule syntax. *)

(* In the user's eyes, this replaces the old rule syntax, which corresponds to
   the types [parameter], [producer], [parameterized_branch], and
   [parameterized_rule] above. *)

(* Internally, the new rule syntax is translated down to the old rule syntax;
   see [NewRuleSyntax]. This is done on the fly during parsing. *)

type pattern =
  | SemPatVar of identifier located
  | SemPatWildcard
  | SemPatTilde of Positions.t
  | SemPatTuple of pattern list
  (* Patterns: as in the manual. *)

type raw_action =
  Settings.dollars -> identifier option array -> action
  (* Ugly type produced by the lexer for an ACTION token. *)

type expression =
  choice_expression located
  (* A toplevel expression is a choice expression. *)

and choice_expression =
  | EChoice of branch list
  (* A choice expression is a list of branches. *)

and branch =
  | Branch of seq_expression * branch_production_level
  (* A branch is a sequence expression,
     plus an ugly [branch_production_level]. *)

and seq_expression =
  raw_seq_expression located

and raw_seq_expression =
  | ECons of pattern * symbol_expression * seq_expression
  | ESingleton of symbol_expression
  | EAction of extended_action * branch_prec_annotation
  (* A sequence is either a cons [p = e1; e2]
     or a lone symbol expression [e]
     or a semantic action. *)

and symbol_expression =
  | ESymbol of symbol located * expression list * attributes
  (* A symbol expression is a symbol,
     possibly accompanied with actual parameters and attributes. *)

and extended_action =
  | XATraditional of raw_action
265 266 267 268
  | XAPointFree of Stretch.t option
  (* A semantic action is either traditional { ... } or point-free.
     There are two forms of point-free actions, <> and <id>.
     In the latter case, [id] is an OCaml identifier. *)
POTTIER Francois's avatar
POTTIER Francois committed
269 270 271 272 273 274 275 276 277 278 279 280 281

type rule =
  {
    rule_public: bool;
    rule_inline: bool;
    rule_lhs: symbol located;
    rule_attributes: attributes;
    rule_formals: symbol located list;
    rule_rhs: expression;
  }

(* ------------------------------------------------------------------------ *)
(* ------------------------------------------------------------------------ *)
POTTIER Francois's avatar
POTTIER Francois committed
282

283 284 285 286 287 288 289 290 291 292 293 294 295 296
(* A declaration. (Only before joining.) *)

type declaration =

    (* Raw OCaml code. *)

  | DCode of Stretch.t

    (* Raw OCaml functor parameter. *)

  | DParameter of Stretch.ocamltype (* really a stretch *)

    (* Terminal symbol (token) declaration. *)

297
  | DToken of Stretch.ocamltype option * terminal * alias * attributes
298 299 300 301 302 303 304 305 306 307 308 309 310

    (* Start symbol declaration. *)

  | DStart of nonterminal

    (* Priority and associativity declaration. *)

  | DTokenProperties of terminal * token_associativity * precedence_level

    (* Type declaration. *)

  | DType of Stretch.ocamltype * parameter

311 312 313 314 315 316 317 318
    (* Grammar-level attribute declaration. *)

  | DGrammarAttribute of attribute

    (* Attributes shared among multiple symbols, i.e., [%attribute]. *)

  | DSymbolAttributes of parameter list * attributes

319 320
    (* On-error-reduce declaration. *)

321
  | DOnErrorReduce of parameter * on_error_reduce_level
322 323 324

(* ------------------------------------------------------------------------ *)

POTTIER Francois's avatar
POTTIER Francois committed
325 326
(* A partial grammar. (Only before joining.) *)

327
type partial_grammar =
328 329
    {
      pg_filename          : filename;
330
      pg_postlude          : postlude option;
POTTIER Francois's avatar
POTTIER Francois committed
331
      pg_declarations      : declaration located list;
332 333
      pg_rules             : parameterized_rule list;
    }
334

POTTIER Francois's avatar
POTTIER Francois committed
335
(* ------------------------------------------------------------------------ *)
POTTIER Francois's avatar
POTTIER Francois committed
336
(* ------------------------------------------------------------------------ *)
POTTIER Francois's avatar
POTTIER Francois committed
337 338 339 340 341

(* A grammar. (Only after joining.) *)

(* The differences with partial grammars (above) are as follows:
   1. the file name is gone (there could be several file names, anyway).
342
   2. there can be several postludes.
343 344 345
   3. declarations are organized by kind: preludes, postludes,
      functor %parameters, %start symbols, %types, %tokens, %on_error_reduce,
      grammar attributes, %attributes.
POTTIER Francois's avatar
POTTIER Francois committed
346
   4. rules are stored in a map, indexed by symbol names, instead of a list.
347
   5. token aliases have been replaced with ordinary named terminal symbols.
POTTIER Francois's avatar
POTTIER Francois committed
348
 *)
349
 type grammar =
350 351
    {
      p_preludes           : Stretch.t list;
352
      p_postludes          : postlude list;
353 354
      p_parameters         : Stretch.t list;
      p_start_symbols      : Positions.t StringMap.t;
POTTIER Francois's avatar
POTTIER Francois committed
355
      p_types              : (parameter * Stretch.ocamltype located) list;
356
      p_tokens             : token_properties StringMap.t;
357
      p_on_error_reduce    : (parameter * on_error_reduce_level) list;
358 359
      p_grammar_attributes : attributes;
      p_symbol_attributes  : (parameter list * attributes) list;
POTTIER Francois's avatar
POTTIER Francois committed
360
      p_rules              : parameterized_rule StringMap.t;
361
    }