(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

(* This lexer is used to read the sentences provided on the standard input
   channel when [--interpret] is enabled. *)

{

  open Lexing
  open SentenceParser
  open Grammar

  (* A short-hand. [error2 lexbuf] is [Error.error] applied to the result of
     [Positions.two] on the start and current positions of [lexbuf], that is,
     the two ends of the lexeme that was matched last. The caller supplies
     the remaining arguments: a format string followed by its parameters,
     as in [error2 lexbuf "\"%s\" is not a start symbol." lid]. *)

  let error2 lexbuf =
    Error.error (Positions.two lexbuf.lex_start_p lexbuf.lex_curr_p)

}

(* A line terminator: a line feed, a carriage return, or a carriage return
   immediately followed by a line feed. *)
let newline   = ('\010' | '\013' | "\013\010")

(* A single separator character. Note that a semicolon is treated exactly
   like a space or a tab. *)
let whitespace = [ ' ' '\t' ';' ]

(* A character that may begin a lowercase identifier: an ASCII lowercase
   letter, an underscore, or a byte in the ranges 223-246 and 248-255 (the
   lowercase accented letters when the input is read as ISO 8859-1). *)
let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']

(* A character that may begin an uppercase identifier: an ASCII uppercase
   letter or a byte in the ranges 192-214 and 216-222 (the uppercase accented
   letters when the input is read as ISO 8859-1). *)
let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']

(* A character that may appear after the first character of an identifier:
   any letter accepted above, an underscore, or a decimal digit. *)
let identchar = ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '0'-'9'] (* '\'' forbidden *)

(* A comment introduced by "##". It extends up to and including the line
   terminator, which is mandatory. *)
let autocomment = "##" [^'\010''\013']* newline

(* A comment introduced by "#". It extends up to and including the line
   terminator. Every string matched by [autocomment] is also matched by this
   pattern. Because the terminator is mandatory, a "#" line that ends at the
   end of the input without a line terminator is not matched. *)
let comment = "#" [^'\010''\013']* newline

(* Two line terminators separated only by whitespace.
   NOTE(review): this definition is not referenced by the [lex] rule in this
   file; confirm whether it is still needed. *)
let skip = newline whitespace* newline

(* The entry point of the lexer. Each call consumes input from [lexbuf] and
   returns one token; whitespace and "##" comments are skipped by calling
   [lex] again. Symbol tokens carry the start and end positions of their
   lexeme. Errors are reported through [error2]. *)
rule lex = parse
  (* An identifier that begins with a lowercase letter is considered a
     non-terminal symbol. It should be a start symbol. [Not_found]
     (presumably raised by [Nonterminal.lookup]) is reported as an unknown
     non-terminal symbol. *)
  | (lowercase identchar *) as lid
      { try
          let nt = Nonterminal.lookup lid in
          if StringSet.mem lid Front.grammar.UnparameterizedSyntax.start_symbols then
            NONTERMINAL (nt, lexbuf.lex_start_p, lexbuf.lex_curr_p)
          else
            error2 lexbuf "\"%s\" is not a start symbol." lid
        with Not_found ->
          error2 lexbuf "\"%s\" is not a known non-terminal symbol." lid
      }
  (* An identifier that begins with an uppercase letter is considered a
     terminal symbol. [Not_found] (presumably raised by [Terminal.lookup])
     is reported as an unknown terminal symbol. *)
  | (uppercase identchar *) as uid
      { try
          TERMINAL (Terminal.lookup uid, lexbuf.lex_start_p, lexbuf.lex_curr_p)
        with Not_found ->
          error2 lexbuf "\"%s\" is not a known terminal symbol." uid
      }
  (* Whitespace is ignored. *)
  | whitespace
      { lex lexbuf }
  (* The end of a line is translated to [EOL]. The call to [new_line] keeps
     the line information recorded in [lexbuf] up to date. *)
  | newline
      { new_line lexbuf; EOL }
  (* An auto-generated comment is ignored. This clause must come before the
     next one: a "##" line is matched by both patterns with the same length,
     and ocamllex resolves such a tie in favor of the earlier clause. The
     pattern includes the line terminator, hence the call to [new_line]. *)
  | autocomment
      { new_line lexbuf; lex lexbuf }
  (* A manually-written comment is preserved. The text [c] carried by the
     token includes the leading "#" and the line terminator. *)
  | comment as c
      { new_line lexbuf; COMMENT c }
  (* The end of file is translated to [EOF]. *)
  | eof
      { EOF }
  (* A colon. *)
  | ':'
      { COLON }
  (* Any other character is an error. *)
  | _
      { error2 lexbuf "unexpected character." }