bank_lexer.ml 5.52 KB
Newer Older
huet's avatar
huet committed
1 2 3 4 5 6
(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Gérard Huet                               *)
(*                                                                        *)
Gérard Huet's avatar
Gérard Huet committed
7
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
huet's avatar
huet committed
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
(**************************************************************************)

(* A simple lexer recognizing idents formed from ASCII letters and integers 
   and skipping spaces and comments between % and eol. 
   Used by [Parse_tree] and [Reader]. *)

module Bank_lexer = struct

open Camlp4.PreCast;
open Format;

module Loc = Loc (* Using the PreCast Loc *)
;
module Error = struct
  type t = string
  ;
  exception E of t
  ;
  value to_string x = x
  ;
  value print = Format.pp_print_string
  ;
  end
;
module Token = struct
  module Loc = Loc
  ; 
  type t =
    [ KEYWORD of string
    | IDENT of string 
    | TEXT of string 
    | INT of int
    | INTS of int
    | EOI
    ]
  ;
  module Error = Error
  ;
  module Filter = struct
    type token_filter = Camlp4.Sig.stream_filter t Loc.t
    ;
    type t = string -> bool
    ;
    value mk is_kwd = is_kwd
    ;
    value rec filter is_kwd = parser
        [ [: `((KEYWORD s, loc) as p); strm :] -> [: `p; filter is_kwd strm :]
(* PB        [if is_kwd s then [: `p; filter is_kwd strm :]
              else failwith ("Undefined token: " ^ s)]      *)
        | [: `x; s :] -> [: `x; filter is_kwd s :]
        | [: :] -> [: :] 
        ]
    ;
    value define_filter _ _ = ()
    ;
    value keyword_added _ _ _ = ()
    ;
    value keyword_removed _ _ = ()
    ;
    end
  ;
  value to_string = fun
    [ KEYWORD s -> sprintf "KEYWORD %S" s
    | IDENT s -> sprintf "IDENT %S" s
    | TEXT s -> sprintf "TEXT %S" s
    | INT i -> sprintf "INT %d" i
    | INTS i -> sprintf "INTS %d" i
    | EOI -> "EOI"
    ]
  ;
  value print ppf x = pp_print_string ppf (to_string x)
  ;
  value match_keyword kwd = fun
    [ KEYWORD kwd' -> kwd' = kwd
    | _ -> False
    ]
  ;
  value extract_string = fun
    [ INT i -> string_of_int i
    | INTS i -> string_of_int i
    | IDENT s | KEYWORD s | TEXT s -> s
    | EOI -> "" 
    ]
  ;
end
;

open Token
;

(* The string buffering machinery - ddr + np *)
value store buf c = do { Buffer.add_char buf c; buf }
;
value rec base_number len =
  parser
  [ [: a = number len :] -> a ]
and number buf =
  parser
  [ [: `('0'..'9' as c); s :] -> number (store buf c) s
  | [: :] -> Buffer.contents buf
  ]
  ;
value rec skip_to_eol =
  parser
  [ [: `'\n' | '\026' | '\012'; s :] -> ()
  | [: `c ; s :] -> skip_to_eol s
  ]
;
value ident_char =
  parser
  [ [: `('a'..'z' | 'A'..'Z' | '.' | ':' | '"' | '~' | '\'' as c) :] 
     -> c ]
;
value rec ident2 buff =
  parser
  [ [: c = ident_char; s :] -> ident2 (store buff c) s
  | [: `('0'..'9' as c); s :] -> ident2 (store buff c) s
  | [: :] -> Buffer.contents buff
  ]
;
value rec text buff =
  parser
  [ [: `'}' :] -> Buffer.contents buff
  | [: `'{'; buff = text_buff (store buff '{'); s :] -> 
                   text (store buff '}') s
  | [: `c; s :] -> text (store buff c) s
  ]
and text_buff buff =
  parser
  [ [: `'}' :] -> buff
  | [: `'{'; buff = text_buff (store buff '{'); s :] -> 
                   text_buff (store buff '}') s
  | [: `c; s :] -> text_buff (store buff c) s 
  ]
;
value next_token_fun =
  let rec next_token buff =
    parser _bp
    [ [: `'{'; t = text buff :] -> TEXT t
    | [: `('1'..'9' as c); s = number (store buff c) :] -> INT (int_of_string s)
    | [: `'0'; s = base_number (store buff '0') :] -> INT (int_of_string s)
    | [: c = ident_char; s = ident2 (store buff c) :] -> 
      if s = "Comment" then KEYWORD "Comment" else 
      if s = "Example" then KEYWORD "Example" else
      if s = "Continue" then KEYWORD "Continue" else
      if s = "Source" then KEYWORD "Source" else 
      if s = "Parse" then KEYWORD "Parse" else 
      if s = "Gloss" then KEYWORD "Gloss" else IDENT s
    | [: `c :] _ep -> KEYWORD (String.make 1 c)
    ] in
  let rec next_token_loc =
      parser bp
      [ [: `'%' ; _ = skip_to_eol; s :] -> next_token_loc s
      | [: `' ' | '\n' | '\r' | '\t' | '\026' | '\012'; s :] -> next_token_loc s
      | [: `'^' ; s :] -> let (tok,loc) = next_token_loc s in
                          match tok with [ INT n -> (INTS n,loc)
                                         | _ -> raise (Token.Error.E "+n")
                                         ] (* for Gillon's dislocated phrases *)
      | [: `'!' ; s :] -> let (tok,loc) = next_token_loc s in
                          match tok with [ INT n -> (INTS (-n),loc)
                                         | _ -> raise (Token.Error.E "-n")
                                         ] (* for Gillon's dislocation context *)
      | [: tok = next_token (Buffer.create 80) :] ep -> (tok, (bp, ep))
      | [: _ = Stream.empty :] -> (EOI, (bp, succ bp)) 
      ] in
  next_token_loc
  ;
value mk () =
  let err loc msg = Loc.raise loc (Token.Error.E msg) in
  fun init_loc cstrm -> Stream.from lexer
  where lexer _ = 
    try let (tok, (bp, ep)) = next_token_fun cstrm in
        let loc = Loc.move `start bp (Loc.move `stop ep init_loc) in
        Some (tok, loc)
    with [ Stream.Error str ->
            let bp = Stream.count cstrm in
            let loc = Loc.move `start bp (Loc.move `stop (bp+1) init_loc) in 
            err loc str ]
;
end;