grew_lexer.mll 8.01 KB
Newer Older
bguillaum's avatar
bguillaum committed
1 2 3 4 5 6 7 8 9
(**********************************************************************************)
(*    Libcaml-grew - a Graph Rewriting library dedicated to NLP applications      *)
(*                                                                                *)
(*    Copyright 2011-2013 Inria, Université de Lorraine                           *)
(*                                                                                *)
(*    Webpage: http://grew.loria.fr                                               *)
(*    License: CeCILL (see LICENSE folder or "http://www.cecill.info")            *)
(*    Authors: see AUTHORS file                                                   *)
(**********************************************************************************)
pj2m's avatar
pj2m committed
10 11

{
12
  open Printf
13
  open Log
bguillaum's avatar
bguillaum committed
14
  open Grew_base
bguillaum's avatar
bguillaum committed
15
  open Grew_ast
bguillaum's avatar
bguillaum committed
16
  open Grew_parser
pj2m's avatar
pj2m committed
17 18

  (* Raised by the catch-all cases of [label_parser] and [standard] when the next
     character matches no other case; the payload is the error message. *)
  exception Error of string
19

pj2m's avatar
pj2m committed
20
  (* Escape state of [string_lex]: set when a backslash has been read and is still
     waiting for the character it applies to. *)
  let escaped = ref false
21

bguillaum's avatar
bguillaum committed
22 23
  (* [split_comment com] cuts [com] at every newline and returns the resulting lines in
     their original order, leaving out each line that is empty or made only of spaces
     and tabs.
     No regular expression is needed: empty fields produced by the split are blank
     lines, so they are discarded by the same filter. *)
  let split_comment com =
    let is_blank line =
      let len = String.length line in
      let rec only_blanks i =
        i >= len || ((line.[i] = ' ' || line.[i] = '\t') && only_blanks (i + 1)) in
      only_blanks 0 in
    List.filter (fun line -> not (is_blank line)) (String.split_on_char '\n' com)
bguillaum's avatar
bguillaum committed
25

26
  (* Scratch buffer filled by [string_lex] and [lp_lex]; the cases that enter those
     lexers clear it first. *)
  let buff = Buffer.create 32
pj2m's avatar
pj2m committed
27 28 29 30
}

(* A single ASCII decimal digit. *)
let digit = ['0'-'9']
(* A single unaccented ASCII letter, lower or upper case. *)
let letter = ['a'-'z' 'A'-'Z']
31 32 33 34 35 36 37 38 39 40

(* a general_ident is an arbitrary sequence of:
   - letter
   - digit
   - underscore '_'
   - dash '-'
  for basic ident construction and
   - dot '.'
   - colon ':'
   - star '*'
bguillaum's avatar
bguillaum committed
41
  The first character cannot be a digit, or a colon (to avoid confusion).
42 43 44 45 46
 *)
(* Identifier form recognised by [label_parser].
   First character: a letter, underscore, dash, dot or star.
   Following characters: the same, plus digits, the single quote and the colon. *)
let label_ident =
  (letter | ['_' '-' '.' '*']) (letter | digit | ['_' '\'' '-' '.' ':' '*'])*

(* Identifier form recognised by [standard]: either a single letter or underscore, or a
   longer name that starts with a letter, underscore or dot and ends with a letter,
   digit, underscore, single quote or dot. A dash is accepted only strictly inside. *)
let general_ident =
    (letter | '_')
  | (letter | ['_' '.']) (letter | digit | ['_' '\'' '-' '.'])* (letter | digit | ['_' '\'' '.'])
pj2m's avatar
pj2m committed
49

50
(* A single hexadecimal digit, either case. *)
let hex = ['0'-'9' 'a'-'f' 'A'-'F']
bguillaum's avatar
bguillaum committed
51
(* Body of a color code: three hexadecimal digits, optionally followed by three more. *)
let color = hex hex hex (hex hex hex)?
52

53 54 55 56 57

(* ------------------------------------------------------------------------------- *)
(* Rules                                                                           *)
(* ------------------------------------------------------------------------------- *)

pj2m's avatar
pj2m committed
58
(* Skips the rest of a one-line comment. At the end of the line the line counters are
   advanced and lexing resumes with [target]; at the end of input [EOF] is returned. *)
rule comment target = parse
| eof        { EOF }
| '\n'       { incr Global.current_line;
               Lexing.new_line lexbuf;
               target lexbuf }
| [^ '\n']   { comment target lexbuf }
pj2m's avatar
pj2m committed
62 63 64

(* Lexes the body of a documentation comment, entered from [standard] once "%--" has
   been read: takes the shortest text that ends with "--%".
   Each newline found in that text advances both [Global.current_line] and the position
   recorded in [lexbuf]. The result is [COMMENT] applied to the non-blank lines of the
   text, as computed by [split_comment]. [target] is not used. *)
and comment_multi_doc target = shortest
| (_* as comment) "--%" {
    String.iter
      (function
        | '\n' -> incr Global.current_line; Lexing.new_line lexbuf
        | _ -> ())
      comment;
    COMMENT (split_comment comment)
  }
74

pj2m's avatar
pj2m committed
75 76
(* Skips the body of a comment opened by "/*": every character is dropped, newlines
   advance the line counters, and the closing "*/" hands control back to [target]. *)
and comment_multi target = parse
| '\n'       { incr Global.current_line;
               Lexing.new_line lexbuf;
               comment_multi target lexbuf }
| "*/"       { target lexbuf }
| [^ '\n']   { comment_multi target lexbuf }
79

80
(* Reads the body of a string literal into [buff], the opening quote having been read by
   the calling case. An unescaped double quote ends the literal and yields
   [REGEXP contents] when [re] is true, [STRING contents] otherwise.
   Escapes, tracked in [escaped]: backslash-backslash and backslash-quote stand for the
   second character alone; a backslash before any other ordinary character is kept
   together with that character. [target] is only passed along to recursive calls. *)
and string_lex re target = parse
  | '\n' {
      (* a newline is stored as is and leaves [escaped] untouched *)
      incr Global.current_line;
      Lexing.new_line lexbuf;
      Buffer.add_char buff '\n';
      string_lex re target lexbuf
    }
  | '\\' {
      (* second backslash of a pair: store one backslash; otherwise just remember it *)
      if !escaped then Buffer.add_char buff '\\';
      escaped := not !escaped;
      string_lex re target lexbuf
    }
  | '\"' {
      (match !escaped with
       | true ->
         escaped := false;
         Buffer.add_char buff '"';
         string_lex re target lexbuf
       | false ->
         let contents = Buffer.contents buff in
         if re then REGEXP contents else STRING contents)
    }
  | _ as c {
      if !escaped then begin
        escaped := false;
        Buffer.add_char buff '\\'
      end;
      Buffer.add_char buff c;
      string_lex re target lexbuf
    }

(* A dedicated lexer for lexical parameters: reads everything until "#END".
   Characters are accumulated in [buff] (cleared by the case that enters this lexer).
   When "#END", optional spaces or tabs and a newline are read, the buffer contents are
   split at newlines and returned as [LEX_PAR]. [target] is only passed along to
   recursive calls. *)
and lp_lex target = parse
| '\n'                    { incr Global.current_line; Lexing.new_line lexbuf; bprintf buff "\n"; lp_lex target lexbuf }
| _ as c                  { bprintf buff "%c" c; lp_lex target lexbuf }
| "#END" [' ' '\t']* '\n' {
    (* the newline ending the "#END" line must advance [lexbuf] as well as the global
       counter, like every other newline, so that both stay in step *)
    incr Global.current_line;
    Lexing.new_line lexbuf;
    LEX_PAR (Str.split (Str.regexp "\n") (Buffer.contents buff))
  }
104

105
(* The lexer must be different when label_ident are parsed. The [global] lexer calls either
bguillaum's avatar
bguillaum committed
106
   [label_parser] or [standard] depending on the flag [Global.label_flag].
107 108 109
   Differences are:
   - a label_ident may contain ':' (like in D:suj:obj) and ':' is a token elsewhere
   - a label_ident may contain '-' anywhere (like "--" in Tiger) but '-' is forbidden as the first or last character elsewhere
110
   - the string "*" is lexed as ID by [label_parser] and as STAR by [standard]
111
*)
pj2m's avatar
pj2m committed
112
(* Entry point of the lexer. Consumes no input: it only selects the lexer for the next
   token, [label_parser] when [Global.label_flag] is set and [standard] otherwise, and
   passes itself as the continuation. *)
and global = parse
| "" {
    let delegate = if !Global.label_flag then label_parser else standard in
    delegate global lexbuf
  }


(* Lexer selected by [global] while [Global.label_flag] is set. Identifiers follow
   [label_ident] here. [target] is not used: every case resumes through [global].
   NOTE(review): there is no eof case in this lexer, unlike [standard]. *)
and label_parser target = parse
(* blanks, comments and line ends produce no token *)
| ' ' | '\t'  { global lexbuf }
| '\n'        { incr Global.current_line; Lexing.new_line lexbuf; global lexbuf }
| '%'         { comment global lexbuf }
| "/*"        { comment_multi global lexbuf }

(* tokens that close the label context: they reset the flag *)
| "]->"  { Global.label_flag := false; LTR_EDGE_RIGHT }
| "]-"   { Global.label_flag := false; RTL_EDGE_RIGHT }
| "]=>"  { Global.label_flag := false; ARROW_RIGHT }
| '}'    { Global.label_flag := false; RACC }

| '{'    { LACC }
| ','    { COMA }
| '|'    { PIPE }

(* the text carried by these tokens includes the leading "@#" or '@' *)
| "@#" color as col             { COLOR col }
| '@' general_ident as cmd_var  { AROBAS_ID cmd_var }
| label_ident as id             { ID id }

| "re\""  { Buffer.clear buff; string_lex true global lexbuf }
| '"'     { Buffer.clear buff; string_lex false global lexbuf }

(* must stay last: it ties with every one-character case above *)
| _ as c  { raise (Error (sprintf "unexpected character '%c'" c)) }
142 143

(* Lexer selected by [global] while [Global.label_flag] is unset. [target] is not used:
   cases that produce no token resume through [global].
   Order matters only between cases matching the same length of input, where the first
   one wins: keywords must stay above the [general_ident] case and the catch-all must
   stay last. *)
and standard target = parse
| [' ' '\t'] { global lexbuf }

| "%--"      { comment_multi_doc global lexbuf }
| "/*"       { comment_multi global lexbuf }
| '%'        { comment global lexbuf }

(* the newline ending the "#BEGIN" line advances both line counters, like any other *)
| "#BEGIN" [' ' '\t']* '\n' { incr Global.current_line; Lexing.new_line lexbuf; Buffer.clear buff; lp_lex global lexbuf}

| '\n'       { incr Global.current_line; Lexing.new_line lexbuf; global lexbuf}

| "include"     { INCL }
| "domain"      { DOMAIN }
| "features"    { FEATURES }
| "feature"     { FEATURE }
| "file"        { FILE }
| "labels"      { Global.label_flag := true; LABELS }
| "match"       { MATCH }
| "without"     { WITHOUT }
| "commands"    { COMMANDS }

| "add_edge"    { ADD_EDGE }
| "del_edge"    { DEL_EDGE }
| "shift_in"    { SHIFT_IN }
| "shift_out"   { SHIFT_OUT }
| "shift"       { SHIFT }
| "merge"       { MERGE }
| "del_node"    { DEL_NODE }
| "add_node"    { ADD_NODE }
| "del_feat"    { DEL_FEAT }

| "module"      { MODULE }
| "confluent"   { CONFLUENT }
| "rule"        { RULE }
| "lex_rule"    { Log.fwarning "[file %s, line %d]\"lex_rule\" is deprecated, please use \"rule\" instead" ! Global.current_file !Global.current_line; RULE }
| "filter"      { FILTER }
| "sequences"   { SEQUENCES }

| "graph"       { GRAPH }

| digit+ ('.' digit*)? as number  { FLOAT (float_of_string number) }

(* the text carried by these tokens includes the leading '$', '@' or "@#" *)
| '$' general_ident as pat_var     { DOLLAR_ID pat_var}
| '@' general_ident as cmd_var     { AROBAS_ID cmd_var }
| "@#" color as col        { COLOR col }

| '*'   { STAR }
| general_ident as id { ID id }

| '{'   { LACC }
| '}'   { RACC }
| '['   { LBRACKET }
| ']'   { RBRACKET }
| '('   { LPAREN }
| ')'   { RPAREN }
| ':'   { DDOT }
| ';'   { SEMIC }
| ','   { COMA }
| '+'   { PLUS }
| '#'   { SHARP }
| '='   { EQUAL }
| "!"   { BANG }
| "<>"  { DISEQUAL }

| "<<"       { LPREC }
| ">>"       { LSUCC }

| ":<"         { BEFORE }
| ":>"         { AFTER }

| "<"        { LT }
| ">"        { GT }
| "<=" | "≤" { LE }
| ">=" | "≥" { GE }

| '|'   { PIPE }
| "->"  { EDGE }
(* tokens opening a label set the flag, so that [global] selects [label_parser] next *)
| "-[^" { Global.label_flag := true; LTR_EDGE_LEFT_NEG }
| "-["  { Global.label_flag := true; LTR_EDGE_LEFT }
| "]->" { LTR_EDGE_RIGHT }
| "<-[" { Global.label_flag := true; RTL_EDGE_LEFT }
| "]-"  { RTL_EDGE_RIGHT }

| "==>" { ARROW }
| "=["  { Global.label_flag := true; ARROW_LEFT }
| "=[^" { Global.label_flag := true; ARROW_LEFT_NEG }
| "]=>" { ARROW_RIGHT }

| '"'      { Buffer.clear buff; string_lex false global lexbuf }
| "re\""   { Buffer.clear buff; string_lex true global lexbuf }

| eof   { EOF }
(* single catch-all: any character that no case above accepts *)
| _ as c { raise (Error (sprintf "unexpected character '%c'" c)) }

(* Lexer producing [LPAREN], [RPAREN], [SENT] and [ID] tokens; spaces, tabs and newlines
   separate tokens and are skipped. Each token is also traced on standard output
   (NOTE(review): these printf calls look like debugging output - confirm before
   removing them).
   An ID is a maximal run of characters other than parentheses and the blanks skipped
   above, so that a tab or a newline ends an ID instead of becoming part of it and every
   newline reaches the case that advances the line counters. *)
and const = parse
  | [' ' '\t']        { const lexbuf }
  | '\n'              { incr Global.current_line; Lexing.new_line lexbuf; const lexbuf }
  | '('               { printf ">>>LPAREN<<<\n%!"; LPAREN }
  | ')'               { printf ">>>RPAREN<<<\n%!"; RPAREN }
  | "SENT"            { printf ">>>SENT<<<\n%!"; SENT }
  | [^'(' ')' ' ' '\t' '\n']+ as id { printf "ID=>>>%s<<<\n%!" id; ID id }