segment.mll 2.97 KB
Newer Older
POTTIER Francois's avatar
POTTIER Francois committed
1 2 3
(* This lexer is used to cut an input into segments, delimited by a
   blank line. (More precisely, by a run of at least one blank lines
   and zero or more comment lines.) It produces a list of segments,
POTTIER Francois's avatar
POTTIER Francois committed
4 5
   where each segment is represented as a pair of positions. It is
   stand-alone and cannot fail. *)
POTTIER Francois's avatar
POTTIER Francois committed
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66

{

  open Lexing

}

let newline    = ('\010' | '\013' | "\013\010")

let whitespace = [ ' ' '\t' ';' ]

let comment    = '#' [^'\010''\013']* newline

(* In the idle state, we skip whitespace, newlines and comments
   (while updating the liner counter). If we reach the end of file,
   we return the list of all segments found so far. If we reach a
   non-blank non-comment character, we record its position and
   switch to the busy state. *)

rule idle segments = parse
| whitespace
    { idle segments lexbuf }
| newline
    { new_line lexbuf; idle segments lexbuf }
| comment
    { new_line lexbuf; idle segments lexbuf }
| eof
    { List.rev segments }
| _
    { let opening = lexbuf.lex_start_p in
      busy segments opening false lexbuf }

(* In the busy state, we skip everything, maintaining one bit
   [just_saw_a_newline], until [just_saw_a_newline] is true
   and we find a second newline. This marks the end of a
   segment, and we revert back to the idle state. If we
   reach the end of file, we consider that this is also
   the end of a segment. *)

and busy segments opening just_saw_a_newline = parse
| whitespace
    { busy segments opening just_saw_a_newline lexbuf }
| newline
    { new_line lexbuf;
      (* The newline that we just saw is already included in the segment.
         This one is not included. *)
      let closing = lexbuf.lex_start_p in
      if just_saw_a_newline then
        let segment = (opening, closing) in
        let segments = segment :: segments in
        idle segments lexbuf
      else
        busy segments opening true lexbuf }
| eof
    { let closing = lexbuf.lex_start_p in
      let segment = (opening, closing) in
      let segments = segment :: segments in
      List.rev segments }
| _
    { busy segments opening false lexbuf }

POTTIER Francois's avatar
POTTIER Francois committed
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
{

  (* This wrapper function reads a file, cuts it into segments, and
     creates a fresh lexbuf for each segment, taking care to adjust
     its start position. *)

  let segment filename : (string * lexbuf) list =
    let content = IO.read_whole_file filename in
    let lexbuf = from_string content in
    lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = filename };
    let segments = idle [] lexbuf in
    List.map (fun (startp, endp) ->
      let start = startp.pos_cnum in
      let length = endp.pos_cnum - start in
      let content = String.sub content start length in
      let lexbuf = from_string content in
      lexbuf.lex_start_p <- startp;
      lexbuf.lex_curr_p <- startp;
      lexbuf.lex_abs_pos <- startp.pos_cnum;
        (* That was tricky to find out. See [Lexing.engine]. [pos_cnum] is
           updated based on [buf.lex_abs_pos + buf.lex_curr_pos]. *)
      content, lexbuf
    ) segments
      
}