segment.mll 3.76 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
(* This lexer is used to cut an input into segments, delimited by a blank
   line. (More precisely, by a run of at least one blank line and zero or more
   comment lines.) It produces a list of segments, where each segment is
   represented as a pair of positions. It is stand-alone and cannot fail. *)

(* The whitespace in between two segments can contain comments, and the user
   may wish to preserve them. For this reason, we view a run of whitespace as
   a segment, too, and we accompany each segment with a tag which is either
   [Segment] or [Whitespace]. The two kinds of segments must alternate in the
   list that we produce. *)
11 12 13

{

14 15 16 17
  type tag =
    | Segment
    | Whitespace

18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
  open Lexing

}

let newline    = ('\010' | '\013' | "\013\010")

let whitespace = [ ' ' '\t' ';' ]

let comment    = '#' [^'\010''\013']* newline

(* In the idle state, we skip whitespace, newlines and comments
   (while updating the liner counter). If we reach the end of file,
   we return the list of all segments found so far. If we reach a
   non-blank non-comment character, we record its position and
   switch to the busy state. *)

34
rule idle opening segments = parse
35
| whitespace
36
    { idle opening segments lexbuf }
37
| newline
38
    { new_line lexbuf; idle opening segments lexbuf }
39
| comment
40
    { new_line lexbuf; idle opening segments lexbuf }
41
| eof
42 43 44 45
    { let closing = lexbuf.lex_start_p in
      let segment = Whitespace, opening, closing in
      let segments = segment :: segments in
      List.rev segments }
46
| _
47 48 49 50
    { let closing = lexbuf.lex_start_p in
      let segment = Whitespace, opening, closing in
      let segments = segment :: segments in
      let opening = closing in
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
      busy segments opening false lexbuf }

(* In the busy state, we skip everything, maintaining one bit
   [just_saw_a_newline], until [just_saw_a_newline] is true
   and we find a second newline. This marks the end of a
   segment, and we revert back to the idle state. If we
   reach the end of file, we consider that this is also
   the end of a segment. *)

and busy segments opening just_saw_a_newline = parse
| whitespace
    { busy segments opening just_saw_a_newline lexbuf }
| newline
    { new_line lexbuf;
      (* The newline that we just saw is already included in the segment.
         This one is not included. *)
      let closing = lexbuf.lex_start_p in
      if just_saw_a_newline then
69
        let segment = Segment, opening, closing in
70
        let segments = segment :: segments in
71 72
        let opening = closing in
        idle opening segments lexbuf
73 74 75 76
      else
        busy segments opening true lexbuf }
| eof
    { let closing = lexbuf.lex_start_p in
77
      let segment = Segment, opening, closing in
78 79 80 81 82
      let segments = segment :: segments in
      List.rev segments }
| _
    { busy segments opening false lexbuf }

POTTIER Francois's avatar
POTTIER Francois committed
83 84 85 86 87 88
{

  (* This wrapper function reads a file, cuts it into segments, and
     creates a fresh lexbuf for each segment, taking care to adjust
     its start position. *)

89
  let segment filename : (tag * string * lexbuf) list =
POTTIER Francois's avatar
POTTIER Francois committed
90 91 92
    let content = IO.read_whole_file filename in
    let lexbuf = from_string content in
    lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = filename };
93 94 95 96
    let segments : (tag * position * position) list =
      idle lexbuf.lex_curr_p [] lexbuf
    in
    List.map (fun (tag, startp, endp) ->
POTTIER Francois's avatar
POTTIER Francois committed
97 98 99 100 101 102 103 104 105
      let start = startp.pos_cnum in
      let length = endp.pos_cnum - start in
      let content = String.sub content start length in
      let lexbuf = from_string content in
      lexbuf.lex_start_p <- startp;
      lexbuf.lex_curr_p <- startp;
      lexbuf.lex_abs_pos <- startp.pos_cnum;
        (* That was tricky to find out. See [Lexing.engine]. [pos_cnum] is
           updated based on [buf.lex_abs_pos + buf.lex_curr_pos]. *)
106
      tag, content, lexbuf
POTTIER Francois's avatar
POTTIER Francois committed
107 108 109 110
    ) segments
      
}