segment.mll 4.71 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
(******************************************************************************)
(*                                                                            *)
(*                                   Menhir                                   *)
(*                                                                            *)
(*                       François Pottier, Inria Paris                        *)
(*              Yann Régis-Gianas, PPS, Université Paris Diderot              *)
(*                                                                            *)
(*  Copyright Inria. All rights reserved. This file is distributed under the  *)
(*  terms of the GNU General Public License version 2, as described in the    *)
(*  file LICENSE.                                                             *)
(*                                                                            *)
(******************************************************************************)

14 15 16 17 18 19 20 21 22 23
(* This lexer is used to cut an input into segments, delimited by a blank
   line. (More precisely, by a run of at least one blank line and zero or more
   comment lines.) It produces a list of segments, where each segment is
   represented as a pair of positions. It is stand-alone and cannot fail. *)

(* The whitespace in between two segments can contain comments, and the user
   may wish to preserve them. For this reason, we view a run of whitespace as
   a segment, too, and we accompany each segment with a tag which is either
   [Segment] or [Whitespace]. The two kinds of segments must alternate in the
   list that we produce. *)
24 25 26

{

27 28 29 30
  type tag =
    | Segment
    | Whitespace

31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
  open Lexing

}

let newline    = ('\010' | '\013' | "\013\010")

let whitespace = [ ' ' '\t' ';' ]

let comment    = '#' [^'\010''\013']* newline

(* In the idle state, we skip whitespace, newlines and comments
   (while updating the liner counter). If we reach the end of file,
   we return the list of all segments found so far. If we reach a
   non-blank non-comment character, we record its position and
   switch to the busy state. *)

47
rule idle opening segments = parse
48
| whitespace
49
    { idle opening segments lexbuf }
50
| newline
51
    { new_line lexbuf; idle opening segments lexbuf }
52
| comment
53
    { new_line lexbuf; idle opening segments lexbuf }
54
| eof
55 56 57 58
    { let closing = lexbuf.lex_start_p in
      let segment = Whitespace, opening, closing in
      let segments = segment :: segments in
      List.rev segments }
59
| _
60 61 62 63
    { let closing = lexbuf.lex_start_p in
      let segment = Whitespace, opening, closing in
      let segments = segment :: segments in
      let opening = closing in
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
      busy segments opening false lexbuf }

(* In the busy state, we skip everything, maintaining one bit
   [just_saw_a_newline], until [just_saw_a_newline] is true
   and we find a second newline. This marks the end of a
   segment, and we revert back to the idle state. If we
   reach the end of file, we consider that this is also
   the end of a segment. *)

and busy segments opening just_saw_a_newline = parse
| whitespace
    { busy segments opening just_saw_a_newline lexbuf }
| newline
    { new_line lexbuf;
      (* The newline that we just saw is already included in the segment.
         This one is not included. *)
      let closing = lexbuf.lex_start_p in
      if just_saw_a_newline then
82
        let segment = Segment, opening, closing in
83
        let segments = segment :: segments in
84 85
        let opening = closing in
        idle opening segments lexbuf
86 87 88 89
      else
        busy segments opening true lexbuf }
| eof
    { let closing = lexbuf.lex_start_p in
90
      let segment = Segment, opening, closing in
91 92 93 94 95
      let segments = segment :: segments in
      List.rev segments }
| _
    { busy segments opening false lexbuf }

96 97 98 99 100 101
{

  (* This wrapper function reads a file, cuts it into segments, and
     creates a fresh lexbuf for each segment, taking care to adjust
     its start position. *)

102
  let segment filename : (tag * string * lexbuf) list =
103 104 105
    let content = IO.read_whole_file filename in
    let lexbuf = from_string content in
    lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with pos_fname = filename };
106 107 108 109
    let segments : (tag * position * position) list =
      idle lexbuf.lex_curr_p [] lexbuf
    in
    List.map (fun (tag, startp, endp) ->
110 111 112 113 114 115 116 117 118
      let start = startp.pos_cnum in
      let length = endp.pos_cnum - start in
      let content = String.sub content start length in
      let lexbuf = from_string content in
      lexbuf.lex_start_p <- startp;
      lexbuf.lex_curr_p <- startp;
      lexbuf.lex_abs_pos <- startp.pos_cnum;
        (* That was tricky to find out. See [Lexing.engine]. [pos_cnum] is
           updated based on [buf.lex_abs_pos + buf.lex_curr_pos]. *)
119
      tag, content, lexbuf
120
    ) segments
121

122 123
}