(* mk_corpus.ml *)
(**************************************************************************)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Idir Lankri                               *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(* [abort report_error status] first runs [report_error ()] (expected to
   emit a diagnostic) and then terminates the program through
   [exit status]; it therefore never returns normally. *)
value abort report_error status =
  do { report_error (); exit status }
;
(* Compiled [Str] pattern recognising a citation macro: a literal backslash,
   the text [citation{], then group 1 (a greedy [.*]) and a closing [}]. *)
value citation_regexp =
  let pattern = "\\\\citation{\\(.*\\)}" in
  Str.regexp pattern
;
(* [extract_citation state save_sentence line line_no] processes one input
   line.

   If [line] matches [citation_regexp] at position 0, the text captured by
   group 1 is prepended to [state] as the pair [("text", captured)] and the
   resulting list is handed to [save_sentence].  Otherwise a message quoting
   [line_no] is printed on stderr and the program exits with status 1
   (through [abort]).

   The format check is a plain conditional instead of a catch-all exception
   handler wrapped around the whole body: an exception raised by
   [save_sentence] now propagates to the caller rather than being reported
   as a "Wrong input format" error. *)
value extract_citation state save_sentence line line_no =
  if Str.string_match citation_regexp line 0 then
    save_sentence [ ("text", Str.matched_group 1 line) :: state ]
  else
    abort (fun () ->
        Printf.eprintf
          "Line %d: \
           Wrong input format (expect one citation macro per line)\n" line_no
      ) 1
;
(* [populate_corpus dirname file] reads [file] line by line and passes each
   line, together with its 1-based line number, to [extract_citation].

   [dirname] is a string reference holding the destination directory.  When
   it is empty, an error is printed on stderr and the program exits with
   status 1.  Otherwise:
   - a relative destination is used as is, with an empty corpus path;
   - an absolute destination is split into its parent directory (used as
     the path given to [Corpus.Make]) and its base name;
   - one trailing directory separator is removed from the directory name;
   - [Corp.mkdir] is called on the directory name before any line is read
     (presumably creating it -- see the [Corpus] module).

   Only the call to [input_line] is guarded against [End_of_file].  This
   keeps the loop tail-recursive (no exception handler is stacked per line)
   and prevents an [End_of_file] raised while handling a line from being
   mistaken for the end of the input.  The input channel is closed on
   normal completion and also when an exception escapes. *)
value populate_corpus dirname file =
  if dirname.val <> "" then
    let ch = open_in file in
    let (corpus_location, dirname) =
      if Filename.is_relative dirname.val then
        ("", dirname.val)
      else
        (Filename.dirname dirname.val, Filename.basename dirname.val)
    in
    let module Corp = Corpus.Make (struct value path = corpus_location; end) in
    let dirname =
      if Filename.check_suffix dirname Filename.dir_sep then
        Filename.chop_suffix dirname Filename.dir_sep
      else
        dirname
    in
    (* Next line of the input, or [None] once the end of file is reached. *)
    let next_line () =
      try Some (input_line ch) with [ End_of_file -> None ]
    in
    let rec aux i =
      match next_line () with
      [ Some line ->
          let state =
            [ (Params.corpus_dir, dirname)
            ; (Params.sentence_no, string_of_int i)
            ; ("t", Paths.default_transliteration)
            ]
          in
          do
          { extract_citation state (Corp.save_sentence True Web.graph_cgi) line i
          ; aux (i + 1)
          }
      | None -> ()
      ]
    in
    try
      do
      { Corp.mkdir dirname
      ; aux 1
      ; close_in ch
      }
    with
    [ e ->
      (* Do not leak the channel when something fails; re-raise unchanged. *)
      do
      { close_in_noerr ch
      ; raise e
      }
    ]
  else
    abort (fun () ->
        Printf.eprintf
          "Please specify the destination directory.  \
           See %s --help.\n" (Filename.basename Sys.argv.(0))
      ) 1
;
(***************)
(* Entry point *)
(***************)
(* Program entry point, evaluated when the module is initialised.

   Parses the command line with [Arg.parse]: [-d] stores the destination
   directory in a reference, and every anonymous argument is treated as a
   citation file and passed to [populate_corpus].  Because anonymous
   arguments are handled as they are encountered, [-d] has to appear before
   the citation file, as shown in the usage message.

   If no citation file is given at all, the usage text is printed on stderr
   and the program exits with status 1 instead of silently doing nothing. *)
value main =
  let dirname = ref "" in
  (* Set as soon as one anonymous argument has been handled. *)
  let file_seen = ref False in
  let opts =
    Arg.align
      [ ("-d", Arg.Set_string dirname,
         " Specify the destination directory") ]
  in
  let usage_msg =
    Filename.basename Sys.argv.(0) ^ " -d <dest_dir> <citation_file>"
  in
  let handle_file file =
    do
    { file_seen.val := True
    ; populate_corpus dirname file
    }
  in
  do
  { Arg.parse opts handle_file usage_msg
  ; if file_seen.val then ()
    else abort (fun () -> Arg.usage opts usage_msg) 1
  }
;