grew_corpus.ml 20.8 KB
Newer Older
Bruno Guillaume's avatar
Bruno Guillaume committed
1
2
3
4
5
6
7
8
9
10
(**********************************************************************************)
(*    Libcaml-grew - a Graph Rewriting library dedicated to NLP applications      *)
(*                                                                                *)
(*    Copyright 2011-2021 Inria, Université de Lorraine                           *)
(*                                                                                *)
(*    Webpage: https://grew.fr                                                     *)
(*    License: CeCILL (see LICENSE folder or "http://cecill.info/")            *)
(*    Authors: see AUTHORS file                                                   *)
(**********************************************************************************)

11
open Printf
12
open Conllx
13
14
15
16
open Libamr

open Grew_base
open Grew_loader
17
open Grew_edge
18
19
20
21
22
open Grew_graph
open Grew_grs

(* ==================================================================================================== *)
module Pst_corpus = struct
23
  let load_files files =
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
    let sub_corp = List.map
        (fun file ->
           let line_list = File.read file in
           List.mapi
             (fun i line ->
                match Str.split (Str.regexp "\t") line with
                | [pst] -> (sprintf "%s_%05d" file (i+1), pst)
                | [id; pst] -> (id, pst)
                | _ -> failwith "Pst syntax error"
             ) line_list
        ) files in
    Array.of_list (List.flatten sub_corp)
end

(* ==================================================================================================== *)
module Corpus = struct
Bruno Guillaume's avatar
Bruno Guillaume committed
40
  type kind = Conll | Pst | Amr | Gr | Json | Dmrs
41
42
43
44
45
46
47
48
49

  type item = {
    sent_id: string;
    text: string;
    graph: G_graph.t;
  }

  type t = {
    items: item array;
50
    kind: kind;
51
52
  }

Bruno Guillaume's avatar
Bruno Guillaume committed
53
54
55
56
57
  let graph_of_sent_id sent_id corpus =
    match CCArray.find_idx (fun item -> item.sent_id = sent_id) corpus.items with
    | Some (_,item) -> Some item.graph
    | None -> None

Bruno Guillaume's avatar
Bruno Guillaume committed
58
59
60
61
62
63
64
65
66
67
  let item_of_graph graph =
    let sent_id =
      match G_graph.get_meta_opt "sent_id" graph with
      | Some s -> s
      | None -> "_" in
    let text =
      match G_graph.get_meta_opt "text" graph with
      | Some s -> s
      | None -> "_" in
    { sent_id; text; graph }
68

69
70
71
72
  let merge = function
    | [] -> Error.bug "Empty list in Corpus.merge"
    | [one] -> one
    | h::t ->
Bruno Guillaume's avatar
Bruno Guillaume committed
73
      if List.exists (fun t -> t.kind <> h.kind) t
74
75
76
77
78
79
80
81
      then Error.run "Cannot merge corpora with incompatible kinds"
      else {h with items = Array.concat (List.map (fun t -> t.items) (h::t)) }

  let of_conllx_corpus conllx_corpus =
    let items =
      Array.map
        (fun (sent_id, conllx) ->
           let text = match List.assoc_opt "text" (Conllx.get_meta conllx) with Some t -> t | None -> "__missing text metadata__" in
82
           let graph = conllx |> Conllx.to_json |> G_graph.of_json in
83
84
           { sent_id; text; graph }
        ) (Conllx_corpus.get_data conllx_corpus) in
Bruno Guillaume's avatar
Bruno Guillaume committed
85
    { kind = Conll; items }
86

Bruno Guillaume's avatar
Bruno Guillaume committed
87
88
89
90
91
92
  let of_amr_file file =
    try
      let amr_corpus = Amr_corpus.load file in
      let items =
        Array.map
          (fun (sent_id, amr) ->
Bruno Guillaume's avatar
Bruno Guillaume committed
93
             let json = Amr.to_json ~unfold:true amr in
Bruno Guillaume's avatar
Bruno Guillaume committed
94
95
96
97
98
99
             let graph = G_graph.of_json json in
             let text = match G_graph.get_meta_opt "text" graph with Some t -> t | None -> "__missing text metadata__" in
             { sent_id; text; graph }
          ) amr_corpus in
      { kind=Amr; items; }
    with Amr.Error msg -> Error.build "Amr error in file `%s`: %s" file msg
100

101
102
  let fold_left fct init t =
    Array.fold_left
103
      (fun acc item -> fct acc item.sent_id item.graph)
104
105
      init t.items

106
107
108
109
110
111
  let fold_right fct t init =
    Array.fold_right
      (fun item acc -> fct item.sent_id item.graph acc)
      t.items init


112
113
114
115
  let iteri fct t = Array.iteri (fun i item -> fct i item.sent_id item.graph) t.items



116
117
118
119
120
  let size t = Array.length t.items

  let get_graph position t = t.items.(position).graph
  let get_sent_id position t = t.items.(position).sent_id

Bruno Guillaume's avatar
Bruno Guillaume committed
121
  let is_conll t = match t.kind with
Bruno Guillaume's avatar
Bruno Guillaume committed
122
123
    | Conll | Dmrs -> true
    | _ -> false
124
125
126
127
128
129
130
131
132
133
134
135
136

  let get_text position t = t.items.(position).text


  let permut_length t =
    let items_with_length =
      Array.mapi
        (fun i item -> (i,G_graph.size item.graph)) t.items in
    let _ = Array.sort
        (fun (_,s1) (_,s2) -> Stdlib.compare s1 s2)
        items_with_length in
    Array.map fst items_with_length

137
138
139
140
141
142
143
144
145
146
147
148
  let from_json ?loc json =
    try
      match json with
      | `List jsons ->
        Array.of_list (
          List.map (fun json -> json |> G_graph.of_json |> item_of_graph) jsons
        )
      | json -> [| json |> G_graph.of_json |> item_of_graph |]
    with
    | Yojson.Json_error msg -> Error.run ?loc "Error in the JSON file format: %s" msg


149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
  let from_stdin ?ext ?log_file ?config () =
    match ext with
    | Some ".json" -> 
      let s = CCIO.read_all stdin in
      { kind=Json; items = from_json (Yojson.Basic.from_string s)}
    | Some ".conll" | Some ".conllu" | Some ".cupt" | Some ".orfeo" | Some ".frsemcor" 
    | _ -> (* TODO: use Conll by default --> more robust stuff needed *)
      let lines = CCIO.read_lines_l stdin in
      of_conllx_corpus (Conllx_corpus.of_lines ?log_file ?config lines)

  let from_string ?ext ?log_file ?config s =
    match ext with
    | Some ".json" -> { kind=Json; items = from_json (Yojson.Basic.from_string s)}
    | Some ".conll" | Some ".conllu" | Some ".cupt" | Some ".orfeo" | Some ".frsemcor" 
    | _ -> (* TODO: use Conll by default --> more robust stuff needed *)
      let lines = Str.split (Str.regexp "\n") s in
      of_conllx_corpus (Conllx_corpus.of_lines ?log_file ?config lines)

  let from_file ?ext ?log_file ?config file =
    let extension = match ext with Some e -> e | None -> Filename.extension file in
    match extension with
Bruno Guillaume's avatar
Bruno Guillaume committed
170
    | ".conll" | ".conllu" | ".cupt" | ".orfeo" | ".frsemcor" ->
Bruno Guillaume's avatar
Bruno Guillaume committed
171
      of_conllx_corpus (Conllx_corpus.load ?log_file ?config file)
172
    | ".amr" | ".txt" ->
Bruno Guillaume's avatar
Bruno Guillaume committed
173
      of_amr_file file
Bruno Guillaume's avatar
Bruno Guillaume committed
174
175
176
    | ".gr" ->
      let item = Loader.gr file |> G_graph.of_ast ~config:(Conllx_config.build "basic") |> item_of_graph in
      { kind= Gr; items = [| item |] }
177
    | ".json" ->
Bruno Guillaume's avatar
Bruno Guillaume committed
178
      begin
179
180
        try { kind=Json; items = from_json ~loc: (Loc.file file) (Yojson.Basic.from_file file)}
        with Yojson.Json_error msg -> Error.run ~loc:(Loc.file file) "Error in the JSON file format: %s" msg
Bruno Guillaume's avatar
Bruno Guillaume committed
181
      end
182
183
184
185
    | ".melt" | ".brown" ->
      let lines = File.read file in
      let config = match config with Some c -> c | None -> Conllx_config.build "ud" in
      let items = List.mapi (fun i line -> {
Bruno Guillaume's avatar
Bruno Guillaume committed
186
187
            sent_id= sprintf "%05d" (i + 1);
            text= "__No_text__";
188
            graph= G_graph.of_brown ~config line;
Bruno Guillaume's avatar
Bruno Guillaume committed
189
          }) lines |> Array.of_list in
Bruno Guillaume's avatar
Bruno Guillaume committed
190
      { items; kind=Conll }
191
192
    | ext -> Error.run "Cannot load file `%s`, unknown extension `%s`" file ext

Bruno Guillaume's avatar
Bruno Guillaume committed
193
  let from_dir ?log_file ?config dir =
194
195
196
197
198
199
200
201
202
203
204
205
206
207
    let files = Sys.readdir dir in
    let (conll_files, amr_files, txt_files) =
      Array.fold_right
        (fun file (conll_acc, amr_acc, txt_acc) ->
           match Filename.extension file with
           | ".conll" | ".conllu" | ".cupt" | ".orfeo" -> (file::conll_acc, amr_acc, txt_acc)
           | ".amr" -> (conll_acc, file::amr_acc, txt_acc)
           | ".txt" -> (conll_acc, amr_acc, file::txt_acc)
           | _ -> (conll_acc, amr_acc, txt_acc)
        ) files ([],[],[]) in

    (* txt files are interpreted as AMR files only if there is no conll-like files (eg: UD containts txt files in parallel to conllu) *)
    match (conll_files, amr_files, txt_files) with
    | ([],[],[]) -> Error.run "The directory `%s` does not contain any graphs" dir
Bruno Guillaume's avatar
Bruno Guillaume committed
208
    | (conll_files,[],_) -> of_conllx_corpus (Conllx_corpus.load_list ?log_file ?config conll_files)
Bruno Guillaume's avatar
Bruno Guillaume committed
209
    | ([],amr_files, txt_files) -> (amr_files @ txt_files) |> List.map of_amr_file |> merge
210
    | _ -> Error.run "The directory `%s` contains both Conll data and Amr data" dir
211
212
end

Bruno Guillaume's avatar
Bruno Guillaume committed
213
(* ==================================================================================================== *)
214
215
216
217
module Corpus_desc = struct

  type t = {
    id: string;
218
    lang: string option;
Bruno Guillaume's avatar
Bruno Guillaume committed
219
    kind: Corpus.kind;
220
    config: Conllx_config.t; (* "ud" is used as the default: TODO make config mandatory in desc? *)
Bruno Guillaume's avatar
Bruno Guillaume committed
221
    columns: Conllx_columns.t option;
222
223
224
225
226
227
228
229
    directory: string;
    files: string list;
    rtl: bool;
    audio: bool;
    preapply: string option;
  }

  let get_id corpus_desc = corpus_desc.id
230
  let get_lang_opt corpus_desc = corpus_desc.lang
Bruno Guillaume's avatar
Bruno Guillaume committed
231
  let get_config corpus_desc = corpus_desc.config
232
233
234
235
236
237
238
  let get_directory corpus_desc = corpus_desc.directory
  let is_rtl corpus_desc = corpus_desc.rtl
  let is_audio corpus_desc = corpus_desc.audio


  (* ---------------------------------------------------------------------------------------------------- *)
  let extensions = function
239
    | Corpus.Conll -> [".conll"; ".conllu"; ".cupt"; ".orfeo"; "frsemcor"]
240
241
    | Amr -> [".amr"; ".txt"]
    | Pst -> [".const"]
Bruno Guillaume's avatar
Bruno Guillaume committed
242
243
    | Json -> [".json"]
    | Gr -> [".gr"]
Bruno Guillaume's avatar
Bruno Guillaume committed
244
    | Dmrs -> [".json"]
245
246
247
248
249
250

  (* ---------------------------------------------------------------------------------------------------- *)
  (* if [files] is empty, all files of the directory with correct suffix are considered *)
  let get_full_files { kind; directory; files } =
    let file_list = match files with
      | [] ->
Bruno Guillaume's avatar
Bruno Guillaume committed
251
252
253
254
255
256
257
        begin
          try
            Array.fold_left
              (fun acc file -> if List.mem (Filename.extension file) (extensions kind) then file::acc else acc)
              [] (Sys.readdir directory)
          with Sys_error _ -> Error.run "[Corpus] cannot read directory %s" directory
        end
258
259
260
261
262
      | l -> l in
    List.map (fun f -> Filename.concat directory f) file_list

  (* ---------------------------------------------------------------------------------------------------- *)
  let build_corpus corpus_desc =
263
    let config = corpus_desc.config in
264
265
    match corpus_desc.kind with
    | Conll ->
Bruno Guillaume's avatar
Bruno Guillaume committed
266
      let conll_corpus = Conllx_corpus.load_list ~config (get_full_files corpus_desc) in
267
268
269
      let items =
        CCArray.filter_map (fun (sent_id,conll) ->
            try
Bruno Guillaume's avatar
Bruno Guillaume committed
270
              let init_graph = G_graph.of_json (Conllx.to_json conll) in
271
              let graph = match corpus_desc.preapply with
272
                | Some grs -> Grs.apply ~config grs init_graph
Bruno Guillaume's avatar
Bruno Guillaume committed
273
                | None -> init_graph in
274
              Some {Corpus.sent_id; text=G_graph.to_sentence graph; graph }
275
            with Error.Build (msg, loc_opt) ->
Bruno Guillaume's avatar
Bruno Guillaume committed
276
              Error.warning "[build_corpus, sent_id=%s%s] skipped: %s"
277
278
279
                sent_id
                (match loc_opt with None -> "" | Some loc -> "; " ^ (Loc.to_string loc))
                msg; None
Bruno Guillaume's avatar
Bruno Guillaume committed
280
          ) (Conllx_corpus.get_data conll_corpus) in
Bruno Guillaume's avatar
Bruno Guillaume committed
281
      { Corpus.items; kind=Conll }
282
283
    | _ -> Error.bug "[Corpus_desc.build_corpus] is available only on Conll format"

284
285
286
  (* ---------------------------------------------------------------------------------------------------- *)
  let load_corpus_opt corpus_desc =
    let marshal_file = (Filename.concat corpus_desc.directory corpus_desc.id) ^ ".marshal" in
287
288
289
290
291
292
    try
      let in_ch = open_in_bin marshal_file in
      let data = (Marshal.from_channel in_ch : Corpus.t) in
      close_in in_ch;
      Some data
    with Sys_error _ -> None
293
294


295
296
297
298
  (* ---------------------------------------------------------------------------------------------------- *)
  let load_json json_file =
    let open Yojson.Basic.Util in

Bruno Guillaume's avatar
Bruno Guillaume committed
299
300
    let json =
      try Yojson.Basic.from_file json_file
Bruno Guillaume's avatar
Bruno Guillaume committed
301
      with 
Bruno Guillaume's avatar
Bruno Guillaume committed
302
303
304
      | Sys_error _ -> Error.run "[Corpus.load_json] file `%s` not found" json_file
      | Yojson.Json_error msg -> Error.run "[Corpus.load_json] invalid JSON file `%s`:\n%s" json_file msg
    in
305
306
307
308
309
310

    let parse_one json =
      let id =
        try json |> member "id" |> to_string
        with Type_error _ -> Error.run "[Corpus.load_json, file \"%s\"] \"id\" field is mandatory and must be a string" json_file in

311
312
313
314
      let lang =
        try Some (json |> member "lang" |> to_string)
        with Type_error _ -> None in

315
316
317
318
319
      let kind =
        try match json |> member "kind" |> to_string_option with
          | None | Some "conll" -> Corpus.Conll
          | Some "pst" -> Pst
          | Some "amr" -> Amr
Bruno Guillaume's avatar
Bruno Guillaume committed
320
          | Some "dmrs" -> Dmrs
321
          | Some "json" -> Json
322
323
324
          | Some x -> Error.run "[Corpus.load_json] Unknown \"kind\":\"%s\" field in file: \"%s\"" x json_file
        with Type_error _ -> Error.run "[Corpus.load_json, file \"%s\"] \"kind\" must be a string" json_file in

Bruno Guillaume's avatar
Bruno Guillaume committed
325
      let config =
326
        try json |> member "config" |> to_string_option |> (function Some c -> Conllx_config.build c | None -> Conllx_config.build "ud")
Bruno Guillaume's avatar
Bruno Guillaume committed
327
328
329
        with Type_error _ -> Error.run "[Corpus.load_json, file \"%s\"] \"config\" field must be a string" json_file in

      let columns =
330
        try json |> member "columns" |> to_string_option |> (CCOption.map Conllx_columns.build)
Bruno Guillaume's avatar
Bruno Guillaume committed
331
332
        with Type_error _ -> Error.run "[Corpus.load_json, file \"%s\"] \"columns\" field must be a string" json_file in

333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
      let directory =
        try json |> member "directory" |> to_string
        with Type_error _ -> Error.run "[Corpus.load_json, file \"%s\"] \"directory\" field is mandatory and must be a string" json_file in

      let preapply =
        try json |> member "preapply" |> to_string_option
        with Type_error _ -> Error.run "[Corpus.load_json, file \"%s\"] \"preapply\" field must be a string" json_file in

      let files =
        try json |> member "files" |> to_list |> filter_string
        with Type_error _ -> [] in

      let rtl =
        try json |> member "rtl" |> to_bool
        with Type_error _ -> false in

      let audio =
        try json |> member "audio" |> to_bool
        with Type_error _ -> false in

353
      { id; lang; kind; config; columns; directory; files; rtl; audio; preapply; } in
354
355
356
357

    List.map parse_one (json |> member "corpora" |> to_list)

  (* ---------------------------------------------------------------------------------------------------- *)
358
  let grew_match_table_and_desc ?config ~tmp_gm2 dir_opt name corpus =
359
360
361
    match dir_opt with
    | None -> ()
    | Some dir ->
362
      let stat = Conllx_stat.build ?config ("upos", None) ("ExtPos", Some "upos") corpus in
363
      let html = Conllx_stat.to_html ~tmp_gm2 name ("upos", None) ("ExtPos", Some "upos")  stat in
364
365
366
      let out_file = Filename.concat dir (name ^ "_table.html") in
      CCIO.with_out out_file (fun oc -> CCIO.write_line oc html);

Bruno Guillaume's avatar
Bruno Guillaume committed
367
368
369
370
371
372
373
374
375
376
377
378
      let (nb_trees, nb_tokens) = Conllx_corpus.sizes corpus in
      let desc = `Assoc (CCList.filter_map CCFun.id [
        Some ("nb_trees", `Int nb_trees);
        Some ("nb_tokens", `Int nb_tokens);
        (
          if List.exists (fun suf -> CCString.suffix ~suf name) ["latest"; "dev"; "master"; "conv"]
          then Some ("update", `Int (int_of_float ((Unix.gettimeofday ()) *. 1000.)))
          else None
        )
      ]) in
      Yojson.Basic.to_file (Filename.concat dir (name ^ "_desc.json")) desc;

379
380
  (* ---------------------------------------------------------------------------------------------------- *)
  exception Skip
Bruno Guillaume's avatar
Bruno Guillaume committed
381
382
383
384
385
  let ensure_dir dir =
    try (* catch if dir does not exist *)
      begin (* test if "dir" exists but is not a directory *)
        match Unix.stat dir with
        | { Unix.st_kind = Unix.S_DIR } -> ()
Bruno Guillaume's avatar
Bruno Guillaume committed
386
        | _ -> Error.warning "grew_match option ignored: %s already exists and is not directory" dir; raise Skip
Bruno Guillaume's avatar
Bruno Guillaume committed
387
388
389
390
      end; ()
    with Unix.Unix_error (Unix.ENOENT,_,_) ->
      begin (* dir does not exist -> try to create it *)
        try Unix.mkdir dir 0o755
Bruno Guillaume's avatar
Bruno Guillaume committed
391
        with exc -> Error.warning "grew_match option ignored: cannot create dir %s (%s)" dir (Printexc.to_string exc); raise Skip
Bruno Guillaume's avatar
Bruno Guillaume committed
392
393
394
      end

  (* ---------------------------------------------------------------------------------------------------- *)
395
  (* [grew_match] is a folder where tables, logs and corpus desc is stored *)
396
  let build_marshal_file ?grew_match ~tmp_gm2 corpus_desc =
397
    let config = corpus_desc.config in
Bruno Guillaume's avatar
Bruno Guillaume committed
398
399
    let full_files = get_full_files corpus_desc in
    let marshal_file = (Filename.concat corpus_desc.directory corpus_desc.id) ^ ".marshal" in
400
401

    let (grew_match_dir, log_file) =
Bruno Guillaume's avatar
Bruno Guillaume committed
402
      match (corpus_desc.kind, grew_match) with
403
      | (Conll, Some dir) ->
Bruno Guillaume's avatar
Bruno Guillaume committed
404
405
406
407
408
        begin
          try
            ensure_dir dir;
            let log = Filename.concat dir (sprintf "%s.log" corpus_desc.id) in
            try close_out (open_out log); (Some dir, Some log)
Bruno Guillaume's avatar
Bruno Guillaume committed
409
            with exc -> Error.warning "grew_match option ignored: cannot create file in dir %s (%s)" dir (Printexc.to_string exc); raise Skip
Bruno Guillaume's avatar
Bruno Guillaume committed
410
411
412
          with Skip -> (None,None)
        end
      | _ -> (None, None) in
413
414

    try
Bruno Guillaume's avatar
Bruno Guillaume committed
415
      let items = match corpus_desc.kind with
416
        | Conll ->
Bruno Guillaume's avatar
Bruno Guillaume committed
417
          let conll_corpus = Conllx_corpus.load_list ?log_file ~config:corpus_desc.config ?columns:corpus_desc.columns full_files in
418
          grew_match_table_and_desc ~config:corpus_desc.config ~tmp_gm2 grew_match_dir corpus_desc.id conll_corpus;
419
          CCArray.filter_map (fun (sent_id,conllx) ->
420
              try
421
                let init_graph = G_graph.of_json (Conllx.to_json conllx) in
Bruno Guillaume's avatar
Bruno Guillaume committed
422
                let graph = match corpus_desc.preapply with
423
                  | Some grs -> Grs.apply ~config grs init_graph
Bruno Guillaume's avatar
Bruno Guillaume committed
424
                  | None -> init_graph in
425
426
                Some {Corpus.sent_id; text=G_graph.to_sentence graph; graph }
              with Error.Build (msg, loc_opt) ->
Bruno Guillaume's avatar
Bruno Guillaume committed
427
                Error.warning "[build_marshal_file, sent_id=%s%s] skipped: %s"
428
429
430
                  sent_id
                  (match loc_opt with None -> "" | Some loc -> "; " ^ (Loc.to_string loc))
                  msg; None
431
            ) (Conllx_corpus.get_data conll_corpus)
432
433

        | Pst ->
434
          let pst_corpus = Pst_corpus.load_files full_files in
435
436
          CCArray.filter_map (fun (sent_id,pst) ->
              try
Bruno Guillaume's avatar
Bruno Guillaume committed
437
                let graph = G_graph.of_pst (Parser.phrase_structure_tree pst) in
438
                Some {Corpus.sent_id; text=G_graph.to_sentence graph; graph }
Bruno Guillaume's avatar
Bruno Guillaume committed
439
              with exc -> Error.warning "[id=%s] PST skipped [exception: %s]" sent_id (Printexc.to_string exc); None
440
441
442
            ) pst_corpus

        | Amr ->
Bruno Guillaume's avatar
Bruno Guillaume committed
443
          let amr_corpus = match full_files with
444
445
446
            | [one] -> Amr_corpus.load one
            | _ -> failwith "AMR multi-files corpus is not handled"
          in
447
          CCArray.filter_map (fun (sent_id,amr) ->
448
              try
Bruno Guillaume's avatar
Bruno Guillaume committed
449
                let json = Amr.to_json ~unfold:true amr in
450
451
                let graph = G_graph.of_json json in
                let text = match G_graph.get_meta_opt "text" graph with Some t -> t | None -> "__missing text metadata__" in
452
                Some {Corpus.sent_id; text; graph }
Bruno Guillaume's avatar
Bruno Guillaume committed
453
              with exc -> Error.warning "[id=%s] AMR skipped [exception: %s]" sent_id (Printexc.to_string exc); None
454
            ) amr_corpus
Bruno Guillaume's avatar
Bruno Guillaume committed
455
        | Json | Dmrs ->
456
457
458
          Array.concat (
            List.map (
              fun file ->
459
460
                try Corpus.from_json ~loc: (Loc.file file) (Yojson.Basic.from_file file)
                with Yojson.Json_error msg -> Error.run ~loc:(Loc.file file) "Error in the JSON file format: %s" msg
461
462
            ) full_files
          )
Bruno Guillaume's avatar
Bruno Guillaume committed
463
        | Gr -> Error.run "Gr corpora are not supported in file compilation" in
Bruno Guillaume's avatar
Bruno Guillaume committed
464
      let _ = Error.info "[%s] %d graphs loaded" corpus_desc.id (Array.length items) in
465
      let out_ch = open_out_bin marshal_file in
Bruno Guillaume's avatar
Bruno Guillaume committed
466
      let (data : Corpus.t) = {Corpus.items; kind=corpus_desc.kind } in
467
468
469
      Marshal.to_channel out_ch data [];
      close_out out_ch
    with
Bruno Guillaume's avatar
Bruno Guillaume committed
470
471
472
    | Conllx_error json -> Error.warning "[Conllx_error] fail to load corpus %s, skip it\nexception: %s" corpus_desc.id (Yojson.Basic.pretty_to_string json)
    | Error.Run (msg,_) -> Error.warning "[Libgrew error] %s, fail to load corpus %s: skip it" msg corpus_desc.id
    | exc -> Error.warning "[Error] fail to load corpus %s, skip it\nexception: %s" corpus_desc.id (Printexc.to_string exc)
473
474
475


  (* ---------------------------------------------------------------------------------------------------- *)
476
  let compile ?(force=false) ?grew_match ?(tmp_gm2=false) corpus_desc =
477
478
    let full_files = get_full_files corpus_desc in
    let marshal_file = (Filename.concat corpus_desc.directory corpus_desc.id) ^ ".marshal" in
479
    let really_marshal () = build_marshal_file ?grew_match ~tmp_gm2 corpus_desc in
480
481
482
483
484
485
486
    if force
    then really_marshal ()
    else
      try
        let marshal_time = (Unix.stat marshal_file).Unix.st_mtime in
        if List.exists (fun f -> (Unix.stat f).Unix.st_mtime > marshal_time) full_files
        then really_marshal () (* one of the data files is more recent than the marshal file *)
Bruno Guillaume's avatar
Bruno Guillaume committed
487
        else Error.info "--> %s is uptodate" corpus_desc.id
488
489
490
491
      with
      | Unix.Unix_error _ ->
        (* the marshal file does not exists *)
        really_marshal ()
492
493

  (* ---------------------------------------------------------------------------------------------------- *)
Bruno Guillaume's avatar
domain    
Bruno Guillaume committed
494
  let clean {kind; id; directory; files; preapply}  =
495
496
497
    let marshal_file = (Filename.concat directory id) ^ ".marshal" in
    if Sys.file_exists marshal_file then Unix.unlink marshal_file
end