corpus.ml 7.94 KB
Newer Older
Idir Lankri's avatar
Idir Lankri committed
1
(**************************************************************************)
Idir Lankri's avatar
Idir Lankri committed
2
(*                                                                        *)
Idir Lankri's avatar
Idir Lankri committed
3
4
5
6
7
8
9
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Idir Lankri                               *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

10
module Section : sig
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
  type t
  ;
  value make : string -> t
  ;
  value label : t -> string
  ;
  value compare : t -> t -> int
  ;
end = struct
  type t = string
  ;
  value make h = h
  ;
  value label h = h
  ;
  value compare h h' = String.compare (label h) (label h')
  ;
end
29
;
30
31
32
33
34
module Analyzer : sig
  type t = [ Graph ]
  ;
  value path : t -> string
  ;
35
36
  value relocatable_path : t -> string
  ;
37
38
39
end = struct
  type t = [ Graph ]
  ;
40
  value path = fun [ Graph -> Paths.(cgi_dir_url ^ cgi_graph) ]
41
  and relocatable_path = fun [ Graph -> "!CGIGRAPH" ]
42
43
44
45
46
47
48
  ;
end
;
module Analysis : sig
  type t
  ;
  value make :
49
    Analyzer.t -> Html.language -> string -> Num.num -> t
50
51
52
53
54
  ;
  value analyzer : t -> Analyzer.t
  ;
  value lang : t -> Html.language
  ;
55
  value checkpoints : t -> string
56
57
58
59
60
61
62
  ;
  value nb_sols : t -> Num.num
  ;
end = struct
  type t =
    { analyzer : Analyzer.t
    ; lang : Html.language
63
    ; checkpoints : string
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
    ; nb_sols : Num.num
    }
  ;
  value make analyzer lang checkpoints nb_sols =
    { analyzer ; lang; checkpoints; nb_sols }
  ;
  value analyzer a = a.analyzer
  ;
  value lang a = a.lang
  ;
  value checkpoints a = a.checkpoints
  ;
  value nb_sols a = a.nb_sols
  ;
end
;
module Encoding : sig
  type t = [ Velthuis | WX | KH | SLP1 | Devanagari | IAST ]
  ;
  value to_string : t -> string
  ;
end = struct
  type t = [ Velthuis | WX | KH | SLP1 | Devanagari | IAST ]
  ;
  value to_string = fun
    [ Velthuis -> "VH"
    | WX -> "WX"
    | KH -> "KH"
    | SLP1 -> "SL"
    | Devanagari -> "deva"
    | IAST -> "roma"
    ]
  ;
end
;
99
(* What about metadata (date, author, history...) ?  *)
100
101
102
module Sentence : sig
  type t
  ;
103
  value make : int -> list Word.word -> bool -> Analysis.t -> t
104
105
106
  ;
  value id : t -> int
  ;
107
  value text : Encoding.t -> t -> string
108
  ;
109
  value analysis : t -> Analysis.t
110
111
112
113
114
115
  ;
  value compare : t -> t -> int
  ;
end = struct
  type t =
    { id : int
116
117
118
    ; text : list Word.word
    ; unsandhied : bool
    ; analysis : Analysis.t
119
120
    }
  ;
121
  value make id text unsandhied analysis =
122
    { id = id
123
124
125
    ; text = text
    ; unsandhied = unsandhied
    ; analysis = analysis
126
127
128
129
    }
  ;
  value id s = s.id
  ;
130
131
132
133
134
135
136
137
138
139
  value text encoding s =
    let encode_word =
      match encoding with
      [ Encoding.Velthuis | Encoding.WX | Encoding.KH | Encoding.SLP1 ->
        encoding |> Encoding.to_string |> Canon.switch_decode
      | Encoding.Devanagari -> Canon.unidevcode
      | Encoding.IAST -> Canon.uniromcode
      ]
    in
    s.text |> List.map encode_word |> String.concat " "
140
  ;
141
  value unsandhied s = s.unsandhied
142
  ;
143
  value analysis s = s.analysis
144
  ;
145
  value compare s s' = compare (id s) (id s')
146
147
  ;
end
148
;
149
module type Location = sig
150
  value path : string
151
152
  ;
end
153
;
154
155
156
module type S = sig
  (* Contents of a corpus subdirectory: either we are on leaves of the
     tree (constructor [Sentences]) or on branches (constructor
157
     [Sections]).  *)
158
  type contents =
159
    [ Empty
160
    | Sections of list Section.t
161
162
163
164
    | Sentences of list Sentence.t
    ]
  ;
  (* List the contents of the given corpus subdirectory.  Note that the
165
     returned elements are sorted according to [Section.compare] or
166
167
168
     [Sentence.compare] depending on the case.  *)
  value contents : string -> contents
  ;
169
170
  exception Sentence_already_exists
  ;
171
172
  value save_sentence :
    bool -> string -> int -> list Word.word -> bool -> Analysis.t -> unit
173
  ;
174
  exception Section_already_exists of string
175
  ;
176
177
  value mkdir : string -> unit
  ;
178
179
180
181
  exception No_such_sentence
  ;
  value sentence : string -> int -> Sentence.t
  ;
182
  type permission = [ Reader | Annotator | Manager ]
183
  ;
184
  value default_permission : permission
185
  ;
186
  value string_of_permission : permission -> string
187
  ;
188
  value permission_of_string : string -> permission
189
  ;
190
  value url : string -> permission -> Sentence.t -> string
191
  ;
192
  value relocatable_url : string -> permission -> Sentence.t -> string
193
  ;
194
  value citation : string -> int -> string 
195
  ;
196
end
197
;
198
199
module Make (Loc : Location) : S = struct
  type contents =
200
    [ Empty
201
    | Sections of list Section.t
202
203
204
    | Sentences of list Sentence.t
    ]
  ;
205
206
  value ( /^ ) = Filename.concat
  ;
207
  value ( ~/ ) file = Loc.path /^ file
208
  ;
209
210
211
212
213
214
215
216
217
  value sentence_ext = "rem"
  ;
  value sentence_file subdir id =
    ~/subdir /^ Printf.sprintf "%d.%s" id sentence_ext
  ;
  exception No_such_sentence
  ;
  value sentence subdir id =
    let file = sentence_file subdir id in
Gérard Huet's avatar
Gérard Huet committed
218
219
    if Sys.file_exists file then (Gen.gobble file : Sentence.t) 
                            else raise No_such_sentence 
220
  ;
221
  value contents subdir =
222
    let subdir = ~/subdir in
223
224
225
226
    match Dir.subdirs subdir with
    [ [] ->
      let sentences =
        subdir
227
        |> Dir.files_with_ext sentence_ext
228
        |> List.map (fun x -> (Gen.gobble (subdir /^ x) : Sentence.t))
229
230
        |> List.sort Sentence.compare
      in
231
      match sentences with [ [] -> Empty | sentences -> Sentences sentences ]
232
    | subdirs ->
233
      let sections =
234
        subdirs
235
236
        |> List.map Section.make
        |> List.sort Section.compare
237
      in
238
      Sections sections
239
240
    ]
  ;
241
  value metadata_file dir id = ~/dir /^ "." ^ string_of_int id
242
  ;
243
244
  exception Sentence_already_exists
  ;
245
246
247
  value save_sentence force dir id text unsandhied analysis =
    let file = sentence_file dir id in
    let sentence = Sentence.make id text unsandhied analysis in
Gérard Huet's avatar
Gérard Huet committed
248
249
    if not force && Sys.file_exists file then raise Sentence_already_exists 
                                         else Gen.dump sentence file
250
  ;
251
  exception Section_already_exists of string
252
253
  ;
  value mkdir dirname =
254
    try Unix.mkdir ~/dirname 0o755 with
255
    [ Unix.Unix_error (Unix.EEXIST, _, _) ->
Gérard Huet's avatar
Gérard Huet committed
256
      raise (Section_already_exists (Filename.basename dirname)) 
257
    ]
258
  ;
259
  type permission = [ Reader | Annotator | Manager ]
260
  ;
261
  value default_permission = Reader
262
  ;
263
  value string_of_permission = fun
264
265
266
267
268
    [ Reader -> "reader"
    | Annotator -> "annotator"
    | Manager -> "manager"
    ]
  ;
269
270
271
272
  value restrict_permission perm =
    match Html.target with
    [ Html.Server -> Reader
    | Html.Simputer | Html.Computer | Html.Station -> perm
273
274
    ]
  ;
275
276
277
278
279
280
281
  value permission_of_string s = s |> to_perm |> restrict_permission
    where to_perm = fun
      [ "annotator" -> Annotator
      | "manager" -> Manager
      | _ -> Reader
      ]
  ;
282
  value url dir permission sentence =
283
284
    let analysis = Sentence.analysis sentence in
    let env =
285
      [ (Params.corpus_permission, string_of_permission permission)
286
287
288
289
290
291
292
293
294
295
296
297
298
      ; ("text", Sentence.text Encoding.Velthuis sentence)
      ; ("cpts", Analysis.checkpoints analysis)
      ; (Params.corpus_dir, dir)
      ; (Params.sentence_no, sentence |> Sentence.id |> string_of_int)
      ]
    in
    let path =
      analysis
      |> Analysis.analyzer
      |> Analyzer.path
    in
    Cgi.url path ~query:(Cgi.query_of_env env)
  ;
299
  value relocatable_url dir permission sentence =
300
301
    let analysis = Sentence.analysis sentence in
    let env =
302
      [ (Params.corpus_permission, string_of_permission permission)
303
304
305
306
307
308
309
310
311
312
313
314
315
      ; ("text", Sentence.text Encoding.Velthuis sentence)
      ; ("cpts", Analysis.checkpoints analysis)
      ; (Params.corpus_dir, dir)
      ; (Params.sentence_no, sentence |> Sentence.id |> string_of_int)
      ]
    in
    let path =
      analysis
      |> Analysis.analyzer
      |> Analyzer.relocatable_path
    in
    Cgi.url path ~query:(Cgi.query_of_env env)
  ;
316
value citation subdir id =
317
   relocatable_url subdir Reader (sentence subdir id)
318
;
319
end
320
;