Commit 00e9cde0 authored by Idir Lankri's avatar Idir Lankri

Modularize and clarify the abstraction of the corpus

parent 6885ffc5
......@@ -287,29 +287,30 @@ mk_sandhi_page.cmx : web.cmx html.cmx
mk_corpus_page.cmo : web.cmo html.cmo
mk_corpus_page.cmx : web.cmx html.cmx
corpus.cmi : ../ZEN/word.cmo
corpus.cmo : ../ZEN/word.cmo web.cmo sanskrit.cmi params.cmi interface.cmi \
../ZEN/gen.cmo encode.cmo dir.cmi cgi.cmo corpus.cmi
corpus.cmx : ../ZEN/word.cmx web.cmx sanskrit.cmx params.cmx interface.cmx \
../ZEN/gen.cmx encode.cmx dir.cmx cgi.cmx corpus.cmi
corpus.cmo : ../ZEN/word.cmo web.cmo sanskrit.cmi params.cmi ../ZEN/gen.cmo \
encode.cmo dir.cmi cgi.cmo corpus.cmi
corpus.cmx : ../ZEN/word.cmx web.cmx sanskrit.cmx params.cmx ../ZEN/gen.cmx \
encode.cmx dir.cmx cgi.cmx corpus.cmi
web_corpus.cmi : corpus.cmi
web_corpus.cmo : web.cmo corpus.cmi web_corpus.cmi
web_corpus.cmx : web.cmx corpus.cmx web_corpus.cmi
corpus_manager.cmi :
corpus_manager.cmo : web.cmo paths.cmo params.cmi multilingual.cmo \
mkdir_corpus_params.cmi html.cmo dir.cmi corpus.cmi canon.cmo \
corpus_manager.cmi
corpus_manager.cmx : web.cmx paths.cmx params.cmx multilingual.cmx \
mkdir_corpus_params.cmx html.cmx dir.cmx corpus.cmx canon.cmx \
corpus_manager.cmi
corpus_manager.cmo : web_corpus.cmi web.cmo paths.cmo params.cmi \
multilingual.cmo mkdir_corpus_params.cmi html.cmo dir.cmi corpus.cmi \
canon.cmo corpus_manager.cmi
corpus_manager.cmx : web_corpus.cmx web.cmx paths.cmx params.cmx \
multilingual.cmx mkdir_corpus_params.cmx html.cmx dir.cmx corpus.cmx \
canon.cmx corpus_manager.cmi
corpus_manager_cgi.cmo : params.cmi corpus_manager.cmi cgi.cmo
corpus_manager_cgi.cmx : params.cmx corpus_manager.cmx cgi.cmx
save_corpus_cgi.cmo : web.cmo params.cmi corpus_manager.cmi corpus.cmi \
cgi.cmo
save_corpus_cgi.cmx : web.cmx params.cmx corpus_manager.cmx corpus.cmx \
cgi.cmx
save_corpus_cgi.cmo : web_corpus.cmi params.cmi corpus_manager.cmi cgi.cmo
save_corpus_cgi.cmx : web_corpus.cmx params.cmx corpus_manager.cmx cgi.cmx
mkdir_corpus_params.cmi :
mkdir_corpus_params.cmo : params.cmi mkdir_corpus_params.cmi
mkdir_corpus_params.cmx : params.cmx mkdir_corpus_params.cmi
mkdir_corpus_cgi.cmo : web.cmo mkdir_corpus_params.cmi corpus_manager.cmi \
corpus.cmi cgi.cmo
mkdir_corpus_cgi.cmx : web.cmx mkdir_corpus_params.cmx corpus_manager.cmx \
corpus.cmx cgi.cmx
mkdir_corpus_cgi.cmo : web_corpus.cmi mkdir_corpus_params.cmi \
corpus_manager.cmi cgi.cmo
mkdir_corpus_cgi.cmx : web_corpus.cmx mkdir_corpus_params.cmx \
corpus_manager.cmx cgi.cmx
mk_corpus.cmo : paths.cmo params.cmi corpus.cmi
mk_corpus.cmx : paths.cmx params.cmx corpus.cmx
......@@ -55,10 +55,10 @@ mk_index_page.ml mk_grammar_page.ml mk_reader_page.ml mk_sandhi_page.ml \
mk_corpus_page.ml
# CORPUS package - corpus manager
CORPUS = corpus.mli corpus.ml corpus_manager.mli corpus_manager.ml \
corpus_manager_cgi.ml save_corpus_cgi.ml \
mkdir_corpus_params.mli mkdir_corpus_params.ml \
mkdir_corpus_cgi.ml mk_corpus.ml
CORPUS = corpus.mli corpus.ml web_corpus.mli web_corpus.ml \
corpus_manager.mli corpus_manager.ml corpus_manager_cgi.ml \
save_corpus_cgi.ml mkdir_corpus_params.mli \
mkdir_corpus_params.ml mkdir_corpus_cgi.ml mk_corpus.ml
# extra file SCLpaths.ml for SCL interfacing - not distributed.
......@@ -244,7 +244,8 @@ corpus_manager: corpus_manager_cgi.cmx
share.cmx minimap.cmx mini.cmx nums.cmxa \
graph_segmenter.cmx checkpoints.cmx automaton.cmx \
params.cmx interface.cmx multilingual.cmx dir.cmx \
corpus.cmx mkdir_corpus_params.cmx corpus_manager.cmx $< -o $@
corpus.cmx web_corpus.cmx mkdir_corpus_params.cmx \
corpus_manager.cmx $< -o $@
save_corpus: save_corpus_cgi.cmx
$(LINK) str.cmxa unix.cmxa list2.cmx gen.cmx paths.cmx \
......@@ -259,7 +260,8 @@ save_corpus: save_corpus_cgi.cmx
share.cmx minimap.cmx mini.cmx nums.cmxa \
graph_segmenter.cmx checkpoints.cmx automaton.cmx \
params.cmx interface.cmx multilingual.cmx dir.cmx \
corpus.cmx mkdir_corpus_params.cmx corpus_manager.cmx $< -o $@
corpus.cmx web_corpus.cmx mkdir_corpus_params.cmx \
corpus_manager.cmx $< -o $@
mkdir_corpus: mkdir_corpus_cgi.cmx
$(LINK) str.cmxa unix.cmxa list2.cmx gen.cmx paths.cmx \
......@@ -274,7 +276,8 @@ mkdir_corpus: mkdir_corpus_cgi.cmx
share.cmx minimap.cmx mini.cmx nums.cmxa \
graph_segmenter.cmx checkpoints.cmx automaton.cmx \
params.cmx interface.cmx multilingual.cmx dir.cmx \
corpus.cmx mkdir_corpus_params.cmx corpus_manager.cmx $< -o $@
corpus.cmx web_corpus.cmx mkdir_corpus_params.cmx \
corpus_manager.cmx $< -o $@
mk_corpus: mk_corpus.cmx
$(LINK) str.cmxa unix.cmxa list2.cmx gen.cmx paths.cmx \
......@@ -289,7 +292,7 @@ mk_corpus: mk_corpus.cmx
share.cmx minimap.cmx mini.cmx nums.cmxa \
graph_segmenter.cmx checkpoints.cmx automaton.cmx \
params.cmx interface.cmx multilingual.cmx dir.cmx \
corpus.cmx mkdir_corpus_params.cmx corpus_manager.cmx $< -o $@
corpus.cmx mkdir_corpus_params.cmx $< -o $@
css: css.cmx
$(LINK) gen.cmx paths.cmx version.cmx date.cmx html.cmx \
......
type content =
[ Sections of list string
| Sentences of list string
]
module Heading : sig
type t
;
value make : string -> t
;
value label : t -> string
;
value compare : t -> t -> int
;
end = struct
type t = string
;
value make h = h
;
value label h = h
;
value compare h h' = String.compare (label h) (label h')
;
end
;
value chop_extension file =
try Filename.chop_extension file with [ Invalid_argument _ -> file ]
module Sentence : sig
type t
;
value make : int -> string -> string -> t
;
value id : t -> int
;
value analyzer : t -> string
;
value state : t -> string
;
value compare : t -> t -> int
;
type metadata = { text : list Word.word }
;
value url : t -> string
;
end = struct
type t =
{ id : int
; analyzer : string
; state : string
}
;
value make id analyzer state =
{ id = id
; analyzer = analyzer
; state = state
}
;
value id s = s.id
;
value analyzer s = s.analyzer
;
value state s = s.state
;
value compare s s' = compare (id s) (id s')
;
type metadata = { text : list Word.word }
;
value url sentence = analyzer sentence ^ "?" ^ state sentence
;
end
;
value sentence_id file =
file
|> Filename.basename
|> chop_extension
|> int_of_string
module type Location = sig
value dir : string
;
end
;
value content subdir =
match Dir.subdirs subdir String.compare with
[ [] ->
let cmp file file' = compare (sentence_id file) (sentence_id file') in
Sentences (Dir.files_with_ext "html" subdir cmp)
| subdirs -> Sections subdirs
]
module type S = sig
(* Contents of a corpus subdirectory: either we are on leaves of the
tree (constructor [Sentences]) or on branches (constructor
[Headings]). *)
type contents =
[ Headings of list Heading.t
| Sentences of list Sentence.t
]
;
(* List the contents of the given corpus subdirectory. Note that the
returned elements are sorted according to [Heading.compare] or
[Sentence.compare] depending on the case. *)
value contents : string -> contents
;
value save_sentence : string -> unit
;
value mkdir : string -> unit
;
value gobble_metadata : string -> Sentence.t -> Sentence.metadata
;
value dump_metadata : string -> Sentence.t -> Sentence.metadata -> unit
;
end
;
type sentence_metadata = { text : list Word.word }
;
value sentence_metadata_file dir file = dir ^ "." ^ chop_extension file
;
value gobble_sentence_metadata dir file =
(Gen.gobble (sentence_metadata_file dir file) : sentence_metadata)
;
value dump_sentence_metadata metadata dir file =
Gen.dump metadata (sentence_metadata_file dir file)
;
value save_sentence ~corpus_location ~query =
let env = Cgi.create_env query in
let corpus_dir = Cgi.decoded_get Params.corpus_dir "" env in
let sentence_no = Cgi.decoded_get Params.sentence_no "" env in
let translit = Cgi.decoded_get "t" "" env in
let unsandhied = Cgi.decoded_get "us" "" env = "t" in
let text = Cgi.decoded_get "text" "" env in
let corpus_abs_dir = corpus_location ^ corpus_dir in
let sentence_no =
sentence_no |> float_of_string |> int_of_float |> string_of_int
in
let file = corpus_abs_dir ^ sentence_no ^ ".html" in
let sentence =
let encode = Encode.switch_code translit in
let chunker =
if unsandhied then (* sandhi undone *)
Sanskrit.read_raw_sanskrit
else (* blanks non-significant *)
Sanskrit.read_sanskrit
module Make (Loc : Location) : S = struct
type contents =
[ Headings of list Heading.t
| Sentences of list Sentence.t
]
;
value contents subdir =
match Dir.subdirs subdir with
[ [] ->
let sentences =
subdir
|> Dir.files_with_ext "rem"
|> List.map (fun x -> (Gen.gobble (subdir ^ x) : Sentence.t))
|> List.sort Sentence.compare
in
Sentences sentences
| subdirs ->
let headings =
subdirs
|> List.map Heading.make
|> List.sort Heading.compare
in
Headings headings
]
;
value metadata_file dir id = dir ^ "." ^ string_of_int id
;
value gobble_metadata dir sentence =
(Gen.gobble (metadata_file dir (Sentence.id sentence)) : Sentence.metadata)
;
value dump_metadata dir sentence metadata =
Gen.dump metadata (metadata_file dir (Sentence.id sentence))
;
value save_sentence state =
let env = Cgi.create_env state in
let corpus_dir = Cgi.decoded_get Params.corpus_dir "" env in
let sentence_no = Cgi.decoded_get Params.sentence_no "" env in
let translit = Cgi.decoded_get "t" "" env in
let unsandhied = Cgi.decoded_get "us" "" env = "t" in
let text = Cgi.decoded_get "text" "" env in
let corpus_abs_dir = Loc.dir ^ corpus_dir in
let sentence_no =
sentence_no |> float_of_string |> int_of_float
in
{ text = chunker encode text }
in
do
{ Unix.putenv Cgi.query_string_env_var query
; Web.output_channel.val := open_out file
; Interface.safe_engine ()
; dump_sentence_metadata sentence corpus_abs_dir sentence_no
; close_out Web.output_channel.val
; Web.output_channel.val := stdout
}
;
value mkdir ~corpus_location ~dirname =
Unix.mkdir (corpus_location ^ dirname) 0o755
let file = corpus_abs_dir ^ string_of_int sentence_no ^ ".rem" in
let metadata =
let encode = Encode.switch_code translit in
let chunker =
if unsandhied then (* sandhi undone *)
Sanskrit.read_raw_sanskrit
else (* blanks non-significant *)
Sanskrit.read_sanskrit
in
{ Sentence.text = chunker encode text }
in
let sentence = Sentence.make sentence_no Web.graph_cgi state in
do
{ dump_metadata corpus_abs_dir sentence metadata
; Gen.dump sentence file
}
;
value mkdir dirname = Unix.mkdir (Loc.dir ^ dirname) 0o755
;
end
;
(* Operations on the corpus tree *)
(* Content of a corpus subdirectory: either we are on leaves of the tree
(constructor [Sentences]) or on branches (constructor
[Sections]). *)
type content =
[ Sections of list string
| Sentences of list string
]
;
(* List the content of the given corpus subdirectory. *)
value content : string -> content
;
(* TODO: Determine all the fields. *)
type sentence_metadata = { text : list Word.word }
;
value gobble_sentence_metadata : string -> string -> sentence_metadata
;
value dump_sentence_metadata : sentence_metadata -> string -> string -> unit
;
(* Return the identifier of the sentence stored in the given file. *)
value sentence_id : string -> int
;
value save_sentence : ~corpus_location:string -> ~query:string -> unit
;
value mkdir : ~corpus_location:string -> ~dirname:string -> unit
module Heading : sig
type t
;
value label : t -> string
;
end
;
module Sentence : sig
type t
;
value id : t -> int
;
value url : t -> string
;
(* TODO: Determine all the fields. *)
type metadata = { text : list Word.word }
;
end
;
module type Location = sig
value dir : string
;
end
;
module type S = sig
(* Contents of a corpus subdirectory: either we are on leaves of the
tree (constructor [Sentences]) or on branches (constructor
[Headings]). *)
type contents =
[ Headings of list Heading.t
| Sentences of list Sentence.t
]
;
(* List the contents of the given corpus subdirectory. Note that the
returned elements are sorted according to [Heading.compare] or
[Sentence.compare] depending on the case. *)
value contents : string -> contents
;
value save_sentence : string -> unit
;
value mkdir : string -> unit
;
value gobble_metadata : string -> Sentence.t -> Sentence.metadata
;
value dump_metadata : string -> Sentence.t -> Sentence.metadata -> unit
;
end
;
module Make (Loc : Location) : S
;
......@@ -105,10 +105,10 @@ value uplinks dir =
;
(* Display sentences with format "sentence || sentno" like in citations
file. *)
value sentence_links dir files =
let to_anchor_ref file =
value sentence_links dir sentences =
let to_anchor_ref sentence =
let metadata =
Corpus.gobble_sentence_metadata (Web.corpus_dir ^ dir) file
Web_corpus.gobble_metadata (Web.corpus_dir ^ dir) sentence
in
let font = Multilingual.font_of_string Paths.default_display_font in
let words =
......@@ -117,7 +117,7 @@ value sentence_links dir files =
[ Multilingual.Deva -> Canon.unidevcode
| Multilingual.Roma -> Canon.uniromcode
]
) metadata.Corpus.text
) metadata.Corpus.Sentence.text
in
let display =
match font with
......@@ -125,17 +125,18 @@ value sentence_links dir files =
| Multilingual.Roma -> Html.span Html.Trans16
]
in
let sentence = String.concat " " words in
Html.anchor_ref (Web.corpus_url ^ dir ^ file) sentence |> display
let sentence_str = String.concat " " words in
Html.anchor_ref (Corpus.Sentence.url sentence) sentence_str
|> display
in
List.map to_anchor_ref files
List.map to_anchor_ref sentences
;
value subdir_selection dir subdirs =
value heading_selection dir headings =
let options =
let prefixed_subdirs =
List.map (fun x -> dir ^ x ^ Filename.dir_sep) subdirs
let prefixes =
List.map (fun x -> dir ^ x ^ Filename.dir_sep) headings
in
List.combine prefixed_subdirs subdirs
List.combine prefixes headings
in
Html.option_select_label Params.corpus_dir options
;
......@@ -158,7 +159,7 @@ value htmlify_group dir (group, gap) =
match group with
[ [] -> ("", "")
| [ h :: _ ] ->
let id = Corpus.sentence_id h in
let id = Corpus.Sentence.id h in
(Html.ol ~start:id ~items:(sentence_links dir group), string_of_int id)
]
in
......@@ -174,22 +175,18 @@ value htmlify_group dir (group, gap) =
add_sentence_form dir gap ^
Html.div_end
;
value sentence_file dir no = string_of_int no ^ ".html"
;
value group_sentences dir files =
let groups =
files
|> List.map Corpus.sentence_id
|> groups_with_gaps
|> add_init_gap
in
List.map (fun (x, y) -> (List.map (sentence_file dir) x, y)) groups
value group_sentences dir sentences =
let ids = List.map Corpus.Sentence.id sentences in
let dict = List.combine ids sentences in
let groups = ids |> groups_with_gaps |> add_init_gap in
List.map (fun (x, y) -> (List.map (fun x -> List.assoc x dict) x, y)) groups
;
value body dir =
match Corpus.content (Web.corpus_dir ^ dir) with
(* When files = [], it is possible to create a subdir or add a sentence... *)
[ Corpus.Sentences files ->
let groups = group_sentences dir files in
match Web_corpus.contents (Web.corpus_dir ^ dir) with
(* When files = [], it is possible to create a heading or add
a sentence... *)
[ Web_corpus.Sentences sentences ->
let groups = group_sentences dir sentences in
do
{ Html.h2_begin Html.B2 |> Web.pl
; uplinks dir |> Web.pl
......@@ -197,9 +194,11 @@ value body dir =
; groups |> List.map (htmlify_group dir) |> List.iter Web.pl
; Html.html_break |> Web.pl
}
| Corpus.Sections subdirs ->
| Web_corpus.Headings headings ->
let selection_prompt =
"Explore " ^ subdir_selection dir subdirs ^ " " ^ Html.submit_input "Go"
"Explore " ^
heading_selection dir (List.map Corpus.Heading.label headings) ^ " " ^
Html.submit_input "Go"
in
do
{ Html.center_begin |> Web.pl
......
......@@ -6,16 +6,16 @@ value abs_files dir =
;
value basenames files = List.map Filename.basename files
;
value subdirs dir cmp =
value subdirs dir =
let subdirs = List.filter Sys.is_directory (abs_files dir) in
subdirs |> basenames |> List.sort cmp
subdirs |> basenames
;
value file_with_ext ext file =
not (Sys.is_directory file) && Filename.check_suffix file ("." ^ ext)
;
value files_with_ext ext dir cmp =
value files_with_ext ext dir =
let files = List.filter (file_with_ext ext) (abs_files dir) in
files |> basenames |> List.sort cmp
files |> basenames
;
value split path = Str.split (Str.regexp Filename.dir_sep) path
;
......
(* Directory operations *)
(* [subdirs dir cmp] returns the list of subdirectories of [dir] sorted
according to the function [cmp]. *)
value subdirs : string -> (string -> string -> int) -> list string
(* [subdirs dir] returns the list of subdirectories of [dir]. The order
of the returned list is unspecified. *)
value subdirs : string -> list string
;
(* [files_with_ext ext dir cmp] returns the list of files in [dir] with
the extension [ext] sorted according to the function [cmp]. *)
value files_with_ext :
string -> string -> (string -> string -> int) -> list string
(* [files_with_ext ext dir] returns the list of files in [dir] with the
extension [ext] (e.g. ["txt"]). The order of the returned list is
unspecified.*)
value files_with_ext : string -> string -> list string
;
(* [split path] splits [path] into substrings corresponding to the
subdirectories of [path]. *)
......
......@@ -45,11 +45,11 @@ value abort report_error status =
;
value citation_regexp = Str.regexp "\\\\citation{\\(.*\\)}"
;
value extract_citation env corpus_location line line_no =
value extract_citation env save_sentence line line_no =
try
if Str.string_match citation_regexp line 0 then
let query = query_of_env [ ("text", Str.matched_group 1 line) :: env ] in
Corpus.save_sentence ~corpus_location ~query
save_sentence query
else
raise Exit
with
......@@ -72,6 +72,7 @@ value populate_corpus dirname file =
Filename.basename dirname.val)
in
let dirname = dirname ^ Filename.dir_sep in
let module Corp = Corpus.Make (struct value dir = corpus_location; end) in
let rec aux i =
try
let line = input_line ch in
......@@ -82,14 +83,14 @@ value populate_corpus dirname file =
]
in
do
{ extract_citation env corpus_location line i
{ extract_citation env Corp.save_sentence line i
; aux (i + 1)
}
with
[ End_of_file -> () ]
in
do
{ Corpus.mkdir ~corpus_location ~dirname
{ Corp.mkdir dirname
; aux 1
; close_in ch
}
......
......@@ -6,7 +6,7 @@ value main =
let dirname = Cgi.decoded_get Mkdir_corpus_params.dirname "" env in
let parent_dir = Cgi.decoded_get Mkdir_corpus_params.parent_dir "" env in
do
{ Corpus.mkdir ~corpus_location:Web.corpus_dir ~dirname:(parent_dir ^ dirname)
{ Web_corpus.mkdir (parent_dir ^ dirname)
; Corpus_manager.make parent_dir
}
;
......@@ -7,7 +7,7 @@ value main =
let env = Cgi.create_env query in
let corpdir = Cgi.decoded_get Params.corpus_dir "" env in
do
{ Corpus.save_sentence ~corpus_location:Web.corpus_dir ~query:query
{ Web_corpus.save_sentence query
; Corpus_manager.make corpdir
}
;
include Corpus.Make (struct value dir = Web.corpus_dir; end)
;
include Corpus.S
;
......@@ -268,10 +268,10 @@ TREE=ML/stemmer.ml ML/parse_tree.ml ML/parse_apte.ml ML/tag_tree.ml \
ML/tag_apte.ml
# CORPUS package - corpus manager
CORPUS = ML/corpus.mli ML/corpus.ml ML/corpus_manager.mli \
ML/corpus_manager.ml ML/corpus_manager_cgi.ml \
ML/save_corpus_cgi.ml ML/mkdir_corpus_cgi.ml \
ML/mk_corpus.ml
CORPUS = ML/corpus.mli ML/corpus.ml ML/web_corpus.mli ML/web_corpus.ml \
ML/corpus_manager.mli ML/corpus_manager.ml \
ML/corpus_manager_cgi.ml ML/save_corpus_cgi.ml \
ML/mkdir_corpus_cgi.ml ML/mk_corpus.ml
DEBUG= ML/morpho_debug.ml ML/debug.ml
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment