Commit b31f82b5 authored by Idir Lankri

Add program to build corpus from a LaTeX-like formatted citation file

Type 'ML/mk_corpus --help' in a terminal for usage.
parent cff3bc82
......@@ -16,6 +16,7 @@ ML/indexer
ML/indexerd
ML/interface
ML/lemmatizer
ML/mk_corpus
ML/mk_corpus_page
ML/mk_grammar_page
ML/mk_index_page
......
......@@ -287,10 +287,10 @@ mk_sandhi_page.cmx : web.cmx html.cmx
mk_corpus_page.cmo : web.cmo html.cmo
mk_corpus_page.cmx : web.cmx html.cmx
corpus.cmi : ../ZEN/word.cmo
corpus.cmo : ../ZEN/word.cmo web.cmo sanskrit.cmi interface.cmi \
../ZEN/gen.cmo encode.cmo dir.cmi corpus.cmi
corpus.cmx : ../ZEN/word.cmx web.cmx sanskrit.cmx interface.cmx \
../ZEN/gen.cmx encode.cmx dir.cmx corpus.cmi
corpus.cmo : ../ZEN/word.cmo web.cmo sanskrit.cmi params.cmi interface.cmi \
../ZEN/gen.cmo encode.cmo dir.cmi cgi.cmo corpus.cmi
corpus.cmx : ../ZEN/word.cmx web.cmx sanskrit.cmx params.cmx interface.cmx \
../ZEN/gen.cmx encode.cmx dir.cmx cgi.cmx corpus.cmi
corpus_manager.cmi :
corpus_manager.cmo : web.cmo paths.cmo params.cmi multilingual.cmo html.cmo \
dir.cmi corpus.cmi canon.cmo corpus_manager.cmi
......@@ -302,3 +302,5 @@ save_corpus_cgi.cmo : web.cmo params.cmi corpus_manager.cmi corpus.cmi \
cgi.cmo
save_corpus_cgi.cmx : web.cmx params.cmx corpus_manager.cmx corpus.cmx \
cgi.cmx
mk_corpus.cmo : paths.cmo params.cmi corpus.cmi
mk_corpus.cmx : paths.cmx params.cmx corpus.cmx
......@@ -86,7 +86,7 @@ LINK=ocamlopt -I $(ZEN) -I +camlp4 dynlink.cmxa camlp4lib.cmxa
# standard installation of Sanskrit Heritage platform - assumes ZEN library
engine: test_version cgis reset_caches static_pages regression
engine: test_version cgis reset_caches static_pages regression mk_corpus
# reader_plugin parse_apte
# testing consistency of Heritage_resources and Heritage_platform
......@@ -97,7 +97,7 @@ test_version: paths.cmx gen.cmx version.cmx control.cmx test_stamp.cmx
static_pages: css mk_index_page mk_grammar_page mk_sandhi_page mk_reader_page \
mk_corpus_page
all: engine static_pages reset_caches sandhi_test mk_corpus
all: engine static_pages reset_caches sandhi_test
# legacy in need of re-design
regression: rank.cmx regression.cmx
......
(* [url_encode s] returns a copy of [s] suitable for use as a value in a
   URL query string: each space becomes "+", and each character listed in
   [conversion_tbl] becomes "%" followed by its two-digit hexadecimal code.
   Every other character is copied unchanged. *)
value url_encode s =
  let conversion_tbl =
    [ ("]", "5D") (* Must be the first element because of [Str.regexp]. *)
    ; ("%", "25") (* The escape character itself: without this entry a
                     literal "%41" in [s] would be read back as "A". *)
    ; ("!", "21")
    ; ("#", "23")
    ; ("$", "24")
    ; ("&", "26")
    ; ("'", "27")
    ; ("(", "28")
    ; (")", "29")
    ; ("*", "2A")
    ; ("+", "2B")
    ; (",", "2C")
    ; ("/", "2F")
    ; (":", "3A")
    ; (";", "3B")
    ; ("=", "3D")
    ; ("?", "3F")
    ; ("@", "40")
    ; ("[", "5B")
    ]
  in
  (* Encodes one matched one-character string. *)
  let encode_char = fun
    [ " " -> "+"
    | c ->
        try "%" ^ List.assoc c conversion_tbl with [ Not_found -> c ]
    ]
  in
  (* Character set made of every key of the table plus the space. *)
  let special_chars =
    Str.regexp (
      "[" ^ String.concat "" (conversion_tbl |> List.split |> fst) ^ " " ^ "]"
    )
  in
  (* [Str.global_substitute] hands the whole subject string to the callback;
     the text of the current match is recovered with [Str.matched_string]. *)
  let subst subject = subject |> Str.matched_string |> encode_char in
  Str.global_substitute special_chars subst s
;
(* [query_of_env env] renders the association list [env] as a query string:
   each pair becomes "key=value" with the value passed through [url_encode]
   (the key is inserted verbatim), and the pairs are joined with "&". *)
value query_of_env env =
  let binding (key, v) = key ^ "=" ^ url_encode v in
  env |> List.map binding |> String.concat "&"
;
(* Compiled pattern for a "\citation{...}" macro.  Group 1 captures the text
   between the opening brace and the last closing brace the match can reach,
   since ".*" is greedy. *)
value citation_regexp =
  let macro_pattern = "\\\\citation{\\(.*\\)}" in
  Str.regexp macro_pattern
;
value extract_citation save_corpus line =
if Str.string_match citation_regexp line 0 then
save_corpus ~text:(Str.matched_group 1 line)
else
prerr_endline "Wrong input format: expected one citation macro per line."
(* [extract_citation save_sentence env line line_no] checks that [line]
   begins with a citation macro (see [citation_regexp]).  If so, it builds a
   query string from [env] extended with a "text" binding holding the
   macro's argument, and passes it to [save_sentence] as [~query].

   On any failure a message prefixed with [line_no] is printed on stderr and
   the program exits with status 1.  A malformed line and a failure while
   saving are reported separately, so that a save error is not misreported
   as an input-format error. *)
value extract_citation save_sentence env line line_no =
  if Str.string_match citation_regexp line 0 then
    try
      let query = query_of_env [ ("text", Str.matched_group 1 line) :: env ] in
      save_sentence ~query
    with
    [ exn ->
        do
          { Printf.eprintf "Line %d: %s\n" line_no (Printexc.to_string exn)
          ; exit 1
          }
    ]
  else
    do
      { Printf.eprintf
          "Line %d: \
           Wrong input format (expect one citation macro per line)\n" line_no
      ; exit 1
      }
;
value populate_corpus save_corpus file =
let rec aux ch i =
try
let line = input_line ch in
do
{ extract_citation (save_corpus ~sentence_no:i) line
; aux ch (i + 1) }
with
[ End_of_file -> () ]
in
let ch = open_in file in
do
{ aux ch 1
; close_in ch }
(* [populate_corpus save_corpus dir file] reads [file] line by line and
   calls [extract_citation] on each line, numbering lines from 1.  The
   environment passed for a line binds [Params.corpus_dir] to the current
   contents of [dir], [Params.sentence_no] to the line number, and "t" to
   [Paths.default_transliteration].

   Only [input_line] is guarded against [End_of_file]: the loop is therefore
   tail-recursive, and an [End_of_file] raised by the save callback is not
   mistaken for the end of the input.  The channel is closed whether the
   traversal succeeds or raises. *)
value populate_corpus save_corpus dir file =
  let read_line ch =
    try Some (input_line ch) with [ End_of_file -> None ]
  in
  let rec aux ch i =
    match read_line ch with
    [ None -> ()
    | Some line ->
        let env =
          [ (Params.corpus_dir, dir.val)
          ; (Params.sentence_no, string_of_int i)
          ; ("t", Paths.default_transliteration)
          ]
        in
        do
          { extract_citation save_corpus env line i
          ; aux ch (i + 1)
          }
    ]
  in
  let ch = open_in file in
  try
    do
      { aux ch 1
      ; close_in ch
      }
  with
  [ exn ->
      do
        { close_in_noerr ch
        ; raise exn
        }
  ]
;
(***************)
(* Entry point *)
(***************)
(* Command-line entry point.  Declares the [-d] option, which stores its
   string argument in [dir], then lets [Arg.parse] hand every anonymous
   argument (the citation file) to [populate_corpus].
   NOTE(review): this span appears to interleave the removed and the added
   lines of a diff (two [save_corpus] bindings, two [Arg.parse] calls);
   confirm against the actual file before editing.
   NOTE(review): nothing visible here checks that [-d] was actually given;
   [dir] simply keeps its initial value "" in that case. *)
value main =
let save_corpus =
Corpus.save_sentence ~corpus_dir:"./"
~translit:Paths.default_transliteration ~unsandhied:False
let dir = ref "" in
let save_corpus = Corpus.save_sentence ~corpus_location:"./" in
(* -d is a mandatory option! *)
let opts =
Arg.align
[ ("-d", Arg.Set_string dir,
" Set the destination directory (ending with a slash)") ]
in
Arg.parse [] (populate_corpus save_corpus) (Sys.argv.(0) ^ " <citation_file>")
(* The usage banner shows only the base name of the executable. *)
Arg.parse opts (populate_corpus save_corpus dir)
(Filename.basename Sys.argv.(0) ^ " [options] <citation_file>")
;
......@@ -267,7 +267,7 @@ ML/tag_apte.ml
# CORPUS package - corpus manager
CORPUS = ML/corpus.mli ML/corpus.ml ML/corpus_manager.mli \
ML/corpus_manager.ml ML/corpus_manager_cgi.ml \
ML/save_corpus_cgi.ml
ML/save_corpus_cgi.ml ML/mk_corpus.ml
DEBUG= ML/morpho_debug.ml ML/debug.ml
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment