Commit b444e3c1, authored by Idir Lankri and committed by Gérard Huet

Corpus manager: Handle different encodings correctly

parent 9c590695
......@@ -288,9 +288,9 @@ mk_corpus_page.cmo : web_corpus.cmi web.cmo params.cmi html.cmo
mk_corpus_page.cmx : web_corpus.cmx web.cmx params.cmx html.cmx
corpus.cmi : ../ZEN/word.cmo html.cmo
corpus.cmo : ../ZEN/word.cmo paths.cmo params.cmi html.cmo ../ZEN/gen.cmo \
dir.cmi cgi.cmo canon.cmo corpus.cmi
encode.cmo dir.cmi cgi.cmo canon.cmo corpus.cmi
corpus.cmx : ../ZEN/word.cmx paths.cmx params.cmx html.cmx ../ZEN/gen.cmx \
dir.cmx cgi.cmx canon.cmx corpus.cmi
encode.cmx dir.cmx cgi.cmx canon.cmx corpus.cmi
web_corpus.cmi : corpus.cmi
web_corpus.cmo : paths.cmo corpus.cmi web_corpus.cmi
web_corpus.cmx : paths.cmx corpus.cmx web_corpus.cmi
......
......@@ -82,6 +82,12 @@ module Encoding : sig
;
value to_string : t -> string
;
(* [of_string s] is the encoding designated by the code [s]; a code that is
   not recognised yields [Velthuis] rather than an error. *)
value of_string : string -> t
;
(* [encode enc] is the string-to-word encoder obtained by passing
   [to_string enc] to [Encode.switch_code]. *)
value encode : t -> string -> Word.word
;
(* [decode enc] is the word-to-string decoder for [enc]: the Unicode
   converters of [Canon] for [Devanagari] and [IAST], and
   [Canon.switch_decode] applied to [to_string enc] otherwise. *)
value decode : t -> Word.word -> string
;
end = struct
type t = [ Velthuis | WX | KH | SLP1 | Devanagari | IAST ]
;
......@@ -94,6 +100,25 @@ end = struct
| IAST -> "roma"
]
;
(* [of_string s] maps a transliteration code to the encoding it designates:
   "VH" gives [Velthuis], "WX" gives [WX], "KH" gives [KH], "SL" gives
   [SLP1], "deva" gives [Devanagari] and "roma" gives [IAST].
   Any other string silently falls back to [Velthuis], so the function is
   total and never raises.
   The definition does not call itself, hence no [rec] flag. *)
value of_string = fun
  [ "VH" -> Velthuis
  | "WX" -> WX
  | "KH" -> KH
  | "SL" -> SLP1
  | "deva" -> Devanagari
  | "roma" -> IAST
  | _ -> Velthuis (* unknown code: default encoding *)
  ]
;
(* [encode encoding] is the function of type [string -> Word.word] that
   [Encode.switch_code] associates with the code name [to_string encoding].
   NOTE(review): unlike [decode], no special case is made here for
   [Devanagari] and [IAST]; whether [Encode.switch_code] accepts their
   codes is not visible from this module - confirm. *)
value encode encoding = Encode.switch_code (to_string encoding)
;
(* [decode encoding] is the function of type [Word.word -> string] used to
   render a word in [encoding].
   The two Unicode targets have dedicated converters in [Canon]; every
   other encoding is looked up by its code name through
   [Canon.switch_decode]. *)
value decode encoding =
  match encoding with
  [ Devanagari -> Canon.unidevcode
  | IAST -> Canon.uniromcode
  | Velthuis | WX | KH | SLP1 -> Canon.switch_decode (to_string encoding)
  ]
;
end
;
(* What about metadata (date, author, history...) ? *)
......@@ -128,15 +153,7 @@ end = struct
value id s = s.id
;
value text encoding s =
let encode_word =
match encoding with
[ Encoding.Velthuis | Encoding.WX | Encoding.KH | Encoding.SLP1 ->
encoding |> Encoding.to_string |> Canon.switch_decode
| Encoding.Devanagari -> Canon.unidevcode
| Encoding.IAST -> Canon.uniromcode
]
in
s.text |> List.map encode_word |> String.concat " "
s.text |> List.map (Encoding.decode encoding) |> String.concat " "
;
value unsandhied s = s.unsandhied
;
......@@ -281,9 +298,10 @@ module Make (Loc : Location) : S = struct
;
value url dir permission sentence =
let analysis = Sentence.analysis sentence in
let encoding = Encoding.of_string Paths.default_transliteration in
let env =
[ (Params.corpus_permission, string_of_permission permission)
; ("text", Sentence.text Encoding.Velthuis sentence)
; ("text", Sentence.text encoding sentence)
; ("cpts", Analysis.checkpoints analysis)
; (Params.corpus_dir, dir)
; (Params.sentence_no, sentence |> Sentence.id |> string_of_int)
......@@ -301,6 +319,7 @@ module Make (Loc : Location) : S = struct
let env =
[ (Params.corpus_permission, string_of_permission permission)
; ("text", Sentence.text Encoding.Velthuis sentence)
; ("t", Encoding.(to_string Velthuis))
; ("cpts", Analysis.checkpoints analysis)
; (Params.corpus_dir, dir)
; (Params.sentence_no, sentence |> Sentence.id |> string_of_int)
......
......@@ -45,6 +45,12 @@ module Encoding : sig
;
value to_string : t -> string
;
(* [of_string s] is the encoding designated by the code [s]; a code that is
   not recognised yields [Velthuis] rather than an error. *)
value of_string : string -> t
;
(* [encode enc] is the string-to-word encoder obtained by passing
   [to_string enc] to [Encode.switch_code]. *)
value encode : t -> string -> Word.word
;
(* [decode enc] is the word-to-string decoder for [enc]: the Unicode
   converters of [Canon] for [Devanagari] and [IAST], and
   [Canon.switch_decode] applied to [to_string enc] otherwise. *)
value decode : t -> Word.word -> string
;
end
;
module Sentence : sig
......
......@@ -96,9 +96,18 @@ value main =
in
match permission with
[ Web_corpus.Annotator ->
let read_skt =
if unsandhied then Sanskrit.read_raw_sanskrit else
Sanskrit.read_sanskrit
in
let encode =
Cgi.decoded_get "t" Paths.default_transliteration env
|> Corpus.Encoding.of_string
|> Corpus.Encoding.encode
in
do
{ Web_corpus.save_sentence force corpdir sentno
(Sanskrit.read_VH unsandhied text) unsandhied (analysis_of_env env)
(read_skt encode text) unsandhied (analysis_of_env env)
; Corpus_manager.mk_page corpdir permission
}
| Web_corpus.Reader | Web_corpus.Manager ->
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment