Commit d3a0f200 authored by Gérard Huet

Fix transliteration bug in indexer and indexerd

parent ff86ab64
......@@ -15,8 +15,8 @@
(*i module Indexer = struct i*)
open Html; (* abort *)
open Web; (* ps pl etc. *)
open Html; (* [table_begin] etc. *)
open Web; (* ps pl abort etc. *)
open Cgi;
value answer_begin () = do
......@@ -24,21 +24,21 @@ value answer_begin () = do
; ps tr_begin
; ps th_begin
}
;
;
(* Terminates the answer layout: sends [th_end] then [tr_end] through [ps],
   and afterwards [table_end] then [html_paragraph] through [pl]. *)
value answer_end () = do
  { List.iter ps [ th_end; tr_end ]
  ; pl table_end
  ; pl html_paragraph
  }
;
;
(* [ok (mess,s)] prints the message [mess] with [ps], then prints with [pl]
   the anchor built by [Morpho_html.skt_anchor_R False s]. *)
value ok (mess,s) = do { ps mess; pl (Morpho_html.skt_anchor_R False s) }
(* [ok2 (mess,s1,s2)] prints [mess] the same way, but builds the anchor from
   the two strings with [Morpho_html.skt_anchor_R2 s1 s2]. *)
and ok2 (mess,s1,s2) = do { ps mess; pl (Morpho_html.skt_anchor_R2 s1 s2) }
(* ok2 prints the entry under the spelling given by the user, i.e. without
      normalisation, thus e.g. sandhi is not written sa.mdhi, and possibly
      suffixed by homonymy index 1, e.g. b.rh. *)
;
(* Should share [Lemmatizer.load_inflected] *)
;
(* Should share [Lemmatizer.load_inflected] *)
(* Reads [file] through [Gen.gobble] and returns the result typed as a
   [Morphology.inflected_map]. *)
value load_inflected file =
  let inflected = (Gen.gobble file : Morphology.inflected_map) in
  inflected
;
value load_nouns () = load_inflected public_nouns_file
......@@ -46,7 +46,7 @@ and load_roots () = load_inflected public_roots_file
and load_vocas () = load_inflected public_vocas_file
and load_indecls () = load_inflected public_inde_file
and load_parts () = load_inflected public_parts_file
;
;
(* Value obtained by applying [background] to [Chamois]; presumably the
   background colour used for the indexer pages - to be confirmed. *)
value back_ground = background Chamois
;
value display word l = do
......@@ -62,7 +62,7 @@ and report_failure s = do
; ps (Morpho_html.skt_anchor_R False s)
; pl html_break
}
;
;
value try_declensions word before =
(* before is last lexical item before word in lexical order *)
(* This is costly because of the size of inverted inflected databases *)
......@@ -112,14 +112,14 @@ value index_engine () = do
and url_encoded_entry = get "q" env "" in
let lang = language_of lex in do
{ print_title_solid Mauve (Some lang) (dico_title lang)
; answer_begin ()
; ps (div_begin Latin12)
; let str = decode_url url_encoded_entry (* in translit *)
and encode = Encode.switch_code translit
; let str = decode_url url_encoded_entry (* in translit *)
and encode = Encode.switch_code translit
and () = toggle_lexicon lex in
try let word = encode str (* normalization *) in
let str_VH = Canon.decode word in do
{ match lex with
try let word = encode str (* normalization *) in
let str_VH = Canon.decode word in do
{ answer_begin ()
; ps (div_begin Latin12)
; match lex with
[ "MW" ->
let mw_index = read_mw_index () in
let words = Deco.assoc word mw_index in
......@@ -152,13 +152,12 @@ value index_engine () = do
]
; ps div_end (* Latin12 *)
; answer_end ()
; ()
; page_end lang True
}
} (* do *)
with [ Stream.Error _ -> abort lang "Illegal transliteration " str ]
} (* do *)
} (* do *)
;
;
value safe_index_engine () =
let abor = abort Html.French (* may not preserve the current language *) in
try index_engine () with
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* CGI-bin indexerd for indexing in sanskrit dico without diacritics. *)
......@@ -52,14 +52,12 @@ value print_word c = pl (Morpho_html.skt_anchor_R False (Canon.decode_ref c))
(* Each dummy is mapped to a list of words - all the words which
give back the dummy by normalisation such as removing diacritics *)
value read_dummies () =
(Gen.gobble public_dummies_file : Deco.deco Word.word)
(Gen.gobble public_dummies_file : Deco.deco Word.word)
;
value index_engine () =
let abor = abort Html.French (* may not preserve the current lang *) in
try let dummies_deco = read_dummies () in do
{ prelude ()
; answer_begin ()
; ps (div_begin Latin12)
; let query = Sys.getenv "QUERY_STRING" in
let alist = create_env query in
(* We do not assume transliteration, just ordinary roman letters *)
......@@ -67,7 +65,9 @@ value index_engine () =
let url_encoded_entry = List.assoc "q" alist in
let str = decode_url url_encoded_entry in
try let word = Encode.code_skt_ref_d str (* normalization *) in do
{ let words = Deco.assoc word dummies_deco in
{ answer_begin ()
; ps (div_begin Latin12)
; let words = Deco.assoc word dummies_deco in
match words with
[ [] -> do { ps (Morpho_html.skt_red str)
; ps " not found in Heritage dictionary"
......
......@@ -2384,7 +2384,7 @@ value intercalates root =
| "grah" -> setl
| "s.rj#1" -> [ 3 ] (* sra.s.taa *)
| "k.r.s" -> [ 3 :: vet ] (* ar -> ra optionally *)
| "bh.rjj" -> [ 3 :: anit ] (* idem *)
| "bh.rjj" | "sp.rz#1" -> [ 3 :: anit ] (* idem *)
| "ad#1" | "aap" | "krudh#1" | "kruz" | "k.sip" | "k.sud"
| "k.sudh#1" | "khid" | "chid#1" | "tud#1" | "tu.s" | "t.rp#1"
| "tvi.s#1" | "diz#1" | "dih" | "du.s" | "duh#1" | "d.rz#1"
......@@ -2396,7 +2396,7 @@ value intercalates root =
| "lup" | "vac" | "vap#1" | "vic" | "vid#2" | "viz#1" | "vi.s#1"
| "vyadh" | "zak" | "zad" | "zap" | "zi.s" | "zudh" | "zu.s"
| "zli.s" | "sa~nj" | "sic" | "sidh#1" | "s.rp" | "skand"
| "sp.rz#1" | "sva~nj" | "svid#2" | "had"
| "sva~nj" | "svid#2" | "had"
-> anit
| _ -> set (* default all multisyllabic, gana 10, nominal verbs plus:
[ "afg" | "a~nc" | "an#2" | "arh" | "av" | "az#1" | "az#2" | "as#2" | "aas#2"
......@@ -2715,7 +2715,7 @@ value perstems rstem entry =
-> [ 0 ]
| "v.rj" -> [ 1 ]
| "zuc#1" -> [ 0; 1 ] (* zoktum *)
| "d.rz#1" -> [ 3 ] (* ar -> ra dra.s.tum *)
| "d.rz#1" | "sp.rz#1" -> [ 3 ] (* ar -> ra dra.s.tum *)
| "k.r.s" | "bh.rjj" -> [ 0; 3 ] (* berk *)
| "naz#1" -> [ 0; 1; 4 ] (* berk - (1 not in WR) *)
| "radh" | "trap#1" | "d.rp" | "druh#1" | "muh" | "rudh#2"
......
......@@ -8,4 +8,4 @@
(**************************************************************************)
(* Generated by make version - see main Makefile *)
value version="3.05" and version_date="2018-03-09";
value version="3.05" and version_date="2018-03-15";
VERSION='3.05'
DATE='2018-03-09'
DATE='2018-03-15'
......@@ -2534,14 +2534,14 @@ Encore bravo!<br />
Guy Fontaine<br />
<hr />
January 2018. The Heritage site is accessible from
January 2018. The Heritage site is referenced by
<a href="http://spokensanskrit.org">The Spoken Sanskrit Site</a>
<hr />
<hr />
From: "Ganesh J. Acharya" <ganeshjacharya@gmail.com><br />
To: GH Huet <Gerard.Huet@inria.fr><br />
Date: 11 février 2018<br />
Glad to see an excellent work at sanskrit.inria.fr<br />
......@@ -2550,6 +2550,8 @@ Ganesh J. Acharya<br />
Mob: +91-9323193xxx, Ph: +91-22-28152xxx<br />
Linkedin | Facebook | Twitter
<hr />
</td></tr>
</table> <!-- body -->
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment