indexer.ml 7.12 KB
Newer Older
huet's avatar
huet committed
1 2 3 4 5 6
(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Gérard Huet                               *)
(*                                                                        *)
7
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
huet's avatar
huet committed
8 9 10 11 12 13 14 15 16 17
(**************************************************************************)

(* CGI-bin indexer for indexing in sanskrit dictionary.                   *)
(* This CGI is triggered by page [index.html] in [dico_dir].              *)
(* Reads its input in shell variable [QUERY_STRING] URI-encoded.          *)
(*i Test with: QUERY_STRING = "q=yoga"; ./indexer                        i*)
(*i Web invocation is http://skt_server_url/cgi-bin/sktindex?query=str   i*)

(*i module Indexer = struct i*)

18 19
open Html; (* [table_begin] etc. *) 
open Web; (* ps pl abort etc. *) 
huet's avatar
huet committed
20 21 22 23 24 25 26
open Cgi;

value answer_begin () = do
  { pl (table_begin Yellow_cent)
  ; ps tr_begin
  ; ps th_begin
  }
27
;
huet's avatar
huet committed
28 29 30 31 32 33
value answer_end () = do
  { ps th_end
  ; ps tr_end
  ; pl table_end 
  ; pl html_paragraph
  }
34
;
35 36
value ok (mess,s) = do { ps mess; pl (Morpho_html.skt_anchor False s) }
 and ok2 (mess,s1,s2) = do { ps mess; pl (Morpho_html.skt_anchor_R s1 s2) }
huet's avatar
huet committed
37 38 39
     (* ok2 prints the entry under the spelling given by the user, i.e. without 
        normalisation, thus e.g. sandhi is not written sa.mdhi, and possibly 
        suffixed by homonymy index 1, e.g. b.rh. *)
40 41
;
(* Should share [Lemmatizer.load_inflected] *)
huet's avatar
huet committed
42 43
value load_inflected file = (Gen.gobble file : Morphology.inflected_map)
;
44 45 46 47 48
value load_nouns   () = load_inflected Data.public_nouns_file
and   load_roots   () = load_inflected Data.public_roots_file
and   load_vocas   () = load_inflected Data.public_vocas_file
and   load_indecls () = load_inflected Data.public_inde_file
and   load_parts   () = load_inflected Data.public_parts_file
49
;
huet's avatar
huet committed
50 51 52 53 54 55 56 57 58 59 60 61
value back_ground = background Chamois
;
value display word l = do 
  { ps " found as inflected form:"
  ; pl html_break
  ; let pi inv = Morpho_html.print_inflected False word inv in
    List.iter pi l
  }
and report_failure s = do
  { ps " not found in dictionary"
  ; pl html_break
  ; ps "Closest entry in lexical order: " 
62
  ; ps (Morpho_html.skt_anchor False s)
huet's avatar
huet committed
63 64
  ; pl html_break
  }
65
;
huet's avatar
huet committed
66 67 68 69 70 71 72 73
value try_declensions word before = 
  (* before is last lexical item before word in lexical order *)
  (* This is costly because of the size of inverted inflected databases *)
  let inflectedn = load_nouns () in
  match Deco.assoc word inflectedn with
  [ [] -> (* Not found; we try vocative forms *)
    let inflectedv = load_vocas () in
    match Deco.assoc word inflectedv with
Gérard Huet's avatar
Gérard Huet committed
74 75 76 77 78 79 80 81 82 83 84 85 86 87
    [ [] -> (* Not found; we try root forms *)
      let inflectedr = load_roots () in
      match Deco.assoc word inflectedr with
      [ [] -> (* Not found; we try adverbial forms *)
        let inflecteda = load_indecls () in
        match Deco.assoc word inflecteda with
        [ [] -> report_failure before
          (* NB - no look-up in parts forms since big and partly lexicalized *)
        | l -> display word l
        ] 
      | l -> display word l
      ] 
    | l -> display word l
    ]
huet's avatar
huet committed
88 89 90 91 92 93 94 95 96 97 98 99 100 101
  | l -> display word l
  ]
;
value print_word_unique word (entry,lex,page) = (* lex="other" allowed *)
  let link = Morpho_html.skt_anchor_M word entry page False in
  pl (link ^  " [ " ^ lex ^ " ]" ^ xml_empty "br")
  (* this allows access to a pseudo-entry such as "hvaaya" *)
;
value print_word word (entry,lex,page) = match lex with
  [ "other" -> ()
  | _ -> print_word_unique word (entry,lex,page)
  ]
;
value read_mw_index () = 
102
  (Gen.gobble Data.public_mw_index_file : Deco.deco (string * string * string)) 
huet's avatar
huet committed
103
;
104 105
value skt_red s = html_red (Morpho_html.skt_roma s)
;
huet's avatar
huet committed
106 107 108 109 110 111 112 113 114 115 116
value index_engine () = do
  { pl http_header
  ; page_begin heritage_dictionary_title
  ; pl (body_begin back_ground)
  ; let query = Sys.getenv "QUERY_STRING" in 
  let env = create_env query in
  let translit = get "t" env Paths.default_transliteration 
  and lex = get "lex" env Paths.default_lexicon (* default by config *)
  and url_encoded_entry = get "q" env "" in
  let lang = language_of lex in do
  { print_title_solid Mauve (Some lang) (dico_title lang)
117 118
  ; let str = decode_url url_encoded_entry (* in translit *) 
    and encode = Encode.switch_code translit  
huet's avatar
huet committed
119
    and () = toggle_lexicon lex in
120 121 122 123 124
    try let word = encode str (* normalization *) in 
        let str_VH = Canon.decode word in do 
        { answer_begin ()
        ; ps (div_begin Latin12)
        ; match lex with 
huet's avatar
huet committed
125 126 127 128
          [ "MW" -> 
            let mw_index = read_mw_index () in 
            let words = Deco.assoc word mw_index in
            match words with
129
              [ [] -> do { ps (skt_red str_VH)
huet's avatar
huet committed
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
                         ; ps " not found in MW dictionary"
                         ; pl html_break
                         }
              | [ unique ] -> print_word_unique str_VH unique
              | _ -> List.iter (print_word str_VH) (List.rev words) 
              ] 
          | "SH" -> do (* richer search engine *)
            { let sh_index = Index.read_entries () in 
              try let (s,b,h) = Index.search word sh_index in 
                  if b || h then 
                     let r = Canon.decode word in 
                     let hr = if h then r ^ "_1" else r in
                     ok2 ("Entry found: ",s,hr)
                  else ok ("First matching entry: ",s)
                    (* remark that s may be str with some suffix,   *)
                    (* even though str may exist as inflected form  *)
              with (* Matching entry not found - we try declensions *)
                  [ Index.Last last -> do
148
                      { ps (skt_red str_VH)
huet's avatar
huet committed
149 150 151
                      ; try_declensions word last
                      }
                  ]
Gérard Huet's avatar
Gérard Huet committed
152
	    }
huet's avatar
huet committed
153 154 155 156 157
          | _ -> failwith "Unknown lexicon"
          ]
        ; ps div_end (* Latin12 *)
        ; answer_end ()
        ; page_end lang True
158
        } (* do *)
huet's avatar
huet committed
159 160 161
    with [ Stream.Error _ -> abort lang "Illegal transliteration " str ]
  } (* do *)
  } (* do *)
162
;
huet's avatar
huet committed
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
value safe_index_engine () = 
  let abor = abort Html.French (* may not preserve the current language *) in 
  try index_engine () with 
  [ Sys_error s        -> abor Control.sys_err_mess s (* file pb *)
  | Stream.Error s     -> abor Control.stream_err_mess s (* file pb *)
  | Invalid_argument s -> abor Control.fatal_err_mess s (* sub *)
  | Failure s          -> abor Control.fatal_err_mess s (* anomaly *)
  | Control.Fatal s    -> abor Control.fatal_err_mess s (* anomaly *)
  | Not_found          -> abor Control.fatal_err_mess "assoc" (* assoc *)
  | End_of_file        -> abor Control.fatal_err_mess "EOF" (* EOF *)
  | Encode.In_error s  -> abor "Wrong_input " s
  | Exit               -> abor "Wrong character in input - " "use ASCII" (* Sanskrit *)
  | _                  -> abor Control.fatal_err_mess "Unexpected anomaly" (* ? *)
  ]
;
(* typical invocation is [http://skt_server_url/cgi-bin/sktindex?t=VH&lex=SH&q=input] *)
safe_index_engine ()
;

(*i end; i*)