index.ml 3.54 KB
Newer Older
huet's avatar
huet committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Gérard Huet                               *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(* Indexing utility *)

(*i module Index = struct i*)

(*i *)
open Trie; 
(* i*)

(* [extract_zip : zipper -> word] *)
value extract_zip = extract_zip_acc []
where rec extract_zip_acc suff = fun 
  [ Top -> suff
  | Zip (_,_,n,_,up) -> extract_zip_acc [ n :: suff ] up
  ]
;
exception Last of string
;
value rec previous b left z = match left with
  [ [] -> if b then extract_zip z 
          else match z with
               [ Top -> failwith "entry 'a' missing"
               | Zip (b',l',_,_,z') -> previous b' l' z'
               ]
  | [ (n,t) :: _ ] -> let w1 = extract_zip z 
                      and w2 = last_trie t in
                      w1 @ [ n :: w2 ]
  ]
;
(* Vicious hack to return first homonym if it exists - ugly *)
value next_trie_homo = next_rec [] 
  where rec next_rec pref = fun
  [ Trie (b,l) -> 
    if b then List.rev pref
    else try let _ = List.assoc 51 l (* looking for homonym \#1 *) in
             List.rev [ 51 :: pref ] (* found - we know it is accepting *)
         with (* no homonym - we keep looking for first accepting suffix *)
         [ Not_found -> match l with 
           [ [] -> failwith "next" (* should not happen if trie in normal form *)
           | [ (n,u) :: _ ] -> next_rec [ n :: pref ] u
           ]
         ] 
  ]
;
value escape w = raise (Last (Canon.decode w))
;
(* [search : (w:word) -> (t:trie) -> (string * bool * bool)] *)
(* Assert : [t] is not [Empty] *)
(* [search w t] returns either the first member of [t] with [w] as initial 
   substring with a boolean [exact] indicating if the match is exact 
   and another one [homo] marking homonymy or else raises [Last s] 
   with [s] the last member of [t] less than [w] in lexicographic order.
   Beware. Do not change this code if you do not understand fully the specs. *)
value search w t = access w t Trie.Top 
  where rec access w t z = match w with
     [ [] -> let w1 = extract_zip z 
             and w2 = next_trie_homo t in
             let exact = w2=[] 
             and homo = w2=[ 51 ] in 
             (Canon.decode (w1 @ w2),exact,homo)
     | [ n :: rest ] -> match t with
        [ Trie (b,arcs) -> match arcs with
          [ [] -> if b then escape (extract_zip z)
                  else failwith "Empty trie" 
          | _ -> let (left,right) = List2.zip n arcs in 
                 match right with
            [ [] -> let w1=extract_zip z and w2= last_trie t in 
                    escape (w1 @ w2)
            | [ (m,u) :: upper ] ->  
              if m=n then access rest u (Zip (b,left,m,upper,z)) 
              else escape (previous b left z)
            ]
          ]
        ]
     ]
;
value read_entries () = 
  (Gen.gobble Web.public_entries_file : trie)
;
value is_in_lexicon word =
  (* Checks whether entry [word] actually appears in the lexicon, *)
  (* so that a reference URL is generated in the answers or not.  *)
  (* NB: not indexed by lexical category *)
  let entries_trie = read_entries () in
  Trie.mem word entries_trie
;

(*i end; i*)