(**************************************************************************) (* *) (* The Zen Computational Linguistics Toolkit *) (* *) (* Gérard Huet *) (* *) (* ©2002 Institut National de Recherche en Informatique et en Automatique *) (**************************************************************************) (*i module Tree = struct i*) (* Trees are ternary trees for use as two-ways tries with zippers. [Tree (b,l,i,t,r)] at occurrence [u] stores [u] as a word iff [b=True], and gives access to [t] at occurrence [[ u :: i ]] as a son, having [l] and [r] as respectively left stack of elder and right list of younger brothers; [Leaf True] at occurrence [u] stores [u] as a word with no descendants; [Leaf False] is only needed to translate [Trie.empty=Trie (False,[])]. *) (*i Inspired from Sedgewick. i*) (*i Currently unused in Sanskrit processing chain excepts for tests. i*) type tree = [ Tree of (bool * forest * int * tree * forest) | Leaf of bool ] and forest = list (int * tree) ; (* Invariant : integers are in increasing order in siblings, no repetition. *) (* Simple translation of a trie as a tree. *) value rec trie_to_tree = fun [ Trie.Trie (b,arcs) -> match arcs with [ [] -> Leaf b | [ (n,t) :: arcs ] -> Tree (b,[],n,trie_to_tree t,List.map f arcs) where f (n,t) = (n,trie_to_tree t) ] ] ; exception Anomaly ; (* More sophisticated translation as a balanced tree. *) value rec balanced = fun [ Trie.Trie (b,arcs) -> match arcs with [ [] -> Leaf b | _ -> (* bal balances k first arcs of l and stacks them in acc *) let rec bal acc l k = (* assert [|l| >= k] *) if k=0 then (acc,l) else match l with [ [] -> raise Anomaly (* impossible by assertion *) | [ (n,t) :: r ] -> bal [ (n,balanced t) :: acc ] r (k-1) ] in let (stack,rest) = let half = (List.length arcs)/2 in bal [] arcs half in match rest with [ [] -> raise Anomaly (* [|rest| = |arcs| - half > 0] *) | [ (n,t) :: right ] -> Tree (b,stack,n,balanced t,List.map f right) where f (n,t) = (n,balanced t) ] ] ] ; (*i Note : when access, leave corresponding balance from access zipper i*) type zipper = [ Top | Zip of (bool * forest * int * forest * zipper) ] ; (* [zip_up : tree -> zipper -> tree] *) value rec zip_up t = fun [ Top -> t | Zip (b,left,n,right,up) -> zip_up (Tree (b,left,n,t,right)) up ] ; (* [tree_of c] builds the filiform [tree] containing [c]. *) (* [tree_of : word -> trie] *) value rec tree_of = fun [ [] -> Leaf False | [ n ] -> Tree (False,[],n,Leaf True,[]) | [ n :: rest ] -> Tree (False,[],n,tree_of rest,[]) ] ; (* [mem_tree : word -> tree -> bool] *) value rec mem_tree c = fun [ Tree (b,l,n,t,r) -> match c with [ [] -> b | [ i :: s ] -> let rec memrec l n t r = if i=n then mem_tree s t else if i False | [ (m,u) :: l'] -> memrec l' m u [ (n,t) :: r ] ] else match r with [ [] -> False | [ (m,u) :: r' ] -> memrec [ (n,t) :: l] m u r' ] in memrec l n t r ] | Leaf b -> b && c=[] ] ; (* We assume that [enter] used over tries, and that [trees] are not updated. *) (* *) (* Translates trie in [entries_file] into corresponding tree. *) value translate_entries entries_file result_file = let entries_trie = (Gen.gobble entries_file : Trie.trie) in Gen.dump (balanced entries_trie) result_file ; (*i TODO - replace tries by trees in index to have faster access. NOTE - next to useless in the cgi world. Statistics English Sanskrit index (DICO/entries) trie 4.5Mb 221Kb trie dag 1.1Mb 103Kb tree 3.6Mb 180Kb tree dag 1.0Mb 96Kb i*) (*i end; i*)