minitertree.ml 2.43 KB
Newer Older
huet's avatar
huet committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
(**************************************************************************)
(*                                                                        *)
(*                 The Zen Computational Linguistics Toolkit              *)
(*                                                                        *)
(*                              Grard Huet                               *)
(*                                                                        *)
(* 2002 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(*i Minimization of Ternary Trees as Ternary Dags by bottom-up hashing i*)

(*i module Minitertree = struct i*)
 
(* Similarly to [Mini] for tries, we may dagify ternary trees. *)

value hash_max = 9689 (* Mersenne 21 *) 
;
module Dag = Share.Share (struct type domain=Tertree.tree; 
                                 value size=hash_max; end)
;
value hash0 = 1 (* linear hash-code parameters *)
and hash1 letter key sum = sum + letter*key
and hash b arcsl k n arcsr = (arcsl + arcsr + n*k+ Gen.dirac b) mod hash_max;
(*i Caution - replace by (abs (...)) for very big tries to correct
   for int overflow, since mod may return a negative result, leading to
   error out-of-bound array at execution i*)

value leaff = Tertree.Leaf False 
and leaft = Tertree.Leaf True
;
value traverse lookup = travel
 where rec travel = fun
  [ Tertree.Tree(b,fl,n,t,fr) -> 
    let f (trees,span) (n,t) = 
       let (t0,k) = travel t in
       ([ (n,t0) :: trees ],hash1 n k span) in
    let (arcsl,spanl) = List.fold_left f ([],hash0) fl
    and (t1,k1) = travel t
    and (arcsr,spanr) = List.fold_left f ([],hash0) fr in
    let key = hash b spanl k1 n spanr in
    (lookup (Tertree.Tree(b,List.rev arcsl,n,t1,List.rev arcsr)) key, key)
  | Tertree.Leaf b -> if b then (leaft,1) else (leaff,0)
  ]
;

(* Now we make a dag from a trie by recognizing common subtries. *)
value compress = traverse Dag.share
;
value minimize tree = let (dag,_) = compress tree in dag
;
value rec dagify_tree () =
  let lexicon = (input_value stdin : Tertree.tree) in
  let dag = minimize lexicon in 
  output_value stdout dag
;
value reset = Dag.reset
;

(*i value ch = open_in "english.ter.rem";    
    value eng = (input_value ch : Tertree.tree); 
    value dag = Minitertree.minimize eng; 
    value cho = open_out "english.ter.min.rem";
    output_value cho dag; 
    close_out cho; i*)

(*i end; i*)