automaton.ml 8.06 KB
 huet committed May 04, 2017 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 (**************************************************************************) (* *) (* The Sanskrit Heritage Platform *) (* *) (* Gérard Huet & Pawan Goyal *) (* *) (* ©2017 Institut National de Recherche en Informatique et en Automatique *) (**************************************************************************) (*i module Automaton = struct i*) open Canon; (* decode rdecode *) open Phonetics; open Auto.Auto; (* rule auto stack *) open Deco; (* Generalises the structure of trie, seen as a representation of deterministic automaton (recognizer for prefix-shared set of strings), into the graph of a non-deterministic automaton, chaining external sandhi with recognition of inflected forms from the inflected lexicon. *) (* Algorithm. For every inflected form [f], and for every external sandhi rule [r: u|v -> w] such that [f=x.u], construct a choice point from state [S x] to an iterating block B(r). [S x] is the state reachable from the initial state (top of the trie) by input [x], going on the deterministic subgraph, copy of the trie. The set of iterating blocks pertaining to a node are grouped in a list of non-deterministic choice points. *) (* Parser operation. The parser traverses the state tree while scanning the input. Assume it is at state [S x] looking at input [z]. It has the choice of either staying in the deterministic part (word lookup) by going to the deterministic transition corresponding to the first symbol in [z], with no output, or else choosing in the non-deterministic part a choice block B(r) as an epsilon move (no scanning of [z]), and then, with [r: u|v -> w], recognize that [w] is a prefix of [z] (scan it or else backtrack), emit the parse trace [-r-] where [f=inflected(x.u)], and iterate by jumping to state [S v] (we assume that sandhi rules are stripped so that [S v] always exists). A stack of [(choices,input_index)] permits to backtrack on input failure. The final sandhi rules [u|# -> y] are treated similarly, with \# matching end of input, but instead of jumping we accept and propose the parse trace as a legal tagging of the sentence (with possible continuation into backtracking for additional solutions). On backtracking a stack of failed attempts may be kept, in order to restart gracefully when a word is missing from the lexicon. This robustification will be essential to turn the parser into a bootstrapping lexicon acquisition device. *) (* {\bf Construction of the automaton.} *) (* Remark that it is linear in one bottom-up traversal of the inflected trie. *) type rules = array stack ; (* A sandhi entry is a list [[l1; l2; ... ln] with li=[si1; si2; ... sini]] *) (* with [sij=(c1,c2,c3) where c1=code w, c2=rev (code u), c3=code v ] *) (* such that [u|v -> w] by external sandhi, with [i=|u|] *) (* [sandhis] concerns u ended by s or .h, and i = 1 or 2 *) (* [sandhir] concerns u ended by r, and i = 1 or 2 *) (* [sandhin] concerns u ended by n, and i = 1 or 2 *) (* [sandhif] concerns u ended by f, and i = 1 or 2 *) (* [sandhio] concerns u ended by other letters, and i = 1 *) (* We read sandhi rules compiled by [compile_sandhi] *) value (sandhis, sandhir, sandhin, sandhif, sandhio) = (Gen.gobble Web.sandhis_file : (rules * rules * rules * rules * rules)) ; value get_sandhi = fun (* argument is [mirror (code u)] *) [ [] -> failwith "get_sandhi 0" | [ 43 (* r *) :: before ] -> match before with [ [] -> failwith "get_sandhi 1" | [ penu :: _ ] -> sandhir.(penu) ] | [ 48 (* s *) :: before ] | [ 16 (* .h *) :: before ] -> match before with [ [] -> failwith "get_sandhi 2" | [ penu :: _ ] -> sandhis.(penu) ] | [ 36 (* n *) :: before ] -> match before with [ [] -> failwith "get_sandhi 3" | [ penu :: _ ] -> sandhin.(penu) ] | [ 21 (* f *) :: before ] -> match before with [ [] -> failwith "get_sandhi 4" | [ penu :: _ ] -> sandhif.(penu) ] | [ c :: _ ] -> if c < 0 then failwith "get_sandhi 5" else if c > 49 then failwith "get_sandhi 6" else sandhio.(c) ] ; (* Same as [Compile_sandhi.merge] *) value rec merge st1 st2 = match st1 with [ [] -> st2 | [ l1 :: r1 ] -> match st2 with [ [] -> st1 | [ l2 :: r2 ] -> [ (List2.union l1 l2) :: (merge r1 r2) ] ] ] ; (* We add to the stack arrays a deco rewrite set *) (* A rewrite deco maps revu to a list of rules (w,revu,v) *) type rewrite_set = Deco.deco rule ; value project n = fun [ Deco (_,arcs) -> try List.assoc n arcs with [ Not_found -> empty ] ] and get_rules = fun [ Deco (rules,_) -> rules ] ; (* Union of two decos *) value rec merger d1 d2 = match d1 with [ Deco (i1,l1) -> match d2 with [ Deco (i2,l2) -> Deco (i1 @ i2, mrec l1 l2) where rec mrec l1 l2 = match l1 with [ [] -> l2 | [ (n,d) :: l ] -> match l2 with [ [] -> l1 | [ (n',d') :: l' ] -> if n lexicon -> (auto * stack * rewrite_set * int)] *) (* The occurrence list [occ] is the reverse of the access word. *) where rec traverse occ = fun [ Trie.Trie (b,arcs) -> let local_stack = if b then get_sandhi occ else [] and local_rewrite = if b then rewrite else empty in let f (deter, stack, rewrite, span) (n,t) = let current = [ n :: occ ] in (* current occurrence *) let (auto, st, rew, k) = traverse current t in ([ (n, auto) :: deter ], merge st stack, merger (project n rew) rewrite, hash1 n k span) in let (deter, stack, rewrite, span) = List.fold_left f ([],[],local_rewrite,hash0) arcs in let (h,l) = match stack with [ [] -> ([],[]) | [ h :: l ] -> (h,l) ] in (* the tail [l] of [stack] initialises the stack for upper nodes, its head [h] contains the list of current choice points *) let key = hash b span h in let s = Auto.share (State (b,List.rev deter,get_rules rewrite @ h)) key in (s, merge local_stack l, rewrite, key) ] ; (* *** IMPORTANT *** The arcs in deter are in decreasing order, because of [fold_left]. We put them back in increasing order by [List.rev deter]. This is not strictly needed, and order of siblings is not important since access is done with [assoc]. However, it is crucial to maintain proper order for operations such as [split], which splits an automaton into vowel-initial and consonant-initial subparts. Thus reversal was enforced when split was introduced in V2.43. *) (* Compile builds a tagging transducer from a lexicon index. *) (* [compile : bool -> rewrites -> Trie.trie -> Auto.auto] *) value compile rewrite lexicon = let (transducer, stack, _, _) = build_auto rewrite [] lexicon in match stack with [ [] -> transducer | _ -> (* Error: some sandhi rule has action beyond one word in the lexicon *) raise Overlap ] ; (*i end; i*)