lexer.ml 18.1 KB
Newer Older
huet's avatar
huet committed
1 2 3 4 5 6
(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Gérard Huet                               *)
(*                                                                        *)
Gérard Huet's avatar
Gérard Huet committed
7
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
huet's avatar
huet committed
8 9 10
(**************************************************************************)

(* Sanskrit Phrase Lexer in 40 phases version. *)
11
(* Used by Parser, and Rank for Reader/Regression. 
huet's avatar
huet committed
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
   Uses Phases from Dispatcher to define phase.
   Loads the transducers, calls Dispatch to create Disp. 
   Calls Segment to build Viccheda, the Sanskrit lexer that undoes sandhi 
   in order to produce a padapatha.
   Exports various print functions for the various modes. *)

open Transduction;
open Canon;
open Skt_morph;
open Morphology; (* [inflected inflected_map] *)
open Auto.Auto; (* auto State *)
open Segmenter; (* Segment *)
open Dispatcher; (* [generative Dispatch transition phase_of_sort trim_tags] *) 
open Word; (* word length mirror patch *)

module Lexer (* takes its prelude and control arguments as module parameters *)
  (Prel: sig value prelude : unit -> unit; end) 
  (Control: sig value star : ref bool; (* chunk = if star then word+ else word *)
                value full : ref bool; (* all kridantas and nan cpds if full *)
                value out_chan : ref out_channel; (* output channel *)
            end) = struct 

34
open Html;
huet's avatar
huet committed
35 36 37
open Web; (* ps pl abort etc. *)
open Cgi;
open Phases; (* Phases *) 
Gérard Huet's avatar
Gérard Huet committed
38
open Phases; (* phase *) 
huet's avatar
huet committed
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67

module Lemmas = Load_morphs.Morphs Prel Phases
;
open Lemmas (* [morpho tags_of] *)
;
open Load_transducers; (* [transducer_vect Trans] *)

module Transducers = Trans Prel;

module Disp = Dispatch Transducers Lemmas;
open Disp (* [transducer initial accepting dispatch input color_of_phase 
              transition] *) 
;
module Viccheda = Segment Phases Disp Control 
                  (* [init_segment continue set_offset] *)
;
value all_checks = Viccheda.all_checks
and   set_offset = Viccheda.set_offset
;
value un_analyzable (chunk : word) = 
  ([ (Unknown,mirror chunk,Disp.Id) ],Viccheda.finished)
;
value rec color_of_role = fun (* Semantic role of lexical category *)
  [ Pv | Pvk | Pvkc | Pvkv | Iic | Iic2 | Iik | Voca | Inv | Iicv | Iicc 
  | Iikv | Iikc | Iiif | A | An | Vok | Vokv | Vokc | Vocv | Vocc | Iiy 
  | Iiv | Iivv | Iivc | Peri | Auxiick -> Grey 
  | Noun | Noun2 | Nouv | Nouc | Krid | Kriv | Kric | Pron | Ifc | Ifc2
  | Kama | Lopak | Auxik -> Cyan (* Actor or Predicate *)
  | Root | Lopa |  Auxi -> Pink (* abs-tvaa in Inde *) (* Process *) 
68 69
  | Abso | Absv | Absc | Inde | Avy | Ai | Ani | Inftu (* Circumstance *)
    -> Lavender 
huet's avatar
huet committed
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
  | Unknown | Cache -> Grey 
  | Comp (_,ph) _ _ | Tad (_,ph)  _ _ -> color_of_role ph
  | Sfx -> Cyan
  | Isfx -> Grey
  ]
;
value table_morph_of phase = table_begin (background (color_of_phase phase)) 
and table_role_of phase = table_begin (background (color_of_role phase)) 
and table_labels = table_begin (background Pink)
;
value print_morph pvs cached seg_num gen form n tag = do
(* n is the index in the list of tags of an ambiguous form *)
  { ps tr_begin
  ; ps th_begin 
  ; ps (span_begin Latin12)  
  ; Morpho_html.print_inflected_link pvs cached form (seg_num,n) gen tag 
  ; ps span_end 
  ; ps th_end   
  ; ps tr_end   
  ; n+1
  }
;
(* generalisation of [print_morph] to taddhitas *)
value print_morph_tad pvs cached seg_num gen stem sfx n tag = do
(* n is the index in the list of tags of an ambiguous form *)
  { ps tr_begin
  ; ps th_begin 
  ; ps (span_begin Latin12)  
  ; Morpho_html.print_inflected_link_tad pvs cached stem sfx (seg_num,n) gen tag 
  ; ps span_end 
  ; ps th_end  
  ; ps tr_end   
  ; n+1
  }
;
value print_tags pvs seg_num phase form tags =  
  let ptag = print_morph pvs (is_cache phase) seg_num (generative phase) form in 
  let _ = List.fold_left ptag 1 tags in ()
;
109
value rec scl_phase = fun
huet's avatar
huet committed
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
  [ Pv | Pvk | Pvkc | Pvkv -> "pv"
  | Noun | Noun2 | Nouc | Nouv | Krid | Kriv | Kric | Lopak | Pron | Auxik 
    -> "noun"
  | Root | Lopa | Auxi -> "root"
  | Inde | Abso | Absv | Absc | Avy -> "inde"
  | Iic | Iic2 | A | An | Iicv | Iicc | Iik | Iikv | Iikc | Iiif | Auxiick
  | Ai | Ani -> "iic"
  | Sfx -> "suffix"
  | Isfx -> "iicsuffix"
  | Iiv | Iivv | Iivc -> "iiv"
  | Iiy -> "iiy" 
  | Peri -> "peri" 
  | Inftu -> "inftu" 
  | Kama -> "kama" 
  | Voca | Vocv | Vocc | Inv | Vok | Vokv | Vokc -> "voca"
  | Ifc | Ifc2 -> "ifc"
  | Unknown -> "unknown"
  | Cache -> "Cache" 
128 129
  | Comp (_,ph) _ _ -> "preverbed " ^ scl_phase ph
  | Tad (ph,_)  _ _ -> "taddhita " ^ scl_phase ph
huet's avatar
huet committed
130 131
  ]
;
Gérard Huet's avatar
Gérard Huet committed
132
value print_scl_morph pvs gen form tag = do
huet's avatar
huet committed
133
  { ps (xml_begin "tag")
Gérard Huet's avatar
Gérard Huet committed
134
  ; Morpho_scl.print_scl_inflected pvs form gen tag 
huet's avatar
huet committed
135 136 137
  ; ps (xml_end "tag")
  }
;
Gérard Huet's avatar
Gérard Huet committed
138
value print_scl_tags pvs phase form tags = 
Gérard Huet's avatar
Gérard Huet committed
139
  let table phase = 
140
      xml_begin_with_att "tags" [ ("phase",scl_phase phase) ] in do
Gérard Huet's avatar
Gérard Huet committed
141
  { ps (table phase) 
Gérard Huet's avatar
Gérard Huet committed
142
  ; List.iter (print_scl_morph pvs (generative phase) form) tags 
huet's avatar
huet committed
143 144 145
  ; ps (xml_end "tags")
  }
;
146

Gérard Huet's avatar
Gérard Huet committed
147
(* Used in Parser *)
huet's avatar
huet committed
148 149 150 151 152 153 154 155 156
value extract_lemma phase word = 
 match tags_of phase word with  
 [ Atomic tags -> tags 
 | Preverbed (_,phase) pvs form tags -> (* tags to be trimmed to [ok_tags] *)
     if pvs = [] then tags 
     else trim_tags (generative phase) form (Canon.decode pvs) tags 
 | Taddhita  _ _ _ tags -> tags
 ]
; 
Gérard Huet's avatar
Gérard Huet committed
157
(* Returns the offset correction (used by SL interface) *)
huet's avatar
huet committed
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
value process_transition = fun  
  [ Euphony (w,u,v) ->   
    let off = if w=[] then 1 (* amui/lopa from Lopa/Lopak *)
                      else length w in
    off - (length u + length v) 
  | Id -> 0
  ]
;
value print_transition = fun
  [ Euphony (w,u,v) -> Morpho_html.print_sandhi u v w 
  | Id -> ()
  ]
;
value print_sfx_tags sfx = fun
  [ [ tag ] -> let _ = print_morph [] False 0 False sfx 1 tag in ()
  | _ -> failwith "Multiple sfx tag" 
  ]
;
value process_kridanta pvs seg_num phase form tags = do
  { ps th_begin 
  ; pl (table_morph_of phase)          (* table begin *)
  ; let ok_tags = 
        if pvs = [] then tags 
        else trim_tags (generative phase) form (Canon.decode pvs) tags in do
        (* NB Existence of the segment guarantees that [ok_tags] is not empty *)
  { print_tags pvs seg_num phase form ok_tags 
  ; ps table_end                       (* table end *) 
  ; ps th_end  
  ; (phase,form,ok_tags)
  }}
;
value process_taddhita pvs seg_num phase stem sfx_phase sfx sfx_tags = 
  let gen = generative phase 
  and cached = False in
  let ptag = print_morph_tad pvs cached seg_num gen stem sfx in do
  { ps th_begin 
  ; pl (table_morph_of sfx_phase)      (* table begin *)
  ; let _ = List.fold_left ptag 1 sfx_tags in ()
  ; ps table_end                       (* table end *) 
  ; ps th_end  
  ; (sfx_phase,sfx,sfx_tags)
  }
;
(* Same structure as [Interface.print_morpho] *)
value print_morpho phase word = do  
  { pl (table_morph_of phase)          (* table begin *)  
  ; ps tr_begin 
  ; ps th_begin 
  ; ps (span_begin Latin12)  
  ; let _ =
       match tags_of phase word with 
       [ Atomic tags ->  
          process_kridanta [] 0 phase word tags
       | Preverbed (_,phase) pvs form tags -> 
          process_kridanta pvs 0 phase form tags
       | Taddhita (ph,form) sfx sfx_phase sfx_tags -> 
            match tags_of ph form with 
            [ Atomic _ -> (* stem, tagged as iic *)
              process_taddhita [] 0 ph form sfx_phase sfx sfx_tags 
            | Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
              process_taddhita pvs 0 ph form sfx_phase sfx sfx_tags 
219
            | _ -> failwith "Anomaly: taddhita recursion"
huet's avatar
huet committed
220 221 222 223 224 225 226 227 228 229 230 231
            ]
       ] in ()
  ; ps span_end  
  ; ps th_end  
  ; ps tr_end 
  ; ps table_end                       (* table end *)
  }
;
(* Segment printing with phonetics without semantics for Reader *)
value print_segment offset (phase,rword,transition) = do
  { ps "[ "
  ; Morpho_html.print_signifiant_off rword offset
232
  ; print_morpho phase (mirror rword)
huet's avatar
huet committed
233 234 235 236 237 238 239 240 241
  (* Now we print the sandhi transition *)
  ; ps "&lang;" (* < *) 
  ; let correction = process_transition transition in do  
      { print_transition transition
      ; pl "&rang;]" (* >] *)
      ; pl html_break
      ; offset+correction+length rword
      }
  }
242
; 
Gérard Huet's avatar
Gérard Huet committed
243
(* Similarly for [scl_plugin] mode (without offset and transitions) *)
244
(* Called from [Scl_parser.print_scl_output] *)
Gérard Huet's avatar
Gérard Huet committed
245
value print_scl_segment counter (phase,rword) =  
246 247 248 249
  let word = Morpho_html.visargify rword in do
  { let solid = background (Disp.color_of_phase phase) in
    pl (td_begin_class solid)
  ; let ic = string_of_int counter in
Gérard Huet's avatar
Gérard Huet committed
250
    ps ("<input type=\"hidden\" name=\"field" ^ ic ^ "\" value='<form wx=\""
251 252
        ^ Canon.decode_WX word ^ "\"/>")
  ; match tags_of phase (mirror rword) with 
253 254 255
    [ Atomic tags ->
          print_scl_tags [] phase word tags
    | Preverbed (_,phase) pvs form tags -> 
huet's avatar
huet committed
256 257
         let ok_tags = 
           if pvs = [] then tags 
258
           else trim_tags (generative phase) form (Canon.decode pvs) tags in
Gérard Huet's avatar
Gérard Huet committed
259
          print_scl_tags pvs phase form ok_tags
260 261 262 263 264 265 266 267
    | Taddhita (_,form) sfx sfx_phase sfx_tags ->
            let taddhitanta_phase = match sfx_phase with 
                [ Sfx -> Noun
                | Isfx -> Iic
                | _ -> failwith "Wrong taddhita structure"
                ] 
            and taddhitanta_stem = form @ sfx (* very experimental *) in
            print_scl_tags [] taddhitanta_phase taddhitanta_stem sfx_tags 
268
    ]
269 270
  ; ps "'>" (* closes <input *) 
  ; ps (Canon.unidevcode word)
Gérard Huet's avatar
Gérard Huet committed
271 272
  ; ps td_end
  ; ps "\n"
Gérard Huet's avatar
Gérard Huet committed
273
  ; counter+1
huet's avatar
huet committed
274 275 276 277 278 279
  } 
; 
value print_labels tags seg_num = do
    { ps th_begin  (* begin labels *) 
    ; pl table_labels
    ; let print_label n _ = do
Gérard Huet's avatar
Gérard Huet committed
280
        { ps (cell (html_red (string_of_int seg_num ^ "." ^ string_of_int n)))
huet's avatar
huet committed
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
        ; n+1
        } in 
      let _ = List.fold_left print_label 1 tags in () 
    ; ps table_end 
    ; ps th_end    (* end labels *)
    }
;
(* syntactico/semantical roles analysis, function of declension *)
value print_roles pr_sem phase tags form = do
    { ps th_begin 
    ; pl (table_role_of phase)
    ; let pr_roles (delta,sems) = do 
       { ps tr_begin 
       ; ps th_begin 
       ; let word = patch delta form in 
         pr_sem word sems 
       ; ps th_end
       ; ps tr_end 
       } in
      List.iter pr_roles tags  
    ; ps table_end 
    ; ps th_end  
    }
;
(* Segment printing without phonetics with semantics for Parser *)
value print_segment_roles print_sems seg_num (phase,rword,_) =  
  let word = mirror rword in do
  { Morpho_html.print_signifiant_yellow rword
  ; let (decl_phase,form,decl_tags) = match tags_of phase word with
       [ Atomic tags -> 
          process_kridanta [] seg_num phase word tags
       | Preverbed (_,phase) pvs form tags -> 
          process_kridanta pvs seg_num phase form tags
       | Taddhita (ph,form) sfx sfx_phase sfx_tags -> 
            match tags_of ph form with 
            [ Atomic _ -> (* stem, tagged as iic *)
              process_taddhita [] seg_num ph form sfx_phase sfx sfx_tags 
            | Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
              process_taddhita pvs seg_num ph form sfx_phase sfx sfx_tags 
            | _ -> failwith "taddhita recursion unavailable"
            ]
       ] in do
    { print_labels decl_tags seg_num
    ; print_roles print_sems decl_phase decl_tags form
    }
  } 
;
value project n list = List.nth list (n-1) (* Ocaml's nth starts at 0 *)
; 
value print_unitag pvs phase word multitags (n,m) = 
  let (delta,polytag) = project n multitags in
  let unitag = [ project m polytag ] in do
     { ps th_begin
     ; pl (table_morph_of phase) (* table of color of phase begins *)
     ; let _ = (* print unique tagging *)
       print_morph pvs False 0 (generative phase) word 0 (delta,unitag) in ()
     ; ps table_end              (* table of color of phase ends *)
     ; ps th_end
     }
;
value print_uni_taddhita pvs m phase stem sfx sfx_phase = fun
  [ [ (delta,polytag) ] -> (* we assume n=1 taddhita form unambiguous *)
    let unitag = [ project m polytag ] 
    and gen = generative phase 
    and cached = False in do
    { ps th_begin 
    ; pl (table_morph_of sfx_phase)      (* table begin *)
    ; let _ = print_morph_tad pvs cached 0 gen stem sfx 0 (delta,unitag) in ()
    ; ps table_end                       (* table end *) 
350
    ; ps th_end
huet's avatar
huet committed
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375
    }
  | _ -> failwith "Multiple sfx tag"
  ]
;
value print_projection phase rword ((_,m) as index) = do
  { ps tr_begin             (* tr begins *)
  ; Morpho_html.print_signifiant_yellow rword
  ; let word = mirror rword in 
    match tags_of phase word with
    [ Atomic tags -> print_unitag [] phase word tags index 
    | Preverbed (_,phase) pvs form tags -> print_unitag pvs phase word tags index
    | Taddhita (ph,form) sfx sfx_phase sfx_tags -> 
        match tags_of ph form with
        [ Atomic _ -> print_uni_taddhita [] m phase form sfx sfx_phase sfx_tags
        | Preverbed _ pvs _ _ -> 
                      print_uni_taddhita pvs m phase form sfx sfx_phase sfx_tags
        | _ -> failwith "taddhita recursion unavailable" 
        ]
    ] 
  ; ps tr_end               (* tr ends *)
  }
;
value print_proj phase rword = fun 
   [ [] -> failwith "Projection missing"
   | [ n_m :: rest ] -> do
376 377 378
       { print_projection phase rword n_m 
       ; rest (* returns the rest of projections stream *)
       }
huet's avatar
huet committed
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515
   ]
;

module Report_chan = struct 
value chan = Control.out_chan; (* where to report *)
end;

module Morpho_out = Morpho.Morpho_out Report_chan;

(* Recording of selected solution - used only in Regression *)
value record_tagging unsandhied mode_sent mode_trans all sentence output proj = 
  let report = output_string Control.out_chan.val in
  let print_proj1 phase rword proj prevs = do 
  (* adapted from [print_proj] *)
  { report "${"
  ; let form = mirror rword in do 
    { report (decode form)
    ; let res = match proj with 
           [ [] -> failwith "Projection missing"
           | [ (n,m) :: rest ] -> 
              let gen = generative phase in
              let polytag = extract_lemma phase form in
              let (delta,tags) = project n polytag in 
              let tagging = [ project m tags ] in do 
                { report ":"
                ; report (string_of_phase phase ^ "")
                ; Morpho_out.report_morph gen form (delta,tagging) 
                ; (rest,[]) (* returns the rest of projections stream *)
                }
           ] in 
      do { report "}$&"; res }
    }
  } in do
  { report (if Control.full.val then "[{C}] " else "[{S}] ")
  ; report (if unsandhied then "<{F}> " else "<{T}> ")
  ; report (if mode_sent then "|{Sent}| " else "|{Word}| ")
  ; report ("#{" ^ mode_trans ^ "}# ")
  ; report ("({" ^ sentence ^ "})")
  ; report (" [" ^ (string_of_int all) ^ "] ")
  ; let rec pp (proj,prevs) = fun
    [ [] -> match proj with 
            [ [] -> () (* finished, projections exhausted *)
            | _ -> failwith "Too many projections"
            ]
    | [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
        let proj_prevs = print_proj1 phase rword proj prevs in
        pp proj_prevs rest 
    ] in pp (proj,[]) output
  ; report "\n"
  ; close_out Report_chan.chan.val
  }
;
(* Structured entries with generative morphology *)
type gen_morph =
   [ Gen_krid of ((string * word) * (verbal * word))
   | Lexical of word
   | Preverbs_list of list word
   ]
;
value rec morph_list = fun
  [ [ a :: rest ] -> Morpho_string.string_morph a ^ " " ^ morph_list rest
  | [] -> ""
  ]
;
value rec decode_list = fun
  [ [ a :: rest ] -> Canon.decode_ref a ^ " " ^ decode_list rest
  | [] -> ""
  ]
;
value string_of_tag = fun
  [ (x,y,a,b) -> if y = Pv then "${" ^ Canon.decode_ref x ^ "}$&"
	         else "${" ^ Canon.decode_ref x ^ ":" ^ string_of_phase y
                      ^ "{ " ^ morph_list b  ^ "}" ^ "[" 
                      ^ match a with 
     [ Gen_krid ((z, c),(d, e)) -> 
        z ^ ":" ^ Canon.decode_ref c ^ " { " ^ Morpho_string.string_verbal d
        ^ " }[" ^ Canon.decode_ref e ^ "]"
     | Lexical c -> Canon.decode_ref c
     | Preverbs_list c -> decode_list c
     ]  ^ "]}$&"
  ]
;
value rec return_morph = fun
  [ [ a :: rest ] -> string_of_tag a ^ return_morph rest
  | [] -> ""
  ]
;
value generative_stem gen stem = 
   if gen then (* interpret stem as unique name *)
        let (homo,bare_stem) = Naming.homo_undo stem in
        let krid_infos = Deco.assoc bare_stem Naming.unique_kridantas in 
        let (vb,root) = Naming.look_up_homo homo krid_infos in 
        let look_up_stem =
            match Deco.assoc stem Naming.lexical_kridantas with
            [ [] (* not in lexicon *)        -> ("G",bare_stem)
            | _  (* stem is lexical entry *) -> ("L",stem)
            ] in
        Gen_krid (look_up_stem,(vb,root))
   else Lexical stem 
;
(* Applicative version of [Morpho.report_morph] *)
value lex_cat phase = phase (*i TEMPORARY - TODO i*)
;
value get_morph gen phase form (delta,morphs) =
  let stem = patch delta form in (* stem may have homo index *)
  (form, lex_cat phase, generative_stem gen stem, morphs)
;
value return_tagging output projs = (* Used only in Regression *)
  let get_tags phase rword projs = (* adapted from [print_proj] *)
     let form = mirror rword in  
     match tags_of phase form with
     [ Atomic polytag -> match projs with 
           [ [] -> failwith "Projection missing"
           | [ (n,m) :: rest ] -> 
              let gen = generative phase in
              let (delta,tags) = project n polytag in
              let tagging = [ project m tags ] in 
              let entry = get_morph gen phase form (delta,tagging) in
              (rest, lex_cat phase, entry)
           ]
     | _ -> failwith "Not implemented yet" (*i TODO for Regression 
         [ (projs, lex_cat Pv, (form, lex_cat Pv, Preverbs_list prevs, []))] i*)
     ] in 
  let rec taggings accu projs = fun
     [ [] -> match projs with 
             [ [] -> accu
             | _ -> failwith "Too many projections"
             ]
     | [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
          let (new_projs,phase,tags) = get_tags phase rword projs in
          taggings [ tags :: accu ] new_projs rest 
     ] in 
  return_morph (List.rev (taggings [] projs output))
;

end;