Commit 10a2fd42 authored by Gérard Huet's avatar Gérard Huet

Productive taddhitas suppressed - some navyanyaaya compounds will be now...

Productive taddhitas suppressed - some navyanyaaya compounds will now be unrecognizable without lexicon acquisition
parent bd50f79b
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Checkpoints management *)
......@@ -16,11 +16,7 @@ value rec phase_encode = fun
"<{" ^ string_of_phase ph ^ "}{" ^
string_of_phase ph' ^ "}{" ^
Canon.decode prev ^ "}{" ^ Canon.decode form ^ "}>"
| Tad (ph,ph') form sfx ->
"(" ^ phase_encode ph ^ "{" ^
string_of_phase ph' ^ "}{" ^
Canon.decode form ^ "}{" ^ Canon.decode sfx ^ "})"
| phase -> "{" ^ string_of_phase phase ^ "}"
| phase -> "{" ^ string_of_phase phase ^ "}"
]
and bool_encode b = if b then "t" else "f"
;
......@@ -60,11 +56,7 @@ EXTEND Gram
; pre = TEXT; form = TEXT ; ">" ->
Comp (phase_of_string p, phase_of_string p')
(Encode.code_string pre) (Encode.code_string form)
| "("; p = phase; p' = TEXT (* Taddhita *)
; form = TEXT; sfx = TEXT; ")" ->
Tad (p, phase_of_string p')
(Encode.code_string form) (Encode.code_string sfx)
| p = TEXT -> phase_of_string p
| p = TEXT -> phase_of_string p
] ] ;
phase_rword:
[ [ s = phase; ","; o = TEXT -> (s, Encode.rev_code_string o) ] ] ;
......
......@@ -13,7 +13,7 @@
type vmorph =
[ Prim of int and bool and Word.word (* primary conjugation *)
(* gana pada form of present 3rd sg for checking *)
(* gana pada form of present 3rd sg for checking *)
(* pada=True Parasmaipada pada=False AAtmanepada *)
| Causa of Word.word (* causative 3rd sg form *)
| Inten of Word.word (* intensive 3rd sg form *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Dispatcher: Sanskrit Engine in 53 phases automaton (plus 2 fake ones) *)
......@@ -78,8 +78,6 @@ value transducer = fun
| Avy -> transducers.avya (* ifc avyayiibhava *)
| Inftu -> transducers.inftu (* infinitives in -tu iic. Renou HLS 72 *)
| Kama -> transducers.kama (* ifcs of kaama/manas: tyaktukaama dra.s.tumanas *)
| Sfx -> transducers.sfx (* ifc taddhita suffixes *)
| Isfx -> transducers.isfx (* iifc taddhita suffixes *)
| Cache -> transducers.cache (* cached forms *)
| Noun | Iic | Iik | Voca | Krid | Pvk | Vok
-> raise (Control.Anomaly "composite phase")
......@@ -116,7 +114,7 @@ value cached = (* potentially cached lexicon acquisitions *)
;
(* initial1, initial2: phases *)
value initial1 =
(* All phases but Ifc, Abso, Auxi, Auxik, Auxiick, Lopa, Lopak, Sfx, Isfx. *)
(* All phases but Ifc, Abso, Auxi, Auxik, Auxiick, Lopa, Lopak. *)
[ Inde; Iicv; Iicc; Nouv; Nouc; Pron; A; An; Root; Kriv; Kric; Iikv; Iikc
; Peri; Pv; Pvkv; Pvkc; Iiv; Iivv; Iivc; Iiy; Inv; Ai; Ani; Absv; Absc; Inftu
; Vocv; Vocc; Vokv; Vokc ] @ cached
......@@ -128,7 +126,7 @@ value initial full = if full then initial1 else initial2
(* dispatch1: Word.word -> phase -> phases *)
value dispatch1 w = fun (* w is the current input word *)
[ Nouv | Nouc | Pron | Inde | Abso | Auxi | Auxik | Kama | Ifc
| Kriv | Kric | Absv | Absc | Avy | Lopak | Sfx | Root | Lopa ->
| Kriv | Kric | Absv | Absc | Avy | Lopak | Root | Lopa ->
if phantomatic w then [ Root; Kriv; Kric; Iikv; Iikc; Abso ] (* aa- pv *)
else initial1
| A -> if phantomatic w then []
......@@ -142,19 +140,15 @@ value dispatch1 w = fun (* w is the current input word *)
justified by \Pan{2,2,6} a-x only if x is a subanta. *)
| Iicv | Iicc | Iikv | Iikc | Iiif | Auxiick -> (* Compounding *)
[ Iicv; Iicc; Nouv; Nouc; A; An; Ifc; Iikv; Iikc; Kriv; Kric
; Pvkv; Pvkc; Iiif; Iivv; Iivc; Vocv; Vocc; Vokv; Vokc ] @
[ Sfx; Isfx ] @ cached
| Pv -> if phantomatic w then [] else
; Pvkv; Pvkc; Iiif; Iivv; Iivc; Vocv; Vocc; Vokv; Vokc ] @ cached
| Pv -> if phantomatic w then [] else
if amuitic w then [ Lopa ] else [ Root; Abso; Peri; Inftu ]
| Pvkc | Pvkv -> if phantomatic w then [] else
if amuitic w then [ Lopak ] else [ Iikv; Iikc; Kriv; Kric; Vokv; Vokc ]
| Iiv -> [ Auxi ] (* as bhuu and k.r finite forms *)
| Iivv | Iivc -> [ Auxik; Auxiick ] (* bhuu and k.r kridanta forms *)
| Iiy -> [ Avy ]
| Isfx -> (* Compounding with taddhita *)
[ Iicv; Iicc; Nouv; Nouc; A; An; Ifc; Iikv; Iikc; Kriv; Kric
; Pvkv; Pvkc; Iiif; Iivv; Iivc; Vocv; Vocc; Vokv; Vokc ] @ cached
| Peri -> [ Auxi ] (* overgenerates, should be only perfect forms *)
| Peri -> [ Auxi ] (* overgenerates, should be only perfect forms *)
| Inftu -> [ Kama ]
| Vocc | Vocv | Vokv | Vokc | Cache -> []
(* only chunk-final vocatives so no Iic overlap *)
......@@ -194,8 +188,7 @@ value terminal = (* Accepting phases *)
; Vocc; Vocv; Vokv; Vokc; Inv
; Lopa; Lopak
; Avy; Kama
; Sfx
; Cache
; Cache
]
;
......@@ -410,9 +403,8 @@ value rec chop word = fun
]
]
;
value sfx_phase = fun [ Sfx | Isfx -> True | _ -> False ]
and iic_phase = fun
[ Iicv | Iicc | Iikv | Iikc
(* [iic_phase ph] is [True] exactly when [ph] is one of [Iicv], [Iicc], [Iikv],
   [Iikc], or a [Comp] phase whose tag has [Iikv] or [Iikc] as its second
   component; every other phase yields [False]. *)
value iic_phase = fun
[ Iicv | Iicc | Iikv | Iikc
| Comp (_,Iikv) _ _ | Comp (_,Iikc) _ _ -> True
| _ -> False ]
;
......@@ -628,15 +620,7 @@ value validate out = match out with
if Phonetics.consonant_initial (Word.mirror form) then []
else out
(*i TODO: similar test for dual forms i*)
(* Finally we glue taddita suffix "forms" to the previous (iic) segment *)
(* NB This cumulates with the preverb glueing but not with itself *)
| [ (sfxph,sfx,s) :: [ (ph,rstem,sv) :: r ] ] when sfx_phase sfxph
&& iic_phase ph ->
let sfx_form = Word.mirror sfx in
let stem = Word.mirror rstem in
let tad_form = Word.mirror (apply_sandhi rstem sfx_form sv) in
[ (Tad (ph,sfxph) stem sfx_form,tad_form,s) :: r ]
| [ (phase,_,_) :: [ (pv,_,_) :: _ ] ] when preverb_phase pv ->
| [ (phase,_,_) :: [ (pv,_,_) :: _ ] ] when preverb_phase pv ->
let m = "validate: " ^ string_of_phase pv ^ " " ^ string_of_phase phase in
raise (Control.Anomaly m) (* all preverbs ought to have been processed *)
(* [ | [ (pv,_,_) :: _ ] when preverb_phase pv -> out ] noop
......@@ -676,10 +660,7 @@ value rec color_of_phase = fun
| Ifc | Ifc2 -> Cyan
| Unknown -> Grey
| Comp (_,ph) _ _ -> color_of_phase ph
| Tad (_,ph) _ _ -> if ph=Sfx then Deep_sky else Yellow
| Pv | Pvk | Pvkc | Pvkv -> failwith "Illegal preverb segment"
| Sfx -> Deep_sky (* necessary for [Lexer.print_segment2] *)
| Isfx -> Yellow (* idem *)
| Pv | Pvk | Pvkc | Pvkv -> failwith "Illegal preverb segment"
(*i NB: unused background colors: Pink Green Aquamarine Chamois i*)
]
;
......
......@@ -77,7 +77,9 @@ value strip w = match w with
| [] -> failwith "Empty stem to strip"
]
;
value rev_strip w = Word.mirror (strip (Word.mirror w)) (* ugly - temp *)
(* [rstem w] applies [strip] to the mirror image of [w]. The result is returned
   as is, without being mirrored back; [rev_strip] adds that final mirroring. *)
value rstem w = strip (Word.mirror w)
;
(* [rev_strip w] mirrors [w], strips it (through [rstem]) and mirrors the
   stripped result back. Flagged by its author as a temporary solution. *)
value rev_strip w = Word.mirror (rstem w) (* ugly - temp *)
;
(* Builds revword normalised stem from entry string of root *)
(* Used by [Verbs.revstem], [Nouns.enter_iic], [Print_dict] *)
......
......@@ -106,22 +106,12 @@ value print_tags_tad pvs seg_num phase stem sfx sfx_tags =
let ptag = print_morph_tad pvs seg_num False (generative phase) stem sfx in
fold_vert ptag sfx_tags
;
(* This is called "printing morphology interface style". Taddhitaanta forms
are printed as fake compounds of iic the stem and ifc the taddhita form. *)
(* This is called "printing morphology interface style". *)
value print_morpho phase word =
match tags_of phase word with
[ Atomic tags -> print_tags [] 0 phase word tags
| Preverbed (_,phase) pvs form tags -> print_tags pvs 0 phase form tags
| Taddhita (ph,form) sfx _ sfx_tags ->
match tags_of ph form with
[ Atomic _ -> (* stem, tagged as iic *)
print_tags_tad [] 0 ph form sfx sfx_tags
| Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
print_tags_tad pvs 0 ph form sfx sfx_tags
| _ -> raise (Control.Anomaly "taddhita recursion")
]
]
(* PB: if form has homonymy, we get t1 t2 t for [t1 | t2].t - confusion *)
;
(* Parsing mandatory checkpoints *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Sanskrit Phrase Lexer in 40 phases version. *)
......@@ -75,20 +75,7 @@ value print_morph pvs cached seg_num gen form n tag = do
; n+1
}
;
(* generalisation of [print_morph] to taddhitas *)
value print_morph_tad pvs cached seg_num gen stem sfx n tag = do
(* n is the index in the list of tags of an ambiguous form *)
{ tr_begin |> ps
; th_begin |> ps
; span_begin Latin12 |> ps
; Morpho_html.print_inflected_link_tad pvs cached stem sfx (seg_num,n) gen tag
; span_end |> ps
; th_end |> ps
; tr_end |> ps
; n+1
}
;
value print_tags pvs seg_num phase form tags =
(* [print_tags pvs seg_num phase form tags] folds [print_morph] over [tags],
   starting the tag counter at 1; the final counter value is discarded.
   [is_cache phase] and [generative phase] supply the two boolean flags
   expected by [print_morph]. *)
value print_tags pvs seg_num phase form tags =
let ptag = print_morph pvs (is_cache phase) seg_num (generative phase) form in
let _ = List.fold_left ptag 1 tags in ()
;
......@@ -100,9 +87,7 @@ value rec scl_phase = fun
| Inde | Abso | Absv | Absc | Avy -> "inde"
| Iic | Iic2 | A | An | Iicv | Iicc | Iik | Iikv | Iikc | Iiif | Auxiick
| Ai | Ani -> "iic"
| Sfx -> "suffix"
| Isfx -> "iicsuffix"
| Iiv | Iivv | Iivc -> "iiv"
| Iiv | Iivv | Iivc -> "iiv"
| Iiy -> "iiy"
| Peri -> "peri"
| Inftu -> "inftu"
......@@ -112,13 +97,12 @@ value rec scl_phase = fun
| Unknown -> "unknown"
| Cache -> "Cache"
| Comp (_,ph) _ _ -> "preverbed " ^ scl_phase ph
| Tad (ph,_) _ _ -> "taddhita " ^ scl_phase ph
]
;
value print_scl_morph pvs gen form tag = do
{ ps (xml_begin "tag")
; Morpho_scl.print_scl_inflected pvs form gen tag
; ps (xml_end "tag")
; Morpho_scl.print_scl_inflected pvs form gen tag
; ps (xml_end "tag")
}
;
value print_scl_tags pvs phase form tags =
......@@ -137,7 +121,6 @@ value extract_lemma phase word =
| Preverbed (_,phase) pvs form tags -> (* tags to be trimmed to [ok_tags] *)
if pvs = [] then tags
else trim_tags (generative phase) form (Canon.decode pvs) tags
| Taddhita _ _ _ tags -> tags
]
;
(* Returns the offset correction (used by SL interface) *)
......@@ -171,19 +154,7 @@ value process_kridanta pvs seg_num phase form tags = do
; th_end |> ps
; (phase, form, ok_tags)
}}
;
value process_taddhita pvs seg_num phase stem sfx_phase sfx sfx_tags =
let gen = generative phase
and cached = False in
let ptag = print_morph_tad pvs cached seg_num gen stem sfx in do
{ th_begin |> ps
; table_morph_of sfx_phase |> pl (* table begin *)
; let _ = List.fold_left ptag 1 sfx_tags in ()
; table_end |> ps (* table end *)
; th_end |> ps
; (sfx_phase, sfx, sfx_tags)
}
;
;
(* Same structure as [Interface.print_morpho] *)
value print_morpho phase word = do
{ table_morph_of phase |> pl (* table begin *)
......@@ -196,15 +167,7 @@ value print_morpho phase word = do
process_kridanta [] 0 phase word tags
| Preverbed (_,phase) pvs form tags ->
process_kridanta pvs 0 phase form tags
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match tags_of ph form with
[ Atomic _ -> (* stem, tagged as iic *)
process_taddhita [] 0 ph form sfx_phase sfx sfx_tags
| Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
process_taddhita pvs 0 ph form sfx_phase sfx sfx_tags
| _ -> failwith "Anomaly: taddhita recursion"
]
] in ()
] in ()
; span_end |> ps
; th_end |> ps
; tr_end |> ps
......@@ -212,7 +175,7 @@ value print_morpho phase word = do
}
;
(* Segment printing with phonetics without semantics for Reader *)
value print_segment offset (phase,rword,transition) = do
value print_segment offset (phase,rword,transition) = do
{ "[ " |> ps
; Morpho_html.print_signifiant_off rword offset
; print_morpho phase (mirror rword)
......@@ -243,15 +206,7 @@ value print_scl_segment counter (phase,rword) =
if pvs = [] then tags
else trim_tags (generative phase) form (Canon.decode pvs) tags in
print_scl_tags pvs phase form ok_tags
| Taddhita (_,form) sfx sfx_phase sfx_tags ->
let taddhitanta_phase = match sfx_phase with
[ Sfx -> Noun
| Isfx -> Iic
| _ -> failwith "Wrong taddhita structure"
]
and taddhitanta_stem = form @ sfx (* very experimental *) in
print_scl_tags [] taddhitanta_phase taddhitanta_stem sfx_tags
]
]
; "'>" |> ps (* closes <input *)
; Canon.unidevcode word |> ps
; td_end |> ps
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Sanskrit Phrase Lexer *)
......@@ -47,14 +47,9 @@ module Lexer : functor (* takes its prelude and iterator control as parameters *
(* Exported for Parser *)
value process_kridanta: Word.word -> int -> Phases.phase -> Word.word ->
Morphology.multitag -> (Phases.phase * Word.word * Morphology.multitag);
value process_taddhita: Word.word -> int -> Phases.phase -> Word.word ->
Phases.phase -> Word.word -> Morphology.multitag ->
(Phases.phase * Word.word * Morphology.multitag);
value table_morph_of : Phases.phase -> string;
value table_morph_of : Phases.phase -> string;
value print_morph : Word.word -> bool -> int -> bool -> Word.word -> int ->
Morphology.unitag -> int;
value print_morph_tad : Word.word -> bool -> int -> bool -> Word.word ->
Word.word -> int -> Morphology.unitag -> int;
(* END Exported for Parser *)
value all_checks : ref (list Viccheda.check);
value un_analyzable : Word.word -> (list Disp.segment * Viccheda.resumption);
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* [Load_morphs] *)
......@@ -39,11 +39,9 @@ module Morphs
| Iik (* K.ridaantaas as left component - used to be called Piic *)
| Iikv | Iikc | Kriv | Kric | Vocv | Vocc | Vokv | Vokc
| Iiy | Avy | Inftu | Kama
| Sfx | Isfx
| Cache (* Cached lexicon acquisitions *)
| Unknown (* Unrecognized chunk *)
| Cache (* Cached lexicon acquisitions *)
| Unknown (* Unrecognized chunk *)
| Comp of (phase * phase) and (* pv *) Word.word and (* root form *) Word.word
| Tad of (phase * phase) and (* nominal *) Word.word and (* sfx *) Word.word
]; end)
= struct
......@@ -54,7 +52,6 @@ by Dispatcher. Preverbed segments may be finite verb forms or kridantas. *)
type tag_sort =
[ Atomic of lemmas
| Preverbed of (phase * phase) and (* pv *) Word.word and Word.word and lemmas
| Taddhita of (phase * Word.word) and (* sfx *) Word.word and phase and lemmas
]
;
(* Fake tags of nan prefixes *)
......@@ -115,8 +112,6 @@ value load_morphs () =
; ifcs2 = load_morpho Web.public_ifcs2_file
; inftu = load_morpho Web.public_inftu_file
; kama = load_morpho Web.public_kama_file
; sfxs = load_morpho Web.public_sfxs_file
; isfxs = load_morpho Web.public_isfxs_file
; caches = load_morpho_cache Web.public_cache_file
}
;
......@@ -151,9 +146,7 @@ value morpho_tags = fun
| Ifc2 -> morpho.ifcs2
| Inftu -> morpho.inftu
| Kama -> morpho.kama
| Sfx -> morpho.sfxs
| Isfx -> morpho.isfxs
| Cache -> morpho.caches
| Cache -> morpho.caches
| _ -> raise (Control.Anomaly "morpho_tags")
]
;
......@@ -171,11 +164,7 @@ value tags_of phase word =
Preverbed sort pv form tag
(* NB [Preverbed] comprises tin verbal forms of verbs with preverbs as well
as sup kridanta forms with preverbs. The preverbs are packed in pv. *)
| Tad (ph,sfx_ph) form sfx -> (* tag inherited from fake suffix entry *)
let sfx_tag = Deco.assoc sfx (morpho_tags sfx_ph) in
(* [let stem_tag = Deco.assoc sfx (morpho_tags ph) in] - possible extension *)
Taddhita (ph,form) [ 0 :: sfx ] sfx_ph sfx_tag (* 0 = "-" *)
| _ -> Atomic (Deco.assoc word (morpho_tags phase))
| _ -> Atomic (Deco.assoc word (morpho_tags phase))
(* NB Atomic comprises tin verbal forms of roots as well as sup atomic forms
and all the pure stems collections Iic Iiv etc. *)
]
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* [Load_transducers] *)
......@@ -63,8 +63,6 @@ type transducer_vect =
; iikc : auto (* consonant-initial iik *)
; kriv : auto (* vowel-initial krids *)
; kric : auto (* consonant-initial krids *)
; sfx : auto (* taddhita suffixes *)
; isfx : auto (* taddhita suffixes for iic stems *)
; cache : auto (* user-defined supplement to noun *)
}
;
......@@ -113,8 +111,6 @@ value load_transducer cat =
| "Voca" -> Web.public_transvoca_file
| "Inv" -> Web.public_transinv_file
| "Prev" -> Web.public_transp_file
| "Sfx" -> Web.public_transsfx_file
| "Isfx" -> Web.public_transisfx_file
| "Cache" -> Web.public_transca_file
| _ -> failwith ("Unexpected category: " ^ cat)
] in
......@@ -216,8 +212,6 @@ value transducers =
; iikc = iikc
; absv = absv
; absc = absc
; sfx = load_transducer "Sfx"
; isfx = load_transducer "Isfx"
; cache = load_transducer "Cache"
}
;
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* This module contains various service utilities for CGI programs *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Morphology interface *)
......@@ -70,8 +70,6 @@ type morphology =
; kama : inflected_map
; iiys : inflected_map
; avys : inflected_map
; sfxs : inflected_map
; isfxs : inflected_map
; caches : inflected_map
}
;
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(*i module Nouns = struct i*)
......@@ -880,6 +880,7 @@ value build_an g stem entry =
]
; Bare Noun (wrap stem 1)
; Avyayaf (fix stem "am")
; Indecl Tas (fix stem "atas")
] @ if g=Neu then [ Avyayaf (fix stem "a") ] else []) (* \Pan{5,4,109} *)
;
value build_an_god stem entry = (* Whitney §426a *)
......@@ -2014,6 +2015,7 @@ value build_neu_u trunc entry = (* stems in -u and -uu *)
]
; Bare Noun (mirror stems)
; Avyayaf (mirror stems)
; Indecl Tas (fix stems "tas") (* eg vastutas *)
]
;
value build_neu_ri trunc entry =
......@@ -4229,7 +4231,9 @@ value build_pron_a g stem entry = (* g=Mas ou g=Neu *)
else if g=Mas && stem = [ 42; 36; 1 ] (* anya *)
then [ Bare phase (code "anya") ] (* optional anya- *)
else if pseudo_nominal && g=Mas then
[ Avyayaf (fix stem "am"); Avyayaf (fix stem "aat") ]
[ Avyayaf (fix stem "am"); Avyayaf (fix stem "aat")
; Indecl Tas (fix stem "atas")
]
else [])
@ (if g=Mas then match entry with
[ "eka" -> [ Cvi (code "ekii") ]
......@@ -5018,7 +5022,7 @@ value compute_nouns_stem_form e stem d p =
| _ -> build_van Mas r3 e
]
| [ 49 :: r3 ] (* -han *) -> build_han r3 e
| _ -> build_an Mas r2 e
| _ -> build_an Mas r2 e (* raajan *)
]
| [ 3 :: r2 ] (* -in *) -> match r2 with
[ [ 33 :: r3 ] (* -thin *)-> match r3 with
......@@ -5402,6 +5406,7 @@ value compute_nouns_stem_form e stem d p =
| [ 10 :: _ ] -> report stem Fem
| [ 11 :: r1 ](* -ai *) -> match r1 with
[ [ 43 ] (* rai *) -> build_rai Fem [ 2; 43 ] e
(* | [ 39; 41; 5; 41 ] (* mumbai *) -> (* TO DO *) *)
| _ -> report stem Fem
]
| [ 12 :: r1 ] (* -o *) -> build_o Fem r1 e
......@@ -5419,6 +5424,7 @@ value compute_nouns_stem_form e stem d p =
}
| _ -> build_root Fem stem e
]
| [ 32; 7; 37 ] (* p.rt *) -> build_root_weak Fem stem "p.rtanaa"
| [ 34 :: r1 ] (* -d *) -> match r1 with
[ [ 1; 37 ] (* pad *) -> build_root_weak Fem stem "paada"
| [ 1; 37; 2 ] (* aapad *)
......@@ -5456,6 +5462,7 @@ value compute_nouns_stem_form e stem d p =
| [ 4; 34 ] (* diiv\#2 *) -> build_diiv e
| _ -> report stem g
]
| [ 46; 3; 36 ] (* niz *) -> build_root_weak Fem stem "nizaa"
| [ 47 :: r1 ] (* -.s *) -> match r1 with
[ [ 3 :: r2 ] -> match r2 with
[ [ 28 :: [ 1 :: [ 37 :: [ 3 :: [ 37 ] ] ] ] ] (* pipa.thi.s *)
......@@ -5502,8 +5509,6 @@ value compute_nouns_stem_form e stem d p =
| [ 1; 36; 2; 37; 5 ] -> build_upaanah r1 stem e (* Kale§101 *)
| _ -> build_root Fem stem e
]
| [ 46; 3; 36 ] (* niz *) -> build_root_weak Fem stem "nizaa"
| [ 32; 7; 37 ] (* p.rt *) -> build_root_weak Fem stem "p.rtanaa"
| _ -> build_root Fem stem e
]
| Deictic _ -> match stem with
......@@ -5735,29 +5740,45 @@ value compute_extra_iiv = iter enter_iiv
(* [enter_iiy entry] calls [enter1] on [entry] with the [Avyayai] item built
   from [normal_stem entry]. [compute_extra] iterates it over [iic_avya]. *)
value enter_iiy entry =
enter1 entry (Avyayai (normal_stem entry)) (* stripped entry *)
;
value tasil_preserve () = do (* Whitney§1098 *)
(* needed since -tas etymology induces skipping the entry *)
{ enter1 "tad" (Indecl Tas (code "tatas")) (* tasil on tad \Pan{5,3,7} *)
(* Tasils are treated as vibhakti. Here are the lexicalized ones: Whitney§1098
First tasils of pronouns, not needed if lexicalised
; enter1 "tad" (Indecl Tas (code "tatas")) (* tasil on tad \Pan{5,3,7} *)
; enter1 "ya#1" (Indecl Tas (code "yatas")) (* tasil on ya \Pan{5,3,7} *)
; enter1 "ku#1" (Indecl Tas (code "kutas")) (* tasil on ku \Pan{5,3,7-8} *)
; enter1 "abhi" (Indecl Tas (code "abhitas")) (* tasil on abhi \Pan{5,3,9} *)
; enter1 "pari" (Indecl Tas (code "paritas")) (* tasil on pari \Pan{5,3,9} *)
; enter1 "anti" (Indecl Tas (code "antitas")) (* tasil on pn \Pan{5,3,7} *)
; enter1 "ayam" (Indecl Tas (code "atas")) (* tasil on ayam \Pan{5,3,5} *)
; enter1 "idam" (Indecl Tas (code "itas")) (* tasil on idam id *)
; enter1 "adas" (Indecl Tas (code "amutas")) (* id *)
; enter1 "anya" (Indecl Tas (code "anyatas")) (* id *)
; enter1 "avara" (Indecl Tas (code "avaratas")) (* id *)
; enter1 "para" (Indecl Tas (code "paratas")) (* id *)
; enter1 "vizva" (Indecl Tas (code "vizvatas")) (* id *)
; enter1 "sva" (Indecl Tas (code "svatas")) (* id *)
; enter1 "puurva" (Indecl Tas (code "puurvatas")) (* id *)
; enter1 "sarva" (Indecl Tas (code "sarvatas")) (* id *)
; enter1 "aze.sa" (Indecl Tas (code "aze.satas")) (* tasil on privative cpd *)
}
;
; enter1 "sarva" (Indecl Tas (code "sarvatas")) (* id *)
; enter1 "eka" (Indecl Tas (code "ekatas")) (* id *)
; enter1 "sva" (Indecl Tas (code "svatas")) (* id *)
; enter1 "anyatara" (Indecl Tas (code "anyataratas")) (* id *)
; enter1 "dak.si.na" (Indecl Tas (code "dak.si.natas")) (* id *)
; enter1 "avara" (Indecl Tas (code "avaratas")) (* \Pan{5,3,29} *)
; enter1 "uttara#1" (Indecl Tas (code "uttaratas")) (* check *)
; enter1 "ubhaya" (Indecl Tas (code "ubhayatas")) (* check *)
*)
(* [tasil_extra ()] enters by hand, through [enter1], the [Indecl Tas] (-tas
   adverb) form of a fixed list of entries; the per-line comments say what kind
   of stem each one is. It is run from [compute_extra].
   NOTE(review): presumably these are the stems whose -tas form is not produced
   by the [Indecl Tas] items emitted by the paradigm builders (such as
   [build_an]) -- confirm against the paradigm dispatch. *)
value tasil_extra () = do (* add non-generative tasils *)
{ enter1 "aze.sa" (Indecl Tas (code "aze.satas")) (* tasil on privative cpd *)
; enter1 "ekaruupa" (Indecl Tas (code "ekaruupatas")) (* tasil on cpd *)
; enter1 "d.r.s.taanta" (Indecl Tas (code "d.r.s.taantatas"))(* tasil on cpd *)
; enter1 "paramaartha" (Indecl Tas (code "paramaarthatas")) (* tasil on cpd *)
; enter1 "praagbhaava" (Indecl Tas (code "praagbhaavatas")) (* tasil on cpd *)
; enter1 "bhasad" (Indecl Tas (code "bhasattas")) (* tasil on consonant stem *)
(*; enter1 "nas#2" (Indecl Tas (code "nastas")) - idem but lexicalized *)
; enter1 "yad.rcchaa" (Indecl Tas (code "yad.rcchaatas")) (* tasil on fstem *)
}
;
(* Supplementary forms - called by [Make_nouns.genders_to_nouns]
with argument [iic_stems] contents of [iic_stems_file] dumped from
[Subst.iic_stems] built by calling [Subst.record_iic] for iic only entries. *)
value compute_extra iic_only_stems = do
value compute_extra iic_only_stems = do
{ enter1 "maas" (* Siddhaanta kaumudii *) decl (*i Jha - CHECK i*)
where decl = Declined Noun Mas [ (Dual,[ (Ins,code "maabhyaam") ]) ]
; enter1 "yuu.sa" (* Siddhaanta kaumudii *) decl
......@@ -5784,7 +5805,7 @@ value compute_extra iic_only_stems = do
; enter1 "viz#2" (* Vedic Whitney§218a *) decl
where decl = Declined Noun Fem [ (Plural,[ (Loc,code "vik.su") ]) ]
; iter enter_iiy iic_avya
; tasil_preserve ()
; tasil_extra ()
; compute_extra_iic iic_indecl (* antar *)
; compute_extra_iic iic_only_stems (* aajaanu etc. *)
; compute_extra_iic iicf_extra (* abalaa etc. *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* CGI-bin callback for shallow syntax analysis *)
......@@ -79,10 +79,8 @@ value rec color_of_role = fun (* Semantic role of lexical category *)
| Abso | Absv | Absc | Inde | Avy | Ai | Ani | Inftu (* Circumstance *)
-> Lavender
| Unknown | Cache -> Grey
| Comp (_,ph) _ _ | Tad (_,ph) _ _ -> color_of_role ph
| Sfx -> Cyan
| Isfx -> Grey
]
| Comp (_,ph) _ _ -> color_of_role ph
]
and table_role_of phase = table_begin (background (color_of_role phase))
;
(* syntactico/semantical roles analysis, function of declension *)
......@@ -111,15 +109,7 @@ value print_segment_roles print_sems seg_num (phase,rword,_) =
Lex.process_kridanta [] seg_num phase word tags
| Preverbed (_,phase) pvs form tags ->
Lex.process_kridanta pvs seg_num phase form tags
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match Lex.tags_of ph form with
[ Atomic _ -> (* stem, tagged as iic *)
Lex.process_taddhita [] seg_num ph form sfx_phase sfx sfx_tags
| Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
Lex.process_taddhita pvs seg_num ph form sfx_phase sfx sfx_tags
| _ -> failwith "taddhita recursion unavailable"
]
] in do
] in do
{ print_labels decl_tags seg_num
; print_roles print_sems decl_phase decl_tags form
}
......@@ -138,6 +128,7 @@ value print_uni_kridanta pvs phase word multitags (n,m) =
; ps th_end
}
;
(* deprecated
value print_uni_taddhita pvs m phase stem sfx sfx_phase = fun
[ [ (delta,polytag) ] -> (* we assume n=1 taddhita form unambiguous *)
let unitag = [ project m polytag ]
......@@ -151,8 +142,8 @@ value print_uni_taddhita pvs m phase stem sfx sfx_phase = fun
}
| _ -> failwith "Multiple sfx tag"
]
;
value print_projection phase rword ((_,m) as index) = do
; *)
value print_projection phase rword ((_,m) as index) = do
{ ps tr_begin (* tr begins *)
; Morpho_html.print_signifiant_yellow rword
; let word = Word.mirror rword in
......@@ -160,13 +151,14 @@ value print_projection phase rword ((_,m) as index) = do
[ Atomic tags -> print_uni_kridanta [] phase word tags index
| Preverbed (_,phase) pvs form tags ->
print_uni_kridanta pvs phase form tags index
(* deprecated
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match Lex.tags_of ph form with
[ Atomic _ -> print_uni_taddhita [] m phase form sfx sfx_phase sfx_tags
| Preverbed _ pvs _ _ ->
print_uni_taddhita pvs m phase form sfx sfx_phase sfx_tags
| _ -> failwith "taddhita recursion unavailable"
]
] *)
]
; ps tr_end (* tr ends *)
}
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
module Phases = struct
......@@ -38,14 +38,11 @@ type phase =
| Iik (* Kridanta iics *)
| Iikv | Iikc | Kriv | Kric | Vocv | Vocc | Vokv | Vokc
| Iiy | Avy (* Avyayiibhaavas *)
| Inftu | Kama (* vaktukaama cpds *)
| Sfx | Isfx (* Taddhita suffixes for padas and iics *)
| Cache (* Lexicon acquisition *)
| Inftu | Kama (* vaktukaama cpds *)
| Cache (* Lexicon acquisition *)
| Unknown (* Unrecognized chunk *)
(* now pseudo phase tagging root/kridanta forms with preverbs *)
| Comp of tag and (* pv *) Word.word and (* root/krid in tag *) Word.word
(* finally pseudo-phase tagging nominal forms/stems with taddhita suffixes *)
| Tad of tag and Word.word (* nominal form in tag *) and Word.word (* sfx *)
]
and tag = (phase * phase) (* preverb phase and root/taddhita phase *)
(* NB. It is essential to keep both phases to identify transition checkpoints *)
......@@ -105,10 +102,8 @@ value rec string_of_phase = fun
| Vocc -> "Vocc"
| Peri -> "Peri"
| Inftu -> "Inftu"
| Kama -> "Kama"
| Sfx -> "Sfx"
| Isfx -> "Isfx"
| Cache -> "Cache"
| Kama -> "Kama"
| Cache -> "Cache"
| Unknown -> "Unknown"
| _ -> failwith "string_of_phase"
]
......@@ -160,9 +155,7 @@ and phase_of_string = fun (* unsafe *)
| "Kric" -> Kric
| "Vocv" -> Vocv
| "Vocc" -> Vocc
| "Sfx" -> Sfx
| "Isfx" -> Isfx
| "Peri" -> Peri
| "Peri" -> Peri
| "Inftu" -> Inftu
| "Kama" -> Kama
| "Unknown" -> Unknown
......@@ -180,7 +173,7 @@ and preverb_phase = fun
and krid_phase = fun [ Krid | Kric | Kriv -> True | _ -> False ]
and ikrid_phase = fun [ Iik | Iikc | Iikv -> True | _ -> False ]
and vkrid_phase = fun [ Vokc | Vokv -> True | _ -> False ]
and ii_phase = fun [ Iicv | Iicc | Iikv | Iikc | A | An | Isfx -> True | _ -> False ]
and ii_phase = fun [ Iicv | Iicc | Iikv | Iikc | A | An -> True | _ -> False ]
and is_cache phase = (phase = Cache)
;
(* Needed as argument of [Morpho.print_inv_morpho] *)
......