Commit 69513ce2 authored by Gérard Huet

Lexicon acquisition and minor corrections

parent 4dfe4337
......@@ -61,7 +61,7 @@ value canon = fun
| 46 -> "z" (* used to be "\"s" -- fragile *)
| 47 -> ".s"
| 48 -> "s"
| 49 -> "h"
| 49 -> "h" (* h/.dh *)
| 50 -> "_" (* hiatus *)
| -1 -> "'" (* avagraha *)
| -2 -> "[-]" (* amuissement - lopa of a or aa in preceding preverb *)
......
......@@ -17,7 +17,9 @@
(* - a consistency check of the output of the segmenting transducer *)
(* Dispatch, instantiated by Transducers, is used as parameter of the
Segment functor from Segmenter or Interface. *)
Segment functor from Segmenter or Interface.
It defines the phase automaton transitions.
There are two versions: 1 for Complete, 2 for Simple. *)
open Auto.Auto;
open Load_transducers; (* [transducer_vect Trans roots_morpho krids_morpho] *)
......@@ -92,11 +94,12 @@ value phantomatic = fun
| _ -> False
]
(* Amuitic forms start with -2 = [-] which elides preceding -a or -aa from Pv *)
and amuitic= fun
and amuitic = fun
[ [ -2 :: _ ] -> True
| _ -> False
]
;
(* Simplified description, not with all phases *)
(* We recognize $S = (Subst + Pron + Verb + Inde + Voca)^+$\\
with $Verb = (1 + Pv).Root + Pv.Abso + Iiv.Auxi$,\\
$Subst = Noun + Iic.Ifc + Iic.Subst + Iiv.Auxik$,\\
......@@ -109,7 +112,7 @@ NB. $Abso$ = absolutives in -ya,
The following is obtained from the above recursion equation by Brzozowski's
derivatives like in Berry-Sethi's translator. *)
value cached = (* potentially cached lexicon acquisitions *)
value cached = (* potentially cached lexicon acquisitions *)
if Web.cache_active.val="t" then [ Cache ] else []
;
(* initial1, initial2: phases *)
......
......@@ -31,7 +31,7 @@ value rec previous b left z = match left with
| Zip (b',l',_,_,z') -> previous b' l' z'
]
| [ (n,t) :: _ ] -> let w1 = extract_zip z
and w2 = last_trie t in
and w2 = last_trie t in
w1 @ [ n :: w2 ]
]
;
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* CGI-bin indexer for indexing in sanskrit dictionary. *)
......
......@@ -35,8 +35,8 @@ value morpho_gen = ref True (* morphology generation time *)
They are also used on the fly locally by [Declension] and [Conjugation]. *)
value lexicalized_kridantas = ref (Deco.empty : Naming.deco_krid)
(* It will be set by [Make_roots.roots_to_conjugs] for the [unique_kridantas]
computation. *)
(* It will be set by [Make_roots.roots_to_conjugs] in the first pass of
grinding, and used for the [unique_kridantas] computation in the second. *)
;
value access_lexical_krid stem = Deco.assoc stem lexicalized_kridantas.val
;
......@@ -53,7 +53,10 @@ value unique_kridantas = ref Deco.empty
to the lexicalized one in [Make_roots.roots_to_conjugs], which completes it
with the kridantas generated by Parts. At the end of morphological generation
its final value is stored in persistent [Install.unique_kridantas_file], and
transfered to [Install.public_unique_kridantas_file] read from module Naming. *)
transferred to [Install.public_unique_kridantas_file] read from module Naming.
This allows the tag of kridantas to link to the lexicon entry in case it has
been lexicalized. This does not work for kridantas with preverbs, and at the
moment concerns only participles, not nouns of agent or action. *)
;
value access_krid stem = Deco.assoc stem unique_kridantas.val
and register_krid stem vrp = (* used in [Parts.gen_stem] *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(* ©2019 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Unique naming mechanism. *)
......@@ -15,14 +15,14 @@
(* The problem is to find the lexical entry, if any, that matches a stem
and an etymology, corresponding to the morphological structure of a
generated stem. For instance k.rta has etymology pp(k.r\#1).
generated stem. For instance entry "k.rta" has etymology pp(k.r\#1).
It does not produce forms, and is skipped by the morphology generator,
since the pp participial stem is a productive taddhita construction,
that will indeed generate stem k.rta from its root k.r\#1.
The problem for the morphology generator is to display forms of k.rta
with a link to k.rta in the hypertext lexicon. It is non-trivial, since
homonymies occur. Thus homophony indexes associated with generators
and consistent with possible lexicalisations must be registered.
and consistent with possible lexicalizations must be registered.
A first pass of recording builds [lexical_kridantas] as a [deco_krid] deco
indexing the stems with a pair (morphology,homo). Then the morphology
generator from Inflected extends it as [unique_kridantas], accessed as
......@@ -54,7 +54,7 @@ and by [Morpho.print_inv_morpho] and [Morpho_ext.print_inv_morpho_ext]
at segmenting time. *)
(* Here we retrieve finer discrimination for verbs forms preceded by preverbs.
This is experimental, and incurs too many conversions betweeen strings
This is experimental, and incurs too many conversions between strings
and words, suggesting a restructuring of preverbs representation. *)
value preverbs_structure = (* Used in Morpho for display of pvs *)
try (Gen.gobble Web.public_preverbs_file : Deco.deco Word.word)
......
......@@ -33,10 +33,10 @@ value voices_of = fun
| "ard" | "av" | "az#2" | "as#1" | "as#2" | "aap" | "ifg" | "in" | "ind"
| "inv" | "il" | "i.s#2" | "iifkh" | "iir.s" | "uk.s" | "uc" | "ujjh" | "u~nch"
| "und" | "umbh" | "u.s" | ".rc#1" | ".rdh" | ".r.s" | "ej" | "kas" | "kiil"
| "ku.t" | "ku.n.th" | "kunth" | "kup" | "kul" | "kuuj" | "k.rt#1" | "k.rz"
| "krand" | "krii.d" | "kru~nc#1" | "krudh#1" | "kruz" | "klam" | "klid"
| "kliz" | "kvath" | "k.sar" | "k.sal" | "k.si" | "k.sii" | "k.su" | "k.sudh#1"
| "k.subh" | "k.svi.d" | "kha~nj#1" | "khaad" | "khid" | "khel"
| "ku.t" | "ku.n.th" | "kunth" | "kup" | "kul" | "ku.s" | "kuuj" | "k.rt#1"
| "k.rz" | "krand" | "krii.d" | "kru~nc#1" | "krudh#1" | "kruz" | "klam"
| "klid" | "kliz" | "kvath" | "k.sar" | "k.sal" | "k.si" | "k.sii" | "k.su"
| "k.sudh#1" | "k.subh" | "k.svi.d" | "kha~nj#1" | "khaad" | "khid" | "khel"
| "khyaa" | "gaj" | "gad" | "garj" | "gard" | "gal" | "gaa#1" | "gaa#2"
| "gu~nj" | "gu.n.th" | "gup" | "gumph" | "g.rdh" | "g.rr#1" | "g.rr#2"
| "granth" | "grah" | "glai" | "ghas" | "ghu.s" | "gh.r" | "gh.r.s"
......
......@@ -1147,7 +1147,7 @@ value compute_athematic_imperative2a strong weak set entry =
; (Second, match entry with
[ "as#1" -> code "edhi"
| "zaas" -> code "zaadhi"
(* above leads to conflict between \Pan{6.4.35} (zaa+hi) and \Pan{6.4.101}
(* above leads to conflict between \Pan{6,4,35} (zaa+hi) and \Pan{6,4,101}
(zaas+dhi) [asiddhavat] => we operate in parallel zaa+dhi= zaadhi *)
| "cakaas" -> code "cakaadhi" (* Kane§429 *)
| _ -> let w = if entry = "han#1" then revcode "ja" else weak in
......@@ -4089,7 +4089,7 @@ value compute_aorist entry =
; compute_redup_aoristm stem entry
}
| "iik.s" | "kamp" | "klid" | "gup" | "cur" | "m.r" | "d.rz#1" | "dyut#1"
| "vrazc" | "sru" -> (* active only *)
| "vrazc" | "siiv" | "sru" -> (* active only *)
let stem = redup_aor weak entry in
compute_redup_aorista stem entry
| "grah" -> do
......@@ -4118,15 +4118,14 @@ value compute_aorist entry =
| _ -> ()
]
; match entry with (* 4. sigma aorist sic *)
[ "aap" | "k.r#1" | "gup" | "chid#1" | "ji" | "tud" | "t.rr" | "dah#1"
| "daa#1" | "d.rz#1" | "draa#2" | "dhaa#1" | "dhyaa" | "dhyai" | "dhv.r"
| "nak.s" | "nii#1" | "pac" | "praz" | "prii" | "pru.s#1" | "budh#1"
| "bhaa#1" | "bhii#1" | "muc#1" | "yaj#1" | "yuj#1" | "ram" | "labh"
| "v.r#2" | "vyadh" | "zru" | "sidh#1" | "s.rj#1" | "stu" | "sp.rz#1"
| "hu" -> do
[ "aap" | "k.r#1" | "gup" | "chid#1" | "ji" | "tud" | "t.rr"
| "dah#1" | "daa#1" | "d.rz#1" | "draa#2" | "dhaa#1" | "dhyaa" | "dhyai"
| "dhv.r" | "nak.s" | "nii#1" | "pac" | "praz" | "prii"
| "budh#1" | "bhaa#1" | "bhii#1" | "muc#1" | "yaj#1" | "yuj#1" | "ram"
| "labh" | "v.r#2" | "vyadh" | "zru" | "sidh#1" | "s.rj#1" | "stu"
| "sp.rz#1" | "hu" -> do
{ let stem = match entry with
[ "d.rz#1" | "s.rj#1" | "sp.rz#1" -> long_metathesis weak
| "pru.s#1" -> strong
| "ram" -> weak
| _ -> long
] in
......@@ -4154,11 +4153,12 @@ value compute_aorist entry =
]
; match entry with (* 5. i.s aorist se.t-sic *)
[ "ak.s" | "aj" | "aas#2" | "i.s#1" | "iik.s" | "uk.s" | "uc" | "u.s"
| "uuh" | ".rc#1" | "k.rt#1" | "krand" | "kram" | "k.san" | "khan" | "car"
| "ce.s.t" | "jalp" | "jaag.r" | "t.rr" | "diip" | "pa.th" | "puu#1" | "p.rc"
| "baadh" | "budh#1" | "mad#1" | "mud#1" | "muurch" | "mlecch" | "yaac"
| "ruc#1" | "lu~nc" | "luu#1" | "vad" | "vadh" | "vid#1" | "v.r#1" | "vraj"
| "z.rr" | "sidh#2" | "skhal" | "stan" | "stu" | "hi.ms" -> do
| "uuh" | ".rc#1" | "k.rt#1" | "krand" | "kram" | "k.san" | "khan"
| "car" | "ce.s.t" | "jalp" | "jaag.r" | "t.rr" | "diip" | "pa.th" | "puu#1"
| "p.rc"| "pru.s#1" | "baadh" | "budh#1" | "mad#1" | "mud#1" | "muurch"
| "mlecch" | "yaac" | "ruc#1" | "lu~nc" | "luu#1" | "vad" | "vadh" | "vid#1"
| "v.r#1" | "vraj" | "z.rr" | "sidh#2" | "skhal" | "stan" | "stu"
| "hi.ms" -> do
{ let stem = match weak with
[ [ 7 (* .r *) :: _ ] ->
if entry = "jaag.r" then strong (* jaagari.sam RF IC 2 p 88 *)
......@@ -4177,7 +4177,7 @@ value compute_aorist entry =
compute_ath_is_aorista stem entry
; compute_ath_is_aoristm strong entry
}
| "gup" | "vrazc" | "zcut#1" | "sphu.t" -> (* active only *)
| "ku.s" | "gup" | "vrazc" | "zcut#1" | "sphu.t" -> (* active only *)
compute_ath_is_aorista strong entry
| "zuu" ->
compute_ath_is_aorista (revcode "zve") entry
......
......@@ -8,4 +8,4 @@
(**************************************************************************)
(* Generated by make version - see main Makefile *)
value version="3.14" and version_date="2019-03-21";
value version="3.14" and version_date="2019-04-02";
VERSION='3.14'
DATE='2019-03-21'
DATE='2019-04-02'
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment