Commit 70c10e8e authored by Gérard Huet's avatar Gérard Huet

improvements of sa treatment; renamings

parent a02bdf46
......@@ -5,7 +5,7 @@
# Gérard Huet & Pawan Goyal #
# #
############################################################################
# Makefile of Sanskrit Heritage Software 08-01-2018 Copyright INRIA 2018 #
# Makefile of Sanskrit Heritage Software 21-01-2018 Copyright INRIA 2018 #
############################################################################
# Prerequisites: Ocaml and Camlp4 preprocessor
......
......@@ -556,9 +556,9 @@ value look_up_and_display font gana entry =
and sort_out_u accu form = fun
[ [ (_,morphs) ] -> List.fold_left (reorg form) accu morphs
where reorg f (inf,absya,per,abstva) = fun
[ Ind_verb (c,Infi) when c=conj -> ([ (c,f) :: inf ],absya,per,abstva)
| Ind_verb (c,Absoya) when c=conj -> (inf,[ (c,f) :: absya ],per,abstva)
| Ind_verb (c,Perpft) when c=conj -> (inf,absya,[ (c,f) :: per ],abstva)
[ Und_verb (c,Infi) when c=conj -> ([ (c,f) :: inf ],absya,per,abstva)
| Und_verb (c,Absoya) when c=conj -> (inf,[ (c,f) :: absya ],per,abstva)
| Und_verb (c,Perpft) when c=conj -> (inf,absya,[ (c,f) :: per ],abstva)
| Abs_root c when c=conj -> (inf,absya,per,[ (c,f) :: abstva ])
| _ -> (inf,absya,per,abstva)
]
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Syntactico/semantic analysis and penalty computations. *)
......@@ -259,17 +259,17 @@ value get_roles entry = fun
else Actor (gram_role n entry c) g n (* beware n duplication *)
| Verb_form f n p -> get_fin_roles entry f n p
| Abs_root _ -> get_abs_roles entry
| Ind_form Conj -> match entry with
| Und_form Conj -> match entry with
[ "ca" -> Tool Coordination
| _ -> Ignored (* TODO vaa etc *)
]
| Ind_form Prep -> if entry = "saha" || entry = "vinaa" || entry = "satraa"
| Und_form Prep -> if entry = "saha" || entry = "vinaa" || entry = "satraa"
then Tool Post_instrument
else Ignored
| Ind_form Adv -> if entry = "saha" then Tool Not_Post_instrument
| Und_form Adv -> if entry = "saha" then Tool Not_Post_instrument
else Ignored
| Ind_form Abs -> get_abs_roles entry
| Ind_form Part -> match entry with
| Und_form Abs -> get_abs_roles entry
| Und_form Part -> match entry with
[ "maa#2" -> Tool Prohibition
| _ -> Ignored
]
......
......@@ -199,7 +199,7 @@ value sort_out accu form = fun
]
| Bare_stem | Auxi_form -> (mas,fem,neu,any,[ f :: iic ],avy)
| Avyayaf_form -> (mas,fem,neu,any,iic,[ f :: avy ])
| Ind_form _ | Verb_form _ _ _ | Ind_verb _ | Abs_root _
| Und_form _ | Verb_form _ _ _ | Und_verb _ | Abs_root _
| Avyayai_form | Unanalysed | PV _
| Part_form _ _ _ _ ->
failwith "Unexpected form in declensions"
......
......@@ -290,7 +290,7 @@ value extract_gana_pada = fun
| Conjug _ v | Perfut v -> (None,v)
] in
(conj,(o_gana,pada_of_voice voice))
| Ind_verb _ _ -> raise Unvoiced (* could be refined *)
| Und_verb _ _ -> raise Unvoiced (* could be refined *)
| _ -> failwith "Unexpected root form"
]
and extract_gana_pada_k krit =
......@@ -597,7 +597,7 @@ value validate out = match out with
else []
]
| [ (Abso,rev_abso_form,s) :: [ (Pv,prev,sv) :: r ] ] ->
(* Takes care of absolutives in -ya and of infinitives with preverbs *)
(* Takes care of absolutives in -ya and infinitives with preverbs *)
let pv = Word.mirror prev in
let pv_str = Canon.decode pv
and abso_form = Word.mirror rev_abso_form in
......@@ -614,10 +614,18 @@ value validate out = match out with
]
(* We now prevent overgeneration of forms "sa" and "e.sa" \Pan{6,1,132} *)
(*i TODO: similar test for dual forms i*)
| [ (ph,form,_) :: [ (Pron,[ 1; 48 ],_) :: _ ] ] (* sa *)
| [ (ph,form,_) :: [ (Pron,[ 1; 48 ],_) :: _ ] ] (* sa *) ->
if Phonetics.consonant_initial (Word.mirror form)
then out else []
| [ (ph,form,_) :: [ (Pron,[ 1; 47; 10 ],_) :: _ ] ] (* e.sa *) ->
if Phonetics.consonant_initial (Word.mirror form)
if Phonetics.consonant_initial (Word.mirror form)
then out else []
| [ (ph,form,_) :: [ (Pron,[ 48; 1; 48 ],_) :: _ ] ] (* sas *) ->
if Phonetics.consonant_initial (Word.mirror form) then []
else out
| [ (ph,form,_) :: [ (Pron,[ 48; 1; 47; 10 ],_) :: _ ] ] (* e.sas *) ->
if Phonetics.consonant_initial (Word.mirror form) then []
else out
(* Alternative: put infinitives in Root rather than Indecl+Abso
[| [ (Absc,_,_) :: _ ]
| [ (Absv,_,_) :: _ ] -> check root is autonomous
......@@ -640,7 +648,19 @@ This pv is not terminal, and should be chopped off by the next item *)
| _ -> out (* default identity *)
]
;
value terminal_sa = fun
[ [ (Pron,[ 1; 48 ],_) :: _ ] (* sa *)
| [ (Pron,[ 1; 47; 10 ],_) :: _ ] (* e.sa *) -> True
| _ -> False
]
;
(*i unused
value terminal_sas = fun
[ [ (Pron,[ 48; 1; 48 ],_) :: _ ] (* sas *)
| [ (Pron,[ 48; 1; 47; 10 ],_) :: _ ] (* e.sas *) -> True
| _ -> False
]
; i*)
open Html;
value rec color_of_phase = fun
[ Noun | Noun2 | Lopak | Nouc | Nouv | Kriv | Kric | Krid | Auxik | Kama
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Dispatcher: Sanskrit Engine in 55 phases automaton (plus 2 fake ones) *)
......@@ -53,7 +53,10 @@ value trim_tags :
;
value validate : output -> output (* consistency check and glueing *)
;
value color_of_phase : phase -> Html.color;
value terminal_sa : output -> bool
;
value color_of_phase : phase -> Html.color
;
end;
......@@ -13,8 +13,11 @@
open List2; (* unstack ass subtract *)
open Auto.Auto; (* auto rule choices State *)
module Segment
(Phases: sig
(* used by Interface : [Viccheda = Segment Phases Machine Segment_control]
where Machine = Dispatch Transducers Lemmas
where Lemmas = Load_morphs.Morphs Prel Phases *)
module Segment
(Phases: sig
type phase
and phases = list phase;
value unknown : phase;
......@@ -23,8 +26,8 @@ module Segment
value ii_phase : phase -> bool;
value un_lopa : phase -> phase;
end)
(Eilenberg: sig
value transducer : Phases.phase -> auto;
(Eilenberg: sig (* To be instanciated by Dispatcher *)
value transducer : Phases.phase -> auto;
value initial : bool -> Phases.phases;
value dispatch : bool -> Word.word -> Phases.phase -> Phases.phases;
value accepting : Phases.phase -> bool;
......@@ -36,6 +39,8 @@ module Segment
and segment = (Phases.phase * Word.word * transition)
and output = list segment;
value validate : output -> output; (* consistency check / compress *)
value terminal_sa : output -> bool;
(* unused value terminal_sas : output -> bool; *)
end)
(Control: sig value star : ref bool; (* chunk= if star then word+ else word *)
value full : ref bool; (* all kridantas and nan cpds if full *)
......@@ -130,12 +135,21 @@ value register index (phase,pada,sandhi) =
| [] -> update_graph [ (phase,[ pada_right ]) ] (* new bucket *)
]
;
type chunk_params = { offset : mutable int; segmentable : mutable bool }
(* To avoid heavy functional transmission of chunk global parameters,
we define a record of chunk parameters.
NB. offset and last are inherited attributes, segmentable is synthesized. *)
type chunk_params = { offset : mutable int
; segmentable : mutable bool
; last : mutable bool (* for sa elimination in last chunk *)
}
;
value cur_chunk = { offset = 0; segmentable = False }
value cur_chunk = { offset = 0; segmentable = False; last = False }
;
value set_cur_offset n = cur_chunk.offset := n
and set_segmentable b = cur_chunk.segmentable := b
and set_last b = cur_chunk.last := b
;
value set_offset (offset,checkpoints) = do
{ set_cur_offset offset
......@@ -154,9 +168,9 @@ value reset_visual () = for i = 0 to max_seg_rows-1 do
(* The offset permits to align each segment with the input string *)
value offset = fun
[ Euphony (w,u,v) ->
let off = if w=[] then 1 (* amui/lopa from Lopa/Lopak *)
else Word.length w in
off - (Word.length u + Word.length v)
let off = if w=[] then 1 (* amui/lopa from Lopa/Lopak *)
else Word.length w in
off - (Word.length u + Word.length v)
| Id -> 0
]
;
......@@ -361,6 +375,12 @@ type backtrack =
]
and resumption = list backtrack (* coroutine resumptions *)
;
value check_sa contracted =
not (cur_chunk.last && terminal_sa contracted) (* forbid sa last *)
(* [ && (not (terminal_sas contracted) || cur_chunk.last) (* sa.h last only *) ]
This is too strict, in view of padapatha and und-sandhied mode
et on a donc un peu d'overgeneration, avec eg "sa.h yogii" *)
;
(* Service routines of the segmenter *)
......@@ -400,29 +420,29 @@ value rec react phase input output back occ = fun
let deter cont = match input with
[ [] -> continue cont
| [ letter :: rest ] -> match ass letter det with
[ Some state ->
react phase rest output cont [ letter :: occ ] state
[ Some state -> react phase rest output cont [ letter :: occ ] state
| None -> continue cont
]
] in
let cont = if choices=[] then back (* non deterministic continuation *)
else [ Choose phase input output occ choices :: back ] in
(* now we look for - or + segmentation pragma *)
(* now we look for - or + segmentation hint *)
let (keep,cut,input') = match input with
[ [ 0 :: rest ] -> (* explicit "-" compound break hint *)
(ii_phase phase,True,rest)
| [ -10 :: rest ] -> (* mandatory segmentation indicated by "+" *)
| [ -10 :: rest ] -> (* mandatory segmentation "+" *)
(True,True,rest)
| _ -> (True,False,input) (* no hint in input *)
] in
if accept && keep then
let segment = (phase,occ,Id) in
let out = accrue segment output in (*i unknown Id sandhi - TODO i*)
match validate out with
[ [] -> if cut then continue cont else deter cont
match validate out (* validate and compact partial output *) with
[ [] -> if cut then continue cont else deter cont
| contracted -> match input' with
[ [] -> if accepting phase then (* solution found *)
do { log_chunk contracted; continue cont }
[ [] -> if accepting phase (* solution found *)
&& check_sa contracted (* forbid sa last *)
then do { log_chunk contracted; continue cont }
else continue cont
| [ first :: _ ] -> (* we first try the longest matching word *)
let cont' = schedule phase input' contracted [] cont in
......@@ -447,7 +467,8 @@ and choose phase input output back occ = fun
| contracted ->
if v=[] (* final sandhi *) then
if rest=[] && accepting phase (* solution found *)
then do { log_chunk contracted; continue cont }
&& check_sa contracted (* forbid sa last *)
then do { log_chunk contracted; continue cont }
else continue cont
else continue (schedule phase rest contracted v cont)
]
......@@ -486,7 +507,7 @@ value segment chunk = do
;
(* Splitting checkpoints into current and future ones *)
value split_check limit = split_rec []
where rec split_rec acc checkpts = match checkpts with
where rec split_rec acc checkpts = match checkpts with
[ [] -> (Word.mirror acc,[])
| [ ((index,_,_) as check) :: rest ] ->
if index > limit then (Word.mirror acc,checkpts)
......@@ -496,18 +517,18 @@ value split_check limit = split_rec []
(* We do not need to [dove_tail] like in Rank, since chunks are independent. *)
(* Returns a pair (b,n) where b is True if all chunks are segmentable so far,
and n is the number of potential solutions *)
value segment_all = List.fold_left segment_chunk (True,Num.Int 1)
where segment_chunk (flag,count) chunk =
value segment_chunk (full,count) chunk last =
let extremity = cur_chunk.offset+Word.length chunk in
let (local,future) = split_check extremity chkpts.all_checks in do
{ chkpts.segment_checks := local
; set_last last
; let segmentable = segment chunk
and local_count = get_counter () in do
{ set_segmentable False
; set_offset (succ extremity,future)
; if segmentable then do
{ reset_counter ()
; (flag,Num.mult_num count (Num.Int local_count))
; (full,Num.mult_num count (Num.Int local_count))
(* we have [local_count] segmentations of the local [chunk], and,
chunks being independent, the total number of solutions multiply *)
}
......@@ -515,6 +536,14 @@ value segment_all = List.fold_left segment_chunk (True,Num.Int 1)
}
}
;
value segment_iter chunks = segment_chunks (True,Num.Int 1) chunks
where rec segment_chunks acc = fun (* terminal recursion *)
[ [ (* last *) chunk ] -> segment_chunk acc chunk True
| [ chunk :: rest ] -> segment_chunks (segment_chunk acc chunk False) rest
| [] -> acc
]
;
end; (* Segment *)
......@@ -126,11 +126,11 @@ value peri = ref (Deco.empty : inflected_map)
value add_morphperi w d i =
peri.val := Lexmap.addl peri.val w (d w,i)
;
(* indeclinable forms - adverbs, conjonctions, particles *)
value indecls = ref (Deco.empty : inflected_map)
(* undeclinable forms - adverbs, conjonctions, particles *)
value undecls = ref (Deco.empty : inflected_map)
;
value add_morphin w d i =
indecls.val := Lexmap.addl indecls.val w (d w,i)
value add_morphund w d i =
undecls.val := Lexmap.addl undecls.val w (d w,i)
;
(* invocations are registered in invs *)
value invs = ref (Deco.empty : inflected_map)
......@@ -349,7 +349,7 @@ value add_morphauxiick w stem i =
value inftu = ref (Deco.empty : inflected_map)
and kama = ref (Deco.empty : inflected_map)
;
value add_morphinftu w d i = (* similar to [add_morphin] *)
value add_morphinftu w d i = (* similar to [add_morphund] *)
if Phonetics.phantomatic w then () else
inftu.val := Lexmap.addl inftu.val w (d w,i)
and add_morphkama w d i = (* similar to [add_morph] *)
......@@ -372,7 +372,7 @@ type nominal =
type flexion =
[ Declined of nominal and gender and list (number * list (case * word))
| Conju of finite and list (number * list (person * word))
| Indecl of ind_kind and word (* avyaya, particle, interjection, nota *)
| Undecl of und_kind and word (* avyaya, particle, interjection, nota *)
| Bare of nominal and word (* Iic *)
| Avyayai of word (* Iic of avyayiibhaava cpd *)
| Avyayaf of word (* Ifc of avyayiibhaava cpd *)
......@@ -432,10 +432,10 @@ value enter1 entry =
; (* Now auxiliaries for verbal cvi compounds *)
if auxiliary entry then add_morphauxi w delta v else ()
}
| Indecl k w -> match k with
| Undecl k w -> match k with
[ Adv | Part | Conj | Default | Prep | Tas ->
add_morphin w delta (Ind_form k)
| Interj -> add_invoc w delta (Ind_form k)
add_morphund w delta (Und_form k)
| Interj -> add_invoc w delta (Und_form k)
| Avya -> () (* since generative *)
| Abs | Infl | Nota -> () (* no recording in morph tables *)
(* Abs generated by absolutives of verbs, Infl by flexions of nouns, and
......@@ -447,7 +447,7 @@ value enter1 entry =
| Avyayaf w -> add_morphyaf w delta Avyayaf_form
| Cvi w -> add_morphvi w delta Auxi_form
| Invar m w -> let (_,vi) = m
and f = Ind_verb m in
and f = Und_verb m in
match vi with
[ Infi -> do (* 2 cases: with and without preverbs - saves one phase *)
{ add_morphabsya w delta f aapv
......@@ -466,7 +466,7 @@ value enter1 entry =
(* NB Allows perpft of verbs with preverbs but overgenerates since
it allows perpft followed by a non perfect form of auxiliary *)
]
| Inftu m w -> let f = Ind_verb (m,Infi) in
| Inftu m w -> let f = Und_verb (m,Infi) in
add_morphinftu w delta f (* infinitive in -tu *)
| Absotvaa c w -> let f = Abs_root c in
add_morphabstvaa w delta f (* abs-tvaa: no preverb *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(*i module Inflected : sig i*)
......@@ -31,7 +31,7 @@ value peri : ref inflected_map;
value auxi : ref inflected_map;
value auxik : ref inflected_map;
value auxiick : ref inflected_map;
value indecls : ref inflected_map;
value undecls : ref inflected_map;
value invs : ref inflected_map;
value absya : ref inflected_map;
value abstvaa : ref inflected_map;
......@@ -57,7 +57,7 @@ type nominal =
type flexion =
[ Declined of nominal and gender and list (number * list (case * Word.word))
| Conju of finite and list (number * list (person * Word.word))
| Indecl of ind_kind and Word.word
| Undecl of und_kind and Word.word
| Bare of nominal and Word.word
| Avyayai of Word.word (* Iic of avyayiibhaava cpd *)
| Avyayaf of Word.word (* Ifc of avyayiibhaava cpd *)
......
......@@ -53,7 +53,7 @@ module Transducers = Trans Prel
;
module Machine = Dispatch Transducers Lemmas
;
open Machine (* [cache_phase] *)
open Machine
;
(* At this point we have a Finite Eilenberg machine ready to instantiate *)
(* the Eilenberg component of the Segment module. *)
......@@ -74,7 +74,7 @@ end (* [Segment_control] *)
;
module Viccheda = Segment Phases Machine Segment_control
;
open Viccheda (* [segment_all visual_width] etc. *)
open Viccheda (* [segment_iter visual_width] etc. *)
;
(* At this point we have the sandhi inverser segmenting engine *)
......@@ -416,7 +416,7 @@ value check_sentence translit us text_orig checkpoints sentence
let devainput = String.concat " " devachunks
and cpts = sort_check checkpoints in
let _ = chkpts.all_checks := cpts
and (flag,count) = segment_all chunks in
and (full,count) = segment_iter chunks in (* full iff all chunks segment *)
let text = match sol_num with
[ "0" -> update_text_with_sol text_orig count
| _ -> text_orig
......@@ -429,7 +429,7 @@ value check_sentence translit us text_orig checkpoints sentence
; html_break |> ps
; div_begin Latin16 |> ps
; table_begin Spacing20 |> pl
; tr_begin |> pl
; tr_begin |> pl (* tr begin *)
; td_wrap (call_undo text checkpoints ^ "Undo") |> ps
; let call_scl_parser n = (* invocation of scl parser *)
if scl_toggle then
......@@ -445,13 +445,13 @@ value check_sentence translit us text_orig checkpoints sentence
}
else do
{ td_wrap (call_reader text cpts "p" ^ "Filtered Solutions") |> ps
; let info = string_of_int n ^ if flag then "" else " Partial" in
; let info = string_of_int n ^ if full then "" else " Partial" in
td_wrap (call_reader text cpts "t" ^ "All " ^ info ^ " Solutions") |> ps
; call_scl_parser n
}
| _ -> td_wrap "(More than 2^32 Solutions!)" |> ps
]
; tr_end |> pl
; tr_end |> pl (* tr end *)
; table_end |> pl
; div_end |> ps (* Latin16 *)
; html_break |> pl
......@@ -550,19 +550,17 @@ value graph_engine () = do
and uns = us="t" (* unsandhied vs sandhied corpus *)
and () = if st="f" then iterate.val:=False else () (* word stemmer? *)
and () = if cp="f" then complete.val:=False else () (* simplified reader? *)
and () = toggle_lexicon lex
and corpus = get "corpus" env ""
and () = toggle_lexicon lex (* sticky lexicon switch *)
and corpus = get "corpus" env ""
and sent_id = get "sentenceNumber" env "0"
and link_num = get "linkNumber" env "0" (* is there a better default? *)
and sol_num = get "allSol" env "0" in (* Needed for Validate mode *)
let url_enc_corpus_permission =
Cgi.get Params.corpus_permission env (string_of_bool True)
in
Cgi.get Params.corpus_permission env "true" in
let corpus_permission =
url_enc_corpus_permission
|> Cgi.decode_url
|> Web_corpus.permission_of_string
in
|> Web_corpus.permission_of_string in
let corpus_dir = Cgi.get Params.corpus_dir env "" in
let sentence_no = Cgi.get Params.sentence_no env "" in
let text = arguments translit lex cache st us cp url_encoded_input
......@@ -582,12 +580,12 @@ value graph_engine () = do
{ append_cache entry gender
; let cache_txt_file = public_cache_txt_file in
let cache = Nouns.extract_current_cache cache_txt_file in
make_cache_transducer cache
make_cache_transducer cache
}
else () in
let revised = decode_url (get "revised" env "")
and rev_off = int_of_string (get "rev_off" env "-1")
and rev_ind = int_of_string (get "rev_ind" env "-1") in
and rev_ind = int_of_string (get "rev_ind" env "-1") in
try do
{ match (revised,rev_off,rev_ind) with
[ ("",-1,-1) -> check_sentence translit uns text checkpoints
......@@ -612,15 +610,14 @@ value graph_engine () = do
let new_chunk_len = Word.length (Encode.switch_code translit revised) in
let diff = new_chunk_len-word_len in
let revised_check =
let revise (k,sec,sel) =
(if k<word_off then k else k+diff,sec,sel) in
let revise (k,sec,sel) = (if k<word_off then k else k+diff,sec,sel) in
List.map revise checkpoints
and updated_text = arguments translit lex cache st us cp updated_input
url_encoded_topic abs sol_num corpus sent_id link_num
url_enc_corpus_permission corpus_dir sentence_no
and new_input = decode_url updated_input in
check_sentence translit uns updated_text revised_check
new_input sol_num corpus sent_id link_num
new_input sol_num corpus sent_id link_num
]
(* automatically refreshing the page only if guess parameter *)
; if String.length guess_morph > 0 then
......@@ -630,13 +627,13 @@ value graph_engine () = do
else ()
(* Save sentence button *)
; if corpus_permission = Web_corpus.Annotator then
(* TODO: use [segment_all] to compute the nb of sols instead of
passing 0 to [nb_sols]. *)
save_button query (Num.num_of_int 0) |> pl
(* TODO: use [segment_iter] to compute the nb of sols instead of
passing 0 to [nb_sols]. *)
save_button query (Num.num_of_int 0) |> pl
else ()
; html_break |> pl
(* Quit button: continue reading (reader mode) or quit without
saving (annotator mode). *)
(* Quit button: continue reading (reader mode)
or quit without saving (annotator mode) *)
; if sentence_no <> "" then
quit_button corpus_permission
(Cgi.decode_url corpus_dir) (Cgi.decode_url sentence_no) |> pl
......@@ -671,6 +668,6 @@ value safe_engine () =
;
end (* Interface *)
;
Interface.safe_engine () (* Should always produce a compliant xhtml page *)
Interface.safe_engine () (* Should always produce a compliant HTML page *)
;
......@@ -8,7 +8,7 @@
(**************************************************************************)
(* Sanskrit Phrase Lexer in 40 phases version. *)
(* Used by Parser, Reader and Regression.
(* Used by Parser, and Rank for Reader/Regression.
Uses Phases from Dispatcher to define phase.
Loads the transducers, calls Dispatch to create Disp.
Calls Segment to build Viccheda, the Sanskrit lexer that undoes sandhi
......@@ -65,7 +65,8 @@ value rec color_of_role = fun (* Semantic role of lexical category *)
| Noun | Noun2 | Nouv | Nouc | Krid | Kriv | Kric | Pron | Ifc | Ifc2
| Kama | Lopak | Auxik -> Cyan (* Actor or Predicate *)
| Root | Lopa | Auxi -> Pink (* abs-tvaa in Inde *) (* Process *)
| Abso | Absv | Absc | Inde | Avy | Ai | Ani | Inftu -> Lavender (* Circumstance *)
| Abso | Absv | Absc | Inde | Avy | Ai | Ani | Inftu (* Circumstance *)
-> Lavender
| Unknown | Cache -> Grey
| Comp (_,ph) _ _ | Tad (_,ph) _ _ -> color_of_role ph
| Sfx -> Cyan
......
......@@ -124,7 +124,7 @@ value load_transducer cat =
else do { Prel.prelude (); abort cat } ]
;
(* privative prefixes automata *)
value a_trans = State(False,[(1,State(True,[],[cch]))],[])
value a_trans = State(False,[(1,State(True,[],[ cch ]))],[])
where cch = (([ 22; 23 ],[],[ 23 ]) : rule) (* a-ch \R acch *)
and an_trans = let n_trans = State(False,[(36,State(True,[],[]))],[]) in
State(False,[(1,n_trans)],[])
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(*i module Morpho_scl = struct i*)
......@@ -137,13 +137,13 @@ value print_scl_morph = fun
; print_scl_number n
; print_scl_person p
}
| Ind_form k -> print_scl_kind k
| Und_form k -> print_scl_kind k
| Avyayaf_form -> ps "<avya/>"
| Abs_root c -> do { print_scl_conjugation c; ps "<abs/>" }
| Auxi_form -> ps "<iiv/>"
| Ind_verb m -> print_scl_modal m
| PV _ -> ps "<pv/>"
| Unanalysed -> ps "<unknown/>"
| Abs_root c -> do { print_scl_conjugation c; ps "<abs/>" }
| Auxi_form -> ps "<iiv/>"
| Und_verb m -> print_scl_modal m
| PV _ -> ps "<pv/>"
| Unanalysed -> ps "<unknown/>"
]
;
value print_scl_morphs =
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Linearizes morphological information as a string.
......@@ -83,7 +83,7 @@ and string_person = fun
| Second -> "2"
| Third -> "3"
]
and string_ind_kind = fun
and string_und_kind = fun
[ Part -> "part."
| Prep -> "prep."
| Conj -> "conj."
......@@ -116,10 +116,10 @@ value string_morph = fun
| Bare_stem | Avyayai_form -> "iic."
| Avyayaf_form -> "ind."
| Verb_form f n p -> (string_finite f) ^ (string_number n) ^ (string_person p)
| Ind_form k -> string_ind_kind k
| Und_form k -> string_und_kind k
| Abs_root c -> (string_conjugation c) ^ "abs."
| Auxi_form -> "iiv."
| Ind_verb m -> string_modal m
| Und_verb m -> string_modal m
| Unanalysed -> "?"
| PV pvs -> "pv."
]
......
......@@ -23,8 +23,8 @@ type inflexion_tag = (* vibhakti *)
| Avyayai_form (* iic forms of avyayiibhaava cpds *)
| Avyayaf_form (* ifc forms of avyayiibhaava cpds *)
| Verb_form of finite and number and person (* finite conjugated root forms *)
| Ind_form of ind_kind (* indeclinable forms: prep, adv, etc *)
| Ind_verb of modal (* indeclinable inf abs-ya and perpft *)
| Und_form of und_kind (* undeclinable forms: prep, adv, etc *)
| Und_verb of modal (* undeclinable inf abs-ya and perpft *)
| Abs_root of conjugation (* abs-tvaa *)
| Auxi_form (* verbal auxiliaries forms *)
| Unanalysed (* un-analysable segments *)
......
......@@ -48,7 +48,7 @@ and normal_stem = Encode.normal_stem
(* declension generators *)
type declension_class =
[ Gender of gender (* declined substantive, adjective, number, pronoun *)
| Ind of ind_kind (* Indeclinable form *)
| Und of und_kind (* undeclinable form *)
]
and nmorph = (string * declension_class)
;
......@@ -201,7 +201,7 @@ value build_mas_a stem entry =
]
; Bare Noun (wrap stem 1)
; Avyayaf (fix stem "am"); Avyayaf (fix stem "aat") (* avyayiibhaava *)
; Indecl Tas (fix stem "atas") (* tasil productive *)
; Undecl Tas (fix stem "atas") (* tasil productive *)
; Cvi (wrap stem 4) (* cvi now productive for masculine stems in -a *)
])
;
......@@ -245,7 +245,7 @@ value build_mas_i stem trunc entry = (* declension of "ghi" class *)
]
; Bare Noun (mirror stem)
; Avyayaf (mirror stem)
; Indecl Tas (fix stem "tas")
; Undecl Tas (fix stem "tas")
; Cvi (wrap trunc 4) (* "aadhi1" "pratinidhi" *)
])
;
......@@ -330,7 +330,7 @@ value build_mas_u stem trunc entry = (* similar to [build_mas_i] *)
; Bare Noun (mirror stem)
; Cvi (wrap trunc 6) (* .rju maru m.rdu laghu *)
; Avyayaf (mirror stem)
; Indecl Tas (fix stem "tas")
; Undecl Tas (fix stem "tas")
]
;
value build_mas_ri_v stem entry = (* vriddhi in strong cases *)
......@@ -531,7 +531,7 @@ value build_mas_red stem entry =
; decline Loc "tsu"
])
]
; Indecl Tas (fix stem "tas")
; Undecl Tas (fix stem "tas")
]
;
value build_mas_at stem entry =
......@@ -711,7 +711,7 @@ value build_man g stem entry =
])
]
; Avyayaf (fix stem "mam")
; Indecl Tas (fix stem "matas")
; Undecl Tas (fix stem "matas")
] @ (if entry = "dharman" then [] (* redundant with dharma *)
else [ Bare Noun (mirror [ 1 :: [ 41 :: stem ]]) ])
@ (if g=Neu && man_iiv entry then [ Cvi (mirror [ 4 :: [ 41 :: stem ]]) ]
......@@ -812,7 +812,7 @@ value build_van g stem entry =
]
; Bare Noun (mirror [ 1 :: [ 44 :: stem ]])
; Avyayaf (fix stem "vam")
; Indecl Tas (fix stem "vatas")
; Undecl Tas (fix stem "vatas")
]
@ if g=Neu then [ Avyayaf (fix stem "va") ] else []) (* \Pan{5,4,109} *)
......@@ -1914,7 +1914,7 @@ value build_neu_a stem entry =
]
; Bare Noun (wrap stem 1)
; Avyayaf (fix stem "am"); Avyayaf (fix stem "aat")
; Indecl Tas (fix stem "atas")
; Undecl Tas (fix stem "atas")
] @ (if a_n_iiv entry then [ Cvi (wrap stem 4) ] else []))
;
value build_neu_i trunc entry = (* stems in -i and -ii *)
......@@ -2928,7 +2928,7 @@ value build_fem_i stem trunc entry =
]
; Bare Noun (mirror stem)
; Avyayaf (mirror stem)
; Indecl Tas (fix stem "tas")