Commit a2111408 authored by Gérard Huet's avatar Gérard Huet
Browse files

Fix in morphology printing with preverbs explicit

parent 049c3c57
This diff is collapsed.
......@@ -46,7 +46,7 @@ GRAM = index.ml phonetics.ml int_sandhi.ml skt_morph.mli morphology.mli \
dispatcher.mli dispatcher.ml segmenter.ml load_morphs.ml lexer.mli \
lexer.ml rank.ml scl_parser.ml reader.ml parser.ml constraints.mli \
constraints.ml multilingual.ml paraphrase.mli paraphrase.ml \
bank_lexer.ml regression.ml checkpoints.ml graph_segmenter.ml \
bank_lexer.ml checkpoints.ml graph_segmenter.ml \
automaton.ml interface.mli interface.ml user_aid.ml reset_caches.ml \
params.mli params.ml
......@@ -70,7 +70,7 @@ TREE = parse_tree.ml parse_apte.ml stemmer.ml tag_tree.ml tag_apte.ml
DEBUG=debug.ml
ENGINE=$(CORE) $(GRAM) $(WEB) $(CORPUS)
INACTIVE=$(TREE) nyaaya.ml mk_nyaaya_page.ml
INACTIVE=$(TREE) nyaaya.ml mk_nyaaya_page.ml regression.ml
SOURCES=$(ENGINE) $(DEBUG) $(INACTIVE)
......@@ -90,8 +90,8 @@ LINK=ocamlopt -I $(ZEN) -I +camlp4 dynlink.cmxa camlp4lib.cmxa
# standard installation of Sanskrit Heritage platform - assumes ZEN library
engine: test_version cgis reset_caches static_pages regression mk_corpus
# reader_plugin parse_apte
engine: test_version cgis reset_caches static_pages mk_corpus
# reader_plugin parse_apte regression
# testing consistency of Heritage_resources and Heritage_platform
test_version: paths.cmx gen.cmx version.cmx control.cmx test_stamp.cmx
......
......@@ -229,14 +229,7 @@ and attested prev root = (* prev is attested preverb sequence for root *)
let pvs = assoc_word root roots_usage in
List.mem prev pvs (* NB attested here means lexicalized entry *)
;
(* Now we retrieve finer discrimination for verbs forms preceded by preverbs.
This is experimental, and incurs too many conversions betweeen strings
and words, suggesting a restructuring of preverbs representation. *)
value preverbs_structure =
try (Gen.gobble Web.public_preverbs_file : Deco.deco Word.word)
(*i should probably be rather : [Deco.deco string] i*)
with [ _ -> failwith "preverbs_structure" ]
;
value gana_o = fun
[ None -> 0 (* arbitrary *)
| Some g -> g (* only used for "tap" *)
......@@ -251,7 +244,7 @@ and voice_o v = fun
value main_preverb pvs = List2.last pvs
;
value main_preverb_string pv =
Canon.decode (main_preverb (assoc_word pv preverbs_structure))
Canon.decode (main_preverb (assoc_word pv Web.preverbs_structure))
;
value attested_verb (o_gana,o_voice) pv root = attested pv root &&
let gana = gana_o o_gana in
......
......@@ -57,6 +57,5 @@ value terminal_sa : output -> bool
;
value color_of_phase : phase -> Html.color
;
end;
......@@ -39,7 +39,7 @@ open Phases; (* phase *)
module Lemmas = Load_morphs.Morphs Prel Phases
;
open Lemmas (* [morpho tags_of] *)
open Lemmas (* [morpho tag_sort tags_of] *)
;
open Load_transducers; (* [transducer_vect Trans] *)
......@@ -58,47 +58,33 @@ and set_offset = Viccheda.set_offset
value un_analyzable (chunk : word) =
([ (Unknown,mirror chunk,Disp.Id) ],Viccheda.finished)
;
value rec color_of_role = fun (* Semantic role of lexical category *)
[ Pv | Pvk | Pvkc | Pvkv | Iic | Iic2 | Iik | Voca | Inv | Iicv | Iicc
| Iikv | Iikc | Iiif | A | An | Vok | Vokv | Vokc | Vocv | Vocc | Iiy
| Iiv | Iivv | Iivc | Peri | Auxiick -> Grey
| Noun | Noun2 | Nouv | Nouc | Krid | Kriv | Kric | Pron | Ifc | Ifc2
| Kama | Lopak | Auxik -> Cyan (* Actor or Predicate *)
| Root | Lopa | Auxi -> Pink (* abs-tvaa in Inde *) (* Process *)
| Abso | Absv | Absc | Inde | Avy | Ai | Ani | Inftu (* Circumstance *)
-> Lavender
| Unknown | Cache -> Grey
| Comp (_,ph) _ _ | Tad (_,ph) _ _ -> color_of_role ph
| Sfx -> Cyan
| Isfx -> Grey
]
;
(* Printing *)
value table_morph_of phase = table_begin (background (color_of_phase phase))
and table_role_of phase = table_begin (background (color_of_role phase))
and table_labels = table_begin (background Pink)
;
value print_morph pvs cached seg_num gen form n tag = do
(* n is the index in the list of tags of an ambiguous form *)
{ ps tr_begin
; ps th_begin
; ps (span_begin Latin12)
{ tr_begin |> ps
; th_begin |> ps
; span_begin Latin12 |> ps
; Morpho_html.print_inflected_link pvs cached form (seg_num,n) gen tag
; ps span_end
; ps th_end
; ps tr_end
; span_end |> ps
; th_end |> ps
; tr_end |> ps
; n+1
}
;
(* generalisation of [print_morph] to taddhitas *)
value print_morph_tad pvs cached seg_num gen stem sfx n tag = do
(* n is the index in the list of tags of an ambiguous form *)
{ ps tr_begin
; ps th_begin
; ps (span_begin Latin12)
{ tr_begin |> ps
; th_begin |> ps
; span_begin Latin12 |> ps
; Morpho_html.print_inflected_link_tad pvs cached stem sfx (seg_num,n) gen tag
; ps span_end
; ps th_end
; ps tr_end
; span_end |> ps
; th_end |> ps
; tr_end |> ps
; n+1
}
;
......@@ -138,21 +124,21 @@ value print_scl_morph pvs gen form tag = do
value print_scl_tags pvs phase form tags =
let table phase =
xml_begin_with_att "tags" [ ("phase",scl_phase phase) ] in do
{ ps (table phase)
{ table phase |> ps
; List.iter (print_scl_morph pvs (generative phase) form) tags
; ps (xml_end "tags")
; xml_end "tags" |> ps
}
;
(* Used in Parser *)
value tags_of = Lemmas.tags_of (* For export to Parser *)
;
value extract_lemma phase word =
match tags_of phase word with
[ Atomic tags -> tags
| Preverbed (_,phase) pvs form tags -> (* tags to be trimmed to [ok_tags] *)
match tags_of phase word with
[ Atomic tags -> tags
| Preverbed (_,phase) pvs form tags -> (* tags to be trimmed to [ok_tags] *)
if pvs = [] then tags
else trim_tags (generative phase) form (Canon.decode pvs) tags
| Taddhita _ _ _ tags -> tags
]
| Taddhita _ _ _ tags -> tags
]
;
(* Returns the offset correction (used by SL interface) *)
value process_transition = fun
......@@ -174,36 +160,36 @@ value print_sfx_tags sfx = fun
]
;
value process_kridanta pvs seg_num phase form tags = do
{ ps th_begin
; pl (table_morph_of phase) (* table begin *)
{ th_begin |> ps
; table_morph_of phase |> pl (* table begin *)
; let ok_tags =
if pvs = [] then tags
else trim_tags (generative phase) form (Canon.decode pvs) tags in do
(* NB Existence of the segment guarantees that [ok_tags] is not empty *)
{ print_tags pvs seg_num phase form ok_tags
; ps table_end (* table end *)
; ps th_end
; (phase,form,ok_tags)
; table_end |> ps (* table end *)
; th_end |> ps
; (phase, form, ok_tags)
}}
;
value process_taddhita pvs seg_num phase stem sfx_phase sfx sfx_tags =
let gen = generative phase
and cached = False in
let ptag = print_morph_tad pvs cached seg_num gen stem sfx in do
{ ps th_begin
; pl (table_morph_of sfx_phase) (* table begin *)
{ th_begin |> ps
; table_morph_of sfx_phase |> pl (* table begin *)
; let _ = List.fold_left ptag 1 sfx_tags in ()
; ps table_end (* table end *)
; ps th_end
; (sfx_phase,sfx,sfx_tags)
; table_end |> ps (* table end *)
; th_end |> ps
; (sfx_phase, sfx, sfx_tags)
}
;
(* Same structure as [Interface.print_morpho] *)
value print_morpho phase word = do
{ pl (table_morph_of phase) (* table begin *)
; ps tr_begin
; ps th_begin
; ps (span_begin Latin12)
{ table_morph_of phase |> pl (* table begin *)
; tr_begin |> ps
; th_begin |> ps
; span_begin Latin12 |> ps
; let _ =
match tags_of phase word with
[ Atomic tags ->
......@@ -219,23 +205,23 @@ value print_morpho phase word = do
| _ -> failwith "Anomaly: taddhita recursion"
]
] in ()
; ps span_end
; ps th_end
; ps tr_end
; ps table_end (* table end *)
; span_end |> ps
; th_end |> ps
; tr_end |> ps
; table_end |> ps (* table end *)
}
;
(* Segment printing with phonetics without semantics for Reader *)
value print_segment offset (phase,rword,transition) = do
{ ps "[ "
{ "[ " |> ps
; Morpho_html.print_signifiant_off rword offset
; print_morpho phase (mirror rword)
(* Now we print the sandhi transition *)
; ps "&lang;" (* < *)
; "&lang;" |> ps (* < *)
; let correction = process_transition transition in do
{ print_transition transition
; pl "&rang;]" (* >] *)
; pl html_break
; "&rang;]" |> pl (* >] *)
; html_break |> pl
; offset+correction+length rword
}
}
......@@ -245,10 +231,10 @@ value print_segment offset (phase,rword,transition) = do
value print_scl_segment counter (phase,rword) =
let word = Morpho_html.visargify rword in do
{ let solid = background (Disp.color_of_phase phase) in
pl (td_begin_class solid)
td_begin_class solid |> pl
; let ic = string_of_int counter in
ps ("<input type=\"hidden\" name=\"field" ^ ic ^ "\" value='<form wx=\""
^ Canon.decode_WX word ^ "\"/>")
"<input type=\"hidden\" name=\"field" ^ ic ^ "\" value='<form wx=\""
^ Canon.decode_WX word ^ "\"/>" |> ps
; match tags_of phase (mirror rword) with
[ Atomic tags ->
print_scl_tags [] phase word tags
......@@ -266,118 +252,13 @@ value print_scl_segment counter (phase,rword) =
and taddhitanta_stem = form @ sfx (* very experimental *) in
print_scl_tags [] taddhitanta_phase taddhitanta_stem sfx_tags
]
; ps "'>" (* closes <input *)
; ps (Canon.unidevcode word)
; ps td_end
; ps "\n"
; "'>" |> ps (* closes <input *)
; Canon.unidevcode word |> ps
; td_end |> ps
; "\n" |> ps
; counter+1
}
;
value print_labels tags seg_num = do
{ ps th_begin (* begin labels *)
; pl table_labels
; let print_label n _ = do
{ ps (cell (html_red (string_of_int seg_num ^ "." ^ string_of_int n)))
; n+1
} in
let _ = List.fold_left print_label 1 tags in ()
; ps table_end
; ps th_end (* end labels *)
}
;
(* syntactico/semantical roles analysis, function of declension *)
value print_roles pr_sem phase tags form = do
{ ps th_begin
; pl (table_role_of phase)
; let pr_roles (delta,sems) = do
{ ps tr_begin
; ps th_begin
; let word = patch delta form in
pr_sem word sems
; ps th_end
; ps tr_end
} in
List.iter pr_roles tags
; ps table_end
; ps th_end
}
;
(* Segment printing without phonetics with semantics for Parser *)
value print_segment_roles print_sems seg_num (phase,rword,_) =
let word = mirror rword in do
{ Morpho_html.print_signifiant_yellow rword
; let (decl_phase,form,decl_tags) = match tags_of phase word with
[ Atomic tags ->
process_kridanta [] seg_num phase word tags
| Preverbed (_,phase) pvs form tags ->
process_kridanta pvs seg_num phase form tags
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match tags_of ph form with
[ Atomic _ -> (* stem, tagged as iic *)
process_taddhita [] seg_num ph form sfx_phase sfx sfx_tags
| Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
process_taddhita pvs seg_num ph form sfx_phase sfx sfx_tags
| _ -> failwith "taddhita recursion unavailable"
]
] in do
{ print_labels decl_tags seg_num
; print_roles print_sems decl_phase decl_tags form
}
}
;
value project n list = List.nth list (n-1) (* Ocaml's nth starts at 0 *)
;
value print_unitag pvs phase word multitags (n,m) =
let (delta,polytag) = project n multitags in
let unitag = [ project m polytag ] in do
{ ps th_begin
; pl (table_morph_of phase) (* table of color of phase begins *)
; let _ = (* print unique tagging *)
print_morph pvs False 0 (generative phase) word 0 (delta,unitag) in ()
; ps table_end (* table of color of phase ends *)
; ps th_end
}
;
value print_uni_taddhita pvs m phase stem sfx sfx_phase = fun
[ [ (delta,polytag) ] -> (* we assume n=1 taddhita form unambiguous *)
let unitag = [ project m polytag ]
and gen = generative phase
and cached = False in do
{ ps th_begin
; pl (table_morph_of sfx_phase) (* table begin *)
; let _ = print_morph_tad pvs cached 0 gen stem sfx 0 (delta,unitag) in ()
; ps table_end (* table end *)
; ps th_end
}
| _ -> failwith "Multiple sfx tag"
]
;
value print_projection phase rword ((_,m) as index) = do
{ ps tr_begin (* tr begins *)
; Morpho_html.print_signifiant_yellow rword
; let word = mirror rword in
match tags_of phase word with
[ Atomic tags -> print_unitag [] phase word tags index
| Preverbed (_,phase) pvs form tags -> print_unitag pvs phase form tags index
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match tags_of ph form with
[ Atomic _ -> print_uni_taddhita [] m phase form sfx sfx_phase sfx_tags
| Preverbed _ pvs _ _ ->
print_uni_taddhita pvs m phase form sfx sfx_phase sfx_tags
| _ -> failwith "taddhita recursion unavailable"
]
]
; ps tr_end (* tr ends *)
}
;
value print_proj phase rword = fun
[ [] -> failwith "Projection missing"
| [ n_m :: rest ] -> do
{ print_projection phase rword n_m
; rest (* returns the rest of projections stream *)
}
]
;
module Report_chan = struct
value chan = Control.out_chan; (* where to report *)
......@@ -385,49 +266,6 @@ end;
module Morpho_out = Morpho.Morpho_out Report_chan;
(* Recording of selected solution - used only in Regression *)
value record_tagging unsandhied mode_sent mode_trans all sentence output proj =
let report = output_string Control.out_chan.val in
let print_proj1 phase rword proj prevs = do
(* adapted from [print_proj] *)
{ report "${"
; let form = mirror rword in do
{ report (decode form)
; let res = match proj with
[ [] -> failwith "Projection missing"
| [ (n,m) :: rest ] ->
let gen = generative phase in
let polytag = extract_lemma phase form in
let (delta,tags) = project n polytag in
let tagging = [ project m tags ] in do
{ report ":"
; report (string_of_phase phase ^ "")
; Morpho_out.report_morph gen form (delta,tagging)
; (rest,[]) (* returns the rest of projections stream *)
}
] in
do { report "}$&"; res }
}
} in do
{ report (if Control.full.val then "[{C}] " else "[{S}] ")
; report (if unsandhied then "<{F}> " else "<{T}> ")
; report (if mode_sent then "|{Sent}| " else "|{Word}| ")
; report ("#{" ^ mode_trans ^ "}# ")
; report ("({" ^ sentence ^ "})")
; report (" [" ^ (string_of_int all) ^ "] ")
; let rec pp (proj,prevs) = fun
[ [] -> match proj with
[ [] -> () (* finished, projections exhausted *)
| _ -> failwith "Too many projections"
]
| [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
let proj_prevs = print_proj1 phase rword proj prevs in
pp proj_prevs rest
] in pp (proj,[]) output
; report "\n"
; close_out Report_chan.chan.val
}
;
(* Structured entries with generative morphology *)
type gen_morph =
[ Gen_krid of ((string * word) * (verbal * word))
......@@ -483,33 +321,6 @@ value get_morph gen phase form (delta,morphs) =
let stem = patch delta form in (* stem may have homo index *)
(form, lex_cat phase, generative_stem gen stem, morphs)
;
value return_tagging output projs = (* Used only in Regression *)
let get_tags phase rword projs = (* adapted from [print_proj] *)
let form = mirror rword in
match tags_of phase form with
[ Atomic polytag -> match projs with
[ [] -> failwith "Projection missing"
| [ (n,m) :: rest ] ->
let gen = generative phase in
let (delta,tags) = project n polytag in
let tagging = [ project m tags ] in
let entry = get_morph gen phase form (delta,tagging) in
(rest, lex_cat phase, entry)
]
| _ -> failwith "Not implemented yet" (*i TODO for Regression
[ (projs, lex_cat Pv, (form, lex_cat Pv, Preverbs_list prevs, []))] i*)
] in
let rec taggings accu projs = fun
[ [] -> match projs with
[ [] -> accu
| _ -> failwith "Too many projections"
]
| [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
let (new_projs,phase,tags) = get_tags phase rword projs in
taggings [ tags :: accu ] new_projs rest
] in
return_morph (List.rev (taggings [] projs output))
;
end;
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Sanskrit Phrase Lexer *)
......@@ -44,18 +44,23 @@ module Lexer : functor (* takes its prelude and iterator control as parameters *
value extract_lemma : Phases.phase -> Word.word -> list lemma;
value print_segment : int -> Disp.segment -> int;
value print_segment_roles : (Word.word -> inflexions -> unit)
-> int -> Disp.segment -> unit;
value print_proj : Phases.phase -> Word.word ->
list (int * int) -> list (int * int);
(* Exported for Parser *)
value process_kridanta: Word.word -> int -> Phases.phase -> Word.word ->
Morphology.multitag -> (Phases.phase * Word.word * Morphology.multitag);
value process_taddhita: Word.word -> int -> Phases.phase -> Word.word ->
Phases.phase -> Word.word -> Morphology.multitag ->
(Phases.phase * Word.word * Morphology.multitag);
value table_morph_of : Phases.phase -> string;
value print_morph : Word.word -> bool -> int -> bool -> Word.word -> int ->
Morphology.unitag -> int;
value print_morph_tad : Word.word -> bool -> int -> bool -> Word.word ->
Word.word -> int -> Morphology.unitag -> int;
(* END Exported for Parser *)
value all_checks : ref (list Viccheda.check);
value un_analyzable : Word.word -> (list Disp.segment * Viccheda.resumption);
value set_offset : (int * list Viccheda.check) -> unit;
value print_scl_segment : int -> (Phases.phase * Word.word) -> int;
value record_tagging : bool -> bool -> string -> int -> string ->
list (Phases.phase * Word.word * 'a) -> list (int * int) -> unit;
value return_tagging :
list (Phases.phase * Word.word * 'a) -> list (int * int) -> string;
value tags_of : Phases.phase -> Word.word ->
(Load_morphs.Morphs Prel Phases).tag_sort; (* ugly *)
end;
......@@ -12,7 +12,8 @@
open Morphology; (* lemmas *)
module Morphs (* takes its prelude and control arguments as parameters *)
(* Morph functor takes its prelude and control arguments as parameters *)
module Morphs
(Prel: sig value prelude : unit -> unit; end)
(Phases: sig type phase = (* Phases.phase *)
[ Noun | Noun2
......@@ -48,33 +49,31 @@ module Morphs (* takes its prelude and control arguments as parameters *)
open Phases (* phase *)
;
(* Somewhat weird classification of segments accoding to their construction
(* Somewhat weird classification of segments according to their construction
by Dispatcher. Preverbed segments may be finite verb forms or kridantas. *)
type tag_sort =
[ Atomic of lemmas
| Preverbed of (phase * phase) and (* pv *) Word.word and Word.word and lemmas
| Taddhita of (phase * Word.word) and (* sfx *) Word.word and phase and lemmas
]
;
;
(* Fake tags of nan prefixes *)
value nan_prefix = Bare_stem
;
value a_tag = [ ((0,[]),[ nan_prefix ]) ]
and an_tag = [ ((0,[ 51 ]),[ nan_prefix ]) ] (* since lexicalized as an\#1 *)
(* [an_tag] has delta = (0,[51]) since an\#1 is the relevant entry. Such values
will have to be parameters of the specific lexicon used. *)(*i TODO i*)
ought to be parameters of the specific lexicon used. *)(*i TODO i*)
;
value ai_tag = a_tag (* special for privative abs-tvaa eg akritvaa *)
and ani_tag = an_tag
;
value unknown_tag = [ ((0,[]),[ Unanalysed ]) ]
;
;
value give_up cat =
let mess = "Missing " ^ cat ^ " morphology bank" in do
{ Web.abort Html.default_language
"System error - please report - " mess
(* ; exit 0 (* installation problem -- executing process fails *) *)
; Deco.empty
}
;
......@@ -158,7 +157,7 @@ value morpho_tags = fun
| _ -> raise (Control.Anomaly "morpho_tags")
]
;
(* Used in Lexer, Reader, Parser, Interface *)
(* Used in Lexer/Reader/Parser and Interface *)
value tags_of phase word =
match phase with
[ Pv | Pvk | Pvkc | Pvkv -> failwith "Preverb in tags_of"
......@@ -179,6 +178,6 @@ value tags_of phase word =
(* NB Atomic comprises tin verbal forms of roots as well as sup atomic forms
and all the pure stems collections Iic Iiv etc. *)
]
;
;
end;
......@@ -73,8 +73,10 @@ value print_inv_morpho pe pne pu form (seg_num,sub) generative (delta,morphs) =
let krit_infos = Deco.assoc bare_stem unique_kridantas in
try let (verbal,root) = look_up_homo homo krit_infos in do
{ match Deco.assoc bare_stem lexical_kridantas with
[ [] (* not in lexicon *) -> pne bare_stem
| entries (* bare stem is lexicalized *) ->
[ [] (* not in lexicon *) ->
if stem = [ 3; 32; 1 ] (* ita ifc *) then pe stem
else pne bare_stem
| entries (* bare stem is lexicalized *) ->
if List.exists (fun (_,h) -> h=homo) entries
then pe stem (* stem with exact homo is lexical entry *)
else pne bare_stem
......@@ -88,13 +90,21 @@ value print_inv_morpho pe pne pu form (seg_num,sub) generative (delta,morphs) =
; ps "]"
}
;
(* Decomposes a preverb sequence into the list of its components *)
value decomp_pvs pvs =
Deco.assoc pvs Web.preverbs_structure
;
(* Used in [Morpho_html] *)
value print_inv_morpho_link pvs pe pne pu form =
let pv = if Phonetics.phantomatic form then [ 2 ] (* aa- *)
else pvs in
let encaps print e = (* encapsulates prefixing with possible preverbs *)
if pv = [] then print e else do { pe pvs; ps "-"; print e } in
print_inv_morpho (encaps pe) (encaps pne) pu form
if pv = [] then print e else
let pr_pv pv = do { pe pv; ps "-" } in do
{ List.iter pr_pv (decomp_pvs pvs)
; print e
} in
print_inv_morpho (encaps pe) (encaps pne) pu form
(* Possible overgeneration when derivative of a root non attested with pv
since only existential test in [Dispatcher.validate_pv]. Thus