Commit a2111408 authored by Gérard Huet's avatar Gérard Huet

Fix in morphology printing with preverbs explicit

parent 049c3c57
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -46,7 +46,7 @@ GRAM = index.ml phonetics.ml int_sandhi.ml skt_morph.mli morphology.mli \
dispatcher.mli dispatcher.ml segmenter.ml load_morphs.ml lexer.mli \
lexer.ml rank.ml scl_parser.ml reader.ml parser.ml constraints.mli \
constraints.ml multilingual.ml paraphrase.mli paraphrase.ml \
bank_lexer.ml regression.ml checkpoints.ml graph_segmenter.ml \
bank_lexer.ml checkpoints.ml graph_segmenter.ml \
automaton.ml interface.mli interface.ml user_aid.ml reset_caches.ml \
params.mli params.ml
......@@ -70,7 +70,7 @@ TREE = parse_tree.ml parse_apte.ml stemmer.ml tag_tree.ml tag_apte.ml
DEBUG=debug.ml
ENGINE=$(CORE) $(GRAM) $(WEB) $(CORPUS)
INACTIVE=$(TREE) nyaaya.ml mk_nyaaya_page.ml
INACTIVE=$(TREE) nyaaya.ml mk_nyaaya_page.ml regression.ml
SOURCES=$(ENGINE) $(DEBUG) $(INACTIVE)
......@@ -90,8 +90,8 @@ LINK=ocamlopt -I $(ZEN) -I +camlp4 dynlink.cmxa camlp4lib.cmxa
# standard installation of Sanskrit Heritage platform - assumes ZEN library
engine: test_version cgis reset_caches static_pages regression mk_corpus
# reader_plugin parse_apte
engine: test_version cgis reset_caches static_pages mk_corpus
# reader_plugin parse_apte regression
# testing consistency of Heritage_resources and Heritage_platform
test_version: paths.cmx gen.cmx version.cmx control.cmx test_stamp.cmx
......
......@@ -229,14 +229,7 @@ and attested prev root = (* prev is attested preverb sequence for root *)
let pvs = assoc_word root roots_usage in
List.mem prev pvs (* NB attested here means lexicalized entry *)
;
(* Now we retrieve finer discrimination for verbs forms preceded by preverbs.
This is experimental, and incurs too many conversions betweeen strings
and words, suggesting a restructuring of preverbs representation. *)
value preverbs_structure =
try (Gen.gobble Web.public_preverbs_file : Deco.deco Word.word)
(*i should probably be rather : [Deco.deco string] i*)
with [ _ -> failwith "preverbs_structure" ]
;
value gana_o = fun
[ None -> 0 (* arbitrary *)
| Some g -> g (* only used for "tap" *)
......@@ -251,7 +244,7 @@ and voice_o v = fun
value main_preverb pvs = List2.last pvs
;
value main_preverb_string pv =
Canon.decode (main_preverb (assoc_word pv preverbs_structure))
Canon.decode (main_preverb (assoc_word pv Web.preverbs_structure))
;
value attested_verb (o_gana,o_voice) pv root = attested pv root &&
let gana = gana_o o_gana in
......
......@@ -57,6 +57,5 @@ value terminal_sa : output -> bool
;
value color_of_phase : phase -> Html.color
;
end;
This diff is collapsed.
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Sanskrit Phrase Lexer *)
......@@ -44,18 +44,23 @@ module Lexer : functor (* takes its prelude and iterator control as parameters *
value extract_lemma : Phases.phase -> Word.word -> list lemma;
value print_segment : int -> Disp.segment -> int;
value print_segment_roles : (Word.word -> inflexions -> unit)
-> int -> Disp.segment -> unit;
value print_proj : Phases.phase -> Word.word ->
list (int * int) -> list (int * int);
(* Exported for Parser *)
value process_kridanta: Word.word -> int -> Phases.phase -> Word.word ->
Morphology.multitag -> (Phases.phase * Word.word * Morphology.multitag);
value process_taddhita: Word.word -> int -> Phases.phase -> Word.word ->
Phases.phase -> Word.word -> Morphology.multitag ->
(Phases.phase * Word.word * Morphology.multitag);
value table_morph_of : Phases.phase -> string;
value print_morph : Word.word -> bool -> int -> bool -> Word.word -> int ->
Morphology.unitag -> int;
value print_morph_tad : Word.word -> bool -> int -> bool -> Word.word ->
Word.word -> int -> Morphology.unitag -> int;
(* END Exported for Parser *)
value all_checks : ref (list Viccheda.check);
value un_analyzable : Word.word -> (list Disp.segment * Viccheda.resumption);
value set_offset : (int * list Viccheda.check) -> unit;
value print_scl_segment : int -> (Phases.phase * Word.word) -> int;
value record_tagging : bool -> bool -> string -> int -> string ->
list (Phases.phase * Word.word * 'a) -> list (int * int) -> unit;
value return_tagging :
list (Phases.phase * Word.word * 'a) -> list (int * int) -> string;
value tags_of : Phases.phase -> Word.word ->
(Load_morphs.Morphs Prel Phases).tag_sort; (* ugly *)
end;
......@@ -12,7 +12,8 @@
open Morphology; (* lemmas *)
module Morphs (* takes its prelude and control arguments as parameters *)
(* Morph functor takes its prelude and control arguments as parameters *)
module Morphs
(Prel: sig value prelude : unit -> unit; end)
(Phases: sig type phase = (* Phases.phase *)
[ Noun | Noun2
......@@ -48,33 +49,31 @@ module Morphs (* takes its prelude and control arguments as parameters *)
open Phases (* phase *)
;
(* Somewhat weird classification of segments accoding to their construction
(* Somewhat weird classification of segments according to their construction
by Dispatcher. Preverbed segments may be finite verb forms or kridantas. *)
type tag_sort =
[ Atomic of lemmas
| Preverbed of (phase * phase) and (* pv *) Word.word and Word.word and lemmas
| Taddhita of (phase * Word.word) and (* sfx *) Word.word and phase and lemmas
]
;
;
(* Fake tags of nan prefixes *)
value nan_prefix = Bare_stem
;
value a_tag = [ ((0,[]),[ nan_prefix ]) ]
and an_tag = [ ((0,[ 51 ]),[ nan_prefix ]) ] (* since lexicalized as an\#1 *)
(* [an_tag] has delta = (0,[51]) since an\#1 is the relevant entry. Such values
will have to be parameters of the specific lexicon used. *)(*i TODO i*)
ought to be parameters of the specific lexicon used. *)(*i TODO i*)
;
value ai_tag = a_tag (* special for privative abs-tvaa eg akritvaa *)
and ani_tag = an_tag
;
value unknown_tag = [ ((0,[]),[ Unanalysed ]) ]
;
;
value give_up cat =
let mess = "Missing " ^ cat ^ " morphology bank" in do
{ Web.abort Html.default_language
"System error - please report - " mess
(* ; exit 0 (* installation problem -- executing process fails *) *)
; Deco.empty
}
;
......@@ -158,7 +157,7 @@ value morpho_tags = fun
| _ -> raise (Control.Anomaly "morpho_tags")
]
;
(* Used in Lexer, Reader, Parser, Interface *)
(* Used in Lexer/Reader/Parser and Interface *)
value tags_of phase word =
match phase with
[ Pv | Pvk | Pvkc | Pvkv -> failwith "Preverb in tags_of"
......@@ -179,6 +178,6 @@ value tags_of phase word =
(* NB Atomic comprises tin verbal forms of roots as well as sup atomic forms
and all the pure stems collections Iic Iiv etc. *)
]
;
;
end;
......@@ -73,8 +73,10 @@ value print_inv_morpho pe pne pu form (seg_num,sub) generative (delta,morphs) =
let krit_infos = Deco.assoc bare_stem unique_kridantas in
try let (verbal,root) = look_up_homo homo krit_infos in do
{ match Deco.assoc bare_stem lexical_kridantas with
[ [] (* not in lexicon *) -> pne bare_stem
| entries (* bare stem is lexicalized *) ->
[ [] (* not in lexicon *) ->
if stem = [ 3; 32; 1 ] (* ita ifc *) then pe stem
else pne bare_stem
| entries (* bare stem is lexicalized *) ->
if List.exists (fun (_,h) -> h=homo) entries
then pe stem (* stem with exact homo is lexical entry *)
else pne bare_stem
......@@ -88,13 +90,21 @@ value print_inv_morpho pe pne pu form (seg_num,sub) generative (delta,morphs) =
; ps "]"
}
;
(* Decomposes a preverb sequence into the list of its components *)
value decomp_pvs pvs =
Deco.assoc pvs Web.preverbs_structure
;
(* Used in [Morpho_html] *)
value print_inv_morpho_link pvs pe pne pu form =
let pv = if Phonetics.phantomatic form then [ 2 ] (* aa- *)
else pvs in
let encaps print e = (* encapsulates prefixing with possible preverbs *)
if pv = [] then print e else do { pe pvs; ps "-"; print e } in
print_inv_morpho (encaps pe) (encaps pne) pu form
if pv = [] then print e else
let pr_pv pv = do { pe pv; ps "-" } in do
{ List.iter pr_pv (decomp_pvs pvs)
; print e
} in
print_inv_morpho (encaps pe) (encaps pne) pu form
(* Possible overgeneration when derivative of a root non attested with pv
since only existential test in [Dispatcher.validate_pv]. Thus
[anusandhiiyate] should show [dhaa#1], not [dhaa#2], [dhii#1] or [dhyaa] *)
......
......@@ -38,7 +38,8 @@ type inflected_map = Lexmap.lexmap inflexions
and lemma = Lexmap.inverse inflexions
and lemmas = list lemma
;
type multitag = list (Word.delta * inflexions)
type unitag = (Word.delta * inflexions)
and multitag = list unitag
;
type morphology =
{ nouns : inflected_map
......
......@@ -20,6 +20,7 @@ open Html;
open Web; (* ps pl abort truncation etc. [remote_server_host] *)
open Cgi; (* get *)
open Checkpoints;
open Phases.Phases; (* generative *)
open Scl_parser; (* Interface with UoH dependency parser *)
module Prel = struct (* Parser's lexer prelude *)
......@@ -52,6 +53,135 @@ end (* [Lexer_control] *)
module Lex = Lexer.Lexer Prel Lexer_control
(* [print_proj print_segment_roles print_ext_segment extract_lemma] *)
;
(* Printing functions *)
value table_labels = table_begin (background Pink)
;
value print_labels tags seg_num = do
{ ps th_begin (* begin labels *)
; pl table_labels
; let print_label n _ = do
{ ps (cell (html_red (string_of_int seg_num ^ "." ^ string_of_int n)))
; n+1
} in
let _ = List.fold_left print_label 1 tags in ()
; ps table_end
; ps th_end (* end labels *)
}
;
value rec color_of_role = fun (* Semantic role of lexical category *)
[ Pv | Pvk | Pvkc | Pvkv | Iic | Iic2 | Iik | Voca | Inv | Iicv | Iicc
| Iikv | Iikc | Iiif | A | An | Vok | Vokv | Vokc | Vocv | Vocc | Iiy
| Iiv | Iivv | Iivc | Peri | Auxiick -> Grey
| Noun | Noun2 | Nouv | Nouc | Krid | Kriv | Kric | Pron | Ifc | Ifc2
| Kama | Lopak | Auxik -> Cyan (* Actor or Predicate *)
| Root | Lopa | Auxi -> Pink (* abs-tvaa in Inde *) (* Process *)
| Abso | Absv | Absc | Inde | Avy | Ai | Ani | Inftu (* Circumstance *)
-> Lavender
| Unknown | Cache -> Grey
| Comp (_,ph) _ _ | Tad (_,ph) _ _ -> color_of_role ph
| Sfx -> Cyan
| Isfx -> Grey
]
and table_role_of phase = table_begin (background (color_of_role phase))
;
(* syntactico/semantical roles analysis, function of declension *)
value print_roles pr_sem phase tags form = do
{ ps th_begin
; pl (table_role_of phase)
; let pr_roles (delta,sems) = do
{ ps tr_begin
; ps th_begin
; let word = Word.patch delta form in
pr_sem word sems
; ps th_end
; ps tr_end
} in
List.iter pr_roles tags
; ps table_end
; ps th_end
}
;
(* Segment printing without phonetics with semantics for Parser *)
value print_segment_roles print_sems seg_num (phase,rword,_) =
let word = Word.mirror rword in do
{ Morpho_html.print_signifiant_yellow rword
; let (decl_phase,form,decl_tags) = match Lex.tags_of phase word with
[ Atomic tags ->
Lex.process_kridanta [] seg_num phase word tags
| Preverbed (_,phase) pvs form tags ->
Lex.process_kridanta pvs seg_num phase form tags
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match Lex.tags_of ph form with
[ Atomic _ -> (* stem, tagged as iic *)
Lex.process_taddhita [] seg_num ph form sfx_phase sfx sfx_tags
| Preverbed _ pvs _ _ -> (* stem, tagged as iic *)
Lex.process_taddhita pvs seg_num ph form sfx_phase sfx sfx_tags
| _ -> failwith "taddhita recursion unavailable"
]
] in do
{ print_labels decl_tags seg_num
; print_roles print_sems decl_phase decl_tags form
}
}
;
value project n list = List.nth list (n-1) (* Ocaml's nth starts at 0 *)
;
value print_uni_kridanta pvs phase word multitags (n,m) =
let (delta,polytag) = project n multitags in
let unitag = [ project m polytag ] in do
{ ps th_begin
; pl (Lex.table_morph_of phase) (* table of color of phase begins *)
; let _ = (* print unique tagging *)
Lex.print_morph pvs False 0 (generative phase) word 0 (delta,unitag) in ()
; ps table_end (* table of color of phase ends *)
; ps th_end
}
;
value print_uni_taddhita pvs m phase stem sfx sfx_phase = fun
[ [ (delta,polytag) ] -> (* we assume n=1 taddhita form unambiguous *)
let unitag = [ project m polytag ]
and gen = generative phase
and cached = False in do
{ ps th_begin
; pl (Lex.table_morph_of sfx_phase) (* table begin *)
; let _ = Lex.print_morph_tad pvs cached 0 gen stem sfx 0 (delta,unitag) in ()
; ps table_end (* table end *)
; ps th_end
}
| _ -> failwith "Multiple sfx tag"
]
;
value print_projection phase rword ((_,m) as index) = do
{ ps tr_begin (* tr begins *)
; Morpho_html.print_signifiant_yellow rword
; let word = Word.mirror rword in
match Lex.tags_of phase word with
[ Atomic tags -> print_uni_kridanta [] phase word tags index
| Preverbed (_,phase) pvs form tags ->
print_uni_kridanta pvs phase form tags index
| Taddhita (ph,form) sfx sfx_phase sfx_tags ->
match Lex.tags_of ph form with
[ Atomic _ -> print_uni_taddhita [] m phase form sfx sfx_phase sfx_tags
| Preverbed _ pvs _ _ ->
print_uni_taddhita pvs m phase form sfx sfx_phase sfx_tags
| _ -> failwith "taddhita recursion unavailable"
]
]
; ps tr_end (* tr ends *)
}
;
value print_proj phase rword = fun
[ [] -> failwith "Projection missing"
| [ n_m :: rest ] -> do
{ print_projection phase rword n_m
; rest (* returns the rest of projections stream *)
}
]
;
(* End Printing functions *)
value rpc = remote_server_host
and remote = ref False (* local invocation of cgi by default *)
;
......@@ -144,7 +274,7 @@ value print_sems word morphs = do
value print_out seg_num segment = do
(* Contrarily to Reader, we discard phonetic information. *)
{ tr_begin |> ps
; Lex.print_segment_roles print_sems seg_num segment
; print_segment_roles print_sems seg_num segment
; tr_end |> ps
; seg_num+1
}
......@@ -155,7 +285,7 @@ value rec print_project proj = fun
| _ -> failwith "Too many projections"
]
| [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
let new_proj = Lex.print_proj phase rword proj in
let new_proj = print_proj phase rword proj in
print_project new_proj rest
]
;
......
......@@ -190,4 +190,5 @@ value rec generative = fun
| _ -> False
]
;
end; (* Phases *)
......@@ -18,7 +18,7 @@
open Constraints;
(* [roles_of extract sort_flatten truncate_groups eval_penalty] *)
open Morphology; (* [tag_sort] *)
open Morphology;
module Prel = struct
value prelude () = Web.reader_prelude Web.reader_title;
......
......@@ -120,6 +120,77 @@ value check_tags current_sol_string tagging =
let oc = parse_phase (String.sub current_sol_string 0 pos) in
oc = tagging
;
(* OBS Used to be in Lexer - to be adapted
value return_tagging output projs =
let get_tags phase rword projs = (* adapted from [print_proj] *)
let form = mirror rword in
match tags_of phase form with
[ Atomic polytag -> match projs with
[ [] -> failwith "Projection missing"
| [ (n,m) :: rest ] ->
let gen = generative phase in
let (delta,tags) = project n polytag in
let tagging = [ project m tags ] in
let entry = get_morph gen phase form (delta,tagging) in
(rest, lex_cat phase, entry)
]
| _ -> failwith "Not implemented yet" (*i TODO for Regression
[ (projs, lex_cat Pv, (form, lex_cat Pv, Preverbs_list prevs, []))] i*)
] in
let rec taggings accu projs = fun
[ [] -> match projs with
[ [] -> accu
| _ -> failwith "Too many projections"
]
| [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
let (new_projs,phase,tags) = get_tags phase rword projs in
taggings [ tags :: accu ] new_projs rest
] in
return_morph (List.rev (taggings [] projs output))
;
value record_tagging unsandhied mode_sent mode_trans all sentence output proj =
let report = output_string Control.out_chan.val in
let print_proj1 phase rword proj prevs = do
(* adapted from [print_proj] *)
{ report "${"
; let form = mirror rword in do
{ report (decode form)
; let res = match proj with
[ [] -> failwith "Projection missing"
| [ (n,m) :: rest ] ->
let gen = generative phase in
let polytag = extract_lemma phase form in
let (delta,tags) = project n polytag in
let tagging = [ project m tags ] in do
{ report ":"
; report (string_of_phase phase ^ "")
; Morpho_out.report_morph gen form (delta,tagging)
; (rest,[]) (* returns the rest of projections stream *)
}
] in
do { report "}$&"; res }
}
} in do
{ report (if Control.full.val then "[{C}] " else "[{S}] ")
; report (if unsandhied then "<{F}> " else "<{T}> ")
; report (if mode_sent then "|{Sent}| " else "|{Word}| ")
; report ("#{" ^ mode_trans ^ "}# ")
; report ("({" ^ sentence ^ "})")
; report (" [" ^ (string_of_int all) ^ "] ")
; let rec pp (proj,prevs) = fun
[ [] -> match proj with
[ [] -> () (* finished, projections exhausted *)
| _ -> failwith "Too many projections"
]
| [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
let proj_prevs = print_proj1 phase rword proj prevs in
pp proj_prevs rest
] in pp (proj,[]) output
; report "\n"
; close_out Report_chan.chan.val
}
;
OBS *)
value look_up_tags solution output tagging sol =
let proj = List.fold_left extract "" sol in
let p = parse_proj proj in
......
......@@ -8,4 +8,4 @@
(**************************************************************************)
(* Generated by make version - see main Makefile *)
value version="3.06" and version_date="2018-05-01";
value version="3.07" and version_date="2018-05-12";
......@@ -75,9 +75,10 @@ and public_data_dir = top_site_dir "DATA" (* linguistic data for cgis *)
and var_dir = top_site_dir "VAR" (* Parser dynamic regression suites *)
and corpus_dir = top_site_dir "CORPUS" (* Corpus tree *)
;
(* This file is accessible only from Station clients in [var_dir] *)
(* Regression deprecated
[(* This file is accessible only from Station clients in [var_dir] *)
value regression_file_name = "regression" (* regression analysis stuff *)
;
; ]*)
value data name = data_dir ^ name
and dico_page name = dico_dir ^ name
......@@ -279,6 +280,15 @@ and public_sandhis_id_file = public_data "sandhis_id.rem"
and public_cache_file = public_data "cache.rem"
and public_cache_txt_file = public_data "cache.txt"
;
(* Here we retrieve finer discrimination for verbs forms preceded by preverbs.
This is experimental, and incurs too many conversions betweeen strings
and words, suggesting a restructuring of preverbs representation. *)
(* This structure is in Web which acts as a common gloabal namespace for the
various runtimes Lexer/Reader/Parser and Interface *)
value preverbs_structure = (* Used in Morpho for display of pvs *)
try (Gen.gobble public_preverbs_file : Deco.deco Word.word)
with [ _ -> failwith "preverbs_structure" ]
;
value skt_dir_url = Paths.skt_dir_url
;
(* Relative paths of top directory of site and sub directories *)
......
VERSION='3.06'
DATE='2018-05-01'
VERSION='3.07'
DATE='2018-05-12'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment