Commit ff86ab64 authored by Gérard Huet

Notation ! for mandatory break, useful for vocatives

parent 6a68ee11
......@@ -115,28 +115,23 @@ value sentence_links dir permission sentences =
match font with
[ Multilingual.Deva -> Corpus.Encoding.Devanagari
| Multilingual.Roma -> Corpus.Encoding.IAST
]
in
] in
let text = Corpus.Sentence.text encoding sentence in
let display =
match font with
[ Multilingual.Deva -> deva16_blue
| Multilingual.Roma -> span Trans16
]
in
] in
text
|> anchor_ref (sentence |> Web_corpus.url dir permission |> escape)
|> display
in
|> display in
List.map to_anchor_ref sentences
;
value section_selection dir sections =
let options =
let prefixes =
List.map (fun x -> Filename.concat dir x) sections
in
List.combine prefixes sections
in
List.map (fun x -> Filename.concat dir x) sections in
List.combine prefixes sections in
option_select_label Params.corpus_dir options
;
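(* A minimal usage sketch - the directory and section names are hypothetical:
   [section_selection "heritage" ["citations"; "epics"]] combines the prefixed paths with the
   bare names into [ ("heritage/citations","citations"); ("heritage/epics","epics") ], which
   [option_select_label] presumably renders as the (value,label) pairs of a select menu named
   after [Params.corpus_dir]. *)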
value add_sentence_form dir permission gap =
......
......@@ -4,7 +4,7 @@
(* *)
(* Idir Lankri *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* CGI script [manager] for corpus management, i.e. for listing and
......@@ -12,16 +12,14 @@
value main =
let env = Cgi.create_env (Cgi.query_string ()) in
let corpdir = Cgi.decoded_get Params.corpus_dir "" env in
let permission =
Web_corpus.permission_of_string (Cgi.decoded_get Params.corpus_permission "" env)
in
let corpdir = Cgi.decoded_get Params.corpus_dir "" env
and corpperm = Cgi.decoded_get Params.corpus_permission "" env in
let permission = Web_corpus.permission_of_string corpperm in
let lang = Html.default_language in
try
Corpus_manager.mk_page corpdir permission
with
[ Sys_error msg -> Web.abort Html.default_language Control.sys_err_mess msg
| _ ->
Web.abort Html.default_language Control.fatal_err_mess
"Unexpected anomaly"
[ Sys_error msg -> Web.abort lang Control.sys_err_mess msg
| _ -> Web.abort lang Control.fatal_err_mess "Unexpected anomaly"
]
;
......@@ -38,7 +38,7 @@ value xml_next op = xml_end op ^ xml_begin op
value html_break = xml_empty "br"
and html_paragraph = xml_begin "p" ^ xml_end "p"
;
(* array operations *)
(* Array operations *)
value tr_begin = xml_begin "tr"
and tr_end = xml_end "tr"
and th_begin = xml_begin "th"
......@@ -124,13 +124,11 @@ value int_input ?id ?val ?(step = 1) ?(min = min_int) ?(max = max_int) name =
; ("step", string_of_int step)
; ("min", string_of_int min)
; ("max", string_of_int max)
]
in
] in
let opt_attrs =
[ ("id", id)
; ("value", Gen.opt_app string_of_int val)
]
in
] in
let attrs = add_opt_attrs opt_attrs attrs in
xml_empty_with_att "input" attrs
;
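(* A minimal usage sketch (hypothetical call): [int_input ~val:3 ~min:1 ~max:10 "count"] keeps
   the default step "1", sets min "1" and max "10", and lets [add_opt_attrs] append value "3"
   while dropping the absent "id"; omitting [?val] and [?id] leaves only the mandatory
   attributes. *)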
......@@ -172,24 +170,16 @@ value li ?id item =
(* Ordered list *)
value ol ?id ?li_id_prefix ?(start = 1) items =
let ol = "ol" in
let items =
List.mapi (fun i item ->
let id =
let genid prefix = prefix ^ string_of_int (start + i) in
Gen.opt_app genid li_id_prefix
in
li ?id item
) items
in
let list = String.concat "\n" items in
let process i item =
let id = let genid prefix = prefix ^ string_of_int (start + i) in
Gen.opt_app genid li_id_prefix in
li ?id item in
let lines = List.mapi process items in
let list = String.concat "\n" lines in
let attrs =
add_opt_attrs [ ("id", id) ] [ ("start", string_of_int start) ]
in
xml_begin_with_att ol attrs ^ "\n" ^
list ^ "\n" ^
xml_end ol
add_opt_attrs [ ("id", id) ] [ ("start", string_of_int start) ] in
xml_begin_with_att ol attrs ^ "\n" ^ list ^ "\n" ^ xml_end ol
;
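(* A minimal usage sketch (hypothetical call): [ol ~li_id_prefix:"step" ~start:3 ["a"; "b"]]
   yields roughly <ol start="3"> followed by one item per line with generated ids "step3" and
   "step4" (assuming [li] emits <li id=...>), then </ol>; the outer "id" attribute appears only
   when [?id] is supplied. *)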
value fieldn name content = [ ("name",name); ("content",content) ]
and fieldp name content = [ ("property",name); ("content",content) ]
;
......@@ -203,22 +193,6 @@ type color =
| Gold | Magenta | Mauve | Pink | Salmon | Lime | Light_blue | Lavender
| Lawngreen | Deep_pink | Pale_yellow | Pale_rose | Beige ]
;
(* TO be relocated [
type pict = string (* misc background pictures *)
[ Om | Om2 | Om3 | Om4 | Gan | Hare | Geo ]
;
(* Deprecated, for use as background pictures like in the ancient Web... *)
(* Problematic, since pollutes with installation-dependent URLs *)
value pict = fun
[ "Om" -> Install.om_jpg
| "Om2" -> Install.om2_jpg
| "Om3" -> Install.om3_jpg
| "Om4" -> Install.om4_jpg
| "Gan" -> Install.ganesh_gif
| "Geo" -> Install.geo_gif
| "Hare" -> Install.hare_jpg
]
;] *)
type basic_style =
[ Font_family of font_family
| Font_style of font_style
......@@ -357,7 +331,6 @@ type style_class =
| Deep_sky_back | Carmin_back | Orange_back | Red_back | Mauve_back
| Lavender_back | Lavender_cent | Green_back | Lawngreen_back | Magenta_back
| Aquamarine_back | Hidden_
(*[ | Pict_om | Pict_om2 | Pict_om3 | Pict_om4 | Pict_gan | Pict_hare | Pict_geo ]*)
]
;
value background = fun
......@@ -470,7 +443,7 @@ value styles = fun
]
;
(* Compiles a class into its style for non-css compliant browsers *)
(* Mostly used by Css to compile the css style sheet *)
(* Nowadays mostly used by Css to compile the css style sheet *)
value style cla = String.concat "; " (List.map style_sheet (styles cla))
;
value class_of = fun
......@@ -564,7 +537,7 @@ and div_begin = elt_begin "div"
and body_begin = elt_begin "body"
and body_begin_style = elt_begin_attrs margins "body" (* Body margins are null *)
where margins = [ ("style","margin-left: 0; margin-right: 0; margin-top: 0;") ]
(* [table_begin_style] not compliant with HTML5 (dynamic style) *)
(* Caution: [table_begin_style] is not compliant with HTML5 (dynamic style) *)
and table_begin_style style attrs = elt_begin_attrs attrs "table" style
and table_begin = elt_begin "table"
and td_begin_class = elt_begin "td"
......@@ -594,13 +567,11 @@ value span style text = span_begin style ^ text ^ span_end
and span_skt style text = span_skt_begin style ^ text ^ span_end
and div style text = div_begin style ^ text ^ div_end
;
(* Centering old style - deprecated *)
value center = div Center_
and center_begin = div_begin Center_
and center_end = div_end
;
value center_image name caption =
center (xml_empty_with_att "img" [ ("src",name); ("alt",caption) ])
;
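(* A minimal usage sketch (hypothetical arguments): [center_image "om.jpg" "Om symbol"] builds
   an empty img element with src "om.jpg" and alt "Om symbol" and wraps it in the centering div
   produced by [center], i.e. [div Center_]. *)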
value html_red = span Red_
and html_devared = span_skt Devared_
and html_magenta = span Magenta_
......@@ -666,11 +637,10 @@ value author = fieldn "author" author_name
and date_copyrighted = fieldp "dc:datecopyrighted" current_year
and rights_holder = fieldp "dc:rightsholder" author_name
and keywords = fieldn "keywords"
"dictionary,sanskrit,heritage,dictionnaire,sanscrit,india,inde,indology,linguistics,panini,digital humanities,cultural heritage,computational linguistics,hypertext lexicon"
"sanskrit,dictionary,heritage,dictionnaire,sanscrit,india,inde,indology,linguistics,panini,digital humanities,digital libraries,cultural heritage,computational linguistics,hypertext lexicon"
;
value heritage_dictionary_title = title "Sanskrit Heritage Dictionary"
;
(* was in Install *)
(* Supported publishing media *)
type medium = [ Html | Tex ]
;
......@@ -680,7 +650,7 @@ type platform = [ Simputer | Computer | Station | Server ]
(* Current target platform to customize - needs recompiling if changed *)
value target = match Paths.platform with
[ "Simputer" -> Simputer (* Historical - small screen *)
| "Smartphone" -> Simputer (* Smartphone version not implemented yet *)
| "Smartphone" | "Tablet" -> Simputer (* TODO *)
| "Computer" -> Computer (* Standard client installation *)
| "Station" -> Station (* Permits external Analysis mode *)
| "Server" -> Server (* Http server for Internet web services *)
......
......@@ -28,7 +28,7 @@ value admits_aa = ref False
and admits_lopa = ref False
;
value morpho_gen = ref True (* morphology generation time *)
(* Turn to [False] for cgi execution (fake conjugation and nophantoms) *)
(* Turn to [False] for cgi execution (fake conjugation and no phantoms) *)
;
(* The [inflected_map] lexicons of inflected forms: nouns, iics, etc are computed
by [Make_nouns] and are dumped as persistent global databases nouns.rem etc.
......
......@@ -7,7 +7,7 @@
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* Sanskrit Reader Summarizing interface. *)
(* Sanskrit Reader Summarizing interface. Yields sktgraph.cgi *)
(* We construct a CGI Interface displaying the segmentation graph in which the
user may indicate segments as mandatory checkpoints. At any point he may
......@@ -20,9 +20,9 @@ open Graph_segmenter; (* [Segment cur_chunk set_cur_offset graph visual] *)
open Phases; (* [Phases] *)
open Phases; (* [phase is_cache generative] *)
open Dispatcher; (* [transducer_vect phase Dispatch transition trim_tags] *)
open Html;
open Html; (* html constructors *)
open Web; (* [ps pl abort reader_cgi scl_toggle] etc. *)
open Cgi;
open Cgi; (* [url get decode_url] *)
module Prel = struct (* Interface's lexer prelude *)
......@@ -179,7 +179,7 @@ value sort_check cpts =
List.sort compare_index cpts
;
value seg_length = fun
[ [ -2 :: rest ] -> Word.length rest
[ [ -2 :: rest ] -> Word.length rest (* lopa does not count *)
| w -> Word.length w
]
;
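(* Illustrative sketch (the phoneme codes are placeholders): a segment starting with the lopa
   marker, say [ -2; 1; 2 ], has [seg_length] 2 although it holds 3 codes; any segment without
   the marker keeps its full [Word.length]. *)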
......@@ -433,7 +433,7 @@ value check_sentence translit us text_orig checkpoints sentence
; td_wrap (call_undo text checkpoints ^ "Undo") |> ps
; let call_scl_parser n = (* invocation of scl parser *)
if scl_toggle then
ps (td_wrap (call_reader text cpts "o" ^ "UoH Analysis Mode"))
td_wrap (call_reader text cpts "o" ^ "UoH Analysis Mode") |> ps
else () (* [scl_parser] is not visible unless toggle is set *) in
match count with
[ Num.Int n -> if n > max_count then
......@@ -498,7 +498,7 @@ value make_cache_transducer (cache : Morphology.inflected_map) =
; Gen.dump auto_cache public_transca_file (* for [Load_transducers] *)
}
;
(* We fill gendered entries incrementally in a [public_cache_txt_file] *)
(* We fill gendered entries incrementally in [public_cache_txt_file] *)
value append_cache entry gender =
let cho = open_out_gen [ Open_wronly; Open_append; Open_text ] 0o777
public_cache_txt_file in do
......@@ -523,7 +523,7 @@ value quit_button corpmode corpdir sentno =
])
and permission = Web_corpus.string_of_permission corpmode in
center_begin ^
cgi_begin (Cgi.url corpus_manager_cgi ~fragment:sentno) "" ^
cgi_begin (url corpus_manager_cgi ~fragment:sentno) "" ^
hidden_input Params.corpus_dir corpdir ^
hidden_input Params.corpus_permission permission ^
submit_input submit_button_label ^
......@@ -535,6 +535,12 @@ value graph_engine () = do
{ Prel.prelude ()
; let query = Sys.getenv "QUERY_STRING" in
let env = create_env query in
(* The CGI environment variables, grouped by mode of use, are:
text topic st cp us t lex cache abs cpts (standard mode)
allSol (deprecated Validate mode)
corpus sentenceNumber linkNumber (Corpus mode)
corpdir sentno corpmode (defined in Params)
guess gender revised rev_off rev_ind (User-aid) *)
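(* For instance (hypothetical values), a standard-mode call may carry
   text=...&topic=...&st=t&cpts=... in its query string, while Corpus mode adds corpus,
   sentenceNumber and linkNumber; every value arrives URL-encoded, hence the [decode_url]
   calls below. *)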
let url_encoded_input = get "text" env ""
and url_encoded_topic = get "topic" env "" (* topic carry-over *)
and st = get "st" env "t" (* sentence parse default *)
......@@ -555,14 +561,14 @@ value graph_engine () = do
and sent_id = get "sentenceNumber" env "0"
and link_num = get "linkNumber" env "0" (* is there a better default? *)
and sol_num = get "allSol" env "0" in (* Needed for Validate mode *)
let url_enc_corpus_permission =
Cgi.get Params.corpus_permission env "true" in
let url_enc_corpus_permission = (* Corpus mode *)
get Params.corpus_permission env "true" in
let corpus_permission =
url_enc_corpus_permission
|> Cgi.decode_url
|> decode_url
|> Web_corpus.permission_of_string in
let corpus_dir = Cgi.get Params.corpus_dir env "" in
let sentence_no = Cgi.get Params.sentence_no env "" in
let corpus_dir = get Params.corpus_dir env "" in
let sentence_no = get Params.sentence_no env "" in
let text = arguments translit lex cache st us cp url_encoded_input
url_encoded_topic abs sol_num corpus sent_id link_num
url_enc_corpus_permission corpus_dir sentence_no
......@@ -570,9 +576,10 @@ value graph_engine () = do
try let url_encoded_cpts = List.assoc "cpts" env in (* do not use get *)
parse_cpts (decode_url url_encoded_cpts)
with [ Not_found -> [] ]
and guess_morph = decode_url (get "guess" env "")
and guess_morph = decode_url (get "guess" env "") (* User-aid guessing *)
and pseudo_gender = decode_url (get "gender" env "") in
let _ = if String.length guess_morph > 0 && Paths.platform="Station" then
(* User-aid cache acquisition *)
let (entry,gender) = match pseudo_gender with
[ "" -> parse_guess guess_morph
| g -> (guess_morph,g)
......@@ -583,14 +590,15 @@ value graph_engine () = do
make_cache_transducer cache
}
else () in
let revised = decode_url (get "revised" env "")
let revised = decode_url (get "revised" env "") (* User-aid revision *)
and rev_off = int_of_string (get "rev_off" env "-1")
and rev_ind = int_of_string (get "rev_ind" env "-1") in
try do
{ match (revised,rev_off,rev_ind) with
[ ("",-1,-1) -> check_sentence translit uns text checkpoints
input sol_num corpus sent_id link_num
| (new_word,word_off,chunk_ind) ->
[ ("",-1,-1) -> (* Standard input processing *** main call *** *)
check_sentence translit uns text checkpoints input sol_num
corpus sent_id link_num
| (new_word,word_off,chunk_ind) (* User-aid revision *) ->
let chunks = Sanskrit.read_sanskrit (Encode.switch_code translit) input in
let rec decoded init ind = fun
[ [] -> String.sub init 0 ((String.length init)-1)
......@@ -606,8 +614,8 @@ value graph_engine () = do
| [ a :: rest ] -> if cur_ind = chunk_ind then Word.length a
else find_word_len (cur_ind+1) rest
] in
let word_len = find_word_len 1 chunks in
let new_chunk_len = Word.length (Encode.switch_code translit revised) in
let word_len = find_word_len 1 chunks
and new_chunk_len = Word.length (Encode.switch_code translit revised) in
let diff = new_chunk_len-word_len in
let revised_check =
let revise (k,sec,sel) = (if k<word_off then k else k+diff,sec,sel) in
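(* Illustrative sketch (hypothetical numbers): with [diff]=2 and [word_off]=15, a checkpoint at
   offset 10 is left untouched while one at offset 20 is shifted to 22; the other two
   components of the triple are preserved. *)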
......@@ -636,7 +644,7 @@ value graph_engine () = do
or quit without saving (annotator mode) *)
; if sentence_no <> "" then
quit_button corpus_permission
(Cgi.decode_url corpus_dir) (Cgi.decode_url sentence_no) |> pl
(decode_url corpus_dir) (decode_url sentence_no) |> pl
else ()
; close_page_with_margin ()
; page_end lang True
......
......@@ -4,9 +4,12 @@
(* *)
(* Idir Lankri *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(* This is an unfinished attempt to filter out citations from Heritage
   and to build a corpus document from them - unused at present *)
value abort report_error status =
do
{ report_error ()
......
......@@ -68,7 +68,7 @@ value voices_of = fun
| "sad#1" | "sap#1" | "saa#1" | "sidh#1" | "sidh#2" | "siiv" | "sur" | "s.r"
| "s.rj#1" | "s.rp" | "skand" | "skhal" | "stan" | "stubh" | "sthag" | "snaa"
| "snih#1" | "snu" | "snuh#1" | "sp.r" | "sphal" | "sphu.t" | "sphur"
| "sm.r" | "sru" | "svan" | "svap" | "svar#1" | "svar#2" | "ha.th" | "has"
| "sm.r" | "sru" | "svan" | "svap" | "svar#1" | "svar#2" | "ha.th"
| "haa#1" | "hi#2" | "hi.ms" | "h.r.s" | "hras" | "hrii#1" | "hval"
| "maarg" (* root rather than nominal verb *)
(*| "viz#1" Atma needed for eg nivizate \Pan{1,3,17} *)
......@@ -80,6 +80,7 @@ value voices_of = fun
(*| "mah" Atma needed for pft. maamahe *)
(*| "cit#1" Atma needed for pft. cikite *)
(*| "kaafk.s" | "han#1" occur also in Atma in BhG: kaafk.se hani.sye *)
(*| "has" Atma needed for hasate *)
(*| "a~nj" also Atma afkte | "naath" "praz" "sp.rz#1" idem *)
-> Para (* active only *)
| "az#1" | "aas#2" | "indh" | "iik.s" | "ii.d" | "iir" | "iiz#1" | "ii.s"
......
......@@ -100,7 +100,7 @@ value adjust c w = match Word.mirror w with
| _ -> if c=36 (* n *) || c=41 (* m *)
then raise Glue (* since d|m->nn and n|m -> nm *)
(* Word.mirror [ 32 :: rest ] (* n -> t *) *)
(* incomplétude: raajan naasiin vocatif raajan *)
(* incompleteness: raajan naasiin vocative raajan *)
else w
]
| 22 (* c *) -> if c=22 then Word.mirror [ 32 :: rest ] (* c -> t *)
......@@ -154,6 +154,8 @@ value adjust c w = match Word.mirror w with
]
]
;
(* Called from [Sanskrit.read_processed_skt_stream] for use in [read_sanskrit]
with argument [read_chunk=sanskrit_chunk encode] *)
value padapatha read_chunk l = (* l is list of chunks separated by blanks *)
(* returns padapatha as list of forms in terminal sandhi *)
let rec pad_rec = fun (* returns (c,l) with c first char of first pada in l *)
......
......@@ -65,6 +65,7 @@ EXTEND Gramskt
[ [ s = skt; `EOI -> s ] ] ;
pada: (* non-empty list of chunks separated by blanks *)
[ [ el = LIST1 skt -> el ] ] ;
(*i deprecated
sloka_line:
[ [ p = pada; "|"; "|" -> [ p ]
| p = pada; "|"; sl = sloka_line -> [ p :: sl ]
......@@ -73,7 +74,7 @@ EXTEND Gramskt
[ [ p = pada; "|"; sl = sloka_line -> [ p :: sl ]
| p = pada -> [ p ]
| `EOI -> failwith "Empty sanskrit input"
] ] ;
] ] ; *)
sanscrit:
[ [ p = pada; "|"; "|" -> [ p ]
| p = pada; "|"; sl = sanscrit -> [ p :: sl ]
......@@ -162,21 +163,6 @@ value read_processed_skt_stream encode strm =
where concat line lines = process line @ lines
]
;
(* assumes Velthuis encoding *)
value read_corpus unsandhied chi = (* only used by Tagger1 *)
let encode = Transduction.code_raw (* unnormalized input from stream *)
and channel = Stream.of_channel chi
and reader = if unsandhied then read_raw_skt_stream
else read_processed_skt_stream in
reader encode channel
;
value read_VH unsandhied str =
let encode = Encode.code_string (* normalized input from string *)
and channel = Stream.of_string str
and reader = if unsandhied then read_raw_skt_stream
else read_processed_skt_stream in
reader encode channel
;
(* Now general readers with encoding parameter of type [string -> word] *)
......
......@@ -4,7 +4,7 @@
(* *)
(* Gérard Huet *)
(* *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)
(*i module Sanskrit : sig i*)
......@@ -43,8 +43,6 @@ value normal_stem_skt : skt -> string;
value code_skt_ref : skt -> Word.word;
value code_skt_ref_d : skt -> Word.word;
value decode_skt : Word.word -> skt;
value read_corpus : bool -> in_channel -> list Word.word;
value read_VH : bool -> string -> list Word.word;
value read_sanskrit : (string -> Word.word) -> string -> list Word.word;
value read_raw_sanskrit : (string -> Word.word) -> string -> list Word.word;
......
......@@ -42,7 +42,7 @@ open Format;
;
value rec filter is_kwd = parser
[ [: `((KEYWORD s, loc) as p); strm :] ->
if is_kwd s then [: `p; filter is_kwd strm :]
if is_kwd s || s = "!" then [: `p; filter is_kwd strm :]
else raise (Encode.In_error ("Undefined token : " ^ s))
| [: `x; s :] -> [: `x; filter is_kwd s :]
| [: :] -> [: :]
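(* Illustrative consequence: a KEYWORD token "!" - the new mandatory-break notation of this
   commit, meant e.g. after a vocative - is now passed through even when [is_kwd "!"] is
   [False], instead of raising [Encode.In_error "Undefined token : !"]; any other unregistered
   keyword still aborts. *)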
......
......@@ -1200,7 +1200,7 @@ value compute_active_present2 sstem wstem set entry third = do
; match wstem with
[ [ 2 :: _ ] -> (* Ppr of roots in -aa is complex and overgenerates *)
match entry with
[ "maa#1" | "yaa#1" -> () (* no ppra *)
[ "bhaa#1" | "maa#1" | "yaa#1" -> () (* no known ppra *)
| _ -> let m_pstem = wstem and f_pstem = rev (fix2w wstem "at" set) in
record_part (Ppra_ 2 Primary m_pstem f_pstem entry)
]
......
......@@ -8,4 +8,4 @@
(**************************************************************************)
(* Generated by make version - see main Makefile *)
value version="3.05" and version_date="2018-03-01";
value version="3.05" and version_date="2018-03-09";
VERSION='3.05'
DATE='2018-03-01'
DATE='2018-03-09'