Commit 46923869 authored by Gérard Huet's avatar Gérard Huet

New uoh_interface (Amba)

parent 4894c283
This diff is collapsed.
......@@ -269,6 +269,7 @@ site_pages: mk_index_page mk_grammar_page mk_reader_page mk_sandhi_page css
cd ../DICO; ln -f reader.fr.html reader.html # id
./mk_sandhi_page
cd ../DICO; ln -f sandhi.fr.html sandhi.html # id
# XML info file copied in DICO by make web_services in main Makefile
./css
# but install sets index.html to .fr. or .en. according to config lexicon choice
......
......@@ -5599,7 +5599,11 @@ value iic_indecl = (* should be lexicalized *)
; "saaci"
]
;
(* feminine stems iic for productive adjectives *)
(* Feminine stems iic for productive adjectives *)
(* This is a generic weakness, to be remedied. *)
(* Generative stems are not inspected for feminine stems *)
(* attested as substantives, and thus incurring a feminine iic stem. *)
(* This concerns privative compounds and participles. *)
value iicf_extra =
[ "abalaa" (* a-bala with fem abalaa *)
; "kaantaa" (* kaanta pp *)
......@@ -5775,6 +5779,7 @@ value enter_extra_ifcs () = do
value enter_extra_iifcs () = do
{ let entry = "ahan" in (* for -aha- like pu.nyaahavaacanam *)
enter1 entry (Bare Noun (code "aha"))
(* more entries are potentially concerned - TODO *)
}
;
(* called by [Declension.emit_decls] and [Morpho_debug.emit_decls] *)
......
......@@ -77,6 +77,7 @@ value voices_of = fun
(*| "kan" Atma needed for kaayamaana *)
(*| "van" Atma needed for vanute *)
(*| "mah" also Atma for pft. maamahe *)
(*| "cit#1" also Atma for pft. cikite *)
(*| "kaafk.s" | "han#1" occur also in Atma in BhG: kaafk.se hani.sye *)
(*| "a~nj" also Atma afkte | "naath" "praz" "sp.rz#1" idem *)
-> Para (* active only *)
......
......@@ -30,7 +30,7 @@ value prelude () = do
; page_begin parser_meta_title
; pl (body_begin Chamois_back)
; if scl_toggle then (* external call SCL (experimental) *)
pl (javascript (scl_url ^ javascript_tooltip))
pl (javascript (SCLpaths.scl_url ^ javascript_tooltip))
else ()
; pl parser_title
; open_page_with_margin 15
......
......@@ -32,8 +32,8 @@ module UOH
(****************************************************************)
(* Paths - to move to configuration time *)
value svg_interface_url = "http://localhost/cgi-bin/SCL/SHMT/"
and nn_parser_url = "http://localhost/cgi-bin/SCL/NN/parser/generate.cgi"
value svg_interface_url = "http://localhost/cgi-bin/scl/SHMT/"
and nn_parser_url = "http://localhost/cgi-bin/scl/NN/parser/generate.cgi"
and show_parses_path = "prog/interface/call_parser_summary.cgi"
;
......@@ -74,31 +74,25 @@ value print_callback_solution counter solution =
; ps table_end
; ps td_end (* end segment *)
; ps (delimitor phase)
}
and pid = string_of_int (Unix.getpid ()) (* process-id stamp *)
and segmentations = string_of_int counter in do
} in do
{ ps tr_begin
; ps td_begin
(* TODO rewrite as [ps (let url= ... and link = ... in anchor_ref url link)] *)
; ps ("<a href=\"" ^ svg_interface_url ^ show_parses_path ^
"?filename=./tmp_in") (* call-back to svg interface UoH *)
; ps pid
; ps "&amp;outscript="
; ps default_output_font
; ps "&amp;rel=''"
; ps "&amp;sentnum="
; ps segmentations
; ps "&amp;save=no\""
; ps "&amp;translate=no\""
; ps (" onmouseover=\"Tip('<img src=" ^ scl_url ^ "DEMO/tmp_in")
; ps pid
; ps "/"
; ps segmentations
; ps ".1.svg height=100%; width=100%;>')\" onmouseout=\"UnTip()\">"
; ps (html_latin12 "Solve dependencies ")
; ps (xml_end "a")
; List.iter print_segment_cbk solution
; ps td_end
; ps tr_end
; ps (html_latin12 "Verse Order")
; ps table_end
; print_string "<form name=\"word-order\" method=\"get\" action = \"http://localhost/cgi-bin/scl/SHMT/prog/Word_order/call_heritage2anu.cgi\">"
; print_newline
; print_string "<table>"
; ps tr_begin
; ps td_begin
; ps (html_latin12 "Prose Order")
; ps (xml_begin_with_att "textarea"
[ ("name","word-order"); ("rows","1"); ("cols","50") ] ^
xml_end "textarea")
; ps (submit_input "Submit")
; ps td_end
; ps tr_end
; counter+1
}
......@@ -115,14 +109,14 @@ value print_ext_solutions cho =
List.iter (print_ext_output cho)
;
(* External call-back to Amba Kulkarni's parser (from [Reader.print_ext]) *)
value amba_invoke pid = (* Experimental - assumes amrita configuration *)
(*[value amba_invoke pid = (* Experimental - assumes amrita configuration *)
"mkdir -p " ^ tmp_in ^ pid ^ "; " ^
scl_dir ^ "Heritage_morph_interface/Heritage2anusaaraka_morph.sh <" ^
offline_file ^ " > " ^ tmp_in ^ pid ^ "/in" ^ pid ^ ".out; " ^
scl_dir ^ "kAraka/shabdabodha.sh YES " ^ tmp_in ^ pid ^ " in" ^
pid ^ ".out" ^ " in" ^ pid ^ ".kAraka " ^ default_output_font ^
" Full Prose NOECHO ND 2> " ^ offline ("err" ^ pid) ^ ";"
;
;]*)
(* Prints all segmentations in [offline_file]
and prepares invocation of UoH's CSL parser for dependency graph display *)
......@@ -135,11 +129,11 @@ value print_ext solutions =
(* System call to Amba Kulkarni's parser - fragile *)
; ps table_end
; ps (xml_begin "table")
; let pid = string_of_int (Unix.getpid ()) in (* stamp with process id *)
let cmd = amba_invoke pid in (* prepare cryptic UNIX command *)
let _ = Sys.command cmd in () (* call it *)
(*[; let pid = string_of_int (Unix.getpid ()) in (* stamp with process id *)
let cmd = amba_invoke pid in (* prepare cryptic UNIX command *)
let _ = Sys.command cmd in () (* call it *) ]*)
; let _ = print_callback solutions in () (* print dependency graphs *)
; ps table_end
(*[; ps table_end ] (?) *)
}
;
(* Now for processing of navya-nyaaya compounds in Experimental mode *)
......
......@@ -3008,7 +3008,7 @@ value redup_perf root =
let c = if sibilant c1 then match r with
[ [] -> error_vowel 3
| [ c2 :: _ ] -> if stop c2 then c2 else c1
(* if vowel c2 then c1
(* = if vowel c2 then c1
else if nasal c2 then c1
else if stop c2 then c2
else (* semivowel c2 *) c1 *)
......@@ -3149,8 +3149,15 @@ value compute_perfectm conj stem entry =
;
value compute_perfect_c strong weak olengthened eweak iopt entry =
match voices_of entry with
[ Para -> compute_perfecta Primary strong weak olengthened eweak iopt entry
| Atma -> let stem = match entry with
[ Para -> do
{ compute_perfecta Primary strong weak olengthened eweak iopt entry
; if entry = "cit#1" then do
{ compute_perfectm Primary weak entry
; compute_perfectm Primary (revcode "cikitr") entry (* WR *)
}
else ()
}
| Atma -> let stem = match entry with
[ "cak.s" | "ba.mh" -> strong
| _ -> weak
] in
......
......@@ -8,4 +8,4 @@
(**************************************************************************)
(* Generated by make version - see main Makefile *)
value version="2.99" and version_date="2017-05-15";
value version="2.99" and version_date="2017-05-18";
......@@ -631,15 +631,10 @@ value javascript_tooltip ="wz_tooltip.js"
(* Maybe should be put back in config? but versioning problem... *)
value remote_server_host = "http://sanskrit.inria.fr/"
;
(* SCL configuration begin *)
value scl_url = "http://localhost/SCL/SHMT/" (* Used to be set in Paths *)
;
(* This toggle controls accessibility of University of Hyderabad tools *)
value scl_toggle = (* should be [exists scl_profile] *)
not (SCLpaths.scl_url="") (* True if SCL tools are installed *)
;
(* SCL configuration begin *)
value interaction_modes_default mode =
[ (" Summary ","g",mode="g")
; (" Tagging ","t",mode="t")
......@@ -658,7 +653,7 @@ value reader_prelude title = do
; page_begin reader_meta_title
; pl (body_begin Chamois_back)
; if scl_toggle then (* external call SCL (experimental) *)
pl (javascript (scl_url ^ javascript_tooltip))
pl (javascript (SCLpaths.scl_url ^ javascript_tooltip))
else ()
; pl title
; open_page_with_margin 15
......
......@@ -274,13 +274,18 @@ FAQPAGE_FR=SITE/faq.fr.html
FAQPAGE_EN=SITE/faq.en.html
FAQPAGES=$(FAQPAGE_FR) $(FAQPAGE_EN)
MANUAL=SITE/manual.html
ROBOTS=SITE/robots.txt # currently disabled to allow Google access
XMLPAGE=SITE/xml.html
FAQ=SITE/faq.html
PORTAL=SITE/portal.html
GOLD=SITE/gold.html
GOLDEN=SITE/goldendict.html
ABREVS=SITE/abrevs.pdf
ROBOTS=SITE/robots.txt # currently disabled to allow Google access
# Goldendict data banks are now distributed from sanskrit.inria.fr
#GOLDENDICT=SITE/Heritage_du_sanskrit_san-fra.tar.gz SITE/Declension-heritage_du_sanskrit_san-eng.tar.gz SITE/Grammar-heritage_du_sanskrit_san-eng.tar.gz SITE/mw-heritage_du_sanskrit_san-eng.tar.gz
WEBSOURCES=SITE/portal.html SITE/faq.html SITE/abrevs.pdf $(GOLD) SITE/goldendict.html $(MANUAL) # $(ROBOTS)
WEBSOURCES=$(PORTAL) $(FAQ) $(ABREVS) $(GOLD) $(GOLDEN) $(MANUAL) $(XMLPAGE)
ALLWEBSOURCES=$(SITESOURCE) $(WEBSOURCES)
# Sources of morphology documents
XMLDATA=$(DATA)/XML
......@@ -457,9 +462,11 @@ Makefile: SETUP/MMakefile
# necessary if mk_*_page.ml has been updated
web_services: # Will write localized site pages in DICO and export cgis
cd ML && $(MAKE) cgis site_pages
cp -p SITE/xml.html DICO
release: $(DICO) web_services releasecgi releaseMW releasedata releasedoc \
releasebook releasexml
releasebook
# releasexml - no XML release on server - lookup Heritage_Resources
# releasepdf disabled at present as bulky and un-finished
echo "Version $(VERSION) of Sanskrit site released on $(SERVERHOST)"
......
VERSION='2.99'
DATE='2017-05-15'
DATE='2017-05-18'
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<meta charset="utf-8"/>
<title>The Sanskrit Heritage Site Gold Book</title>
<meta name="author" content="G&#233;rard Huet" />
......
......@@ -13,7 +13,7 @@
<meta name="description" content="This page is for downloading the goldendict
version of the Sanskrit Heritage dictionary."/>
<link rel="shortcut icon" href="IMAGES/favicon.ico"/>
<link rel="stylesheet" type="text/css" href="DICO/style.css" media="screen,tv" />
<link rel="stylesheet" type="text/css" href="DICO/style.css" media="screen,tv"/>
</head>
<body class="pink_back"> <!-- Pale_rose -->
......
......@@ -13,7 +13,7 @@
<meta name="description" content="This page is for downloading the goldendict
version of the Sanskrit Heritage dictionary."/>
<link rel="shortcut icon" href="IMAGES/favicon.ico"/>
<link rel="stylesheet" type="text/css" href="DICO/style.css" media="screen,tv" />
<link rel="stylesheet" type="text/css" href="DICO/style.css" media="screen,tv"/>
</head>
<body class="pink_back"> <!-- Pale_rose -->
......
......@@ -223,7 +223,7 @@ as we shall see below.
A dictionary of inflected forms of Sanskrit words is provided
in XML form under various transliteration schemes.
Please visit the <a href="#RESOURCES">Sanskrit linguistic resources site</a>.
Please visit the <a href="xml.html">Sanskrit linguistic resources page</a>.
<a id="reader"></a>
<h2 class="b2"> Sanskrit Reader </h2>
......@@ -429,7 +429,7 @@ of programs implemented in Pidgin ML, functional core of the
<a href="http://ocaml.org">Objective Caml</a>
programming language. The Zen library and its documentation are available
as free software under the Gnu Lesser General Public License (LGPL) from the
<a href="#ZEN">Zen site.</a>
<a href="http://pauillac.inria.fr/~huet/ZEN/">Zen site.</a>
</p>
<!-- Forum closed
Please visit the <a href="http://sanskrit.inria.fr/zf/">Zen Forum</a> for
......@@ -438,28 +438,32 @@ announcements and discussions concerning the ZEN toolkit. -->
<h2 class="b2"><img src="IMAGES/ganesh.jpg" alt="Ganesh">
The Sanskrit Portal</h2>
Please visit our <a href="#PORTAL">Sanskrit Portal</a>
to find links to other Sanskrit resources.
Please visit our <a href="portal.html">Sanskrit Portal</a>
to find links to other Sanskrit resources.
<p>
If you are reading this from a mirror site, don't forget to regularly update
this server with the development Git site
"https://gitlab.inria.fr/huet/Heritage_Platform".
<h2 class="b2"><img src="IMAGES/om1.jpg" alt="Om">
Artwork credits</h2>
<span class="green">Orissan artwork at this site courtesy of Shauraj Rath.
&copy; Screenex, Bhubaneshwar, Ekamra, Orissa. All rights reserved.
© Screenex, Bhubaneshwar, Ekamra, Orissa. All rights reserved.
</span><br>
<span class="green">Wallpaper om images courtesy of
<a href="http://www.vishvarupa.com/aum-om-omkara-pranava.html">Vishvarupa.com</a>.
</span><br>
<span class="green">Ganesh wallpaper courtesy of
<a href="http://www.math-info.univ-paris5.fr/~patte/">Fran&ccedil;ois Patte</a>.
<a href="http://www.math-info.univ-paris5.fr/~patte/">François Patte</a>.
</span><br>
<span class="green">Shri Yantra design &copy;
<a href="http://pauillac.inria.fr/~huet">G&eacute;rard Huet</a> 1990.<br>
<span class="green">Shri Yantra design ©
<a href="IMAGES/Yantra.jpg">Gérard Huet</a> 1990.<br>
</span>
</td></tr>
</table> <!-- body -->
<table class="pad60">
<table class="pad60"> <!--padding for bandeau -->
<tr><td></td></tr></table>
<div class="enpied">
<table class="bandeau"><tr><td>
......
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Sanskrit Linguistic Resources</title>
<meta name="author" content="Gérard Huet">
<meta property="dc:datecopyrighted" content="2017">
<meta property="dc:rightsholder" content="Gérard Huet">
<meta name ="keywords" content="india,dictionary,indology,sanskrit,lexicography,linguistics,indo-european,dictionnaire,sanscrit,panini,indology,linguistics">
<meta name ="date" content="2017-05-18">
<meta name="classification" content="computational linguistics, sanskrit, morphology, lexicography, indology">
<meta name="description" content="This page is for downloading Sanskrit resources.">
<link rel="shortcut icon" href="IMAGES/favicon.ico"/>
<link rel="stylesheet" type="text/css" href="DICO/style.css" media="screen,tv"/>
</head>
<body class="pink_back"> <!-- Pale_rose -->
<table class="body">
<table border="0pt" cellpadding="0" cellspacing="15pt" width="100%">
<tr><td>
<h1 class=b1>Sanskrit linguistic resources</h1>
<br>
<img src="IMAGES/Panini2.jpg" alt="Panini"/>
<br>
<div class="latin12">
<h2 class=b2>Sanskrit Morphology</h2>
<h3 class=b3>Background</h3>
This page documents XML data banks of Sanskrit forms given with their morphological
taggings. They are produced mechanically by the declension and conjugation
engines of the Sanskrit Heritage Platform, processing the Sanskrit lexicon
underlying the Sanskrit Heritage Dictionary. The version used for this
generation is available <a href="Heritage.pdf">here</a> as a PDF document.
<p>
These databanks are regularly updated. They are available for public download
as a public git archive in the Sanskrit Heritage development site:
"https://gitlab.inria.fr/huet/Heritage_Resources".
<h3 class=b3>Databanks description</h3>
We provide here inflected forms and morphemes derived from the root forms
defined in the
<a href="DICO/index.html">Sanskrit Heritage Dictionary</a>. These forms are
presented as lemmas linking each form to its stem entry by possible morpho-phonetic
operations. We limit ourselves to classical Sanskrit, and do not cover precative,
subjunctive, injunctive and conditional forms of the verbs.
At present, we provide for two transliteration schemas, respectively
WX, used by the
<a href="http://sanskrit.uohyd.ernet.in/">Department of Sanskrit Studies at
University of Hyderabad</a>
and SLP1, used by the
<a href="http://sanskritlibrary.org/">Sanskrit Library</a>.
The respective data banks are given in compressed archives (6 MB each, gz format)
"WX_morph.tar.gz" (WX) and
"SL_morph.tar.gz" (SLP1). After downloading these documents,
and uncompressing them (typically with the utility gunzip),
you get a UNIX tar archive containing the following data.
The morphological lemmas are distributed in 6 files in
XML format, conformant to a common DTD.
The nominal morphological declensions of nouns, adjectives and numbers,
are covered in "T_nouns.xml" (where T is respectively WX or SL).
Those of pronouns are covered in "T_pronouns.xml".
The conjugated forms of roots in the present, imperfect, imperative, optative,
perfect, aorist
and future tenses, as well as passives of the present system,
for the primary conjugation and for some secondary conjugations
(causative, intensive, desiderative) are covered in "T_roots.xml".
Additional declensions of derived participial forms are given in "T_parts.xml".
Absolutives, infinitives and other undeclinable words and particles
are listed in "T_adverbs.xml". In addition, "T_final.xml" gives additional
generative morphemes. The files are conformant to the DTD "T_morph.dtd".
<p>
Finally, the text file "X_preverbs.txt" lists common
preverb sequences, given with their sandhi analysis.
<h3 class=b2>Intellectual Property</h3>
All these linguistic data banks are Copyrighted Gérard Huet 1994-2017.
They are derived from the Sanskrit Heritage Dictionary
version 2.99 dated 2017-05-18.
<p>
Use of these linguistic resources is granted according to the
Lesser General Public Licence for Linguistic Resources.
Copies of this license, in pdf as well as HTML, are provided at
the Heritage_Resources distribution site in its XML subdirectory.
Thank you for referencing the origin of this data if you use it in your own work.
<h2 class=b2>Methodology</h2>
We deal here with a mixture of derivational and inflexional morphology.
For instance, from the roots we generate verbal and participial stems, and from
these stems we generate in turn inflected forms: conjugated forms from the
verbal stems, and declined forms from the participial stems. But at present
we do not generate mechanically primary nominal stems from roots,
nor secondary nominal stems from primary ones, because of overgeneration.
The nominal stems, as well as the undeclinable forms, are taken from the
lexicon, that lists also some frequent participles.
<p>
This organization entails a different role in our morphological data bases.
The <i>basic</i> morphological categories correspond to lexical phases,
which are atomic letters in the defining grammar of Sanskrit <i>word</i>.
The forms listed in these data bases act as morphemes of this high-level
morphological definition, which is recursive, since compounding may be
iterated, as well as preverb formation, to a certain extent.
But this recursion power is limited, in the sense that the grammar of a word
is a regular one (type 3 in the Chomsky hierarchy), and its recognizer is
a finite automaton, whose states are precisely the lexical categories indexing
the basic data bases. This definition of word implements correctly the geometry
of constructions such as absolutives (which fall in two distinct categories,
the preverb form and the root form) and periphrastic phrases (periphrastic
futures with substantives, and periphrastic perfects as prefixes of finite
perfect forms of the auxiliary roots <i>as</i>, <i>bhū</i> and
<i>kṛ</i>, which are duplicated in a specific auxiliary lexicon).
Here is a simplified diagram of the current state space of our lexer.
<div class="center">
<img src="IMAGES/lexer17.jpg" alt="Lexer automaton">
</div>
This automaton is also the top-level view of our Sanskrit Tagger, which
implements Sanskrit analysis from <i>devanāgarī</i> text.
The technical exposition of this method, together with its correctness
justification, has been published in various scientific journals and conferences,
and the corresponding articles are also available freely on my
<a href="http://pauillac.inria.fr/~huet/bib.html">
<strong>publications page</strong></a>
(papers [78], [87], [88], [94], [95], [105], [106] and [110]
are specially relevant).
This material will not be repeated here. Let us just explain a few difficulties
of the large-scale implementation of this Sanskrit analyser.
<p>
As usual in a non-deterministic search algorithm (here all the possible parsings
of a sentence as a sandhied stream of forms), we have two pitfalls, silence and noise.
Silence (lack of recall) means incompleteness. Some legal Sanskrit sentences
may fail to be recognized.
Typically, some root word may be missing from the base lexicon,
or some Vedic form may use some construction rare in the later language,
like precative or subjunctive.
Compounding gives rise to two complications, the raising of new cases by
<i>bahuvrīhi</i> compounding,
and the formation of <i>avyayībhāva</i> compounds. Some of these
constructions are treated incompletely.
<p>
The opposite of silence is noise (lack of precision), that is overgeneration.
We deal with overgeneration
in the syntactico-semantic layer of our tagger, which filters out combinations of
tags inconsistent with semantic role assignments.
We shall not discuss this technology
further in this note on morphology, and refer the interested reader to our
<a href="/DICO/reader.html"><strong>Sanskrit reader
demonstration page</strong></a> and its <a href="manual.html">
<strong>Reference manual</strong></a>.
<p>
We remark that the respective data bases can be interrogated online by our
<a href="http://sanskrit.inria.fr/DICO/index.html#stemmer"><strong>stemmer
interface</strong></a>. But note that verbal forms prefixed by preverbs
are analysed by the tagger as non-atomic words, and only root forms and
their secondary conjugations are recognized by the stemmer.
<h2 class=b2>Help</h2>
Questions concerning these resources should be addressed to
<a href="mailto:Gerard.Huet@inria.fr">Gérard Huet</a>.
All suggestions for improvements will be gratefully considered.
</td></tr>
</table>
</div>
<table class="pad60"> <!--padding for bandeau -->
<tr><td></td></tr></table>
<div class="enpied">
<table class="bandeau"><tr><td>
<a href="http://ocaml.org">
<img src="IMAGES/icon_ocaml.png" alt="Objective Caml" height="50"></a>
</td><td>
<table class="center">
<tr><td>
<a href="index.html"><strong>Top</strong></a> |
<a href="DICO/index.en.html"><strong>Index</strong></a> |
<a href="DICO/index.en.html#stemmer"><strong>Stemmer</strong></a> |
<a href="DICO/grammar.en.html"><strong>Grammar</strong></a> |
<a href="DICO/sandhi.en.html"><strong>Sandhi</strong></a> |
<a href="DICO/reader.en.html"><strong>Reader</strong></a> |
<a href="faq.en.html"><strong>Help</strong></a> |
<a href="portal.en.html"><strong>Portal</strong></a>
</td></tr>
<tr><td>© Gérard Huet 1994-2017</td></tr>
</table></td><td>
<a href="http://www.inria.fr/">
<img src="IMAGES/logo_inria.png" alt="Logo Inria" height="50"></a>
<br></td></tr></table></div>
</body>
</html>
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment