Gérard Huet / Heritage_Platform / Commits

Commit 36d69f4e
Authored Jun 02, 2017 by Gérard Huet
Parent: 67d8d3db

    Uoh_interface replaced by Scl_parser

Showing 16 changed files with 128 additions and 335 deletions (+128, -335).
    ML/.depend            +6    -6
    ML/Makefile           +9    -17
    ML/install.sv.ml      +0    -28
    ML/interface.ml       +15   -21
    ML/lexer.ml           +2    -2
    ML/lexer.mli          +1    -1
    ML/parser.ml          +4    -5
    ML/reader.ml          +10   -10
    ML/reader_plugin.ml   +0    -232
    ML/scl_parser.ml      +67   -0
    ML/uoh_interface.ml   +7    -7
    ML/version.ml         +1    -1
    SETUP/MMakefile       +3    -2
    SETUP/version.txt     +1    -1
    SITE/manual.html      +1    -1
    SITE/xml.html         +1    -1
ML/.depend

@@ -179,14 +179,14 @@ rank.cmo : ../ZEN/word.cmo web.cmo phases.cmo morphology.cmi \
     ../ZEN/list2.cmo lexer.cmi constraints.cmi
 rank.cmx : ../ZEN/word.cmx web.cmx phases.cmx morphology.cmi \
     ../ZEN/list2.cmx lexer.cmx constraints.cmx
-uoh_interface.cmo : ../ZEN/word.cmo web.cmo SCLpaths.cmo phases.cmo \
-    morphology.cmi load_morphs.cmo html.cmo dispatcher.cmi
-uoh_interface.cmx : ../ZEN/word.cmx web.cmx SCLpaths.cmx phases.cmx \
-    morphology.cmi load_morphs.cmx html.cmx dispatcher.cmx
-reader.cmo : web.cmo uoh_interface.cmo sanskrit.cmi rank.cmo phases.cmo \
+scl_parser.cmo : ../ZEN/word.cmo web.cmo SCLpaths.cmo phases.cmo \
+    morphology.cmi lexer.cmi html.cmo dispatcher.cmi
+scl_parser.cmx : ../ZEN/word.cmx web.cmx SCLpaths.cmx phases.cmx \
+    morphology.cmi lexer.cmx html.cmx dispatcher.cmx
+reader.cmo : web.cmo scl_parser.cmo sanskrit.cmi rank.cmo phases.cmo \
     paths.cmo html.cmo encode.cmo control.cmo checkpoints.cmo cgi.cmo \
     canon.cmo
-reader.cmx : web.cmx uoh_interface.cmx sanskrit.cmx rank.cmx phases.cmx \
+reader.cmx : web.cmx scl_parser.cmx sanskrit.cmx rank.cmx phases.cmx \
     paths.cmx html.cmx encode.cmx control.cmx checkpoints.cmx cgi.cmx \
     canon.cmx
 parser.cmo : ../ZEN/word.cmo web.cmo uoh_interface.cmo skt_morph.cmi \
ML/Makefile

@@ -5,7 +5,7 @@
 #                        Gérard Huet & Pawan Goyal                        #
 #                                                                         #
 ############################################################################
-# Makefile of Sanskrit Heritage Software 02-05-2017 Copyright INRIA 2017  #
+# Makefile of Sanskrit Heritage Software 02-06-2017 Copyright INRIA 2017  #
 ############################################################################
 # Prerequisites: Ocaml and Camlp4 preprocessor

@@ -42,9 +42,9 @@ inflected.mli inflected.ml sandhi.ml sandhier.ml pada.ml nouns.mli nouns.ml \
 verbs.mli verbs.ml parts.ml conj_infos.mli morpho_string.ml morpho.ml \
 declension.ml conjugation.ml indexer.ml indexerd.ml phases.ml lemmatizer.ml \
 auto.mli load_transducers.ml dispatcher.mli dispatcher.ml segmenter.ml \
-load_morphs.ml lexer.mli lexer.ml rank.ml uoh_interface.ml \
+load_morphs.ml lexer.mli lexer.ml rank.ml scl_parser.ml \
 reader.ml parser.ml constraints.mli constraints.ml multilingual.ml \
-paraphrase.mli paraphrase.ml reader_plugin.ml bank_lexer.ml regression.ml \
+paraphrase.mli paraphrase.ml bank_lexer.ml regression.ml \
 checkpoints.ml graph_segmenter.ml automaton.ml interface.mli interface.ml \
 user_aid.ml reset_caches.ml

@@ -76,7 +76,8 @@ LINK=ocamlopt -I $(ZEN) -I +camlp4 dynlink.cmxa camlp4lib.cmxa
 # standard installation of Sanskrit Heritage platform - assumes ZEN library
-engine: test_version cgis reset_caches static_pages regression reader_plugin # parse_apte
+engine: test_version cgis reset_caches static_pages regression
+# reader_plugin parse_apte
 # testing consistency of Heritage_resources and Heritage_platform
 test_version: paths.cmx gen.cmx version.cmx control.cmx test_stamp.cmx

@@ -85,7 +86,7 @@ test_version: paths.cmx gen.cmx version.cmx control.cmx test_stamp.cmx
 static_pages: css mk_index_page mk_grammar_page mk_sandhi_page mk_reader_page
-all: engine static_pages reset_caches reader_plugin sandhi_test
+all: engine static_pages reset_caches sandhi_test
 # legacy in need of re-design
 regression: rank.cmx regression.cmx

@@ -184,16 +185,7 @@ inflected.cmx html.cmx SCLpaths.cmx web.cmx naming.cmx morpho_string.cmx morpho.
 load_transducers.cmx pada.cmx phases.cmx dispatcher.cmx order.cmx \
 chapters.cmx morpho_html.cmx cgi.cmx segmenter.cmx morpho_ext.cmx \
 load_morphs.cmx lexer.cmx constraints.cmx rank.cmx bank_lexer.cmx \
-uoh_interface.cmx checkpoints.cmx reader.cmx -o reader
-reader_plugin: reader_plugin.cmx
-	$(LINK) unix.cmxa list2.cmx gen.cmx paths.cmx version.cmx date.cmx \
-control.cmx word.cmx canon.cmx zen_lexer.cmx phonetics.cmx transduction.cmx \
-encode.cmx skt_lexer.cmx padapatha.cmx sanskrit.cmx deco.cmx lexmap.cmx \
-inflected.cmx html.cmx SCLpaths.cmx web.cmx naming.cmx morpho_string.cmx morpho.cmx \
-load_transducers.cmx pada.cmx phases.cmx dispatcher.cmx order.cmx \
-chapters.cmx morpho_html.cmx cgi.cmx segmenter.cmx morpho_ext.cmx \
-load_morphs.cmx lexer.cmx constraints.cmx reader_plugin.cmx -o reader_plugin
+scl_parser.cmx checkpoints.cmx reader.cmx -o reader
 interface: interface.cmx
 	$(LINK) unix.cmxa list2.cmx gen.cmx paths.cmx version.cmx date.cmx \

@@ -221,7 +213,7 @@ inflected.cmx html.cmx SCLpaths.cmx web.cmx naming.cmx morpho_string.cmx morpho.
 load_transducers.cmx pada.cmx phases.cmx dispatcher.cmx order.cmx \
 chapters.cmx morpho_html.cmx bank_lexer.cmx cgi.cmx segmenter.cmx \
 morpho_ext.cmx load_morphs.cmx lexer.cmx constraints.cmx checkpoints.cmx \
-paraphrase.cmx uoh_interface.cmx parser.cmx -o parser
+paraphrase.cmx scl_parser.cmx parser.cmx -o parser
 tagger: tagger.cmx
 	$(LINK) list2.cmx gen.cmx paths.cmx version.cmx date.cmx install.cmx \

@@ -324,7 +316,7 @@ clean:
 	rm -f *.cmo *.cmi *.cmx *.ppi *.ppo *.o
 	rm -f css indexer indexerd sandhier reader parser tagger lemmatizer \
 	declension conjugation mk_index_page mk_grammar_page mk_reader_page regression \
-	mk_sandhi_page sandhi_test reader_plugin reset_caches interface user_aid \
+	mk_sandhi_page sandhi_test reset_caches interface user_aid \
 	parse_apte tag_apte
 # make dico.cmi auto.cmi conj_infos.cmi # needed to get dependencies right
ML/install.sv.ml  (deleted, file mode 100644 → 0)

(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Gérard Huet                               *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(* Module Install reads localisation parameters from paths.ml,
   created by "make configure" in main directory, called by configure script.
   Describes all installation parameters and resources *)

(*i module Install = struct i*)

(* Heritage used as the generative lexicon *)
value lexicon = "Heritage" (* for lexicon version in Parser.stamp *)
;
(* Configuration of the platform *)
(* truncation is the maximum number of solutions computed by the lexer.
   Too small a truncation limit will miss solutions, too large a truncation
   limit will provoke un unrecoverable choking server failure. This is relevant
   only for the parser (deprecated) mode. The graph interface has no limit. *)
value truncation = 10000
;
(*i end; i*)
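The truncation comment above describes a budget on how many segmentation solutions the lexer will enumerate. As a minimal sketch of that idea only (hypothetical standard-OCaml code, not part of this commit; the platform itself is written in Camlp4 revised syntax), an enumerator can simply stop pulling solutions from a producer once the budget is reached:

(* Hypothetical sketch of a truncation limit on solution enumeration. *)
let truncation = 10000

(* Collect at most [truncation] solutions from a lazy producer [next],
   stopping early instead of exhausting an exponential search space. *)
let collect_solutions (next : unit -> 'a option) : 'a list =
  let rec loop n acc =
    if n >= truncation then List.rev acc        (* budget exhausted *)
    else match next () with
      | None -> List.rev acc                    (* producer finished *)
      | Some sol -> loop (n + 1) (sol :: acc)
  in
  loop 0 []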
ML/interface.ml

@@ -21,7 +21,7 @@ open Phases; (* [Phases] *)
 open Phases; (* [phase is_cache generative] *)
 open Dispatcher; (* [transducer_vect phase Dispatch transition trim_tags] *)
 open Html;
-open Web; (* ps pl abort etc. *)
+open Web; (* [ps pl abort reader_cgi] etc. *)
 open Cgi;

 module Prel = struct (* Interface's lexer prelude *)

@@ -431,30 +431,24 @@ value check_sentence translit us text_orig checkpoints sentence
 ; pl (table_begin Spacing20)
 ; pl tr_begin
 ; ps (td_wrap (call_undo text checkpoints ^ "Undo"))
-; let invoke_web_services n = (* call SCL and SL services *)
-    if scl_toggle then do
-    { if (not iterate.val) && (List.length chunks = 1)
-      then ps (td_wrap (call_reader text cpts "n" ^ "UoH Nyaya Analysis"))
-      else ps (td_wrap (call_reader text cpts "o" ^ "UoH Analysis Mode"))
-    ; match corpus with
-      [ "" -> () | id -> invoke_SL sentence cpts id n sent_id link_num ]
-    }
-    else () (* these services are not visible unless toggle is set *)
-  in match count with
-  [ Num.Int n ->
-    if n = 1 (* Unique remaining solution *)
-    then do
-    { ps (td_wrap (call_parser text cpts ^ "Unique Solution"))
-    ; invoke_web_services 1
-    }
-    else if n < max_count then do
+; let call_scl_parser n = (* invocation of scl parser *)
+    if scl_toggle
+    then ps (td_wrap (call_reader text cpts "o" ^ "UoH Analysis Mode"))
+    else () (* [scl_parser] is not visible unless toggle is set *)
+  in match count with
+  [ Num.Int n ->
+    if n > max_count then (* too many solutions would choke the parsers *)
+       ps (td_wrap ("(" ^ string_of_int n ^ " Solutions)"))
+    else if n = 1 (* Unique remaining solution *)
+    then do
+    { ps (td_wrap (call_parser text cpts ^ "Unique Solution"))
+    ; call_scl_parser 1
+    }
+    else do
     { ps (td_wrap (call_reader text cpts "p" ^ "Filtered Solutions"))
     ; let info = string_of_int n ^ if flag then "" else " Partial" in
       ps (td_wrap (call_reader text cpts "t" ^ "All " ^ info ^ " Solutions"))
-    ; invoke_web_services n
+    ; call_scl_parser n
     }
-    else (* too many solutions would choke the reader service *)
-      ps (td_wrap ("(" ^ string_of_int n ^ " Solutions)"))
  | _ -> ps (td_wrap "(More than 2^32 Solutions!)")
  ]
 ; pl tr_end
ML/lexer.ml

@@ -240,7 +240,7 @@ value print_segment offset (phase,rword,transition) = do
 }
 ;
 (* Similarly for [Reader_plugin] mode (without offset) *)
-value print_ext_segment counter (phase,rword,_) =
+value print_ext_segment counter (phase,rword) =
   let print_pada rword =
     let word = Morpho_html.visargify rword in
     let ic = string_of_int counter in

@@ -266,7 +266,7 @@ value print_ext_segment counter (phase,rword,_) =
       ] in print_ext_tags [] taddhita_phase word sfx_tags
   ]
-; ps "'>"
+; ps "'>" (* closes <input *)
 ; let word = Morpho_html.visargify rword in
   ps (Canon.unidevcode word)
 ; ps td_end
ML/lexer.mli

@@ -52,7 +52,7 @@ module Lexer : functor (* takes its prelude and iterator control as parameters *)
   value all_checks : ref (list Viccheda.check);
   value un_analyzable : Word.word -> (list Disp.segment * Viccheda.resumption);
   value set_offset : (int * list Viccheda.check) -> unit;
-  value print_ext_segment : int -> Disp.segment -> int;
+  value print_ext_segment : int -> (Phases.phase * Word.word) -> int;
   value record_tagging : bool -> bool -> string -> int -> string ->
     list (Phases.phase * Word.word * 'a) -> list (int * int) -> unit;
   value return_tagging :
ML/parser.ml

@@ -20,7 +20,7 @@ open Html;
 open Web; (* ps pl abort truncation etc. [remote_server_host] *)
 open Cgi; (* get *)
 open Checkpoints;
-open Uoh_interface; (* Interface with UoH dependency parser *)
+open Scl_parser; (* Interface with UoH dependency parser *)

 module Prel = struct (* Parser's lexer prelude *)

@@ -52,8 +52,6 @@ end (* [Lexer_control] *)
 module Lex = Lexer.Lexer Prel Lexer_control
 (* [print_proj print_segment_roles print_ext_segment extract_lemma] *)
 ;
-module Ext = UOH Lex
-;
 value rpc = remote_server_host
 and remote = ref False (* local invocation of cgi by default *)
 ;

@@ -117,8 +115,9 @@ value analyse query output =
        ^ ";p=','" ^ string_of_int (find_len top_groups) ^ "')" ) ]
      ^ html_break)
 ; pl (xml_empty "p")
-; if scl_toggle then (* Call SCL parser *)
-     Ext.print_ext [ (1,List.rev output) ]
+; if scl_toggle then (* Call SCL parser *)
+     let segments = List.map (fun (ph,w,_) -> (ph,w)) output in
+     Scl_parser.print_scl [ List.rev segments ]
  else () (*i DEBUG ; Sys.command "ls -l > /tmp/SKT_TEMP/junk" i*)
 ; List.iter print_bucket top_groups
ML/reader.ml

@@ -31,10 +31,7 @@ open Web; (* ps pl abort etc. [remote_server_host] *)
 open Cgi; (* [get decode_url] *)
 open Phases; (* [Phases] *)
 open Rank; (* [Prel Lex segment_all iterate] *)
-open Uoh_interface; (* Interface with UoH dependency parser *)
-module Ext = UOH Lex (* [print_ext print_nn] *)
-;
 (* Reader interface *)
 (* Mode parameter of the reader. Controled by service Reader for respectively
    tagging, shallow parsing, or dependency analysis with the UoH parser. *)

@@ -129,19 +126,22 @@ value display limit mode text saved = fun
       }
     ]
   }
-| Analyse -> match saved with
-    [ [] -> Ext.print_ext (best_sols : list (int * list Rank.Lex.Disp.segment))
-    | [ (_,min_buck) :: _ ] ->
-        let zero_pen = List.append best_sols (List.rev min_buck) in
-        Ext.print_ext zero_pen
-    ]
+| Analyse -> (* [best_sols: list (int * list Rank.Lex.Disp.segment)] *)
+    let solutions = match saved with
+      [ [] -> best_sols
+      | [ (_,min_buck) :: _ ] -> List.append best_sols (List.rev min_buck)
+      ] in
+    let forget_transitions (phase,word,_) = (phase,word) in
+    let forget_index (_,segments) = List.map forget_transitions segments in
+    let segmentations = List.map forget_index best_sols in
+    Scl_parser.print_scl segmentations
 | _ -> ()
 ]
 }
 ]
 ;
-(* NB This reader is parametrized by an encoding function, that parses the
+(* NB This reader is parameterized by an encoding function, that parses the
    input as a list of words, according to various transliteration schemes.
   However, the use of "decode" below to compute the romanisation and devanagari
   renderings does a conversion through VH transliteration which may not be
ML/reader_plugin.ml  (deleted, file mode 100755 → 0)

(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                       Gérard Huet & Amba Kulkarni                      *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(* This is an adaptation of module Reader from Skt Heritage engine for
   external call as a plug-in. It prints on [stdout] an html document giving
   segmentation/tagging of its input.
   It computes a penalty of the various solutions, and returns all solutions
   with minimal penalties (with a further preference for the solutions
   having a minimum number of segments), using Constraints for ranking. *)

(*i module Reader_plugin = struct i*)

open Encode;
open Canon;
open Html;
open Web; (* ps, pl, etc. abort truncation *)
open Cgi;
open Morphology;
open Phases;

module Prelude = struct
value prelude () = ()
;
end (* Prelude *)
;
value iterate = ref True (* by default a chunk is a list of words *)
and complete = ref True (* by default we call the fuller segmenter *)
;
module Lexer_control = struct
value star = iterate;
value full = complete;
value out_chan = ref stdout; (* cgi writes on standard output channel *)
end (* [Lexer_control] *)
;
module Lex = Lexer.Lexer Prelude Lexer_control
;
type mode = [ Tag | Parse | Analyse ] (* Segmentation is now obsolete *)
;
(* Builds the penalty stack, grouping together equi-penalty items. *)
(* Beware, [make_groups] reverses the list of tags. *)
value make_groups tagger = comp_rec 1 []
  where rec comp_rec seg stack = fun (* going forward in time *)
  [ [] -> stack (* result goes backward in time *)
  | [ (phase,rword,_) :: rest ] -> (* we ignore euphony transition *)
      let word = List.rev rword in
      let keep =
        let tags = tagger phase word in
        [ Constraints.roles_of seg word tags :: stack ] in
      comp_rec (seg+1) keep rest
  ]
;
(* Computes minimum penalty in Parse mode *)
value minimum_penalty output =
  let tagger = Lex.extract_lemma in
  let out = List.rev output in
  let groups = make_groups tagger out in
  if groups = [] then failwith "Empty penalty stack !"
  else let sort_groups = Constraints.sort_flatten groups in
       let min_pen = match sort_groups with
         [ [] -> failwith "Empty penalty stack"
         | [ (pen,_) :: _ ] -> pen
         ] in
       min_pen
;
(* Same as [UoH_interface.print_ext_output] *)
value print_offline_output cho (n,output) =
  let ps = output_string cho in
  let print_segment = Lex.print_ext_segment in
  do
  { ps (xml_begin_with_att "solution" [ ("num",string_of_int n) ])
  ; ps "\n"
  ; let _ = List.fold_left Lex.print_ext_segment 1 (List.rev output) in ()
  ; ps (xml_end "solution")
  ; ps "\n"
  }
;
value print_offline_solutions cho = List.iter (print_offline_output cho)
;
(* Experimental segmentation plug-in of Amba Kulkarni's parser at UoH *)
(* Printing all segmentations on stdout *)
value print_offline = print_offline_solutions stdout
;
(* Compound minimum path penalty with solution length *)
value process_output mode ((_,output) as sol) =
  let min = minimum_penalty output in
  let m = Constraints.eval_penalty min in
  let length_penalty = List.length output in
  ((m+length_penalty,m),sol)
;
type tagging = (Phases.phase * Word.word * Lex.Disp.transition)
and solution = list tagging
and ranked_solution = (int (* rank *) * solution)
and bucket = (int (* length *) * list ranked_solution)
;
exception No_solution of Word.word
;
exception Solutions of option int and list ranked_solution and list bucket
(* [Solutions None sols saved] returns solutions sols within truncation limit
   [Solutions (Some n) sols saved] returns solutions sols within total n
   saved is the list of solutions of penalty 0 and worse length penalty *)
;
(* insert builds a triple [(p, sols, saved)] where sols is the list of all pairs
   [(m,sol)] such that ranked [sol] has minimal length penalty [p] and
   absolute penalty [m] and [saved] is the list of all ranked sols of length
   penalty $>$ [p] and absolute penalty 0, arranged in buckets by increasing
   length penalty *)
value insert ((pen,min),sol) ((min_pen,sols,saved) as current) =
  if sols = [] then (pen,[ (min,sol) ],[])
  else if pen > min_pen then
       if min > 0 then current (* sol is thrown away *)
       else (min_pen,sols,List2.in_bucket pen sol saved)
  else if pen = min_pen then (min_pen,[ (min,sol) :: sols ],saved)
  else (pen,[ (min,sol) ],
        let rescue = List.fold_right save sols [] in
        if rescue = [] then saved else [ (min_pen,rescue) :: saved ])
       where save (min,sol) rescued =
         if min = 0 then [ sol :: rescued ] else rescued
;
(* forget absolute penalties of solutions with minimal length penalty *)
value trim = List.map snd
;
(* does depth-first search in a stack of type [list (output * resumption)] *)
value dove_tail mode init = dtrec 1 (0,[],[]) init
  where rec dtrec n kept stack = (* invariant |stack|=|init| *)
  if n > truncation then
     let (_,sols,saved) = kept in
     raise (Solutions None (trim sols) saved)
  else do
  { let total_output = List.fold_right conc stack []
      where conc (o,_) oo = o @ oo in
    let pen_sol = process_output mode (n,total_output) in
    let kept_sols = insert pen_sol kept in
    dtrec (n+1) kept_sols (crank [] init stack)
      where rec crank acc ini = fun
      [ [ (_,c) :: cc ] -> match Lex.Viccheda.continue c with
          [ Some next -> List2.unstack acc [ next :: cc ]
          | None -> match ini with
              [ [ i :: ii ] -> crank [ i :: acc ] ii cc
              | _ -> raise (Control.Anomaly "Plugin dove_tail")
                     (* impossible by invariant *)
              ]
          ]
      | [] -> let (_,sols,saved) = kept_sols in
              raise (Solutions (Some n) (trim sols) saved)
      ]
  }
;
value segment_all mode chunks =
  let segs = List.fold_left init [] chunks
      where init stack chunk =
        let ini_cont = Lex.Viccheda.init_segment chunk in
        match Lex.Viccheda.continue ini_cont with
        [ Some c -> [ c :: stack ]
        | None -> raise (No_solution chunk)
        ] in
  dove_tail mode segs
;
value display limit mode text saved = fun
  [ [] -> ()
  | best_sols ->
      let zero_pen = match saved with
        [ [] -> best_sols
        | [ (_,min_buck) :: _ ] -> List.append best_sols (List.rev min_buck)
        ] in
      print_offline zero_pen
  ]
;
value process_sentence text us mode topic (sentence : string) encode =
  let chunker = if us then Sanskrit.read_raw_sanskrit
                else Sanskrit.read_sanskrit in
  let chunks = chunker encode sentence in
  do
  { let all_chunks = match topic with
      [ Some topic -> chunks @ [ code_string topic ]
      | None -> chunks
      ] in
    try segment_all mode all_chunks with
    [ Solutions limit revsols saved ->
        let sols = List.rev revsols in
        let _ = display limit mode text saved sols in
        True
    | No_solution chunk -> False
    ]
  }
;
value encode = Encode.switch_code "WX" (* encoding in WX as a normalized word *)
;
(* adapt with abort function : string -> string -> unit *)
value abort m1 m2 = raise (Failure (m1 ^ m2))
;
(* input: string is the text to be segmented/parsed *)
(* unsandhied: bool is True is input is unsandhied False if it is sandhied *)
(* topic is (Some "sa.h") (Some "saa") (Some "tat") or None if no topic *)
(* st:bool is True if stemmer for one word, False for tagging sentence *)
(* cp:bool is True if Complete mode, False for Simplified mode *)
value reader_engine input unsandhied topic st cp = do
  { Prelude.prelude ()
  ; if st then iterate.val := False else () (* word stemmer *)
  ; if cp then complete.val := True else () (* complete reader *)
    (* Contextual information from past discourse *)
  ; try process_sentence "" unsandhied Analyse topic input encode
        (* possibly use the returned bool value (success) in your control *)
    with [ Stream.Error _ -> abort "Illegal transliteration " input ]
  }
;
value safe_engine input unsandhied topic st cp =
  try reader_engine input unsandhied topic st cp with
  [ Sys_error s -> abort Control.sys_err_mess s (* file pb *)
  | Stream.Error s -> abort Control.stream_err_mess s (* file pb *)
  | Encode.In_error s -> abort "Wrong input " s
  | Exit (* Sanskrit *) -> abort "Wrong character in input - " "use ASCII"
  | Invalid_argument s -> abort Control.fatal_err_mess s (* sub *)
  | Failure s -> abort Control.fatal_err_mess s (* anomaly *)
  | End_of_file -> abort Control.fatal_err_mess "EOF" (* EOF *)
  | Not_found (* assoc *) -> abort Control.fatal_err_mess "assoc" (* anomaly *)
  | Control.Fatal s -> abort Control.fatal_err_mess s (* anomaly *)
  | Control.Anomaly s -> abort Control.fatal_err_mess ("Anomaly: " ^ s)
  | _ -> abort Control.fatal_err_mess "Unexpected anomaly"
  ]
;
(* call [safe_engine input unsandhied topic st cp] with proper parameters *)
let input = input_line stdin in
safe_engine input False None False True
;
(* eg. rAmovanaMgacCawi -> 3 solutions; second is good *)
(*i end; i*)
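The header comment of the deleted module describes its ranking policy: among all segmentations, keep those of minimal penalty, preferring solutions with fewer segments. A minimal sketch of that selection rule only (hypothetical standard-OCaml code, not part of the commit; names and the record type are illustrative):

(* Hypothetical illustration: keep the solutions whose combined score,
   constraint penalty plus number of segments, is minimal. *)
type 'a ranked = { penalty : int; segments : int; solution : 'a }

let best_solutions (sols : 'a ranked list) : 'a ranked list =
  let score r = r.penalty + r.segments in
  match sols with
  | [] -> []
  | _ ->
    let best = List.fold_left (fun acc r -> min acc (score r)) max_int sols in
    List.filter (fun r -> score r = best) sols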
ML/scl_parser.ml  (new file, file mode 0 → 100644)

(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                       Gérard Huet & Amba Kulkarni                      *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(* Interface with UoH dependency parser *)

open Html;
open Web; (* ps pl etc. *)
open Morphology; (* inflected lemma morphology *)
open Phases; (* Phases *)
open Dispatcher; (* Dispatch *)
open SCLpaths; (* [scl_url scl_cgi] *)

module Prel = struct
value prelude () = Web.reader_prelude Web.reader_title;
end (* Prel *)
;
(* Global parameters of the lexer *)
value iterate = ref True (* by default a chunk is a list of words *)
and complete = ref True (* by default we call the fuller segmenter *)
and output_channel = ref stdout (* by default cgi output on standard output *)
;
module Lexer_control = struct
value star = iterate;
value full = complete;
value out_chan = output_channel;
end (* [Lexer_control] *)
;
(* Multi-phase lexer *)
module Lex = Lexer.Lexer Prel Lexer_control (* [print_ext] *)
;
value print_scl_segment = Lex.print_ext_segment
;
value print_scl_output output =
  List.fold_left print_scl_segment 1 (List.rev output)
;
value print_scl_solutions s = let _ = print_scl_output s in ()
;
(* Invocation of UoH's CSL parser for dependency graph display *)
value print_scl1 (solutions : list (Phases.phase * Word.word)) =
  let svg_interface_url = scl_cgi ^ "SHMT/" in
  do
  { ps ("<script type=\"text/javascript\" src=\"" ^ scl_url
        ^ "js_files/dragtable.js\"></script>")
  ; ps ("<form name=\"word-order\" method=\"POST\" action =\"" ^ svg_interface_url
        ^ "prog/Word_order/call_heritage2anu.cgi\">\n")
  ; ps ("<table class=\"draggable\">")
  ; ps tr_begin
  ; print_scl_solutions solutions
  ; ps tr_end
  ; ps table_end
  ; ps (submit_input "Submit")
  }
;
(* We restrict to the first solution - TEMPORARY *)
value print_scl sols = match sols with
  [ [] -> failwith "No sol"
  | [ s :: _ ] -> print_scl1 s
  ]
;
(* end; *)
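As the reader.ml and parser.ml hunks of this commit show, callers of the new module first drop the euphony transition from each segment, since [Scl_parser.print_scl] works on bare (phase, word) pairs. A small sketch of that calling convention only (hypothetical standard-OCaml code with placeholder types, not part of the commit):

(* Placeholder types standing in for Phases.phase, Word.word and the lexer
   transition; only the shape of the conversion matters here. *)
type phase = string
type word = int list
type transition = unit

(* One segment of a solution, as produced by the segmenter. *)
type segment = phase * word * transition

(* Forget the transition component before calling the SCL printer. *)
let forget_transition ((ph, w, _) : segment) : phase * word = (ph, w)

let to_scl_input (solution : segment list) : (phase * word) list =
  List.map forget_transition solution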
ML/uoh_interface.ml

@@ -54,22 +54,21 @@ module UOH
 (* Interface with Amba Kulkarni's parser at UoH - Analysis mode *)
 (****************************************************************)
-value print_ext_output (_,output) =
-  List.fold_left Lex.print_ext_segment 1 (List.rev output)
-;
-(* Delimitor for offline printing and piping into UoH's parser *)
+(* UNUSED Delimitor for offline printing and piping into UoH's parser
 value delimitor = fun
 [ Iic | Iic2 | A | An | Iicv | Iicc | Iik | Iikv | Iikc | Iiif | Iiy -> "-"
 | Iiv | Iivv | Iivc -> "++"
 | Pv | Pvk | Pvkc | Pvkv -> failwith "No more Pv segments"
 | _ -> " "
 ]
-;
+; *)
+value print_ext_output (_,output) =
+  List.fold_left Lex.print_ext_segment 1 (List.rev output)
+;
 value print_ext_solutions s = let _ = print_ext_output s in ()
 ;
 (* Invocation of UoH's CSL parser for dependency graph display *)
 value print_ext1 (solutions : (int * list Lex.Disp.segment)) = do
 { ps ("<script type=\"text/javascript\" src=\"" ^ scl_url
       ^ "js_files/dragtable.js\"></script>")

@@ -83,7 +82,8 @@ value print_ext1 (solutions : (int * list Lex.Disp.segment)) = do
 ; ps (submit_input "Submit")
 }
 ;
-value print_ext sols = match sols with
+(* We restrict to the first solution - TEMPORARY *)
+value print_ext sols = match sols with
 [ [] -> failwith "No sol"
 | [ s :: _ ] -> print_ext1 s
 ]
ML/version.ml

@@ -8,4 +8,4 @@
 (**************************************************************************)