interface.ml 26.7 KB
Newer Older
huet's avatar
huet committed
1 2 3 4 5 6
(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                        Gérard Huet & Pawan Goyal                       *)
(*                                                                        *)
7
(* ©2018 Institut National de Recherche en Informatique et en Automatique *)
huet's avatar
huet committed
8 9
(**************************************************************************)

10
(* Sanskrit Reader Summarizing interface. Yields sktgraph.cgi *)
huet's avatar
huet committed
11 12 13 14 15 16

(* We construct a CGI Interface displaying the segmentation graph in which the 
   user may indicate segments as mandatory checkpoints. At any point he may
   call the standard displaying of all, or of preferred solutions consistent 
   with the current checkpoints. An undo button allows backtracking. *)

17 18
module Interface = struct

19
open Graph_segmenter; (* [Segment cur_chunk set_cur_offset graph visual] *)
huet's avatar
huet committed
20 21 22
open Phases; (* [Phases] *) 
open Phases; (* [phase is_cache generative] *) 
open Dispatcher; (* [transducer_vect phase Dispatch transition trim_tags] *) 
23 24 25
open Html; (* html constructors *)
open Web; (* [ps pl abort reader_cgi scl_toggle] etc. *) 
open Cgi; (* [url get decode_url] *)
huet's avatar
huet committed
26 27 28 29

module Prel = struct (* Interface's lexer prelude *)

 value prelude () = do
30
  { pl http_header
huet's avatar
huet committed
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
  ; page_begin graph_meta_title 
  ; pl (body_begin Chamois_back)
  ; pl interface_title
  ; pl (h3_begin C3 ^ "Click on " ^ html_green check_sign 
                    ^ " to select segment, click on " ^ html_red x_sign 
                    ^ " to rule out segment" ^ h3_end)
  ; pl (h3_begin C3 ^ mouse_action_help 
                    ^ " on segment to get its lemma" ^ h3_end)
  ; open_page_with_margin 15
  }
;
 end (* Prel *)
;
(* Service routines for morphological query, loading the morphology banks *)

module Lemmas = Load_morphs.Morphs Prel Phases  
;
open Lemmas (* [tags_of morpho] *)
;
open Load_transducers (* [Trans] *)
;
module Transducers = Trans Prel
;
module Machine = Dispatch Transducers Lemmas
;
56
open Machine 
huet's avatar
huet committed
57
;
Gérard Huet's avatar
Gérard Huet committed
58 59
(* At this point we have a Finite Eilenberg machine ready to instantiate *)
(* the Eilenberg component of the Segment module.                        *)
huet's avatar
huet committed
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76

(* Viccheda sandhi splitting *)

(* Global parameters of the lexer *)
value iterate = ref True (* by default a chunk is a list of words *)
and complete  = ref True (* by default we call the complete segmenter *)
and output_channel = ref stdout (* by default cgi output on standard output *)
;
module Segment_control = struct
 value star = iterate;  (* vaakya vs pada *)
 value full = complete; (* complete vs simplified *)
 value out_chan = output_channel
; 
end (* [Segment_control] *)
;
module Viccheda = Segment Phases Machine Segment_control
;
77
open Viccheda (* [segment_iter visual_width] etc. *)
huet's avatar
huet committed
78 79 80
;
(* At this point we have the sandhi inverser segmenting engine *)

81
(* Separates tags of homophonous segments vertically *)
huet's avatar
huet committed
82 83 84 85 86 87 88 89 90
value fold_vert f = fold 1 where rec fold n = fun
  [ [] -> () 
  | [ x ] -> f n x
  | [ x :: l ] -> do { f n x; ps html_break; fold (n+1) l }
  ]
;
value print_morph pvs seg_num cached gen form n tag = 
  Morpho_html.print_graph_link pvs cached form (seg_num,n) gen tag 
;
91
(* tags : Morphology.multitag is the multi-tag of the form of a given phase *)
huet's avatar
huet committed
92 93 94 95 96
value print_tags pvs seg_num phase form tags =
  let gen = generative phase 
  and cached = is_cache phase in 
  let ok_tags = if pvs = [] then tags 
                else trim_tags (generative phase) form (Canon.decode pvs) tags
97
  (* NB Existence of the segment warrants that [ok_tags] is not empty *)
huet's avatar
huet committed
98 99 100 101 102 103 104 105 106 107 108
  and ptag = print_morph pvs seg_num cached gen form in 
  fold_vert ptag ok_tags  
;
(*i EXPERIMENTAL: taddhitaantas (ad-hoc) i*)
value print_morph_tad pvs seg_num cache gen stem sfx n tag = 
  Morpho_html.print_graph_link_tad pvs cache stem sfx (seg_num,n) gen tag  
; 
value print_tags_tad pvs seg_num phase stem sfx sfx_tags =  
  let ptag = print_morph_tad pvs seg_num False (generative phase) stem sfx in 
  fold_vert ptag sfx_tags 
;
109
(* This is called "printing morphology interface style". *)
huet's avatar
huet committed
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
value print_morpho phase word = 
     match tags_of phase word with 
        [ Atomic tags -> print_tags [] 0 phase word tags 
        | Preverbed (_,phase) pvs form tags -> print_tags pvs 0 phase form tags
        ]
;

(* Parsing mandatory checkpoints *)
open Checkpoints; (* [string_points] *) 

value rpc = Paths.remote_server_host 
and remote = ref False (* local invocation of cgi by default 
                          (switched on to [True] by "abs" cgi parameter) *)
;
value invoke cgi = if remote.val then rpc ^ cgi else cgi 
;
value mem_cpts ind phase_pada =  memrec where rec memrec = fun
  [ [] -> False
  | [ (k,pw,_) :: rest ] -> (k=ind && pw = phase_pada) || memrec rest
  ]
;
value unanalysed (phase,_) = (phase=Phases.unknown)
;
value already_checked = html_blue check_sign
(*i TODO: call to undo this specific checkpoint i*)
;
value call_back text cpts (k,seg) conflict = 
  if mem_cpts k seg cpts then already_checked 
  else if not conflict && not (unanalysed seg) then already_checked
  else let choices b = string_points [ (k,seg,b) :: cpts ] 
       and (out_cgi,sign,color) = 
           if unanalysed seg then (user_aid_cgi,spade_sign,Red_) 
                             else (graph_cgi,   check_sign,Green_) in
       let cgi_select = out_cgi ^ "?" ^ text ^ ";cpts=" ^ (choices True)
       and cgi_reject = out_cgi ^ "?" ^ text ^ ";cpts=" ^ (choices False) in
       anchor color (invoke cgi_select) sign ^ 
          if unanalysed seg then "" else anchor Red_ (invoke cgi_reject) x_sign
;
value call_reader text cpts mode = (* mode = "o", "p", "n" or "t" *)
  let cgi = reader_cgi ^ "?" ^ text ^ ";mode=" ^ mode ^ 
            ";cpts=" ^ string_points cpts in 
  anchor Green_ (invoke cgi) check_sign
;
value call_parser text cpts =
  let cgi = parser_cgi ^ "?" ^ text ^ ";mode=p" ^ 
            ";cpts=" ^ string_points cpts ^ ";n=1" in
  anchor Green_ (invoke cgi) check_sign
;
158
(* Legacy interface with Sanskrit Library [
huet's avatar
huet committed
159 160 161 162 163 164 165 166 167
value call_SL text cpts mode corpus solutions sent_id link_num = 
  let cgi = tomcat ^ corpus ^ "/SaveTagging?slp1Sentence=" 
            ^ text ^ "&numSolutions=" ^ (string_of_int solutions) 
            ^ "&submit=submit&command=resend&sentenceNumber=" ^ sent_id 
            ^ "&linkNumber=" ^ link_num ^ "&displayEncoding=roman&"
            ^ "inflectionFormat=SL&inputEncoding=slp1&OS=MacOS&cpts=" 
            ^ string_points cpts in
  anchor Green_ (invoke cgi) check_sign
;
168 169 170 171
value invoke_SL text cpts corpus_id count sent_id link_num =
  ps (td_wrap (call_SL text cpts "t" corpus_id count sent_id link_num 
               ^ "Sanskrit Library Interface"))
;] *)
huet's avatar
huet committed
172 173 174 175 176
value sort_check cpts = 
  let compare_index (a,_,_) (b,_,_) = compare a b in
  List.sort compare_index cpts
;
value seg_length = fun
177 178
 [ [ -2 :: rest ] -> Word.length rest (* lopa does not count *)
 | w -> Word.length w 
huet's avatar
huet committed
179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
 ]
;
value rec merge_rec lpw = fun 
  [ [] -> lpw
  | [ (p, lw) :: rest ] -> merge_rec (fill p lpw lw) rest
       where rec fill p lpw = fun
         [ [] -> lpw
         | [ wh :: rest1 ] -> fill p [ (p,wh) :: lpw ] rest1
         ]
  ] 
;
value build_visual k segments = 
  if segments = [] then () else 
  let phw = merge_rec [] segments in
  let comp_length (_,(a,_)) (_,(b,_)) = compare (seg_length a) (seg_length b) in
  let sorted_seg = List.rev (List.sort comp_length phw) in
  ass_rec sorted_seg
    where rec ass_rec seg =
      let start_ind = find_ind_rec 0
          where rec find_ind_rec n = 
          if k < visual_width.(n) then find_ind_rec (n+1) else n in 
      match seg with
      [ [] -> ()
      | [ (phase,(w1,tr)) :: rest ] -> match phase with 
           [ Phases.Pv | Phases.Pvk | Phases.Pvkc | Phases.Pvkv -> 
             failwith "Preverb in build_visual"
           | _ -> do
             { visual.(start_ind) := visual.(start_ind) @ [ (w1,tr,phase,k) ]
             ; visual_width.(start_ind) := (seg_length w1) + k
             ; ass_rec rest
             }
           ]
      ]
;
(* We check whether the current segment [(w,tr,phase,k)] is conflicting with 
214 215
   others at previous offset [n]; if not it is mandatory and marked blue. *)
(* Returns True for blue mandatory segments, False for green/red optional ones *)
huet's avatar
huet committed
216
(* Warning: hairy code, do not change without understanding the theory.  *)
217 218 219 220 221 222 223 224 225 226 227 228 229
value is_conflicting ((w,tr,ph,k) as segment) =
 let l = seg_length w in is_conflicting_rec 0
 where rec is_conflicting_rec n = (* n is position in input string *)
 match visual.(n) with 
 [ [] -> False (* will exit here when n is length of input *) 
 | segs -> does_conflict segs (* we search for conflicting segments *) 
     where rec does_conflict = fun 
       [ [] -> is_conflicting_rec (n+1) (* go to next input position *)
       | [ ((w',tr',ph',k') as segment') :: rest ] -> 
           if segment'=segment then (* skip itself *) does_conflict rest
           else let l' = seg_length w' in
                if (k'<=k && k'+l'-1>k) (* w inside w' *)
                || (k'<=k && k'+l'-1>=k && l=1) (* w is a or aa *)
huet's avatar
huet committed
230
      (* This condition is necessary for the overlapping case *)
231 232 233
                || (k<=k' && k+l-1>k') then 
                   if k+l-1=k' then let r' = Word.mirror w' in match_tr tr
      (* This is to check for the overlapping case, occurs when [k=k', l=1]. 
huet's avatar
huet committed
234 235 236 237 238 239
         We need to check the sandhi conditions to decide whether this is a case 
         of overlap or conflict. *)
                       where rec match_tr = fun
                         [ [] -> True
                         | [ v :: rst ] -> match v with
                             [ [] -> match_tr rst
240
                             | _  -> if Word.prefix v r' 
huet's avatar
huet committed
241 242 243 244
                                        then does_conflict rest
                                     else match_tr rst
                             ]
                         ]
245 246 247 248 249
                   else if (k'<=k && k'+l'-1>=k && l=1) then match_tr' tr'
      (* For the case with [l=1], this is to check whether w is the only 
         possible v for w', in which case it is an overlap returning a blue sign.
         If w' has any other possible v's, there is a conflict. *)
                           where rec match_tr' = fun
huet's avatar
huet committed
250
                             [ [] -> does_conflict rest
251 252 253 254
                             | [ v :: rst ] -> match v with 
                                   [ [] -> does_conflict rest
                                   | _ -> if v = w then match_tr' rst else True
                                   ] 
huet's avatar
huet committed
255
                             ]
256 257
                        else True 
                else does_conflict rest
huet's avatar
huet committed
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326
       ]
 ]
;
value rec find_conflict_seg acc l = fun
  [ [] -> List.rev acc
  | [ (w1,tr,phase,k) :: rest ] ->
      let conflict = is_conflicting (w1,tr,phase,k) in
      let seg_here = (w1,phase,k,conflict) in 
      find_conflict_seg [ seg_here :: acc ] l rest
  ]
;
value rec find_conflict l = match visual.(l) with
  [ [] -> ()
  | segs -> do 
    { visual_conf.(l) := find_conflict_seg [] l segs
    ; find_conflict (succ l) 
    }
  ]
;
value make_visual n = vrec 0 
  where rec vrec k = do
    { build_visual k graph.(k)
    ; if k = n-1 then () else vrec (succ k)
    } 
;
value rec print_extra = fun 
  [ 0 -> ()
  | l -> do { ps (td_wrap ""); print_extra (l-1) }
  ]
and fixed_space = td_wrap "&nbsp;"
;
value rec print_first_server chunk = 
  match Word.length chunk with
  [ 0 -> ps fixed_space
  | l -> match chunk with 
         [ [] -> ps fixed_space
         | [ st :: rest ] -> let to_print = Canon.uniromcode [ st ] in do
             { ps (td_wrap to_print)
             ; print_first_server rest
             }
         ]
  ]
;
value call_back_pseudo text cpts ph newpt =
  if List.mem newpt cpts then already_checked 
  else let list_points = [ newpt :: cpts ] in
       let out_cgi = user_aid_cgi in
       let cgi = out_cgi ^ "?" ^ text ^ ";cpts=" ^ (string_points list_points) in
       anchor_pseudo (invoke cgi) ph
;
value un_analyzable (chunk:Word.word) = (Phases.Unknown,Word.mirror chunk)
;
value rec print_first text cpts chunk_orig chunk chunk_ind = 
  match Word.length chunk with
  [ 0 -> ps fixed_space
  | l -> match chunk with 
         [ [] -> ps fixed_space
         | [ st :: rest ] -> let to_print = Canon.uniromcode [ st ] in do
             { let unknown_chunk = (chunk_ind,un_analyzable chunk_orig,True) in
               ps (td_wrap (call_back_pseudo text cpts to_print unknown_chunk))
             ; print_first text cpts chunk_orig rest chunk_ind
             }
         ]
  ]
 ;
(* Making use of the index for printing the chunk callback *)
value rec print_all text cpts chunks index = match chunks with
  [ [] -> ()
  | [ chunk :: rest ] -> do
327 328 329
      { print_first text cpts chunk chunk index
      ; print_all text cpts rest (succ (Word.length chunk))
      }
huet's avatar
huet committed
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
  ]
;
value print_word last_ind text cpts (rword,phase,k,conflict) = 
  let word = Word.mirror rword in do
  { let extra_space = k-last_ind in 
    if extra_space > 0 then print_extra extra_space else ()
(*i ZZ following not implementable with a fixed css -- not HTML5 compliant i*)
  ; ps (td_begin_att [ ("colspan",string_of_int (seg_length word))
                     ; ("align","left") 
                     ])
  ; let back = background (color_of_phase phase) in 
    pl (table_begin back)
  ; ps tr_begin
  ; ps ("<td " ^ display_morph_action ^ "=\"showBox('")
  ; print_morpho phase word
  ; let close_box = 
        "<a href=&quot;javascript:hideBox()&quot;> " ^ x_sign ^ "</a>', '" in 
    ps (close_box ^ rgb (color_of_phase phase) ^ "', this, event)\">")
  ; Morpho_html.print_final rword (* visarga correction *)
  ; ps td_end
  ; ps tr_end
  ; ps table_end
  ; ps (call_back text cpts (k,(phase,rword)) conflict)
  ; ps td_end
  }
;
value max_col = ref 0
;
value print_row text cpts =  print_this text cpts 0 
  where rec print_this text cpts last_ind = fun 
  [ [] -> let adjust = max_col.val - last_ind in
          if adjust > 0 then print_extra adjust else ()
  | [ (word,phase,k,conflict) :: rest ] -> do
      { print_word last_ind text cpts (word,phase,k,conflict)
      ; print_this text cpts (k + seg_length word) rest
      }
  ]
;
value print_interf text cpts () = vgrec 0 
  where rec vgrec k = 
  match visual_width.(k) with
  [ 0 -> ()
  | _ -> do
    { ps tr_begin
    ; print_row text cpts visual_conf.(k)
    ; pl tr_end
    ; vgrec (succ k)
    }
  ]
;
value update_col_length chunk = 
  max_col.val := succ (max_col.val + Word.length chunk)
;
383
value update_text_with_sol text count = text ^ ";allSol=" ^ string_of_int count
huet's avatar
huet committed
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
;
value call_undo text cpts  = 
  let string_pts = match cpts with 
      [ [] -> "" (* Could raise warning "undo stack empty" *)
      | [ _ :: rest ] -> string_points rest
      ] in
  let cgi = graph_cgi ^ "?" ^ text ^ ";cpts=" ^ string_pts in
  anchor Green_ (invoke cgi) check_sign
;
(* The main procedure for computing the graph segmentation structure *)
value check_sentence translit us text_orig checkpoints sentence 
             (* finally SL corpus links: *) sol_num corpus sent_id link_num = 
  let encode = Encode.switch_code translit in
  let chunker = if us (* sandhi undone *) then Sanskrit.read_raw_sanskrit 
                else (* blanks non-significant *) Sanskrit.read_sanskrit in
  let chunks = chunker encode sentence in 
  let devachunks = List.map Canon.unidevcode chunks in
  let devainput = String.concat " " devachunks  
  and cpts = sort_check checkpoints in 
  let _ = chkpts.all_checks := cpts
404
  and (full,count) = segment_iter chunks in (* full iff all chunks segment *)
huet's avatar
huet committed
405 406 407 408 409 410
  let text = match sol_num with
             [ "0" -> update_text_with_sol text_orig count
             | _ -> text_orig
             ] in do
  { make_visual cur_chunk.offset
  ; find_conflict 0
Gérard Huet's avatar
Gérard Huet committed
411 412 413 414 415 416
  ; html_break |> pl
  ; html_latin16 "Sentence: " |> pl
  ; deva16_blue devainput |> ps (* devanagari *)
  ; html_break |> ps
  ; div_begin Latin16 |> ps
  ; table_begin Spacing20 |> pl
417
  ; tr_begin |> pl (* tr begin *)
Gérard Huet's avatar
Gérard Huet committed
418
  ; td_wrap (call_undo text checkpoints ^ "Undo") |> ps
419 420
  ; let call_scl_parser n = (* invocation of scl parser *)
        if scl_toggle then
421
           td_wrap (call_reader text cpts "o" ^ "UoH Analysis Mode") |> ps
422
        else () (* [scl_parser] is not visible unless toggle is set *) in
423 424 425 426 427 428 429 430
    if count > max_count then 
       (* too many solutions would choke the parsers *) 
       td_wrap ("(" ^ string_of_int count ^ " Solutions)") |> ps
    else if count=1 (* Unique remaining solution *) then do
            { td_wrap (call_parser text cpts ^ "Unique Solution") |> ps
            ; call_scl_parser 1
            }
         else do
Gérard Huet's avatar
Gérard Huet committed
431
       { td_wrap (call_reader text cpts "p" ^ "Filtered Solutions") |> ps
432
       ; let info = string_of_int count ^ if full then "" else " Partial" in 
Gérard Huet's avatar
Gérard Huet committed
433
         td_wrap (call_reader text cpts "t" ^ "All " ^ info ^ " Solutions") |> ps
434
       ; call_scl_parser count
Gérard Huet's avatar
Gérard Huet committed
435
       } 
436
  ; tr_end |> pl   (* tr end *)
Gérard Huet's avatar
Gérard Huet committed
437 438 439 440 441 442
  ; table_end |> pl
  ; div_end |> ps (* Latin16 *)
  ; html_break |> pl
  ; div_begin Latin12 |> ps
  ; table_begin Tcenter |> pl
  ; tr_begin |> ps
huet's avatar
huet committed
443 444 445
  ; List.iter update_col_length chunks 
  ; if Paths.platform="Station" then print_all text checkpoints chunks 0
                                else List.iter print_first_server chunks
Gérard Huet's avatar
Gérard Huet committed
446
  ; tr_end |> pl
huet's avatar
huet committed
447
  ; print_interf text checkpoints ()
Gérard Huet's avatar
Gérard Huet committed
448 449 450
  ; table_end |> pl
  ; div_end |> ps (* Latin12 *)
  ; html_break |> pl
huet's avatar
huet committed
451 452 453 454 455 456 457
  ; reset_graph () 
  ; reset_visual ()
  ; set_cur_offset 0
  ; chkpts.segment_checks := []
  ; max_col.val := 0
  }
;
458
value arguments trans lex cache st us cp input topic abs sol_num corpus id ln
459
                corpus_permission corpus_dir sentence_no =
huet's avatar
huet committed
460 461 462 463 464 465 466 467 468
  "t=" ^ trans ^ ";lex=" ^ lex ^ ";cache=" ^ cache ^ ";st=" ^ st ^ ";us=" ^ us ^
  ";cp=" ^ cp ^ ";text=" ^ input ^ ";topic=" ^ topic ^ ";abs=" ^ abs ^ 
  match sol_num with
    [ "0" -> ""
    | n -> ";allSol=" ^ n
    ] ^
  match corpus with
    [ "" -> ""
    | c -> ";corpus=" ^ c ^ ";sentenceNumber=" ^ id ^ ";linkNumber=" ^ ln
469
    ] ^
470
  ";" ^ Params.corpus_permission ^ "=" ^ corpus_permission ^
471 472
  ";" ^ Params.corpus_dir ^ "=" ^ corpus_dir ^
  ";" ^ Params.sentence_no ^ "=" ^ sentence_no
huet's avatar
huet committed
473 474 475 476 477 478 479 480 481 482
;

(* Cache management *)
value make_cache_transducer (cache : Morphology.inflected_map) = 
  let deco_cache = Mini.minimize (Deco.forget_deco cache) in
  let auto_cache = Automaton.compile Deco.empty deco_cache in do
  { Gen.dump cache public_cache_file (* for [Load_morphs] *)
  ; Gen.dump auto_cache public_transca_file (* for [Load_transducers] *)
  }
;
483
(* We fill gendered entries incrementally in [public_cache_txt_file] *)
huet's avatar
huet committed
484 485 486 487 488 489 490
value append_cache entry gender =    
  let cho = open_out_gen [ Open_wronly; Open_append; Open_text ] 0o777 
                         public_cache_txt_file in do
  { output_string cho ("[{" ^ entry ^ "}] ({" ^ gender  ^ "})\n")
  ; close_out cho
  }
;
491
value save_button query nb_sols =
492 493 494
  center_begin ^
  cgi_begin save_corpus_cgi "" ^
  hidden_input Save_corpus_params.state (escape query) ^
495 496
  hidden_input Save_corpus_params.nb_sols (nb_sols |> string_of_int |> escape) ^
  submit_input "Save" ^ 
497 498
  cgi_end ^
  center_end
499
;
500
value quit_button corpmode corpdir sentno =
501
  let submit_button_label = Web_corpus.(
502
    match corpmode with
503 504
    [ Annotator -> "Abort"
    | Reader | Manager -> "Continue reading"
Gérard Huet's avatar
Gérard Huet committed
505 506
    ])
  and permission = Web_corpus.string_of_permission corpmode in
507
  center_begin ^
508
      cgi_begin (url corpus_manager_cgi ~fragment:sentno) "" ^
Gérard Huet's avatar
Gérard Huet committed
509 510 511 512
           hidden_input Params.corpus_dir corpdir ^
           hidden_input Params.corpus_permission permission ^
           submit_input submit_button_label ^
      cgi_end ^
513
  center_end
514
;
huet's avatar
huet committed
515 516 517 518 519
(* Main body of graph segmenter cgi *)
value graph_engine () = do
  { Prel.prelude () 
  ; let query = Sys.getenv "QUERY_STRING" in
    let env = create_env query in
520 521 522 523 524
    (* Multiple environment variables according to modes of use are: 
       text topic st cp us t lex cache abs cpts (standard mode) 
       allSol (deprecated Validate mode)
       corpus sentenceNumber linkNumber (Corpus mode)
       corpdir sentno corpmode (defined in Params) 
Gérard Huet's avatar
Gérard Huet committed
525
       guess gender revised [rev_off] [rev_ind] (User-aid) *)
526
    let url_encoded_input = get "text" env "" 
huet's avatar
huet committed
527 528 529 530 531 532 533 534 535 536 537 538 539 540
    and url_encoded_topic = get "topic" env "" (* topic carry-over *)
    and st = get "st" env "t" (* sentence parse default *)
    and cp = get "cp" env "t" (* complete mode default *)
    and us = get "us" env "f" (* sandhied text default *)
    and translit = get "t" env Paths.default_transliteration (* translit input *)
    and lex = get "lex" env Paths.default_lexicon (* lexicon choice *)
    and cache = get "cache" env "f" (* no cache default *) in
    let () = cache_active.val := cache 
    and abs = get "abs" env "f" (* default local paths *) in 
    let lang = language_of lex (* language default *)
    and input = decode_url url_encoded_input (* unnormalized string *)
    and uns = us="t" (* unsandhied vs sandhied corpus *) 
    and () = if st="f" then iterate.val:=False else () (* word stemmer? *)
    and () = if cp="f" then complete.val:=False else () (* simplified reader? *) 
541 542
    and () = toggle_lexicon lex (* sticky lexicon switch *)
    and corpus = get "corpus" env "" 
huet's avatar
huet committed
543 544 545
    and sent_id = get "sentenceNumber" env "0" 
    and link_num = get "linkNumber" env "0" (* is there a better default? *)
    and sol_num = get "allSol" env "0" in (* Needed for Validate mode *)
546 547 548
    let url_enc_corpus_permission = (* Corpus mode *)
        get Params.corpus_permission env "true" in
    let corpus_permission = 
549
      url_enc_corpus_permission
550
      |> decode_url
551
      |> Web_corpus.permission_of_string in
552 553
    let corpus_dir = get Params.corpus_dir env "" in
    let sentence_no = get Params.sentence_no env "" in
huet's avatar
huet committed
554 555
    let text = arguments translit lex cache st us cp url_encoded_input
                         url_encoded_topic abs sol_num corpus sent_id link_num
556
                         url_enc_corpus_permission corpus_dir sentence_no
huet's avatar
huet committed
557 558 559 560
    and checkpoints = 
      try let url_encoded_cpts = List.assoc "cpts" env in (* do not use get *)
          parse_cpts (decode_url url_encoded_cpts)
      with [ Not_found -> [] ]
561
    and guess_morph = decode_url (get "guess" env "") (* User-aid guessing *)
huet's avatar
huet committed
562 563
    and pseudo_gender = decode_url (get "gender" env "") in 
    let _ = if String.length guess_morph > 0 && Paths.platform="Station" then
564
               (* User-aid cache acquisition *)
huet's avatar
huet committed
565 566 567 568 569
               let (entry,gender) = match pseudo_gender with 
                                    [ "" -> parse_guess guess_morph 
                                    | g -> (guess_morph,g) 
                                    ] in do
               { append_cache entry gender
570
               ; let cache_txt_file = public_cache_txt_file in
huet's avatar
huet committed
571
                 let cache = Nouns.extract_current_cache cache_txt_file in
572
                 make_cache_transducer cache
huet's avatar
huet committed
573 574
               }
            else () in
575 576
    let revised = decode_url (get "revised" env "") (* User-aid revision *)
    and rev_off = int_of_string (get "rev_off" env "-1") 
577
    and rev_ind = int_of_string (get "rev_ind" env "-1") in 
huet's avatar
huet committed
578 579
   try do
   { match (revised,rev_off,rev_ind) with
580 581 582 583
     [ ("",-1,-1) -> (* Standard input processing *** main call *** *)
       check_sentence translit uns text checkpoints input sol_num
                      corpus sent_id link_num
     | (new_word,word_off,chunk_ind) (* User-aid revision *) -> 
huet's avatar
huet committed
584 585 586 587 588 589 590 591 592 593 594 595 596 597 598
       let chunks = Sanskrit.read_sanskrit (Encode.switch_code translit) input in
       let rec decoded init ind = fun
           [ [] -> String.sub init 0 ((String.length init)-1)
           | [ a :: rest ] -> 
               let ind' = ind+1 
               and init' = if ind = chunk_ind then init ^ new_word ^ "+"
                           else init ^ Canon.switch_decode translit a ^ "+" in
               decoded init' ind' rest
           ] in
       let updated_input = decoded "" 1 chunks in
       let rec find_word_len cur_ind = fun 
           [ [] -> 0
           | [ a :: rest ] -> if cur_ind = chunk_ind then Word.length a
                              else find_word_len (cur_ind+1) rest
           ] in
599 600
       let word_len = find_word_len 1 chunks 
       and new_chunk_len = Word.length (Encode.switch_code translit revised) in
Gérard Huet's avatar
Gérard Huet committed
601
       let diff = new_chunk_len-word_len in
huet's avatar
huet committed
602
       let revised_check = 
603
         let revise (k,sec,sel) = (if k<word_off then k else k+diff,sec,sel) in
huet's avatar
huet committed
604 605 606
         List.map revise checkpoints
       and updated_text = arguments translit lex cache st us cp updated_input
                            url_encoded_topic abs sol_num corpus sent_id link_num
607
                            url_enc_corpus_permission corpus_dir sentence_no
huet's avatar
huet committed
608 609
       and new_input = decode_url updated_input in
       check_sentence translit uns updated_text revised_check 
610
                      new_input sol_num corpus sent_id link_num
huet's avatar
huet committed
611 612 613 614 615 616 617
     ]
     (* automatically refreshing the page only if guess parameter *)
   ; if String.length guess_morph > 0 then 
        ps ("<script>\nwindow.onload = function () {window.location=\"" ^
            graph_cgi ^ "?" ^ text ^  
            ";cpts=" ^ (string_points checkpoints) ^ "\";}\n</script>")
     else ()
Idir Lankri's avatar
Idir Lankri committed
618
     (* Save sentence button *)
619
   ; if corpus_permission = Web_corpus.Annotator then
620 621
     (* TODO: use [segment_iter] to compute the nb of sols instead of
        passing 0 to [nb_sols]. *)
622 623
        save_button query 0 |> pl
     else () 
624
   ; html_break |> pl
625 626
     (* Quit button: continue reading (reader mode) 
                  or quit without saving (annotator mode) *)
627
   ; if sentence_no <> "" then
628 629
        quit_button corpus_permission
                    (decode_url corpus_dir) (decode_url sentence_no) |> pl
Gérard Huet's avatar
Gérard Huet committed
630
     else ()
huet's avatar
huet committed
631 632 633 634 635 636 637 638 639
   ; close_page_with_margin ()
   ; page_end lang True
   }
   with 
 [ Sys_error s         -> abort lang Control.sys_err_mess s (* file pb *)
 | Stream.Error s      -> abort lang Control.stream_err_mess s (* file pb *)
 | Encode.In_error s   -> abort lang "Wrong input " s
 | Exit (* Sanskrit *) -> abort lang "Wrong character in input" "" 
 | Overflow            -> abort lang "Maximum input size exceeded" ""
640
 | Invalid_argument s  -> abort lang Control.fatal_err_mess s (* sub array *)
huet's avatar
huet committed
641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658
 | Failure s           -> abort lang Control.fatal_err_mess s (* anomaly *)
 | End_of_file         -> abort lang Control.fatal_err_mess "EOF" (* EOF *)
 | Not_found           -> let s = "You must choose a parsing option" in
                          abort lang "Unset button in form - " s
 | Control.Fatal s     -> abort lang Control.fatal_err_mess s (* anomaly *)
 | Control.Anomaly s   -> abort lang Control.anomaly_err_mess s
 | _                   -> abort lang Control.fatal_err_mess "Unexpected anomaly" 
 ]
 }
; 
value safe_engine () =
  (* Problem: in case of error, we lose the current language of the session *)
  let abor = abort default_language in
  try graph_engine () with  
  [ Failure s -> abor Control.fatal_err_mess s (* [parse_cpts phase_string] ? *)
  | _ -> abor Control.fatal_err_mess "Unexpected anomaly - broken session" 
  ]
;
659 660
end (* Interface *)
;
661
Interface.safe_engine () (* Should always produce a compliant HTML page *)
662 663
;