(**************************************************************************)
(*                                                                        *)
(*                     The Sanskrit Heritage Platform                     *)
(*                                                                        *)
(*                              Gérard Huet                               *)
(*                                                                        *)
(* ©2017 Institut National de Recherche en Informatique et en Automatique *)
(**************************************************************************)

(* CGI-bin callback for shallow syntax analysis *)
(* Parser is similar to Reader, but it is invoked from the green hearts
in the output of Reader, in order to give the semantic analysis of a
specific solution. It basically replays the reading process until that
specific solution is reached. *)
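
(* Editorial sketch of a typical invocation (hypothetical values):
   [parser_cgi ^ "?text=...;t=VH;st=t;us=f;cp=t;n=3"]
   where "n" is the index of the solution to replay; the further parameter
   "p", appended by [print_sols] below, selects the projections to display. *)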

(*i module Parser = struct i*)
 
open Encode;
open Canon;
open Html;
open Web; (* ps pl abort truncation etc. [remote_server_host] *)
open Cgi; (* get *)
open Checkpoints;
open Scl_parser; (* Interface with UoH dependency parser *)

module Prel = struct (* Parser's lexer prelude *)

(* prelude is executed by Lexer when initialisation of transducers fails *)
value prelude () = do
  { pl http_header
  ; page_begin parser_meta_title 
  ; pl (body_begin Chamois_back)
  ; if scl_toggle then (* external call SCL (experimental) *)
       pl (javascript (SCLpaths.scl_url ^ javascript_tooltip))
    else ()
  ; pl parser_title
  ; open_page_with_margin 15
  }
;
end (* Prel *)
;
value iterate = ref True (* by default we read a sentence (list of words) *)
and  complete = ref True (* by default we call the fuller segmenter *)
and output_channel = ref stdout (* by default cgi output *)
;

module Lexer_control = struct
 value star = iterate;
 value full = complete;
 value out_chan = output_channel;
end (* [Lexer_control] *)
;
module Lex = Lexer.Lexer Prel Lexer_control 
(* [print_proj print_segment_roles print_ext_segment extract_lemma] *)
;
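(* Editorial note: [Lexer.Lexer] is a functor; applying it to [Prel] and
   [Lexer_control] yields the lexer instance whose exported printers, listed
   in the comment above, are used throughout this module. *)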
value rpc = remote_server_host 
and remote = ref False (* local invocation of cgi by default *) 
;
open Skt_morph;
open Inflected;
open Constraints; (* [roles_of sort_flatten extract] *)
open Paraphrase; (* [display_penalties print_sem print_role] *)

value query = ref "" (* ugly - stores the query string *)
;
value set_query q = query.val := q (* [Parser.parser_engine] *)
;
(* Duplicated from Rank *)
value make_groups tagger = comp_rec 1 []  
  where rec comp_rec seg stack = fun (* going forward in time *)
  [ [] -> stack (* result goes backward in time *)
  | [ (phase,rword,_) :: rest ] -> (* we ignore euphony transition *)
      let word = Word.mirror rword (* segment is mirror word *) in 
      let keep = let tags = tagger phase word in 
                 [ roles_of seg word tags :: stack ] in
      comp_rec (seg+1) keep rest
  ] 
;
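(* Illustrative sketch (hypothetical data): for a two-segment output,
   [make_groups tagger [ (ph1,rw1,e1); (ph2,rw2,e2) ]] computes
   [ roles_of 2 w2 tags2; roles_of 1 w1 tags1 ]
   where [wi = Word.mirror rwi] and [tagsi = tagger phi wi] -
   segments are numbered forward in time, while the result stacks backward. *)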
value print_sols sol = 
  let xmlify_call sol = (* sol in reverse order *) 
    let projections = List.fold_left extract "" sol in
    let invoke = parser_cgi ^ "?" ^ query.val ^ ";p=" ^ projections in
    anchor Green_ invoke heart_sign in do
  { ps html_break
  ; List.iter print_role (List.rev sol)
  ; ps (xmlify_call sol)
  ; ps html_break
  }
;
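(* Editorial note: these anchors are the green hearts mentioned in the
   header: each one re-invokes [parser_cgi] with the original query plus
   ";p=" followed by the projections encoding that particular solution. *)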
value monitoring = True (* We show the penalty vector explicitly by default *)
;
value display_penalty p = "Penalty " ^ 
   if monitoring then Constraints.show_penalty p
   else string_of_int (Constraints.eval_penalty p)
;
value print_bucket (p,b_p) = do 
  { ps html_break 
  ; ps (html_green (display_penalty p))
  ; ps html_break
  ; List.iter print_sols b_p
  } 
;
value analyse query output = 
  let tagger = Lex.extract_lemma in 
  let groups = make_groups tagger output in
  let sorted_groups = sort_flatten groups in 
  let (top_groups, threshold) = truncate_groups sorted_groups in do
  { pl (xml_empty "p")
  ; let find_len = fun
      [ [ (_,[ a :: _ ]) :: _ ] -> List.length a
      | _ -> 0
      ] in
    pl (xml_empty_with_att "input" 
           [ ("type","submit"); ("value","Submit"); 
             ("onclick","unique('" ^ parser_cgi ^ "?" ^ query 
             ^ ";p=','" ^ string_of_int (find_len top_groups) ^ "')" )
           ] ^ html_break)
  ; pl (xml_empty "p") 
  ; if scl_toggle then (* Call SCL parser *)
       let segments = List.map (fun (ph,w,_) -> (ph,w)) output in
       Scl_parser.print_scl [ List.rev segments ] 
       else () 
  (*i DEBUG ; Sys.command "ls -l > /tmp/SKT_TEMP/junk" i*)
  ; List.iter print_bucket top_groups  
  ; match threshold with      
    [ None -> ()
    | Some p -> do
       { ps html_break 
       ; ps (html_red ("Truncated penalty " ^ string_of_int p ^ " or more")) 
       ; ps html_break 
       }
    ]
  }
;
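(* Editorial note: [analyse] tags every segment, sorts the groups by
   increasing penalty ([sort_flatten]), keeps the best buckets
   ([truncate_groups]) and prints one bucket per penalty value; the Submit
   button hands the parser URL prefix and the segment count of the first
   solution to the javascript function [unique], defined in the enclosing
   page scripts rather than in this file. *)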
value print_sems word morphs = do  
  { ps (span_begin Latin12)
  ; ps "{ "
  ; let bar () = ps " | " 
    and sem = Canon.decode word in 
    List2.process_list_sep (print_sem sem) bar morphs
  ; ps " }" 
  ; ps span_end 
  }
;
value print_out seg_num segment = do 
  (* Unlike Reader, we discard phonetic information. *)
  { ps tr_begin
  ; Lex.print_segment_roles print_sems seg_num segment 
  ; ps tr_end 
  ; seg_num+1
  }
;
value rec print_project proj = fun
    [ [] -> match proj with 
            [ [] -> () (* finished, projections exhausted *)
            | _ -> failwith "Too many projections"
            ]
    | [ (phase,rword,_) :: rest ] -> (* sandhi ignored *)
      let new_proj = Lex.print_proj phase rword proj in
      print_project new_proj rest 
    ]
;
exception Truncation (* raised if more solutions than [Web.truncation] *)
;
(* Replay Reader until the solution index - a quick and dirty way to recreate it. *)
(* Follows the infamous exponential [Rank.dove_tail]. *)
value dove_tail_until sol_index init = 
  let init_stack = List.map (fun (_,s) -> s) init (* erasing constraints *) in
  dtrec 1 (0,[],[]) init_stack
  where rec dtrec n kept stack = (* invariant: |stack|=|init|=number of chunks *)
  if n = Web.truncation then raise Truncation
  else if n = sol_index then (* return total output *)
          List.fold_right conc stack [] 
          where conc (o,_) oo = o @ oo 
  else dtrec (n+1) kept (crank [] init stack)
         where rec crank acc ini = fun
         [ [ (_,c) :: cc ] -> match ini with
            [ [ (constraints,i) :: ii ] -> do
              { Lex.Viccheda.set_offset constraints
              ; match Lex.Viccheda.continue c with
                [ Some next -> List2.unstack acc [ next :: cc ]
                | None -> crank [ i :: acc ] ii cc
                ]
              }
            | _ -> raise (Control.Anomaly "dove_tail_until")
            ]
         | [] -> raise Truncation
         ]
;
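(* Editorial sketch of the mechanics (added note): the stack keeps one
   (output,continuation) pair per chunk. At each step [crank] asks the first
   chunk on the stack for its next segmentation; if it is exhausted, it is
   reset to its initial state and the next chunk is advanced instead - an
   odometer enumerating the cartesian product of per-chunk solutions, whence
   the exponential behaviour noted above. When the counter reaches
   [sol_index], the per-chunk outputs are concatenated and returned. *)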
(* From Interface: splitting checkpoints into current and future ones *)
value split_check limit = split_rec []
   where rec split_rec acc checkpts = match checkpts with
      [ [] -> (List.rev acc,[])
      | [ ((index,_,_) as check) :: rest ] -> 
          if index > limit then (List.rev acc,checkpts)
          else split_rec [ check :: acc ] rest 
      ]
;
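(* Illustrative example (hypothetical data):
   [split_check 5 [ (3,w,b); (7,w',b') ]] yields ([ (3,w,b) ], [ (7,w',b') ]) -
   checkpoints falling within the current chunk are kept local, later ones
   are postponed to subsequent chunks. *)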
value segment_until sol_index chunks cpts = 
   let (_,constrained_segs) = List.fold_left init ((0,cpts),[]) chunks
   where init ((offset,checkpoints),stack) chunk = do
   { let ini_cont = Lex.Viccheda.init_segment chunk in 
     let chunk_length = Word.length chunk in
     let extremity = offset+chunk_length in 
     let (local,future) = split_check extremity checkpoints in
     let chunk_constraints = (offset,local) in
     ((succ extremity,future), do 
        { Lex.Viccheda.set_offset chunk_constraints (* Sets local constraints *)
        ; let res = match Lex.Viccheda.continue ini_cont with
              [ Some c -> c 
              | None -> Lex.un_analyzable chunk
              ] in 
          [ (chunk_constraints,res) :: stack ]
        }) 
   } in
   dove_tail_until sol_index constrained_segs 
;
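(* Editorial note (an assumption drawn from the offset arithmetic above):
   offsets advance by chunk length plus one ([succ extremity]), accounting
   for the blank separating consecutive chunks, so checkpoint indices count
   positions in the full input, spaces included. *)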
value stamp = 
  "Heritage" ^ " " ^ Date.version
;
value print_validate_button query = 
  let cgi = parser_cgi ^ "?" ^ query ^ ";validate=t" in
  let invocation = if remote.val then rpc ^ cgi else cgi in
  anchor Green_ invocation check_sign
;
(* Follows [Reader.process_input] *)
value process_until sol_index query topic mode_sent translit sentence 
                    cpts us encode proj sol_num do_validate = 
  let pieces = Sanskrit.read_raw_sanskrit encode sentence in
  let romapieces = List.map Canon.uniromcode pieces in
  let romasentence = String.concat " " romapieces in
  let chunker = if us then Sanskrit.read_raw_sanskrit 
                      else Sanskrit.read_sanskrit in
  let chunks = chunker encode sentence in 
  let devachunks = List.map Canon.unidevcode chunks in
  let devasentence = String.concat " " devachunks in do
  { pl html_break
  ; let lex_stamp = "Lexicon: " ^ stamp in 
    ps (html_green lex_stamp) (* in order to keep relation corpus/lexicon *)
  ; pl html_break
  ; pl hr
  ; pl html_break
  ; ps (roma16_red_sl romasentence) (* romanisation *)
  ; pl html_break
  ; ps (deva16_blue devasentence) (* devanagari *)
  ; pl html_break
  ; let all_chunks = match topic with
       [ Some topic -> chunks @ [ code_string topic ]
       | None -> chunks
       ] in
    try let output = segment_until sol_index all_chunks cpts in 
        let solution = List.rev output in do
        { pl html_break
        ; pl (xml_begin_with_att "table" [ noborder; padding10; spacing5 ])
        ; match proj with 
          [ None -> let _ = List.fold_left print_out 1 solution in ()
          | Some triples -> do
            { print_project triples solution
 (*i Validate action: to be restored
    If the cgi is called with [do_validate], we record the tagging in a file;
    otherwise, we print a callback with [do_validate]. 
 -- TODO [; if Paths.platform = "Station" then 
                match do_validate with
                [ "f" -> ps (td_wrap (print_validate_button query 
                             ^ html_green "Validate")) 
                | _ -> Lex.record_tagging us mode_sent translit sol_num
                                          sentence solution triples
                ]
              else ()] i*)
            }
          ]
        ; ps table_end 
        ; match proj with 
          [ None -> analyse query solution 
          | Some _ -> ()
          ]
        }
    with [ Truncation -> pl (html_red "Solution not found" ^ html_break) ]
  }
;

value sort_check cpts = 
  let compare_index (a,_,_) (b,_,_) = compare a b in 
  List.sort compare_index cpts
;
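(* Illustrative example (hypothetical data):
   [sort_check [ (5,a,b); (2,c,d) ]] evaluates to [ (2,c,d); (5,a,b) ],
   ordering checkpoints by increasing position index. *)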

value parser_engine () = do
(* Replays Reader until given solution - dumb but reliable *)
  { Prel.prelude ()
  ; let query = Sys.getenv "QUERY_STRING" in 
    let alist = create_env query in 
    let url_encoded_input = get "text" alist "" 
    and url_encoded_sol_index = get "n" alist "1"
    and url_encoded_topic = get "topic" alist "" 
    and st = get "st" alist "t" 
    and cp = get "cp" alist "t"
    and us = get "us" alist "f"
    and translit = get "t" alist Paths.default_transliteration 
    and lex = get "lex" alist Paths.default_lexicon
    and abs = get "abs" alist "f" (* default local paths *) in
    let lang = language_of lex
    and input = decode_url url_encoded_input (* unnormalized string *)
    and uns = us="t" (* unsandhied vs sandhied corpus *)
    and mode_sent = st="t" (* default sentence mode *)
    and encode = Encode.switch_code translit (* encoding as a normalized word *)
    and () = toggle_lexicon lex
    and () = if abs="t" then remote.val:=True else () (* Web service mode *)
    and () = if st="f" then iterate.val:=False else () (* word stemmer *)
    and () = if cp="f" then complete.val:=False else () (* simplified reader *)
    and sol_index = int_of_string (decode_url url_encoded_sol_index) 
    (* For Validate mode, register number of solutions *)
    and sol_num = int_of_string (get "allSol" alist "0")
    (* Only register this solution if validate is true *)
    and do_validate = get "validate" alist "f" 
    (* Contextual information from past discourse *)
    and topic_mark = decode_url url_encoded_topic in
    let topic = match topic_mark with
        [ "m" -> Some "sa.h"
        | "f" -> Some "saa"
        | "n" -> Some "tat"
        | _ -> None
        ] in
 (* Corpus interaction disabled 
   (* File in which to store the taggings locally - only for [Station] platform *)
   [let corpus_file = (* optionally transmitted by argument "out_file" *)
        try let file_name = List.assoc "out_file" alist (* do not use get *) in 
            Some file_name  
        with [ Not_found -> Some regression_file_name ] in] *)
(* Regression disabled
   [let () = if Paths.platform = "Station" then match corpus_file with 
                [ Some file_name -> 
                   let regression_file = var_dir ^ file_name ^ ".txt" in 
                   output_channel.val := open_out_gen 
                     [ Open_wronly; Open_append; Open_creat; Open_text ] 
                     0o777 regression_file
                | None -> ()
                ]
             else () in] *) 
    let proj = (* checks for parsing mode *)
        try let url_encoded_proj = List.assoc "p" alist in (* do not use get *) 
            Some (parse_proj (decode_url url_encoded_proj))
        with [ Not_found -> do 
                 { set_query query (* query regurgitated - horror *)
                 ; None
                 } 
             ] 
    and checkpoints = (* checkpoints for graph *)
       try let url_encoded_cpts = List.assoc "cpts" alist in (* do not use get *)
           parse_cpts (decode_url url_encoded_cpts)
       with [ Not_found -> [] ] in     
    let cpts = sort_check checkpoints in 
    try do
      { process_until sol_index query topic mode_sent translit input 
                      cpts uns encode proj sol_num do_validate
      ; close_page_with_margin ()
      ; let bandeau = not (Gen.active proj) in
        page_end lang bandeau
      }
    with [ Stream.Error _ -> abort lang "Illegal transliteration " input ]
 }
; 
value safe_engine () =
  let abor = abort default_language in
  try parser_engine () with 
  [ Sys_error s           -> abor Control.sys_err_mess s (* file pb *)
  | Stream.Error s        -> abor Control.stream_err_mess s (* file pb *)
  | Encode.In_error s     -> abor "Wrong input " s
  | Exit (* Sanskrit *)   -> abor "Wrong character in input - " "use ASCII" 
  | Invalid_argument s    -> abor Control.fatal_err_mess s (* sub *)
  | Failure s             -> abor Control.fatal_err_mess s (* anomaly *)
  | End_of_file           -> abor Control.fatal_err_mess "EOF" (* EOF *)
  | Not_found (* assoc *) -> let s = "You must choose a parsing option" in
                             abor "Unset button in form - " s
  | Control.Fatal s       -> abor Control.fatal_err_mess s (* anomaly *)
  | Control.Anomaly s     -> abor Control.fatal_err_mess ("Anomaly: " ^ s)
  | _                     -> abor Control.fatal_err_mess "Unexpected anomaly" 
  ] 
;
safe_engine () (* Should always produce a valid HTML page *)
;

(*i end; i*)