parameterizedGrammar.ml 18.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
open Positions
open Syntax
open UnparameterizedSyntax
open InternalSyntax
open Misc

(* Inference for non terminals. *)

(* Unification variables convey [variable_info] to describe
   the multi-equation they take part of. *)
type variable_info = 
    {
      mutable structure : nt_type option;
      mutable name      : string option;
      mutable mark      : Mark.t
    }

(* [UnionFind] is used to improve the union and the equality test
   between multi-equations. *)
and variable = variable_info UnionFind.point

(* Types are simple types. 
   [star] denotes the type of ground symbol (non terminal or terminal).
   [Arrow] describes the type of a parameterized non terminal. *)
and nt_type = 
    Arrow of variable list 

let star =
  Arrow []

(* [var_name] is a name generator for unification variables. *)
let var_name =
  let name_counter = ref (-1) in
  let next_name () = 
    incr name_counter;
    String.make 1 (char_of_int (97 + !name_counter mod 26))
    ^ let d = !name_counter / 26 in if d = 0 then "" else string_of_int d
  in
    fun v -> 
      let repr = UnionFind.find v in 
	match repr.name with
	    None -> let name = next_name () in repr.name <- Some name; name
	  | Some x -> x

(* [string_of_nt_type] is a simple pretty printer for types (they can be 
   recursive). *)

(* 2011/04/05: types can no longer be recursive, but I won't touch the printer -fpottier *)

let string_of paren_fun ?paren ?colors t : string = 
  let colors = 
    match colors with 
	None    -> (Mark.fresh (), Mark.fresh ()) 
      | Some cs -> cs 
  in
  let s, p = paren_fun colors t in
    if paren <> None && p = true then 
      "("^ s ^")"
    else s

61 62
let rec paren_nt_type colors = function
  (* [colors] is a pair [white, black] *)
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
    
    Arrow [] -> 
      "*", false

  | Arrow ins ->
      let args = separated_list_to_string 
	(string_of paren_var ~paren:true ~colors) ", " ins 
      in
      let args = 
	if List.length ins > 1 then
	  "("^ args ^ ")"
	else 
	  args
      in
	args^" -> *", true
	
and paren_var (white, black) x = 
  let descr = UnionFind.find x in
    if Mark.same descr.mark white then begin
      descr.mark <- black;
      var_name x, false
    end 
    else begin 
      descr.mark <- white;
      let s, p = match descr.structure with
	  None -> var_name x, false
	| Some t -> paren_nt_type (white, black) t
      in
	if Mark.same descr.mark black then
	  (var_name x ^ " = " ^ s, true)
	else 
	  (s, p)
    end

97 98
let string_of_nt_type ?colors t = 
  (* TEMPORARY note: always called without a [colors] argument! *)
99 100
  string_of ?colors paren_nt_type t

101 102
let string_of_var ?colors v = 
  (* TEMPORARY note: always called without a [colors] argument! *)
103 104
  string_of ?colors paren_var v

105 106
(* for debugging:

107 108 109 110 111
(* [print_env env] returns a string description of the typing environment. *)
let print_env = 
  List.iter (fun (k, (_, v)) -> 
	       Printf.eprintf "%s: %s\n" k (string_of_var v))

112 113
*)

114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
(* [occurs_check x y] checks that [x] does not occur within [y]. *)

let dfs action x =

  let black = Mark.fresh () in

  let rec visit_var x =
    let descr = UnionFind.find x in 
    if not (Mark.same descr.mark black) then begin
      descr.mark <- black;
      action x;
      match descr.structure with
      | None ->
	  ()
      | Some t -> 
	  visit_term t
    end

  and visit_term (Arrow ins) =
    List.iter visit_var ins

  in
  visit_var x

exception OccursError of variable * variable

let occurs_check x y =
  dfs (fun z -> if UnionFind.equivalent x z then raise (OccursError (x, y))) y

(* First order unification. *)

(* 2011/04/05: perform an eager occurs check and prevent the construction
   of any cycles. *)

let fresh_flexible_variable () = 
  UnionFind.fresh { structure = None; name = None; mark = Mark.none }

let fresh_structured_variable t = 
  UnionFind.fresh { structure = Some t; name = None; mark = Mark.none }

let star_variable =
  fresh_structured_variable star

exception UnificationError of nt_type * nt_type
exception BadArityError of int * int

let rec unify_var toplevel x y =
  if not (UnionFind.equivalent x y) then
    let reprx, repry = UnionFind.find x, UnionFind.find y in
      match reprx.structure, repry.structure with
164 165
	  None, Some _    -> occurs_check x y; UnionFind.union x y
	| Some _, None    -> occurs_check y x; UnionFind.union y x
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263
	| None, None      -> UnionFind.union x y
	| Some t, Some t' -> unify toplevel t t'; UnionFind.union x y
	    
and unify toplevel t1 t2 = 
  match t1, t2 with

    | Arrow ins, Arrow ins' ->
	let n1, n2 = List.length ins, List.length ins' in
	if n1 <> n2 then
	  if n1 = 0 || n2 = 0 || not toplevel then
	    raise (UnificationError (t1, t2))
	  else
	    (* the flag [toplevel] is used only here and influences which
	       exception is raised; BadArityError is raised only at toplevel *)
	    raise (BadArityError (n1, n2));
	List.iter2 (unify_var false) ins ins'

let unify_var x y =
  unify_var true x y

(* Typing environment. *)
type environment =
    (string * (Positions.t list * variable)) list

(* [lookup x env] returns the type related to [x] in the typing environment
   [env]. 
   By convention, identifiers that are not in [env] are terminals. They are
   given the type [Star]. *)
let lookup x (env: environment) = 
  try 
    snd (List.assoc x env)
  with Not_found -> star_variable

(* This function checks that the symbol [k] has the type [expected_type]. *)
let check positions env k expected_type =
  let inference_var = lookup k env in
  let checking_var = fresh_structured_variable expected_type in
    try
      unify_var inference_var checking_var
    with 
	UnificationError (t1, t2) ->
	  Error.error
	    positions
	    (Printf.sprintf 
	       "How is this symbol parameterized?\n\
	      It is used at sorts %s and %s.\n\
              The sort %s is not compatible with the sort %s."
	       (string_of_var inference_var) (string_of_var checking_var)
	       (string_of_nt_type t1) (string_of_nt_type t2))
	    
      | BadArityError (n1, n2) ->
	  Error.error
	    positions
	    (Printf.sprintf
	       "does this symbol expect %d or %d arguments?" 
	       (min n1 n2) (max n1 n2))

      | OccursError (x, y) ->
	  Error.error
	    positions
	    (Printf.sprintf 
	       "How is this symbol parameterized?\n\
	      It is used at sorts %s and %s.\n\
              The sort %s cannot be unified with the sort %s."
	       (string_of_var inference_var) (string_of_var checking_var)
	       (string_of_var x) (string_of_var y))
	  


(* An identifier can be used either in a total application or as a
   higher-order non terminal (no partial application is allowed). *)
let rec parameter_type env = function
  | ParameterVar x ->
      lookup x.value env

  | ParameterApp (x, args) ->
      assert (args <> []);
      let expected_type =
	(* [x] is applied, it must be to the exact number 
	   of arguments. *)
	Arrow (List.map (parameter_type env) args) 
      in
	(* Check the well-formedness of the application. *)
	check [x.position] env x.value expected_type;

	(* Similarly, if it was a total application the result is 
	   [Star] otherwise it is the flexible variable. *)
	star_variable

let check_grammar p_grammar = 
  (* [n] is the grammar size. *)
  let n        = StringMap.cardinal p_grammar.p_rules in

  (* The successors of the non terminal [N] are its producers. It 
     induce a graph over the non terminals and its successor function
     is implemented by [successors]. Non terminals are indexed using
     [nt].
  *) 
264
  let nt, conv, _iconv = index_map p_grammar.p_rules in
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
  let parameters, name, branches, positions = 
    (fun n -> (nt n).pr_parameters), (fun n -> (nt n).pr_nt),
    (fun n -> (nt n).pr_branches), (fun n -> (nt n).pr_positions)
  in
    
  (* The successors function is implemented as an array using the 
     indexing previously created. *)
  let successors = 
    Array.init n (fun node -> 
      (* We only are interested by parameterized non terminals. *)
      if parameters node <> [] then
	List.fold_left (fun succs { pr_producers = symbols } ->
	  List.fold_left (fun succs -> function (_, p) -> 
	    let symbol, _ = Parameters.unapp p in
	    try 
	      let symbol_node = conv symbol.value in
		(* [symbol] is a parameterized non terminal, we add it 
		   to the successors set. *)
		if parameters symbol_node <> [] then
		  IntSet.add symbol_node succs
		else 
		  succs
	    with Not_found -> 
	      (* [symbol] is a token, it is not interesting for type inference
		 purpose. *)
	      succs
	  ) succs symbols
        ) IntSet.empty (branches node)
      else
	Misc.IntSet.empty
    )
  in

  (* The successors function and the indexing induce the following graph 
     module. *)
  let module RulesGraph = 
      struct

	type node = int

	let n = n

	let index node = 
	  node

	let successors f node = 
	  IntSet.iter f successors.(node)

	let iter f = 
	  for i = 0 to n - 1 do 
	    f i
	  done

      end
  in
  let module ConnectedComponents = Tarjan.Run (RulesGraph) in
    (* We check that:
       - all the parameterized definitions of a particular component
       have the same number of parameters.
       - every parameterized non terminal definition always uses
       parameterized definitions of the same component with its
       formal parameters.
    
       Components are marked during the traversal:
       -1 means unvisited
       n with n > 0 is the number of parameters of the clique.
    *)
  let unseen = -1 in
333
  let marked_components = Array.make n unseen in
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
    
  let flexible_arrow args =
    let ty = Arrow (List.map (fun _ -> fresh_flexible_variable ()) args) in
      fresh_structured_variable ty 
  in

  (* [nt_type i] is the type of the i-th non terminal. *)
  let nt_type i =
    match parameters i with
      | [] -> 
	  star_variable
	    
      | x -> 
	  flexible_arrow x
  in

  (* [actual_parameters_as_formal] is the well-formedness checker for 
     parameterized non terminal application. *)
  let actual_parameters_as_formal actual_parameters formal_parameters = 
    List.for_all2 (fun y -> (function ParameterVar x -> x.value = y 
			      | _ -> false)) 
      formal_parameters actual_parameters
  in

  (* The environment is initialized. *)
  let env : environment = StringMap.fold  
    (fun k r acu -> 
       (k, (r.pr_positions, nt_type (conv k))) 
       :: acu)
    p_grammar.p_rules []
  in

    (* We traverse the graph checking each parameterized non terminal
       definition is well-formed. *)
    RulesGraph.iter 
      (fun i ->
	 let params    = parameters i 
	 and iname     = name i 
	 and repr      = ConnectedComponents.representative i 
	 and positions = positions i
	 in

	 (* The environment is augmented with the parameters whose types are
	    unknown. *)
	 let env' = List.map 
	   (fun k -> (k, (positions, fresh_flexible_variable ()))) params
	 in
	 let env = env' @ env in
	   
	 (* The type of the parameterized non terminal is constrained to be
	    [expected_ty]. *)
	 let check_type () = 
	   check positions env iname (Arrow (List.map (fun (_, (_, t)) -> t) env'))
	 in

	 (* We check the number of parameters. *)
	 let check_parameters () = 
	   let parameters_len = List.length params in
	     (* The component is visited for the first time. *)
	     if marked_components.(repr) = unseen then
	       marked_components.(repr) <- parameters_len
	     else (* Otherwise, we check that the arity is homogeneous 
		     in the component. *) 
	       if marked_components.(repr) <> parameters_len then 
		 Error.error positions
		   (Printf.sprintf 
		      "Mutually recursive definitions must have the same parameters.\n\
                       This is not the case for %s and %s."
			 (name repr) iname)
	 in

	(* In each production rule, the parameterized non terminal
	   of the same component must be instantiated with the same
	   formal arguments. *)
	 let check_producers () =
	   List.iter 
	     (fun { pr_producers = symbols } -> List.iter 
		(function (_, p) ->
		   let symbol, actuals = Parameters.unapp p in
		   (* We take the use of each symbol into account. *)
		     check [ symbol.position ] env symbol.value 
		       (if actuals = [] then star else 
			  Arrow (List.map (parameter_type env) actuals));
		   (* If it is in the same component, check in addition that
		      the arguments are the formal arguments. *)
		   try 
		     let idx = conv symbol.value in 
		       if ConnectedComponents.representative idx = repr then
			 if not (actual_parameters_as_formal actuals params)
			 then
			   Error.error [ symbol.position ]
			     (Printf.sprintf
				"Mutually recursive definitions must have the same \
                                 parameters.\n\
				 This is not the case for %s."
				 (let name1, name2 = (name idx), (name i) in
				    if name1 <> name2 then name1 ^ " and "^ name2
				    else name1))
		   with _ -> ())
		    symbols) (branches i)
	 in
	   check_type ();
	   check_parameters ();
	   check_producers ())

      
let rec subst_parameter subst = function
  | ParameterVar x ->
      (try 
	List.assoc x.value subst 
      with Not_found ->
	ParameterVar x)

  | ParameterApp (x, ps) -> 
      (try 
	match List.assoc x.value subst with
	  | ParameterVar y ->
	      ParameterApp (y, List.map (subst_parameter subst) ps)

	  | ParameterApp _ ->
	      (* Type-checking ensures that we cannot do partial
		 application. Consequently, if an higher-order non terminal
		 is an actual argument, it cannot be the result of a 
		 partial application. *)
	      assert false

      with Not_found -> 
	  ParameterApp (x, List.map (subst_parameter subst) ps))

let subst_parameters subst = 
  List.map (subst_parameter subst)

466
(* TEMPORARY why unused?
467 468 469 470 471
let names_of_p_grammar p_grammar = 
  StringMap.fold (fun tok _ acu -> StringSet.add tok acu) 
    p_grammar.p_tokens StringSet.empty 
    $$ (StringMap.fold (fun nt _ acu -> StringSet.add nt acu)
	  p_grammar.p_rules)
472
*)
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552

let expand p_grammar = 
  (* Check that it is safe to expand this parameterized grammar. *)
  check_grammar p_grammar;

  (* Set up a mechanism that ensures that names are unique -- and, in
     fact, ensures the stronger condition that normalized names are
     unique. *)

  let names =
    ref (StringSet.empty) 
  in
  let ensure_fresh name =
    let normalized_name = Misc.normalize name in
    if StringSet.mem normalized_name !names then
      Error.error []
	(Printf.sprintf "internal name clash over %s" normalized_name);
    names := StringSet.add normalized_name !names;
    name
  in 
  let expanded_rules = 
    Hashtbl.create 13 
  in
  let module InstanceTable = 
    Hashtbl.Make (Parameters)
  in
  let rule_names = 
    InstanceTable.create 13 
  in

  (* [mangle p] chooses a name for the new nonterminal symbol that corresponds
     to the parameter [p]. *)

  let rec mangle = function 
    | ParameterVar x
    | ParameterApp (x, []) ->
	Positions.value x
    | ParameterApp (x, ps) ->

	(* We include parentheses and commas in the names that we
	   assign to expanded nonterminals, because that is more
	   readable and acceptable in many situations. We replace them
	   with underscores in situations where these characters are
	   not valid. *)

	Printf.sprintf "%s(%s)"
	  (Positions.value x)
	  (separated_list_to_string mangle "," ps)

  in
  let name_of symbol parameters = 
    let param = ParameterApp (symbol, parameters) in
    try 
      InstanceTable.find rule_names param
    with Not_found ->
      let name = ensure_fresh (mangle param) in
      InstanceTable.add rule_names param name;
      name
  in
  (* Given the substitution [subst] from parameters to non terminal, we
     instantiate the parameterized branch. *)
  let rec expand_branch subst pbranch = 
    let new_producers = List.map 
      (function (ido, p) ->
	 let sym, actual_parameters = 
	   Parameters.unapp p in
	 let sym, actual_parameters = 	      
	   try 
	     match List.assoc sym.value subst with
	       | ParameterVar x ->
		   x, subst_parameters subst actual_parameters

	       | ParameterApp (x, ps) ->
		   assert (actual_parameters = []);
		   x, ps
		       
	   with Not_found -> 
	     sym, subst_parameters subst actual_parameters
	 in
	   (* Instantiate the definition of the producer. *)
553
	   (expand_branches subst sym actual_parameters, Positions.value ido))
554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634
      pbranch.pr_producers
    in
      {
        branch_position          = pbranch.pr_branch_position;
	producers		 = new_producers;
	action			 = pbranch.pr_action;
	branch_shift_precedence  = pbranch.pr_branch_shift_precedence;
	branch_reduce_precedence = pbranch.pr_branch_reduce_precedence;
      }

  (* Instantiate the branches of sym for a particular set of actual
     parameters. *)
  and expand_branches subst sym actual_parameters =
    let nsym = name_of sym actual_parameters in
      try
	if not (Hashtbl.mem expanded_rules nsym) then begin
	  let prule = StringMap.find (Positions.value sym) p_grammar.p_rules in
	  let subst = 
	    (* Type checking ensures that parameterized non terminal 
	       instantiations are well defined. *)
	    assert (List.length prule.pr_parameters 
		    = List.length actual_parameters);
	    List.combine prule.pr_parameters actual_parameters @ subst in
	    Hashtbl.add expanded_rules nsym 
	      { branches = []; positions = []; inline_flag = false };
	  let rules = List.map (expand_branch subst) prule.pr_branches in
	    Hashtbl.replace expanded_rules nsym
	      { 
		branches    = rules; 
		positions   = prule.pr_positions; 
		inline_flag = prule.pr_inline_flag;
	      }
	end;
	nsym
      (* If [sym] is a terminal, then it is not in [p_grammar.p_rules]. 
	 Expansion is not needed. *)
      with Not_found -> Positions.value sym 
  in
  let rec types_from_list = function
    | [] -> StringMap.empty
    | (nt, ty)::q ->
        let accu = types_from_list q in
        let mangled = mangle nt in
        if StringMap.mem mangled accu then
          Error.error [Positions.position (Parameters.with_pos nt)]
            (Printf.sprintf
               "There are multiple %%type definitions for nonterminal %s."
               mangled);
        StringMap.add mangled (Positions.value ty) accu
  in

  let start_symbols = StringMap.domain (p_grammar.p_start_symbols) in
  {
    preludes      = p_grammar.p_preludes;
    postludes	  = p_grammar.p_postludes;
    parameters    = p_grammar.p_parameters;
    start_symbols = start_symbols;
    types         = types_from_list p_grammar.p_types;
    tokens	  = p_grammar.p_tokens;
    rules	  = 
      let closed_rules = StringMap.fold 
	(fun k prule rules -> 
	   (* If [k] is a start symbol then it cannot be parameterized. *)
	   if prule.pr_parameters <> [] && StringSet.mem k start_symbols then
	     Error.error []
	       (Printf.sprintf "The start symbol `%s' cannot be parameterized."
		  k);

	   (* Entry points are the closed non terminals. *)
	   if prule.pr_parameters = [] then 
	     StringMap.add k { 
	       branches    = List.map (expand_branch []) prule.pr_branches;
	       positions   = prule.pr_positions;
	       inline_flag = prule.pr_inline_flag;
	     } rules
	   else rules)
	p_grammar.p_rules
	StringMap.empty
      in
	Hashtbl.fold StringMap.add expanded_rules closed_rules 
  }