Commit a82e07fc authored by POTTIER Francois's avatar POTTIER Francois
Browse files

Introduce SentenceGenerator.

parent 578074cc
(******************************************************************************)
(* *)
(* Menhir *)
(* *)
(* François Pottier, Inria Paris *)
(* Yann Régis-Gianas, PPS, Université Paris Diderot *)
(* *)
(* Copyright Inria. All rights reserved. This file is distributed under the *)
(* terms of the GNU General Public License version 2, as described in the *)
(* file LICENSE. *)
(* *)
(******************************************************************************)
open Grammar
open Cst
open Feat.IFSeq (* empty, ++, map, bigsum *)
(* -------------------------------------------------------------------------- *)
(* [unwrap] maps a singleton list to its unique element. This coercion is
required by the artificial wrapping that we use below. *)
let unwrap xs =
match xs with
| [x] -> x
| _ -> assert false
(* Uncurried list cons. *)
let cons (x, xs) =
unwrap x :: xs
(* [up i j] is the list of the integers of [i] included up to [j] included. *)
let rec up i j =
if i <= j then i :: up (i + 1) j else []
(* -------------------------------------------------------------------------- *)
(* [is_terminal_suffix rhs i] determines whether the suffix of [rhs] that
begins at offset [i] consists purely of terminal symbols. *)
let rec is_terminal_suffix rhs i =
i = Array.length rhs ||
Symbol.is_terminal rhs.(i) && is_terminal_suffix rhs (i + 1)
(* -------------------------------------------------------------------------- *)
(* We wish to define a function of a nonterminal symbol [nt] and a budget
[budget] to a sequence of all of the sentences that are generated by the
symbol [nt] and have length [budget]. *)
(* In order to be able to give a recursive definition, we must generalize this
function to also be able to enumerate the sentences that are generated by a
production [prod] or by a suffix [i] of a production [prod]. Thus, we have
three possibles queries: a nonterminal symbol [nt]; a production [prod]; a
suffix of a production [prod] determined by an index [i]. *)
type query =
| QNT of Nonterminal.t
| QProd of Production.index
| QProdSuffix of Production.index * int
(* Furthermore, in order to avoid the use of list concatenation, we prefer to
enumerate concrete syntax trees first, and later on compute their fringe. *)
(* Finally, in order to avoid repeated computation, we must memoize the
function that maps a query and a budget to a sequence of concrete syntax
trees. This causes a slight difficulty: ideally, the response to the
queries [QNT _] and [QProd _] should have type [cst seq], while the
response to the queries [QProdSuffix _] should have type [cst list seq].
However, our memoization combinator does not support dependently-typed
functions (which would require a heterogenous memoization table). We
work around the problem by artifically wrapping a tree as a singleton
list in the first two cases. *)
(* Our analysis of the minimal length of a sentence helps us speed up this
code by pruning insufficient budgets early. *)
(* Here is the memoized function. *)
(* The assertions in this code are evaluated at construction time, not at
iteration/sampling time, so their cost should not be a problem. *)
(* The construction phase could perhaps be sped up by using a lazy variant
of the product combinator [**]. The iteration/sampling phase would not
be sped up by this optimization, though. *)
(* Provided the grammar does not contain a loop, this recursive definition
should be well-founded, I believe. *)
let trees : query -> int -> cst list seq =
(* We exploit the fact that [query] supports hashing and equality. *)
let module M = Fix.Memoize.ForType(struct type t = query * int end) in
M.(curried fix) begin fun trees query budget ->
match query with
(* Query: what are the trees generated by the symbol [nt]? *)
| QNT nt ->
if budget < Analysis.minimal nt then
empty
else
(* A sum over all productions associated with [nt]. Because we
do not wish to generate sentences that contain the [error]
pseudo-token, any production that contains this token is
ignored. *)
Production.foldnt nt (fun prod accu ->
if Production.error_free prod
&& Analysis.minimal_prod prod 0 <= budget then
accu ++ trees (QProd prod) budget
else
accu
) empty
(* Query: what are the trees generated by production [prod]? *)
| QProd prod ->
assert (Analysis.minimal_prod prod 0 <= budget);
(* Perform a subquery for the suffix of [prod] that begins at index 0.
This yields a sequence of lists of concrete syntax trees. We wrap
each of these lists with [CstNonTerminal (prod, _)]. This yields a
sequence of concrete syntax trees. There remains to artificially
wrap each of these trees in a singleton list, for the reason that
was explained above. *)
trees (QProdSuffix (prod, 0)) budget
|> map (fun csts -> [CstNonTerminal (prod, Array.of_list csts)])
(* Query: what are the lists of trees generated by suffix [i] of
production [prod]? *)
| QProdSuffix (prod, i) ->
assert (Analysis.minimal_prod prod i <= budget);
let n = Production.length prod in
assert (0 <= i && i <= n);
if i = n then begin
(* The suffix that we are considering is empty. Only one sentence
can be generated by this suffix, namely the empty sentence. If
the budget is precisely zero, fine: we generate just this
sentence. Otherwise, the budget is too large and cannot be spent:
we generate nothing. *)
if budget = 0 then singleton [] else empty
end
else begin
let rhs = Production.rhs prod in
match rhs.(i) with
| Symbol.T tok ->
(* A terminal symbol offers no choice. We must spend one unit of
budget and cons this symbol in front of a sentence that is
generated by the rest of the right-hand side. *)
assert (1 <= budget);
trees (QProdSuffix (prod, i + 1)) (budget - 1)
|> map (List.cons (CstTerminal tok))
| Symbol.N nt ->
(* A nonterminal symbol [nt] offers a choice: we must split the
budget between this symbol and the rest of the right-hand
side. If this is the last nonterminal symbol in the
right-hand side, then the choice is trivial: [budget2] is
fixed to [n - (i + 1)], all of the budget minus [budget2]
must go the symbol [nt]. Otherwise, the amount of budget that
goes to [nt] can range between its minimal requirement and a
maximum value that ensures that the rest of the right-hand
side receives its minimum requirement. *)
if is_terminal_suffix rhs i then begin
let budget2 = n - (i + 1) in
assert (budget2 = Analysis.minimal_prod prod (i + 1));
let budget1 = budget - budget2 in
map cons (
trees (QNT nt) budget1 **
trees (QProdSuffix (prod, i + 1)) budget2
)
end
else begin
let minimum = Analysis.minimal nt in
assert (minimum <= budget);
let maximum = budget - Analysis.minimal_prod prod (i + 1) in
assert (0 <= maximum);
bigsum (
up minimum maximum |> List.map (fun budget1 ->
let budget2 = budget - budget1 in
map cons (
trees (QNT nt) budget1 **
trees (QProdSuffix (prod, i + 1)) budget2
)
)
)
end
end
end
(* -------------------------------------------------------------------------- *)
(* In the end, only the [QNT] entry point is exposed. *)
(* Artificial unwrapping is again performed so as to obtain an enumeration of
trees. *)
(* Composing with [fringe] yields an enumeration of sentences (that is, lists
of terminal symbols). It is worth noting that if the grammar is ambiguous
then two distinct trees may have the same fringe. Therefore, some sentences
appear several times in this enumeration, and if this enumeration is used
to do random sampling, then the distribution is not uniform. *)
let sentences nt budget : Terminal.t list seq =
trees (QNT nt) budget
|> map (fun t -> t |> unwrap |> fringe)
(******************************************************************************)
(* *)
(* Menhir *)
(* *)
(* François Pottier, Inria Paris *)
(* Yann Régis-Gianas, PPS, Université Paris Diderot *)
(* *)
(* Copyright Inria. All rights reserved. This file is distributed under the *)
(* terms of the GNU General Public License version 2, as described in the *)
(* file LICENSE. *)
(* *)
(******************************************************************************)
open Grammar
open Feat.Enum
(* [sentences nt] is an enumeration of all of the sentences generated by the
nonterminal symbol [nt]. It can be used, for instance, to enumerate all
sentences of a certain length, or to randomly generate sentences of a
certain length. (See Feat.Enum.) If the grammar is ambiguous, then some
sentences can appear several times in this enumeration. *)
(* These sentences are valid according to the grammar, but are not necessarily
accepted by the LR(1) automaton, because the construction of the automaton
takes precedence declarations into account, which (strictly speaking) are
not part of the grammar. *)
val sentences: Nonterminal.t -> Terminal.t list enum
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment