Commit 3f453c5c authored by POTTIER Francois

More demos. Switch to PNG output. Finish the blog post.

parent f3bfeab2
......@@ -10,6 +10,20 @@ let uniq = ListAux.uniq
(* -------------------------------------------------------------------------- *)
(* During the DFA construction, if the DFA is used in first-match mode, where
one stops as soon as an accepting state is encountered, then a final state
need not have any outgoing transitions; that would be pointless. However,
if the DFA is used in all-matches mode, then, when a final state is
reached, one records that an accepting state was reached and continues
reading the input. *)
(* The following global flag controls this. Quick and dirty. Not pretty. *)
(* Mutable global switch, initially [true]. It is dereferenced by
   [may_have_successors], which yields [true] for every expression when the
   switch is set and only for non-nullable expressions otherwise. The demo
   driver assigns it before each call to [dfa]. *)
let accepting_state_can_have_successors =
ref true
(* -------------------------------------------------------------------------- *)
(* This module is parameterized over an alphabet. *)
module Make (Char : sig
......@@ -392,17 +406,8 @@ type dfa = {
expressions. Then, we construct a DFA whose states are the reachable
expressions and whose transitions correspond to derivation. *)
(* If one wishes to stop as soon as an accepting state is encountered, then a
final state need not have any outgoing transitions; that would be
pointless. However, in some applications, one might wish to record that an
accepting state was reached and nevertheless continue reading the input. In
that case, the following flag should be set to [true]. *)
let accepting_state_can_have_successors =
true
let may_have_successors (e : regexp) : bool =
accepting_state_can_have_successors || not (nullable e)
!accepting_state_can_have_successors || not (nullable e)
let dfa (e : regexp) : dfa =
(* Discover and number the nonempty reachable expressions. The most
......
......@@ -70,3 +70,7 @@ end) : sig
val dump: out_channel -> dfa -> unit
end
(* Only for this demo. *)
val accepting_state_can_have_successors: bool ref
......@@ -45,12 +45,13 @@ type test =
rejected: string list;
}
let exec test =
let exec mode test =
Brzozowski.accepting_state_can_have_successors := mode;
printf "Regular expression: %s\n%!" (print test.regexp);
printf "Automaton:\n%!";
let automaton = dfa test.regexp in
printf "%d states.\n%!" (size automaton);
let filename = test.basename ^ ".dot" in
let filename = sprintf "%s.%b.dot" test.basename mode in
let f = open_out filename in
dump f automaton;
close_out f;
......@@ -68,6 +69,10 @@ let exec test =
end
)
(* Run a test in both modes, [true] first and then [false]. This definition
   shadows the two-argument [exec] above, which it calls once per mode. *)
let exec test =
  List.iter (fun mode -> exec mode test) [ true; false ]
(* -------------------------------------------------------------------------- *)
(* Examples. *)
......@@ -76,61 +81,68 @@ let tests =
ref []
let register test =
tests := test :: !tests;
test
tests := test :: !tests
(* [a(a|b)*] *)
(* [dead] *)
let r =
a @@ star (a ||| b)
(* Register the test for the expression [word "dead"]. It lists "dead" as an
   input under [accepted] and "dd" as an input under [rejected]. *)
let () =
register {
(* [exec] derives the name of the .dot output file from [basename]. *)
basename = "dead";
regexp = word "dead";
accepted = [ "dead"; ];
rejected = [ "dd"; ];
}
let test_r =
(* [1dead] *)
let () =
register {
basename = "r";
regexp = r;
basename = "onedead";
regexp = one @@ word "dead";
accepted = [ "dadeadda"; ];
rejected = [ "ddade"; ];
}
(* [a(a|b)*] *)
(* Register the test for the expression [a(a|b)*], built as
   [a @@ star (a ||| b)]. *)
let () =
register {
(* [exec] derives the name of the .dot output file from [basename]. *)
basename = "starab";
regexp = a @@ star (a ||| b);
(* NOTE(review): "ac" is listed here although [c] does not occur in the
   expression; presumably an input counts as accepted as soon as one of its
   prefixes matches -- confirm against [exec]. *)
accepted = [ "a"; "aa"; "abb"; "aaababababa"; "ac"; ];
rejected = [ ""; "b"; "bab"; "ca"; ];
}
(* [a(a|b)*$] *)
let rd =
r @@ eof
let test_rd =
register {
basename = "rd";
regexp = rd;
regexp = a @@ star (a ||| b) @@ eof;
accepted = [ "a$"; "aa$"; "abb$"; "aaababababa$"; ];
rejected = [ "$"; "b$"; "bab$"; "ca$"; "ac$"; ];
}
(* [a(a|b)*(bc)*] *)
let s =
r @@ star (b @@ c)
let _ =
let () =
register {
basename = "s";
regexp = s;
accepted = test_r.accepted @
basename = "astarabstarbc";
regexp = a @@ star (a ||| b) @@ star (b @@ c);
accepted = [ "a"; "aa"; "abb"; "aaababababa"; "ac"; ] @
[ "abc"; "abcb"; "abcbc"; "aabbcbcbc"; "aaabbbcbcbc"; "ac"; ];
rejected = [ "ca" ];
}
(* [a(a|b)*(bc)*$] *)
let sd =
s @@ eof
(* [1a(a|b)*(bc)*$] *)
let _ =
let () =
register {
basename = "sd";
regexp = sd;
accepted = test_rd.accepted @
[ "abc$"; "abcbc$"; "aabbcbcbc$"; "aaabbbcbcbc$"; ];
rejected = [ "ca$"; "ac$"; "abcb$"; ];
basename = "oneastarabstarbc";
regexp = one @@ a @@ star (a ||| b) @@ star (b @@ c) @@ eof;
accepted = [ "ba"; "eeaa"; "edabb"; "eaaaababababa"; "ac"; ] @
[ "abc"; "abcb"; "abcbc"; "aabbcbcbc"; "aaabbbcbcbc"; "ac"; ];
rejected = [ "cc" ];
}
(* An automaton that tests whether a word belongs in a dictionary. *)
......@@ -138,26 +150,20 @@ let _ =
(* The words of the dictionary. Each one is turned into an expression by
   [word], and the resulting expressions are combined with [disjunction]. *)
let keywords =
[ "cab"; "bed"; "ace"; "add"; "dead"; "dad"; ]
let dict =
keywords |> map word |> disjunction
let _ =
let () =
register {
basename = "dict";
regexp = dict;
regexp = keywords |> map word |> disjunction;
accepted = [ "cab"; "added"; "dead"; ];
rejected = [ "deed"; ];
}
(* An automaton that searches for a word that belongs in a dictionary. *)
let k =
one @@ ([ "dead"; "add" ] |> map word |> disjunction)
let _ =
let () =
register {
basename = "k";
regexp = k;
basename = "oneadddead";
regexp = one @@ ([ "dead"; "add" ] |> map word |> disjunction);
accepted = [ "dead"; "addead"; ];
rejected = [ "beca"; ];
}
......
......@@ -9,7 +9,7 @@ test: all
_build/default/Main.exe
@ if command -v dot >/dev/null ; then \
for f in *.dot ; do \
dot -Tpdf $$f -O ; \
dot -Tpng $$f -O ; \
done ; \
else \
echo "The program dot is required to visualize DFAs. Please install graphviz." ; \
......@@ -17,5 +17,5 @@ test: all
.PHONY: clean
clean:
rm -f *~
rm -f *~ *.dot.png
dune clean
# A feeling of déjà vu
<!-- TEMPORARY update title -->
# Fixin' your automata
There are several ways of compiling
a [regular expression](https://en.wikipedia.org/wiki/Regular_expression) (RE)
......@@ -65,9 +64,6 @@ is associative, commutative, and idempotent.
In other words, a disjunction must be viewed as a set of disjuncts.
The empty regular expression can be viewed as an empty disjunction.
[The complete code](http://gitlab.inria.fr/fpottier/fix/demos/brz/Brzozowski.ml)
for this demo is also available.
<!------------------------------------------------------------------------------>
## An alphabet
......@@ -356,11 +352,8 @@ expressions and whose edges are determined by `delta`. What I have just done
is exploit the fact that co-accessibility is easily expressed as a least fixed
point.
<!-- TEMPORARY
Accessibility, too, can be expressed as a least fixed point.
However, to do so, one must have access to the predecessors
of each vertex.
-->
<!-- Accessibility, too, can be expressed as a least fixed point. However, -->
<!-- this requires access to the predecessors of each vertex. -->
<!------------------------------------------------------------------------------>
......@@ -404,7 +397,7 @@ The fragment of this graph that is reachable from `e`
is guaranteed to be finite,
and is exactly the desired automaton.
<!-- TEMPORARY can we point to a proof of finiteness? -->
<!-- Can we point to a proof of finiteness? -->
There are several ways of approaching the construction of this finite graph
fragment. I choose to first perform a forward graph traversal during which I
......@@ -463,6 +456,71 @@ function `delta` through the encoding.
That's all!
<!-- TEMPORARY -->
<!-- show an example of searching for one word, KMP -->
<!-- and an example of searching for multiple words, Aho-Corasick -->
The automaton thus obtained is not necessarily minimal.
<!------------------------------------------------------------------------------>
## Examples
The expression `dead` gives rise, quite naturally, to the following automaton.
As usual, every transition is labelled with a character. In addition, every
state is labelled with the expression that it represents.
![](dead.true.dot.png)
Naturally, this automaton is not very useful, as it merely tests whether the
input **begins** with `dead`. To **search** the input for the word `dead`, one
should use the expression `1dead`. (Here, `1` denotes the universal expression
`one`.) This expression gives rise to the following automaton:
![](onedead.true.dot.png)
Here is another example of searching for a nontrivial pattern. This automaton
corresponds to the expression `1a(a|b)*(bc)*`:
![](oneastarabstarbc.true.dot.png)
The reader may notice that a final state can have outgoing transitions. If one
is interested in finding all matches or in finding a longest match, then this
is useful.
However, if one is interested only in searching for a first match and a
shortest match, then there is no need for a final state to have outgoing
transitions. The above construction can easily be modified so that final
states do not carry any outgoing transitions. This leads to slightly simpler
automata. For instance, the expression `1dead` leads to the following
first-match automaton:
![](onedead.false.dot.png)
Feeding an input text into this automaton is essentially equivalent to
searching for the word `dead` using the Knuth-Morris-Pratt algorithm.
The expression `1(add|dead)`, which is used to search a text for one of the
words `add` and `dead`, gives rise to the following first-match
automaton:
![](oneadddead.false.dot.png)
Feeding an input text into this automaton is essentially equivalent to
searching for one of the words `add` and `dead` using Aho and Corasick's
algorithm.
<!--
Further reading:
https://dl.acm.org/citation.cfm?id=321249
Directly Constructing Minimal DFAs: Combining Two Algorithms by Brzozowski
-->
<!------------------------------------------------------------------------------>
## Conclusion
Memoization, hash-consing, and fixed points are powerful tools. The
[fix](https://gitlab.inria.fr/fpottier/fix/) library makes these tools easy
to use. The conversion of regular expressions to deterministic finite-state
automata by Brzozowski's method is a good illustration of their application.
For more details,
please look at the
[full source code for this demo](https://gitlab.inria.fr/fpottier/fix/demos/brz/).
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment