distance.mlw 3.33 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19

(* Correctness of a program computing the minimal distance between
   two words (code by Claude Marché).

   This program computes a variant of the Levenshtein distance. Given
   two strings [w1] and [w2] of respective lengths [n1] and [n2], it
   computes the minimal numbers of insertions and deletions to
   perform in one of the strings to get the other one.  (The
   traditional edit distance also includes substitutions.)

   The nice point about this code is to work in linear space, in an
   array of min(n1,n2) integers. Time complexity is O(n1 * n2), as
   usual.  *)

module Distance

  use import int.Int
  use import int.MinMax
  use import list.List
20 21
  use import module ref.Ref
  use import module array.Array
22 23 24 25

  (* Parameters. Input of the program is composed of two arrays
     of characters, [w1] of size [n1] and [w2] of size [n2]. *)

Andrei Paskevich's avatar
Andrei Paskevich committed
26 27
  function n1 : int
  function n2 : int
28 29 30 31 32

  type a

  type word = list a

Andrei Paskevich's avatar
Andrei Paskevich committed
33 34
  val w1 : array a
  val w2 : array a
35 36 37 38 39

  (* Global variables of the program. The program uses an auxiliary
     array [t] of integers of size [n2+1] and three auxiliary
     integer variables [i], [j] and [old]. *)

Andrei Paskevich's avatar
Andrei Paskevich committed
40
  val t : array int
41

Andrei Paskevich's avatar
Andrei Paskevich committed
42 43 44
  val i : ref int
  val j : ref int
  val o : ref int
45 46 47 48

  (* Auxiliary definitions for the program and its specification. *)

  inductive dist word word int =
49
  | dist_eps :
50 51 52 53 54 55 56 57 58 59 60
      dist Nil Nil 0
  | dist_add_left :
      forall w1 w2: word, n:int.
      dist w1 w2 n -> forall a:a. dist (Cons a w1) w2 (n + 1)
  | dist_add_right :
      forall w1 w2: word, n:int.
      dist w1 w2 n -> forall a:a. dist w1 (Cons a w2) (n + 1)
  | dist_context :
      forall w1 w2: word, n:int.
      dist w1 w2 n -> forall a:a. dist (Cons a w1) (Cons a w2) n

Andrei Paskevich's avatar
Andrei Paskevich committed
61 62
  predicate min_dist (w1 w2:word) (n:int) =
    dist w1 w2 n /\ forall m:int. dist w1 w2 m -> n <= m
63

Andrei Paskevich's avatar
Andrei Paskevich committed
64
  function suffix (array a) int : word
65

66
  axiom suffix_def_1:
67
    forall m: array a. suffix m (length m) = Nil
68
  axiom suffix_def_2:
69
    forall m: array a, i: int.
70 71
    0 <= i < length m -> suffix m i = Cons m[i] (suffix m (i+1))

Andrei Paskevich's avatar
Andrei Paskevich committed
72
  predicate min_suffix (w1 w2: array a) (i j n: int) =
73 74
    min_dist (suffix w1 i) (suffix w2 j) n

Andrei Paskevich's avatar
Andrei Paskevich committed
75
  function word_of_array (m: array a) : word = suffix m 0
76 77 78 79

  (* The code. *)

  let distance () =
Andrei Paskevich's avatar
Andrei Paskevich committed
80
    { length w1 = n1 /\ length w2 = n2 /\ length t = n2+1 }
81 82 83
    begin
      (* initialization of t *)
      for i = 0 to n2 do
Andrei Paskevich's avatar
Andrei Paskevich committed
84
        invariant { length t = n2+1 /\
85
                    forall j:int. 0 <= j < i -> t[j] = n2-j }
86
       t[i] <- n2 - i
87 88 89 90
      done;
      (* loop over w1 *)
      for i = n1-1 downto 0 do
        invariant { length t = n2+1
Andrei Paskevich's avatar
Andrei Paskevich committed
91
           /\ forall j:int. 0 <= j <= n2 -> min_suffix w1 w2 (i+1) j t[j] }
92
        o := t[n2];
93
        t[n2] <- t[n2] + 1;
94 95 96
        (* loop over w2 *)
        for j = n2-1 downto 0 do
          invariant { length t = n2+1
Andrei Paskevich's avatar
Andrei Paskevich committed
97 98 99
              /\ (forall k:int. j < k <= n2 -> min_suffix w1 w2 i k t[k])
              /\ (forall k:int. 0 <= k <= j -> min_suffix w1 w2 (i+1) k t[k])
              /\ min_suffix w1 w2 (i+1) (j+1) !o }
100 101 102 103
          begin
      	    let temp = !o in
            o := t[j];
	    if w1[i] = w2[j] then
104
	      t[j] <- temp
105
	    else
106
	      t[j] <- (min t[j] t[j+1]) + 1
107 108 109 110 111 112 113 114 115 116
          end
        done
      done;
      t[0]
    end
    { min_dist (word_of_array w1) (word_of_array w2) result }

end

(*
117
Local Variables:
118
compile-command: "unset LANG; make -C ../.. examples/programs/distance.gui"
119
End:
120
*)