Commit 78fccdb4 authored by POTTIER Francois's avatar POTTIER Francois
Browse files

Create the library and package. Import the code from Vocal.

parents
_build
.merlin
*.install
*~
Arthur Charguéraud, Émilie Guermeur, François Pottier.
This diff is collapsed.
# ------------------------------------------------------------------------------
# The name of the library.
THIS := chunk
# The version number is automatically set to the current date,
# unless DATE is defined on the command line.
DATE := $(shell /bin/date +%Y%m%d)
# The repository URL (https).
REPO := https://gitlab.inria.fr/fpottier/$(THIS)
# The archive URL (https).
ARCHIVE := $(REPO)/repository/$(DATE)/archive.tar.gz
# The find utility.
FIND := $(shell if command -v gfind >/dev/null ; \
then echo gfind ; else echo find ; fi)
# ------------------------------------------------------------------------------
# Commands.
.PHONY: all
all:
@ dune build -p $(THIS)
.PHONY: install
install: all
@ dune install -p $(THIS)
.PHONY: clean
clean:
@ $(FIND) . -name "*~" | xargs rm -f
@ dune clean
.PHONY: uninstall
uninstall:
@ dune uninstall
.PHONY: pin
pin:
opam pin add $(THIS) .
.PHONY: unpin
unpin:
opam pin remove $(THIS)
# [make versions] compiles and tests the library under many versions of
# OCaml, whose list is specified in the file dune-workspace.versions.
# This requires appropriate opam switches to exist. A missing switch
# can be created like this:
# opam switch create 4.03.0
.PHONY: versions
versions:
@ dune build --workspace dune-workspace.versions @all # @test
# This requires a version of headache that supports UTF-8.
HEADACHE := headache
HEADER := $(shell pwd)/header.txt
.PHONY: headache
headache:
@ $(FIND) src -regex ".*\.ml\(i\|y\|l\)?" \
-exec $(HEADACHE) -h $(HEADER) "{}" ";"
.PHONY: release
release:
# Make sure the current version can be compiled and installed.
@ make clean
@ make install
# Check the current package description.
@ opam lint
# Check if this is the master branch.
@ if [ "$$(git symbolic-ref --short HEAD)" != "master" ] ; then \
echo "Error: this is not the master branch." ; \
git branch ; \
exit 1 ; \
fi
# Check if everything has been committed.
@ if [ -n "$$(git status --porcelain)" ] ; then \
echo "Error: there remain uncommitted changes." ; \
git status ; \
exit 1 ; \
else \
echo "Now making a release..." ; \
fi
# Create a git tag.
@ git tag -a $(DATE) -m "Release $(DATE)."
# Upload. (This automatically makes a .tar.gz archive available on gitlab.)
@ git push
@ git push --tags
.PHONY: publish
publish:
# Publish an opam description.
@ opam publish -v $(DATE) $(THIS) $(ARCHIVE) .
# Chunked Sequences
## Overview
This library offers an efficient implementation of sequences.
This data structure supports all of the standard operations on
stacks, queues, deques,
sequences,
catenable sequences,
random access sequences,
and strings.
This data structure comes in two flavors,
an ephemeral (mutable) flavor
and a persistent (immutable) flavor,
and offers constant-time conversions between these flavors.
It achieves better time complexity and uses less memory than
its competitors.
## Complexity
K = size of chunks (fixed capacity arrays in which elements are stored)
Note: the log base is K, so in practice logn is small
Ephemeral Persistent
- push/pop/front/back in O(1 + logn/K) O(K + logn/K)
usually O(1) O(1)
- concat/split/get/set O(Klogn) O(Klogn)
- iter/fold/... O(n) O(n)
- copy O(K) O(1)
- Ephemeral -> Persistent
destructive O(1)
nondestructive O(K)
- Persistent -> Ephemeral O(K)
## Implementation
### Chunked Sequences
To achieve good performance in terms of time and memory,
chunked sequences store data in
fixed-size *chunks*, implemented using cyclic arrays.
A chunked sequence is a bootstrapped tree-shaped data structure.
It consists of a front chunk and a back chunk,
whose elements are just elements of the sequence,
and a sequence in the middle,
whose elements are chunks of elements of the sequence.
### Persistence
To keep track of sharing in persistent sequences, pchunks are used instead of
chunks. A pchunk represents a fixed-size persistent sequence. It is
implemented as a chunk of shared data (called support) with a view describing
the relevant chunk segment.
This representation allows for constant time pop operations on pchunks, by
reducing the view. Push operations are also done in constant time when the
location to be pushed to in the support does not contain shared data, by
directly pushing onto the support chunk and increasing the view. Otherwise, a
copy-on-write of the support is needed, achieving amortized constant time when
push operations are iterated.
### Transience
To enable fast conversions between persistent and ephemeral sequences, both
sequences use pchunks to store data in their middle. A pchunk then needs to
keep track of whether or not it is uniquely owned by an ephemeral sequence
and can be modified in place.
This uniquely owned property is always false for pchunks in persistent
sequences, and sometimes true for pchunks in ephemeral sequences. In the
conversion from ephemeral to persistent, it must be turned off for all
pchunks.
For this operation to take constant time, sequences and pchunks keep
track of version numbers. A pchunk in an ephemeral sequence with a matching
version number is uniquely owned, and increasing the sequence version number
"disowns" all pchunks.
### Details
#### Files
SeqChunk.ml
Implementation of ephemeral and persistent sequences, and
conversions between them.
SeqChunk.mli
Interface for sequences.
SeqFixedCapacity.ml
Implementation of chunks.
SeqFixedCapacitySig.ml
Signature for chunks. This signature is needed in order to define pchunks
with a functor on chunks.
PSeqFixedCapacity.ml
Implementation of pchunks.
TestSeq.ml
A few tests for sequence operations, for debugging purposes.
#### Functor Arguments
SeqChunk:
- CapacityPolicy: Contains a function mapping tree levels to chunk sizes.
- RestoreDefault: Contains a boolean specifying whether or not to overwrite
no longer used memory with default values to prevent memory leaks.
Pchunk:
- Chunk: The chunk to be used for pchunk supports.
#### Version Logic
In Seq (ephemeral sequences), the version number passed down in function calls
is the sequence version number. New chunks are created with this version
number.
In PSeq (persistent sequences), the version number passed down is -1, which
will never match pchunk version numbers.
Note: Since a (nonempty) PSeq contains pchunks, its version number is stored
in its back pchunk as an optimization.
# Sructure Invariants
The version of a pchunk is less than or equal to the version of any sequence
containing it, and greater than or equal to 0.
In an ephemeral sequence, a support in a pchunk whose version number matches
that of the sequence is not used by any other pchunks.
Pchunks in the middle of an ephemeral sequence have aligned supports and
views.
If the middle of a sequence is nonempty, then its front and back are also
nonempty.
The combined size of any two consecutive pchunks in a sequence middle is
greater than their capacity.
* Find an elegant way of dealing with the symmetry between front and back.
* Clean up and document the source code.
* Clean up `README.md`.
* `RestoreDefault` does not currently guarantee the absence of memory leaks in
strictly ephemeral sequences. To obtain this guarantee for ephemeral
sequences with no copies or conversions, add a case for concat on pchunks
where both pchunks are uniquely owned.
* Remove `check_invariants` from the public interface.
name: "chunk"
opam-version: "2.0"
maintainer: "francois.pottier@inria.fr"
authors: [
"Arthur Charguéraud <arthur.chargueraud@inria.fr>"
"Émilie Guermeur <then at Inria>"
"François Pottier <francois.pottier@inria.fr>"
]
homepage: "https://gitlab.inria.fr/fpottier/chunk"
dev-repo: "git+https://gitlab.inria.fr/fpottier/chunk.git"
bug-reports: "francois.pottier@inria.fr"
build: [
["dune" "build" "-p" name "-j" jobs]
]
depends: [
"ocaml" { >= "4.03" }
"dune" { build & >= "2.0" }
]
synopsis: "An efficient implementation of ephemeral and persistent sequences"
(lang dune 2.0)
(lang dune 2.0)
(context (opam (switch 4.03.0)))
(context (opam (switch 4.04.2)))
(context (opam (switch 4.05.0)))
(context (opam (switch 4.06.1)))
(context (opam (switch 4.07.1)))
(context (opam (switch 4.08.1)))
(context (opam (switch 4.09.0)))
Chunk
Arthur Charguéraud, Émilie Guermeur et François Pottier
Copyright Inria. All rights reserved. This file is distributed under the
terms of the GNU Lesser General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your
option) any later version, as described in the file LICENSE.
(******************************************************************************)
(* *)
(* Chunk *)
(* *)
(* Arthur Charguéraud, Émilie Guermeur et François Pottier *)
(* *)
(* Copyright Inria. All rights reserved. This file is distributed under the *)
(* terms of the GNU Lesser General Public License as published by the Free *)
(* Software Foundation, either version 3 of the License, or (at your *)
(* option) any later version, as described in the file LICENSE. *)
(******************************************************************************)
[@@@ocaml.text
" The function [capacity] receives a depth, which is a nonnegative number.\n It must return a positive number, which represents the physical chunk size\n that should be used at this depth.\n\n For correctness, capacity should be at least 1.\n But to achieve the expected complexity bounds,\n the capacity should be at least 2.\n\n The [short_capacity] is the threshold at which persistent sequences are\n represented in a compact way using only an array.\n "]
module type S = sig val capacity : int -> int val short_capacity : int end
(******************************************************************************)
(* *)
(* Chunk *)
(* *)
(* Arthur Charguéraud, Émilie Guermeur et François Pottier *)
(* *)
(* Copyright Inria. All rights reserved. This file is distributed under the *)
(* terms of the GNU Lesser General Public License as published by the Free *)
(* Software Foundation, either version 3 of the License, or (at your *)
(* option) any later version, as described in the file LICENSE. *)
(******************************************************************************)
[@@@ocaml.text
" Representation of persistent weighted polymorphic circular\n * buffers of pointers (called pchunks) "]
module Make(Chunk:SeqFixedCapacitySig.S) =
struct
type 'a chunk = 'a Chunk.t
type version = int
let no_version = (-1)
type weight = int
type segment = Chunk.segment
type 'a t =
{
version: version ;
support: 'a chunk ;
mutable view: segment ;
mutable weight: weight }
[@@@ocaml.text
" By design, a pchunk is uniquely owned by a sequence data structure\n with version number [v] that owns a pointer to this pchunk iff\n the version number of the pchunk is exactly [v]. "]
let is_uniquely_owned p v = (p.version = v) && (v <> no_version)
let fold_left f a p = Chunk.fold_left_segment f a p.support p.view
let fold_right f p a = Chunk.fold_right_segment f p.support a p.view
let iteri f p = Chunk.iteri_segment f p.support p.view
let iteri_right f p = Chunk.iteri_segment f p.support p.view
let iter f p = Chunk.iter_segment f p.support p.view
let iter_right f p = Chunk.iter_right_segment f p.support p.view
[@@@ocaml.text " Conversions with lists "]
let to_list p = fold_right (fun x -> fun acc -> x :: acc) p []
let is_aligned p = Chunk.is_aligned p.support p.view
let is_valid w p =
(Chunk.is_valid_segment p.support p.view) &&
(p.weight =
(List.fold_left (fun acc -> fun x -> acc + (w x)) 0 (to_list p)))
let default p = Chunk.default p.support
let length p = Chunk.segment_size p.view
let weight p = p.weight
let support p = Chunk.support p.support
let capacity p = Chunk.capacity p.support
let is_empty p = (Chunk.segment_size p.view) = 0
let is_full p = (Chunk.segment_size p.view) = (Chunk.capacity p.support)
let get_contigous_segments p =
Chunk.get_contigous_segments_for_segment p.support p.view
let create d m =
{
version = 0;
support = (Chunk.create d m);
view = (Chunk.empty_segment ());
weight = 0
}
let create_dummy d =
{
version = 0;
support = (Chunk.create_dummy d);
view = (Chunk.empty_segment ());
weight = 0
}
let get_version p = p.version
let set_version p v = assert (v >= p.version); { p with version = v }
let copy_with_fresh_support p v =
{
version = v;
support = (Chunk.copy p.support);
view = (p.view);
weight = (p.weight)
}
let of_chunk c v =
{
version = v;
support = c;
view = (Chunk.get_segment c);
weight = (Chunk.length c)
}
let of_chunk_safe c v =
{
version = v;
support = (Chunk.copy c);
view = (Chunk.get_segment c);
weight = (Chunk.length c)
}
let to_chunk p v =
if is_uniquely_owned p v
then (assert (is_aligned p); p.support)
else Chunk.sub p.support p.view
let push_ephemeral_front p x w =
assert (is_aligned p);
Chunk.push_front p.support x;
p.view <- (Chunk.segment_after_push_front p.support p.view);
p.weight <- (p.weight + (w x));
p
let push_ephemeral_back p x w =
assert (is_aligned p);
Chunk.push_back p.support x;
p.view <- (Chunk.segment_after_push_back p.support p.view);
p.weight <- (p.weight + (w x));
p
let push_persistent_front p x w v =
assert (not (is_uniquely_owned p v));
(let (support, version) =
if
(Chunk.is_aligned_front p.support p.view) &&
(not (Chunk.is_full p.support))
then ((p.support), (p.version))
else ((Chunk.sub p.support p.view), v) in
Chunk.push_front support x;
{
version;
support;
view = (Chunk.segment_after_push_front p.support p.view);
weight = (p.weight + (w x))
})
let push_persistent_back p x w v =
assert (not (is_uniquely_owned p v));
(let (support, version) =
if
(Chunk.is_aligned_back p.support p.view) &&
(not (Chunk.is_full p.support))
then ((p.support), (p.version))
else ((Chunk.sub p.support p.view), v) in
Chunk.push_back support x;
{
version;
support;
view = (Chunk.segment_after_push_back p.support p.view);
weight = (p.weight + (w x))
})
let push_front p x w v =
assert (not (is_full p));
if is_uniquely_owned p v
then push_ephemeral_front p x w
else push_persistent_front p x w v
let push_back p x w v =
assert (not (is_full p));
if is_uniquely_owned p v
then push_ephemeral_back p x w
else push_persistent_back p x w v
let pop_ephemeral_front p w =
assert (is_aligned p);
(let x = Chunk.pop_front p.support in
p.view <- (Chunk.segment_after_pop_front p.support p.view);
p.weight <- (p.weight - (w x));
(x, p))
let pop_ephemeral_back p w =
assert (is_aligned p);
(let x = Chunk.pop_back p.support in
p.view <- (Chunk.segment_after_pop_back p.support p.view);
p.weight <- (p.weight - (w x));
(x, p))
let pop_persistent_front p w =
let x = Chunk.segment_front p.support p.view in
let p' =
{
version = no_version;
support = (p.support);
view = (Chunk.segment_after_pop_front p.support p.view);
weight = (p.weight - (w x))
} in
(x, p')
let pop_persistent_back p w =
let x = Chunk.segment_back p.support p.view in
let p' =
{
version = no_version;
support = (p.support);
view = (Chunk.segment_after_pop_back p.support p.view);
weight = (p.weight - (w x))
} in
(x, p')
let pop_front p w v =
if (length p) = 0 then raise Not_found;
if is_uniquely_owned p v
then pop_ephemeral_front p w
else pop_persistent_front p w
let pop_back p w v =
if (length p) = 0 then raise Not_found;
if is_uniquely_owned p v
then pop_ephemeral_back p w
else pop_persistent_back p w
let front p = Chunk.segment_front p.support p.view
let back p = Chunk.segment_back p.support p.view
[@@@ocaml.text
" Read an element in [p] at an index between 0 and [length p - 1]. "]
let get p i =
assert ((i >= 0) && (i < (length p)));
Chunk.get_in_segment p.support p.view i
[@@@ocaml.text
" Write an element in [p] at an index between 0 and [length p - 1].\n Warning: this should one be performed on uniquely-owned pchunks. "]
let set p i x =
assert ((i >= 0) && (i < (length p)));
Chunk.set_in_segment p.support p.view i x
[@@@ocaml.text " Printing of a chunk "]
let to_string f p =
let address = (2 * (Obj.magic p.support)) mod 10000 in
let str_elems = String.concat "; " (List.map f (to_list p)) in
let head = Chunk.segment_head p.view in
let support_seg = Chunk.get_segment p.support in
Printf.sprintf "(w=%d,v=%d,h=%d,s=%d,sh=%d,ss=%d)[%s]" p.weight
p.version head address (Chunk.segment_head support_seg)
(Chunk.segment_size support_seg) str_elems
[@@@ocaml.text
" Concatenation of two persistent chunks that are not uniquely owned.\n Featuring optimizations to handle the case where the support of\n either of the two chunks has room to accomodate data from the other pchunk.\n These optimization cases can be safely commented out. "]
let concat_persistent p1 p2 =
if
(Chunk.is_aligned_back p1.support p1.view) &&
(((Chunk.capacity p1.support) - (Chunk.length p1.support)) >=
(length p2))
then
let view = Chunk.copy_to_back p2.support p2.view p1.support p1.view in
{
version = no_version;
support = (p1.support);
view;
weight = (p1.weight + p2.weight)
}
else
if
(Chunk.is_aligned_front p2.support p2.view) &&
(((Chunk.capacity p2.support) - (Chunk.length p2.support)) >=
(length p1))
then
(let view =
Chunk.copy_to_front p1.support p1.view p2.support p2.view in
{
version = no_version;
support = (p2.support);
view;
weight = (p1.weight + p2.weight)
})
else
(let support =
Chunk.create (default p1) (Chunk.capacity p2.support) in
let view = Chunk.get_segment support in
let view = Chunk.copy_to_back p1.support p1.view support view in
let view = Chunk.copy_to_back p2.support p2.view support view in
{
version = no_version;
support;
view;
weight = (p1.weight + p2.weight)
})
let concat_ephemeral_front p1 p2 =
assert (is_aligned p2);
(let view = Chunk.copy_to_front p1.support p1.view p2.support p2.view in
p2.view <- view; p2.weight <- (p1.weight + p2.weight); p2)
let concat_ephemeral_back_arguments_reversed p1 p2 =
assert (is_aligned p2);
(let view = Chunk.copy_to_back p1.support p1.view p2.support p2.view in
p2.view <- view; p2.weight <- (p1.weight + p2.weight); p2)
let concat_ephemeral_back p1 p2 =
concat_ephemeral_back_arguments_reversed p2 p1
let concat p1 p2 v =
assert
(((Chunk.segment_size p1.view) + (Chunk.segment_size p2.view)) <=
(Chunk.capacity p2.support));
if is_uniquely_owned p1 v
then concat_ephemeral_back p1 p2
else
if is_uniquely_owned p2 v
then concat_ephemeral_front p1 p2
else concat_persistent p1 p2
let find_weight_index p i w =
assert (i <= p.weight);
(let break_from_loop = ref false in
let cur_index = ref 0 in
let w_before = ref 0 in
let f x =
if not (!break_from_loop)
then
let w_until_cur = (!w_before) + (w x) in
(if w_until_cur <= i
then (incr cur_index; w_before := w_until_cur)
else break_from_loop := true) in
Chunk.iter_segment f p.support p.view; ((!cur_index), (!w_before)))
let three_way_split_index_persistent p j wl w v =
assert (not (is_uniquely_owned p v));
(let (v1, x, v2) = Chunk.three_way_split_segment p.support p.view j in
let p1 =
{
version = no_version;
support = (p.support);
view = v1;