diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 12a5bd615bf0ad7c3cf64ee91ea72f4d3c3f2611..7204f50084f5652a5926741da54246841f2d834d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -3,7 +3,7 @@
 # @copyright (c) 2009-2014 The University of Tennessee and The University
 #                          of Tennessee Research Foundation.
 #                          All rights reserved.
-# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2017 Inria. All rights reserved.
 # @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
 #
 ###
@@ -22,6 +22,7 @@
 # @author Cedric Castagnede
 # @author Emmanuel Agullo
 # @author Mathieu Faverge
+# @author Florent Pruvost
 # @date 13-07-2012
 #
 ###
@@ -34,8 +35,8 @@ cmake_minimum_required(VERSION 2.8)
 #
 ###############################################
 add_subdirectory(doxygen)
-add_subdirectory(texinfo)
-
+add_subdirectory(orgmode)
+#add_subdirectory(texinfo)
 ###
 ### END CMakeLists.txt
 ###
diff --git a/doc/orgmode/CMakeLists.txt b/doc/orgmode/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..124cbe2e20da88677854d44d090dd7c0287e444a
--- /dev/null
+++ b/doc/orgmode/CMakeLists.txt
@@ -0,0 +1,103 @@
+###
+#
+# @copyright (c) 2017 Inria. All rights reserved.
+#
+###
+#
+# @file CMakeLists.txt
+#
+# @project MORSE
+# MORSE is a software package provided by:
+#     Inria Bordeaux - Sud-Ouest,
+#     Univ. of Tennessee,
+#     King Abdullah University of Science and Technology
+#     Univ. of California Berkeley,
+#     Univ. of Colorado Denver.
+#
+# @version 1.0.0
+# @author Florent Pruvost
+# @date 25-08-2017
+#
+###
+
+cmake_minimum_required(VERSION 2.8)
+
+# Create the files version.org and users_guide.org
+# ------------------------------------------------
+configure_file("version.org.in"
+               "version.org"
+               @ONLY)
+configure_file("users_guide.org.in"
+               "users_guide.org"
+               @ONLY)
+
+set(FIGURES
+    tile_lu.pdf
+    tile_lu.jpg
+    tile_layout.pdf
+    tile_layout.jpg
+    trace_qr.pdf
+    trace_qr.jpg
+    potri_async.png
+    morse_header.png
+    )
+set(FIGURES_HTML
+    tile_lu.jpg
+    tile_layout.jpg
+    trace_qr.jpg
+    potri_async.png
+    morse_header.png
+    )
+
+foreach(_fig ${FIGURES})
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/figures/${_fig}
+                 ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+                 COPYONLY)
+endforeach()
+
+# Looking for emacs
+# -----------------
+find_program(EMACS_COMPILER emacs)
+
+if(EMACS_COMPILER)
+  # Add targets
+  # -----------
+  add_custom_command(OUTPUT users_guide.html
+                     COMMAND ${EMACS_COMPILER}
+                     ARGS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                          --batch
+                          -f
+                          org-html-export-to-html
+                          --kill
+                     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                     )
+  add_custom_command(OUTPUT users_guide.pdf
+                     COMMAND ${EMACS_COMPILER}
+                     ARGS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                          --batch
+                          -f
+                          org-latex-export-to-pdf
+                          --kill
+                     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                     )
+  add_custom_target(doc-html-users_guide ALL DEPENDS users_guide.html)
+  add_custom_target(doc-pdf-users_guide ALL DEPENDS users_guide.pdf)
+
+  # Installation
+  # ------------
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.html
+          DESTINATION share/chameleon/html)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.pdf
+          DESTINATION share/chameleon/pdf)
+  foreach(_fig ${FIGURES_HTML})
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+            DESTINATION share/chameleon/html)
+  endforeach()
+
+else(EMACS_COMPILER)
+  message(STATUS "Looking for emacs - not found")
+endif(EMACS_COMPILER)
+
+###
+### END CMakeLists.txt
+###
diff --git a/doc/orgmode/chapters/configuration.org b/doc/orgmode/chapters/configuration.org
new file mode 100644
index 0000000000000000000000000000000000000000..8d963735bb19d672fd8ff7cb7649c03730b15205
--- /dev/null
+++ b/doc/orgmode/chapters/configuration.org
@@ -0,0 +1,366 @@
+@c -*-texinfo-*-
+
+@c This file is part of the MORSE Handbook.
+@c Copyright (C) 2017 Inria
+@c Copyright (C) 2014 The University of Tennessee
+@c Copyright (C) 2014 King Abdullah University of Science and Technology
+@c See the file ../chameleon.texi for copying conditions.
+
+@menu
+* Compilation configuration::
+* Dependencies detection::
+@c * Dependencies compilation::
+* Use FxT profiling through StarPU::
+* Use simulation mode with StarPU-SimGrid::
+* Use out of core support with StarPU::
+@end menu
+
+@node Compilation configuration
+@section Compilation configuration
+
+The following arguments can be given to the @command{cmake <path to source
+directory>} command.
+
+In this chapter, the following convention is used:
+@itemize @bullet
+@item
+@option{path} is a path in your filesystem,
+@item
+@option{var} is a string and the correct value or an example will be given,
+@item
+@option{trigger} is a CMake option and the correct value is @code{ON} or
+@code{OFF}.
+@end itemize
+
+Using CMake, there are several ways to give options:
+@enumerate
+@item directly as CMake command line arguments
+@item invoke @command{cmake <path to source directory>} once and then use
+@command{ccmake <path to source directory>} to edit options through a
+minimalist GUI (requires @samp{cmake-curses-gui} to be installed on a
+Linux system)
+@item invoke the @command{cmake-gui} command and fill in the information
+about the location of the sources and where to build the project; you then
+have access to options through a user-friendly Qt interface (requires
+@samp{cmake-qt-gui} to be installed on a Linux system)
+@end enumerate
+
+Example of configuration using the command line:
+@example
+cmake ~/chameleon/ -DCMAKE_BUILD_TYPE=Debug \
+                   -DCMAKE_INSTALL_PREFIX=~/install \
+                   -DCHAMELEON_USE_CUDA=ON \
+                   -DCHAMELEON_USE_MPI=ON \
+                   -DBLA_VENDOR=Intel10_64lp \
+                   -DSTARPU_DIR=~/install/starpu-1.1 \
+                   -DCHAMELEON_ENABLE_TRACING=ON
+@end example
+
+You can get the full list of options with the @option{-L[A][H]} option of
+the @command{cmake} command:
+@example
+cmake -LH <path to source directory>
+@end example
+
+@menu
+* General CMake options::
+* CHAMELEON options::
+@end menu
+
+@node General CMake options
+@subsection General CMake options
+
+@table @code
+
+@item -DCMAKE_INSTALL_PREFIX=@option{path} (default: @option{path=/usr/local})
+Install directory used by @code{make install}, where some headers and
+libraries will be copied.
+Write permissions on @option{path} are required during the
+@code{make install} step.
+
+@item -DCMAKE_BUILD_TYPE=@option{var} (default: @option{Release})
+Defines the build type and the compiler optimization level.
+The possible values for @option{var} are:
+@table @code
+@item empty
+@item Debug
+@item Release
+@item RelWithDebInfo
+@item MinSizeRel
+@end table
+
+@item -DBUILD_SHARED_LIBS=@option{trigger} (default: @option{OFF})
+Indicates whether CMake builds CHAMELEON as static (@option{OFF}) or
+shared (@option{ON}) libraries.
+
+@end table
+
+@node CHAMELEON options
+@subsection CHAMELEON options
+
+List of CHAMELEON options that can be enabled/disabled (value=@code{ON}
+or @code{OFF}):
+@table @code
+
+@item @option{-DCHAMELEON_SCHED_STARPU}=@option{trigger} (default: @code{ON})
+to link with the StarPU library (runtime system)
+
+@item @option{-DCHAMELEON_SCHED_QUARK}=@option{trigger} (default: @code{OFF})
+to link with the QUARK library (runtime system)
+
+@item @option{-DCHAMELEON_USE_CUDA}=@option{trigger} (default: @code{OFF})
+to link with the CUDA runtime (implementation paradigm for accelerated codes
+on GPUs) and the cuBLAS library (optimized BLAS kernels on GPUs); can only
+be used with StarPU
+
+@item @option{-DCHAMELEON_USE_MPI}=@option{trigger} (default: @code{OFF})
+to link with the MPI library (message passing implementation for the use of
+multiple nodes with distributed memory); can only be used with StarPU
+
+@item @option{-DCHAMELEON_ENABLE_TRACING}=@option{trigger} (default: @code{OFF})
+to enable trace generation during the execution of timing drivers.
+It requires StarPU to be linked with the FxT library (traces execution of
+kernels on workers).
+
+@item @option{-DCHAMELEON_SIMULATION}=@option{trigger} (default: @code{OFF})
+to enable simulation mode, meaning that CHAMELEON will not actually execute
+tasks; see details in section @ref{Use simulation mode with StarPU-SimGrid}.
+This option must be used with StarPU compiled with
+@uref{http://simgrid.gforge.inria.fr/, SimGrid}, which makes it possible to
+predict execution times on any architecture.
+This feature should be used to experiment with scheduler behavior and
+performance, not to produce solutions of linear systems.
+
+@item @option{-DCHAMELEON_ENABLE_DOCS}=@option{trigger} (default: @code{ON})
+to control the build of the documentation contained in the @file{docs/}
+sub-directory
+@item @option{-DCHAMELEON_ENABLE_EXAMPLE}=@option{trigger} (default: @code{ON})
+to control the build of the example executables (API usage)
+contained in the @file{example/} sub-directory
+@item @option{-DCHAMELEON_ENABLE_TESTING}=@option{trigger} (default: @code{ON})
+to control the build of the testing executables (numerical checks) contained
+in the @file{testing/} sub-directory
+@item @option{-DCHAMELEON_ENABLE_TIMING}=@option{trigger} (default: @code{ON})
+to control the build of the timing executables (performance checks)
+contained in the @file{timing/} sub-directory
+
+@item @option{-DCHAMELEON_PREC_S}=@option{trigger} (default: @code{ON})
+to enable the support of single arithmetic precision (float in C)
+@item @option{-DCHAMELEON_PREC_D}=@option{trigger} (default: @code{ON})
+to enable the support of double arithmetic precision (double in C)
+@item @option{-DCHAMELEON_PREC_C}=@option{trigger} (default: @code{ON})
+to enable the support of complex arithmetic precision (complex in C)
+@item @option{-DCHAMELEON_PREC_Z}=@option{trigger} (default: @code{ON})
+to enable the support of double complex arithmetic precision (double complex
+in C)
+
+@item @option{-DBLAS_VERBOSE}=@option{trigger} (default: @code{OFF})
+to make the BLAS library discovery verbose
+@item @option{-DLAPACK_VERBOSE}=@option{trigger} (default: @code{OFF})
+to make the LAPACK library discovery verbose (automatically enabled if
+@option{BLAS_VERBOSE=@code{ON}})
+@end table
+
+List of CHAMELEON options that need a specific value:
+@table @code
+@item @option{-DBLA_VENDOR=@option{var}} (default: @option{empty})
+The possible values for @option{var} are:
+@table @code
+@item empty
+@item all
+@item Intel10_64lp
+@item Intel10_64lp_seq
+@item ACML
+@item Apple
+@item Generic
+@item ...
+@end table
+to force CMake to find a specific BLAS library; see the full list of
+BLA_VENDOR values in @file{FindBLAS.cmake} in @file{cmake_modules/morse/find}.
+By default @option{BLA_VENDOR} is empty, so that CMake tries to detect all
+possible BLAS vendors, with a preference for Intel MKL.
+@end table
+
+List of CHAMELEON options that require a path:
+@table @code
+@item @option{-DLIBNAME_DIR=@option{path}} (default: empty)
+root directory of the LIBNAME library installation
+@item @option{-DLIBNAME_INCDIR=@option{path}} (default: empty)
+directory of the LIBNAME library headers installation
+@item @option{-DLIBNAME_LIBDIR=@option{path}} (default: empty)
+directory of the LIBNAME libraries (.so, .a, .dylib, etc.) installation
+@end table
+LIBNAME can be one of the following: BLAS - CBLAS - FXT - HWLOC -
+LAPACK - LAPACKE - QUARK - STARPU - TMG.
+See the paragraph about @ref{Dependencies detection} for details.
+
+Libraries detected with an official CMake module (see the module files in
+@file{CMAKE_ROOT/Modules/}):
+@itemize @bullet
+@item CUDA
+@item MPI
+@item Threads
+@end itemize
+
+Libraries detected with CHAMELEON CMake modules (see the module files in
+the @file{cmake_modules/morse/find/} directory of the CHAMELEON sources):
+@itemize @bullet
+@item BLAS
+@item CBLAS
+@item FXT
+@item HWLOC
+@item LAPACK
+@item LAPACKE
+@item QUARK
+@item STARPU
+@item TMG
+@end itemize
+
+
+@node Dependencies detection
+@section Dependencies detection
+You have different choices to detect dependencies on your system, either by
+setting some environment variables containing paths to the libraries and
+headers or by specifying them directly at CMake configure time.
+Different cases:
+@enumerate
+@item detection of dependencies through environment variables:
+  @itemize @bullet
+  @item the @env{LD_LIBRARY_PATH} environment variable should contain the
+list of paths where the libraries can be found:
+  @example
+  export @env{LD_LIBRARY_PATH}=$@env{LD_LIBRARY_PATH}:path/to/your/libs
+  @end example
+  @item the @env{INCLUDE} environment variable should contain the list of
+paths where the header files of the libraries can be found
+  @example
+  export @env{INCLUDE}=$@env{INCLUDE}:path/to/your/headers
+  @end example
+  @end itemize
+
+@item detection with user-given paths:
+  @itemize @bullet
+  @item you can specify the path at CMake configure time by invoking
+  @example
+  cmake <path to SOURCE_DIR> -DLIBNAME_DIR=path/to/your/lib
+  @end example
+  where LIBNAME stands for the name of the library to look for, for example
+  @example
+  cmake <path to SOURCE_DIR> -DSTARPU_DIR=path/to/starpudir \
+                             -DCBLAS_DIR= ...
+  @end example
+  @item it is also possible to specify header and library directories
+separately, for example
+  @example
+  cmake <path to SOURCE_DIR> \
+        -DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \
+        -DSTARPU_LIBDIR=path/to/libstarpu/lib
+  @end example
+  @item note that BLAS and LAPACK detection can be tedious, so we provide a
+verbose mode. Use @option{-DBLAS_VERBOSE=ON} or @option{-DLAPACK_VERBOSE=ON}
+to enable it.
+  @end itemize
+
+@end enumerate
+
+
+@c @node Dependencies compilation
+@c @section Dependencies compilation
+
+@node Use FxT profiling through StarPU
+@section Use FxT profiling through StarPU
+
+StarPU can generate its own trace log files by compiling it with the
+@option{--with-fxt}
+option at the configure step (you may have to specify the directory where
+you installed FxT by giving @option{--with-fxt=...} instead of
+@option{--with-fxt} alone).
+By doing so, traces are generated after each execution of a program which
+uses StarPU, in the directory pointed to by the @env{STARPU_FXT_PREFIX}
+environment variable. Example:
+@example
+export @env{STARPU_FXT_PREFIX}=/home/yourname/fxt_files/
+@end example
+
+When executing a @command{./timing/...} CHAMELEON program, if tracing has
+been enabled (StarPU compiled with FxT and
+@option{-DCHAMELEON_ENABLE_TRACING=ON}), you can give the option
+@option{--trace} to tell the program to generate trace log files.
+
+Finally, to generate the trace file which can be opened with the
+@uref{http://vite.gforge.inria.fr/, ViTE} program, you have to use the
+@command{starpu_fxt_tool} executable of StarPU.
+This tool should be in @file{path/to/your/install/starpu/bin}.
+You can use it to generate the trace file like this:
+@itemize @bullet
+@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename}
+
+There is one file per MPI process (prof_filename_0, prof_filename_1, ...).
+To generate a trace of MPI programs you can call it like this:
+@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i
+prof_filename*}
+
+The trace file will be named paje.trace (use the -o option to specify an
+output name).
+@end itemize
+
+Alternatively, one can also generate .paje trace files directly after the
+execution by setting @env{STARPU_GENERATE_TRACE=1}.
+
+@node Use simulation mode with StarPU-SimGrid
+@section Use simulation mode with StarPU-SimGrid
+
+Simulation mode can be enabled by setting the CMake option
+@option{-DCHAMELEON_SIMULATION=ON}.
+This mode allows you to simulate the execution of algorithms with StarPU
+compiled with @uref{http://simgrid.gforge.inria.fr/, SimGrid}.
+To do so, we provide some perfmodels in the @file{simucore/perfmodels/}
+directory of the CHAMELEON sources.
+To use these perfmodels, please set the following:
+@itemize @bullet
+@item the @env{STARPU_HOME} environment variable to:
+  @example
+  @code{<path to SOURCE_DIR>/simucore/perfmodels}
+  @end example
+@item the @env{STARPU_HOSTNAME} environment variable to the name of the
+machine to simulate. For example, on our platform (PlaFRIM) with GPUs at
+Inria Bordeaux:
+  @example
+  @env{STARPU_HOSTNAME}=mirage
+  @end example
+Note that only POTRF kernels with block sizes of 320 or 960 (single and
+double precision) on the mirage machine are available for now.
+The database of models is subject to change; it should be enriched in the
+near future.
+@end itemize
+
+@node Use out of core support with StarPU
+@section Use out of core support with StarPU
+
+If the matrix cannot fit in the main memory, StarPU can automatically evict
+tiles to the disk. The descriptors for the matrices which cannot fit in the
+main memory need to be created with @code{MORSE_Desc_Create_OOC}, so that
+MORSE does not force StarPU to keep them in the main memory.
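+
+As an illustration, here is a minimal sketch of creating such a descriptor.
+The values are hypothetical, and the argument list is assumed to be
+analogous to the in-core @code{MORSE_Desc_Create}, minus the user-allocated
+buffer:
+@example
+/* Assumption: same arguments as MORSE_Desc_Create without the
+   user data pointer, since evicted tiles live on disk. */
+MORSE_desc_t *descA = NULL;
+int NB = 320;      /* tile size (example value)    */
+int N  = 100000;   /* matrix order (example value) */
+MORSE_Desc_Create_OOC(&descA, MorseRealDouble,
+                      NB, NB, NB*NB, /* tile rows/columns/elements */
+                      N, N,          /* leading dimensions         */
+                      0, 0,          /* row and column offsets     */
+                      N, N,          /* submatrix size             */
+                      1, 1);         /* p x q process grid         */
+@end example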
+
+The following variables then need to be set:
+@itemize @bullet
+@item the @env{STARPU_DISK_SWAP} environment variable to a place where
+evicted tiles will be stored, for example:
+  @example
+  @env{STARPU_DISK_SWAP}=/tmp
+  @end example
+@item the @env{STARPU_DISK_SWAP_BACKEND} environment variable to the I/O
+method, for example:
+  @example
+  @env{STARPU_DISK_SWAP_BACKEND}=unistd_o_direct
+  @end example
+@item the @env{STARPU_LIMIT_CPU_MEM} environment variable to the maximum
+amount of main memory that can be used, in MBytes, for example:
+  @example
+  @env{STARPU_LIMIT_CPU_MEM}=1000
+  @end example
+@end itemize
diff --git a/doc/orgmode/chapters/installing.org b/doc/orgmode/chapters/installing.org
new file mode 100644
index 0000000000000000000000000000000000000000..745e6863a9f604a49f6497547135d276d076927d
--- /dev/null
+++ b/doc/orgmode/chapters/installing.org
@@ -0,0 +1,485 @@
+# This file is part of the Chameleon User's Guide.
+# Copyright (C) 2017 Inria
+# See the file ../users_guide.org for copying conditions.
+
+Chameleon is written in C and provides an interface callable from
+Fortran. It depends on a couple of external libraries that must be
+installed on the system.
+
+Chameleon can be built and installed by the standard means of CMake
+(http://www.cmake.org/). General information about CMake, as well as
+installation binaries and CMake source code, are available from
+http://www.cmake.org/cmake/resources/software.html.
+
+For help installing a full distribution (Chameleon + dependencies),
+we encourage users to use the morse branch of *Spack*.
+
+
+** Getting Chameleon
+   The latest official release tarballs of Chameleon sources are
+   available for download from
+   https://gitlab.inria.fr/solverstack/chameleon/tags.
+
+   The latest development snapshot is available on GitLab at
+   https://gitlab.inria.fr/solverstack/chameleon.
+
+** Chameleon prerequisites
+   To install Chameleon's libraries, header files, and executables, one
+   needs:
+   - CMake (version 2.8 minimum): the build system
+   - C and Fortran compilers: the GNU compiler suite, Clang, Intel or
+     IBM compilers can be used
+   - Python: to generate files in the different precisions
+   - external libraries: this depends on the configuration; by default
+     the required libraries are:
+     - StarPU: http://runtime.bordeaux.inria.fr/StarPU/
+     - CBLAS, LAPACKE: these are interfaces and there are several
+       providers that can be used with Chameleon
+       - Intel MKL, Netlib, OpenBLAS
+     - BLAS, LAPACK, TMGLIB: there are several providers that can be
+       used with Chameleon
+       - Eigen, Intel MKL, Netlib, OpenBLAS
+     - pthread (libpthread)
+     - math (libm)
+
+   Optional libraries:
+   - QUARK: http://icl.cs.utk.edu/quark/
+   - CUDA: https://developer.nvidia.com/cuda-downloads
+   - cuBLAS: comes with CUDA, http://docs.nvidia.com/cuda/cublas/
+   - MPI: OpenMPI, http://www.open-mpi.org/
+
+   These packages must be installed on the system before trying to
+   configure/build Chameleon. Please look at the distrib/ directory,
+   which gives some hints for the installation of dependencies for Unix
+   systems.
+
+   We give here some examples for a Debian system:
+   #+begin_src sh
+
+   # Update Debian packages list
+   sudo apt-get update
+   # Install Netlib BLAS, LAPACK, TMGLIB, CBLAS and LAPACKE suite
+   sudo apt-get install -y liblapack-dev liblapacke-dev
+   # Alternatively to Netlib, OpenBLAS can be used (faster kernels)
+   sudo apt-get install -y libopenblas-dev liblapacke-dev
+   # Install OpenMPI
+   sudo apt-get install -y libopenmpi-dev
+   # Install hwloc (used by StarPU or QUARK, already a dependency of OpenMPI)
+   sudo apt-get install -y libhwloc-dev
+   # Install FxT, useful to export some nice execution traces with StarPU
+   sudo apt-get install -y libfxt-dev
+   # Install CUDA and cuBLAS: only if you have a CUDA-compatible GPU
+   sudo apt-get install -y nvidia-cuda-toolkit nvidia-cuda-dev
+
+   # Install StarPU (with MPI and FxT enabled)
+   mkdir -p $HOME/install
+   cd $HOME/install
+   wget http://starpu.gforge.inria.fr/files/starpu-1.2.2/starpu-1.2.2.tar.gz
+   tar xvzf starpu-1.2.2.tar.gz
+   cd starpu-1.2.2/
+   ./configure --prefix=$HOME/install/starpu --disable-opencl --disable-cuda --with-fxt=/usr/lib/x86_64-linux-gnu/
+   make
+   make install
+   cd $HOME/install
+   rm starpu-1.2.2/ starpu-1.2.2.tar.gz -rf
+
+   # Install QUARK: to be used in place of StarPU
+   mkdir -p $HOME/install
+   cd $HOME/install
+   wget http://icl.cs.utk.edu/projectsfiles/quark/pubs/quark-0.9.0.tgz
+   tar xvzf quark-0.9.0.tgz
+   cd quark-0.9.0/
+   sed -i -e "s#prefix=\.\/install#prefix=$HOME/install/quark#g" make.inc
+   sed -i -e "s#CFLAGS=-O2#CFLAGS=-O2 -fPIC#g" make.inc
+   make
+   make install
+   cd $HOME/install
+   rm quark-0.9.0/ quark-0.9.0.tgz -rf
+
+   #+end_src
+
+*** Some details about dependencies
+**** BLAS implementation
+     [[http://www.netlib.org/blas/][BLAS]] (Basic Linear Algebra Subprograms) is a de facto standard
+     for basic linear algebra operations such as vector and matrix
+     multiplication. A FORTRAN implementation of BLAS is available
+     from Netlib, and a C implementation of BLAS is included in GSL
+     (GNU Scientific Library). Both of these are reference
+     implementations of BLAS; they are not optimized for modern
+     processor architectures and provide an order of magnitude lower
+     performance than optimized implementations. Highly optimized
+     implementations of BLAS are available from many hardware vendors,
+     such as Intel MKL, IBM ESSL and AMD ACML. Fast implementations
+     are also available as academic packages, such as ATLAS and
+     OpenBLAS. The standard interface to BLAS is the FORTRAN
+     interface.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference BLAS from NETLIB, OpenBLAS and Intel MKL.
+**** CBLAS
+     [[http://www.netlib.org/blas/#_cblas][CBLAS]] is a C language interface to BLAS. Most commercial and
+     academic implementations of BLAS also provide CBLAS. Netlib
+     provides a reference implementation of CBLAS on top of FORTRAN
+     BLAS (Netlib CBLAS). Since GSL is implemented in C, it naturally
+     provides CBLAS.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference CBLAS from NETLIB, OpenBLAS and Intel MKL.
+
+**** LAPACK implementation
+     [[http://www.netlib.org/lapack/][LAPACK]] (Linear Algebra PACKage) is a software library for
+     numerical linear algebra, a successor of LINPACK and EISPACK and
+     a predecessor of Chameleon. LAPACK provides routines for solving
+     linear systems of equations, linear least squares problems,
+     eigenvalue problems and singular value problems. Most commercial
+     and academic BLAS packages also provide some LAPACK routines.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference LAPACK from NETLIB, OpenBLAS and Intel MKL.
+
+**** LAPACKE
+     [[http://www.netlib.org/lapack/][LAPACKE]] is a C language interface to LAPACK (or CLAPACK). It is
+     produced by Intel in coordination with the LAPACK team and is
+     available in source code from Netlib in its original version
+     (Netlib LAPACKE) and from the Chameleon website in an extended
+     version (LAPACKE for Chameleon). In addition to implementing the
+     C interface, LAPACKE also provides routines which automatically
+     handle workspace allocation, making the use of LAPACK much more
+     convenient.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference LAPACKE from NETLIB, OpenBLAS and Intel MKL.
+
+**** libtmg
+     [[http://www.netlib.org/lapack/][libtmg]] is a component of the LAPACK library, containing routines
+     for the generation of input matrices for testing and timing of
+     LAPACK. The testing and timing suites of LAPACK require libtmg,
+     but not the library itself. Note that the LAPACK library can be
+     built and used without libtmg.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference TMGLIB from NETLIB, OpenBLAS and Intel MKL.
+
+**** QUARK
+     [[http://icl.cs.utk.edu/quark/][QUARK]] (QUeuing And Runtime for Kernels) provides a library that
+     enables the dynamic execution of tasks with data dependencies in
+     a multi-core, multi-socket, shared-memory environment. One of
+     the QUARK or StarPU runtime systems has to be enabled in order to
+     schedule tasks on the architecture. If QUARK is enabled then
+     StarPU is disabled, and conversely. Note that StarPU is enabled
+     by default. When Chameleon is linked with QUARK, neither CUDA
+     (for GPUs) nor MPI (distributed-memory environments) can be
+     exploited. You can use StarPU to do so.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the QUARK library 0.9.
+
+**** StarPU
+     [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] is a task programming library for hybrid architectures.
+     StarPU handles run-time concerns such as:
+     - task dependencies
+     - optimized heterogeneous scheduling
+     - optimized data transfers and replication between main memory
+       and discrete memories
+     - optimized cluster communications
+     StarPU can be used to benefit from GPUs and distributed-memory
+     environments. One of the QUARK or StarPU runtime systems has to
+     be enabled in order to schedule tasks on the architecture. If
+     StarPU is enabled then QUARK is disabled, and conversely. Note
+     that StarPU is enabled by default.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the StarPU 1.1 and 1.2 releases.
+
+**** FxT
+     [[http://download.savannah.gnu.org/releases/fkt/][FxT]] stands for both FKT (Fast Kernel Tracing) and FUT (Fast User
+     Tracing). This library provides efficient support for recording
+     traces. Chameleon can trace kernel execution on the different
+     workers and produce .paje files if FxT is enabled. FxT can only
+     be used through StarPU, and StarPU must be compiled with FxT
+     enabled; see how to use this feature in the section
+     [[sec:trace][Execution tracing with StarPU]].
+
+     *Caution about the compatibility:* FxT should be compatible with
+     the version of StarPU used.
+**** hwloc
+     [[http://www.open-mpi.org/projects/hwloc/][hwloc]] (Portable Hardware Locality) is a software package for
+     accessing the topology of a multicore system, including
+     components like cores, sockets, caches and NUMA nodes. The
+     topology discovery library, ~hwloc~, is not mandatory for StarPU
+     but strongly recommended. It makes it possible to increase
+     performance and to perform some topology-aware scheduling.
+     ~hwloc~ is available in major distributions and for most OSes,
+     and can be downloaded from http://www.open-mpi.org/software/hwloc.
+
+**** pthread
+     The POSIX threads library is required to run Chameleon on
+     Unix-like systems. It is a standard component of any such system.
+
+*** Optional dependencies
+**** OpenMPI
+     [[http://www.open-mpi.org/][OpenMPI]] is an open source Message Passing Interface
+     implementation for execution on multiple nodes with distributed
+     memory. MPI can be enabled only if the runtime system chosen is
+     StarPU (default). To use MPI through StarPU, it is necessary to
+     compile StarPU with MPI enabled.
+
+     *Caution about the compatibility:* OpenMPI should be built with the
+     --enable-mpi-thread-multiple option.
+
+**** Nvidia CUDA Toolkit
+     The [[https://developer.nvidia.com/cuda-toolkit][Nvidia CUDA Toolkit]] provides a comprehensive development
+     environment for C and C++ developers building GPU-accelerated
+     applications. Chameleon can use a set of low-level optimized
+     kernels coming from cuBLAS to accelerate computations on GPUs.
+     The [[http://docs.nvidia.com/cuda/cublas/][cuBLAS]] library is an implementation of BLAS (Basic Linear
+     Algebra Subprograms) on top of the Nvidia CUDA runtime. cuBLAS
+     is normally distributed with the Nvidia CUDA Toolkit. CUDA/cuBLAS
+     can be enabled in Chameleon only if the runtime system chosen is
+     StarPU (default). To use CUDA through StarPU, it is necessary to
+     compile StarPU with CUDA enabled.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with CUDA releases from versions 4 to 7.5. Your compiler must be
+     compatible with CUDA.
+
+** Distribution of Chameleon using Spack
+   For help installing a full distribution (Chameleon +
+   dependencies), we encourage users to use the morse branch of *Spack*.
+
+   Please read this documentation:
+   * [[http://morse.gforge.inria.fr/spack/spack.html][Spack Morse]]
+   * [[http://morse.gforge.inria.fr/spack/spack.html#orgd5b1afe][Section Chameleon]]
+
+*** Usage example for a simple distribution of Chameleon
+    #+begin_src sh
+    git clone https://github.com/solverstack/spack.git
+    . ./spack/share/spack/setup-env.sh
+    spack install -v chameleon
+    # chameleon is installed here:
+    `spack location -i chameleon`
+    #+end_src
+
+** Build and install Chameleon with CMake
+   Compilation of the Chameleon libraries and executables is done with
+   CMake (http://www.cmake.org/). This version has been tested with
+   CMake 3.5.1, but any version newer than 2.8 should be fine.
+
+   Here are the steps to configure, build, test and install:
+   1. configure:
+      #+begin_src
+      cmake path/to/chameleon -DOPTION1= -DOPTION2= ...
+      # see the "Options" section to get the list of options
+      # see the "Dependencies detection" section for details about library detection
+      #+end_src
+   2. build:
+      #+begin_src
+      make
+      # do not hesitate to use the -j[ncores] option to speed up the compilation
+      #+end_src
+   3. test (optional, requires CHAMELEON_ENABLE_TESTING=ON and/or
+      CHAMELEON_ENABLE_TIMING=ON):
+      #+begin_src
+      make test
+      # or
+      ctest
+      #+end_src
+   4. install (optional):
+      #+begin_src
+      make install
+      #+end_src
+      Do not forget to specify the install directory with
+      *-DCMAKE_INSTALL_PREFIX* at configure time:
+      #+begin_example
+      cmake /home/jdoe/chameleon -DCMAKE_INSTALL_PREFIX=/home/jdoe/install/chameleon
+      #+end_example
+      Note that the install process is optional. You are free to use
+      Chameleon binaries compiled in the build directory.
+*** Configuration options
+    You can optionally activate some options at CMake configure time
+    (like CUDA, MPI, ...) by invoking
+    ~cmake path/to/your/CMakeLists.txt -DOPTION1= -DOPTION2= ...~
+    #+begin_src
+    cmake /home/jdoe/chameleon/ -DCMAKE_BUILD_TYPE=Debug \
+                                -DCMAKE_INSTALL_PREFIX=/home/jdoe/install/ \
+                                -DCHAMELEON_USE_CUDA=ON \
+                                -DCHAMELEON_USE_MPI=ON \
+                                -DBLA_VENDOR=Intel10_64lp \
+                                -DSTARPU_DIR=/home/jdoe/install/starpu-1.2/ \
+                                -DCHAMELEON_ENABLE_TRACING=ON
+    #+end_src
+
+    You can get the full list of options with the *-L[A][H]* option of
+    the cmake command:
+    #+begin_src
+    cmake -LH /home/jdoe/chameleon/
+    #+end_src
+
+    You can also set the options through the *ccmake* interface.
+
+**** Native CMake options (non-exhaustive list)
+     * *CMAKE_BUILD_TYPE=Debug|Release|RelWithDebInfo|MinSizeRel*:
+       level of compiler optimization and debug information
+     * *CMAKE_INSTALL_PREFIX=path/to/your/install/dir*: where headers,
+       libraries, executables, etc., will be copied when invoking make
+       install
+     * *BUILD_SHARED_LIBS=ON|OFF*: indicates whether CMake builds
+       CHAMELEON as static (~OFF~) or shared (~ON~) libraries
+     * *CMAKE_C_COMPILER=gcc|icc|...*: to choose the C compiler
+       if several exist in the environment
+     * *CMAKE_Fortran_COMPILER=gfortran|ifort|...*: to choose the
+       Fortran compiler if several exist in the environment
+
+**** Related to specific modules (find_package) to find external libraries
+     * *BLA_VENDOR=All|Eigen|Open|Generic|Intel10_64lp|Intel10_64lp_seq*:
+       to use Intel MKL for example; see the list of BLA_VENDOR values
+       in FindBLAS.cmake in cmake_modules/morse/find
+     * *STARPU_DIR=path/to/root/starpu/install*, see [[sec:depdet][Dependencies
+       detection]]
+     * *STARPU_INCDIR=path/to/root/starpu/install/headers*, see
+       [[sec:depdet][Dependencies detection]]
+     * *STARPU_LIBDIR=path/to/root/starpu/install/libs*, see
+       [[sec:depdet][Dependencies detection]]
+     * List of packages that can be searched just like STARPU (with
+       _DIR, _INCDIR and _LIBDIR):
+       * *BLAS*, *CBLAS*, *EZTRACE*, *FXT*, *HWLOC*, *LAPACK*, *LAPACKE*, *QUARK*,
+         *SIMGRID*, *TMG*
+
+     Libraries detected with an official CMake module (see the module
+     files in CMAKE_ROOT/Modules/): CUDA - MPI - Threads.
+
+     Libraries detected with our CMake modules (see the module files
+     in the cmake_modules/morse_cmake/modules/find/ directory of the
+     Chameleon sources): BLAS - CBLAS - EZTRACE - FXT - HWLOC -
+     LAPACK - LAPACKE - QUARK - SIMGRID - STARPU - TMG.
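+
+     For instance, a hypothetical configure line combining a
+     BLA_VENDOR value with a custom StarPU install (the paths are
+     examples only):
+     #+begin_src sh
+     # assumption: a StarPU build is installed under $HOME/install/starpu
+     cmake /home/jdoe/chameleon -DBLA_VENDOR=Intel10_64lp_seq \
+                                -DSTARPU_DIR=$HOME/install/starpu
+     #+end_src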
+
+**** Chameleon specific options
+     * *CHAMELEON_SCHED_STARPU=ON|OFF* (default ON): to link with the
+       StarPU library (runtime system)
+     * *CHAMELEON_SCHED_QUARK=ON|OFF* (default OFF): to link with the
+       QUARK library (runtime system)
+     * *CHAMELEON_USE_MPI=ON|OFF* (default OFF): to link with the MPI
+       library (message passing implementation for the use of multiple
+       nodes with distributed memory); can only be used with StarPU
+     * *CHAMELEON_USE_CUDA=ON|OFF* (default OFF): to link with the CUDA
+       runtime (implementation paradigm for accelerated codes on GPUs)
+       and the cuBLAS library (optimized BLAS kernels on GPUs); can
+       only be used with StarPU
+     * *CHAMELEON_ENABLE_DOC=ON|OFF* (default OFF): to control the
+       build of the documentation contained in the doc/ sub-directory
+     * *CHAMELEON_ENABLE_EXAMPLE=ON|OFF* (default ON): to control the
+       build of the example executables (API usage) contained in the
+       example/ sub-directory
+     * *CHAMELEON_ENABLE_PRUNING_STATS=ON|OFF* (default OFF)
+     * *CHAMELEON_ENABLE_TESTING=ON|OFF* (default ON): to control the
+       build of the testing executables (numerical checks) contained
+       in the testing/ sub-directory
+     * *CHAMELEON_ENABLE_TIMING=ON|OFF* (default ON): to control the
+       build of the timing executables (performance checks) contained
+       in the timing/ sub-directory
+     * *CHAMELEON_ENABLE_TRACING=ON|OFF* (default OFF): to enable trace
+       generation during the execution of timing drivers. It requires
+       StarPU to be linked with the FxT library (traces execution of
+       kernels on workers), see also [[sec:trace][Execution tracing
+       with StarPU]].
+     * *CHAMELEON_SIMULATION=ON|OFF* (default OFF): to enable
+       simulation mode, meaning that CHAMELEON will not actually
+       execute tasks; see details in section [[sec:simu][Use
+       simulation mode with StarPU-SimGrid]]. This option must be used
+       with StarPU compiled with [[http://simgrid.gforge.inria.fr/][SimGrid]], which makes it possible to
+       predict execution times on any architecture. This feature
+       should be used to experiment with scheduler behavior and
+       performance, not to produce solutions of linear systems.
+
+*** Dependencies detection
+    <<sec:depdet>>
+    You have different choices to detect dependencies on your system,
+    either by setting some environment variables containing paths to
+    the libraries and headers or by specifying them directly at CMake
+    configure time. Different cases:
+
+    1) detection of dependencies through environment variables:
+       - LD_LIBRARY_PATH should contain the list of paths where the
+         libraries can be found:
+         #+begin_src
+         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:install/path/to/your/lib
+         #+end_src
+       - INCLUDE should contain the list of paths where the header
+         files of the libraries can be found
+         #+begin_src
+         export INCLUDE=$INCLUDE:install/path/to/your/headers
+         #+end_src
+    2) detection with user-given paths:
+       - you can specify the path at CMake configure time by invoking
+         ~cmake path/to/your/CMakeLists.txt -DLIB_DIR=path/to/your/lib~,
+         where LIB stands for the name of the library to look for
+         #+begin_src
+         cmake path/to/your/CMakeLists.txt -DSTARPU_DIR=path/to/starpudir \
+                                           -DCBLAS_DIR= ...
+         #+end_src
+       - it is also possible to specify header and library directories
+         separately
+         #+begin_src
+         cmake path/to/your/CMakeLists.txt -DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \
+                                           -DSTARPU_LIBDIR=path/to/libstarpu/lib
+         #+end_src
+       - note: BLAS and LAPACK detection can be tedious, so we provide
+         a verbose mode; set *-DBLAS_VERBOSE=ON* or *-DLAPACK_VERBOSE=ON*
+         to activate it
+    3) detection with custom environment variables: all variables like
+       _DIR, _INCDIR, _LIBDIR can be set as environment variables
+       instead of CMake options; they will be read
+    4) using pkg-config for libraries that provide .pc files
+       - update your *PKG_CONFIG_PATH* with the paths where the .pc
+         files of installed external libraries like hwloc, StarPU,
+         some BLAS/LAPACK, etc. can be found
+*** Execution tracing with StarPU
+    <<sec:trace>>
+    StarPU can generate its own trace log files by compiling it with
+    the ~--with-fxt~ option at the configure step (you may have to
+    specify the directory where you installed FxT by giving
+    ~--with-fxt=...~ instead of ~--with-fxt~ alone). By doing so, traces
+    are generated after each execution of a program which uses StarPU,
+    in the directory pointed to by the STARPU_FXT_PREFIX environment
+    variable.
+    #+begin_example
+    export STARPU_FXT_PREFIX=/home/jdoe/fxt_files/
+    #+end_example
+    When executing a ~./timing/...~ Chameleon program, if tracing has
+    been enabled (StarPU compiled with FxT and
+    *-DCHAMELEON_ENABLE_TRACING=ON*), you can give the option ~--trace~
+    to tell the program to generate trace log files.
+
+    Finally, to generate the trace file which can be opened with the
+    ViTE program (http://vite.gforge.inria.fr/), you can use the
+    *starpu_fxt_tool* executable of StarPU. This tool should be in
+    ~$STARPU_INSTALL_REPOSITORY/bin~. You can use it to generate the
+    trace file like this:
+    #+begin_src
+    path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename
+    #+end_src
+    There is one file per MPI process (prof_filename_0,
+    prof_filename_1, ...). To generate a trace of MPI programs you can
+    call it like this:
+    #+begin_src
+    path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename*
+    #+end_src
+    The trace file will be named paje.trace (use the -o option to
+    specify an output name). Alternatively, for non-MPI execution
+    (only one process and profiling file), you can set the environment
+    variable *STARPU_GENERATE_TRACE=1* to automatically generate the
+    paje trace file.
+
+*** Use simulation mode with StarPU-SimGrid
+    <<sec:simu>>
+    Simulation mode can be activated by setting the CMake option
+    CHAMELEON_SIMULATION to ON. This mode allows you to simulate the
+    execution of algorithms with StarPU compiled with SimGrid
+    (http://simgrid.gforge.inria.fr/). To do so, we provide some
+    perfmodels in the simucore/perfmodels/ directory of the Chameleon
+    sources. To use these perfmodels, please set your *STARPU_HOME*
+    environment variable to
+    ~path/to/your/chameleon_sources/simucore/perfmodels~. Finally, you
+    need to set your *STARPU_HOSTNAME* environment variable to the name
+    of the machine to simulate. For example: *STARPU_HOSTNAME=mirage*.
+    Note that only POTRF kernels with block sizes of 320 or 960
+    (single and double precision) on the mirage and sirocco machines
+    are available for now. The database of models is subject to
+    change.
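+
+    For instance, a hypothetical simulated run of the Cholesky timing
+    driver (machine name and block size taken from the perfmodels
+    described above; the driver options are documented in the chapter
+    on using Chameleon executables):
+    #+begin_src sh
+    # assumption: Chameleon was built with -DCHAMELEON_SIMULATION=ON
+    export STARPU_HOME=path/to/your/chameleon_sources/simucore/perfmodels
+    export STARPU_HOSTNAME=mirage
+    ./timing/time_dpotrf --n_range=9600:9600:1 --nb=960 --threads=8 --gpus=3
+    #+end_src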
diff --git a/doc/orgmode/chapters/introduction.org b/doc/orgmode/chapters/introduction.org
new file mode 100644
index 0000000000000000000000000000000000000000..ee33b7597b7ca6f517bb63362aa6b3a75cb8d258
--- /dev/null
+++ b/doc/orgmode/chapters/introduction.org
@@ -0,0 +1,302 @@
+# This file is part of the CHAMELEON User's Guide.
+# Copyright (C) 2017 Inria
+# See the file ../users_guide.org for copying conditions.
+** MORSE project
+   #+NAME: fig:morse_header
+   #+ATTR_HTML: :align center
+   [[file:morse_header.png]]
+*** MORSE Objectives
+    When processor clock speeds flatlined in 2004, after more than
+    fifteen years of exponential increases, the era of near automatic
+    performance improvements that the HPC application community had
+    previously enjoyed came to an abrupt end. To develop software that
+    will perform well on petascale and exascale systems with thousands
+    of nodes and millions of cores, the list of major challenges that
+    must now be confronted is formidable:
+    1) dramatic escalation in the costs of intrasystem communication
+       between processors and/or levels of memory hierarchy;
+    2) increased heterogeneity of the processing units (mixing CPUs,
+       GPUs, etc. in varying and unexpected design combinations);
+    3) high levels of parallelism and more complex constraints mean
+       that cooperating processes must be dynamically and unpredictably
+       scheduled for asynchronous execution;
+    4) software will not run at scale without much better resilience to
+       faults and far more robustness; and
+    5) new levels of self-adaptivity will be required to enable
+       software to modulate process speed in order to satisfy limited
+       energy budgets.
+    The MORSE associate team will tackle the first three challenges in
+    an orchestrated effort between research groups respectively
+    specialized in sparse linear algebra, dense linear algebra and
+    runtime systems. The overall objective is to develop robust linear
+    algebra libraries relying on innovative runtime systems that can
+    fully benefit from the potential of those future large-scale
+    complex machines. Challenges 4) and 5) will also be investigated
+    by the different teams in the context of other partnerships, but
+    they will not be the main focus of the associate team as they are
+    much more prospective.
+
+*** Research fields
+    The overall goal of the MORSE associate team is to enable advanced
+    numerical algorithms to be executed on a scalable unified runtime
+    system for exploiting the full potential of future exascale
+    machines. We expect advances in three directions, based first on
+    strong and close interactions between the runtime and numerical
+    linear algebra communities. This initial activity will then
+    naturally expand to more focused but still joint research in both
+    fields.
+
+**** Fine interaction between linear algebra and runtime systems
+     On parallel machines, HPC applications need to take care of data
+     movement and consistency, which can be either explicitly managed
+     at the level of the application itself or delegated to a runtime
+     system. We adopt the latter approach in order to better keep up
+     with hardware trends whose complexity is growing exponentially.
+     One major task in this project is to define a proper interface
+     between HPC applications and runtime systems in order to maximize
+     productivity and expressivity. As mentioned in the next section,
+     a widely used approach consists in abstracting the application as
+     a DAG that the runtime system is in charge of scheduling.
+     Scheduling such a DAG over a set of heterogeneous processing
+     units introduces a lot of new challenges, such as accurately
+     predicting the execution time of each type of task over each kind
+     of unit, minimizing data transfers between memory banks,
+     performing data prefetching, etc. Expected advances: In a
+     nutshell, a new runtime system API will be designed to allow
+     applications to provide scheduling hints to the runtime system
+     and to get real-time feedback about the consequences of
+     scheduling decisions.
+
+**** Runtime systems
+     A runtime environment is an intermediate layer between the system
+     and the application. It provides low-level functionality not
+     provided by the system (such as scheduling or management of
+     heterogeneity) and high-level features (such as performance
+     portability). In the framework of this proposal, we will work on
+     the scalability of runtime environments. To achieve scalability
+     it is required to avoid all centralization. Here, the main
+     problem is the scheduling of the tasks. In many task-based
+     runtime environments the scheduler is centralized and becomes a
+     bottleneck as soon as too many cores are involved. It is
+     therefore required to distribute the scheduling decision or to
+     compute a data distribution that imposes the mapping of tasks
+     using, for instance, the so-called ``owner-compute'' rule.
+     Expected advances: We will design runtime systems that enable an
+     efficient and scalable use of thousands of distributed multicore
+     nodes enhanced with accelerators.
+
+**** Linear algebra
+     Because of its central position in HPC and of the well understood
+     structure of its algorithms, dense linear algebra has often
+     pioneered new challenges that HPC had to face. Again, dense
+     linear algebra has been in the vanguard of the new era of
+     petascale computing with the design of new algorithms that can
+     efficiently run on a multicore node with GPU accelerators. These
+     algorithms are called ``communication-avoiding'' since they have
+     been redesigned to limit the amount of communication between
+     processing units (and between the different levels of memory
+     hierarchy). They are expressed through Directed Acyclic Graphs
+     (DAGs) of fine-grained tasks that are dynamically
+     scheduled. Expected advances: First, we plan to investigate the
+     impact of these principles in the case of sparse applications
+     (whose algorithms are slightly more complicated but often rely on
+     dense kernels). Furthermore, both in the dense and sparse cases,
+     the scalability on thousands of nodes is still limited; new
+     numerical approaches need to be found. We will specifically
+     design sparse hybrid direct/iterative methods that represent a
+     promising approach.
+
+*** Research papers
+    Research papers about MORSE can be found at
+    http://icl.cs.utk.edu/projectsdev/morse/pubs/index.html
+
+** CHAMELEON
+*** CHAMELEON software
+    The main purpose is to address the performance shortcomings of the
+    [[http://www.netlib.org/lapack/][LAPACK]] and [[http://www.netlib.org/scalapack/][ScaLAPACK]] libraries on multicore processors and
+    multi-socket systems of multicore processors, and their inability
+    to efficiently utilize accelerators such as Graphics Processing
+    Units (GPUs).
+
+    CHAMELEON is a framework written in C which provides routines to
+    solve dense general systems of linear equations, symmetric
+    positive definite systems of linear equations and linear least
+    squares problems, using LU, Cholesky, QR and LQ factorizations.
+    Real arithmetic and complex arithmetic are supported in both
+    single precision and double precision. It supports Linux and
+    Mac OS X machines (only tested on the Intel x86-64 architecture).
+
+    CHAMELEON is based on the [[http://icl.cs.utk.edu/plasma/][PLASMA]] source code but is not limited
+    to shared-memory environments and can exploit multiple GPUs.
+    CHAMELEON is interfaced in a generic way with both the [[http://icl.cs.utk.edu/quark/][QUARK]] and
+    [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] runtime systems. This feature makes it possible to
+    analyze, in a unified framework, how sequential task-based
+    algorithms behave on top of different runtime system
+    implementations. Using CHAMELEON with the [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] runtime system
+    makes it possible to exploit GPUs through kernels provided by
+    [[https://developer.nvidia.com/cublas][cuBLAS]], and clusters of interconnected nodes with distributed
+    memory (using [[http://www.open-mpi.org/][MPI]]). Computation of very large systems with dense
+    matrices on a cluster of nodes is still being experimented with
+    and stabilized. It is not expected to get stable performance with
+    the current version using MPI.
+
+*** PLASMA's design principles
+    CHAMELEON is originally based on [[http://icl.cs.utk.edu/plasma/][PLASMA]], so the design principles
+    are very similar. The content of this section has been copied
+    from the /Design principles/ section of the PLASMA User's Guide.
+
+**** Tile Algorithms
+     Tile algorithms are based on the idea of processing the matrix by
+     square tiles of relatively small size, such that a tile fits
+     entirely in one of the cache levels associated with one core.
+     This way a tile can be loaded to the cache and processed
+     completely before being evicted back to the main memory. Of the
+     three types of cache misses, *compulsory*, *capacity* and *conflict*,
+     the use of tile algorithms minimizes the number of capacity
+     misses, since each operation loads the amount of data that does
+     not ``overflow'' the cache.
+
+     For some operations such as matrix multiplication and Cholesky
+     factorization, translating the classic algorithm to the tile
+     algorithm is trivial. In the case of matrix multiplication, the
+     tile algorithm is simply a product of applying the technique of
+     *loop tiling* to the canonical definition of three nested loops. It
+     is very similar for the Cholesky factorization. The *left-looking*
+     definition of Cholesky factorization from LAPACK is a loop with a
+     sequence of calls to four routines: xSYRK (symmetric *rank-k*
+     update), xPOTRF (Cholesky factorization of a small block on the
+     diagonal), xGEMM (matrix multiplication) and xTRSM (triangular
+     solve). If the xSYRK, xGEMM and xTRSM operations are expressed
+     with the canonical definition of three nested loops and the
+     technique of loop tiling is applied, the tile algorithm results.
+     Since the algorithm is produced by simple reordering of
+     operations, neither the number of operations nor the numerical
+     stability of the algorithm is affected.
+
+     The situation becomes slightly more complicated for LU and QR
+     factorizations, where the classic algorithms factorize an entire
+     panel of the matrix (a block of columns) at every step of the
+     algorithm. One can observe, however, that the process of matrix
+     factorization is synonymous with introducing zeros in appropriate
+     places, and a tile algorithm can be thought of as one that zeroes
+     one tile of the matrix at a time.
+     This process is referred to as
+     updating of a factorization or *incremental factorization*. The
+     process is equivalent to factorizing the top tile of a panel,
+     then placing the upper triangle of the result on top of the tile
+     below and factorizing again, then moving to the next tile and so
+     on. Here, the tile LU and QR algorithms perform slightly more
+     floating point operations and require slightly more memory for
+     auxiliary data. Also, the tile LU factorization applies a
+     different pivoting pattern and, as a result, is less numerically
+     stable than classic LU with partial pivoting. Numerical
+     stability is not an issue in the case of the tile QR, which
+     relies on orthogonal transformations (Householder reflections),
+     which are numerically stable.
+
+     #+CAPTION: Schematic illustration of the tile LU factorization (kernel names for real arithmetic in double precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+     #+NAME: fig:tile_lu
+     #+ATTR_HTML: :width 640px :align center
+     [[file:tile_lu.jpg]]
+
+**** Tile Data Layout
+     Tile layout is based on the idea of storing the matrix by square
+     tiles of relatively small size, such that each tile occupies a
+     contiguous memory region. This way a tile can be loaded to the
+     cache memory efficiently and the risk of evicting it from the
+     cache memory before it is completely processed is minimized. Of
+     the three types of cache misses, *compulsory*, *capacity* and
+     *conflict*, the use of tile layout minimizes the number of conflict
+     misses, since a contiguous region of memory will completely fill
+     out a /set-associative/ cache memory before an eviction can
+     happen. Also, from the standpoint of multithreaded execution, the
+     probability of *false sharing* is minimized. It can only
+     affect the cache lines containing the beginning and the ending of
+     a tile.
+
+     In standard *cache-based* architectures, tiles contiguously laid
+     out in memory maximize the profit from automatic prefetching.
+     Tile layout is also beneficial in situations involving the use of
+     accelerators, where explicit communication of tiles through DMA
+     transfers is required, such as moving tiles between the system
+     memory and the local store in Cell B.E. or moving tiles between
+     the host memory and the device memory in GPUs. In most
+     circumstances tile layout also minimizes the number of TLB misses
+     and conflicts to memory banks or partitions. With the standard
+     (*column-major*) layout, access to each column of a tile is much
+     more likely to cause a conflict miss, a false sharing miss, a TLB
+     miss or a bank or partition conflict. The use of the standard
+     layout for dense matrix operations is a performance minefield.
+     Although occasionally one can pass through it unscathed, the risk
+     of hitting a spot deadly to performance is very high.
+
+     Another property of the layout utilized in PLASMA is that it is
+     ``flat'', meaning that it does not involve a level of
+     indirection. Each tile stores a small square submatrix of the
+     main matrix in a *column-major* layout. In turn, the main matrix is
+     an arrangement of tiles immediately following one another in a
+     *column-major* layout. The offset of each tile can be calculated
+     through address arithmetic and does not involve pointer
+     indirection. Alternatively, a matrix could be represented as an
+     array of pointers to tiles, located anywhere in memory. Such a
+     layout would be a radical and unjustifiable departure from LAPACK
+     and ScaLAPACK.
+     Flat tile layout is a natural progression from
+     LAPACK's *column-major* layout and ScaLAPACK's
+     /block-cyclic/ layout.
+
+     Another related property of PLASMA's tile layout is that it
+     includes provisions for padding of tiles, i.e., the actual region
+     of memory designated for a tile can be larger than the memory
+     occupied by the actual data. This makes it possible to force a
+     certain alignment of tile boundaries, while using the flat
+     organization described in the previous paragraph. The motivation
+     is that, at the price of a small memory overhead, alignment of
+     tile boundaries may prove beneficial in multiple scenarios
+     involving memory systems of standard multicore processors, as
+     well as accelerators. The issues that come into play are, again,
+     the use of TLBs and memory banks or partitions.
+
+     #+CAPTION: Schematic illustration of the tile layout with *column-major* order of tiles, *column-major* order of elements within tiles and (optional) padding for enforcing a certain alignment of tile boundaries, courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+     #+NAME: fig:tile_layout
+     #+ATTR_HTML: :width 640px :align center
+     [[file:tile_layout.jpg]]
+
+**** Dynamic Task Scheduling
+
+     Dynamic scheduling is the idea of assigning work to cores based
+     on the availability of data for processing at any given point in
+     time and is also referred to as *data-driven* scheduling. The
+     concept is related closely to the idea of expressing computation
+     through a task graph, often referred to as the DAG (*Directed
+     Acyclic Graph*), and the flexibility of exploring the DAG at
+     runtime. Thus, to a large extent, dynamic scheduling is
+     synonymous with *runtime scheduling*. An important concept here is
+     the one of the *critical path*, which defines the upper bound on
+     the achievable parallelism, and needs to be pursued at the
+     maximum speed. This is in direct opposition to the *fork-and-join*
+     or *data-parallel* programming models, where artificial
+     synchronization points expose serial sections of the code, where
+     multiple cores are idle, while sequential processing takes place.
+     The use of dynamic scheduling introduces a *trade-off*, though.
+     The more dynamic (flexible) scheduling is, the more centralized
+     (and less scalable) the scheduling mechanism is. For that reason,
+     currently PLASMA uses two scheduling mechanisms, one which is
+     fully dynamic and one where work is assigned statically and
+     dependency checks are done at runtime.
+
+     The first scheduling mechanism relies on unfolding a *sliding
+     window* of the task graph at runtime and scheduling work by
+     resolving data hazards: *Read After Write (RAW)*, *Write After Read
+     (WAR)* and *Write After Write (WAW)*, a technique analogous to
+     instruction scheduling in superscalar processors. It also relies
+     on *work-stealing* for balancing the load among all cores.
+     The second scheduling mechanism relies on statically designating
+     a path through the execution space of the algorithm to each core
+     and following a cycle: transition to a task, wait for its
+     dependencies, execute it, update the overall progress. Tasks are
+     identified by tuples and task transitions are done through
+     locally evaluated formulas. Progress information can be
+     centralized, replicated or distributed (currently centralized).
+
+     #+CAPTION: A trace of the tile QR factorization executing on eight cores without any global synchronization points (kernel names for real arithmetic in single precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+     #+NAME: fig:trace_qr
+     #+ATTR_HTML: :width 640px :align center
+     [[file:trace_qr.jpg]]
diff --git a/doc/orgmode/chapters/using.org b/doc/orgmode/chapters/using.org
new file mode 100644
index 0000000000000000000000000000000000000000..4cfd8e6626b79a17baaf587765ffbeb527fb5cf0
--- /dev/null
+++ b/doc/orgmode/chapters/using.org
@@ -0,0 +1,1434 @@
+@c -*-texinfo-*-
+
+@c This file is part of the MORSE Handbook.
+@c Copyright (C) 2014 Inria
+@c Copyright (C) 2014 The University of Tennessee
+@c Copyright (C) 2014 King Abdullah University of Science and Technology
+@c See the file ../chameleon.texi for copying conditions.
+
+@menu
+* Using CHAMELEON executables::
+* Linking an external application with CHAMELEON libraries::
+* CHAMELEON API::
+@end menu
+
+@node Using CHAMELEON executables
+@section Using CHAMELEON executables
+
+CHAMELEON provides several test executables that are compiled and linked
+with CHAMELEON's stack of dependencies.
+Instructions about the arguments to give to the executables are available
+via the @option{-[-]help} or @option{-[-]h} option.
+This set of binaries is separated into three categories and can be found in
+three different directories:
+
+@itemize @bullet
+
+  @item example
+
+  contains examples of API usage; more specifically, the
+  sub-directory lapack_to_morse/ provides a tutorial that explains how to
+  use CHAMELEON functionalities starting from a full LAPACK code, see
+@ref{Tutorial LAPACK to CHAMELEON}
+
+  @item testing
+
+  contains testing drivers to check the numerical correctness of
+  CHAMELEON linear algebra routines with a wide range of parameters
+  @example
+  ./testing/stesting 4 1 LANGE 600 100 700
+  @end example
+  The first two arguments are the number of cores and GPUs to use.
+  The third one is the name of the algorithm to test.
+  The other arguments depend on the algorithm; here they correspond to the
+  number of rows, the number of columns and the leading dimension of the
+  problem.
+
+  The names of the algorithms available for testing are:
+  @itemize @bullet
+  @item LANGE: matrix norms (infinity, one, max, Frobenius)
+  @item GEMM: general matrix-matrix multiply
+  @item HEMM: Hermitian matrix-matrix multiply
+  @item HERK: Hermitian matrix-matrix rank k update
+  @item HER2K: Hermitian matrix-matrix rank 2k update
+  @item SYMM: symmetric matrix-matrix multiply
+  @item SYRK: symmetric matrix-matrix rank k update
+  @item SYR2K: symmetric matrix-matrix rank 2k update
+  @item PEMV: matrix-vector multiply with a pentadiagonal matrix
+  @item TRMM: triangular matrix-matrix multiply
+  @item TRSM: triangular solve, multiple right-hand sides
+  @item POSV: solve linear systems with a symmetric positive-definite matrix
+  @item GESV_INCPIV: solve linear systems with a general matrix
+  @item GELS: linear least squares with a general matrix
+  @end itemize
+
+  @item timing
+
+  contains timing drivers to assess the performance of CHAMELEON routines.
+  There are two sets of executables: those that do not use the tile
+interface and those that do (with _tile in the name of the executable).
+  Executables without the tile interface allocate data following LAPACK
+conventions, and these data can be given as arguments to CHAMELEON routines
+as you would do with LAPACK.
+ Executables with the tile interface directly generate the data in the tile
+format that CHAMELEON algorithms use to submit tasks to the runtime system.
+ Executables with the tile interface should be more efficient because no data
+copy from the LAPACK matrix layout to the tile matrix layout is necessary.
+ Calling example:
+ @example
+ ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320
+                      --threads=9 --gpus=3
+                      --nowarmup
+ @end example
+
+ List of the main options that can be used in timing:
+ @itemize @bullet
+ @item @option{--help}: show usage
+ @item @option{--threads}: number of CPU workers (default:
+@option{_SC_NPROCESSORS_ONLN})
+ @item @option{--gpus}: number of GPU workers (default: @option{0})
+ @item @option{--n_range=R}: range of N values, with
+@option{R=Start:Stop:Step}
+(default: @option{500:5000:500})
+ @item @option{--m=X}: dimension (M) of the matrices (default: @option{N})
+ @item @option{--k=X}: dimension (K) of the matrices (default: @option{1}),
+useful for the GEMM algorithm (K is the shared dimension and must be defined
+> 1 to consider matrices and not vectors)
+ @item @option{--nrhs=X}: number of right-hand sides (default: @option{1})
+ @item @option{--nb=X}: block/tile size (default: @option{128})
+ @item @option{--ib=X}: inner-blocking/IB size (default: @option{32})
+ @item @option{--niter=X}: number of iterations performed for each test
+(default: @option{1})
+ @item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ
+factorization; X is the size of each subdomain (default: @option{0})
+ @item @option{--[no]check}: check result (default: @option{nocheck})
+ @item @option{--[no]profile}: print profiling information (default:
+@option{noprofile})
+ @item @option{--[no]trace}: enable/disable trace generation (default:
+@option{notrace})
+ @item @option{--[no]dag}: enable/disable DAG generation (default:
+@option{nodag})
+ @item @option{--[no]inv}: check on inverse (default: @option{noinv})
+ @item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs
+(default: @option{0})
+ @end itemize
+
+ List of the timing algorithms available:
+ @itemize @bullet
+ @item LANGE: norms of matrices
+ @item GEMM: general matrix-matrix multiply
+ @item TRSM: triangular solve
+ @item POTRF: Cholesky factorization of a symmetric
+positive-definite matrix
+ @item POSV: solve linear systems with symmetric positive-definite matrix
+ @item GETRF_NOPIV: LU factorization of a general matrix
+using the tile LU algorithm without row pivoting
+ @item GESV_NOPIV: solve linear systems for a general matrix
+using the tile LU algorithm without row pivoting
+ @item GETRF_INCPIV: LU factorization of a general matrix
+using the tile LU algorithm with partial tile pivoting and row interchanges
+ @item GESV_INCPIV: solve linear systems for a general matrix
+using the tile LU algorithm with partial tile pivoting and row interchanges
+ @item GEQRF: QR factorization of a general matrix
+ @item GELS: solve overdetermined or underdetermined linear systems
+involving a general matrix using the QR or the LQ factorization
+ @end itemize
+
+@end itemize
+
+@node Linking an external application with CHAMELEON libraries
+@section Linking an external application with CHAMELEON libraries
+
+Compilation and link with CHAMELEON libraries have been tested with
+@strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}.
+
+@menu
+* Static linking in C::
+* Dynamic linking in C::
+* Build a Fortran program with CHAMELEON::
+@end menu
+
+@node Static linking in C
+@subsection Static linking in C
+
+Let us imagine you have a file main.c that you want to link with CHAMELEON
+static libraries.
+Let us consider @file{/home/yourname/install/chameleon} is the install
+directory of CHAMELEON containing the sub-directories @file{include/} and
+@file{lib/}.
+Your compilation command with the gcc compiler could be:
+@example
+gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c
+@end example
+
+Now if you want to link your application with CHAMELEON static libraries, you
+could do:
+@example
+gcc main.o -o main \
+/home/yourname/install/chameleon/lib/libchameleon.a \
+/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
+/home/yourname/install/chameleon/lib/libcoreblas.a \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+As you can see in this example, we also link with some dynamic libraries:
+@option{starpu-1.1}, @option{Intel MKL} libraries (for
+BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) and
+@option{rt}.
+These libraries depend on the configuration of your CHAMELEON build.
+You can find these dependencies in the .pc files we generate during
+compilation and that are installed in the sub-directory @file{lib/pkgconfig}
+of your CHAMELEON install directory.
+Note also that you may need to specify where to find these libraries with
+the @option{-L} option of your compiler/linker.
+
+Before running your program, make sure that all the shared library paths your
+executable depends on are known.
+Enter @code{ldd main} to check.
+If some shared library paths are missing, append them to the
+@env{LD_LIBRARY_PATH} (for Linux systems) environment variable
+(@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows).
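+
+If @command{pkg-config} is installed, the .pc files mentioned above can also
+be used to retrieve the link flags automatically.
+The following is only a sketch: it assumes the file is named
+@file{chameleon.pc}, which may differ in your installation.
+@example
+export PKG_CONFIG_PATH=/home/yourname/install/chameleon/lib/pkgconfig:$PKG_CONFIG_PATH
+gcc main.o -o main `pkg-config --libs --static chameleon`
+@end example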
+
+@node Dynamic linking in C
+@subsection Dynamic linking in C
+
+For dynamic linking (you need to build CHAMELEON with the CMake
+option @option{BUILD_SHARED_LIBS=ON}) the process is similar to the static
+compilation/link, but instead of specifying paths to your static libraries
+you indicate the path to the dynamic libraries with the @option{-L} option
+and you give the names of the libraries with the @option{-l} option, like
+this:
+@example
+gcc main.o -o main \
+-L/home/yourname/install/chameleon/lib \
+-lchameleon -lchameleon_starpu -lcoreblas \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+
+Note that an update of your environment variable
+@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows)
+with the path of the libraries could be required before executing, example:
+@example
+export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib
+@end example
+
+@node Build a Fortran program with CHAMELEON
+@subsection Build a Fortran program with CHAMELEON
+
+CHAMELEON provides a Fortran interface to user functions. Example:
+@example
+call morse_version(major, minor, patch) !or
+call MORSE_VERSION(major, minor, patch)
+@end example
+
+Build and link are very similar to the C case.
+
+Compilation example:
+@example
+gfortran -o main.o -c main.f90
+@end example
+
+Static linking example:
+@example
+gfortran main.o -o main \
+/home/yourname/install/chameleon/lib/libchameleon.a \
+/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
+/home/yourname/install/chameleon/lib/libcoreblas.a \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+
+Dynamic linking example:
+@example
+gfortran main.o -o main \
+-L/home/yourname/install/chameleon/lib \
+-lchameleon -lchameleon_starpu -lcoreblas \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+
+@node CHAMELEON API
+@section CHAMELEON API
+
+CHAMELEON provides routines to solve dense general systems of linear
+equations, symmetric positive definite systems of linear equations and linear
+least squares problems, using LU, Cholesky, QR and LQ factorizations.
+Real arithmetic and complex arithmetic are supported in both single precision
+and double precision.
+Routines that compute linear algebra are of the following form:
+@example
+MORSE_name[_Tile[_Async]]
+@end example
+@itemize @bullet
+@item all user routines are prefixed with @code{MORSE}
+@item @code{name} follows the BLAS/LAPACK naming scheme for algorithms
+(@emph{e.g.} sgemm for general matrix-matrix multiply in single precision)
+@item CHAMELEON provides three interface levels
+ @itemize @minus
+ @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE,
+matrices are given following the LAPACK data layout (1-D array column-major).
+It involves copies of data from the LAPACK layout to the tile layout and
+conversely (to update the LAPACK data), see @ref{Step1}.
+ @item @code{MORSE_name_Tile}: the tile interface avoids copies between LAPACK
+and tile layouts. It is the standard interface of CHAMELEON and it should
+achieve better performance than the previous, simplest interface. The data are
+given through a specific structure called a descriptor, see @ref{Step2}.
+ @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids
+the synchronization barrier normally called between @code{Tile} routines.
+At the end of an @code{Async} function, completion of the tasks is not
+guaranteed and data are not necessarily up-to-date.
+To ensure that all tasks have been executed, a synchronization function has to
+be called after the sequence of @code{Async} functions, see @ref{Step4}.
+ @end itemize
+@end itemize
+
+MORSE routine calls have to be preceded by
+@example
+MORSE_Init( NCPU, NGPU );
+@end example
+to initialize MORSE and the runtime system, and followed by
+@example
+MORSE_Finalize();
+@end example
+to free some data and finalize the runtime and/or MPI.
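+
+As a minimal sketch, a typical calling sequence therefore looks as follows
+(here @code{A} is an N-by-N symmetric positive definite matrix stored in
+LAPACK layout, and @code{MorseUpper} is assumed as the uplo constant):
+@verbatim
+/* Sketch: initialize, compute, finalize. */
+MORSE_Init( NCPU, NGPU );            /* start MORSE and the runtime system */
+MORSE_dpotrf( MorseUpper, N, A, N ); /* any MORSE routine(s) */
+MORSE_Finalize();                    /* free data, stop runtime and/or MPI */
+@end verbatim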
+
+@menu
+* Tutorial LAPACK to CHAMELEON::
+* List of available routines::
+@end menu
+
+@node Tutorial LAPACK to CHAMELEON
+@subsection Tutorial LAPACK to CHAMELEON
+
+This tutorial is dedicated to the API usage of CHAMELEON.
+The idea is to start from a simple code and explain, step by step, how to
+use CHAMELEON routines.
+The first step is a full BLAS/LAPACK code without any dependency on
+CHAMELEON, a code that most users should easily understand.
+Then, the different interfaces CHAMELEON provides are exposed, from the
+simplest API (step1) to more complicated ones (until step4).
+The way some important parameters are set is discussed in step5.
+step6 is an example of distributed computation with MPI.
+Finally, step7 shows how to let Chameleon initialize the user's data
+(matrices/vectors) in parallel.
+
+Source files can be found in the @file{example/lapack_to_morse/}
+directory.
+If the CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then the
+source files are compiled with the project libraries.
+The arithmetic precision is @code{double}.
+To execute a step @samp{X}, enter the following command:
+@example
+./step@samp{X} --option1 --option2 ...
+@end example
+Instructions about the arguments to give to the executables are accessible
+thanks to the option @option{-[-]help} or @option{-[-]h}.
+Note that default values exist for the options.
+
+For all steps, the program solves a linear system @math{Ax=B}.
+The matrix values are randomly generated, but in such a way that matrix
+@math{A} is symmetric positive definite, so that @math{A} can be factorized
+in an @math{LL^T} form using the Cholesky factorization.
+
+Let us comment on the different steps of the tutorial:
+@menu
+* Step0:: a simple Cholesky example using the C interface of
+BLAS/LAPACK
+* Step1:: introduces the LAPACK equivalent interface of Chameleon
+* Step2:: introduces the tile interface
+* Step3:: indicates how to give your own tile matrix to Chameleon
+* Step4:: introduces the tile async interface
+* Step5:: shows how to set some important parameters
+* Step6:: shows how to benefit from MPI in Chameleon
+* Step7:: shows how to let Chameleon initialize the user's matrix data
+@end menu
+
+@node Step0
+@subsubsection Step0
+
+The C interfaces of BLAS and LAPACK, that is, CBLAS and
+LAPACKE, are used to solve the system. The size of the system (matrix) and
+the number of right-hand sides can be given as arguments to the executable
+(be careful not to give huge numbers if you do not have an infinite amount
+of RAM!).
+As for every step, the correctness of the solution is checked by calculating
+the norm @math{||Ax-B||/(||A||||x||+||B||)}.
+The time spent in factorization+solve is recorded and, because we know
+exactly the number of operations of these algorithms, we deduce the number
+of operations that have been processed per second (in GFlops/s).
+The important part of the code that solves the problem is:
+@verbatim
+/* Cholesky factorization:
+ * A is replaced by its factorization L or L^T depending on uplo */
+LAPACKE_dpotrf( LAPACK_COL_MAJOR, 'U', N, A, N );
+/* Solve:
+ * B is stored in X on entry, X contains the result on exit.
+ * Forward ...
+ */
+cblas_dtrsm(
+    CblasColMajor,
+    CblasLeft,
+    CblasUpper,
+    CblasConjTrans,
+    CblasNonUnit,
+    N, NRHS, 1.0, A, N, X, N);
+/* ... and back substitution */
+cblas_dtrsm(
+    CblasColMajor,
+    CblasLeft,
+    CblasUpper,
+    CblasNoTrans,
+    CblasNonUnit,
+    N, NRHS, 1.0, A, N, X, N);
+@end verbatim
+
+@node Step1
+@subsubsection Step1
+
+This step introduces the simplest CHAMELEON interface, which is equivalent
+to CBLAS/LAPACKE.
+The code is very similar to step0 but instead of calling CBLAS/LAPACKE
+functions, we call the CHAMELEON equivalent functions.
+The solving code becomes:
+@verbatim
+/* Factorization: */
+MORSE_dpotrf( UPLO, N, A, N );
+/* Solve: */
+MORSE_dpotrs(UPLO, N, NRHS, A, N, X, N);
+@end verbatim
+The API is almost the same, so that it is easy to use for beginners.
+It is important to keep in mind that before any call to MORSE routines,
+@code{MORSE_Init} has to be invoked to initialize MORSE and the runtime
+system.
+Example:
+@verbatim
+MORSE_Init( NCPU, NGPU );
+@end verbatim
+After all MORSE calls have been done, a call to @code{MORSE_Finalize} is
+required to free some data and finalize the runtime and/or MPI.
+@verbatim
+MORSE_Finalize();
+@end verbatim
+We use MORSE routines with the LAPACK interface, which means the routines
+accept the same matrix format as LAPACK (1-D array column-major).
+Note that we copy the matrix to get it into our own tile structures; see
+details about this format in @ref{Tile Data Layout}.
+This means you can incur an overhead coming from the copies.
+
+@node Step2
+@subsubsection Step2
+
+This program is a copy of step1 but, instead of using the LAPACK interface,
+which leads to copying the LAPACK matrices inside the MORSE routines, we use
+the tile interface.
+We still use the standard matrix format, but we see how to use this matrix
+to create a MORSE descriptor, a structure wrapping the data on which we want
+to apply sequential task-based algorithms.
+The solving code becomes:
+@verbatim
+/* Factorization: */
+MORSE_dpotrf_Tile( UPLO, descA );
+/* Solve: */
+MORSE_dpotrs_Tile( UPLO, descA, descX );
+@end verbatim
+To use the tile interface, a specific structure @code{MORSE_desc_t} must be
+created.
+This can be achieved in different ways.
+@enumerate
+@item Use the existing function @code{MORSE_Desc_Create}: the matrix data
+are considered contiguous in memory, as in PLASMA (@ref{Tile Data Layout}).
+@item Use the existing function @code{MORSE_Desc_Create_OOC}: the matrix
+data are allocated on demand in memory, tile by tile, and possibly pushed
+to disk if they do not fit in memory.
+@item Use the existing function @code{MORSE_Desc_Create_User}: it is more
+flexible than @code{Desc_Create} because you can provide your own way to
+access the tile data, so that your tiles can be allocated wherever you want
+in memory, see the next paragraph @ref{Step3}.
+@item Create your own function to fill the descriptor.
+If you understand the meaning of each item of @code{MORSE_desc_t} well, you
+should be able to fill the structure correctly (good luck).
+@end enumerate
+
+In Step2, we use the first way to create the descriptor:
+@verbatim
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N,
+                  1, 1);
+@end verbatim
+
+@itemize @bullet
+
+@item @code{descA} is the descriptor to create.
+
+@item The second argument is a pointer to existing data.
+The existing data must follow the LAPACK/PLASMA matrix layout @ref{Tile Data
+Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create
+the descriptor.
+The @code{MORSE_Desc_Create_User} function can be used if you have data
+organized differently.
+This is discussed in the next paragraph @ref{Step3}.
+Giving a @code{NULL} pointer means you let the function allocate memory space.
+This requires copying your data into the memory allocated by
+@code{Desc_Create}.
+This can be done with
+@verbatim
+MORSE_Lapack_to_Tile(A, N, descA);
+@end verbatim
+
+@item The third argument of @code{Desc_Create} is the datatype (used for
+memory allocation).
+
+@item The fourth through eighth arguments stand for, respectively, the number
+of rows (@code{NB}) and columns (@code{NB}) in each tile, the total number of
+values in a tile (@code{NB*NB}), and the number of rows (@code{N}) and columns
+(@code{N}) in the entire matrix.
+
+@item The ninth through twelfth arguments stand for, respectively, the
+beginning row (@code{0}) and column (@code{0}) indexes of the submatrix and
+the number of rows (@code{N}) and columns (@code{N}) in the submatrix.
+These arguments are specific and used in precise cases.
+If you do not consider submatrices, just use @code{0, 0, NROWS, NCOLS}.
+
+@item The last two arguments are the parameters of the 2-D block-cyclic
+distribution grid, see
+@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK}.
+To be able to use another data distribution over the nodes, the
+@code{MORSE_Desc_Create_User} function should be used.
+A complete sketch of the resulting sequence is given after this list.
+
+@end itemize
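+
+Putting these pieces together, a possible sketch of a complete Step2-like
+sequence is shown below (this is an illustration, not the exact step2
+source; @code{A} and @code{X} are LAPACK-layout arrays holding the matrix
+and the right-hand sides):
+@verbatim
+/* Sketch: create descriptors, copy LAPACK data in, factorize and
+ * solve with the tile interface, copy the result back, clean up. */
+MORSE_desc_t *descA = NULL, *descX = NULL;
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1);
+MORSE_Desc_Create(&descX, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+MORSE_Lapack_to_Tile(A, N, descA);
+MORSE_Lapack_to_Tile(X, N, descX);
+MORSE_dpotrf_Tile( UPLO, descA );
+MORSE_dpotrs_Tile( UPLO, descA, descX );
+MORSE_Tile_to_Lapack(descX, X, N);
+MORSE_Desc_Destroy(&descA);
+MORSE_Desc_Destroy(&descX);
+@end verbatim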
+
+@node Step3
+@subsubsection Step3
+
+This program makes use of the same interface as Step2 (the tile interface)
+but does not allocate LAPACK matrices anymore, so that no copy between the
+LAPACK matrix layout and the tile matrix layout is necessary to call MORSE
+routines.
+To generate random right-hand sides you can use:
+@verbatim
+/* Allocate memory and initialize descriptor B */
+MORSE_Desc_Create(&descB, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, NRHS,
+                  0, 0, N, NRHS, 1, 1);
+/* generate RHS with random values */
+MORSE_dplrnt_Tile( descB, 5673 );
+@end verbatim
+
+The other important point is that it is possible to create a descriptor, the
+structure necessary to call MORSE efficiently, by giving your own pointers to
+tiles if your matrix is not organized as a 1-D column-major array.
+This can be achieved with the @code{MORSE_Desc_Create_User} routine.
+Here is an example:
+@verbatim
+MORSE_Desc_Create_User(&descA, matA, MorseRealDouble,
+                       NB, NB, NB*NB, N, N,
+                       0, 0, N, N, 1, 1,
+                       user_getaddr_arrayofpointers,
+                       user_getblkldd_arrayofpointers,
+                       user_getrankof_zero);
+@end verbatim
+The first arguments are the same as those of the @code{MORSE_Desc_Create}
+routine.
+The following arguments allow you to give pointers to functions that manage
+the access to tiles from the structure given as the second argument.
+Here, for example, @code{matA} is an array containing addresses of tiles, see
+the function @code{allocate_tile_matrix} defined in @file{step3.h}.
+The three functions you have to define for @code{Desc_Create_User} are:
+@itemize @bullet
+@item a function that returns the address of tile @math{A(m,n)}, m and n
+standing for the indexes of the tile in the global matrix. Let us consider
+a @math{4x4} matrix with @math{2x2} tiles; the matrix contains four tiles of
+indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)},
+@math{A(m=1,n=1)}
+@item a function that returns the leading dimension of tile @math{A(m,*)}
+@item a function that returns the MPI rank of tile @math{A(m,n)}
+@end itemize
+Examples of these functions are visible in @file{step3.h}; a simplified
+sketch is also given at the end of this step.
+Note that the way we define these functions is related to the tile matrix
+format and to the data distribution considered.
+This example should not be used with MPI since all tiles are assigned to
+process @code{0}, which means that a potentially large amount of data would
+be transferred between nodes.
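+
+For illustration only, sketches of the three callbacks could look as
+follows; the @code{MORSE_desc_t} member names used here (@code{mat},
+@code{lmt}, @code{mb}) are assumptions and may differ in your version, so
+refer to @file{step3.h} for the actual code:
+@verbatim
+/* Sketch: tiles stored as an array of pointers (column-major order
+ * of tiles); desc->mat holds that array, desc->lmt the number of
+ * tile rows (assumed member names). */
+static void *user_getaddr_arrayofpointers(const MORSE_desc_t *desc,
+                                          int m, int n)
+{
+    double **tiles = (double **)desc->mat;
+    return (void *)tiles[(size_t)n * desc->lmt + m];
+}
+static int user_getblkldd_arrayofpointers(const MORSE_desc_t *desc, int m)
+{
+    return desc->mb;  /* each tile is stored with leading dimension mb */
+}
+static int user_getrankof_zero(const MORSE_desc_t *desc, int m, int n)
+{
+    return 0;  /* every tile owned by MPI process 0: not for MPI runs */
+}
+@end verbatim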
+
+@node Step4
+@subsubsection Step4
+This program is a copy of step2 but instead of using the tile interface, it
+uses the tile async interface.
+The goal is to exhibit the runtime synchronization barriers.
+Keep in mind that when the tile interface is called, like
+@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual
+execution and termination of all tasks, is called to ensure the
+proper completion of the algorithm (i.e. data are up-to-date).
+The code shows how to exploit the async interface to pipeline subsequent
+algorithms so that fewer synchronizations are done.
+The code becomes:
+@verbatim
+/* Morse structure containing parameters and a structure to interact with
+ * the Runtime system */
+MORSE_context_t *morse;
+/* MORSE sequence uniquely identifies a set of asynchronous function calls
+ * sharing common exception handling */
+MORSE_sequence_t *sequence = NULL;
+/* MORSE request uniquely identifies each asynchronous function call */
+MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
+int status;
+
+...
+
+morse_sequence_create(morse, &sequence);
+
+/* Factorization: */
+MORSE_dpotrf_Tile_Async( UPLO, descA, sequence, &request );
+
+/* Solve: */
+MORSE_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);
+
+/* Synchronization barrier (the runtime ensures that all submitted tasks
+ * have been terminated) */
+RUNTIME_barrier(morse);
+/* Ensure that all data processed on the GPUs we are depending on are back
+ * in main memory */
+RUNTIME_desc_getoncpu(descA);
+RUNTIME_desc_getoncpu(descX);
+
+status = sequence->status;
+
+@end verbatim
+Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed
+without synchronization, so that some tasks of @code{dpotrf} and @code{dpotrs}
+can be executed concurrently, which could increase performance.
+The async interface is very similar to the tile one.
+It is only necessary to give two new objects, @code{MORSE_sequence_t} and
+@code{MORSE_request_t}, used to handle asynchronous function calls.
+
+@center @image{potri_async,13cm,8cm}
+POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization
+barriers, courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
+
+@node Step5
+@subsubsection Step5
+
+Step5 shows how to set some important parameters.
+This program is a copy of Step4 but some additional parameters are given by
+the user.
+The parameters that can be set are:
+@itemize @bullet
+@item number of threads
+@item number of GPUs
+
+The numbers of workers can be given as arguments to the executable with the
+@option{--threads=} and @option{--gpus=} options.
+It is important to notice that we assign one thread per GPU to optimize data
+transfer between main memory and device memory.
+The number of workers of each type, @code{CPU} and @code{CUDA}, must be given
+at @code{MORSE_Init}.
+@verbatim
+if ( iparam[IPARAM_THRDNBR] == -1 ) {
+    get_thread_count( &(iparam[IPARAM_THRDNBR]) );
+    /* reserve one thread per CUDA device to optimize memory transfers */
+    iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
+}
+NCPU = iparam[IPARAM_THRDNBR];
+NGPU = iparam[IPARAM_NCUDAS];
+
+/* initialize MORSE with main parameters */
+MORSE_Init( NCPU, NGPU );
+@end verbatim
+
+@item matrix size
+@item number of right-hand sides
+@item block (tile) size
+
+The problem size is given with the @option{--n=} and @option{--nrhs=} options.
+The tile size is given with the @option{--nb=} option.
+These parameters are required to create descriptors.
+The tile size @code{NB} is a key parameter for performance since it
+defines the granularity of tasks.
+If @code{NB} is too large compared to @code{N}, there are few tasks to
+schedule; if the number of workers is large, this limits parallelism.
+On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks),
+workers might not be fed correctly and the runtime system's operations
+could represent a substantial overhead.
+A trade-off has to be found depending on many parameters: problem size,
+algorithm (which drives the data dependencies), architecture (number of
+workers, workers speed, workers uniformity, memory bus speed).
+By default it is set to 128.
+Do not hesitate to play with this parameter and compare performance on your
+machine; a simple sweep, as sketched after this list, can help.
+
+@item inner-blocking size
+
+The inner-blocking size is given with the @option{--ib=} option.
+This parameter is used by kernels (optimized algorithms applied on tiles) to
+perform subsequent operations with a data block size that fits the cache of
+the workers.
+Parameters @code{NB} and @code{IB} can be given with the @code{MORSE_Set}
+function:
+@verbatim
+MORSE_Set(MORSE_TILE_SIZE,        iparam[IPARAM_NB] );
+MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
+@end verbatim
+@end itemize
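+
+To find a good tile size on a given machine, a simple sweep over a few
+@code{NB} values is often enough; for example (the values below are
+purely illustrative):
+@example
+for nb in 64 128 256 320; do
+  ./step5 --n=8000 --nb=$nb --ib=32 --threads=8
+done
+@end example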
+
+@node Step6
+@subsubsection Step6
+
+This program is a copy of Step5 with some additional parameters to be set for
+the data distribution.
+To use this program properly, MORSE must use the StarPU runtime system and
+the MPI option must be activated at configure time.
+The data distribution used here is 2-D block-cyclic, see for example
+@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for an
+explanation.
+The user can enter the parameters of the distribution grid at execution time
+with the @option{--p=} option.
+Example using OpenMPI on four nodes with one process per node:
+@example
+mpirun -np 4 ./step6 --n=10000 --nb=320 --ib=64 \
+       --threads=8 --gpus=2 --p=2
+@end example
+
+In this program we use the tile data layout from PLASMA, so that the call
+@verbatim
+MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
+                       NB, NB, NB*NB, N, N,
+                       0, 0, N, N,
+                       GRID_P, GRID_Q,
+                       morse_getaddr_ccrb,
+                       morse_getblkldd_ccrb,
+                       morse_getrankof_2d);
+@end verbatim
+is equivalent to the following call
+@verbatim
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N,
+                  GRID_P, GRID_Q);
+@end verbatim
+the functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb} and
+@code{morse_getrankof_2d} being the ones used in @code{Desc_Create}.
+It is interesting to notice that the code is almost the same as Step5.
+The only additional information to give is the way tiles are distributed,
+through the third function given to @code{MORSE_Desc_Create_User}.
+Here, because we have made experiments only with a 2-D block-cyclic
+distribution, we have the parameters P and Q in the interface of
+@code{Desc_Create}, but they make sense only for a 2-D block-cyclic
+distribution, i.e. when the @code{morse_getrankof_2d} function is used.
+Of course they could be used with other distributions, no longer being the
+parameters of a 2-D block-cyclic grid but of another distribution.
+
+@node Step7
+@subsubsection Step7
+
+This program is a copy of step6 with some additional calls to
+build a matrix from within Chameleon using a function provided by the user.
+This can be seen as a replacement for functions like
+@code{MORSE_dplgsy_Tile()}, which can be used to fill the matrix with random
+data, @code{MORSE_dLapack_to_Tile()}, which fills the matrix with data stored
+in a LAPACK-like buffer, or @code{MORSE_Desc_Create_User()}, which can be
+used to describe an arbitrary tile matrix structure.
+In this example, the build callback functions are just wrappers around
+@code{CORE_xxx()} functions, so the output of the program step7 should be
+exactly the same as that of step6.
+The difference is that the function used to fill the tiles is provided by the
+user, and therefore this approach is much more flexible.
+
+The new function to understand is @code{MORSE_dbuild_Tile}, e.g.
+@verbatim
+struct data_pl data_A={(double)N, 51, N};
+MORSE_dbuild_Tile(MorseUpperLower, descA, (void*)&data_A,
+                  Morse_build_callback_plgsy);
+@end verbatim
+The idea here is to let Chameleon fill the matrix data in a task-based
+fashion (in parallel) by using a function given by the user.
+First, the user should define whether all the blocks must be entirely filled
+or just the upper/lower part, with, e.g. @code{MorseUpperLower}.
+We still rely on the same structure, @code{MORSE_desc_t}, which must be
+initialized with the proper parameters, by calling for example
+@code{MORSE_Desc_Create}.
+Then, an opaque pointer is used to let the user give some extra data used by
+their function.
+The last parameter is the pointer to the user's function.
+
+@node List of available routines
+@subsection List of available routines
+
+@menu
+* Auxiliary routines:: Init, Finalize, Version, etc.
+* Descriptor routines:: To handle descriptors
+* Options routines:: To set options
+* Sequences routines:: To manage asynchronous function calls
+* Linear Algebra routines:: Computational routines
+@end menu
+
+@node Auxiliary routines
+@subsubsection Auxiliary routines
+
+Report the MORSE version number.
+@verbatim
+int MORSE_Version (int *ver_major, int *ver_minor, int *ver_micro);
+@end verbatim
+
+Initialize MORSE: initialize some parameters, initialize the runtime and/or
+MPI.
+@verbatim
+int MORSE_Init (int nworkers, int ncudas);
+@end verbatim
+
+Finalize MORSE: free some data and finalize the runtime and/or MPI.
+@verbatim
+int MORSE_Finalize (void);
+@end verbatim
+
+Return the MPI rank of the calling process.
+@verbatim
+int MORSE_My_Mpi_Rank (void);
+@end verbatim
+
+Suspend the MORSE runtime from polling for new tasks, to avoid useless CPU
+consumption when no tasks have to be executed by the MORSE runtime system.
+@verbatim
+int MORSE_Pause (void);
+@end verbatim
+
+Symmetrical call to @code{MORSE_Pause}, used to resume the workers polling
+for new tasks.
+@verbatim
+int MORSE_Resume (void);
+@end verbatim
+
+Conversion from LAPACK layout to tile layout.
+@verbatim
+int MORSE_Lapack_to_Tile (void *Af77, int LDA, MORSE_desc_t *A);
+@end verbatim
+
+Conversion from tile layout to LAPACK layout.
+@verbatim
+int MORSE_Tile_to_Lapack (MORSE_desc_t *A, void *Af77, int LDA);
+@end verbatim
+
+@node Descriptor routines
+@subsubsection Descriptor routines
+
+@c /* Descriptor */
+Create matrix descriptor, internal function.
+@verbatim
+int MORSE_Desc_Create (MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
+                       int mb, int nb, int bsiz, int lm, int ln,
+                       int i, int j, int m, int n, int p, int q);
+@end verbatim
+
+Create matrix descriptor, user function.
+@verbatim
+int MORSE_Desc_Create_User(MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
+                           int mb, int nb, int bsiz, int lm, int ln,
+                           int i, int j, int m, int n, int p, int q,
+                           void* (*get_blkaddr)( const MORSE_desc_t*, int, int),
+                           int (*get_blkldd)( const MORSE_desc_t*, int ),
+                           int (*get_rankof)( const MORSE_desc_t*, int, int ));
+@end verbatim
+
+Destroy matrix descriptor.
+@verbatim
+int MORSE_Desc_Destroy (MORSE_desc_t **desc);
+@end verbatim
+
+Ensure that all data are up-to-date in main memory (even if some tasks have
+been processed on GPUs).
+@verbatim
+int MORSE_Desc_Getoncpu(MORSE_desc_t *desc);
+@end verbatim
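+
+For illustration, a typical descriptor life cycle combining these routines
+could look as follows (a sketch, reusing the conventions of the tutorial):
+@verbatim
+/* Sketch: create a descriptor, compute, fetch data back, destroy. */
+MORSE_desc_t *descA = NULL;
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1);
+/* ... submit computations on descA ... */
+MORSE_Desc_Getoncpu(descA);   /* make data up-to-date in main memory */
+MORSE_Desc_Destroy(&descA);
+@end verbatim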
+
+@node Options routines
+@subsubsection Options routines
+
+@c /* Options */
+Enable a MORSE feature.
+@verbatim
+int MORSE_Enable (MORSE_enum option);
+@end verbatim
+Features that can be enabled:
+@itemize @bullet
+@item @code{MORSE_WARNINGS}: printing of warning messages,
+@item @code{MORSE_ERRORS}: printing of error messages,
+@item @code{MORSE_AUTOTUNING}: autotuning for tile size and inner block size,
+@item @code{MORSE_PROFILING_MODE}: activate kernel profiling.
+@end itemize
+
+Disable a MORSE feature.
+@verbatim
+int MORSE_Disable (MORSE_enum option);
+@end verbatim
+Symmetric to @code{MORSE_Enable}.
+
+Set a MORSE parameter.
+@verbatim
+int MORSE_Set (MORSE_enum param, int value);
+@end verbatim
+Parameters that can be set:
+@itemize @bullet
+@item @code{MORSE_TILE_SIZE}: size of a matrix tile,
+@item @code{MORSE_INNER_BLOCK_SIZE}: size of a tile inner block,
+@item @code{MORSE_HOUSEHOLDER_MODE}: type of Householder trees (FLAT or TREE),
+@item @code{MORSE_HOUSEHOLDER_SIZE}: size of the groups in Householder trees,
+@item @code{MORSE_TRANSLATION_MODE}: related to
+@code{MORSE_Lapack_to_Tile}, see @file{ztile.c}.
+@end itemize
+
+Get the value of a MORSE parameter.
+@verbatim
+int MORSE_Get (MORSE_enum param, int *value);
+@end verbatim
+
+@node Sequences routines
+@subsubsection Sequences routines
+
+@c /* Sequences */
+Create a sequence.
+@verbatim
+int MORSE_Sequence_Create (MORSE_sequence_t **sequence);
+@end verbatim
+
+Destroy a sequence.
+@verbatim
+int MORSE_Sequence_Destroy (MORSE_sequence_t *sequence);
+@end verbatim
+
+Wait for the completion of a sequence.
+@verbatim
+int MORSE_Sequence_Wait (MORSE_sequence_t *sequence);
+@end verbatim
+
+@node Linear Algebra routines
+@subsubsection Linear Algebra routines
+
+The available routines computing linear algebra, of the form
+@code{MORSE_name[_Tile[_Async]]} (@code{name} follows the LAPACK naming
+scheme, see @uref{http://www.netlib.org/lapack/lug/node24.html}), are:
+
+@verbatim
+/** ********************************************************
+ * Declarations of computational functions (LAPACK layout)
+ **/
+
+int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT);
+
+int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS,
+                MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT,
+                MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
+                MORSE_Complex64_t *C, int LDC);
+
+int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT);
+
+int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                       MORSE_desc_t *descL, int *IPIV,
+                       MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                      MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA,
+                        MORSE_desc_t *descL, int *IPIV);
+
+int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA);
+
+int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS,
+                        MORSE_Complex64_t *A, int LDA,
+                        MORSE_desc_t *descL, int *IPIV,
+                        MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS,
+                       MORSE_Complex64_t *A, int LDA,
+                       MORSE_Complex64_t *B, int LDB);
+
+#ifdef COMPLEX
+int MORSE_zhemm(MORSE_enum side,
MORSE_enum uplo, int M, int N, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, + MORSE_Complex64_t *C, int LDC); + +int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K, + double alpha, MORSE_Complex64_t *A, int LDA, + double beta, MORSE_Complex64_t *C, int LDC); + +int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, double beta, + MORSE_Complex64_t *C, int LDC); +#endif + +int MORSE_zlacpy(MORSE_enum uplo, int M, int N, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +double MORSE_zlange(MORSE_enum norm, int M, int N, + MORSE_Complex64_t *A, int LDA); + +#ifdef COMPLEX +double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA); +#endif + +double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA); + +double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag, + int M, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha, + MORSE_Complex64_t beta, MORSE_Complex64_t *A, int LDA); + +int MORSE_zlauum(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +#ifdef COMPLEX +int MORSE_zplghe( double bump, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA, + unsigned long long int seed ); +#endif + +int MORSE_zplgsy( MORSE_Complex64_t bump, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA, + unsigned long long int seed ); + +int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA, + unsigned long long int seed ); + +int MORSE_zposv(MORSE_enum uplo, int N, int NRHS, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_zpotrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zsytrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zpotri(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +#if defined (PRECISION_c) || defined(PRECISION_z) +int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); +#endif + +int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, + MORSE_Complex64_t *C, int LDC); + +int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC); + +int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, + MORSE_Complex64_t *C, int LDC); + +int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + int N, int NRHS, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + int N, int NRHS, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descL, int *IPIV, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum 
uplo, + MORSE_enum transA, MORSE_enum diag, + int N, int NRHS, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N, + MORSE_Complex64_t *A, int LDA); + +int MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); + +int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); + +int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K, + MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descT, + MORSE_Complex64_t *B, int LDB); + +int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K, + MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, + MORSE_Complex64_t *B, int LDB); + +/** ****************************************************** + * Declarations of computational functions (tile layout) + **/ + +int MORSE_zgelqf_Tile(MORSE_desc_t *A, MORSE_desc_t *T); + +int MORSE_zgelqs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); + +int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B); + +int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C); + +int MORSE_zgeqrf_Tile(MORSE_desc_t *A, MORSE_desc_t *T); + +int MORSE_zgeqrs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); + +int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, + MORSE_desc_t *B); + +int MORSE_zgesv_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); + +int MORSE_zgetrf_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV); + +int MORSE_zgetrf_nopiv_Tile(MORSE_desc_t *A); + +int MORSE_zgetrs_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, + MORSE_desc_t *B); + +int MORSE_zgetrs_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); + +#ifdef COMPLEX +int MORSE_zhemm_Tile(MORSE_enum side, MORSE_enum uplo, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C); + +int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans, + double alpha, MORSE_desc_t *A, + double beta, MORSE_desc_t *C); + +int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, double beta, MORSE_desc_t *C); +#endif + +int MORSE_zlacpy_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); + +double MORSE_zlange_Tile(MORSE_enum norm, MORSE_desc_t *A); + +#ifdef COMPLEX +double MORSE_zlanhe_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A); +#endif + +double MORSE_zlansy_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A); + +double MORSE_zlantr_Tile(MORSE_enum norm, MORSE_enum uplo, + MORSE_enum diag, MORSE_desc_t *A); + +int MORSE_zlaset_Tile(MORSE_enum uplo, MORSE_Complex64_t alpha, + MORSE_Complex64_t beta, MORSE_desc_t *A); + +int MORSE_zlauum_Tile(MORSE_enum uplo, MORSE_desc_t *A); + +#ifdef COMPLEX +int MORSE_zplghe_Tile(double bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed); +#endif + +int MORSE_zplgsy_Tile(MORSE_Complex64_t bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed ); + +int MORSE_zplrnt_Tile(MORSE_desc_t *A, unsigned long long int seed ); + +int MORSE_zposv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); + +int MORSE_zpotrf_Tile(MORSE_enum uplo, MORSE_desc_t *A); + +int MORSE_zsytrf_Tile(MORSE_enum uplo, MORSE_desc_t *A); + +int 
MORSE_zpotri_Tile(MORSE_enum uplo, MORSE_desc_t *A);
+
+int MORSE_zpotrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
+
+#if defined (PRECISION_c) || defined(PRECISION_z)
+int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
+#endif
+
+int MORSE_zsymm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B, MORSE_Complex64_t beta,
+                     MORSE_desc_t *C);
+
+int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_Complex64_t beta, MORSE_desc_t *C);
+
+int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans,
+                      MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                      MORSE_desc_t *B, MORSE_Complex64_t beta,
+                      MORSE_desc_t *C);
+
+int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_enum transA, MORSE_enum diag,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B);
+
+int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_enum transA, MORSE_enum diag,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B);
+
+int MORSE_ztrsmpl_Tile(MORSE_desc_t *A, MORSE_desc_t *L,
+                       int *IPIV, MORSE_desc_t *B);
+
+int MORSE_ztrsmrv_Tile(MORSE_enum side, MORSE_enum uplo,
+                       MORSE_enum transA, MORSE_enum diag,
+                       MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                       MORSE_desc_t *B);
+
+int MORSE_ztrtri_Tile(MORSE_enum uplo, MORSE_enum diag, MORSE_desc_t *A);
+
+int MORSE_zunglq_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
+
+int MORSE_zungqr_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
+
+int MORSE_zunmlq_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A,
+                      MORSE_desc_t *T, MORSE_desc_t *B);
+
+int MORSE_zunmqr_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A,
+                      MORSE_desc_t *T, MORSE_desc_t *B);
+
+/** ****************************************
+ * Declarations of computational functions
+ * (tile layout, asynchronous execution)
+ **/
+
+int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A,
+                           MORSE_desc_t *T, MORSE_desc_t *B,
+                           MORSE_sequence_t *sequence,
+                           MORSE_request_t *request);
+
+int MORSE_zgemm_Tile_Async(MORSE_enum transA, MORSE_enum transB,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_Complex64_t beta,
+                           MORSE_desc_t *C, MORSE_sequence_t *sequence,
+                           MORSE_request_t *request);
+
+int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                  int *IPIV, MORSE_desc_t *B,
+                                  MORSE_sequence_t *sequence,
+                                  MORSE_request_t *request);
+
+int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B,
+                                 MORSE_sequence_t *sequence,
+                                 MORSE_request_t *request);
+
+int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                   int *IPIV, MORSE_sequence_t *sequence,
+                                   MORSE_request_t *request);
+
+int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A,
+                                  MORSE_sequence_t *sequence,
+                                  MORSE_request_t *request);
+
+int MORSE_zgetrs_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                   int *IPIV, MORSE_desc_t *B,
+                                   MORSE_sequence_t *sequence,
+                                   MORSE_request_t *request);
+
+int
MORSE_zgetrs_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#ifdef COMPLEX +int MORSE_zhemm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + double alpha, MORSE_desc_t *A, + double beta, MORSE_desc_t *C, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, double beta, MORSE_desc_t *C, + MORSE_sequence_t *sequence, + MORSE_request_t *request); +#endif + +int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#ifdef COMPLEX +int MORSE_zlanhe_Tile_Async(MORSE_enum norm, MORSE_enum uplo, + MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); +#endif + +int MORSE_zlansy_Tile_Async(MORSE_enum norm, MORSE_enum uplo, + MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlantr_Tile_Async(MORSE_enum norm, MORSE_enum uplo, + MORSE_enum diag, MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlaset_Tile_Async(MORSE_enum uplo, MORSE_Complex64_t alpha, + MORSE_Complex64_t beta, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlauum_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#ifdef COMPLEX +int MORSE_zplghe_Tile_Async(double bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed, + MORSE_sequence_t *sequence, + MORSE_request_t *request ); +#endif + +int MORSE_zplgsy_Tile_Async(MORSE_Complex64_t bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed, + MORSE_sequence_t *sequence, + MORSE_request_t *request ); + +int MORSE_zplrnt_Tile_Async(MORSE_desc_t *A, unsigned long long int seed, + MORSE_sequence_t *sequence, + MORSE_request_t *request ); + +int MORSE_zposv_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zpotrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zsytrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zpotri_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zpotrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#if defined (PRECISION_c) || defined(PRECISION_z) +int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); +#endif + +int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t 
alpha, MORSE_desc_t *A, + MORSE_Complex64_t beta, MORSE_desc_t *C, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrsmpl_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrsmrv_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrtri_Tile_Async(MORSE_enum uplo, MORSE_enum diag, + MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans, + MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans, + MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +@end verbatim + +@c -nofor_main diff --git a/doc/orgmode/figures/morse_header.png b/doc/orgmode/figures/morse_header.png new file mode 100644 index 0000000000000000000000000000000000000000..ada315a235dfd4ee4a35064e13ae0d680b480059 Binary files /dev/null and b/doc/orgmode/figures/morse_header.png differ diff --git a/doc/orgmode/figures/potri_async.png b/doc/orgmode/figures/potri_async.png new file mode 100644 index 0000000000000000000000000000000000000000..85ebe6ad9af3db6070cd898323400a8a584b7583 Binary files /dev/null and b/doc/orgmode/figures/potri_async.png differ diff --git a/doc/orgmode/figures/tile_layout.jpg b/doc/orgmode/figures/tile_layout.jpg new file mode 100644 index 0000000000000000000000000000000000000000..16a44b08afab7de2c15a75f200baf210c7fe6d3e Binary files /dev/null and b/doc/orgmode/figures/tile_layout.jpg differ diff --git a/doc/orgmode/figures/tile_layout.pdf b/doc/orgmode/figures/tile_layout.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f5df80dbe06de18346c1df6c14a20c6e1c24edd1 Binary files /dev/null and b/doc/orgmode/figures/tile_layout.pdf differ diff --git a/doc/orgmode/figures/tile_lu.jpg b/doc/orgmode/figures/tile_lu.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9da660ab607fae57cec55eb3c8ddc0512ea7fd62 Binary files /dev/null and b/doc/orgmode/figures/tile_lu.jpg differ diff --git a/doc/orgmode/figures/tile_lu.pdf b/doc/orgmode/figures/tile_lu.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c9b6df65197c83449c6335ebb1da393d92cd683f Binary files /dev/null and 
b/doc/orgmode/figures/tile_lu.pdf differ diff --git a/doc/orgmode/figures/trace_qr.jpg b/doc/orgmode/figures/trace_qr.jpg new file mode 100644 index 0000000000000000000000000000000000000000..92504d096fe829e3a0d9f2a296262c00cef3e792 Binary files /dev/null and b/doc/orgmode/figures/trace_qr.jpg differ diff --git a/doc/orgmode/figures/trace_qr.pdf b/doc/orgmode/figures/trace_qr.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e030af5e252dd26828aa156e17c6b1d240a534db Binary files /dev/null and b/doc/orgmode/figures/trace_qr.pdf differ diff --git a/doc/orgmode/morse.css b/doc/orgmode/morse.css new file mode 100644 index 0000000000000000000000000000000000000000..41adb20078f7e5b0af0af434fa51bdb361af022f --- /dev/null +++ b/doc/orgmode/morse.css @@ -0,0 +1,72 @@ +body { + padding: 2em 1em 2em 70px; + margin: 0; + font-family: sans-serif; + color: black; + background: white; + background-position: top left; + background-attachment: fixed; + background-repeat: no-repeat; +} +:link { color: #00C; background: transparent } +:visited { color: #609; background: transparent } +a:active { color: #C00; background: transparent } + +a:link img, a:visited img { border-style: none } + +a img { color: white; } +@media all { + a img { color: inherit; } +} + +th, td { + font-family: sans-serif; +} + +h1, h2, h3, h4, h5, h6 { text-align: left } +h1, h2, h3 { color: #005A9C; background: white } +h1 { font: 170% sans-serif } +h2 { font: 140% sans-serif } +h3 { font: 120% sans-serif } +h4 { font: bold 100% sans-serif } +h5 { font: italic 100% sans-serif } +h6 { font: small-caps 100% sans-serif } + +.hide { display: none } + +div.head { margin-bottom: 1em } +div.head h1 { margin-top: 2em; clear: both } +div.head table { margin-left: 2em; margin-top: 2em } + +p.copyright { font-size: small } +p.copyright small { font-size: small } + +@media screen { +a[href]:hover { background: #ffa } +} + +pre { margin-left: 2em } + +dt, dd { margin-top: 0; margin-bottom: 0 } +dt { font-weight: bold } + +pre, code { font-family: monospace } + +ul.toc, ol.toc { + list-style: disc; + list-style: none; +} + +@media aural { + h1, h2, h3 { stress: 20; richness: 90 } + .hide { speak: none } + p.copyright { volume: x-soft; speech-rate: x-fast } + dt { pause-before: 20% } + pre { speak-punctuation: code } +} + +/* +body { + background-image: url(); +} +*/ diff --git a/doc/orgmode/users_guide.org.in b/doc/orgmode/users_guide.org.in new file mode 100644 index 0000000000000000000000000000000000000000..517ea729681ccd54a7371bc66ca5cd33bc4bd795 --- /dev/null +++ b/doc/orgmode/users_guide.org.in @@ -0,0 +1,54 @@ +#+TITLE: CHAMELEON User's Guide +#+SUBTITLE: A dense linear algebra software for heterogeneous architectures +#+LANGUAGE: en +#+OPTIONS: H:3 num:t \n:nil @:t ::t |:t _:nil ^:nil -:t f:t *:t <:t +#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil pri:nil tags:not-in-toc html-style:nil +#+INCLUDE: "./version.org" +#+AUTHOR: version {{{VERSION}}} +* Version + This manual documents the usage of CHAMELEON *version {{{VERSION}}}*. + It was last updated on {{{UPDATED}}}. 
+* Authors + * Inria, + * University of Tennessee, + * University of Colorado Denver, + * King Abdullah University of Science and Technology +* Copying + + Copyright \copy 2017 Inria + + Copyright \copy 2014 The University of Tennessee + + Copyright \copy 2014 King Abdullah University of Science and Technology + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer listed + in this license in the documentation and/or other materials provided + with the distribution. + - Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + This software is provided by the copyright holders and contributors + "as is" and any express or implied warranties, including, but not + limited to, the implied warranties of merchantability and fitness for + a particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. +* Introduction to Chameleon +#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/introduction.org +* Installing Chameleon +#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/installing.org +# #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/configuration.org +# #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org diff --git a/doc/orgmode/version.org.in b/doc/orgmode/version.org.in new file mode 100644 index 0000000000000000000000000000000000000000..4481ea26c10481ec5021c69c06ac1027704296e9 --- /dev/null +++ b/doc/orgmode/version.org.in @@ -0,0 +1,4 @@ +#+MACRO: UPDATED 25 August 2017 +#+MACRO: UPDATED-MONTH August 2017 +#+MACRO: EDITION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@ +#+MACRO: VERSION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@