From 556840cffb3d1fca2b6acade000308d5def9537c Mon Sep 17 00:00:00 2001
From: Florent Pruvost <florent.pruvost@inria.fr>
Date: Wed, 2 Dec 2015 13:34:44 +0000
Subject: [PATCH] update doc: new trace option name, remove trailing whitespace

---
 docs/texinfo/chapters/configuration.texi | 158 ++--
 docs/texinfo/chapters/installing.texi    | 224 +++---
 docs/texinfo/chapters/introduction.texi  | 388 +++++-----
 docs/texinfo/chapters/using.texi         | 910 +++++++++++------------
 docs/texinfo/users_guide.texi.in         |  40 +-
 docs/texinfo/version.texi.in             |   4 +-
 6 files changed, 862 insertions(+), 862 deletions(-)

diff --git a/docs/texinfo/chapters/configuration.texi b/docs/texinfo/chapters/configuration.texi
index fef7d02fa..6b66acb0e 100644
--- a/docs/texinfo/chapters/configuration.texi
+++ b/docs/texinfo/chapters/configuration.texi
@@ -24,7 +24,7 @@
 @node Compilation configuration
 @section Compilation configuration
 
-The following arguments can be given to the @command{cmake <path to source 
+The following arguments can be given to the @command{cmake <path to source
 directory>} script.
 
 In this chapter, the following convention is used:
@@ -34,24 +34,24 @@ In this chapter, the following convention is used:
 @item
 @option{var} is a string and the correct value or an example will be given,
 @item
-@option{trigger} is an CMake option and the correct value is @code{ON} or 
+@option{trigger} is a CMake option and the correct value is @code{ON} or
 @code{OFF}.
 @end itemize
 
 Using CMake there are several ways to give options:
 @enumerate
 @item directly as CMake command line arguments
-@item invoque @command{cmake <path to source directory>} once and then use 
-@command{ccmake <path to source directory>} to edit options through a 
-minimalist gui (required 
+@item invoke @command{cmake <path to source directory>} once and then use
+@command{ccmake <path to source directory>} to edit options through a
+minimalist GUI (requires
 @samp{cmake-curses-gui} installed on a Linux system)
-@item invoque @command{cmake-gui} command and fill information about the 
-location of the sources and where to build the project, then you have 
-access to options through a user-friendly Qt interface (required 
+@item invoke the @command{cmake-gui} command and fill in the
+location of the sources and where to build the project; you then have
+access to options through a user-friendly Qt interface (requires
 @samp{cmake-qt-gui} installed on a Linux system)
 @end enumerate
 
-Example of configuration using the command line 
+Example of configuration using the command line:
 @example
 cmake ~/chameleon/ -DCMAKE_BUILD_TYPE=Debug          \
                    -DCMAKE_INSTALL_PREFIX=~/install  \
@@ -59,11 +59,11 @@ cmake ~/chameleon/ -DCMAKE_BUILD_TYPE=Debug          \
                    -DCHAMELEON_USE_MAGMA=ON          \
                    -DCHAMELEON_USE_MPI=ON            \
                    -DBLA_VENDOR=Intel10_64lp         \
-                   -DSTARPU_DIR=~/install/starpu-1.1 \                
-                   -DCHAMELEON_USE_FXT=ON 
+                   -DSTARPU_DIR=~/install/starpu-1.1 \
+                   -DCHAMELEON_ENABLE_TRACING=ON
 @end example
 
-You can get the full list of options with @option{-L[A][H]} options of 
+You can get the full list of options with the @option{-L[A][H]} options of the
 @command{cmake} command:
 @example
 cmake -LH <path to source directory>
@@ -80,14 +80,14 @@ cmake -LH <path to source directory>
 @table @code
 
 @item -DCMAKE_INSTALL_PREFIX=@option{path} (default:@option{path=/usr/local})
-Install directory used by @code{make install} where some headers and libraries 
+Install directory used by @code{make install} where some headers and libraries
 will be copied.
-Permissions have to be granted to write onto @option{path} during @code{make 
+Write permission on @option{path} is required during the @code{make
 install} step.
 
 @item -DCMAKE_BUILD_TYPE=@option{var} (default: @option{Release})
 Define the build type and the compiler optimization level.
-The possible values for @option{var} are: 
+The possible values for @option{var} are:
 @table @code
 @item empty
 @item Debug
@@ -97,7 +97,7 @@ The possible values for @option{var} are:
 @end table
 
 @item -DBUILD_SHARED_LIBS=@option{trigger} (default:@option{OFF})
-Indicate wether or not CMake has to build CHAMELEON static (@option{OFF}) or 
+Indicate whether CMake has to build CHAMELEON as static (@option{OFF}) or
 shared (@option{ON}) libraries.
 
 @end table
@@ -105,7 +105,7 @@ shared (@option{ON}) libraries.
 @node CHAMELEON options
 @subsection CHAMELEON options
 
-List of CHAMELEON options that can be enabled/disabled (value=@code{ON} 
+List of CHAMELEON options that can be enabled/disabled (value=@code{ON}
 or @code{OFF}):
 @table @code
 
@@ -116,40 +116,40 @@ to link with StarPU library (runtime system)
 to link with QUARK library (runtime system)
 
 @item @option{-DCHAMELEON_USE_CUDA}=@option{trigger} (default: @code{OFF})
-to link with CUDA runtime (implementation paradigm for accelerated codes on 
-GPUs) and cuBLAS library (optimized BLAS kernels on GPUs), can only be used with 
+to link with CUDA runtime (implementation paradigm for accelerated codes on
+GPUs) and cuBLAS library (optimized BLAS kernels on GPUs), can only be used with
 StarPU
 @item @option{-DCHAMELEON_USE_MAGMA}=@option{trigger} (default: @code{OFF})
-to link with MAGMA library (kernels on GPUs, higher level than cuBLAS), can only 
+to link with MAGMA library (kernels on GPUs, higher level than cuBLAS), can only
 be used with StarPU
 
 @item @option{-DCHAMELEON_USE_MPI}=@option{trigger} (default: @code{OFF})
-to link with MPI library (message passing implementation for use of multiple 
+to link with MPI library (message passing implementation for use of multiple
 nodes with distributed memory), can only be used with StarPU
 
-@item @option{-DCHAMELEON_USE_FXT}=@option{trigger} (default: @code{OFF})
-to link with FxT library (trace execution of kernels on workers), can only be 
-used with StarPU
+@item @option{-DCHAMELEON_ENABLE_TRACING}=@option{trigger} (default: @code{OFF})
+to enable trace generation during the execution of timing drivers.
+It requires StarPU to be linked with the FxT library (which traces kernel execution on workers).
 
 @item @option{-DCHAMELEON_SIMULATION=trigger} (default: @code{OFF})
-to enable simulation mode, means CHAMELEON will not really execute tasks, 
-see details in section @ref{Use simulation mode with StarPU-SimGrid}. 
-This option must be used with StarPU compiled with  
-@uref{http://simgrid.gforge.inria.fr/, SimGrid} allowing to guess the 
+to enable simulation mode, which means CHAMELEON will not actually execute tasks;
+see details in section @ref{Use simulation mode with StarPU-SimGrid}.
+This option must be used with StarPU compiled with
+@uref{http://simgrid.gforge.inria.fr/, SimGrid}, which allows estimating the
 execution time on any architecture.
-This feature should be used to make experiments on the scheduler behaviors and 
+This feature should be used to experiment with scheduler behaviors and
 performance, not to produce solutions of linear systems.
 
 @item @option{-DCHAMELEON_ENABLE_DOCS=trigger} (default: @code{ON})
 to control build of the documentation contained in @file{docs/} sub-directory
 @item @option{-DCHAMELEON_ENABLE_EXAMPLE=trigger} (default: @code{ON})
-to control build of the examples executables (API usage) 
+to control build of the example executables (API usage)
 contained in @file{example/} sub-directory
 @item @option{-DCHAMELEON_ENABLE_TESTING=trigger} (default: @code{ON})
-to control build of testing executables (numerical check) contained in 
+to control build of testing executables (numerical check) contained in
 @file{testing/} sub-directory
 @item @option{-DCHAMELEON_ENABLE_TIMING=trigger} (default: @code{ON})
-to control build of timing executables (performances check) contained in 
+to control build of timing executables (performance check) contained in
 @file{timing/} sub-directory
 
 @item @option{-DCHAMELEON_PREC_S=trigger} (default: @code{ON})
@@ -159,13 +159,13 @@ to enable the support of double arithmetic precision (double in C)
 @item @option{-DCHAMELEON_PREC_C=trigger} (default: @code{ON})
 to enable the support of complex arithmetic precision (complex in C)
 @item @option{-DCHAMELEON_PREC_Z=trigger} (default: @code{ON})
-to enable the support of double complex arithmetic precision (double complex 
+to enable the support of double complex arithmetic precision (double complex
 in C)
 
 @item @option{-DBLAS_VERBOSE=trigger} (default: @code{OFF})
 to make BLAS library discovery verbose
 @item @option{-DLAPACK_VERBOSE=trigger} (default: @code{OFF})
-to make LAPACK library discovery verbose (automatically enabled if 
+to make LAPACK library discovery verbose (automatically enabled if
 @option{BLAS_VERBOSE=@code{ON}})
 @end table
 
@@ -183,9 +183,9 @@ The possible values for @option{var} are:
 @item Generic
 @item ...
 @end table
-to force CMake to find a specific BLAS library, see the full list of BLA_VENDOR 
+to force CMake to find a specific BLAS library; see the full list of BLA_VENDOR values
 in @file{FindBLAS.cmake} in @file{cmake_modules/morse/find}.
-By default @option{BLA_VENDOR} is empty so that CMake tries to detect all 
+By default, @option{BLA_VENDOR} is empty, so CMake tries to detect all
 possible BLAS vendors, with a preference for Intel MKL.
 @end table
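+
+For example, to force the use of Intel MKL (the value used in the
+configuration example above):
+@example
+cmake <path to SOURCE_DIR> -DBLA_VENDOR=Intel10_64lp
+@end example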
 
@@ -198,11 +198,11 @@ directory of the LIBNAME library headers installation
 @item @option{-DLIBNAME_LIBDIR=@option{path}} (default: empty)
 directory of the LIBNAME libraries (.so, .a, .dylib, etc) installation
 @end table
-LIBNAME can be one of the following: BLAS - CBLAS - FXT - HWLOC - 
+LIBNAME can be one of the following: BLAS - CBLAS - FXT - HWLOC -
 LAPACK - LAPACKE - MAGMA - QUARK - STARPU - TMG.
 See paragraph about @ref{Dependencies detection} for details.
 
-Libraries detected with an official CMake module (see module files in 
+Libraries detected with an official CMake module (see module files in
 @file{CMAKE_ROOT/Modules/}):
 @itemize @bullet
 @item CUDA
@@ -210,7 +210,7 @@ Libraries detected with an official CMake module (see module files in
 @item Threads
 @end itemize
 
-Libraries detected with CHAMELEON cmake modules (see module files in 
+Libraries detected with CHAMELEON cmake modules (see module files in
 @file{cmake_modules/morse/find/} directory of CHAMELEON sources):
 @itemize @bullet
 @item BLAS
@@ -222,26 +222,26 @@ Libraries detected with CHAMELEON cmake modules (see module files in
 @item MAGMA
 @item QUARK
 @item STARPU
-@item TMG 
+@item TMG
 @end itemize
 
 
 @node Dependencies detection
 @section Dependencies detection
-You have different choices to detect dependencies on your system, either by 
-setting some environment variables containing paths to the libs and headers or 
-by specifying them directly at cmake configure. 
+You can let CMake detect the dependencies on your system in different ways: either by
+setting environment variables containing the paths to the libraries and headers, or
+by specifying them directly at CMake configuration time.
 Different cases:
 @enumerate
-@item detection of dependencies through environment variables: 
+@item detection of dependencies through environment variables:
   @itemize @bullet
-  @item @env{LD_LIBRARY_PATH} environment variable should contain the list of 
-paths 
+  @item the @env{LD_LIBRARY_PATH} environment variable should contain the list of
+paths
 where to find the libraries:
-    @example 
+    @example
     export @env{LD_LIBRARY_PATH}=$@env{LD_LIBRARY_PATH}:path/to/your/libs
     @end example
-  @item @env{INCLUDE} environment variable should contain the list of paths 
+  @item the @env{INCLUDE} environment variable should contain the list of paths
 where to find the header files of libraries
     @example
     export @env{INCLUDE}=$@env{INCLUDE}:path/to/your/headers
@@ -250,27 +250,27 @@ where to find the header files of libraries
 
 @item detection with user's given paths:
   @itemize @bullet
-  @item you can specify the path at cmake configure by invoking 
-  @example 
-  cmake <path to SOURCE_DIR> -DLIBNAME_DIR=path/to/your/lib 
+  @item you can specify the paths at CMake configuration time by invoking
+  @example
+  cmake <path to SOURCE_DIR> -DLIBNAME_DIR=path/to/your/lib
   @end example
  where LIBNAME stands for the name of the library to look for, for example
   @example
   cmake <path to SOURCE_DIR> -DSTARPU_DIR=path/to/starpudir \
                              -DCBLAS_DIR= ...
   @end example
-  @item it is also possible to specify headers and library directories 
+  @item it is also possible to specify the header and library directories
 separately, example
   @example
   cmake <path to SOURCE_DIR>                           \
   -DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \
   -DSTARPU_LIBDIR=path/to/libstarpu/lib
   @end example
-  @item Note BLAS and LAPACK detection can be tedious so that we provide a 
-verbose mode. Use @option{-DBLAS_VERBOSE=ON} or @option{-DLAPACK_VERBOSE=ON} to 
+  @item Note that BLAS and LAPACK detection can be tedious, so we provide a
+verbose mode. Use @option{-DBLAS_VERBOSE=ON} or @option{-DLAPACK_VERBOSE=ON} to
 enable it.
   @end itemize
-  
+
 @end enumerate
 
 
@@ -280,49 +280,49 @@ enable it.
 @node Use FxT profiling through StarPU
 @section Use FxT profiling through StarPU
 
-StarPU can generate its own trace log files by compiling it with the 
-@option{--with-fxt} 
-option at the configure step (you can have to specify the directory where you 
-installed FxT by giving @option{--with-fxt=...} instead of @option{--with-fxt} 
-alone). 
-By doing so, traces are generated after each execution of a program which uses 
-StarPU in the directory pointed by the @env{STARPU_FXT_PREFIX} environment 
-variable. Example: 
+StarPU can generate its own trace log files if it was compiled with the
+@option{--with-fxt}
+option at the configure step (you may have to specify the directory where you
+installed FxT by giving @option{--with-fxt=...} instead of @option{--with-fxt}
+alone).
+By doing so, traces are generated after each execution of a program which uses
+StarPU, in the directory pointed to by the @env{STARPU_FXT_PREFIX} environment
+variable. Example:
 @example
 export @env{STARPU_FXT_PREFIX}=/home/yourname/fxt_files/
 @end example
 
-When executing a @command{./timing/...} CHAMELEON program, if it has been 
-enabled (StarPU compiled with FxT and @option{-DCHAMELEON_USE_FXT=ON}), you 
-can give the option @option{--trace} to tell the program to generate trace log 
+When executing a @command{./timing/...} CHAMELEON program, if tracing has been
+enabled (StarPU compiled with FxT and @option{-DCHAMELEON_ENABLE_TRACING=ON}), you
+can give the option @option{--trace} to tell the program to generate trace log
 files.
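+
+For instance, assuming a timing driver named @command{time_dpotrf} (the
+driver name and the size options below are illustrative):
+@example
+./timing/time_dpotrf --n_range=4000:4000:1 --trace
+@end example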
 
-Finally, to generate the trace file which can be opened with 
-@uref{http://vite.gforge.inria.fr/, Vite} program, you have to use the 
-@command{starpu_fxt_tool} executable of StarPU. 
-This tool should be in @file{path/to/your/install/starpu/bin}. 
-You can use it to generate the trace file like this: 
+Finally, to generate the trace file, which can be opened with the
+@uref{http://vite.gforge.inria.fr/, ViTE} program, you have to use the
+@command{starpu_fxt_tool} executable of StarPU.
+This tool should be in @file{path/to/your/install/starpu/bin}.
+You can use it to generate the trace file like this:
 @itemize @bullet
 @item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename}
 
 There is one file per MPI process (prof_filename_0, prof_filename_1, ...).
 To generate a trace of MPI programs, you can call it like this:
-@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i 
+@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i
 prof_filename*}
 
-The trace file will be named paje.trace (use -o option to specify an output 
+The trace file will be named paje.trace (use the @option{-o} option to specify an output
 name).
-@end itemize 
+@end itemize
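+
+Once @file{paje.trace} has been generated, it can be opened with ViTE
+(assuming ViTE is installed and available in your @env{PATH}):
+@example
+vite paje.trace
+@end example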
 
 
 @node Use simulation mode with StarPU-SimGrid
 @section Use simulation mode with StarPU-SimGrid
 
-Simulation mode can be enabled by setting the cmake option 
+Simulation mode can be enabled by setting the CMake option
 @option{-DCHAMELEON_SIMULATION=ON}.
-This mode allows you to simulate execution of algorithms with StarPU compiled 
+This mode allows you to simulate the execution of algorithms with StarPU compiled
 with @uref{http://simgrid.gforge.inria.fr/, SimGrid}.
-To do so, we provide some perfmodels in the @file{simucore/perfmodels/} 
+To do so, we provide some perfmodels in the @file{simucore/perfmodels/}
 directory of CHAMELEON sources.
 To use these perfmodels, please set the following
 @itemize @bullet
@@ -330,12 +330,12 @@ To use these perfmodels, please set the following
   @example
   @code{<path to SOURCE_DIR>/simucore/perfmodels}
   @end example
-@item @env{STARPU_HOSTNAME} environment variable to the name of the machine to 
+@item @env{STARPU_HOSTNAME} environment variable to the name of the machine to
 simulate. For example, on our platform (PlaFRIM) with GPUs at Inria Bordeaux
   @example
   @env{STARPU_HOSTNAME}=mirage
   @end example
-Note that only POTRF kernels with block sizes of 320 or 960 (simple and double 
+Note that only POTRF kernels with block sizes of 320 or 960 (single and double
 precision) on the mirage machine are available for now.
 The database of models is subject to change; it should be enriched in the near future.
 @end itemize
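+
+A complete simulated run might then look as follows (the driver name, the
+sizes and the @env{STARPU_PERF_MODEL_DIR} variable name follow StarPU and
+CHAMELEON conventions and are given for illustration only):
+@example
+export @env{STARPU_PERF_MODEL_DIR}=<path to SOURCE_DIR>/simucore/perfmodels
+export @env{STARPU_HOSTNAME}=mirage
+./timing/time_spotrf_tile --n_range=9600:9600:1 --nb=960
+@end example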
diff --git a/docs/texinfo/chapters/installing.texi b/docs/texinfo/chapters/installing.texi
index 0ed0735e6..d46c112ce 100644
--- a/docs/texinfo/chapters/installing.texi
+++ b/docs/texinfo/chapters/installing.texi
@@ -11,12 +11,12 @@
 * Build process of CHAMELEON::
 @end menu
 
-CHAMELEON can be built and installed by the standard means of CMake 
+CHAMELEON can be built and installed by the standard means of CMake
 (@uref{http://www.cmake.org/}).
-General information about CMake, as well as installation binaries and CMake 
-source code are available from 
+General information about CMake, as well as installation binaries and CMake
+source code, is available from
 @uref{http://www.cmake.org/cmake/resources/software.html}.
-The following chapter is intended to briefly remind how these tools can be used 
+The following chapter is intended to briefly recall how these tools can be used
 to install CHAMELEON.
 
 @node Downloading CHAMELEON
@@ -31,11 +31,11 @@ to install CHAMELEON.
 @node Getting Sources
 @subsection Getting Sources
 
-The latest official release tarballs of CHAMELEON sources are available for 
-download from 
+The latest official release tarballs of CHAMELEON sources are available for
+download from
 @uref{http://morse.gforge.inria.fr/chameleon-0.9.1.tar.gz, chameleon-0.9.1}.
 
-@c The latest development snapshot is available from  
+@c The latest development snapshot is available from
 @c @uref{http://hydra.bordeaux.inria.fr/job/hiepacs/morse-cmake/tarball/latest/
 @c download-by-type/file/source-dist}.
 
@@ -57,141 +57,141 @@ download from
 @node a BLAS implementation
 @subsubsection a BLAS implementation
 
-@uref{http://www.netlib.org/blas/, BLAS} (Basic Linear Algebra Subprograms), 
-are a de facto standard for basic linear algebra operations such as vector and 
-matrix multiplication. 
-FORTRAN implementation of BLAS is available from Netlib. 
-Also, C implementation of BLAS is included in GSL (GNU Scientific Library). 
-Both these implementations are reference implementation of BLAS, are not 
-optimized for modern processor architectures and provide an order of magnitude 
-lower performance than optimized implementations. 
-Highly optimized implementations of BLAS are available from many hardware 
-vendors, such as Intel MKL and AMD ACML. 
-Fast implementations are also available as academic packages, such as ATLAS and 
-Goto BLAS. 
+@uref{http://www.netlib.org/blas/, BLAS} (Basic Linear Algebra Subprograms)
+is a de facto standard for basic linear algebra operations such as vector and
+matrix multiplication.
+A FORTRAN implementation of BLAS is available from Netlib.
+Also, a C implementation of BLAS is included in GSL (GNU Scientific Library).
+Both of these are reference implementations of BLAS; they are not
+optimized for modern processor architectures and provide an order of magnitude
+lower performance than optimized implementations.
+Highly optimized implementations of BLAS are available from many hardware
+vendors, such as Intel MKL and AMD ACML.
+Fast implementations are also available as academic packages, such as ATLAS and
+Goto BLAS.
 The standard interface to BLAS is the FORTRAN interface.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
-the reference BLAS from NETLIB and the Intel MKL 11.1 from Intel distribution 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
+the reference BLAS from NETLIB and the Intel MKL 11.1 from Intel distribution
 2013_sp1.
 
 @node CBLAS
 @subsubsection CBLAS
 
-@uref{http://www.netlib.org/blas/#_cblas, CBLAS} is a C language interface to 
+@uref{http://www.netlib.org/blas/#_cblas, CBLAS} is a C language interface to
 BLAS.
-Most commercial and academic implementations of BLAS also provide CBLAS. 
-Netlib provides a reference implementation of CBLAS on top of FORTRAN BLAS 
-(Netlib CBLAS). 
+Most commercial and academic implementations of BLAS also provide CBLAS.
+Netlib provides a reference implementation of CBLAS on top of FORTRAN BLAS
+(Netlib CBLAS).
 Since GSL is implemented in C, it naturally provides CBLAS.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
-the reference CBLAS from NETLIB and the Intel MKL 11.1 from Intel distribution 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
+the reference CBLAS from NETLIB and the Intel MKL 11.1 from Intel distribution
 2013_sp1.
 
 @node a LAPACK implementation
 @subsubsection a LAPACK implementation
 
-@uref{http://www.netlib.org/lapack/, LAPACK} (Linear Algebra PACKage) is a 
-software library for numerical linear algebra, a successor of LINPACK and 
-EISPACK and a predecessor of CHAMELEON. 
-LAPACK provides routines for solving linear systems of equations, linear least 
-square problems, eigenvalue problems and singular value problems. 
+@uref{http://www.netlib.org/lapack/, LAPACK} (Linear Algebra PACKage) is a
+software library for numerical linear algebra, a successor of LINPACK and
+EISPACK and a predecessor of CHAMELEON.
+LAPACK provides routines for solving linear systems of equations, linear least
+squares problems, eigenvalue problems and singular value problems.
 Most commercial and academic BLAS packages also provide some LAPACK routines.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
-the reference LAPACK from NETLIB and the Intel MKL 11.1 from Intel distribution 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
+the reference LAPACK from NETLIB and the Intel MKL 11.1 from Intel distribution
 2013_sp1.
 
 @node LAPACKE
 @subsubsection LAPACKE
 
-@uref{http://www.netlib.org/lapack/, LAPACKE} is a C language interface to 
-LAPACK (or CLAPACK). 
-It is produced by Intel in coordination with the LAPACK team and is available 
-in source code from Netlib in its original version (Netlib LAPACKE) and from 
-CHAMELEON website in an extended version (LAPACKE for CHAMELEON). 
-In addition to implementing the C interface, LAPACKE also provides routines 
-which automatically handle workspace allocation, making the use of LAPACK much 
+@uref{http://www.netlib.org/lapack/, LAPACKE} is a C language interface to
+LAPACK (or CLAPACK).
+It is produced by Intel in coordination with the LAPACK team and is available
+in source code from Netlib in its original version (Netlib LAPACKE) and from
+CHAMELEON website in an extended version (LAPACKE for CHAMELEON).
+In addition to implementing the C interface, LAPACKE also provides routines
+which automatically handle workspace allocation, making the use of LAPACK much
 more convenient.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
-the reference LAPACKE from NETLIB. 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
+the reference LAPACKE from NETLIB.
 A stand-alone version of LAPACKE is required.
 
 @node libtmg
 @subsubsection libtmg
 
-@uref{http://www.netlib.org/lapack/, libtmg} is a component of the LAPACK 
-library, containing routines for generation 
-of input matrices for testing and timing of LAPACK. 
-The testing and timing suites of LAPACK require libtmg, but not the library 
+@uref{http://www.netlib.org/lapack/, libtmg} is a component of the LAPACK
+library, containing routines for generation
+of input matrices for testing and timing of LAPACK.
+The testing and timing suites of LAPACK require libtmg, but not the library
 itself. Note that the LAPACK library can be built and used without libtmg.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
-the reference TMG from NETLIB and the Intel MKL 11.1 from Intel distribution 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
+the reference TMG from NETLIB and the Intel MKL 11.1 from Intel distribution
 2013_sp1.
 
 @node QUARK
 @subsubsection QUARK
 
-@uref{http://icl.cs.utk.edu/quark/, QUARK} (QUeuing And Runtime for Kernels) 
-provides a library that enables the dynamic execution of tasks with data 
-dependencies in a multi-core, multi-socket, shared-memory environment. 
-One of QUARK or StarPU Runtime systems has to be enabled in order to schedule 
+@uref{http://icl.cs.utk.edu/quark/, QUARK} (QUeuing And Runtime for Kernels)
+provides a library that enables the dynamic execution of tasks with data
+dependencies in a multi-core, multi-socket, shared-memory environment.
+One of the QUARK or StarPU runtime systems has to be enabled in order to schedule
 tasks on the architecture.
 If QUARK is enabled then StarPU is disabled and conversely.
 Note StarPU is enabled by default.
-When CHAMELEON is linked with QUARK, it is not possible to exploit neither 
+When CHAMELEON is linked with QUARK, it is possible to exploit neither
 CUDA (for GPUs) nor MPI (distributed-memory environment).
 You can use StarPU to do so.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
 the QUARK library from PLASMA release between versions 2.5.0 and 2.6.0.
 
 @node StarPU
 @subsubsection StarPU
 
-@uref{http://runtime.bordeaux.inria.fr/StarPU/, StarPU} is a task programming 
+@uref{http://runtime.bordeaux.inria.fr/StarPU/, StarPU} is a task programming
 library for hybrid architectures.
 StarPU handles run-time concerns such as:
 @itemize @bullet
 @item Task dependencies
 @item Optimized heterogeneous scheduling
-@item Optimized data transfers and replication between main memory and discrete 
+@item Optimized data transfers and replication between main memory and discrete
 memories
 @item Optimized cluster communications
 @end itemize
 StarPU can be used to benefit from GPUs and distributed-memory environment.
-One of QUARK or StarPU runtime system has to be enabled in order to schedule 
+One of the QUARK or StarPU runtime systems has to be enabled in order to schedule
 tasks on the architecture.
 If StarPU is enabled then QUARK is disabled and conversely.
 Note StarPU is enabled by default.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
 StarPU-1.1 releases.
 
 @node hwloc
 @subsubsection hwloc
 
-@uref{http://www.open-mpi.org/projects/hwloc/, hwloc} (Portable Hardware 
-Locality) is a software package for accessing the  topology of a multicore 
-system including components like: cores, sockets, caches and NUMA nodes. 
-@c The topology discovery library, @code{hwloc}, is not mandatory to use StarPU 
-@c but strongly recommended. 
-It allows to increase performance, and to perform some topology aware 
+@uref{http://www.open-mpi.org/projects/hwloc/, hwloc} (Portable Hardware
+Locality) is a software package for accessing the topology of a multicore
+system, including components like cores, sockets, caches and NUMA nodes.
+@c The topology discovery library, @code{hwloc}, is not mandatory to use StarPU
+@c but strongly recommended.
+It makes it possible to increase performance and to perform some topology-aware
 scheduling.
-@code{hwloc} is available in major distributions and for most OSes and can be  
+@code{hwloc} is available in major distributions and for most OSes and can be
 downloaded from @uref{http://www.open-mpi.org/software/hwloc}.
 
-@strong{Caution about the compatibility:} hwloc should be compatible with the 
+@strong{Caution about the compatibility:} hwloc should be compatible with the
 version of StarPU used.
 
 @node pthread
 @subsubsection pthread
 
-POSIX threads library is required to run CHAMELEON on Unix-like systems. 
-It is a standard component of any such system. 
+The POSIX threads library is required to run CHAMELEON on Unix-like systems.
+It is a standard component of any such system.
 @comment  Windows threads are used on Microsoft Windows systems.
 
 @node Optional dependencies
@@ -207,71 +207,71 @@ It is a standard component of any such system.
 @node OpenMPI
 @subsubsection OpenMPI
 
-@uref{http://www.open-mpi.org/, OpenMPI} is an open source Message Passing 
-Interface implementation for execution on multiple nodes with 
+@uref{http://www.open-mpi.org/, OpenMPI} is an open source Message Passing
+Interface implementation for execution on multiple nodes in a
 distributed-memory environment.
 MPI can be enabled only if the runtime system chosen is StarPU (default).
-To use MPI through StarPU, it is necessary to compile StarPU with MPI 
+To use MPI through StarPU, it is necessary to compile StarPU with MPI
 enabled.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
 OpenMPI releases from versions 1.4 to 1.6.
 
 @node Nvidia CUDA Toolkit
 @subsubsection Nvidia CUDA Toolkit
 
-@uref{https://developer.nvidia.com/cuda-toolkit, Nvidia CUDA Toolkit} provides 
-a 
-comprehensive development environment for C and C++ developers building 
-GPU-accelerated applications. 
-CHAMELEON can use a set of low level optimized kernels coming from cuBLAS to 
+@uref{https://developer.nvidia.com/cuda-toolkit, Nvidia CUDA Toolkit} provides
+a comprehensive development environment for C and C++ developers building
+GPU-accelerated applications.
+CHAMELEON can use a set of low-level optimized kernels coming from cuBLAS to
 accelerate computations on GPUs.
-The @uref{http://docs.nvidia.com/cuda/cublas/, cuBLAS} library is an 
-implementation of BLAS (Basic Linear Algebra Subprograms) on top of the Nvidia 
+The @uref{http://docs.nvidia.com/cuda/cublas/, cuBLAS} library is an
+implementation of BLAS (Basic Linear Algebra Subprograms) on top of the Nvidia
 CUDA runtime.
 cuBLAS is normally distributed with the Nvidia CUDA Toolkit.
-CUDA/cuBLAS can be enabled in CHAMELEON only if the runtime system chosen 
+CUDA/cuBLAS can be enabled in CHAMELEON only if the runtime system chosen
 is StarPU (default).
-To use CUDA through StarPU, it is necessary to compile StarPU with CUDA 
+To use CUDA through StarPU, it is necessary to compile StarPU with CUDA
 enabled.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
-CUDA releases from versions 4 to 6. 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
+CUDA releases from versions 4 to 6.
 MAGMA library must be compatible with CUDA.
 
 @node MAGMA
 @subsubsection MAGMA
 
-@uref{http://icl.cs.utk.edu/magma/, MAGMA} project aims to develop a dense 
-linear algebra library similar to LAPACK but for heterogeneous/hybrid 
+The @uref{http://icl.cs.utk.edu/magma/, MAGMA} project aims to develop a dense
+linear algebra library similar to LAPACK but for heterogeneous/hybrid
 architectures, starting with current "Multicore+GPU" systems.
-CHAMELEON can use a set of high level MAGMA routines to accelerate 
+CHAMELEON can use a set of high level MAGMA routines to accelerate
 computations on GPUs.
-To fully benefit from GPUs, the user should enable MAGMA in addition to 
+To fully benefit from GPUs, the user should enable MAGMA in addition to
 CUDA/cuBLAS.
 
-@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with 
+@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with
 MAGMA releases from versions 1.4 to 1.6.
 MAGMA library must be compatible with CUDA.
-MAGMA library should be built with sequential versions of BLAS/LAPACK. 
-We should not get some MAGMA link flags embarking multithreaded 
-BLAS/LAPACK because it could affect permformances (take care about the 
-MAGMA link flag @option{-lmkl_intel_thread} for example that we could heritate 
+The MAGMA library should be built with sequential versions of BLAS/LAPACK.
+The MAGMA link flags should not pull in multithreaded
+BLAS/LAPACK, because they could degrade performance (beware of the
+MAGMA link flag @option{-lmkl_intel_thread}, for example, which could be inherited
 from the pkg-config file @file{magma.pc}).
 
 @node FxT
 @subsubsection FxT
 
-@uref{http://download.savannah.gnu.org/releases/fkt/, FxT} stands for both 
-FKT (Fast Kernel Tracing) and FUT (Fast User Tracing). 
+@uref{http://download.savannah.gnu.org/releases/fkt/, FxT} stands for both
+FKT (Fast Kernel Tracing) and FUT (Fast User Tracing).
 This library provides efficient support for recording traces.
-CHAMELEON can trace kernels execution on the different workers and produce 
-.paje files if FxT is enabled. 
-FxT can only be used through StarPU and StarPU must be compiled with FxT 
-enabled, see how to use this feature here @ref{Use FxT profiling through 
+CHAMELEON can trace kernel execution on the different workers and produce
+.paje files if FxT is enabled.
+FxT can only be used through StarPU, and StarPU must be compiled with FxT
+enabled; see how to use this feature in @ref{Use FxT profiling through
 StarPU}.
 
-@strong{Caution about the compatibility:} FxT should be compatible with the 
+@strong{Caution about the compatibility:} FxT should be compatible with the
 version of StarPU used.
 
 @node Build process of CHAMELEON
@@ -288,14 +288,14 @@ version of StarPU used.
 @node Setting up a build directory
 @subsection Setting up a build directory
 
-The CHAMELEON build process requires CMake version 2.8.0 or higher and 
+The CHAMELEON build process requires CMake version 2.8.0 or higher and
 working C and Fortran compilers.
-Compilation and link with CHAMELEON libraries have been tested with 
+Compilation and link with CHAMELEON libraries have been tested with
 @strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}.
 On Unix-like operating systems, it also requires Make.
-The CHAMELEON project can not be configured for an in-source build. 
-You will get an error message if you try to compile in-source. 
-Please clean the root of your project by deleting the generated 
+The CHAMELEON project cannot be configured for an in-source build.
+You will get an error message if you try to compile in-source.
+Please clean the root of your project by deleting the generated
 @file{CMakeCache.txt} file (and other CMake generated files).
 
 @example
@@ -304,7 +304,7 @@ cd build
 @end example
 
 @quotation
-You can create a build directory from any location you would like. It can be a 
+You can create a build directory in any location you like. It can be a
 sub-directory of the CHAMELEON base source directory or anywhere else.
 @end quotation
 
@@ -314,10 +314,10 @@ sub-directory of the CHAMELEON base source directory or anywhere else.
 @example
 cmake <path to SOURCE_DIR> -DOPTION1= -DOPTION2= ...
 @end example
-@file{<path to SOURCE_DIR>} represents the root of CHAMELEON project where 
-stands 
+@file{<path to SOURCE_DIR>} represents the root of the CHAMELEON project, which
+contains
 the main (parent) @file{CMakeLists.txt} file.
-Details about options that are useful to give to @command{cmake <path to 
+Details about options that are useful to give to @command{cmake <path to
 SOURCE_DIR>} are given in @ref{Compilation configuration}.
 
 @node Building
@@ -331,7 +331,7 @@ do not hesitate to use @option{-j[ncores]} option to speedup the compilation
 @node Tests
 @subsection Tests
 
-In order to make sure that CHAMELEON is working properly on the system, it is 
+In order to make sure that CHAMELEON is working properly on the system, it is
 also possible to run a test suite.
 
 @example
@@ -345,13 +345,13 @@ ctest
 @node Installing
 @subsection Installing
 
-In order to install CHAMELEON at the location that was specified during 
+In order to install CHAMELEON at the location that was specified during
 configuration:
 
 @example
 make install
 @end example
-do not forget to specify the install directory with 
+Do not forget to specify the install directory with
 @option{-DCMAKE_INSTALL_PREFIX} at cmake configure
 @example
 cmake <path to SOURCE_DIR> -DCMAKE_INSTALL_PREFIX=<path to INSTALL_DIR>
diff --git a/docs/texinfo/chapters/introduction.texi b/docs/texinfo/chapters/introduction.texi
index a666e2786..695d252cd 100644
--- a/docs/texinfo/chapters/introduction.texi
+++ b/docs/texinfo/chapters/introduction.texi
@@ -15,7 +15,7 @@
 @section MORSE project
 
 @ifnottex
-@center @image{morse_header} 
+@center @image{morse_header}
 @end ifnottex
 
 @menu
@@ -27,43 +27,43 @@
 @node MORSE Objectives
 @subsection MORSE Objectives
 
-When processor clock speeds flatlined in 2004, after more than fifteen years 
-of exponential increases, the era of near automatic performance improvements 
-that the HPC application community had previously enjoyed came to an abrupt 
-end. 
-To develop software that will perform well on petascale and exascale systems 
-with thousands of nodes and millions of cores, the list of major challenges 
-that must now be confronted is formidable: 
-1) dramatic escalation in the costs of intrasystem communication between 
-processors and/or levels of memory hierarchy; 
-2) increased heterogeneity of the processing units (mixing CPUs, GPUs, etc. in 
-varying and unexpected design combinations); 
-3) high levels of parallelism and more complex constraints means that 
-cooperating processes must be dynamically and unpredictably scheduled for 
-asynchronous execution; 
-4) software will not run at scale without much better resilience to faults and 
-far more robustness; and 
-5) new levels of self-adaptivity will be required to enable software to 
-modulate process speed in order to satisfy limited energy budgets. 
-The MORSE associate team will tackle the first three challenges in a 
-orchestrating work between research groups respectively specialized in sparse 
-linear algebra, dense linear algebra and runtime systems. 
-The overall objective is to develop robust linear algebra libraries relying on 
-innovative runtime systems that can fully benefit from the potential of those 
-future large-scale complex machines. 
-Challenges 4) and 5) will also be investigated by the different teams in the 
-context of other partnerships, but they will not be the main focus of the 
+When processor clock speeds flatlined in 2004, after more than fifteen years
+of exponential increases, the era of near automatic performance improvements
+that the HPC application community had previously enjoyed came to an abrupt
+end.
+To develop software that will perform well on petascale and exascale systems
+with thousands of nodes and millions of cores, the list of major challenges
+that must now be confronted is formidable:
+1) dramatic escalation in the costs of intrasystem communication between
+processors and/or levels of memory hierarchy;
+2) increased heterogeneity of the processing units (mixing CPUs, GPUs, etc. in
+varying and unexpected design combinations);
+3) high levels of parallelism and more complex constraints mean that
+cooperating processes must be dynamically and unpredictably scheduled for
+asynchronous execution;
+4) software will not run at scale without much better resilience to faults and
+far more robustness; and
+5) new levels of self-adaptivity will be required to enable software to
+modulate process speed in order to satisfy limited energy budgets.
+The MORSE associate team will tackle the first three challenges in a
+coordinated effort between research groups respectively specialized in sparse
+linear algebra, dense linear algebra and runtime systems.
+The overall objective is to develop robust linear algebra libraries relying on
+innovative runtime systems that can fully benefit from the potential of those
+future large-scale complex machines.
+Challenges 4) and 5) will also be investigated by the different teams in the
+context of other partnerships, but they will not be the main focus of the
 associate team as they are much more prospective.
 
 @node Research fields
 @subsection Research fields
 
-The overall goal of the MORSE associate team is to enable advanced numerical 
-algorithms to be executed on a scalable unified runtime system for exploiting 
+The overall goal of the MORSE associate team is to enable advanced numerical
+algorithms to be executed on a scalable unified runtime system for exploiting
 the full potential of future exascale machines.
-We expect advances in three directions based first on strong and closed 
-interactions between the runtime and numerical linear algebra communities. 
-This initial activity will then naturally expand to more focused but still 
+We expect advances in three directions based first on strong and close
+interactions between the runtime and numerical linear algebra communities.
+This initial activity will then naturally expand to more focused but still
 joint research in both fields.
 
 @menu
@@ -72,67 +72,67 @@ joint research in both fields.
 * Linear algebra::
 @end menu
 
-@node Fine interaction between linear algebra and runtime systems 
+@node Fine interaction between linear algebra and runtime systems
 @subsubsection Fine interaction between linear algebra and runtime systems
 
-On parallel machines, HPC applications need to take care of data movement and 
-consistency, which can be either explicitly managed at the level of the 
-application itself or delegated to a runtime system. 
-We adopt the latter approach in order to better keep up with hardware trends 
-whose complexity is growing exponentially. 
-One major task in this project is to define a proper interface between HPC 
-applications and runtime systems in order to maximize productivity and 
-expressivity. 
-As mentioned in the next section, a widely used approach consists in 
-abstracting the application as a DAG that the runtime system is in charge of 
-scheduling. 
-Scheduling such a DAG over a set of heterogeneous processing units introduces a 
-lot of new challenges, such as predicting accurately the execution time of each 
-type of task over each kind of unit, minimizing data transfers between memory 
-banks, performing data prefetching, etc. 
-Expected advances: In a nutshell, a new runtime system API will be designed to 
-allow applications to provide scheduling hints to the runtime system and to get 
+On parallel machines, HPC applications need to take care of data movement and
+consistency, which can be either explicitly managed at the level of the
+application itself or delegated to a runtime system.
+We adopt the latter approach in order to better keep up with hardware trends
+whose complexity is growing exponentially.
+One major task in this project is to define a proper interface between HPC
+applications and runtime systems in order to maximize productivity and
+expressivity.
+As mentioned in the next section, a widely used approach consists in
+abstracting the application as a DAG that the runtime system is in charge of
+scheduling.
+Scheduling such a DAG over a set of heterogeneous processing units introduces a
+lot of new challenges, such as accurately predicting the execution time of each
+type of task over each kind of unit, minimizing data transfers between memory
+banks, performing data prefetching, etc.
+Expected advances: In a nutshell, a new runtime system API will be designed to
+allow applications to provide scheduling hints to the runtime system and to get
 real-time feedback about the consequences of scheduling decisions.
 
 @node Runtime systems
 @subsubsection Runtime systems
 
-A runtime environment is an intermediate layer between the system and the 
-application. 
-It provides low-level functionality not provided by the system (such as 
-scheduling or management of the heterogeneity) and high-level features (such as 
-performance portability). 
-In the framework of this proposal, we will work on the scalability of runtime 
-environment. To achieve scalability it is required to avoid all centralization. 
-Here, the main problem is the scheduling of the tasks. 
-In many task-based runtime environments the scheduler is centralized and 
-becomes a bottleneck as soon as too many cores are involved. 
-It is therefore required to distribute the scheduling decision or to compute a 
-data distribution that impose the mapping of task using, for instance the 
-so-called ``owner-compute'' rule. 
-Expected advances: We will design runtime systems that enable an efficient and 
-scalable use of thousands of distributed multicore nodes enhanced with 
+A runtime environment is an intermediate layer between the system and the
+application.
+It provides low-level functionality not provided by the system (such as
+scheduling or management of the heterogeneity) and high-level features (such as
+performance portability).
+In the framework of this proposal, we will work on the scalability of runtime
+environments. To achieve scalability, it is necessary to avoid any centralization.
+Here, the main problem is the scheduling of the tasks.
+In many task-based runtime environments the scheduler is centralized and
+becomes a bottleneck as soon as too many cores are involved.
+It is therefore necessary to distribute the scheduling decisions or to compute a
+data distribution that imposes the mapping of tasks using, for instance, the
+so-called ``owner-compute'' rule.
+Expected advances: We will design runtime systems that enable an efficient and
+scalable use of thousands of distributed multicore nodes enhanced with
 accelerators.
 
 @node Linear algebra
 @subsubsection Linear algebra
 
-Because of its central position in HPC and of the well understood structure of 
-its algorithms, dense linear algebra has often pioneered new challenges that HPC 
-had to face. 
-Again, dense linear algebra has been in the vanguard of the new era of 
-petascale computing with the design of new algorithms that can efficiently run 
-on a multicore node with GPU accelerators. These algorithms are called 
-``communication-avoiding'' since they have been redesigned to limit the amount 
-of communication between processing units (and between the different levels of 
-memory hierarchy). 
-They are expressed through Direct Acyclic Graphs (DAG) of fine-grained tasks 
-that are dynamically scheduled. Expected advances: First, we plan to investigate 
-the impact of these principles in the case of sparse applications (whose 
-algorithms are slightly more complicated but often rely on dense kernels). 
-Furthermore, both in the dense and sparse cases, the scalability on thousands of 
-nodes is still limited; new numerical approaches need to be found. 
-We will specifically design sparse hybrid direct/iterative methods that 
+Because of its central position in HPC and of the well-understood structure of
+its algorithms, dense linear algebra has often been the first to face the new
+challenges of HPC.
+Again, dense linear algebra has been in the vanguard of the new era of
+petascale computing with the design of new algorithms that can efficiently run
+on a multicore node with GPU accelerators. These algorithms are called
+``communication-avoiding'' since they have been redesigned to limit the amount
+of communication between processing units (and between the different levels of
+memory hierarchy).
+They are expressed through Directed Acyclic Graphs (DAGs) of fine-grained tasks
+that are dynamically scheduled. Expected advances: First, we plan to investigate
+the impact of these principles in the case of sparse applications (whose
+algorithms are slightly more complicated but often rely on dense kernels).
+Furthermore, both in the dense and sparse cases, the scalability on thousands of
+nodes is still limited; new numerical approaches need to be found.
+We will specifically design sparse hybrid direct/iterative methods that
 represent a promising approach.
 
 @node Research papers
@@ -154,39 +154,39 @@ Research papers about MORSE can be found at
 @node CHAMELEON software
 @subsection CHAMELEON software
 
-The main purpose is to address the performance shortcomings of 
+The main purpose is to address the performance shortcomings of
 the @uref{http://www.netlib.org/lapack/, LAPACK}
 and @uref{http://www.netlib.org/scalapack/, ScaLAPACK}
-libraries on multicore processors and multi-socket systems of multicore 
-processors and their inability to efficiently utilize accelerators such as 
+libraries on multicore processors and multi-socket systems of multicore
+processors and their inability to efficiently utilize accelerators such as
 Graphics Processing Units (GPUs).
 
-CHAMELEON is a framework written in C which provides routines to solve dense 
-general systems of linear equations, symmetric positive definite systems of 
-linear equations and linear least squares problems, using LU, Cholesky, QR and 
+CHAMELEON is a framework written in C which provides routines to solve dense
+general systems of linear equations, symmetric positive definite systems of
+linear equations and linear least squares problems, using LU, Cholesky, QR and
 LQ factorizations.
-Real arithmetic and complex arithmetic are supported in both single precision 
+Real arithmetic and complex arithmetic are supported in both single precision
 and double precision.
-It supports Linux and Mac OS/X machines (only tested on Intel x86-64 
+It supports Linux and Mac OS X machines (only tested on Intel x86-64
 architecture).
 
-CHAMELEON is based on @uref{http://icl.cs.utk.edu/plasma/, PLASMA} source 
-code but is not limited to shared-memory environment and can exploit 
+CHAMELEON is based on @uref{http://icl.cs.utk.edu/plasma/, PLASMA} source
+code but is not limited to shared-memory environments and can exploit
 multiple GPUs.
-CHAMELEON is interfaced in a generic way with both 
-@uref{http://icl.cs.utk.edu/quark/, QUARK} and 
+CHAMELEON is interfaced in a generic way with both
+@uref{http://icl.cs.utk.edu/quark/, QUARK} and
 @uref{http://runtime.bordeaux.inria.fr/StarPU/, StarPU} runtime systems.
-This feature allows to analyze in a unified framework how sequential 
-task-based algorithms behave regarding different runtime systems 
-implementations. 
-Using CHAMELEON with @uref{http://runtime.bordeaux.inria.fr/StarPU/, 
-StarPU} runtime system allows to exploit GPUs through 
-kernels provided by @uref{https://developer.nvidia.com/cublas, cuBLAS} and 
-@uref{http://icl.cs.utk.edu/magma/, MAGMA} and clusters of interconnected 
+This feature makes it possible to analyze, in a unified framework, how sequential
+task-based algorithms behave with different runtime system
+implementations.
+Using CHAMELEON with the @uref{http://runtime.bordeaux.inria.fr/StarPU/,
+StarPU} runtime system makes it possible to exploit GPUs through
+kernels provided by @uref{https://developer.nvidia.com/cublas, cuBLAS} and
+@uref{http://icl.cs.utk.edu/magma/, MAGMA} and clusters of interconnected
 nodes with distributed memory (using @uref{http://www.open-mpi.org/, MPI}).
-Computation of very large systems with dense matrices on a cluster of nodes is 
+Computation of very large systems with dense matrices on a cluster of nodes is
 still being experimented with and stabilized.
-It is not expected to get stable performances with the current version using 
+It is not expected to get stable performance with the current version using
 MPI.
 
 
@@ -195,9 +195,9 @@ MPI.
 @node PLASMA's design principles
 @subsection PLASMA's design principles
 
-CHAMELEON is originally based on @uref{http://icl.cs.utk.edu/plasma/, 
-PLASMA} so that design principles are very similar. 
-The content of this section @ref{PLASMA's design principles} has been copied 
+CHAMELEON is originally based on @uref{http://icl.cs.utk.edu/plasma/,
+PLASMA}, so the design principles are very similar.
+The content of this section @ref{PLASMA's design principles} has been copied
 from the @samp{Design principles} section of the PLASMA User's Guide.
 
 @menu
@@ -209,55 +209,55 @@ from the @samp{Design principles} section of the PLASMA User's Guide.
 @node Tile Algorithms
 @subsubsection Tile Algorithms
 
-Tile algorithms are based on the idea of processing the matrix by square tiles 
-of relatively small size, such that a tile fits entirely in one of the cache 
+Tile algorithms are based on the idea of processing the matrix by square tiles
+of relatively small size, such that a tile fits entirely in one of the cache
 levels associated with one core.
-This way a tile can be loaded to the cache and processed completely before being 
+This way a tile can be loaded to the cache and processed completely before being
 evicted back to the main memory.
-Of the three types of cache misses, @emph{compulsory}, @emph{capacity} 
-and @emph{conflict}, the use of tile algorithms minimizes the number of 
-capacity misses, since each operation loads the amount of data that does not 
+Of the three types of cache misses, @emph{compulsory}, @emph{capacity}
+and @emph{conflict}, the use of tile algorithms minimizes the number of
+capacity misses, since each operation loads the amount of data that does not
 ``overflow'' the cache.
 
-For some operations such as matrix multiplication and Cholesky factorization, 
+For some operations such as matrix multiplication and Cholesky factorization,
 translating the classic algorithm to the tile algorithm is trivial.
-In the case of matrix multiplication, the tile algorithm is simply a product of 
-applying the technique of @emph{loop tiling} to the canonical definition of 
+In the case of matrix multiplication, the tile algorithm is simply a product of
+applying the technique of @emph{loop tiling} to the canonical definition of
 three nested loops.
 It is very similar for the Cholesky factorization.
-The @strong{left-looking} definition of Cholesky factorization from LAPACK is a 
-loop with a sequence of calls to four routines: xSYRK (symmetric 
-@strong{rank-k} update), xPOTRF (Cholesky factorization of a small block on the 
+The @strong{left-looking} definition of Cholesky factorization from LAPACK is a
+loop with a sequence of calls to four routines: xSYRK (symmetric
+@strong{rank-k} update), xPOTRF (Cholesky factorization of a small block on the
 diagonal), xGEMM (matrix multiplication) and xTRSM (triangular solve).
-If the xSYRK, xGEMM and xTRSM operations are expressed with the canonical 
-definition of three nested loops and the technique of loop tiling is applied, 
+If the xSYRK, xGEMM and xTRSM operations are expressed with the canonical
+definition of three nested loops and the technique of loop tiling is applied,
 the tile algorithm results.
-Since the algorithm is produced by simple reordering of operations, neither the 
+Since the algorithm is produced by simple reordering of operations, neither the
 number of operations nor numerical stability of the algorithm are affected.
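+
+For illustration, one common (right-looking) ordering of the resulting tile
+Cholesky algorithm can be sketched as follows (lower-triangular case, tile
+indices only; this is a schematic sketch, not the actual CHAMELEON code):
+@example
+/* A is an NT x NT grid of tiles; only the lower triangle is referenced. */
+for (k = 0; k < NT; k++) @{
+  xPOTRF(A[k][k]);                           /* factorize diagonal tile    */
+  for (i = k+1; i < NT; i++)
+    xTRSM(A[k][k], A[i][k]);                 /* solve tiles below diagonal */
+  for (i = k+1; i < NT; i++) @{
+    xSYRK(A[i][k], A[i][i]);                 /* update diagonal tile       */
+    for (j = k+1; j < i; j++)
+      xGEMM(A[i][k], A[j][k], A[i][j]);      /* update off-diagonal tile   */
+  @}
+@}
+@end example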
 
-The situation becomes slightly more complicated for LU and QR factorizations, 
-where the classic algorithms factorize an entire panel of the matrix (a block 
+The situation becomes slightly more complicated for LU and QR factorizations,
+where the classic algorithms factorize an entire panel of the matrix (a block
 of columns) at every step of the algorithm.
-One can observe, however, that the process of matrix factorization is 
-synonymous with introducing zeros in approproate places and a tile algorithm 
+One can observe, however, that the process of matrix factorization is
+synonymous with introducing zeros in appropriate places, and a tile algorithm
 can be thought of as one that zeroes one tile of the matrix at a time.
-This process is referred to as updating of a factorization or @emph{incremental 
+This process is referred to as updating of a factorization or @emph{incremental
 factorization}.
-The process is equivalent to factorizing the top tile of a panel, then placing 
-the upper triangle of the result on top of the tile blow and factorizing again, 
+The process is equivalent to factorizing the top tile of a panel, then placing
+the upper triangle of the result on top of the tile below and factorizing again,
 then moving to the next tile and so on.
-Here, the tile LU and QR algorithms perform slightly more floating point 
+Here, the tile LU and QR algorithms perform slightly more floating point
 operations and require slightly more memory for auxiliary data.
-Also, the tile LU factorization applies a different pivoting pattern and, as a 
+Also, the tile LU factorization applies a different pivoting pattern and, as a
 result, is less numerically stable than the classic LU with partial pivoting.
-Numerical stability is not an issue in case of the tile QR, which relies on 
-orthogonal transformations (Householder reflections), which are numerically 
+Numerical stability is not an issue in the case of the tile QR, which relies on
+orthogonal transformations (Householder reflections), which are numerically
 stable.
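+
+As a sketch of one step of the incremental factorization described above (our
+own schematic pseudo-code: the kernel names are those of PLASMA's tile LU
+shown below, but argument lists and loop bounds are illustrative):
+@verbatim
+/* Step k of tile LU on an MT x NT grid of tiles A[m][n] */
+dgetrf(A[k][k], IPIV[k][k]);               /* factorize diagonal tile   */
+for (n = k+1; n < NT; n++)
+    dgessm(IPIV[k][k], A[k][k], A[k][n]);  /* apply it to the tile row  */
+for (m = k+1; m < MT; m++) {
+    dtstrf(A[k][k], A[m][k], IPIV[m][k]);  /* zero tile A[m][k] against */
+                                           /* the triangle above it     */
+    for (n = k+1; n < NT; n++)             /* propagate the update      */
+        dssssm(A[k][n], A[m][n], A[m][k], IPIV[m][k]);
+}
+@end verbatim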
 
 @center @image{tile_lu,7cm,7cm}
 
-Schematic illustration of the tile LU factorization (kernel names for 
-real arithmetics in double precision), courtesey of the 
+Schematic illustration of the tile LU factorization (kernel names for
+real arithmetic in double precision), courtesy of the
 @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
 
 @comment  //////////////////////////////////////////////////////////
@@ -265,67 +265,67 @@ real arithmetics in double precision), courtesey of the
 @node Tile Data Layout
 @subsubsection Tile Data Layout
 
-Tile layout is based on the idea of storing the matrix by square tiles 
-of relatively small size, such that each tile occupies a continuous memory 
+Tile layout is based on the idea of storing the matrix by square tiles
+of relatively small size, such that each tile occupies a contiguous memory
 region.
-This way a tile can be loaded to the cache memory efficiently and the risk of 
-evicting it from the cache memory before it is completely processed is 
+This way a tile can be loaded to the cache memory efficiently and the risk of
+evicting it from the cache memory before it is completely processed is
 minimized.
-Of the three types of cache misses, @emph{compulsory}, @emph{capacity} and 
-@emph{conflict}, the use of tile layout minimizes the number of conflict 
-misses, since a continuous region of memory will completely fill out a 
+Of the three types of cache misses, @emph{compulsory}, @emph{capacity} and
+@emph{conflict}, the use of tile layout minimizes the number of conflict
+misses, since a contiguous region of memory will completely fill out a
 @strong{set-associative} cache memory before an eviction can happen.
-Also, from the standpoint of multithreaded execution, the probability of 
-@emph{false sharing} is minimized. 
-It can only affect the cache lines containing the beginning and the ending of a 
+Also, from the standpoint of multithreaded execution, the probability of
+@emph{false sharing} is minimized.
+It can only affect the cache lines containing the beginning and the ending of a
 tile.
 
-In standard @strong{cache-based} architecture, tiles continously laid out in 
+In standard @strong{cache-based} architectures, tiles contiguously laid out in
 memory maximize the profit from automatic prefetching.
-Tile layout is also beneficial in situations involving the use of accelerators, 
+Tile layout is also beneficial in situations involving the use of accelerators,
 where explicit communication of tiles through DMA transfers is required, such as
-moving tiles between the system memory and the local store in Cell B. E. or 
+moving tiles between the system memory and the local store in Cell B. E. or
 moving tiles between the host memory and the device memory in GPUs.
-In most circumstances tile layout also minimizes the number of TLB 
+In most circumstances tile layout also minimizes the number of TLB
 misses and conflicts to memory banks or partitions.
-With the standard (@strong{column-major}) layout, access to each column of 
+With the standard (@strong{column-major}) layout, access to each column of
 a tile is much more likely
-to cause a conflict miss, a false sharing miss, a TLB miss or a bank 
+to cause a conflict miss, a false sharing miss, a TLB miss or a bank
 or partition conflict.
-The use of the standard layout for dense matrix operations is a 
+The use of the standard layout for dense matrix operations is a
 performance minefield.
-Although occasionally one can pass through it unscathed, the risk of hitting a 
+Although occasionally one can pass through it unscathed, the risk of hitting a
 spot deadly to performance is very high.
 
-Another property of the layout utilized in PLASMA is that it is ``flat'', 
-meaning that it does not involve a level of indirection. Each tile stores a 
-small square submatrix of the main matrix in a @strong{column-major} layout. In 
-turn, the main matrix is an arrangement of tiles immediately following one 
+Another property of the layout utilized in PLASMA is that it is ``flat'',
+meaning that it does not involve a level of indirection. Each tile stores a
+small square submatrix of the main matrix in a @strong{column-major} layout. In
+turn, the main matrix is an arrangement of tiles immediately following one
 another in a @strong{column-major} layout.
-The offset of each tile can be calculated through address arithmetics and 
+The offset of each tile can be calculated through address arithmetic and
 does not involve pointer indirection.
-Alternatively, a matrix could be represented as an array of pointers to 
-tiles, located anywhere in memory. Such layout would be a radical 
+Alternatively, a matrix could be represented as an array of pointers to
+tiles, located anywhere in memory. Such a layout would be a radical
 and unjustifiable departure from LAPACK and ScaLAPACK.
-Flat tile layout is a natural progression from LAPACK's @strong{column-major} 
+Flat tile layout is a natural progression from LAPACK's @strong{column-major}
 layout and ScaLAPACK's @strong{block-cyclic} layout.
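+
+A minimal sketch of this address arithmetic (our own illustration; names are
+hypothetical) for an MT-by-NT grid of NB-by-NB tiles stored one after another
+in @strong{column-major} order of tiles:
+@verbatim
+/* Pointer to the first element of tile (m, n) in a flat tile layout. */
+/* base points to the matrix, m and n are tile indexes, MT is the     */
+/* number of tile rows; each tile holds NB*NB contiguous elements.    */
+double *tile_addr(double *base, int m, int n, int MT, int NB)
+{
+    return base + ((size_t)n * MT + m) * NB * NB;
+}
+@end verbatim
+With the padding described in the next paragraph, @code{NB*NB} would simply
+be replaced by the padded tile size.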
 
-Another related property of PLASMA's tile layout is that it includes 
-provisions for padding of tiles, i.e., the actual region of memory designated 
+Another related property of PLASMA's tile layout is that it includes
+provisions for padding of tiles, i.e., the actual region of memory designated
 for a tile can be larger than the memory occupied by the actual data.
-This allows to force a certain alignment of tile boundaries, while using the 
+This allows forcing a certain alignment of tile boundaries, while using the
 flat organization described in the previous paragraph.
-The motivation is that, at the price of small memory overhead, alignment of 
-tile boundaries may prove benefivial in multiple scenarios involving 
-memory systems of standard multicore processors, as well as accelerators. 
-The issues that come into play are, again, the use of TLBs and memory banks or 
+The motivation is that, at the price of small memory overhead, alignment of
+tile boundaries may prove beneficial in multiple scenarios involving
+memory systems of standard multicore processors, as well as accelerators.
+The issues that come into play are, again, the use of TLBs and memory banks or
 partitions.
 
 @center @image{tile_layout,7cm,7cm}
 
-Schematic illustration of the tile layout with @strong{column-major} 
-order of tiles, @strong{column-major} order of elements within tiles and 
-(optional) padding for enforcing a certain alighment of tile bondaries, 
+Schematic illustration of the tile layout with @strong{column-major}
+order of tiles, @strong{column-major} order of elements within tiles and
+(optional) padding for enforcing a certain alignment of tile boundaries,
 courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
 
 @comment  %//////////////////////////////////////////////////////////
@@ -333,47 +333,47 @@ courtesey of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
 @node Dynamic Task Scheduling
 @subsubsection Dynamic Task Scheduling
 
-Dynamic scheduling is the idea of assigning work to cores based on the 
-availability of data for processing at any given point in time and is also 
+Dynamic scheduling is the idea of assigning work to cores based on the
+availability of data for processing at any given point in time and is also
 referred to as @strong{@emph{data-driven}} scheduling.
-The concept is related closely to the idea of expressing computation through a 
-task graph, often referred to as the DAG (@emph{Direct Acyclic Graph}), and 
+The concept is related closely to the idea of expressing computation through a
+task graph, often referred to as the DAG (@emph{Directed Acyclic Graph}), and
 the flexibility of exploring the DAG at runtime.
-Thus, to a large extent, dynamic scheduling is synonymous with 
+Thus, to a large extent, dynamic scheduling is synonymous with
 @strong{@emph{runtime scheduling}}.
-An important concept here is the one of the @emph{critical path}, which defines 
-the upper bound on the achievable parallelism, and needs to be pursued at the 
+An important concept here is that of the @emph{critical path}, which defines
+the upper bound on the achievable parallelism, and needs to be pursued at the
 maximum speed.
-This is in direct opposition to the @strong{@emph{fork-and-join}} or 
-@strong{@emph{data-parallel}} programming models, where 
+This is in direct opposition to the @strong{@emph{fork-and-join}} or
+@strong{@emph{data-parallel}} programming models, where
 artificial synchronization points expose serial sections of
-the code, where multiple cores are idle, while sequential processing takes 
+the code, where multiple cores are idle, while sequential processing takes
 place.
 The use of dynamic scheduling introduces a @strong{trade-off}, though.
-The more dynamic (flexible) scheduling is, the more centralized (and less 
+The more dynamic (flexible) scheduling is, the more centralized (and less
 scalable) the scheduling mechanism is.
-For that reason, currently PLASMA uses two scheduling 
-mechanisms, one which is fully dynamic and one where work is assigned 
+For that reason, currently PLASMA uses two scheduling
+mechanisms, one which is fully dynamic and one where work is assigned
 statically and dependency checks are done at runtime.
 
-The first scheduling mechanism relies on unfolding a @emph{sliding window} of 
-the task graph at runtime and scheduling work by resolving data hazards: 
-@emph{Read After Write~(RAW)}, @emph{Write After Read~(WAR)} and @emph{Write 
-After Write~(WAW)}, a technique analogous to instruction scheduling in 
+The first scheduling mechanism relies on unfolding a @emph{sliding window} of
+the task graph at runtime and scheduling work by resolving data hazards:
+@emph{Read After Write~(RAW)}, @emph{Write After Read~(WAR)} and @emph{Write
+After Write~(WAW)}, a technique analogous to instruction scheduling in
 superscalar processors.
-It also relies on @strong{@emph{work-stealing}} for balanding the 
+It also relies on @strong{@emph{work-stealing}} for balancing the
 load among all cores.
-The second scheduling mechanism relies on statically designating a path through 
-the execution space of the algorithm to each core and following a 
-cycle: transition to a task, wait for its dependencies, execute it, update the 
+The second scheduling mechanism relies on statically designating a path through
+the execution space of the algorithm to each core and following a
+cycle: transition to a task, wait for its dependencies, execute it, update the
 overall progress.
-Task are identified by tuples and task transitions are done through locally 
+Tasks are identified by tuples and task transitions are done through locally
 evaluated formulas.
-Progress information can be centralized, replicated or distributed (currently 
+Progress information can be centralized, replicated or distributed (currently
 centralized).
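+
+To make the data hazards of the first mechanism concrete, consider a
+hypothetical sequence of tile operations (our own illustration, not library
+code), all touching the same tile @code{A11}:
+@verbatim
+dpotrf(A11);       /* task 1: reads and writes A11                  */
+dtrsm(A11, A21);   /* task 2: reads A11 -> RAW dependency on task 1 */
+dlacpy(B11, A11);  /* task 3: overwrites A11 -> WAR on task 2,      */
+                   /*                           WAW on task 1       */
+@end verbatim
+A runtime resolving these hazards may run independent tasks in any order,
+but must serialize the three accesses above.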
 
 @center @image{trace_qr,12cm,5cm}
 
-A trace of the tile QR factorization executing on eight cores without 
-any global synchronization points (kernel names for real arithmetics in single 
+A trace of the tile QR factorization executing on eight cores without
+any global synchronization points (kernel names for real arithmetic in single
 precision), courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
diff --git a/docs/texinfo/chapters/using.texi b/docs/texinfo/chapters/using.texi
index db544254a..f42acdb6b 100644
--- a/docs/texinfo/chapters/using.texi
+++ b/docs/texinfo/chapters/using.texi
@@ -15,34 +15,34 @@
 @node Using CHAMELEON executables
 @section Using CHAMELEON executables
 
-CHAMELEON provides several test executables that are compiled and link with 
+CHAMELEON provides several test executables that are compiled and linked with
 CHAMELEON's stack of dependencies.
-Instructions about the arguments to give to executables are accessible thanks 
+Instructions about the arguments to give to executables are available through
 the @option{-[-]help} or @option{-[-]h} option.
-This set of binaries are separated into three categories and can be found in 
+These binaries are separated into three categories and can be found in
 three different directories:
 
 @itemize @bullet
 
   @item example
 
-  contains examples of API usage and more specifically the 
-  sub-directory lapack_to_morse/ provides a tutorial that explain how to use 
-  CHAMELEON functionalities starting from a full LAPACK code, see 
+  contains examples of API usage and more specifically the
+  sub-directory lapack_to_morse/ provides a tutorial that explains how to use
+  CHAMELEON functionalities starting from a full LAPACK code, see
 @ref{Tutorial LAPACK to CHAMELEON}
 
   @item testing
 
-  contains testing drivers to check numerical correctness of 
+  contains testing drivers to check numerical correctness of
   CHAMELEON linear algebra routines with a wide range of parameters
   @example
   ./testing/stesting 4 1 LANGE 600 100 700
   @end example
   The first two arguments are the numbers of cores and GPUs to use.
   The third one is the name of the algorithm to test.
-  The other arguments depend on the algorithm, here it lies for the number of 
+  The other arguments depend on the algorithm; here they stand for the number
+of rows, columns and the leading dimension of the problem.
-  
+
   The names of the algorithms available for testing are:
   @itemize @bullet
     @item LANGE: norms of matrices Infinite, One, Max, Frobenius
@@ -53,93 +53,93 @@ three different directories:
     @item SYMM: symmetric matrix-matrix multiply
     @item SYRK: symmetric matrix-matrix rank k update
     @item SYR2K: symmetric matrix-matrix rank 2k update
-    @item PEMV: matrix-vector multiply with pentadiagonal matrix    
+    @item PEMV: matrix-vector multiply with pentadiagonal matrix
     @item TRMM: triangular matrix-matrix multiply
     @item TRSM: triangular solve, multiple rhs
     @item POSV: solve linear systems with symmetric positive-definite matrix
-    @item GESV_INCPIV: solve linear systems with general matrix    
+    @item GESV_INCPIV: solve linear systems with general matrix
     @item GELS: linear least squares with general matrix
   @end itemize
 
   @item timing
 
   contains timing drivers to assess performances of CHAMELEON routines.
-  There are two sets of executables, those who do not use the tile interface 
+  There are two sets of executables, those that do not use the tile interface
 and those that do (with _tile in the name of the executable).
-  Executables without tile interface allocates data following LAPACK 
-conventions and these data can be given as arguments to CHAMELEON routines 
+  Executables without the tile interface allocate data following LAPACK
+conventions and these data can be given as arguments to CHAMELEON routines
 as you would do with LAPACK.
-  Executables with tile interface generate directly the data in the format 
+  Executables with the tile interface generate the data directly in the format
  CHAMELEON tile algorithms use to submit tasks to the runtime system.
-  Executables with tile interface should be more performant because no data 
+  Executables with the tile interface should perform better because no data
 copy from the LAPACK matrix layout to the tile matrix layout is necessary.
   Calling example:
   @example
-  ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320 
-                       --threads=9 --gpus=3 
+  ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320
+                       --threads=9 --gpus=3
                        --nowarmup
   @end example
-  
+
   List of main options that can be used in timing:
   @itemize @bullet
     @item @option{--help}: show usage
-    @item @option{--threads}: Number of CPU workers (default: 
+    @item @option{--threads}: number of CPU workers (default:
 @option{_SC_NPROCESSORS_ONLN})
     @item @option{--gpus}: number of GPU workers (default: @option{0})
-    @item @option{--n_range=R}: range of N values, with 
+    @item @option{--n_range=R}: range of N values, with
 @option{R=Start:Stop:Step}
 (default: @option{500:5000:500})
     @item @option{--m=X}: dimension (M) of the matrices (default: @option{N})
-    @item @option{--k=X}: dimension (K) of the matrices (default: @option{1}), 
-useful for GEMM algorithm (k is the shared dimension and must be defined >1 to 
+    @item @option{--k=X}: dimension (K) of the matrices (default: @option{1}),
+useful for GEMM algorithm (k is the shared dimension and must be defined >1 to
 consider matrices and not vectors)
    @item @option{--nrhs=X}: number of right-hand sides (default: @option{1})
     @item @option{--nb=X}: block/tile size. (default: @option{128})
     @item @option{--ib=X}: inner-blocking/IB size. (default: @option{32})
-    @item @option{--niter=X}: number of iterations performed for each test 
+    @item @option{--niter=X}: number of iterations performed for each test
 (default: @option{1})
-    @item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ 
+    @item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ
 factorization. X is the size of each subdomain (default: @option{0})
     @item @option{--[no]check}: check result (default: @option{nocheck})
-    @item @option{--[no]profile}: print profiling informations (default: 
+    @item @option{--[no]profile}: print profiling information (default:
 @option{noprofile})
-    @item @option{--[no]trace}: enable/disable trace generation (default: 
+    @item @option{--[no]trace}: enable/disable trace generation (default:
 @option{notrace})
-    @item @option{--[no]dag}: enable/disable DAG generation (default: 
+    @item @option{--[no]dag}: enable/disable DAG generation (default:
 @option{nodag})
     @item @option{--[no]inv}: check on inverse (default: @option{noinv})
-    @item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs 
+    @item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs
 (default: @option{0})
   @end itemize
-  
+
   List of timing algorithms available:
   @itemize @bullet
     @item LANGE: norms of matrices
     @item GEMM: general matrix-matrix multiply
-    @item TRSM: triangular solve    
-    @item POTRF: Cholesky factorization with a symmetric 
+    @item TRSM: triangular solve
+    @item POTRF: Cholesky factorization with a symmetric
 positive-definite matrix
     @item POSV: solve linear systems with symmetric positive-definite matrix
     @item GETRF_NOPIV: LU factorization of a general matrix
-using the tile LU algorithm without row pivoting 
+using the tile LU algorithm without row pivoting
     @item GESV_NOPIV: solve linear system for a general matrix
-using the tile LU algorithm without row pivoting 
+using the tile LU algorithm without row pivoting
     @item GETRF_INCPIV: LU factorization of a general matrix
-using the tile LU algorithm with partial tile pivoting with row interchanges 
+using the tile LU algorithm with partial tile pivoting with row interchanges
     @item GESV_INCPIV: solve linear system for a general matrix
 using the tile LU algorithm with partial tile pivoting with row interchanges
     @item GEQRF: QR factorization of a general matrix
-    @item GELS: solves overdetermined or underdetermined linear systems 
+    @item GELS: solves overdetermined or underdetermined linear systems
 involving a general matrix using the QR or the LQ factorization
   @end itemize
-  
+
 @end itemize
 
 @node Linking an external application with CHAMELEON libraries
 @section Linking an external application with CHAMELEON libraries
 
-Compilation and link with CHAMELEON libraries have been tested with 
+Compilation and link with CHAMELEON libraries have been tested with
 @strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}.
 
 @menu
@@ -151,16 +151,16 @@ Compilation and link with CHAMELEON libraries have been tested with
 @node Static linking in C
 @subsection Static linking in C
 
-Lets imagine you have a file main.c that you want to link with CHAMELEON 
+Let's imagine you have a file main.c that you want to link with CHAMELEON
 static libraries.
-Lets consider @file{/home/yourname/install/chameleon} is the install directory 
+Let's consider that @file{/home/yourname/install/chameleon} is the install directory
 of CHAMELEON containing sub-directories @file{include/} and @file{lib/}.
 Your compilation command with the gcc compiler could then be:
 @example
 gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c
 @end example
 
-Now if you want to link your application with CHAMELEON static libraries, you 
+Now if you want to link your application with CHAMELEON static libraries, you
 could do:
 @example
 gcc main.o -o main                                         \
@@ -170,31 +170,31 @@ gcc main.o -o main                                         \
 -lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64           \
 -lmkl_sequential -lmkl_core -lpthread -lm -lrt
 @end example
-As you can see in this example, we also link with some dynamic libraries 
-@option{starpu-1.1}, @option{Intel MKL} libraries (for 
-BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) and 
+As you can see in this example, we also link with some dynamic libraries
+@option{starpu-1.1}, @option{Intel MKL} libraries (for
+BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) and
 @option{rt}.
 These libraries will depend on the configuration of your CHAMELEON build.
-You can find these dependencies in .pc files we generate during compilation and 
-that are installed in the sub-directory @file{lib/pkgconfig} of your 
+You can find these dependencies in .pc files we generate during compilation and
+that are installed in the sub-directory @file{lib/pkgconfig} of your
 CHAMELEON install directory.
-Note also that you could need to specify where to find these libraries with 
+Note also that you may need to specify where to find these libraries with
 @option{-L} option of your compiler/linker.
 
-Before to run your program, make sure that all shared libraries paths your 
+Before running your program, make sure that all shared library paths your
 executable depends on are known.
 Enter @code{ldd main} to check.
-If some shared libraries paths are missing append them in the 
-@env{LD_LIBRARY_PATH} (for Linux systems) environment variable 
+If some shared library paths are missing, append them to the
+@env{LD_LIBRARY_PATH} (for Linux systems) environment variable
 (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows).
 
-@node Dynamic linking in C 
+@node Dynamic linking in C
 @subsection Dynamic linking in C
 
-For dynamic linking (need to build CHAMELEON with CMake 
-option @option{BUILD_SHARED_LIBS=ON}) it is similar to static compilation/link 
-but instead of specifying path to your static libraries you indicate the path 
-to dynamic libraries with @option{-L} option and you give the name of libraries 
+For dynamic linking (CHAMELEON must be built with the CMake
+option @option{BUILD_SHARED_LIBS=ON}), the process is similar to static
+compilation/link, but instead of specifying the paths to your static libraries
+you indicate the path to the dynamic libraries with the @option{-L} option and
 give the names of the libraries with the @option{-l} option, like this:
 @example
 gcc main.o -o main                               \
@@ -204,11 +204,11 @@ gcc main.o -o main                               \
 -lmkl_sequential -lmkl_core -lpthread -lm -lrt
 @end example
 
-Note that an update of your environment variable 
-@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows) 
+Note that an update of your environment variable
+@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows)
 with the path of the libraries may be required before executing, for example:
 @example
-export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib 
+export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib
 @end example
 
 @node Build a Fortran program with CHAMELEON
@@ -249,10 +249,10 @@ gfortran main.o -o main                          \
 @node CHAMELEON API
 @section CHAMELEON API
 
-CHAMELEON provides routines to solve dense general systems of linear 
-equations, symmetric positive definite systems of linear equations and linear 
+CHAMELEON provides routines to solve dense general systems of linear
+equations, symmetric positive definite systems of linear equations and linear
 least squares problems, using LU, Cholesky, QR and LQ factorizations.
-Real arithmetic and complex arithmetic are supported in both single precision 
+Real arithmetic and complex arithmetic are supported in both single precision
 and double precision.
 Routines that compute linear algebra are of the following form:
 @example
@@ -260,28 +260,28 @@ MORSE_name[_Tile[_Async]]
 @end example
 @itemize @bullet
 @item all user routines are prefixed with @code{MORSE}
-@item @code{name} follows BLAS/LAPACK naming scheme for algorithms 
+@item @code{name} follows the BLAS/LAPACK naming scheme for algorithms
 (@emph{e.g.} sgemm for general matrix-matrix multiply in single precision)
 @item CHAMELEON provides three interface levels, illustrated just after this list
   @itemize @minus
-  @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE, 
-matrices are given following the LAPACK data layout (1-D array column-major). 
-It involves copy of data from LAPACK layout to tile layout and conversely (to 
+  @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE,
+matrices are given following the LAPACK data layout (1-D array column-major).
+It involves copying data from the LAPACK layout to the tile layout and back (to
 update LAPACK data), see @ref{Step1}.
-  @item @code{MORSE_name_Tile}: the tile interface avoid copies between LAPACK 
-and tile layouts. It is the standard interface of CHAMELEON and it should 
-achieved better performance than the previous simplest interface. The data are 
+  @item @code{MORSE_name_Tile}: the tile interface avoids copies between LAPACK
+and tile layouts. It is the standard interface of CHAMELEON and should
+achieve better performance than the simpler interface above. The data are
 given through a specific structure called a descriptor, see @ref{Step2}.
-  @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids 
-synchonization barrier normally called between @code{Tile} routines. 
-At the end of an @code{Async} function, completion of tasks is not guarentee 
-and data are not necessarily up-to-date. 
-To ensure that tasks have been all executed a synchronization function has to 
+  @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids
+the synchronization barrier normally called between @code{Tile} routines.
+At the end of an @code{Async} function, completion of tasks is not guaranteed
+and data are not necessarily up-to-date.
+To ensure that all tasks have been executed, a synchronization function has to
 be called after the sequence of @code{Async} functions, see @ref{Step4}.
   @end itemize
 @end itemize
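+
+As an illustration of the three interface levels, here are the three variants
+of the double-precision Cholesky factorization (a sketch: the
+@code{_Tile_Async} signature is our assumption, based on the sequence/request
+objects described in @ref{Step4}):
+@verbatim
+/* LAPACK-layout interface: A is a column-major array */
+MORSE_dpotrf( uplo, N, A, LDA );
+
+/* Tile interface: descA is a MORSE_desc_t built beforehand */
+MORSE_dpotrf_Tile( uplo, descA );
+
+/* Asynchronous tile interface: no implicit barrier */
+MORSE_dpotrf_Tile_Async( uplo, descA, sequence, &request );
+@end verbatim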
 
-MORSE routine calls have to be precede from 
+MORSE routine calls have to be preceded by
 @example
 MORSE_Init( NCPU, NGPU );
 @end example
@@ -300,37 +300,37 @@ to free some data and finalize the runtime and/or MPI.
 @subsection Tutorial LAPACK to CHAMELEON
 
 This tutorial is dedicated to the API usage of CHAMELEON.
-The idea is to start from a simple code and step by step explain how to 
-use CHAMELEON routines. 
-The first step is a full BLAS/LAPACK code without dependencies to CHAMELEON, 
+The idea is to start from a simple code and step by step explain how to
+use CHAMELEON routines.
+The first step is a full BLAS/LAPACK code without dependencies on CHAMELEON,
 a code that most users should easily understand.
-Then, the different interfaces CHAMELEON provides are exposed, from the 
+Then, the different interfaces CHAMELEON provides are exposed, from the
 simplest API (step1) to more complicated ones (until step4).
 The way some important parameters are set is discussed in step5.
 Finally step6 is an example about distributed computation with MPI.
 
-Source files can be found in the @file{example/lapack_to_morse/} 
+Source files can be found in the @file{example/lapack_to_morse/}
 directory.
-If CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then source 
+If CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then source
 files are compiled with the project libraries.
 The arithmetic precision is @code{double}.
 To execute a step @samp{X}, enter the following command:
 @example
 ./step@samp{X} --option1 --option2 ...
 @end example
-Instructions about the arguments to give to executables are accessible thanks 
+Instructions about the arguments to give to executables are available through
 the @option{-[-]help} or @option{-[-]h} option.
 Note that default values exist for the options.
 
 For all steps, the program solves a linear system @math{Ax=B}.
-The matrix values are randomly generated but ensure that matrix @math{A} is 
-symmetric positive definite so that @math{A} can be factorized in a @math{LL^T} 
+The matrix values are randomly generated but chosen so that matrix @math{A} is
+symmetric positive definite so that @math{A} can be factorized in a @math{LL^T}
 form using the Cholesky factorization.
 
 
 Let's comment on the different steps of the tutorial:
 @menu
-* Step0:: a simple Cholesky example using the C interface of 
+* Step0:: a simple Cholesky example using the C interface of
 BLAS/LAPACK
 * Step1:: introduces the LAPACK equivalent interface of MORSE
 * Step2:: introduces the tile interface
@@ -343,14 +343,14 @@ BLAS/LAPACK
 @node Step0
 @subsubsection Step0
 
-The C interface of BLAS and LAPACK, that is, CBLAS and 
-LAPACKE, are used to solve the system. The size of the system (matrix) and the 
-number of right hand-sides can be given as arguments to the executable (be 
+The C interface of BLAS and LAPACK, that is, CBLAS and
+LAPACKE, is used to solve the system. The size of the system (matrix) and the
+number of right hand-sides can be given as arguments to the executable (be
 careful not to give huge numbers if you do not have an infinite amount of RAM!).
-As for every step, the correctness of the solution is checked by calculating 
+As for every step, the correctness of the solution is checked by calculating
 the norm @math{||Ax-B||/(||A||||x||+||B||)}.
-The time spent in factorization+solve is recorded and, because we know exactly 
-the number of operations of these algorithms, we deduce the number of 
+The time spent in factorization+solve is recorded and, because we know exactly
+the number of operations of these algorithms, we deduce the number of
 operations that have been processed per second (in GFlops/s).
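+
+A minimal sketch of such a residual check (our own code, using only CBLAS and
+LAPACKE calls; @code{N} rows, @code{NRHS} right-hand sides, @code{R} a saved
+copy of @code{B}, column-major storage):
+@verbatim
+/* R = A*X - R, then take infinity norms of all operands */
+cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+             N, NRHS, N, 1.0, A, N, X, N, -1.0, R, N );
+double anorm = LAPACKE_dlange( LAPACK_COL_MAJOR, 'I', N, N,    A, N );
+double xnorm = LAPACKE_dlange( LAPACK_COL_MAJOR, 'I', N, NRHS, X, N );
+double bnorm = LAPACKE_dlange( LAPACK_COL_MAJOR, 'I', N, NRHS, B, N );
+double rnorm = LAPACKE_dlange( LAPACK_COL_MAJOR, 'I', N, NRHS, R, N );
+double error = rnorm / ( anorm * xnorm + bnorm );
+@end verbatim
+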
 The important part of the code that solves the problem is:
 @verbatim
@@ -381,10 +381,10 @@ cblas_dtrsm(
 @node Step1
 @subsubsection Step1
 
-It introduces the simplest CHAMELEON interface which is equivalent to 
+It introduces the simplest CHAMELEON interface which is equivalent to
 CBLAS/LAPACKE.
-The code is very similar to step0 but instead of calling CBLAS/LAPACKE 
-functions, we call CHAMELEON equivalent functions. 
+The code is very similar to step0 but instead of calling CBLAS/LAPACKE
+functions, we call their CHAMELEON equivalents.
 The solving code becomes:
 @verbatim
 /* Factorization: */
@@ -393,30 +393,30 @@ MORSE_dpotrf( UPLO, N, A, N );
 MORSE_dpotrs(UPLO, N, NRHS, A, N, X, N);
 @end verbatim
 The API is almost the same so that it is easy to use for beginners.
-It is important to keep in mind that before any call to MORSE routines, 
+It is important to keep in mind that before any call to MORSE routines,
 @code{MORSE_Init} has to be invoked to initialize MORSE and the runtime system.
 Example:
 @verbatim
 MORSE_Init( NCPU, NGPU );
 @end verbatim
-After all MORSE calls have been done, a call to @code{MORSE_Finalize} is 
+After all MORSE calls have been done, a call to @code{MORSE_Finalize} is
 required to free some data and finalize the runtime and/or MPI.
 @verbatim
 MORSE_Finalize();
 @end verbatim
-We use MORSE routines with the LAPACK interface which means the routines 
+We use MORSE routines with the LAPACK interface, which means the routines
 accept the same matrix format as LAPACK (1-D array column-major).
-Note that we copy the matrix to get it in our own tile structures, see details 
-about this format here @ref{Tile Data Layout}. 
+Note that we copy the matrix to get it in our own tile structures, see details
+about this format in @ref{Tile Data Layout}.
 This means you may incur an overhead from these copies.
 
 @node Step2
 @subsubsection Step2
 
-This program is a copy of step1 but instead of using the LAPACK interface which 
+This program is a copy of step1 but instead of using the LAPACK interface which
 leads to copying LAPACK matrices inside MORSE routines, we use the tile interface.
 We will still use the standard matrix format but we will see how to use this
-matrix to create a MORSE descriptor, a structure wrapping data on which we want 
+matrix to create a MORSE descriptor, a structure wrapping data on which we want
 to apply sequential task-based algorithms.
 The solving code becomes:
 @verbatim
@@ -425,27 +425,27 @@ MORSE_dpotrf_Tile( UPLO, descA );
 /* Solve: */
 MORSE_dpotrs_Tile( UPLO, descA, descX );
 @end verbatim
-To use the tile interface, a specific structure @code{MORSE_desc_t} must be 
+To use the tile interface, a specific structure @code{MORSE_desc_t} must be
 created.
 This can be achieved in different ways.
 @enumerate
-@item Use the existing function @code{MORSE_Desc_Create}: means the 
-matrix data are considered contiguous in memory as it is considered in PLASMA 
+@item Use the existing function @code{MORSE_Desc_Create}: this means the
+matrix data are considered contiguous in memory as it is considered in PLASMA
 (@ref{Tile Data Layout}).
-@item Use the existing function @code{MORSE_Desc_Create_User}: it is more 
-flexible than @code{Desc_Create} because you can give your own way to access to 
-tile data so that your tiles can be allocated wherever you want in memory, see 
+@item Use the existing function @code{MORSE_Desc_Create_User}: it is more
+flexible than @code{Desc_Create} because you can provide your own way to access
+tile data so that your tiles can be allocated wherever you want in memory, see
 next paragraph @ref{Step3}.
-@item Create you own function to fill the descriptor. 
-If you understand well the meaning of each item of @code{MORSE_desc_t}, you 
+@item Create your own function to fill the descriptor.
+If you understand the meaning of each item of @code{MORSE_desc_t} well, you
 should be able to fill the structure correctly (good luck).
 @end enumerate
 
 In Step2, we use the first way to create the descriptor:
 @verbatim
 MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
-                  NB, NB, NB*NB, N, N, 
-                  0, 0, N, N, 
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N,
                   1, 1);
 @end verbatim
 
@@ -454,37 +454,37 @@ MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
 @item @code{descA} is the descriptor to create.
 
 @item The second argument is a pointer to existing data.
-The existing data must follow LAPACK/PLASMA matrix layout @ref{Tile Data 
-Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create 
+The existing data must follow LAPACK/PLASMA matrix layout @ref{Tile Data
+Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create
 the descriptor.
-The @code{MORSE_Desc_Create_User} function can be used if you have data 
-organized differently. 
-This is discussed in the next paragraph @ref{Step3}. 
+The @code{MORSE_Desc_Create_User} function can be used if you have data
+organized differently.
+This is discussed in the next paragraph @ref{Step3}.
 Giving a @code{NULL} pointer means you let the function allocate memory space.
-This requires to copy your data in the memory allocated by the 
+This requires copying your data into the memory allocated by
 @code{Desc_Create}.
 This can be done with
 @verbatim
 MORSE_Lapack_to_Tile(A, N, descA);
 @end verbatim
 
-@item Third argument of @code{Desc_Create} is the datatype (used for memory 
+@item The third argument of @code{Desc_Create} is the datatype (used for memory
 allocation).
 
-@item Fourth argument until sixth argument stand for respectively, the number 
-of rows (@code{NB}), columns (@code{NB}) in each tile, the total number of 
-values in a tile (@code{NB*NB}), the number of rows (@code{N}), colmumns 
+@item The fourth through eighth arguments stand for, respectively: the number
+of rows (@code{NB}) and columns (@code{NB}) in each tile, the total number of
+values in a tile (@code{NB*NB}), and the number of rows (@code{N}) and columns
 (@code{N}) in the entire matrix.
 
-@item Seventh argument until ninth argument stand for respectively, the 
-beginning row (@code{0}), column (@code{0}) indexes of the submatrix and the 
+@item The ninth through twelfth arguments stand for, respectively: the
+beginning row (@code{0}) and column (@code{0}) indexes of the submatrix and the
 number of rows (@code{N}) and columns (@code{N}) in the submatrix.
 These arguments are specific and used in precise cases.
 If you do not consider submatrices, just use @code{0, 0, NROWS, NCOLS}.
 
-@item Two last arguments are the parameter of the 2-D block-cyclic distribution 
+@item The last two arguments are the parameters of the 2-D block-cyclic distribution
 grid, see @uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK}.
-To be able to use other data distribution over the nodes, 
+To be able to use other data distributions over the nodes, the
 @code{MORSE_Desc_Create_User} function should be used.
 
 @end itemize
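+
+Putting the pieces together, a hypothetical end-to-end use of the descriptor
+(we assume @code{MORSE_Tile_to_Lapack} as the reverse of
+@code{MORSE_Lapack_to_Tile}; check the headers of your installation):
+@verbatim
+MORSE_desc_t *descA = NULL;
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N, 1, 1);
+MORSE_Lapack_to_Tile(A, N, descA);   /* copy LAPACK layout -> tiles */
+MORSE_dpotrf_Tile(MorseLower, descA);
+MORSE_Tile_to_Lapack(descA, A, N);   /* copy tiles -> LAPACK layout */
+MORSE_Desc_Destroy(&descA);
+@end verbatim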
@@ -493,64 +493,64 @@ To be able to use other data distribution over the nodes,
 @node Step3
 @subsubsection Step3
 
-This program makes use of the same interface than Step2 (tile interface) but 
-does not allocate LAPACK matrices anymore so that no copy between LAPACK matrix 
+This program makes use of the same interface as Step2 (the tile interface) but
+does not allocate LAPACK matrices anymore so that no copy between LAPACK matrix
 layout and tile matrix layout is necessary to call MORSE routines.
 To generate random right-hand sides you can use:
 @verbatim
 /* Allocate memory and initialize descriptor B */
 MORSE_Desc_Create(&descB,  NULL, MorseRealDouble,
-                  NB, NB,  NB*NB, N, NRHS, 
+                  NB, NB,  NB*NB, N, NRHS,
                   0, 0, N, NRHS, 1, 1);
 /* generate RHS with random values */
 MORSE_dplrnt_Tile( descB, 5673 );
 @end verbatim
 
-The other important point is that is it possible to create a descriptor, the 
-necessary structure to call MORSE efficiently, by giving your own pointer to 
+The other important point is that it is possible to create a descriptor, the
+necessary structure to call MORSE efficiently, by giving your own pointer to
 tiles if your matrix is not organized as a 1-D array column-major.
 This can be achieved with the @code{MORSE_Desc_Create_User} routine.
 Here is an example:
 @verbatim
 MORSE_Desc_Create_User(&descA, matA, MorseRealDouble,
-		       NB, NB, NB*NB, N, N, 
-		       0, 0, N, N, 1, 1,
-		       user_getaddr_arrayofpointers,
-		       user_getblkldd_arrayofpointers,
-		       user_getrankof_zero);
+                       NB, NB, NB*NB, N, N,
+                       0, 0, N, N, 1, 1,
+                       user_getaddr_arrayofpointers,
+                       user_getblkldd_arrayofpointers,
+                       user_getrankof_zero);
 @end verbatim
-Firsts arguments are the same than @code{MORSE_Desc_Create} routine. 
+The first arguments are the same as for the @code{MORSE_Desc_Create} routine.
 The following arguments allow you to give pointers to functions that manage
 access to tiles from the structure given as the second argument.
-Here for example, @code{matA} is an array containing addresses to tiles, see 
+Here for example, @code{matA} is an array containing addresses to tiles, see
 the function @code{allocate_tile_matrix} defined in @file{step3.h}.
 The three functions you have to define for @code{Desc_Create_User} are:
 @itemize @bullet
-@item a function that returns address of tile @math{A(m,n)}, m and n standing 
-for the indexes of the tile in the global matrix. Lets consider a matrix 
-@math{4x4} with tile size @math{2x2}, the matrix contains four tiles of 
-indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)}, 
+@item a function that returns the address of tile @math{A(m,n)}, m and n standing
+for the indexes of the tile in the global matrix. Let's consider a
+@math{4x4} matrix with tile size @math{2x2}: the matrix contains four tiles of
+indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)},
 @math{A(m=1,n=1)}
 @item a function that returns the leading dimension of tile @math{A(m,*)}
 @item a function that returns MPI rank of tile @math{A(m,n)}
 @end itemize
 Examples of these functions can be found in @file{step3.h}.
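+
+For illustration only, here is a minimal sketch of the three callbacks for a
+tile matrix stored as an array of pointers (function names and descriptor
+fields are assumptions, not the code from @file{step3.h}):
+@verbatim
+/* tiles[m + n*MT] points to tile A(m,n); MT = number of tile rows */
+static void *user_getaddr(const MORSE_desc_t *desc, int m, int n)
+{
+    void **tiles = desc->mat;
+    return tiles[m + n * desc->lmt];
+}
+static int user_getblkldd(const MORSE_desc_t *desc, int m)
+{
+    return desc->mb;            /* tiles stored with leading dim NB */
+}
+static int user_getrankof(const MORSE_desc_t *desc, int m, int n)
+{
+    return 0;                   /* every tile lives on MPI process 0 */
+}
+@end verbatim
+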
-Note that the way we define these functions is related to the tile matrix 
+Note that the way we define these functions is related to the tile matrix
 format and to the data distribution considered.
-This example should not be used with MPI since all tiles are affected to 
-processus @code{0}, which means a large amount of data will be 
+This example should not be used with MPI since all tiles are assigned to
+process @code{0}, which means a large amount of data could
 potentially be transferred between nodes.
 
 @node Step4
 @subsubsection Step4
-This program is a copy of step2 but instead of using the tile interface, it 
+This program is a copy of step2 but instead of using the tile interface, it
 uses the tile async interface.
 The goal is to exhibit the runtime synchronization barriers.
-Keep in mind that when the tile interface is called, like 
-@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual 
-execution and termination of all tasks, is called to ensure the 
+Keep in mind that when the tile interface is called, like
+@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual
+execution and termination of all tasks, is called to ensure the
 proper completion of the algorithm (i.e. data are up-to-date).
-The code shows how to exploit the async interface to pipeline subsequent 
+The code shows how to exploit the async interface to pipeline subsequent
 algorithms so that fewer synchronizations are done.
 The code becomes:
 @verbatim
@@ -583,35 +583,35 @@ RUNTIME_desc_getoncpu(descA);
 RUNTIME_desc_getoncpu(descX);
 
 status = sequence->status;
-    
+
 @end verbatim
-Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed 
-without synchronization so that some tasks of @code{dpotrf} and @code{dpotrs} 
+Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed
+without synchronization so that some tasks of @code{dpotrf} and @code{dpotrs}
 can be executed concurrently, which could increase performance.
 The async interface is very similar to the tile one.
-It is only necessary to give two new objects @code{MORSE_sequence_t} and 
+It is only necessary to give two new objects @code{MORSE_sequence_t} and
 @code{MORSE_request_t} used to handle asynchronous function calls.
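+
+A minimal sketch of this pattern (our own illustration;
+@code{MORSE_Sequence_Create}, @code{MORSE_Sequence_Destroy} and
+@code{MORSE_REQUEST_INITIALIZER} are assumed from the CHAMELEON headers,
+alongside the @code{MORSE_Sequence_Wait} routine listed in
+@ref{Sequences routines}):
+@verbatim
+MORSE_sequence_t *sequence = NULL;
+MORSE_request_t   request  = MORSE_REQUEST_INITIALIZER;
+MORSE_Sequence_Create(&sequence);
+/* both calls return immediately; tasks are pipelined by the runtime */
+MORSE_dpotrf_Tile_Async(UPLO, descA, sequence, &request);
+MORSE_dpotrs_Tile_Async(UPLO, descA, descX, sequence, &request);
+MORSE_Sequence_Wait(sequence);   /* single barrier for the whole chain */
+MORSE_Sequence_Destroy(sequence);
+@end verbatim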
 
 @center @image{potri_async,13cm,8cm}
-POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization 
+POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization
 barriers, courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
 
 @node Step5
 @subsubsection Step5
 
 Step5 shows how to set some important parameters.
-This program is a copy of Step4 but some additional parameters are given by 
-the user. 
+This program is a copy of Step4 but some additional parameters are given by
+the user.
 The parameters that can be set are:
 @itemize @bullet
 @item number of threads
 @item number of GPUs
 
-The number of workers can be given as argument to the executable with 
+The number of workers can be given as arguments to the executable with
 the @option{--threads=} and @option{--gpus=} options.
-It is important to notice that we assign one thread per gpu to optimize data 
+It is important to notice that we assign one thread per GPU to optimize data
 transfer between the main memory and the device memory.
-The number of workers of each type @code{CPU} and @code{CUDA} must be given at 
+The number of workers of each type @code{CPU} and @code{CUDA} must be given at
 @code{MORSE_Init}.
 @verbatim
 if ( iparam[IPARAM_THRDNBR] == -1 ) {
@@ -633,26 +633,26 @@ MORSE_Init( NCPU, NGPU );
 The problem size is given with @option{--n=} and @option{--nrhs=} options.
 The tile size is given with option @option{--nb=}.
 These parameters are required to create descriptors.
-The size tile @code{NB} is a key parameter to get performances since it 
+The tile size @code{NB} is a key parameter for performance since it
 defines the granularity of tasks.
-If @code{NB} is too large compared to @code{N}, there are few tasks to 
-schedule. 
+If @code{NB} is too large compared to @code{N}, there are few tasks to
+schedule.
 If the number of workers is large, this limits parallelism.
-On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks), 
-workers could not be correctly fed and the runtime systems operations 
+On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks),
+workers may not be fed correctly and the runtime system's operations
 could represent a substantial overhead.
-A trade-off has to be found depending on many parameters: problem size, 
-algorithm (drive data dependencies), architecture (number of workers, 
+A trade-off has to be found depending on many parameters: problem size,
+algorithm (which drives data dependencies), architecture (number of workers,
 workers speed, workers uniformity, memory bus speed).
-By default it is set to 128. 
-Do not hesitate to play with this parameter and compare performances on your 
+By default it is set to 128.
+Do not hesitate to play with this parameter and compare performance on your
 machine.
 
 @item inner-blocking size
 
 The inner-blocking size is given with option @option{--ib=}.
-This parameter is used by kernels (optimized algorithms applied on tiles) to 
-perform subsequent operations with data block-size that fits the cache of 
+This parameter is used by kernels (optimized algorithms applied on tiles) to
+perform subsequent operations with a data block size that fits the cache of
 workers.
 Parameters @code{NB} and @code{IB} can be given with @code{MORSE_Set} function:
 @verbatim
@@ -660,18 +660,18 @@ MORSE_Set(MORSE_TILE_SIZE,        iparam[IPARAM_NB] );
 MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
 @end verbatim
 @end itemize
- 
+
 @node Step6
 @subsubsection Step6
 
-This program is a copy of Step5 with some additional parameters to be set for 
-the data distribution. 
-To use this program properly MORSE must use StarPU Runtime system and MPI 
-option must be activated at configure. 
+This program is a copy of Step5 with some additional parameters to be set for
+the data distribution.
+To use this program properly, MORSE must use the StarPU runtime system and the
+MPI option must be activated at configure time.
 The data distribution used here is 2-D block-cyclic, see for example
-@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for 
+@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for
 explanation.
-The user can enter the parameters of the distribution grid at execution with 
+The user can enter the parameters of the distribution grid at execution time
 with the @option{--p=} option.
 Example using OpenMPI on four nodes with one process per node:
 @example
@@ -679,33 +679,33 @@ mpirun -np 4 ./step6 --n=10000 --nb=320 --ib=64 \
                      --threads=8 --gpus=2 --p=2
 @end example
 
-In this program we use the tile data layout from PLASMA so that the call 
+In this program we use the tile data layout from PLASMA so that the call
 @verbatim
 MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
-		       NB, NB, NB*NB, N, N, 
-		       0, 0, N, N,
-		       GRID_P, GRID_Q,
-		       morse_getaddr_ccrb,
-		       morse_getblkldd_ccrb,
-		       morse_getrankof_2d);
+                       NB, NB, NB*NB, N, N,
+                       0, 0, N, N,
+                       GRID_P, GRID_Q,
+                       morse_getaddr_ccrb,
+                       morse_getblkldd_ccrb,
+                       morse_getrankof_2d);
 @end verbatim
 is equivalent to the following call
 @verbatim
 MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
-                  NB, NB, NB*NB, N, N, 
-                  0, 0, N, N, 
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N,
                   GRID_P, GRID_Q);
 @end verbatim
-functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb}, 
+functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb},
 and @code{morse_getrankof_2d} being the ones used in @code{Desc_Create}.
 It is interesting to notice that the code is almost the same as Step5.
-The only additional information to give is the way tiles are distributed 
+The only additional information to give is the way tiles are distributed
 through the third function given to @code{MORSE_Desc_Create_User}.
-Here, because we have made experiments only with a 2-D block-cyclic 
+Here, because we have made experiments only with a 2-D block-cyclic
 distribution, we have parameters P and Q in the interface of @code{Desc_Create}
-but they have sense only for 2-D block-cyclic distribution and then using 
+but they make sense only for a 2-D block-cyclic distribution, and thus when
 using the @code{morse_getrankof_2d} function.
-Of course it could be used with other distributions, being no more the 
+Of course, the interface could be used with other distributions, the parameters
 then no longer describing a 2-D block-cyclic grid but another distribution.
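+
+For intuition, here is a minimal sketch of what such a rank function computes
+(our own illustration, not the library's @code{morse_getrankof_2d}): on a
+P-by-Q process grid, tile @math{A(m,n)} could be owned by
+@verbatim
+/* 2-D block-cyclic ownership of tile (m, n) on a P x Q grid */
+static int rankof_2dbc(int m, int n, int P, int Q)
+{
+    return (m % P) * Q + (n % Q);   /* row-major numbering of the grid */
+}
+@end verbatim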
 
 @node List of available routines
@@ -715,7 +715,7 @@ parameters of a 2-D block-cyclic grid but of another distribution.
 * Auxiliary routines:: Init, Finalize, Version, etc
 * Descriptor routines:: To handle descriptors
 * Options routines:: To set options
-* Sequences routines:: To manage asynchronous function calls 
+* Sequences routines:: To manage asynchronous function calls
 * Linear Algebra routines:: Computational routines
 @end menu
 
@@ -742,7 +742,7 @@ Return the MPI rank of the calling process.
 int MORSE_My_Mpi_Rank    (void);
 @end verbatim
 
-Suspend MORSE runtime to poll for new tasks, to avoid useless CPU consumption when 
+Suspend the MORSE runtime from polling for new tasks, to avoid useless CPU
 consumption when no tasks have to be executed by the MORSE runtime system.
 @verbatim
 int MORSE_Pause          (void);
@@ -789,7 +789,7 @@ Destroys matrix descriptor.
 int MORSE_Desc_Destroy (MORSE_desc_t **desc);
 @end verbatim
 
-Ensure that all data are up-to-date in main memory (even if some tasks have 
+Ensure that all data are up-to-date in main memory (even if some tasks have
 been processed on GPUs).
 @verbatim
 int MORSE_Desc_Getoncpu(MORSE_desc_t  *desc);
@@ -798,7 +798,7 @@ int MORSE_Desc_Getoncpu(MORSE_desc_t  *desc);
 @node Options routines
 @subsubsection Options routines
 
-@c /* Options */ 
+@c /* Options */
 Enable a MORSE feature.
 @verbatim
 int MORSE_Enable  (MORSE_enum option);
@@ -827,7 +827,7 @@ Parameters to be set:
 @item @code{MORSE_INNER_BLOCK_SIZE}: size of tile inner block,
 @item @code{MORSE_HOUSEHOLDER_MODE}: type of householder trees (FLAT or TREE),
 @item @code{MORSE_HOUSEHOLDER_SIZE}: size of the groups in householder trees,
-@item @code{MORSE_TRANSLATION_MODE}: related to the 
+@item @code{MORSE_TRANSLATION_MODE}: related to the
 @code{MORSE_Lapack_to_Tile} routine, see @file{ztile.c}.
 @end itemize
 
@@ -858,110 +858,110 @@ int MORSE_Sequence_Wait    (MORSE_sequence_t *sequence);
 @node Linear Algebra routines
 @subsubsection Linear Algebra routines
 
-Routines computing linear algebra of the form 
-@code{MORSE_name[_Tile[_Async]]} (@code{name} follows LAPACK naming scheme, see 
+Routines computing linear algebra are of the form
+@code{MORSE_name[_Tile[_Async]]} (@code{name} follows the LAPACK naming scheme, see
 @uref{http://www.netlib.org/lapack/lug/node24.html}). The available routines are:
 
 @verbatim
 /** ********************************************************
- *  Declarations of computational functions (LAPACK layout) 
+ *  Declarations of computational functions (LAPACK layout)
  **/
- 
-int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA, 
+
+int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descT);
-                 
-int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
+
+int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
-                 
-int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS, 
-                MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, 
+
+int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS,
+                MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT,
                 MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K, 
-                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
-                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
+int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                 MORSE_Complex64_t *C, int LDC);
 
-int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descT);
-                 
-int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
+
+int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
-                       MORSE_desc_t *descL, int *IPIV, 
+int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                       MORSE_desc_t *descL, int *IPIV,
                        MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
                       MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA,
                         MORSE_desc_t *descL, int *IPIV);
 
 int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA);
 
-int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS, 
-                        MORSE_Complex64_t *A, int LDA, 
-                        MORSE_desc_t *descL, int *IPIV, 
+int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS,
+                        MORSE_Complex64_t *A, int LDA,
+                        MORSE_desc_t *descL, int *IPIV,
                         MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS, 
-                       MORSE_Complex64_t *A, int LDA, 
+int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS,
+                       MORSE_Complex64_t *A, int LDA,
                        MORSE_Complex64_t *B, int LDB);
 
 #ifdef COMPLEX
-int MORSE_zhemm(MORSE_enum side, MORSE_enum uplo, int M, int N, 
-                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
-                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
+int MORSE_zhemm(MORSE_enum side, MORSE_enum uplo, int M, int N,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                 MORSE_Complex64_t *C, int LDC);
 
-int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
-                double alpha, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K,
+                double alpha, MORSE_Complex64_t *A, int LDA,
                 double beta, MORSE_Complex64_t *C, int LDC);
 
-int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
-                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
-                 MORSE_Complex64_t *B, int LDB, double beta, 
+int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K,
+                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                 MORSE_Complex64_t *B, int LDB, double beta,
                  MORSE_Complex64_t *C, int LDC);
 #endif
 
-int MORSE_zlacpy(MORSE_enum uplo, int M, int N, 
-                 MORSE_Complex64_t *A, int LDA, 
+int MORSE_zlacpy(MORSE_enum uplo, int M, int N,
+                 MORSE_Complex64_t *A, int LDA,
                  MORSE_Complex64_t *B, int LDB);
 
-double MORSE_zlange(MORSE_enum norm, int M, int N, 
+double MORSE_zlange(MORSE_enum norm, int M, int N,
                     MORSE_Complex64_t *A, int LDA);
 
 #ifdef COMPLEX
-double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N, 
+double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N,
                     MORSE_Complex64_t *A, int LDA);
 #endif
 
-double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N, 
+double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N,
                     MORSE_Complex64_t *A, int LDA);
 
-double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag, 
+double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag,
                     int M, int N, MORSE_Complex64_t *A, int LDA);
 
-int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha, 
+int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha,
                  MORSE_Complex64_t beta, MORSE_Complex64_t *A, int LDA);
 
 int MORSE_zlauum(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);
 
 #ifdef COMPLEX
-int MORSE_zplghe( double bump, int N, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zplghe( double bump, int N, MORSE_Complex64_t *A, int LDA,
                   unsigned long long int seed );
 #endif
 
-int MORSE_zplgsy( MORSE_Complex64_t bump, int N, 
-                  MORSE_Complex64_t *A, int LDA, 
+int MORSE_zplgsy( MORSE_Complex64_t bump, int N,
+                  MORSE_Complex64_t *A, int LDA,
                   unsigned long long int seed );
 
-int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA,
                   unsigned long long int seed );
 
-int MORSE_zposv(MORSE_enum uplo, int N, int NRHS, 
-                MORSE_Complex64_t *A, int LDA, 
+int MORSE_zposv(MORSE_enum uplo, int N, int NRHS,
+                MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB);
 
 int MORSE_zpotrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);
@@ -970,91 +970,91 @@ int MORSE_zsytrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);
 
 int MORSE_zpotri(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA);
 
-int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS, 
-                 MORSE_Complex64_t *A, int LDA, 
+int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS,
+                 MORSE_Complex64_t *A, int LDA,
                  MORSE_Complex64_t *B, int LDB);
 
 #if defined (PRECISION_c) || defined(PRECISION_z)
-int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, 
-                 MORSE_Complex64_t *A, int LDA, 
+int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS,
+                 MORSE_Complex64_t *A, int LDA,
                  MORSE_Complex64_t *B, int LDB);
 #endif
 
-int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N, 
-                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
-                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
+int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                 MORSE_Complex64_t *C, int LDC);
 
-int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
-                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC);
 
-int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, 
-                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
-                 MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, 
+int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K,
+                 MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                 MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
                  MORSE_Complex64_t *C, int LDC);
 
-int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, 
-                MORSE_enum transA, MORSE_enum diag, 
-                int N, int NRHS, 
-                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
+int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo,
+                MORSE_enum transA, MORSE_enum diag,
+                int N, int NRHS,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB);
 
-int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo, 
-                MORSE_enum transA, MORSE_enum diag, 
-                int N, int NRHS, 
-                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
+int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo,
+                MORSE_enum transA, MORSE_enum diag,
+                int N, int NRHS,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                 MORSE_Complex64_t *B, int LDB);
 
-int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA, 
-                  MORSE_desc_t *descL, int *IPIV, 
+int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                  MORSE_desc_t *descL, int *IPIV,
                   MORSE_Complex64_t *B, int LDB);
 
-int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum uplo, 
-                  MORSE_enum transA, MORSE_enum diag, 
-                  int N, int NRHS, 
-                  MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, 
+int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum uplo,
+                  MORSE_enum transA, MORSE_enum diag,
+                  int N, int NRHS,
+                  MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
                   MORSE_Complex64_t *B, int LDB);
 
-int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N, 
+int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N,
                  MORSE_Complex64_t *A, int LDA);
 
-int MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA, 
+int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA,
                  MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K, 
-                 MORSE_Complex64_t *A, int LDA, 
-                 MORSE_desc_t *descT, 
+int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K,
+                 MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT,
                  MORSE_Complex64_t *B, int LDB);
 
-int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K, 
-                 MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, 
+int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K,
+                 MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT,
                  MORSE_Complex64_t *B, int LDB);
 
 /** ******************************************************
  *  Declarations of computational functions (tile layout)
  **/
- 
+
 int MORSE_zgelqf_Tile(MORSE_desc_t *A, MORSE_desc_t *T);
 
 int MORSE_zgelqs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
 
-int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T, 
+int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T,
                      MORSE_desc_t *B);
-                     
-int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB, 
-                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                     MORSE_desc_t *B, MORSE_Complex64_t beta, 
+
+int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B, MORSE_Complex64_t beta,
                      MORSE_desc_t *C);
-                     
+
 int MORSE_zgeqrf_Tile(MORSE_desc_t *A, MORSE_desc_t *T);
 
 int MORSE_zgeqrs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
 
-int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, 
+int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
                             MORSE_desc_t *B);
 
 int MORSE_zgesv_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B);
@@ -1063,23 +1063,23 @@ int MORSE_zgetrf_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV);
 
 int MORSE_zgetrf_nopiv_Tile(MORSE_desc_t *A);
 
-int MORSE_zgetrs_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, 
+int MORSE_zgetrs_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
                              MORSE_desc_t *B);
-                             
+
 int MORSE_zgetrs_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B);
 
 #ifdef COMPLEX
-int MORSE_zhemm_Tile(MORSE_enum side, MORSE_enum uplo, 
-                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                     MORSE_desc_t *B, MORSE_Complex64_t beta, 
+int MORSE_zhemm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B, MORSE_Complex64_t beta,
                      MORSE_desc_t *C);
-                     
-int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans, 
-                     double alpha, MORSE_desc_t *A, 
+
+int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans,
+                     double alpha, MORSE_desc_t *A,
                      double beta, MORSE_desc_t *C);
-                     
-int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans, 
-                      MORSE_Complex64_t alpha, MORSE_desc_t *A, 
+
+int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans,
+                      MORSE_Complex64_t alpha, MORSE_desc_t *A,
                       MORSE_desc_t *B, double beta, MORSE_desc_t *C);
 #endif
 
@@ -1093,22 +1093,22 @@ double MORSE_zlanhe_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A);
 
 double MORSE_zlansy_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A);
 
-double MORSE_zlantr_Tile(MORSE_enum norm, MORSE_enum uplo, 
+double MORSE_zlantr_Tile(MORSE_enum norm, MORSE_enum uplo,
                          MORSE_enum diag, MORSE_desc_t *A);
 
-int MORSE_zlaset_Tile(MORSE_enum uplo, MORSE_Complex64_t alpha, 
+int MORSE_zlaset_Tile(MORSE_enum uplo, MORSE_Complex64_t alpha,
                       MORSE_Complex64_t beta, MORSE_desc_t *A);
-                      
+
 int MORSE_zlauum_Tile(MORSE_enum uplo, MORSE_desc_t *A);
 
 #ifdef COMPLEX
-int MORSE_zplghe_Tile(double bump, MORSE_desc_t *A, 
+int MORSE_zplghe_Tile(double bump, MORSE_desc_t *A,
                       unsigned long long int seed);
 #endif
 
-int MORSE_zplgsy_Tile(MORSE_Complex64_t bump, MORSE_desc_t *A, 
+int MORSE_zplgsy_Tile(MORSE_Complex64_t bump, MORSE_desc_t *A,
                       unsigned long long int seed );
-                      
+
 int MORSE_zplrnt_Tile(MORSE_desc_t *A, unsigned long long int seed );
 
 int MORSE_zposv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
@@ -1125,271 +1125,271 @@ int MORSE_zpotrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
 int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
 #endif
 
-int MORSE_zsymm_Tile(MORSE_enum side, MORSE_enum uplo, 
-                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                     MORSE_desc_t *B, MORSE_Complex64_t beta, 
+int MORSE_zsymm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B, MORSE_Complex64_t beta,
                      MORSE_desc_t *C);
-                     
-int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans, 
-                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
+
+int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
                      MORSE_Complex64_t beta, MORSE_desc_t *C);
-                     
-int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans, 
-                      MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                      MORSE_desc_t *B, MORSE_Complex64_t beta, 
+
+int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans,
+                      MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                      MORSE_desc_t *B, MORSE_Complex64_t beta,
                       MORSE_desc_t *C);
-                      
-int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo, 
-                     MORSE_enum transA, MORSE_enum diag, 
-                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
+
+int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_enum transA, MORSE_enum diag,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
                      MORSE_desc_t *B);
-                     
-int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo, 
-                     MORSE_enum transA, MORSE_enum diag, 
-                     MORSE_Complex64_t alpha, MORSE_desc_t *A, 
+
+int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_enum transA, MORSE_enum diag,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
                      MORSE_desc_t *B);
-                     
-int MORSE_ztrsmpl_Tile(MORSE_desc_t *A, MORSE_desc_t *L, 
+
+int MORSE_ztrsmpl_Tile(MORSE_desc_t *A, MORSE_desc_t *L,
                        int *IPIV, MORSE_desc_t *B);
-                       
-int MORSE_ztrsmrv_Tile(MORSE_enum side, MORSE_enum uplo, 
-                       MORSE_enum transA, MORSE_enum diag, 
-                       MORSE_Complex64_t alpha, MORSE_desc_t *A, 
+
+int MORSE_ztrsmrv_Tile(MORSE_enum side, MORSE_enum uplo,
+                       MORSE_enum transA, MORSE_enum diag,
+                       MORSE_Complex64_t alpha, MORSE_desc_t *A,
                        MORSE_desc_t *B);
-                       
+
 int MORSE_ztrtri_Tile(MORSE_enum uplo, MORSE_enum diag, MORSE_desc_t *A);
 
 int MORSE_zunglq_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
 
 int MORSE_zungqr_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
 
-int MORSE_zunmlq_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, 
+int MORSE_zunmlq_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A,
                       MORSE_desc_t *T, MORSE_desc_t *B);
 
-int MORSE_zunmqr_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, 
+int MORSE_zunmqr_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A,
                       MORSE_desc_t *T, MORSE_desc_t *B);
 
 /** ****************************************
- *  Declarations of computational functions 
+ *  Declarations of computational functions
  *  (tile layout, asynchronous execution)
  **/
- 
-int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_sequence_t *sequence, 
+
+int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_desc_t *B, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
-                            
-int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A, 
-                           MORSE_desc_t *T, MORSE_desc_t *B, 
-                           MORSE_sequence_t *sequence, 
+
+int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A,
+                           MORSE_desc_t *T, MORSE_desc_t *B,
+                           MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zgemm_Tile_Async(MORSE_enum transA, MORSE_enum transB, 
-                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                           MORSE_desc_t *B, MORSE_Complex64_t beta, 
-                           MORSE_desc_t *C, MORSE_sequence_t *sequence, 
+int MORSE_zgemm_Tile_Async(MORSE_enum transA, MORSE_enum transB,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_Complex64_t beta,
+                           MORSE_desc_t *C, MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_sequence_t *sequence,
-                            MORSE_request_t *request)
+                            MORSE_request_t *request);
 
-int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_desc_t *B, 
-			    MORSE_sequence_t *sequence, 
-			    MORSE_request_t *request);
+int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
 
-int MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, 
-                                  int *IPIV, MORSE_desc_t *B, 
-                                  MORSE_sequence_t *sequence, 
+int MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                  int *IPIV, MORSE_desc_t *B,
+                                  MORSE_sequence_t *sequence,
                                   MORSE_request_t *request);
 
-int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, 
-                                 MORSE_sequence_t *sequence, 
+int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B,
+                                 MORSE_sequence_t *sequence,
                                  MORSE_request_t *request);
 
-int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, 
-                                   int *IPIV, MORSE_sequence_t *sequence, 
+int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                   int *IPIV, MORSE_sequence_t *sequence,
                                    MORSE_request_t *request);
 
-int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A, 
-                                  MORSE_sequence_t *sequence, 
+int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A,
+                                  MORSE_sequence_t *sequence,
                                   MORSE_request_t *request);
 
-int MORSE_zgetrs_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, 
-                                   int *IPIV, MORSE_desc_t *B, 
-                                   MORSE_sequence_t *sequence, 
+int MORSE_zgetrs_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                   int *IPIV, MORSE_desc_t *B,
+                                   MORSE_sequence_t *sequence,
                                    MORSE_request_t *request);
 
-int MORSE_zgetrs_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, 
-                                  MORSE_sequence_t *sequence, 
+int MORSE_zgetrs_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B,
+                                  MORSE_sequence_t *sequence,
                                   MORSE_request_t *request);
 
 #ifdef COMPLEX
-int MORSE_zhemm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
-                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                           MORSE_desc_t *B, MORSE_Complex64_t beta, 
-                           MORSE_desc_t *C, MORSE_sequence_t *sequence, 
+int MORSE_zhemm_Tile_Async(MORSE_enum side, MORSE_enum uplo,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_Complex64_t beta,
+                           MORSE_desc_t *C, MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
-                           double alpha, MORSE_desc_t *A, 
-                           double beta, MORSE_desc_t *C, 
-                           MORSE_sequence_t *sequence, 
+int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans,
+                           double alpha, MORSE_desc_t *A,
+                           double beta, MORSE_desc_t *C,
+                           MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
-                            MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                            MORSE_desc_t *B, double beta, MORSE_desc_t *C, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans,
+                            MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                            MORSE_desc_t *B, double beta, MORSE_desc_t *C,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 #endif
 
-int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_desc_t *B, MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
 #ifdef COMPLEX
-int MORSE_zlanhe_Tile_Async(MORSE_enum norm, MORSE_enum uplo, 
-                            MORSE_desc_t *A, double *value, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zlanhe_Tile_Async(MORSE_enum norm, MORSE_enum uplo,
+                            MORSE_desc_t *A, double *value,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 #endif
 
-int MORSE_zlansy_Tile_Async(MORSE_enum norm, MORSE_enum uplo, 
-                            MORSE_desc_t *A, double *value, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zlansy_Tile_Async(MORSE_enum norm, MORSE_enum uplo,
+                            MORSE_desc_t *A, double *value,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zlantr_Tile_Async(MORSE_enum norm, MORSE_enum uplo, 
-                            MORSE_enum diag, MORSE_desc_t *A, double *value, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zlantr_Tile_Async(MORSE_enum norm, MORSE_enum uplo,
+                            MORSE_enum diag, MORSE_desc_t *A, double *value,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zlaset_Tile_Async(MORSE_enum uplo, MORSE_Complex64_t alpha, 
-                            MORSE_Complex64_t beta, MORSE_desc_t *A, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zlaset_Tile_Async(MORSE_enum uplo, MORSE_Complex64_t alpha,
+                            MORSE_Complex64_t beta, MORSE_desc_t *A,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zlauum_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zlauum_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
 #ifdef COMPLEX
-int MORSE_zplghe_Tile_Async(double bump, MORSE_desc_t *A, 
-                            unsigned long long int seed, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zplghe_Tile_Async(double bump, MORSE_desc_t *A,
+                            unsigned long long int seed,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request );
 #endif
 
-int MORSE_zplgsy_Tile_Async(MORSE_Complex64_t bump, MORSE_desc_t *A, 
-                            unsigned long long int seed, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zplgsy_Tile_Async(MORSE_Complex64_t bump, MORSE_desc_t *A,
+                            unsigned long long int seed,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request );
 
-int MORSE_zplrnt_Tile_Async(MORSE_desc_t *A, unsigned long long int seed, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zplrnt_Tile_Async(MORSE_desc_t *A, unsigned long long int seed,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request );
 
-int MORSE_zposv_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                           MORSE_desc_t *B, 
-                           MORSE_sequence_t *sequence, 
+int MORSE_zposv_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                           MORSE_desc_t *B,
+                           MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zpotrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zpotrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zsytrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zsytrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zpotri_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zpotri_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zpotrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_zpotrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_desc_t *B, MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
 #if defined (PRECISION_c) || defined(PRECISION_z)
-int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, 
-                            MORSE_desc_t *B, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 #endif
 
-int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
-                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                           MORSE_desc_t *B, MORSE_Complex64_t beta, 
-                           MORSE_desc_t *C, MORSE_sequence_t *sequence, 
+int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_Complex64_t beta,
+                           MORSE_desc_t *C, MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
-                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                           MORSE_Complex64_t beta, MORSE_desc_t *C, 
-                           MORSE_sequence_t *sequence, 
+int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_Complex64_t beta, MORSE_desc_t *C,
+                           MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, 
-                            MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                            MORSE_desc_t *B, MORSE_Complex64_t beta, 
-                            MORSE_desc_t *C, MORSE_sequence_t *sequence, 
+int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans,
+                            MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                            MORSE_desc_t *B, MORSE_Complex64_t beta,
+                            MORSE_desc_t *C, MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
-                           MORSE_enum transA, MORSE_enum diag, 
-                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                           MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo,
+                           MORSE_enum transA, MORSE_enum diag,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
-                           MORSE_enum transA, MORSE_enum diag, 
-                           MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                           MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo,
+                           MORSE_enum transA, MORSE_enum diag,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_sequence_t *sequence,
                            MORSE_request_t *request);
 
-int MORSE_ztrsmpl_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, 
-                             MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_ztrsmpl_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
+                             MORSE_desc_t *B, MORSE_sequence_t *sequence,
                              MORSE_request_t *request);
 
-int MORSE_ztrsmrv_Tile_Async(MORSE_enum side, MORSE_enum uplo, 
-                             MORSE_enum transA, MORSE_enum diag, 
-                             MORSE_Complex64_t alpha, MORSE_desc_t *A, 
-                             MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_ztrsmrv_Tile_Async(MORSE_enum side, MORSE_enum uplo,
+                             MORSE_enum transA, MORSE_enum diag,
+                             MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                             MORSE_desc_t *B, MORSE_sequence_t *sequence,
                              MORSE_request_t *request);
 
-int MORSE_ztrtri_Tile_Async(MORSE_enum uplo, MORSE_enum diag, 
-                            MORSE_desc_t *A, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_ztrtri_Tile_Async(MORSE_enum uplo, MORSE_enum diag,
+                            MORSE_desc_t *A,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_desc_t *B, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_desc_t *B, 
-                            MORSE_sequence_t *sequence, 
+int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans, 
-                            MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans,
+                            MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B, MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
-int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans, 
-                            MORSE_desc_t *A, MORSE_desc_t *T, 
-                            MORSE_desc_t *B, MORSE_sequence_t *sequence, 
+int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans,
+                            MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B, MORSE_sequence_t *sequence,
                             MORSE_request_t *request);
 
 @end verbatim
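+
+As a complement to these declarations, here is a minimal sketch (not taken
+from the library's test suite) of how the LAPACK-layout interface is driven,
+using the Cholesky solver @code{MORSE_zposv} as an example.  It assumes the
+runtime is started and stopped with @code{MORSE_Init}/@code{MORSE_Finalize}
+and that constants such as @code{MorseUpper} are those introduced earlier in
+this chapter; error checking is omitted for brevity.
+
+@verbatim
+#include <stdlib.h>
+#include <morse.h>
+
+int main(void) {
+    int N = 1000, NRHS = 1, LDA = N, LDB = N;
+    unsigned long long int seed = 51;
+
+    /* LAPACK-layout arrays, allocated by the user */
+    MORSE_Complex64_t *A = malloc(LDA * N    * sizeof(MORSE_Complex64_t));
+    MORSE_Complex64_t *B = malloc(LDB * NRHS * sizeof(MORSE_Complex64_t));
+
+    MORSE_Init(4, 0);   /* 4 CPU workers, no GPU (assumed signature) */
+
+    /* Generate a random Hermitian positive definite matrix (the bump N
+     * added on the diagonal makes it diagonally dominant, hence positive
+     * definite) and a random right-hand side. */
+    MORSE_zplghe((double)N, N, A, LDA, seed);
+    MORSE_zplrnt(N, NRHS, B, LDB, seed + 1);
+
+    /* Solve A x = B; on exit A holds its Cholesky factor and B the
+     * solution.  The call is synchronous: it returns once x is ready. */
+    MORSE_zposv(MorseUpper, N, NRHS, A, LDA, B, LDB);
+
+    MORSE_Finalize();
+    free(A); free(B);
+    return 0;
+}
+@end verbatim
+
+The same computation in tile layout would call @code{MORSE_zposv_Tile} on
+@code{MORSE_desc_t} descriptors instead of plain arrays, and the
+@code{_Tile_Async} variant additionally threads a sequence/request pair
+through the call so that several operations can be submitted before
+synchronizing.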
diff --git a/docs/texinfo/users_guide.texi.in b/docs/texinfo/users_guide.texi.in
index 94c88d513..43d4bf688 100644
--- a/docs/texinfo/users_guide.texi.in
+++ b/docs/texinfo/users_guide.texi.in
@@ -19,35 +19,35 @@ Copyright @copyright{} 2014 The University of Tennessee
 Copyright @copyright{} 2014 King Abdullah University of Science and Technology
 
 @quotation
-Redistribution and use in source and binary forms, with or without 
+Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
 @itemize @bullet
 @item
-Redistributions of source code must retain the above copyright notice, this 
-list 
+Redistributions of source code must retain the above copyright notice, this
+list
 of conditions and the following disclaimer.
 
 @item
-Redistributions in binary form must reproduce the above copyright notice, this  
-list of conditions and the following disclaimer listed in this license in the 
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimer listed in this license in the
 documentation and/or other materials provided with the distribution.
 
 @item
-Neither the name of the copyright holders nor the names of its contributors may 
-be used to endorse or promote products derived from this software without 
+Neither the name of the copyright holders nor the names of its contributors may
+be used to endorse or promote products derived from this software without
 specific prior written permission.
 @end itemize
 
-This software is provided by the copyright holders and contributors "as is" and 
-any express or implied warranties, including, but not limited to, the implied 
-warranties of merchantability and fitness for a particular purpose are 
-disclaimed. 
-In no event shall the copyright owner or contributors be liable for any direct, 
-indirect, incidental, special, exemplary, or consequential damages (including, 
-but not limited to, procurement of substitute goods or services; loss of use, 
-data, or profits; or business interruption) however caused and on any theory of 
-liability, whether in contract, strict liability, or tort (including negligence 
-or otherwise) arising in any way out of the use of this software, even if 
+This software is provided by the copyright holders and contributors "as is" and
+any express or implied warranties, including, but not limited to, the implied
+warranties of merchantability and fitness for a particular purpose are
+disclaimed.
+In no event shall the copyright owner or contributors be liable for any direct,
+indirect, incidental, special, exemplary, or consequential damages (including,
+but not limited to, procurement of substitute goods or services; loss of use,
+data, or profits; or business interruption) however caused and on any theory of
+liability, whether in contract, strict liability, or tort (including negligence
+or otherwise) arising in any way out of the use of this software, even if
 advised of the possibility of such damage.
 @end quotation
 @end copying
@@ -55,8 +55,8 @@ advised of the possibility of such damage.
 @c #############################################################################
 
 @titlepage
-@c @flushleft 
-@c @image{morse_header} 
+@c @flushleft
+@c @image{morse_header}
 @c @end flushleft
 @title CHAMELEON User's Guide
 @subtitle Software of MORSE project
@@ -96,7 +96,7 @@ advised of the possibility of such damage.
 @node Top
 @top Preface
 
-This manual documents the usage of CHAMELEON version @value{VERSION}. 
+This manual documents the usage of CHAMELEON version @value{VERSION}.
 It was last updated on @value{UPDATED}.
 
 @insertcopying
diff --git a/docs/texinfo/version.texi.in b/docs/texinfo/version.texi.in
index 2230db612..cb5ee79fc 100644
--- a/docs/texinfo/version.texi.in
+++ b/docs/texinfo/version.texi.in
@@ -1,4 +1,4 @@
-@set UPDATED 15 November 2014
-@set UPDATED-MONTH November 2014
+@set UPDATED 02 December 2015
+@set UPDATED-MONTH December 2015
 @set EDITION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_PATCH@
 @set VERSION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_PATCH@
-- 
GitLab