
Commit 6d71da30 authored by PRUVOST Florent

start an orgmode version of the texinfo users guide doc

parent e9dc21e8
Showing 2824 additions and 3 deletions
doc/CMakeLists.txt
@@ -3,7 +3,7 @@
 # @copyright (c) 2009-2014 The University of Tennessee and The University
 #                          of Tennessee Research Foundation.
 #                          All rights reserved.
-# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2017 Inria. All rights reserved.
 # @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
 #
 ###
@@ -22,6 +22,7 @@
 # @author Cedric Castagnede
 # @author Emmanuel Agullo
 # @author Mathieu Faverge
+# @author Florent Pruvost
 # @date 13-07-2012
 #
 ###
@@ -34,8 +35,8 @@ cmake_minimum_required(VERSION 2.8)
 #                                           #
 #############################################
 add_subdirectory(doxygen)
-add_subdirectory(texinfo)
+add_subdirectory(orgmode)
+#add_subdirectory(texinfo)
 ###
 ### END CMakeLists.txt
 ###
###
#
# @copyright (c) 2017 Inria. All rights reserved.
#
###
#
# @file CMakeLists.txt
#
# @project MORSE
# MORSE is a software package provided by:
#     Inria Bordeaux - Sud-Ouest,
#     Univ. of Tennessee,
#     King Abdullah University of Science and Technology,
#     Univ. of California Berkeley,
#     Univ. of Colorado Denver.
#
# @version 1.0.0
# @author Florent Pruvost
# @date 25-08-2017
#
###
cmake_minimum_required(VERSION 2.8)

# Create files version.org and users_guide.org from their templates
# ------------------------------------------------------------------
configure_file("version.org.in"
               "version.org"
               @ONLY)
configure_file("users_guide.org.in"
               "users_guide.org"
               @ONLY)

set(FIGURES
    tile_lu.pdf
    tile_lu.jpg
    tile_layout.pdf
    tile_layout.jpg
    trace_qr.pdf
    trace_qr.jpg
    potri_async.png
    morse_header.png
    )
set(FIGURES_HTML
    tile_lu.jpg
    tile_layout.jpg
    trace_qr.jpg
    potri_async.png
    morse_header.png
    )

foreach(_fig ${FIGURES})
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/figures/${_fig}
                 ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
                 COPYONLY)
endforeach()
# Looking for emacs
# -----------------
find_program(EMACS_COMPILER emacs)

if(EMACS_COMPILER)

  # Add target
  # ----------
  add_custom_command(OUTPUT  users_guide.html
                     COMMAND ${EMACS_COMPILER}
                     ARGS    ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
                             --batch
                             -f
                             org-html-export-to-html
                             --kill
                     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
                     )
  add_custom_command(OUTPUT  users_guide.pdf
                     COMMAND ${EMACS_COMPILER}
                     ARGS    ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
                             --batch
                             -f
                             org-latex-export-to-pdf
                             --kill
                     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
                     )
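  # The doc-* targets below are attached to ALL, so both exports are built
  # by default; they can also be invoked directly, e.g.
  # `make doc-html-users_guide` or `make doc-pdf-users_guide`.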
  add_custom_target(doc-html-users_guide ALL DEPENDS users_guide.html)
  add_custom_target(doc-pdf-users_guide  ALL DEPENDS users_guide.pdf)

  # Installation
  # ------------
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.html
          DESTINATION share/chameleon/html)
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.pdf
          DESTINATION share/chameleon/pdf)
  foreach(_fig ${FIGURES_HTML})
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
            DESTINATION share/chameleon/html)
  endforeach()

else(EMACS_COMPILER)
  message(STATUS "Looking for emacs - not found")
endif(EMACS_COMPILER)
###
### END CMakeLists.txt
###
@c -*-texinfo-*-
@c This file is part of the MORSE Handbook.
@c Copyright (C) 2017 Inria
@c Copyright (C) 2014 The University of Tennessee
@c Copyright (C) 2014 King Abdullah University of Science and Technology
@c See the file ../chameleon.texi for copying conditions.
@menu
* Compilation configuration::
* Dependencies detection::
@c * Dependencies compilation::
* Use FxT profiling through StarPU::
* Use simulation mode with StarPU-SimGrid::
* Use out of core support with StarPU::
@end menu
@node Compilation configuration
@section Compilation configuration
The following arguments can be given to the @command{cmake <path to source
directory>} command.
In this chapter, the following convention is used:
@itemize @bullet
@item
@option{path} is a path in your filesystem,
@item
@option{var} is a string and the correct value or an example will be given,
@item
@option{trigger} is a CMake option and the correct value is @code{ON} or
@code{OFF}.
@end itemize
With CMake, there are several ways to give options:
@enumerate
@item directly as CMake command line arguments
@item invoke @command{cmake <path to source directory>} once and then use
@command{ccmake <path to source directory>} to edit options through a
minimalist GUI (requires
@samp{cmake-curses-gui} to be installed on a Linux system)
@item invoke the @command{cmake-gui} command and fill in the information about
the location of the sources and where to build the project; you then have
access to options through a user-friendly Qt interface (requires
@samp{cmake-qt-gui} to be installed on a Linux system)
@end enumerate
Example of configuration using the command line:
@example
cmake ~/chameleon/ -DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=~/install \
-DCHAMELEON_USE_CUDA=ON \
-DCHAMELEON_USE_MPI=ON \
-DBLA_VENDOR=Intel10_64lp \
-DSTARPU_DIR=~/install/starpu-1.1 \
-DCHAMELEON_ENABLE_TRACING=ON
@end example
You can get the full list of options with the @option{-L[A][H]} option of the
@command{cmake} command:
@example
cmake -LH <path to source directory>
@end example
@menu
* General CMake options::
* CHAMELEON options::
@end menu
@node General CMake options
@subsection General CMake options
@table @code
@item -DCMAKE_INSTALL_PREFIX=@option{path} (default: @option{path=/usr/local})
Install directory used by @code{make install} where some headers and libraries
will be copied.
Permissions have to be granted to write onto @option{path} during the
@code{make install} step.
@item -DCMAKE_BUILD_TYPE=@option{var} (default: @option{Release})
Define the build type and the compiler optimization level.
The possible values for @option{var} are:
@table @code
@item empty
@item Debug
@item Release
@item RelWithDebInfo
@item MinSizeRel
@end table
@item -DBUILD_SHARED_LIBS=@option{trigger} (default: @option{OFF})
Indicate whether CMake has to build CHAMELEON as static (@option{OFF}) or
shared (@option{ON}) libraries.
@end table
@node CHAMELEON options
@subsection CHAMELEON options
List of CHAMELEON options that can be enabled/disabled (value=@code{ON}
or @code{OFF}):
@table @code
@item @option{-DCHAMELEON_SCHED_STARPU}=@option{trigger} (default: @code{ON})
to link with StarPU library (runtime system)
@item @option{-DCHAMELEON_SCHED_QUARK}=@option{trigger} (default: @code{OFF})
to link with QUARK library (runtime system)
@item @option{-DCHAMELEON_USE_CUDA}=@option{trigger} (default: @code{OFF})
to link with CUDA runtime (implementation paradigm for accelerated codes on
GPUs) and cuBLAS library (optimized BLAS kernels on GPUs), can only be used with
StarPU
@item @option{-DCHAMELEON_USE_MPI}=@option{trigger} (default: @code{OFF})
to link with MPI library (message passing implementation for use of multiple
nodes with distributed memory), can only be used with StarPU
@item @option{-DCHAMELEON_ENABLE_TRACING}=@option{trigger} (default: @code{OFF})
to enable trace generation during execution of timing drivers.
It requires StarPU to be linked with FxT library (trace execution of kernels on workers).
@item @option{-DCHAMELEON_SIMULATION=trigger} (default: @code{OFF})
to enable simulation mode, meaning that CHAMELEON will not actually execute
tasks; see details in section @ref{Use simulation mode with StarPU-SimGrid}.
This option must be used with StarPU compiled with
@uref{http://simgrid.gforge.inria.fr/, SimGrid}, which makes it possible to
predict the execution time on any architecture.
This feature should be used to experiment with scheduler behaviors and
performance, not to produce solutions of linear systems.
@item @option{-DCHAMELEON_ENABLE_DOCS=trigger} (default: @code{ON})
to control the build of the documentation contained in the @file{docs/}
sub-directory
@item @option{-DCHAMELEON_ENABLE_EXAMPLE=trigger} (default: @code{ON})
to control the build of the example executables (API usage)
contained in the @file{example/} sub-directory
@item @option{-DCHAMELEON_ENABLE_TESTING=trigger} (default: @code{ON})
to control the build of the testing executables (numerical checks) contained
in the @file{testing/} sub-directory
@item @option{-DCHAMELEON_ENABLE_TIMING=trigger} (default: @code{ON})
to control the build of the timing executables (performance checks) contained
in the @file{timing/} sub-directory
@item @option{-DCHAMELEON_PREC_S=trigger} (default: @code{ON})
to enable support for single precision arithmetic (float in C)
@item @option{-DCHAMELEON_PREC_D=trigger} (default: @code{ON})
to enable support for double precision arithmetic (double in C)
@item @option{-DCHAMELEON_PREC_C=trigger} (default: @code{ON})
to enable support for single precision complex arithmetic (complex in C)
@item @option{-DCHAMELEON_PREC_Z=trigger} (default: @code{ON})
to enable support for double precision complex arithmetic (double complex
in C)
@item @option{-DBLAS_VERBOSE=trigger} (default: @code{OFF})
to make BLAS library discovery verbose
@item @option{-DLAPACK_VERBOSE=trigger} (default: @code{OFF})
to make LAPACK library discovery verbose (automatically enabled if
@option{BLAS_VERBOSE=@code{ON}})
@end table
List of CHAMELEON options that need a specific value:
@table @code
@item @option{-DBLA_VENDOR=@option{var}} (default: @option{empty})
The possible values for @option{var} are:
@table @code
@item empty
@item all
@item Intel10_64lp
@item Intel10_64lp_seq
@item ACML
@item Apple
@item Generic
@item ...
@end table
to force CMake to find a specific BLAS library, see the full list of BLA_VENDOR
in @file{FindBLAS.cmake} in @file{cmake_modules/morse/find}.
By default @option{BLA_VENDOR} is empty, so that CMake tries to detect all
possible BLAS vendors, with a preference for Intel MKL.
@end table
List of CHAMELEON options that require a path:
@table @code
@item @option{-DLIBNAME_DIR=@option{path}} (default: empty)
root directory of the LIBNAME library installation
@item @option{-DLIBNAME_INCDIR=@option{path}} (default: empty)
directory of the LIBNAME library headers installation
@item @option{-DLIBNAME_LIBDIR=@option{path}} (default: empty)
directory of the LIBNAME libraries (.so, .a, .dylib, etc.) installation
@end table
LIBNAME can be one of the following: BLAS - CBLAS - FXT - HWLOC -
LAPACK - LAPACKE - QUARK - STARPU - TMG.
See the @ref{Dependencies detection} section for details.
Libraries detected with an official CMake module (see module files in
@file{CMAKE_ROOT/Modules/}):
@itemize @bullet
@item CUDA
@item MPI
@item Threads
@end itemize
Libraries detected with CHAMELEON cmake modules (see module files in
@file{cmake_modules/morse/find/} directory of CHAMELEON sources):
@itemize @bullet
@item BLAS
@item CBLAS
@item FXT
@item HWLOC
@item LAPACK
@item LAPACKE
@item QUARK
@item STARPU
@item TMG
@end itemize
@node Dependencies detection
@section Dependencies detection
You have different choices to detect dependencies on your system, either by
setting some environment variables containing the paths to the libraries and
headers, or by specifying them directly at CMake configure time.
Different cases:
@enumerate
@item detection of dependencies through environment variables:
@itemize @bullet
@item @env{LD_LIBRARY_PATH} environment variable should contain the list of
paths where the libraries can be found:
@example
export @env{LD_LIBRARY_PATH}=$@env{LD_LIBRARY_PATH}:path/to/your/libs
@end example
@item @env{INCLUDE} environment variable should contain the list of paths
where the header files of the libraries can be found:
@example
export @env{INCLUDE}=$@env{INCLUDE}:path/to/your/headers
@end example
@end itemize
@item detection with user-provided paths:
@itemize @bullet
@item you can specify the path at CMake configure time by invoking
@example
cmake <path to SOURCE_DIR> -DLIBNAME_DIR=path/to/your/lib
@end example
where LIBNAME stands for the name of the library to look for, for example
@example
cmake <path to SOURCE_DIR> -DSTARPU_DIR=path/to/starpudir \
-DCBLAS_DIR= ...
@end example
@item it is also possible to specify the header and library directories
separately, for example
@example
cmake <path to SOURCE_DIR> \
-DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \
-DSTARPU_LIBDIR=path/to/libstarpu/lib
@end example
@item Note that BLAS and LAPACK detection can be tedious, so we provide a
verbose mode. Use @option{-DBLAS_VERBOSE=ON} or @option{-DLAPACK_VERBOSE=ON} to
enable it.
@end itemize
@end enumerate
@c @node Dependencies compilation
@c @section Dependencies compilation
@node Use FxT profiling through StarPU
@section Use FxT profiling through StarPU
StarPU can generate its own trace log files by compiling it with the
@option{--with-fxt}
option at the configure step (you may have to specify the directory where you
installed FxT by giving @option{--with-fxt=...} instead of @option{--with-fxt}
alone).
By doing so, traces are generated after each execution of a program that uses
StarPU, in the directory pointed to by the @env{STARPU_FXT_PREFIX} environment
variable. Example:
@example
export @env{STARPU_FXT_PREFIX}=/home/yourname/fxt_files/
@end example
When executing a @command{./timing/...} CHAMELEON program, if tracing has been
enabled (StarPU compiled with FxT and @option{-DCHAMELEON_ENABLE_TRACING=ON}), you
can give the option @option{--trace} to tell the program to generate trace log
files.
Finally, to generate the trace file, which can be opened with the
@uref{http://vite.gforge.inria.fr/, ViTE} program, you have to use the
@command{starpu_fxt_tool} executable of StarPU.
This tool should be in @file{path/to/your/install/starpu/bin}.
You can use it to generate the trace file like this:
@itemize @bullet
@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename}
There is one file per MPI process (prof_filename_0, prof_filename_1, ...).
To generate a trace of MPI programs you can call it like this:
@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i
prof_filename*}
The trace file will be named paje.trace (use the -o option to specify an output
name).
@end itemize
Alternatively, one can also directly generate .paje trace files after the
execution by setting @env{STARPU_GENERATE_TRACE=1}.
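To summarize, a complete tracing session could look as follows (the paths and
the driver name are illustrative, not prescriptive):
@example
export STARPU_FXT_PREFIX=/home/yourname/fxt_files/
./timing/<your timing driver> --trace
path/to/your/install/starpu/bin/starpu_fxt_tool -i /home/yourname/fxt_files/prof_filename*
@end example
The resulting @file{paje.trace} can then be opened with ViTE.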
@node Use simulation mode with StarPU-SimGrid
@section Use simulation mode with StarPU-SimGrid
Simulation mode can be enabled by setting the cmake option
@option{-DCHAMELEON_SIMULATION=ON}.
This mode allows you to simulate the execution of algorithms with StarPU
compiled with @uref{http://simgrid.gforge.inria.fr/, SimGrid}.
To do so, we provide some perfmodels in the @file{simucore/perfmodels/}
directory of CHAMELEON sources.
To use these perfmodels, please set the following:
@itemize @bullet
@item @env{STARPU_HOME} environment variable to:
@example
@code{<path to SOURCE_DIR>/simucore/perfmodels}
@end example
@item @env{STARPU_HOSTNAME} environment variable to the name of the machine to
simulate. For example, on our platform (PlaFRIM) with GPUs at Inria Bordeaux:
@example
@env{STARPU_HOSTNAME}=mirage
@end example
Note that only POTRF kernels with block sizes of 320 or 960 (single and double
precision) on the mirage machine are available for now.
The database of models is subject to change; it should be enriched in the near
future.
@end itemize
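In summary, a hypothetical simulated run could be set up as follows (the
driver name is illustrative):
@example
export STARPU_HOME=<path to SOURCE_DIR>/simucore/perfmodels
export STARPU_HOSTNAME=mirage
./timing/<your timing driver>
@end example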
@node Use out of core support with StarPU
@section Use out of core support with StarPU
If the matrix cannot fit in the main memory, StarPU can automatically evict
tiles to the disk. The descriptors for the matrices that cannot fit in the
main memory need to be created with @code{MORSE_Desc_Create_OOC}, so that MORSE
does not force StarPU to keep them in the main memory.
The following variables then need to be set:
@itemize @bullet
@item @env{STARPU_DISK_SWAP} environment variable to a location where evicted
tiles will be stored, for example:
@example
@env{STARPU_DISK_SWAP}=/tmp
@end example
@item @env{STARPU_DISK_SWAP_BACKEND} environment variable to the I/O method,
for example:
@example
@env{STARPU_DISK_SWAP_BACKEND}=unistd_o_direct
@end example
@item @env{STARPU_LIMIT_CPU_MEM} environment variable to the amount of memory
that can be used in MBytes, for example:
@example
@env{STARPU_LIMIT_CPU_MEM}=1000
@end example
@end itemize
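Putting it all together, a minimal out-of-core run could look as follows (the
paths and the program name are illustrative, not prescriptive):
@example
export STARPU_DISK_SWAP=/tmp
export STARPU_DISK_SWAP_BACKEND=unistd_o_direct
export STARPU_LIMIT_CPU_MEM=1000
./my_chameleon_ooc_program
@end example
where @code{my_chameleon_ooc_program} is assumed to create its matrix
descriptors with @code{MORSE_Desc_Create_OOC}.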
# This file is part of the CHAMELEON User's Guide.
# Copyright (C) 2017 Inria
# See the file ../users_guide.org for copying conditions.
** MORSE project
#+NAME: fig:morse_header
#+ATTR_HTML: :align center
[[file:morse_header.png]]
*** MORSE Objectives
When processor clock speeds flatlined in 2004, after more than
fifteen years of exponential increases, the era of near automatic
performance improvements that the HPC application community had
previously enjoyed came to an abrupt end. To develop software that
will perform well on petascale and exascale systems with thousands
of nodes and millions of cores, the list of major challenges that
must now be confronted is formidable:
1) dramatic escalation in the costs of intrasystem communication
between processors and/or levels of memory hierarchy;
2) increased heterogeneity of the processing units (mixing CPUs,
GPUs, etc. in varying and unexpected design combinations);
3) high levels of parallelism and more complex constraints mean
that cooperating processes must be dynamically and unpredictably
scheduled for asynchronous execution;
4) software will not run at scale without much better resilience to
faults and far more robustness; and
5) new levels of self-adaptivity will be required to enable
software to modulate process speed in order to satisfy limited
energy budgets.
The MORSE associate team will tackle the first three challenges in
an orchestrated effort between research groups respectively
specialized in sparse linear algebra, dense linear algebra and
runtime systems. The overall objective is to develop robust linear
algebra libraries relying on innovative runtime systems that can
fully benefit from the potential of those future large-scale
complex machines. Challenges 4) and 5) will also be investigated
by the different teams in the context of other partnerships, but
they will not be the main focus of the associate team as they are
much more prospective.
*** Research fields
The overall goal of the MORSE associate team is to enable advanced
numerical algorithms to be executed on a scalable unified runtime
system for exploiting the full potential of future exascale
machines. We expect advances in three directions based first on
strong and close interactions between the runtime and numerical
linear algebra communities. This initial activity will then
naturally expand to more focused but still joint research in both
fields.
**** Fine interaction between linear algebra and runtime systems
On parallel machines, HPC applications need to take care of data
movement and consistency, which can be either explicitly managed
at the level of the application itself or delegated to a runtime
system. We adopt the latter approach in order to better keep up
with hardware trends whose complexity is growing exponentially.
One major task in this project is to define a proper interface
between HPC applications and runtime systems in order to maximize
productivity and expressivity. As mentioned in the next section,
a widely used approach consists in abstracting the application as
a DAG that the runtime system is in charge of scheduling.
Scheduling such a DAG over a set of heterogeneous processing units
introduces a lot of new challenges, such as predicting accurately
the execution time of each type of task over each kind of unit,
minimizing data transfers between memory banks, performing data
prefetching, etc. Expected advances: In a nutshell, a new runtime
system API will be designed to allow applications to provide
scheduling hints to the runtime system and to get real-time
feedback about the consequences of scheduling decisions.
**** Runtime systems
A runtime environment is an intermediate layer between the system
and the application. It provides low-level functionality not
provided by the system (such as scheduling or management of the
heterogeneity) and high-level features (such as performance
portability). In the framework of this proposal, we will work on
the scalability of runtime environments. To achieve scalability it
is required to avoid all centralization. Here, the main problem
is the scheduling of the tasks. In many task-based runtime
environments the scheduler is centralized and becomes a bottleneck
as soon as too many cores are involved. It is therefore required
to distribute the scheduling decisions or to compute a data
distribution that imposes the mapping of tasks using, for instance,
the so-called ``owner-compute'' rule. Expected advances: We will
design runtime systems that enable an efficient and scalable use
of thousands of distributed multicore nodes enhanced with
accelerators.
**** Linear algebra
Because of its central position in HPC and of the well understood
structure of its algorithms, dense linear algebra has often
pioneered new challenges that HPC had to face. Again, dense
linear algebra has been in the vanguard of the new era of
petascale computing with the design of new algorithms that can
efficiently run on a multicore node with GPU accelerators. These
algorithms are called ``communication-avoiding'' since they have
been redesigned to limit the amount of communication between
processing units (and between the different levels of memory
hierarchy). They are expressed through Directed Acyclic Graphs
(DAG) of fine-grained tasks that are dynamically
scheduled. Expected advances: First, we plan to investigate the
impact of these principles in the case of sparse applications
(whose algorithms are slightly more complicated but often rely on
dense kernels). Furthermore, both in the dense and sparse cases,
the scalability on thousands of nodes is still limited; new
numerical approaches need to be found. We will specifically
design sparse hybrid direct/iterative methods that represent a
promising approach.
*** Research papers
Research papers about MORSE can be found at
http://icl.cs.utk.edu/projectsdev/morse/pubs/index.html
** CHAMELEON
*** CHAMELEON software
The main purpose is to address the performance shortcomings of the
[[http://www.netlib.org/lapack/][LAPACK]] and [[http://www.netlib.org/scalapack/][ScaLAPACK]] libraries on multicore processors and
multi-socket systems of multicore processors and their inability to
efficiently utilize accelerators such as Graphics Processing Units
(GPUs).
CHAMELEON is a framework written in C which provides routines to
solve dense general systems of linear equations, symmetric positive
definite systems of linear equations and linear least squares
problems, using LU, Cholesky, QR and LQ factorizations. Real
arithmetic and complex arithmetic are supported in both single
precision and double precision. It supports Linux and Mac OS/X
machines (only tested on Intel x86-64 architecture).
CHAMELEON is based on the [[http://icl.cs.utk.edu/plasma/][PLASMA]] source code but is not limited to
shared-memory environments and can exploit multiple GPUs. CHAMELEON
is interfaced in a generic way with both [[http://icl.cs.utk.edu/quark/][QUARK]] and [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] runtime
systems. This feature makes it possible to analyze, in a unified
framework, how sequential task-based algorithms behave with
different runtime system implementations. Using CHAMELEON with the
[[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] runtime system makes it possible to exploit GPUs through
kernels provided by [[https://developer.nvidia.com/cublas][cuBLAS]] and clusters of interconnected nodes
with distributed memory (using [[http://www.open-mpi.org/][MPI]]). Computation of very large
systems with dense matrices on a cluster of nodes is still
experimental and being stabilized; do not expect stable performance
from the current version over MPI.
*** PLASMA's design principles
CHAMELEON was originally based on [[http://icl.cs.utk.edu/plasma/][PLASMA]], so its design principles
are very similar. The content of this section has been copied from
the /Design principles/ section of the PLASMA User's Guide.
**** Tile Algorithms
Tile algorithms are based on the idea of processing the matrix by
square tiles of relatively small size, such that a tile fits
entirely in one of the cache levels associated with one core.
This way a tile can be loaded to the cache and processed
completely before being evicted back to the main memory. Of the
three types of cache misses, *compulsory*, *capacity* and *conflict*,
the use of tile algorithms minimizes the number of capacity
misses, since each operation loads the amount of data that does
not ``overflow'' the cache.
For some operations such as matrix multiplication and Cholesky
factorization, translating the classic algorithm to the tile
algorithm is trivial. In the case of matrix multiplication, the
tile algorithm is simply a product of applying the technique of
*loop tiling* to the canonical definition of three nested loops. It
is very similar for the Cholesky factorization. The *left-looking*
definition of Cholesky factorization from LAPACK is a loop with a
sequence of calls to four routines: xSYRK (symmetric *rank-k*
update), xPOTRF (Cholesky factorization of a small block on the
diagonal), xGEMM (matrix multiplication) and xTRSM (triangular
solve). If the xSYRK, xGEMM and xTRSM operations are expressed
with the canonical definition of three nested loops and the
technique of loop tiling is applied, the tile algorithm results.
Since the algorithm is produced by simple reordering of
operations, neither the number of operations nor numerical
stability of the algorithm are affected.
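For illustration, a tile Cholesky factorization (here the
*right-looking* variant) can be sketched as the loop nest below, where
=A(m,n)= stands for a hypothetical accessor to the tile at tile
coordinates (m,n), =NT= is the number of tile rows/columns, and each
kernel call operates on whole tiles; this is a sketch, not actual
CHAMELEON code.
#+BEGIN_SRC C
/* Sketch of a tile Cholesky factorization (lower triangular case).
 * A(m,n), NT and the x* kernel names are hypothetical placeholders. */
for (int k = 0; k < NT; k++) {
    xPOTRF(A(k,k));                        /* factorize the diagonal tile  */
    for (int m = k+1; m < NT; m++)
        xTRSM(A(k,k), A(m,k));             /* triangular solves in panel k */
    for (int n = k+1; n < NT; n++) {
        xSYRK(A(n,k), A(n,n));             /* rank-k update, diagonal tile */
        for (int m = n+1; m < NT; m++)
            xGEMM(A(m,k), A(n,k), A(m,n)); /* update off-diagonal tiles    */
    }
}
#+END_SRC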
The situation becomes slightly more complicated for LU and QR
factorizations, where the classic algorithms factorize an entire
panel of the matrix (a block of columns) at every step of the
algorithm. One can observe, however, that the process of matrix
factorization is synonymous with introducing zeros in appropriate
places and a tile algorithm can be thought of as one that zeroes
one tile of the matrix at a time. This process is referred to as
updating of a factorization or *incremental factorization*. The
process is equivalent to factorizing the top tile of a panel, then
placing the upper triangle of the result on top of the tile below
and factorizing again, then moving to the next tile and so on.
Here, the tile LU and QR algorithms perform slightly more floating
point operations and require slightly more memory for auxiliary
data. Also, the tile LU factorization applies a different
pivoting pattern and, as a result, is less numerically stable than
classic LU with full pivoting. Numerical stability is not an
issue in case of the tile QR, which relies on orthogonal
transformations (Householder reflections), which are numerically
stable.
#+CAPTION: Schematic illustration of the tile LU factorization (kernel names for real arithmetic in double precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
#+NAME: fig:tile_lu
#+ATTR_HTML: :width 640px :align center
[[file:tile_lu.jpg]]
**** Tile Data Layout
Tile layout is based on the idea of storing the matrix by square
tiles of relatively small size, such that each tile occupies a
continuous memory region. This way a tile can be loaded to the
cache memory efficiently and the risk of evicting it from the
cache memory before it is completely processed is minimized. Of
the three types of cache misses, *compulsory*, *capacity* and
*conflict*, the use of tile layout minimizes the number of conflict
misses, since a continuous region of memory will completely fill
out a /set-associative/ cache memory before an eviction can
happen. Also, from the standpoint of multithreaded execution, the
probability of *false sharing* is minimized. It can only
affect the cache lines containing the beginning and the ending of
a tile.
In standard *cache-based* architectures, tiles continuously laid out
in memory maximize the benefit of automatic prefetching. Tile
layout is also beneficial in situations involving the use of
accelerators, where explicit communication of tiles through DMA
transfers is required, such as moving tiles between the system
memory and the local store in Cell B. E. or moving tiles between
the host memory and the device memory in GPUs. In most
circumstances tile layout also minimizes the number of TLB misses
and conflicts to memory banks or partitions. With the standard
(*column-major*) layout, access to each column of a tile is much
more likely to cause a conflict miss, a false sharing miss, a TLB
miss or a bank or partition conflict. The use of the standard
layout for dense matrix operations is a performance minefield.
Although occasionally one can pass through it unscathed, the risk
of hitting a spot deadly to performance is very high.
Another property of the layout utilized in PLASMA is that it is
``flat'', meaning that it does not involve a level of
indirection. Each tile stores a small square submatrix of the main
matrix in a *column-major* layout. In turn, the main matrix is an
arrangement of tiles immediately following one another in a
*column-major* layout. The offset of each tile can be calculated
through address arithmetics and does not involve pointer
indirection. Alternatively, a matrix could be represented as an
array of pointers to tiles, located anywhere in memory. Such a
layout would be a radical and unjustifiable departure from LAPACK
and ScaLAPACK. Flat tile layout is a natural progression from
LAPACK's *column-major* layout and ScaLAPACK's
/block-cyclic/ layout.
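As an illustration of the address arithmetic involved, the offset of a
tile, and of an element within it, can be computed as in the sketch
below (names are illustrative; square =NB= x =NB= tiles and =LMT= tile
rows are assumed).
#+BEGIN_SRC C
#include <stddef.h>

/* Start of tile (m,n): tiles follow one another in column-major
 * order of tiles, each occupying NB*NB contiguous elements. */
size_t tile_offset(int m, int n, int NB, int LMT)
{
    return ((size_t)n * LMT + m) * (size_t)NB * NB;
}

/* Element (i,j) inside tile (m,n): each tile is itself column-major. */
size_t elem_offset(int m, int n, int i, int j, int NB, int LMT)
{
    return tile_offset(m, n, NB, LMT) + (size_t)j * NB + i;
}
#+END_SRC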
Another related property of PLASMA's tile layout is that it
includes provisions for padding of tiles, i.e., the actual region
of memory designated for a tile can be larger than the memory
occupied by the actual data. This makes it possible to force a
certain alignment of tile boundaries, while using the flat
organization described in the previous paragraph. The motivation is
that, at the price of a small memory overhead, alignment of tile
boundaries may prove beneficial in multiple scenarios involving memory
systems of standard multicore processors, as well as accelerators.
The issues that come into play are, again, the use of TLBs and
memory banks or partitions.
#+CAPTION: Schematic illustration of the tile layout with *column-major* order of tiles, *column-major* order of elements within tiles and (optional) padding for enforcing a certain alignment of tile boundaries, courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
#+NAME: fig:tile_layout
#+ATTR_HTML: :width 640px :align center
[[file:tile_layout.jpg]]
**** Dynamic Task Scheduling
Dynamic scheduling is the idea of assigning work to cores based on
the availability of data for processing at any given point in time
and is also referred to as *data-driven* scheduling. The concept is
related closely to the idea of expressing computation through a
task graph, often referred to as the DAG (*Directed Acyclic Graph*),
and the flexibility of exploring the DAG at runtime. Thus, to a
large extent, dynamic scheduling is synonymous with *runtime
scheduling*. An important concept here is the one of the *critical
path*, which defines the upper bound on the achievable parallelism,
and needs to be pursued at the maximum speed. This is in direct
opposition to the *fork-and-join* or *data-parallel* programming
models, where artificial synchronization points expose serial
sections of the code, where multiple cores are idle, while
sequential processing takes place. The use of dynamic scheduling
introduces a *trade-off*, though. The more dynamic (flexible)
scheduling is, the more centralized (and less scalable) the
scheduling mechanism is. For that reason, currently PLASMA uses
two scheduling mechanisms, one which is fully dynamic and one
where work is assigned statically and dependency checks are done
at runtime.
The first scheduling mechanism relies on unfolding a *sliding
window* of the task graph at runtime and scheduling work by
resolving data hazards: *Read After Write (RAW)*, *Write After Read
(WAR)* and *Write After Write (WAW)*, a technique analogous to
instruction scheduling in superscalar processors. It also relies
on *work-stealing* for balancing the load among all cores.
The second scheduling mechanism relies on statically designating a
path through the execution space of the algorithm to each core and
following a cycle: transition to a task, wait for its
dependencies, execute it, update the overall progress. Tasks are
identified by tuples and task transitions are done through locally
evaluated formulas. Progress information can be centralized,
replicated or distributed (currently centralized).
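As a purely illustrative sketch of the data-driven model, a task-based
runtime derives the DAG from the access modes (=R=, =W=, =RW=)
declared at task insertion time; the names below are hypothetical, not
an actual runtime API.
#+BEGIN_SRC C
/* Hypothetical task insertion for one step k of tile Cholesky: the
 * runtime infers the RAW/WAR/WAW dependencies from the access modes
 * and fires each task as soon as its operands are ready. */
insert_task(POTRF, RW, A(k,k));
insert_task(TRSM,  R, A(k,k), RW, A(m,k));
insert_task(SYRK,  R, A(m,k), RW, A(m,m));
insert_task(GEMM,  R, A(m,k), R, A(n,k), RW, A(m,n));
wait_all();  /* a single synchronization point at the very end */
#+END_SRC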
#+CAPTION: A trace of the tile QR factorization executing on eight cores without any global synchronization points (kernel names for real arithmetic in single precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
#+NAME: fig:trace_qr
#+ATTR_HTML: :width 640px :align center
[[file:trace_qr.jpg]]
doc/orgmode/figures/morse_header.png (6.15 KiB, file added)
doc/orgmode/figures/potri_async.png (525 KiB, file added)
doc/orgmode/figures/tile_layout.jpg (19.5 KiB, file added)
doc/orgmode/figures/tile_lu.jpg (49 KiB, file added)
doc/orgmode/figures/trace_qr.jpg (35.8 KiB, file added)
body {
padding: 2em 1em 2em 70px;
margin: 0;
font-family: sans-serif;
color: black;
background: white;
background-position: top left;
background-attachment: fixed;
background-repeat: no-repeat;
}
:link { color: #00C; background: transparent }
:visited { color: #609; background: transparent }
a:active { color: #C00; background: transparent }
a:link img, a:visited img { border-style: none }
a img { color: white; }
@media all {
a img { color: inherit; }
}
th, td {
font-family: sans-serif;
}
h1, h2, h3, h4, h5, h6 { text-align: left }
h1, h2, h3 { color: #005A9C; background: white }
h1 { font: 170% sans-serif }
h2 { font: 140% sans-serif }
h3 { font: 120% sans-serif }
h4 { font: bold 100% sans-serif }
h5 { font: italic 100% sans-serif }
h6 { font: small-caps 100% sans-serif }
.hide { display: none }
div.head { margin-bottom: 1em }
div.head h1 { margin-top: 2em; clear: both }
div.head table { margin-left: 2em; margin-top: 2em }
p.copyright { font-size: small }
p.copyright small { font-size: small }
@media screen {
a[href]:hover { background: #ffa }
}
pre { margin-left: 2em }
dt, dd { margin-top: 0; margin-bottom: 0 }
dt { font-weight: bold }
pre, code { font-family: monospace }
ul.toc, ol.toc {
list-style: disc;
list-style: none;
}
@media aural {
h1, h2, h3 { stress: 20; richness: 90 }
.hide { speak: none }
p.copyright { volume: x-soft; speech-rate: x-fast }
dt { pause-before: 20% }
pre { speak-punctuation: code }
}
/*
body {
background-image: url();
}
*/
#+TITLE: CHAMELEON User's Guide
#+SUBTITLE: A dense linear algebra software for heterogeneous architectures
#+LANGUAGE: en
#+OPTIONS: H:3 num:t \n:nil @:t ::t |:t _:nil ^:nil -:t f:t *:t <:t
#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil pri:nil tags:not-in-toc html-style:nil
#+INCLUDE: "./version.org"
#+AUTHOR: version {{{VERSION}}}
* Version
This manual documents the usage of CHAMELEON *version {{{VERSION}}}*.
It was last updated on {{{UPDATED}}}.
* Authors
- Inria
- University of Tennessee
- University of Colorado Denver
- King Abdullah University of Science and Technology
* Copying
Copyright \copy 2017 Inria
Copyright \copy 2014 The University of Tennessee
Copyright \copy 2014 King Abdullah University of Science and Technology
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer listed
in this license in the documentation and/or other materials provided
with the distribution.
- Neither the name of the copyright holders nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
This software is provided by the copyright holders and contributors
"as is" and any express or implied warranties, including, but not
limited to, the implied warranties of merchantability and fitness for
a particular purpose are disclaimed. In no event shall the copyright
owner or contributors be liable for any direct, indirect, incidental,
special, exemplary, or consequential damages (including, but not
limited to, procurement of substitute goods or services; loss of use,
data, or profits; or business interruption) however caused and on any
theory of liability, whether in contract, strict liability, or tort
(including negligence or otherwise) arising in any way out of the use
of this software, even if advised of the possibility of such damage.
* Introduction to Chameleon
#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/introduction.org
* Installing Chameleon
#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/installing.org
# #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/configuration.org
# #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org
#+MACRO: UPDATED 25 August 2017
#+MACRO: UPDATED-MONTH August 2017
#+MACRO: EDITION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@
#+MACRO: VERSION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@