diff --git a/.gitmodules b/.gitmodules
index 0ab797b12027a56786a5464f6376ccec9350719c..5f0b612967c16d216bf3c4c816e9f9b431819f04 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,15 +1,12 @@
 [submodule "cmake_modules/morse_cmake"]
-	path = cmake_modules/morse_cmake
-	url = https://gitlab.inria.fr/solverstack/morse_cmake.git
+        path = cmake_modules/morse_cmake
+        url = https://gitlab.inria.fr/solverstack/morse_cmake.git
 [submodule "hqr"]
-	path = hqr
-	url = https://gitlab.inria.fr/solverstack/hqr.git
-[submodule "doc/orgmode/org-html-themes"]
-	path = doc/orgmode/org-html-themes
-	url = https://github.com/fniessen/org-html-themes.git
+        path = hqr
+        url = https://gitlab.inria.fr/solverstack/hqr.git
 [submodule "coreblas/hmat-oss"]
-	path = coreblas/hmat-oss
-	url = https://github.com/jeromerobert/hmat-oss.git
+        path = coreblas/hmat-oss
+        url = https://github.com/jeromerobert/hmat-oss.git
 [submodule "testing/test_fembem"]
-	path = testing/test_fembem
-	url = https://gitlab.inria.fr/solverstack/test_fembem.git
+        path = testing/test_fembem
+        url = https://gitlab.inria.fr/solverstack/test_fembem.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8dbc5f4b3fda1685968154087aa656d7ea21f88d..ddad6ab292083638bf3a78953b238d43812c91e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -569,7 +569,7 @@ add_custom_target(chameleon_all_sources ALL DEPENDS ${CHAMELEON_SOURCES_TARGETS}
-    add_custom_target(doc ALL DEPENDS doxygen-out doc-html-users_guide doc-pdf-users_guide)
+    add_custom_target(doc ALL DEPENDS doc-homepage doxygen-out)
diff --git a/CONTRIBUTING.org b/CONTRIBUTING.org
index 68757f7c27e5e9fcd85d2b181aa76384b8feb09f..debbd85d269b9d448c381a23fbe96baed711dee8 100644
@@ -1,170 +1,177 @@
-* To contribute to the project, you need to do it through merge request
-** Regular / Inria contributors
-*** Create a fork
-   First you need to fork the repository into your own account. You can
-   do that simply by clicking the fork button on the gitlab interface.
-   https://gitlab.inria.fr/solverstack/chameleon/forks/new
-   Then, clone the repository on your laptop:
-   #+begin_src sh
-   git clone git@gitlab.inria.fr:username/forkname.git
-   #+end_src
-   Once this is done, you can setup the chameleon repository as the
-   upstream of your clone to simplify the update of your fork
-   repository.
-   #+begin_src sh
-   git remote add upstream git@gitlab.inria.fr:solverstack/chameleon.git
-   #+end_src
-   To update your fork with the upstream chameleon's state:
-   #+begin_src sh
-   git pull upstream master
-   git push -u origin master
-   #+end_src
-*** Create a "Feature" branch in your fork
-   To add a new feature, fix a bug, and so on, you need to create a
-   new branch from the last state of the master branch
-   #+begin_src sh
-   git branch your_branch_name
-   git checkout your_branch_name
-   #+end_src
-   Apply your modifications in that "Feature" branch. Then, you need
-   to push this branch on your online repository
-   #+begin_src sh
-   git push origin your_branch_name
-   #+end_src
-*** Merge request
-   Once your branch is online, on the gitlab interface, go to the
-   branches webpage, select the branch you want to push as a merge
-   request, and push the button !!!
-   *Be careful to check the 'close after merge' check box, and to push
-   to the solverstack/chameleon repository*. By default the checkbox
-   may not be checked, and the default repository is your fork.
-   If the pull request is made to fix an issue, please name the branch
-   "issueXX" so it is automatically linked to the issue. In addition,
-   please add "fix issue #xx" in the comment of the pull request to
-   automatically close the issue when the PR is merged.
-*** Rebase on top of 'master'
-   In some cases your "feature" branch you want to merge into "master"
-   has a long life span so that your branch and the master branch
-   could make some conflicts. To avoid having to handle the possible
-   conflicts at *merge request* time, please rebase your "feature" on
-   top of "master" before pushing the button *merge request*.
-   To do that, just go at the HEAD of your "feature" branch and rebase
-   #+begin_src sh
-   git checkout feature
-   git rebase master
-   #+end_src
-   Then force to push on your origin
-   #+begin_src sh
-   git push --force origin feature
-   #+end_src
-   Then push the button *merge request*.
-** Occasional / external contributors
-*** Create a gitlab account
-Whereas [[https://gitlab.inria.fr/solverstack/chameleon][Chameleon]] is a public project and does not require an authentication
-to access it, a gitlab account is necessary to contribute. If you do not
-already have one, this is the first step to do. Note that Inria members can
-login directly with their Inria login in the *iLDAP* tab of the [[https://gitlab.inria.fr/users/sign_in][sign_in]] page.
-External users need to first [[https://gitlab-account.inria.fr/][register]] and can then login in the *Standard*
-tab of the [[https://gitlab.inria.fr/users/sign_in][sign_in]] page.
-*** Post an issue
-Create a new issue (see [[https://gitlab.inria.fr/solverstack/chameleon/issues][issues]]) presenting your contribution proposal (feature,
-fix, ...). The Chameleon team will set up a contribution branch for you. You can
-attach a patch to the issue, which we will use in this case to initiate the
-branch. In any case, we will then provide you with further instructions to work
-on the branch and eventually perform your merge request.
-* Configure a runner to test your branch
-  To be effectively merged, your branch must be tested through the
-  [[https://gitlab.inria.fr/help/ci/README.md][gitlab-ci]] mechanism.
-  In order to execute the tests the contributor should define his own
-  /gitlab runner/, /e.g/. his laptop or any other remote machine. To avoid
-  having to install the proper dependencies in every runners we use
-  the [[https://www.docker.com/][Docker]] image /hpclib/hiepacs/ whose recipe is defined
-  [[https://gitlab.inria.fr/sed-bso/hpclib/blob/master/tools/dockerfiles/hiepacs/Dockerfile][here]]. Consequently, to register a compatible runner the requirements
-  on the system are :
-  * OS must be Linux
-  * Docker must be installed, e.g.
+** To contribute to the project, you need to do it through merge request
+*** Regular / Inria contributors
+**** Create a fork
+     First you need to fork the repository into your own account. You can
+     do that simply by clicking the fork button on the gitlab interface.
+     https://gitlab.inria.fr/solverstack/chameleon/forks/new
+     Then, clone the repository on your laptop:
+     #+begin_src sh
+     git clone git@gitlab.inria.fr:username/forkname.git
+     #+end_src
+     Once this is done, you can setup the chameleon repository as the
+     upstream of your clone to simplify the update of your fork
+     repository.
+     #+begin_src sh
+     git remote add upstream git@gitlab.inria.fr:solverstack/chameleon.git
+     #+end_src
+     To update your fork with the upstream chameleon's state:
+     #+begin_src sh
+     git pull upstream master
+     git push -u origin master
+     #+end_src
+**** Create a "Feature" branch in your fork
+     To add a new feature, fix a bug, and so on, you need to create a
+     new branch from the last state of the master branch
+     #+begin_src sh
+     git branch your_branch_name
+     git checkout your_branch_name
+     #+end_src
+     Apply your modifications in that "Feature" branch. Then, you need
+     to push this branch on your online repository
+     #+begin_src sh
+     git push origin your_branch_name
+     #+end_src
+**** Merge request
+     Once your branch is online, on the gitlab interface, go to the
+     branches webpage, select the branch you want to push as a merge
+     request, and push the button !!!
+     *Be careful to check the 'close after merge' check box, and to push
+     to the solverstack/chameleon repository*. By default the checkbox
+     may not be checked, and the default repository is your fork.
+     If the pull request is made to fix an issue, please name the branch
+     "issueXX" so it is automatically linked to the issue. In addition,
+     please add "fix issue #xx" in the comment of the pull request to
+     automatically close the issue when the PR is merged.
+**** Rebase on top of 'master'
+     In some cases the "feature" branch you want to merge into "master"
+     has a long life span, so your branch and the master branch may
+     conflict. To avoid having to handle possible conflicts at
+     *merge request* time, please rebase your "feature" branch on
+     top of "master" before pushing the *merge request* button.
+     To do that, just go to the HEAD of your "feature" branch and rebase
+     #+begin_src sh
+     git checkout feature
+     git rebase master
+     #+end_src
+     Then force-push to your origin
+     #+begin_src sh
+     git push --force origin feature
+     #+end_src
+     Then push the button *merge request*.
+*** Occasional / external contributors
+**** Create a gitlab account
+     Whereas [[https://gitlab.inria.fr/solverstack/chameleon][Chameleon]] is a public project and does not require an
+     authentication to access it, a gitlab account is necessary to
+     contribute. If you do not already have one, this is the first
+     step to do.
+     Inria members can log in directly with their Inria login in the
+     *iLDAP* tab of the [[https://gitlab.inria.fr/users/sign_in][sign_in]] page.
+     External users need to ask for an [[https://external-account.inria.fr/][external account]] by sending an email
+     to [[mailto:mathieu.faverge@inria.fr][mathieu.faverge@inria.fr]], and can then log in through the *Standard* tab of
+     the [[https://gitlab.inria.fr/users/sign_in][sign_in]] page.
+**** Post an issue
+     Create a new issue (see [[https://gitlab.inria.fr/solverstack/chameleon/issues][issues]]) presenting your contribution
+     proposal (feature, fix, ...). The Chameleon team will set up a
+     contribution branch for you. You can attach a patch to the issue,
+     which we will use in this case to initiate the branch. In any
+     case, we will then provide you with further instructions to work
+     on the branch and eventually perform your merge request.
+** Configure a runner to test your branch
+   To be effectively merged, your branch must be tested through the
+   [[https://gitlab.inria.fr/help/ci/README.md][gitlab-ci]] mechanism.
+   In order to execute the tests the contributor should define his own
+   /gitlab runner/, /e.g/. his laptop or any other remote machine. To
+   avoid having to install the proper dependencies in every runner we
+   use the [[https://www.docker.com/][Docker]] image
+   *registry.gitlab.inria.fr/solverstack/docker/distrib* whose recipe is
+   defined [[https://gitlab.inria.fr/solverstack/docker/-/blob/master/dockerfile-distrib][here]]. Consequently, to register a compatible runner the
+   requirements on the system are :
+   * OS must be Linux
+   * Docker must be installed, e.g.
+     #+begin_src sh
+     sudo apt-get update && sudo apt-get install -y curl
+     curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+     sudo apt install -y software-properties-common
+     sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
+     sudo apt-get update
+     sudo apt install -y docker-ce
+     sudo usermod -aG docker ${USER}
+     newgrp docker
+     #+end_src
+**** Register your runner
+     Please read first the [[https://gitlab.inria.fr/help/ci/runners/README.md][Gitlab documentation]] for general information
+     about runners registration.
+     Three steps are required:
+     1) install the gitlab-runner program
+     2) register your runner to your project (your fork of Chameleon)
+     3) start gitlab-runner as a service
+     #+begin_src sh
+     # install gitlab-runner
+     sudo wget -O /usr/local/bin/gitlab-runner https://gitlab-ci-multi-runner-downloads.s3.amazonaws.com/latest/binaries/gitlab-ci-multi-runner-linux-amd64
+     sudo chmod +x /usr/local/bin/gitlab-runner
+     sudo useradd --comment 'GitLab Runner' --create-home gitlab-runner --shell /bin/bash
+     # register runner to https://gitlab.inria.fr/
+     sudo gitlab-runner register # see just after for an example
+     # install and run as a service
+     sudo gitlab-runner install --user=gitlab-runner --working-directory=/home/gitlab-runner
+     sudo gitlab-runner start
+     #+end_src
+     Example of registering sequence:
+     #+begin_example
+     sudo gitlab-runner register
+     Please enter the gitlab-ci coordinator URL (e.g. https://gitlab.com/):
+     https://gitlab.inria.fr/
+     Please enter the gitlab-ci token for this runner:
+     # copy/paste the project's secret token here
+     Please enter the gitlab-ci description for this runner:
+     [ubuntu1604]:
+     Please enter the gitlab-ci tags for this runner (comma separated):
+     linux, ubuntu
+     Whether to run untagged builds [true/false]:
+     [false]: true
+     Whether to lock Runner to current project [true/false]:
+     [false]:
+     Registering runner... succeeded                     runner=4jknGvoz
+     Please enter the executor: shell, ssh, docker+machine, docker-ssh+machine, kubernetes, docker, parallels, virtualbox, docker-ssh:
+     docker
+     Please enter the default Docker image (e.g. ruby:2.1):
+     ubuntu
+     Runner registered successfully. Feel free to start it, but if it's running already the config should be automatically reloaded!
+     #+end_example
+** To review locally a private pull request submitted by someone else
+    Get the patch from the pull request (Need to update that !!!!
+    Coming from bitbucket)
     #+begin_src sh
-    sudo apt-get update && sudo apt-get install -y curl
-    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
-    sudo apt install -y software-properties-common
-    sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
-    sudo apt-get update
-    sudo apt install -y docker-ce
-    sudo usermod -aG docker ${USER}
-    newgrp docker
+    curl https://bitbucket.org/api/2.0/repositories/icldistcomp/parsec/pullrequests/#PR/patch > pr#PR.patch
-*** Register your runner
-    Please read first the [[https://gitlab.inria.fr/help/ci/runners/README.md][Gitlab documentation]] for general information
-    about runners registration.
-    Three steps are required:
-    1) install the gitlab-runner program
-    2) register your runner to your project (your fork of Chameleon)
-    3) start gitlab-runner as a service
+    Then apply the patch on your local copy
     #+begin_src sh
-    # install gitlab-runner
-    sudo wget -O /usr/local/bin/gitlab-runner https://gitlab-ci-multi-runner-downloads.s3.amazonaws.com/latest/binaries/gitlab-ci-multi-runner-linux-amd64
-    sudo chmod +x /usr/local/bin/gitlab-runner
-    sudo useradd --comment 'GitLab Runner' --create-home gitlab-runner --shell /bin/bash
-    # register runner to https://gitlab.inria.fr/
-    sudo gitlab-runner register # see just after for an example
-    # install and run as a service
-    sudo gitlab-runner install --user=gitlab-runner --working-directory=/home/gitlab-runner
-    sudo gitlab-runner start
+    git apply pr#PR.patch
-    Example of registering sequence:
-    #+begin_example
-    sudo gitlab-runner register
-    Please enter the gitlab-ci coordinator URL (e.g. https://gitlab.com/):
-    https://gitlab.inria.fr/
-    Please enter the gitlab-ci token for this runner:
-    # copy/paste the project's secret token here
-    Please enter the gitlab-ci description for this runner:
-    [ubuntu1604]:
-    Please enter the gitlab-ci tags for this runner (comma separated):
-    linux, ubuntu
-    Whether to run untagged builds [true/false]:
-    [false]: true
-    Whether to lock Runner to current project [true/false]:
-    [false]:
-    Registering runner... succeeded                     runner=4jknGvoz
-    Please enter the executor: shell, ssh, docker+machine, docker-ssh+machine, kubernetes, docker, parallels, virtualbox, docker-ssh:
-    docker
-    Please enter the default Docker image (e.g. ruby:2.1):
-    ubuntu
-    Runner registered successfully. Feel free to start it, but if it's running already the config should be automatically reloaded!
-    #+end_example
-* To review locally a private pull request submitted by someone else
-   Get the patch from the pull request (Need to update that !!!!
-   Coming from bitbucket)
-   #+begin_src sh
-   curl https://bitbucket.org/api/2.0/repositories/icldistcomp/parsec/pullrequests/#PR/patch > pr#PR.patch
-   #+end_src
-   Then apply the patch on your local copy
-   #+begin_src sh
-   git apply pr#PR.patch
-   #+end_src
diff --git a/README.org b/README.org
index 7463d62c11591794b59d249dc0c689a965c9b5b3..2907869a6a40f351bfa073e37ddbf829b9b7240c 100644
--- a/README.org
+++ b/README.org
@@ -12,21 +12,6 @@
 Chameleon is a C library providing parallel algorithms to perform
 BLAS/LAPACK operations exploiting fully modern architectures.
-Chameleon dense linear algebra software relies on sequential
-task-based algorithms where sub-tasks of the overall algorithms are
-submitted to a Runtime system. Such a system is a layer between the
-application and the hardware which handles the scheduling and the
-effective execution of tasks on the processing units. A Runtime system
-such as [[http://starpu.gforge.inria.fr/][StarPU]] is able to manage automatically data transfers between
-not shared memory area (CPUs-GPUs, distributed nodes).
-This kind of implementation paradigm allows to design high performing
-linear algebra algorithms on very different type of architecture:
-laptop, many-core nodes, CPUs-GPUs, multiple nodes. For example,
-Chameleon is able to perform a Cholesky factorization
-(double-precision) at 80 TFlop/s on a dense matrix of order 400 000
-(i.e. 4 min). Chameleon is a sub-project of [[http://icl.cs.utk.edu/morse/][MORSE]] specifically
-dedicated to dense linear algebra.
 * Get Chameleon
@@ -35,49 +20,38 @@ dedicated to dense linear algebra.
   To get sources please use these commands:
   #+begin_src sh
-    # if git version >= 1.9
-    git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git
-    cd chameleon
-    # else
-    git clone git@gitlab.inria.fr:solverstack/chameleon.git
-    cd chameleon
-    git submodule init
-    git submodule update
+  # if git version >= 1.9
+  git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git
+  cd chameleon
+  # else
+  git clone git@gitlab.inria.fr:solverstack/chameleon.git
+  cd chameleon
+  git submodule init
+  git submodule update
-  Last releases of Chameleon are hosted on the [[https://gforge.inria.fr/frs/?group_id=2884][gforge.inria.fr]] for
-  now. Future releases will be available on this gitlab project.
-* Documentation
-** User guide
-   Please refer to the [[https://solverstack.gitlabpages.inria.fr/chameleon/users_guide.html][User guide]] to learn how to install and use
-   Chameleon.
-   The user guide is also available directly in the sources as emacs
-   orgmode files, see :
-   1) [[file:doc/orgmode/chapters/introduction.org][Introduction]] : description of the scientific context
-   2) [[file:doc/orgmode/chapters/installing.org][Installing]] :
-      * Getting Chameleon
-      * Prerequisites for installing Chameleon
-      * Distribution of Chameleon using Spack
-      * Build and install Chameleon with CMake
-   3) [[file:doc/orgmode/chapters/using.org][Using]] :
-      * Linking an external application with Chameleon libraries
-      * Using Chameleon executables
-      * Chameleon API
-   This documentation could also be generated in html and/or pdf :
+  The latest releases of Chameleon are available on the [[https://gitlab.inria.fr/solverstack/chameleon/-/releases][releases page]].
+* Documentation
+** Homepage and user's guide
+   Please visit our [[https://solverstack.gitlabpages.inria.fr/chameleon/][Homepage]] to get:
+   * download links,
+   * quick start guides,
+   * installation instructions,
+   * tutorials,
+   * benchmarks.
+   This documentation could also be generated in html:
    # build the doc with cmake (emacs with orgmode and latex are required), e.g.
    make doc
-   see the ~doc/orgmode~ directory.
+   see the ~doc/user~ directory.
-** Source code documentation (doxygen)
+** Source code documentation, API (doxygen)
-   Please refer to the [[https://solverstack.gitlabpages.inria.fr/chameleon/doxygen/index.html][doxygen documentation]] to get more precise
+   Please refer to the [[https://solverstack.gitlabpages.inria.fr/chameleon/dev/index.html][doxygen documentation]] to get more precise
    information about the API, the public and internal functions
    prototypes and the data structures.
@@ -87,71 +61,18 @@ dedicated to dense linear algebra.
 ** For developers
    Please refer to the [[file:READMEDEV.org][READMEDEV]] page.
-* Get involved!
-** Mailing list
+* Contact
+  If you have an account on [[https://gitlab.inria.fr/][gitlab inria]] please submit a [[https://gitlab.inria.fr/solverstack/chameleon/-/issues][new issue]].
-   To contact the developers send an email to
-   [[mailto:morse-devel@lists.gforge.inria.fr][morse-devel@lists.gforge.inria.fr]]
+  If you don't have an account on [[https://gitlab.inria.fr][gitlab inria]] you can send emails to
+  [[mailto:chameleon-issues@inria.fr][chameleon-issues@inria.fr]].
-** Contributions
+  To get the news, register to the mailing list
+  [[https://sympa.inria.fr/sympa/info/chameleon-announce][chameleon-announce@inria.fr]] (click on "S'abonner" in the left
+  panel).
+* Get involved!
-* Authors
- First, since the Chameleon library started as an extension of the
- PLASMA library to support multiple runtime systems, all developpers
- of the PLASMA library are developpers of the Chameleon library.
- The following people contributed to the development of Chameleon:
- * Emmanuel Agullo, PI
- * Olivier Aumage
- * Cedric Castagnede
- * Terry Cojean
- * Mathieu Faverge, PI
- * Nathalie Furmento
- * Reazul Hoque
- * Hatem Ltaief
- * Gregoire Pichon
- * Florent Pruvost, PI
- * Marc Sergent
- * Guillaume Sylvand
- * Samuel Thibault
- * Stanimire Tomov
- * Omar Zenati
- If we forgot your name, please let us know that we can fix that mistake.
-* Citing Chameleon
-Feel free to use the following publications to reference Chameleon:
-  * Original paper that initiated Chameleon and the principles:
-    - Agullo, Emmanuel and Augonnet, Cédric and Dongarra, Jack and
-      Ltaief, Hatem and Namyst, Raymond and Thibault, Samuel and Tomov,
-      Stanimire, *Faster, Cheaper, Better -- a Hybridization Methodology
-      to Develop Linear Algebra Software for GPUs*, /GPU Computing Gems/,
-      [[https://hal.inria.fr/inria-00547847][First Online: 17 December 2010]].
-  * Design of the QR algorithms:
-    - Agullo, Emmanuel and Augonnet, Cédric and Dongarra, Jack and
-      Faverge, Mathieu and Ltaief, Hatem and Thibault, Samuel an
-      Tomov, Stanimire, *QR Factorization on a Multicore Node Enhanced
-      with Multiple GPU Accelerators*, /25th IEEE International Parallel
-      & Distributed Processing Symposium/, [[https://hal.inria.fr/inria-00547614][First Online: 16 December
-      2010]].
-  * Design of the LU algorithms:
-    - Agullo, Emmanuel and Augonnet, Cédric and Dongarra, Jack and
-      Faverge, Mathieu and Langou, Julien and Ltaief, Hatem and Tomov,
-      Stanimire, *LU Factorization for Accelerator-based Systems*,
-      /9th ACS/IEEE International Conference on Computer Systems and
-      Applications (AICCSA 11)/, [[https://hal.inria.fr/hal-00654193][First Online: 21 December 2011]].
-  * Regarding distributed memory:
-    - Agullo, Emmanuel and Aumage, Olivier and Faverge, Mathieu and
-      Furmento, Nathalie and Pruvost, Florent and Sergent, Marc and
-      Thibault, Samuel, *Achieving High Performance on Supercomputers
-      with a Sequential Task-based Programming Model*, /Research Report/,
-      [[https://hal.inria.fr/hal-01332774][First Online: 16 June 2016]].
 * Licence
diff --git a/compute/zcesca.c b/compute/zcesca.c
index 35f2839a72d3c1d3a97c555e3747cf2435f5bec2..822858dad6683f5748cb7073b6bfcbd4c2286491 100644
--- a/compute/zcesca.c
+++ b/compute/zcesca.c
@@ -142,8 +142,8 @@ void CHAMELEON_zcesca_WS_Free( void *user_ws )
  *  A bicentered gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that
  *  \f[ \bar{a}_{i,j} = a_{i,j} - g_i - g_j + g \f]
  * Lets
- * \f[d_i = || a_{i*} || = \sqrt{ \sum_j a_{ij}²} \\
- *    d_j = || a_{*j} || = \sqrt{ \sum_i a_{ij}²} \f]
+ * \f[d_i = || a_{i*} || = \sqrt{ \sum_j a_{ij}^2} \\
+ *    d_j = || a_{*j} || = \sqrt{ \sum_i a_{ij}^2} \f]
  * A scaled rowwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that
  * \f[ a_{i*}' = \frac{a_{i*}}{d_i} \f]
  * A scaled columnwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that
diff --git a/coreblas/compute/core_zcesca.c b/coreblas/compute/core_zcesca.c
index 8bdc87dac306a36aecd8ba527832af7516d240c2..989e48b82ad718b41d0975628368df85ba39c232 100644
--- a/coreblas/compute/core_zcesca.c
+++ b/coreblas/compute/core_zcesca.c
@@ -36,8 +36,8 @@
  *  A bicentered gives \f[\bar{A} = (\bar{a}_{i,j})_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that
  *  \f[ \bar{a}_{i,j} = a_{i,j} - g_i - g_j + g \f]
  * Lets
- * \f[d_i = || a_{i*} || = \sqrt{ \sum_j a_{ij}²} \\
- *    d_j = || a_{*j} || = \sqrt{ \sum_i a_{ij}²} \f]
+ * \f[d_i = || a_{i*} || = \sqrt{ \sum_j a_{ij}^2} \\
+ *    d_j = || a_{*j} || = \sqrt{ \sum_i a_{ij}^2} \f]
  * A scaled rowwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that
  * \f[ a_{i*}' = \frac{a_{i*}}{d_i} \f]
  * A scaled columnwise gives \f[A' = (a_{i,j}')_{1 \leq i \leq m, 1 \leq j \leq n}\f] such that
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 98e18955dc5ea0db2eb00b092cf0e2374dec85eb..d7077e2a524f2abce42eed968c1dfc76223aef81 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -33,8 +33,8 @@ cmake_minimum_required(VERSION 3.1)
 #      Compilation of documentation         #
 #                                           #
 ### END CMakeLists.txt
diff --git a/doc/doxygen/CMakeLists.txt b/doc/dev/CMakeLists.txt
similarity index 87%
rename from doc/doxygen/CMakeLists.txt
rename to doc/dev/CMakeLists.txt
index 3663c7a62a4ee501da415e945fa2e6ae364e5fab..03e930ef1e699317eb9e236f30b0d508bf194f7a 100644
--- a/doc/doxygen/CMakeLists.txt
+++ b/doc/dev/CMakeLists.txt
@@ -48,21 +48,21 @@ if(DOXYGEN_EXECUTABLE)
     # Doxygen documentation
     # ---------------------
-    add_custom_command(OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/out/index.html
+    add_custom_command(OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/index.html
                        COMMAND ${DOXYGEN_EXECUTABLE}
                        ARGS    ${CMAKE_CURRENT_BINARY_DIR}/chameleon.dox
                        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/chameleon.dox
     add_custom_target(doxygen-out ALL
-                      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/out/index.html
+                      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/index.html
     # Installation for html version
     # -----------------------------
-    install(DIRECTORY   ${CMAKE_CURRENT_BINARY_DIR}/out/html
-            DESTINATION share/chameleon/doxygen/dev)
+            DESTINATION share/chameleon/doc/dev)
     message(STATUS "Looking for doxygen - not found")
diff --git a/doc/doxygen/README.org b/doc/dev/README.org
similarity index 100%
rename from doc/doxygen/README.org
rename to doc/dev/README.org
diff --git a/doc/dev/conf.dox.in b/doc/dev/conf.dox.in
new file mode 100644
index 0000000000000000000000000000000000000000..e3ce59be0e5afd58ae1d311055437a120620d6fd
--- /dev/null
+++ b/doc/dev/conf.dox.in
-     scheduled. Expected advances: First, we plan to investigate the
-     impact of these principles in the case of sparse applications
-     (whose algorithms are slightly more complicated but often rely on
-     dense kernels).  Furthermore, both in the dense and sparse cases,
-     the scalability on thousands of nodes is still limited; new
-     numerical approaches need to be found.  We will specifically
-     design sparse hybrid direct/iterative methods that represent a
-     promising approach.
-*** Research papers
-    Research papers about MORSE can be found [[http://icl.cs.utk.edu/projectsdev/morse/pubs/index.html][here]].
-** Chameleon
-*** Chameleon software
-    The main purpose is to address the performance shortcomings of the
-    [[http://www.netlib.org/lapack/][LAPACK]] and [[http://www.netlib.org/scalapack/][ScaLAPACK]] libraries on multicore processors and
-    multi-socket systems of multicore processors and their inability
-    to efficiently utilize accelerators such as Graphics Processing
-    Units (GPUs).
-    Chameleon is a framework written in C which provides routines to
-    solve dense general systems of linear equations, symmetric
-    positive definite systems of linear equations and linear least
-    squares problems, using LU, Cholesky, QR and LQ factorizations.
-    Real arithmetic and complex arithmetic are supported in both
-    single precision and double precision.  It supports Linux and Mac
-    OS/X machines (mainly tested on Intel x86-64 and IBM Power
-    architectures).
-    Chameleon is based on the [[http://icl.cs.utk.edu/plasma/][PLASMA]] source code but is not limited to
-    shared-memory environment and can exploit multiple GPUs.
-    Chameleon is interfaced in a generic way with [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]], [[http://icl.utk.edu/parsec/][PaRSEC]],
-    [[http://icl.cs.utk.edu/quark/][QUARK]] runtime systems.  This feature allows to analyze in a
-    unified framework how sequential task-based algorithms behave
-    regarding different runtime systems implementations.  Using
-    Chameleon with *StarPU* or *PaRSEC* runtime systems allows to exploit
-    GPUs through kernels provided by [[https://developer.nvidia.com/cublas][cuBLAS]] and clusters of
-    interconnected nodes with distributed memory (using [[http://www.open-mpi.org/][MPI]]).
-    Computation of very large systems with dense matrices on a cluster
-    of nodes is still being experimented and stabilized.  It is not
-    expected to get stable performances with the current version using
-    MPI.
-*** PLASMA's design principles
-    Chameleon is originally based on [[http://icl.cs.utk.edu/plasma/][PLASMA]] so that design principles
-    are very similar.  The content of this section PLASMA's design
-    principles has been copied from the /Design principles/ section of
-    the PLASMA User's Guide.
-**** Tile Algorithms
-     Tile algorithms are based on the idea of processing the matrix by
-     square tiles of relatively small size, such that a tile fits
-     entirely in one of the cache levels associated with one core.
-     This way a tile can be loaded to the cache and processed
-     completely before being evicted back to the main memory.  Of the
-     three types of cache misses, *compulsory*, *capacity* and *conflict*,
-     the use of tile algorithms minimizes the number of capacity
-     misses, since each operation loads the amount of data that does
-     not ``overflow'' the cache.
-     For some operations such as matrix multiplication and Cholesky
-     factorization, translating the classic algorithm to the tile
-     algorithm is trivial.  In the case of matrix multiplication, the
-     tile algorithm is simply a product of applying the technique of
-     *loop tiling* to the canonical definition of three nested loops.
-     It is very similar for the Cholesky factorization.  The
-     *left-looking* definition of Cholesky factorization from LAPACK is
-     a loop with a sequence of calls to four routines: xSYRK
-     (symmetric *rank-k* update), xPOTRF (Cholesky factorization of a
-     small block on the diagonal), xGEMM (matrix multiplication) and
-     xTRSM (triangular solve).  If the xSYRK, xGEMM and xTRSM
-     operations are expressed with the canonical definition of three
-     nested loops and the technique of loop tiling is applied, the
-     tile algorithm results.  Since the algorithm is produced by
-     simple reordering of operations, neither the number of operations
-     nor numerical stability of the algorithm are affected.
-     The situation becomes slightly more complicated for LU and QR
-     factorizations, where the classic algorithms factorize an entire
-     panel of the matrix (a block of columns) at every step of the
-     algorithm.  One can observe, however, that the process of matrix
-     factorization is synonymous with introducing zeros in appropriate
-     places and a tile algorithm can be thought of as one that zeroes
-     one tile of the matrix at a time.  This process is referred to as
-     updating of a factorization or *incremental factorization*.  The
-     process is equivalent to factorizing the top tile of a panel,
-     then placing the upper triangle of the result on top of the tile
-     below and factorizing again, then moving to the next tile and so
-     on.  Here, the tile LU and QR algorithms perform slightly more
-     floating point operations and require slightly more memory for
-     auxiliary data.  Also, the tile LU factorization applies a
-     different pivoting pattern and, as a result, is less numerically
-     stable than classic LU with full pivoting.  Numerical stability
-     is not an issue in case of the tile QR, which relies on
-     orthogonal transformations (Householder reflections), which are
-     numerically stable.
Schematic illustration of the tile LU factorization (kernel names for real arithmetics in double precision), courtesy of the PLASMA team.
fig:tile_lu
-     #+ATTR_HTML: :width 640px :align center
-     [[file:tile_lu.jpg]]
-**** Tile Data Layout
-     <<sec:tile>>
-     Tile layout is based on the idea of storing the matrix by square
-     tiles of relatively small size, such that each tile occupies a
-     continuous memory region.  This way a tile can be loaded to the
-     cache memory efficiently and the risk of evicting it from the
-     cache memory before it is completely processed is minimized.  Of
-     the three types of cache misses, *compulsory*, *capacity* and
-     *conflict*, the use of tile layout minimizes the number of conflict
-     misses, since a continuous region of memory will completely fill
-     out a /set-associative/ cache memory before an eviction can happen.
-     Also, from the standpoint of multithreaded execution, the
-     probability of *false sharing* is minimized.  It can only affect
-     the cache lines containing the beginning and the ending of a
-     tile.
-     In standard *cache-based* architecture, tiles continuously laid out
-     in memory maximize the profit from automatic prefetching.  Tile
-     layout is also beneficial in situations involving the use of
-     accelerators, where explicit communication of tiles through DMA
-     transfers is required, such as moving tiles between the system
-     memory and the local store in Cell B. E. or moving tiles between
-     the host memory and the device memory in GPUs.  In most
-     circumstances tile layout also minimizes the number of TLB misses
-     and conflicts to memory banks or partitions.  With the standard
-     (*column-major*) layout, access to each column of a tile is much
-     more likely to cause a conflict miss, a false sharing miss, a TLB
-     miss or a bank or partition conflict.  The use of the standard
-     layout for dense matrix operations is a performance minefield.
-     Although occasionally one can pass through it unscathed, the risk
-     of hitting a spot deadly to performance is very high.
-     Another property of the layout utilized in PLASMA is that it is
-     ``flat'', meaning that it does not involve a level of
-     indirection. Each tile stores a small square submatrix of the
-     main matrix in a *column-major* layout. In turn, the main matrix is
-     an arrangement of tiles immediately following one another in a
-     *column-major* layout.  The offset of each tile can be calculated
-     through address arithmetics and does not involve pointer
-     indirection.  Alternatively, a matrix could be represented as an
-     array of pointers to tiles, located anywhere in memory. Such
-     layout would be a radical and unjustifiable departure from LAPACK
-     and ScaLAPACK.  Flat tile layout is a natural progression from
-     LAPACK's *column-major* layout and ScaLAPACK's /block-cyclic/ layout.
-     Another related property of PLASMA's tile layout is that it
-     includes provisions for padding of tiles, i.e., the actual region
-     of memory designated for a tile can be larger than the memory
-     occupied by the actual data.  This allows to force a certain
-     alignment of tile boundaries, while using the flat organization
-     described in the previous paragraph.  The motivation is that, at
-     the price of small memory overhead, alignment of tile boundaries
-     may prove beneficial in multiple scenarios involving memory
-     systems of standard multicore processors, as well as
-     accelerators.  The issues that come into play are, again, the use
-     of TLBs and memory banks or partitions.
Schematic illustration of the tile layout with *column-major* order of tiles, *column-major* order of elements within tiles and (optional) padding for enforcing a certain alignment of tile boundaries, courtesy of the PLASMA team.
fig:tile_layout
-     #+ATTR_HTML: :width 640px :align center
-     [[file:tile_layout.jpg]]
-**** Dynamic Task Scheduling
-     Dynamic scheduling is the idea of assigning work to cores based
-     on the availability of data for processing at any given point in
-     time and is also referred to as *data-driven* scheduling.  The
-     concept is related closely to the idea of expressing computation
-     through a task graph, often referred to as the DAG (*Directed
-     Acyclic Graph*), and the flexibility of exploring the DAG at runtime.
-     Thus, to a large extent, dynamic scheduling is synonymous with
-     *runtime scheduling*.  An important concept here is the one of
-     the *critical path*, which defines the upper bound on the
-     achievable parallelism, and needs to be pursued at the maximum
-     speed.  This is in direct opposition to the *fork-and-join* or
-     *data-parallel* programming models, where artificial
-     synchronization points expose serial sections of the code, where
-     multiple cores are idle, while sequential processing takes place.
-     The use of dynamic scheduling introduces a *trade-off*, though.
-     The more dynamic (flexible) scheduling is, the more centralized
-     (and less scalable) the scheduling mechanism is.  For that
-     reason, currently PLASMA uses two scheduling mechanisms, one
-     which is fully dynamic and one where work is assigned statically
-     and dependency checks are done at runtime.
-     The first scheduling mechanism relies on unfolding a *sliding
-     window* of the task graph at runtime and scheduling work by
-     resolving data hazards: *Read After Write (RAW)*, *Write After Read
-     (WAR)* and *Write After Write (WAW)*, a technique analogous to
-     instruction scheduling in superscalar processors.  It also relies
-     on *work-stealing* for balancing the load among all cores.
-     The second scheduling mechanism relies on statically designating
-     a path through the execution space of the algorithm to each core
-     and following a cycle: transition to a task, wait for its
-     dependencies, execute it, update the overall progress.  Tasks are
-     identified by tuples and task transitions are done through
-     locally evaluated formulas.  Progress information can be
-     centralized, replicated or distributed (currently centralized).
A trace of the tile QR factorization executing on eight cores without any global synchronization points (kernel names for real arithmetics in single precision), courtesy of the PLASMA team.
fig:trace_qr
-     #+ATTR_HTML: :width 640px :align center
-     [[file:trace_qr.jpg]]
@@ -1,65 +0,0 @@
-# This file is part of the Chameleon User's Guide.
-# Copyright (C) 2020 Inria
-# See the file ../users_guide.org for copying conditions.
-Show performances on [[https://www.plafrim.fr/][PlaFRIM]] supercomputer.
-See [[https://www.plafrim.fr/hardware-documentation/][characteristics]] to get details about the hardware.
-See script ~tools/bench/plafrim/run.sh~ to get details about the environment (Guix, Slurm,
-etc) and the build.
-Chameleon is run this way:
-#+begin_src sh
-mpiexec -np $nmpi $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b
-- runtime : *starpu*
-- precision : *s* or *d* for single or double precision
-- algorithm : *gemm* or *potrf* or *geqrf_hqr*
-- nmpi = p x p
-- nthr : depends on the node
-- ngpu : depends on the node
-- m = n = k
-- b : depends on the node
-** bora (36 CPUs) nodes
-   - nmpi = *1*, *4*, *9*
-   - 2D block cyclic parameters : PxQ = 1x1, 2x2 and 3x3
-   - Number of threads (t) = *34*, one CPU being dedicated for the
-     scheduler and one other for MPI communications
-   - Number of GPUs = *0*
-   - Tile Size (b) = *280*
-*** CPU times
-    #+CAPTION: Performances in CPU time of GEMM, POTRF and QR on bora nodes
-    #+NAME: fig:chameleon_plafrim_bora_time_openmpi
-    #+ATTR_HTML: :align center :width 75%
-    [[file:chameleon_plafrim_bora_time_openmpi.png]]
-*** GFLOP/s
-    #+CAPTION: Performances in GFlop/s of GEMM, POTRF and QR on bora nodes
-    #+NAME: fig:chameleon_plafrim_bora_perf_openmpi
-    #+ATTR_HTML: :align center :width 75%
-    [[file:chameleon_plafrim_bora_perf_openmpi.png]]
-** sirocco [14-17] (32 CPUs + 2 GPUs V100) nodes
-   - nmpi = *1*
-   - 2D block cyclic parameters : PxQ = 1x1
-   - Number of threads (t) = *29*, one CPU being dedicated for the
-     scheduler and two others for the 2 GPUs
-   - Number of GPUs = *2*
-   - Tile Size (b) = *1600*
-    #+CAPTION: Performances in CPU time of GEMM, POTRF and QR on sirocco nodes
-    #+NAME: fig:chameleon_plafrim_sirocco
-    #+ATTR_HTML: :align center :width 75%
-    [[file:chameleon_plafrim_sirocco.png]]
-# *** CPU times
-#     #+CAPTION: Performances in CPU time of GEMM, POTRF and QR on sirocco nodes
-#     #+NAME: fig:chameleon_plafrim_sirocco_openmpi_time
-#     #+ATTR_HTML: :align center :height 1024
-#     [[file:chameleon_plafrim_sirocco_openmpi_time.png]]
-# *** GFLOP/s
-#     #+CAPTION: Performances in GFlop/s of GEMM, POTRF and QR on sirocco nodes
-#     #+NAME: fig:chameleon_plafrim_sirocco_openmpi_perf
-#     #+ATTR_HTML: :align center :height 1024
-#     [[file:chameleon_plafrim_sirocco_openmpi_perf.png]]
diff --git a/doc/orgmode/users_guide.org.in b/doc/orgmode/users_guide.org.in
deleted file mode 100644
index 226ca624c719ca1b6fcc10c15ccd23301176163d..0000000000000000000000000000000000000000
--- a/doc/orgmode/users_guide.org.in
+++ /dev/null
@@ -1,67 +0,0 @@
-#+TITLE: CHAMELEON User's Guide
-#+SUBTITLE: A dense linear algebra software for heterogeneous architectures
-#+LANGUAGE:  en
-#+OPTIONS: H:3 num:t \n:nil @:t ::t |:t _:nil ^:nil -:t f:t *:t <:t
-#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil pri:nil tags:not-in-toc html-style:nil
-#+BEAMER_THEME: Rochester
-#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="org-html-themes/styles/readtheorg/css/htmlize.css"/>
-#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="org-html-themes/styles/readtheorg/css/readtheorg.css"/>
-#+HTML_HEAD: <script type="text/javascript" src="org-html-themes/styles/readtheorg/js/jquery.min.js"></script>
-#+HTML_HEAD: <script type="text/javascript" src="org-html-themes/styles/readtheorg/js/bootstrap.min.js"></script>
-#+HTML_HEAD: <script type="text/javascript" src="org-html-themes/styles/readtheorg/js/jquery.stickytableheaders.min.js"></script>
-#+HTML_HEAD: <script type="text/javascript" src="org-html-themes/styles/readtheorg/js/readtheorg.js"></script>
-#+INCLUDE: "./version.org"
-This is the users guide to Chameleon.  The software ecosystem will be
-presented, the installation instructions detailed and some usage
-examples are presented.  To get more information about the application
-programming interface, please refer to the [[https://solverstack.gitlabpages.inria.fr/chameleon/doxygen/index.html][doxygen documentation]].
-* Version
-  This manual documents the usage of Chameleon *version {{{VERSION}}}*.
-  It was last updated on {{{UPDATED}}}.
-* Authors
-  * Inria
-  * University of Tennessee
-  * University of Colorado Denver
-  * King Abdullah University of Science and Technology
-* Copying
-  * Copyright \copy {{{UPDATED-YEAR}}} Inria
-  * Copyright \copy 2014 The University of Tennessee
-  * Copyright \copy 2014 King Abdullah University of Science and Technology
-  Redistribution and use in source and binary forms, with or without
-  modification, are permitted provided that the following conditions are
-  met:
-  - Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-  - Redistributions in binary form must reproduce the above copyright
-    notice, this list of conditions and the following disclaimer listed
-    in this license in the documentation and/or other materials provided
-    with the distribution.
-  - Neither the name of the copyright holders nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-  This software is provided by the copyright holders and contributors
-  "as is" and any express or implied warranties, including, but not
-  limited to, the implied warranties of merchantability and fitness for
-  a particular purpose are disclaimed.  In no event shall the copyright
-  owner or contributors be liable for any direct, indirect, incidental,
-  special, exemplary, or consequential damages (including, but not
-  limited to, procurement of substitute goods or services; loss of use,
-  data, or profits; or business interruption) however caused and on any
-  theory of liability, whether in contract, strict liability, or tort
-  (including negligence or otherwise) arising in any way out of the use
-  of this software, even if advised of the possibility of such damage.
-* Introduction to Chameleon
-#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/introduction.org
-* Installing Chameleon
-#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/installing.org
-* Using Chameleon
-#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org
@@ -0,0 +1,105 @@
+# @file CMakeLists.txt
+# @copyright (c) 2017 Inria. All rights reserved.
+#  @project CHAMELEON
+#  CHAMELEON is a software package provided by:
+#     Inria Bordeaux - Sud-Ouest,
+#     Univ. of Tennessee,
+#     King Abdullah University of Science and Technology
+#     Univ. of California Berkeley,
+#     Univ. of Colorado Denver.
+# @version 1.1.0
+#  @author Florent Pruvost
+#  @author Mathieu Faverge
+#  @date 2021-01-04
+cmake_minimum_required(VERSION 3.1)
+# Create files in binary dir
+# --------------------------
+               ${CMAKE_CURRENT_BINARY_DIR}/homepage.org
+               COPYONLY)
+               ${CMAKE_CURRENT_BINARY_DIR}/news.org
+               COPYONLY)
+               ${CMAKE_CURRENT_BINARY_DIR}/publish.el
+               COPYONLY)
+               ${CMAKE_CURRENT_BINARY_DIR}/version.org
+               @ONLY)
+               ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+               @ONLY)
+               COPYONLY)
+    chameleon.svg
+    )
+foreach(_fig ${FIGURES_HOMEPAGE})
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${_fig}
+                   ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+                   COPYONLY)
+    tile_lu.jpg
+    tile_layout.jpg
+    trace_qr.jpg
+    potri_async.png
+    chameleon_header.png
+    )
+foreach(_fig ${FIGURES_USERGUIDE})
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/chapters/${_fig}
+                   ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+                   COPYONLY)
+# Looking for emacs
+# -----------------
+    # Add target for homepage
+    # -----------------------
+    # Build rule for index.html: runs Emacs in batch mode, loads publish.el
+    # from the binary directory and calls org-publish-all. publish.el is
+    # presumably what writes index.html next to it - confirm against its
+    # publishing-directory setting.
+    #
+    # OUTPUT, --load and WORKING_DIRECTORY are spelled with absolute
+    # binary-dir paths so the rule does not depend on the implicit working
+    # directory and matches the path used by install() below. VERBATIM keeps
+    # argument escaping identical across generators/platforms. The legacy
+    # ARGS keyword (ignored by CMake) is dropped.
+    add_custom_command(OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/index.html
+                       COMMAND ${EMACS_COMPILER}
+                               --batch --no-init-file
+                               --load ${CMAKE_CURRENT_BINARY_DIR}/publish.el
+                               --funcall org-publish-all
+                       DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/homepage.org
+                               ${CMAKE_CURRENT_BINARY_DIR}/news.org
+                               ${CMAKE_CURRENT_BINARY_DIR}/version.org
+                               ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                               ${CMAKE_CURRENT_BINARY_DIR}/CONTRIBUTING.org
+                               ${CMAKE_CURRENT_BINARY_DIR}/publish.el
+                               ${CMAKE_CURRENT_BINARY_DIR}/chameleon.svg
+                       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+                       VERBATIM)
+    # Always-built target that pulls in the index.html rule above.
+    add_custom_target(doc-homepage ALL
+                      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/index.html)
+    # Installation
+    # ------------
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/index.html
+            DESTINATION share/chameleon/doc/)
+        install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+                DESTINATION share/chameleon/doc)
+    endforeach()
+    message(STATUS "Looking for emacs - not found")
+### END CMakeLists.txt
+digraph G{
+# Pre-Depends:             purple, bold
+# Depends:                 black
+# Recommends:              grey
+# Suggests:                black, dotted
+# Conflicts:               red
+node [shape=box];
+chameleon [style="filled,bold",fillcolor=lightgrey,color=blue,label=<<B>Chameleon</B>>]
+subgraph cluster_runtime {
+        label = "Runtime Systems"
+        style = rounded
+        color = black
+        sched [
+        shape = "record"
+        label = "<sched_openmp> OpenMP | <sched_parsec> PaRSEC | <sched_quark> QUARK | <sched_starpu> StarPU"
+        color=green
+        ]
+        }
+chameleon -> sched [lhead=cluster_runtime]
+sched:sched_starpu -> cuda [color=dimgrey]
+sched:sched_starpu -> mpi [color=dimgrey]
+sched:sched_parsec -> cuda [color=dimgrey]
+sched:sched_parsec -> mpi [color=dimgrey]
+subgraph cluster_paradigm {
+        label = "Paradigms"
+        style = rounded
+        color = black
+        cuda [color=gold, style=bold,label=<<B>CUDA</B>>]
+        mpi [color=gold, style=bold,label=<<B>MPI</B>>]
+        }
+subgraph cluster_kernel {
+        label = "Kernels"
+        style = rounded
+        color = black
+        cublas [color=red, style=bold,label=<<B>cuBLAS</B>>]
+        lapacke [color=red, style=bold,label=<<B>LAPACKE</B>>]
+        cblas [color=red, style=bold,label=<<B>CBLAS</B>>]
+        lapacke -> cblas
+        cublas -> cuda
+        }
+chameleon -> cblas
+chameleon -> lapacke
+chameleon -> mpi [color=dimgrey,lhead=cluster_paradigm]
+chameleon -> cublas
\ No newline at end of file
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="579pt" height="219pt"
+ viewBox="0.00 0.00 579.35 219.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 215)">
+<g id="clust1" class="cluster">
+<path fill="transparent" stroke="black" d="M20,-91C20,-91 302,-91 302,-91 308,-91 314,-97 314,-103 314,-103 314,-155 314,-155 314,-161 308,-167 302,-167 302,-167 20,-167 20,-167 14,-167 8,-161 8,-155 8,-155 8,-103 8,-103 8,-97 14,-91 20,-91"/>
+<text text-anchor="middle" x="161" y="-151.8" font-family="Times,serif" font-size="14.00">Runtime Systems</text>
+<g id="clust2" class="cluster">
+<path fill="transparent" stroke="black" d="M191,-8C191,-8 317,-8 317,-8 323,-8 329,-14 329,-20 329,-20 329,-71 329,-71 329,-77 323,-83 317,-83 317,-83 191,-83 191,-83 185,-83 179,-77 179,-71 179,-71 179,-20 179,-20 179,-14 185,-8 191,-8"/>
+<text text-anchor="middle" x="254" y="-67.8" font-family="Times,serif" font-size="14.00">Paradigms</text>
+<g id="clust3" class="cluster">
+<path fill="transparent" stroke="black" d="M366,-8C366,-8 545,-8 545,-8 551,-8 557,-14 557,-20 557,-20 557,-154.5 557,-154.5 557,-160.5 551,-166.5 545,-166.5 545,-166.5 366,-166.5 366,-166.5 360,-166.5 354,-160.5 354,-154.5 354,-154.5 354,-20 354,-20 354,-14 360,-8 366,-8"/>
+<text text-anchor="middle" x="455.5" y="-151.3" font-family="Times,serif" font-size="14.00">Kernels</text>
+<!-- chameleon -->
+<g id="node1" class="node">
+<polygon fill="lightgrey" stroke="blue" stroke-width="2" points="453.5,-211 348.5,-211 348.5,-175 453.5,-175 453.5,-211"/>
+<text text-anchor="start" x="356.5" y="-190.3" font-family="Times,serif" font-weight="bold" font-size="14.00">Chameleon</text>
+<!-- sched -->
+<g id="node2" class="node">
+<polygon fill="none" stroke="green" points="15.5,-99.5 15.5,-135.5 306.5,-135.5 306.5,-99.5 15.5,-99.5"/>
+<text text-anchor="middle" x="54.5" y="-113.8" font-family="Times,serif" font-size="14.00">OpenMP</text>
+<polyline fill="none" stroke="green" points="93.5,-99.5 93.5,-135.5 "/>
+<text text-anchor="middle" x="131" y="-113.8" font-family="Times,serif" font-size="14.00">PaRSEC</text>
+<polyline fill="none" stroke="green" points="168.5,-99.5 168.5,-135.5 "/>
+<text text-anchor="middle" x="203.5" y="-113.8" font-family="Times,serif" font-size="14.00">QUARK</text>
+<polyline fill="none" stroke="green" points="238.5,-99.5 238.5,-135.5 "/>
+<text text-anchor="middle" x="272.5" y="-113.8" font-family="Times,serif" font-size="14.00">StarPU</text>
+<!-- chameleon&#45;&gt;sched -->
+<g id="edge1" class="edge">
+<path fill="none" stroke="black" d="M348.33,-175.87C340.64,-173.51 332.39,-170.99 323.81,-168.36"/>
+<polygon fill="black" stroke="black" points="324.58,-164.94 314,-165.36 322.53,-171.63 324.58,-164.94"/>
+<!-- mpi -->
+<g id="node4" class="node">
+<polygon fill="none" stroke="gold" stroke-width="2" points="241,-52 187,-52 187,-16 241,-16 241,-52"/>
+<text text-anchor="start" x="197" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">MPI</text>
+<!-- chameleon&#45;&gt;mpi -->
+<g id="edge10" class="edge">
+<path fill="none" stroke="dimgrey" d="M359.38,-174.94C355.97,-172.58 352.78,-169.94 350,-167 324.84,-140.35 347.29,-113.02 318,-91 296.72,-75 280.81,-92.5 259.5,-86.95"/>
+<polygon fill="dimgrey" stroke="dimgrey" points="260.58,-83.61 250,-83 257.89,-90.07 260.58,-83.61"/>
+<!-- cublas -->
+<g id="node5" class="node">
+<polygon fill="none" stroke="red" stroke-width="2" points="439.5,-135.5 362.5,-135.5 362.5,-99.5 439.5,-99.5 439.5,-135.5"/>
+<text text-anchor="start" x="370.5" y="-114.8" font-family="Times,serif" font-weight="bold" font-size="14.00">cuBLAS</text>
+<!-- chameleon&#45;&gt;cublas -->
+<g id="edge11" class="edge">
+<path fill="none" stroke="black" d="M401,-174.95C401,-166.3 401,-155.57 401,-145.79"/>
+<polygon fill="black" stroke="black" points="404.5,-145.71 401,-135.71 397.5,-145.71 404.5,-145.71"/>
+<!-- lapacke -->
+<g id="node6" class="node">
+<polygon fill="none" stroke="red" stroke-width="2" points="548.5,-135.5 457.5,-135.5 457.5,-99.5 548.5,-99.5 548.5,-135.5"/>
+<text text-anchor="start" x="465.5" y="-114.8" font-family="Times,serif" font-weight="bold" font-size="14.00">LAPACKE</text>
+<!-- chameleon&#45;&gt;lapacke -->
+<g id="edge9" class="edge">
+<path fill="none" stroke="black" d="M436.21,-174.98C440.62,-172.47 444.98,-169.78 449,-167 459.25,-159.89 469.65,-151.03 478.55,-142.83"/>
+<polygon fill="black" stroke="black" points="481.07,-145.27 485.94,-135.86 476.27,-140.18 481.07,-145.27"/>
+<!-- cblas -->
+<g id="node7" class="node">
+<polygon fill="none" stroke="red" stroke-width="2" points="543,-52 473,-52 473,-16 543,-16 543,-52"/>
+<text text-anchor="start" x="481" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">CBLAS</text>
+<!-- chameleon&#45;&gt;cblas -->
+<g id="edge8" class="edge">
+<path fill="none" stroke="black" d="M453.77,-190.72C494,-188.5 544.7,-182.59 558,-167 579.93,-141.31 570.04,-122.56 558,-91 553.46,-79.11 545.11,-68.14 536.52,-59.12"/>
+<polygon fill="black" stroke="black" points="538.9,-56.55 529.32,-52.03 533.98,-61.54 538.9,-56.55"/>
+<!-- cuda -->
+<g id="node3" class="node">
+<polygon fill="none" stroke="gold" stroke-width="2" points="321,-52 259,-52 259,-16 321,-16 321,-52"/>
+<text text-anchor="start" x="267" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">CUDA</text>
+<!-- sched&#45;&gt;cuda -->
+<g id="edge2" class="edge">
+<path fill="none" stroke="dimgrey" d="M273,-98.5C273,-86.18 276.02,-72.96 279.52,-61.79"/>
+<polygon fill="dimgrey" stroke="dimgrey" points="282.88,-62.78 282.79,-52.19 276.25,-60.52 282.88,-62.78"/>
+<!-- sched&#45;&gt;cuda -->
+<g id="edge4" class="edge">
+<path fill="none" stroke="dimgrey" d="M131,-98.5C131,-71.83 226.63,-95.85 250,-83 259.7,-77.67 268.01,-68.97 274.52,-60.42"/>
+<polygon fill="dimgrey" stroke="dimgrey" points="277.5,-62.26 280.39,-52.07 271.78,-58.23 277.5,-62.26"/>
+<!-- sched&#45;&gt;mpi -->
+<g id="edge3" class="edge">
+<path fill="none" stroke="dimgrey" d="M273,-98.5C273,-86.17 258.93,-91.49 250,-83 243.05,-76.4 236.45,-68.34 230.84,-60.73"/>
+<polygon fill="dimgrey" stroke="dimgrey" points="233.38,-58.26 224.75,-52.11 227.66,-62.3 233.38,-58.26"/>
+<!-- sched&#45;&gt;mpi -->
+<g id="edge5" class="edge">
+<path fill="none" stroke="dimgrey" d="M131,-98.5C131,-73.42 155.21,-56.91 177.47,-47.02"/>
+<polygon fill="dimgrey" stroke="dimgrey" points="178.9,-50.22 186.82,-43.19 176.24,-43.75 178.9,-50.22"/>
+<!-- cublas&#45;&gt;cuda -->
+<g id="edge7" class="edge">
+<path fill="none" stroke="black" d="M373.95,-99.4C366.12,-94.28 357.63,-88.54 350,-83 339.46,-75.35 328.23,-66.55 318.37,-58.59"/>
+<polygon fill="black" stroke="black" points="320.42,-55.75 310.45,-52.15 316,-61.18 320.42,-55.75"/>
+<!-- lapacke&#45;&gt;cblas -->
+<g id="edge6" class="edge">
+<path fill="none" stroke="black" d="M504.06,-99.22C504.72,-88.52 505.57,-74.55 506.32,-62.37"/>
+<polygon fill="black" stroke="black" points="509.82,-62.45 506.94,-52.26 502.84,-62.03 509.82,-62.45"/>
diff --git a/doc/orgmode/chapters/chameleon_header.png b/doc/user/chapters/chameleon_header.png
similarity index 100%
rename from doc/orgmode/chapters/chameleon_header.png
rename to doc/user/chapters/chameleon_header.png
@@ -0,0 +1,821 @@
+# This file is part of the Chameleon User's Guide.
+# Copyright (C) 2017 Inria
+# See the file ../users_guide.org for copying conditions.
+Chameleon is written in C and depends on a couple of external
+libraries that must be installed on the system.
+# , it provides an interface to be called from Fortran
+Chameleon can be built and installed on UNIX systems (Linux) by the
+standard means of [[http://www.cmake.org/][CMake]].  General information about CMake, as well as
+installation binaries and CMake source code are available from [[http://www.cmake.org/cmake/resources/software.html][here]].
+To get support to install a full distribution Chameleon + dependencies
+we encourage users to use [[sec:ug:guix][GNU Guix]] or [[sec:ug:spack][Spack]].
+Finally some packages are also available for [[sec:ug:debian][Debian/Ubuntu]] and [[sec:ug:brew][MacOSX]].
+*** Getting Chameleon
+    <<sec:ug:git>>
+    The latest official release tarballs of Chameleon sources are
+    available for download from the [[https://gitlab.inria.fr/solverstack/chameleon/tags][gitlab tags page]].
+    The latest development state is available on [[https://gitlab.inria.fr/solverstack/chameleon][gitlab]]. You need [[https://git-scm.com/downloads][Git]] to clone the repository:
+    #+begin_src
+    git clone --recursive https://gitlab.inria.fr/solverstack/chameleon.git
+    #+end_src
+*** Prerequisites for installing Chameleon
+    <<sec:ug:prereq>>
+    To install Chameleon's libraries, header files, and executables, one
+    needs:
+    - CMake (version 2.8 minimum): the build system
+    - C and Fortran compilers: GNU compiler suite, Clang, Intel or IBM
+      can be used
+    - python: to generate files in the different precisions
+    - external libraries: this depends on the configuration, by default
+      the required libraries are
+      - runtimes: [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] or [[http://icl.utk.edu/parsec/][PaRSEC]] or [[http://icl.cs.utk.edu/quark/][QUARK]] or [[https://www.openmp.org/][OpenMP]]
+      - kernels : CBLAS, LAPACKE (with TMG). These are C interfaces to
+        Fortran kernels BLAS and LAPACK. There exist several providers
+        that can be used with Chameleon (Intel MKL, Netlib, OpenBLAS,
+        BLIS/FLAME)
+    Optional libraries:
+    - cuda: [[https://developer.nvidia.com/cuda-downloads][cuda]], [[http://docs.nvidia.com/cuda/cublas/][cublas]] (comes with cuda)
+    - mpi: [[http://www.open-mpi.org/][openmpi]], [[https://www.mpich.org/][mpich]], [[https://software.intel.com/en-us/mpi-library][intelmpi]]
+    These packages must be installed on the system before trying to
+    configure/build chameleon.  Please look at the distrib/ directory
+    which gives some hints for the installation of dependencies for
+    Unix systems.
+    We give here some examples for a Debian system:
+    #+begin_src
+    # Update Debian packages list
+    sudo apt-get update
+    # Install BLAS/LAPACK, can be OpenBLAS, Intel MKL, Netlib LAPACK
+    sudo apt-get install -y libopenblas-dev liblapacke-dev
+    # or sudo apt-get install -y libmkl-dev
+    # or sudo apt-get install -y liblapack-dev liblapacke-dev
+    # Install OpenMPI
+    sudo apt-get install -y libopenmpi-dev
+    # Install StarPU
+    sudo apt-get install libstarpu-dev
+    # Optionally, to make some specific developments, the following may be installed
+    # Install hwloc (used by StarPU or QUARK, already a dependency of OpenMPI)
+    sudo apt-get install -y libhwloc-dev
+    # install EZTrace, useful to export some nice execution traces
+    # with all runtimes
+    sudo apt-get install -y libeztrace-dev
+    # install FxT, useful to export some nice execution traces with StarPU
+    sudo apt-get install -y libfxt-dev
+    # Install cuda and cuBLAS: only if you have a GPU cuda compatible
+    sudo apt-get install -y nvidia-cuda-toolkit nvidia-cuda-dev
+    # If you prefer a specific version of StarPU, install it yourself, e.g.
+    # Install StarPU (with MPI and FxT enabled)
+    mkdir -p $HOME/install
+    cd $HOME/install
+    wget https://files.inria.fr/starpu/starpu-1.3.7/starpu-1.3.7.tar.gz
+    tar xvzf starpu-1.3.7.tar.gz
+    cd starpu-1.3.7/
+    ./configure --prefix=/usr/local --with-fxt=/usr/lib/x86_64-linux-gnu/
+    make -j5
+    sudo make install
+    # Install PaRSEC: to be used in place of StarPU
+    mkdir -p $HOME/install
+    cd $HOME/install
+    git clone https://bitbucket.org/mfaverge/parsec.git
+    cd parsec
+    git checkout mymaster
+    git submodule update
+    mkdir -p build
+    cd build
+    cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
+    make -j5
+    sudo make install
+    # Install QUARK: to be used in place of StarPU
+    mkdir -p $HOME/install
+    cd $HOME/install
+    git clone https://github.com/ecrc/quark
+    cd quark/
+    sed -i -e "s#prefix=.*#prefix=/usr/local#g" make.inc
+    sed -i -e "s#CFLAGS=.*#CFLAGS= -O2 -DADD_ -fPIC#g" make.inc
+    make
+    sudo make install
+    #+end_src
+    See also our script example in the [[https://gitlab.inria.fr/solverstack/chameleon/-/blob/master/distrib/debian/install_dependencies.sh][distrib/debian]] sub-directory.
+**** Known issues
+     - we need the lapacke interface to tmg routines and symbols like
+       ~LAPACKE_dlatms_work~ should be defined in the lapacke
+       library. Make sure the Debian packages /libopenblas-dev/ and
+       /liblapacke-dev/ (no problem with Intel MKL) do provide the tmg
+       interface. If not you can possibly update your distribution or
+       install the lapacke interface library in another way, by
+       yourself from source or with [[https://gitlab.inria.fr/solverstack/spack-repo][Spack]], or with [[https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free][Guix-HPC]],...
+**** Some details about dependencies
+***** BLAS implementation
+      [[http://www.netlib.org/blas/][BLAS]] (Basic Linear Algebra Subprograms), are a de facto standard
+      for basic linear algebra operations such as vector and matrix
+      multiplication.  FORTRAN implementation of BLAS is available from
+      Netlib.  Also, C implementation of BLAS is included in GSL (GNU
+      Scientific Library).  Both of these are reference
+      implementations of BLAS: they are not optimized for modern processor
+      architectures and provide an order of magnitude lower performance
+      than optimized implementations.  Highly optimized implementations
+      of BLAS are available from many hardware vendors, such as Intel
+      MKL, IBM ESSL and AMD ACML.  Fast implementations are also
+      available as academic packages, such as ATLAS and OpenBLAS.  The
+      standard interface to BLAS is the FORTRAN interface.
+      *Caution about the compatibility:* Chameleon has been mainly tested
+      with the reference BLAS from NETLIB, OpenBLAS and Intel MKL.
+***** CBLAS
+      [[http://www.netlib.org/blas/#_cblas][CBLAS]] is a C language interface to BLAS.  Most commercial and
+      academic implementations of BLAS also provide CBLAS.  Netlib
+      provides a reference implementation of CBLAS on top of FORTRAN
+      BLAS (Netlib CBLAS).  Since GSL is implemented in C, it naturally
+      provides CBLAS.
+      *Caution about the compatibility:* Chameleon has been mainly tested with
+      the reference CBLAS from NETLIB, OpenBLAS and Intel MKL.
+***** LAPACK implementation
+      [[http://www.netlib.org/lapack/][LAPACK]] (Linear Algebra PACKage) is a software library for
+      numerical linear algebra, a successor of LINPACK and EISPACK and
+      a predecessor of Chameleon.  LAPACK provides routines for solving
+      linear systems of equations, linear least square problems,
+      eigenvalue problems and singular value problems.  Most commercial
+      and academic BLAS packages also provide some LAPACK routines.
+      *Caution about the compatibility:* Chameleon has been mainly tested
+      with the reference LAPACK from NETLIB, OpenBLAS and Intel MKL.
+***** LAPACKE
+      [[http://www.netlib.org/lapack/][LAPACKE]] is a C language interface to LAPACK (or CLAPACK).  It is
+      produced by Intel in coordination with the LAPACK team and is
+      available in source code from Netlib in its original version
+      (Netlib LAPACKE) and from Chameleon website in an extended
+      version (LAPACKE for Chameleon).  In addition to implementing the
+      C interface, LAPACKE also provides routines which automatically
+      handle workspace allocation, making the use of LAPACK much more
+      convenient.
+      *Caution about the compatibility:* Chameleon has been mainly tested
+      with the reference LAPACKE from NETLIB, OpenBLAS and Intel
+      MKL. In addition the LAPACKE library *must* be configured to
+      provide the interface with the TMG routines and symbols like
+      ~LAPACKE_dlatms_work~ should be defined.
+***** libtmg
+      [[http://www.netlib.org/lapack/][libtmg]] is a component of the LAPACK library, containing routines
+      for generation of input matrices for testing and timing of
+      LAPACK.  The testing and timing suites of LAPACK require libtmg,
+      but not the library itself. Note that the LAPACK library can be
+      built and used without libtmg.
+      *Caution about the compatibility:* Chameleon has been mainly tested
+      with the reference TMGLIB from NETLIB, OpenBLAS and Intel MKL.
+***** StarPU
+      [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] is a task programming library for hybrid architectures.
+      StarPU handles run-time concerns such as:
+      * Task dependencies
+      * Optimized heterogeneous scheduling
+      * Optimized data transfers and replication between main memory
+        and discrete memories
+      * Optimized cluster communications
+      StarPU can be used to benefit from GPUs and distributed-memory
+      environment. Note StarPU is enabled by default.
+      *Caution about the compatibility:* Chameleon has been mainly tested
+      with StarPU-1.1, 1.2 and 1.3 releases.
+***** PaRSEC
+      [[http://icl.utk.edu/parsec/][PaRSEC]] is a generic framework for architecture aware scheduling
+      and management of micro-tasks on distributed many-core
+      heterogeneous architectures. It can be used with MPI and Cuda.
+      *Caution about the compatibility:* Chameleon is compatible with
+      this version
+      https://bitbucket.org/mfaverge/parsec/branch/mymaster.
+***** QUARK
+      [[http://icl.cs.utk.edu/quark/][QUARK]] (QUeuing And Runtime for Kernels) provides a library that
+      enables the dynamic execution of tasks with data dependencies in
+      a multi-core, multi-socket, shared-memory environment. When
+      Chameleon is linked with QUARK, it is not possible to exploit
+      either CUDA (for GPUs) or MPI (distributed-memory environment).
+      You can use PaRSEC or StarPU to do so.
+      *Caution about the compatibility:* Chameleon has been mainly tested
+      with the QUARK library coming from https://github.com/ecrc/quark.
+***** EZTrace
+      This library provides efficient modules for recording
+      traces. Chameleon can trace kernels execution on CPU workers
+      thanks to EZTrace and produce .paje files. EZTrace also provides
+      integrated modules to trace MPI calls and/or memory usage. See
+      how to use this feature here [[sec:trace_ezt][Execution trace using EZTrace]]. To
+      trace kernels execution on all kinds of workers, such as CUDA, we
+      recommend using the internal tracing support provided by the
+      underlying runtime system.  See how to use this
+      feature here [[sec:trace_fxt][Execution trace using StarPU/FxT]].
+***** hwloc
+      [[http://www.open-mpi.org/projects/hwloc/][hwloc]] (Portable Hardware Locality) is a software package for
+      accessing the topology of a multicore system including components
+      like: cores, sockets, caches and NUMA nodes. The topology
+      discovery library, ~hwloc~, is strongly recommended to be used
+      through the runtime system. It helps to increase performance
+      and enables topology-aware scheduling. ~hwloc~ is available
+      in major distributions and for most OSes and can be downloaded
+      from http://www.open-mpi.org/software/hwloc.
+      *Caution about the compatibility:* hwloc should be compatible with
+      the runtime system used.
+***** OpenMPI
+      [[http://www.open-mpi.org/][OpenMPI]] is an open source Message Passing Interface
+      implementation for execution on multiple nodes with
+      distributed-memory environment.  MPI can be enabled only if the
+      runtime system chosen is StarPU (default) or PaRSEC.  To use MPI through
+      StarPU, it is necessary to compile StarPU with MPI enabled.
+      *Caution about the compatibility:* OpenMPI should be built with the
+      --enable-mpi-thread-multiple option.
+***** Nvidia CUDA Toolkit
+      [[https://developer.nvidia.com/cuda-toolkit][Nvidia CUDA Toolkit]] provides a comprehensive development
+      environment for C and C++ developers building GPU-accelerated
+      applications.  Chameleon can use a set of low level optimized
+      kernels coming from cuBLAS to accelerate computations on GPUs.
+      The [[http://docs.nvidia.com/cuda/cublas/][cuBLAS]] library is an implementation of BLAS (Basic Linear
+      Algebra Subprograms) on top of the Nvidia CUDA runtime.  cuBLAS
+      is normally distributed with Nvidia CUDA Toolkit.  CUDA/cuBLAS can
+      be enabled in Chameleon only if the runtime system chosen is
+      StarPU (default) or PaRSEC.  To use CUDA through StarPU, it is necessary to
+      compile StarPU with CUDA enabled.
+      *Caution about the compatibility:* your compiler must be compatible
+      with CUDA.
+*** Build and install Chameleon with CMake
+    <<sec:ug:cmake>>
+    Compilation of Chameleon libraries and executables is done with
+    CMake (http://www.cmake.org/). This version has been tested with
+    CMake 3.10.2 but any version newer than 2.8 should be fine.
+    Here are the steps to configure, build, test and install:
+    1. configure:
+       #+begin_src
+       cmake path/to/chameleon -DOPTION1= -DOPTION2= ...
+       # see the "Configuration options" section to get list of options
+       # see the "Dependencies detection" for details about libraries detection
+       #+end_src
+    2. build:
+       #+begin_src
+       make
+       # do not hesitate to use -j[ncores] option to speedup the compilation
+       #+end_src
+    3. test (optional, requires CHAMELEON_ENABLE_TESTING=ON):
+       #+begin_src
+       make test
+       # or
+       ctest
+       #+end_src
+    4. install (optional):
+       #+begin_src
+       make install
+       #+end_src
+       Do not forget to specify the install directory with
+       *-DCMAKE_INSTALL_PREFIX* at configure.
+       #+begin_example
+       cmake /home/jdoe/chameleon -DCMAKE_INSTALL_PREFIX=/home/jdoe/install/chameleon
+       #+end_example
+       Note that the install process is optional. You are free to use
+       Chameleon binaries compiled in the build directory.
+**** Configuration options
+     You can optionally activate some options at cmake configure (like CUDA, MPI, ...)
+     invoking ~cmake path/to/your/CMakeLists.txt -DOPTION1= -DOPTION2= ...~
+     #+begin_src
+     cmake /home/jdoe/chameleon/ -DCMAKE_BUILD_TYPE=Debug \
+                                 -DCMAKE_INSTALL_PREFIX=/home/jdoe/install/ \
+                                 -DCHAMELEON_USE_CUDA=ON \
+                                 -DCHAMELEON_USE_MPI=ON \
+                                 -DBLA_VENDOR=Intel10_64lp \
+                                 -DCHAMELEON_ENABLE_TRACING=ON
+     #+end_src
+     You can get the full list of options with *-L[A][H]* options of cmake command
+     #+begin_src
+     cmake -LH /home/jdoe/chameleon/
+     #+end_src
+     You can also set the options thanks to the *ccmake* interface.
+***** Native CMake options (non-exhaustive list)
+      * *CMAKE_BUILD_TYPE=Debug|Release|RelWithDebInfo|MinSizeRel*:
+        level of compiler optimization, enable/disable debug
+        information
+      * *CMAKE_INSTALL_PREFIX=path/to/your/install/dir*: where headers,
+        libraries, executables, etc, will be copied when invoking make
+        install
+      * *BUILD_SHARED_LIBS=ON|OFF*: indicates whether or not CMake has to
+        build Chameleon static (~OFF~) or shared (~ON~) libraries.
+      * *CMAKE_C_COMPILER=gcc|icc|...*: to choose the C compilers
+        if several exist in the environment
+      * *CMAKE_Fortran_COMPILER=gfortran|ifort|...*: to choose the
+        Fortran compilers if several exist in the environment
+***** Related to specific modules (find_package) to find external libraries
+      * *BLA_VENDOR=All|OpenBLAS|Generic|Intel10_64lp|Intel10_64lp_seq|FLAME*:
+        to use intel mkl for example, see the list of BLA_VENDOR in
+        FindBLAS.cmake in cmake_modules/morse_cmake/modules/find
+      Libraries detected with an official cmake module (see module
+      files in CMAKE_ROOT/Modules/): BLAS - LAPACK - CUDA - MPI -
+      OpenMP - Threads.
+      Libraries detected with our cmake modules (see module files in
+      cmake_modules/morse_cmake/modules/find/ directory of Chameleon
+      sources): CBLAS - EZTRACE - FXT - HWLOC - LAPACKE - PARSEC -
+      QUARK - SIMGRID - STARPU.
+***** Chameleon specific options
+      * *CHAMELEON_SCHED=STARPU|PARSEC|QUARK|OPENMP* (default STARPU): to
+        link respectively with StarPU, PaRSEC, Quark, OpenMP library
+        (runtime system)
+      * *CHAMELEON_USE_MPI=ON|OFF* (default OFF): to link with MPI
+        library (message passing implementation for use of multiple
+        nodes with distributed memory), can only be used with StarPU
+        and PaRSEC
+      * *CHAMELEON_USE_CUDA=ON|OFF* (default OFF): to link with CUDA
+        runtime (implementation paradigm for accelerated codes on GPUs)
+        and cuBLAS library (optimized BLAS kernels on GPUs), can only
+        be used with StarPU and PaRSEC
+      * *CHAMELEON_ENABLE_DOC=ON|OFF* (default OFF): to control build of
+        the documentation contained in doc/ sub-directory
+      * *CHAMELEON_ENABLE_EXAMPLE=ON|OFF* (default ON): to control build
+        of the examples executables (API usage) contained in example/
+        sub-directory
+      * *CHAMELEON_ENABLE_TESTING=ON|OFF* (default ON): to control build
+        of testing executables (timer and numerical check) contained in testing/
+        sub-directory
+      * *CHAMELEON_SIMULATION=ON|OFF* (default OFF): to enable
+        simulation mode, means Chameleon will not really execute tasks,
+        see details in section [[sec:simu][Use simulation mode with
+        StarPU-SimGrid]]. This option must be used with StarPU compiled
+        with [[http://simgrid.gforge.inria.fr/][SimGrid]] allowing to guess the execution time on any
+        architecture. This feature should be used to make experiments
+        on the scheduler behaviors and performances not to produce
+        solutions of linear systems.
+      * *CHAMELEON_USE_MIGRATE=ON|OFF* (default OFF): enables the data
+        migration in QR algorithms
+**** Dependencies detection
+     <<sec:depdet>>
+     You have different choices to detect dependencies on your system,
+     either by setting some environment variables containing paths to
+     the libs and headers or by specifying them directly at cmake
+     configure. Different cases:
+     1) detection of dependencies through environment variables:
+        - LD_LIBRARY_PATH (DYLD_LIBRARY_PATH on Mac OSX) should contain
+          the list of paths where to find the libraries:
+          #+begin_src
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:install/path/to/your/lib
+          #+end_src
+        - INCLUDE (or CPATH, or C_INCLUDE_PATH) should contain the list
+          of paths where to find the header files of libraries
+          #+begin_src
+          export INCLUDE=$INCLUDE:install/path/to/your/headers
+          #+end_src
+     2) detection with user's given paths:
+        - you can specify the path at cmake configure by invoking
+          #+begin_example
+          cmake path/to/your/CMakeLists.txt -DLIB_DIR=path/to/your/lib
+          #+end_example
+          where LIB stands for the name of the lib to look for, /e.g./
+          #+begin_src
+          cmake path/to/your/CMakeLists.txt -DQUARK_DIR=path/to/quarkdir \
+                                            -DCBLAS_DIR= ...
+          #+end_src
+          it is also possible to specify headers and library directories
+          separately
+          #+begin_src
+          cmake path/to/your/CMakeLists.txt \
+                -DQUARK_INCDIR=path/to/quark/include \
+                -DQUARK_LIBDIR=path/to/quark/lib
+          #+end_src
+     3) detection with custom environment variables: all variables like
+        _DIR, _INCDIR, _LIBDIR can be set as environment variables
+        instead of CMake options, they will be read
+     4) using [[https://www.freedesktop.org/wiki/Software/pkg-config/][pkg-config]] for libraries that provide .pc files
+        - update your *PKG_CONFIG_PATH* to the paths where to find .pc
+          files of installed external libraries like hwloc, starpu, some
+          blas/lapack, etc
+     Note that PaRSEC and StarPU are only detected with pkg-config
+     mechanism because it is always provided and this avoids errors.
+*** Distribution Debian
+    <<sec:ug:debian>>
+    Download one of the available packages for your distribution here
+    https://gitlab.inria.fr/solverstack/chameleon/-/packages, then
+    install as follows
+    #+begin_src sh
+    sudo apt-get install ./chameleon_1.1.0-1_amd64.deb -y
+    #+end_src
+    Chameleon will be installed on your system meaning you can use
+    drivers for performance tests
+    #+begin_src sh
+    mpiexec -n 2 chameleon_stesting -t 2 -o gemm -n 1000
+    #+end_src
+    and use Chameleon library in your own project
+    #+begin_src sh
+    # example usage: use chameleon library in your own cmake project (we provide a CHAMELEONConfig.cmake)
+    git clone https://gitlab.inria.fr/solverstack/distrib.git
+    cd distrib/cmake/test/chameleon && mkdir build && cd build && cmake .. && make && ./test_chameleon
+    # example usage: use chameleon library in your own non-cmake project
+    # use pkg-config to get compiler flags and linking
+    pkg-config --cflags chameleon
+    pkg-config --libs chameleon
+    # if there are static libraries use the --static option of pkg-config
+    #+end_src
+    Do not hesitate to send an [[mailto:florent.pruvost@inria.fr][email]] if you need a package for your
+    Debian distribution.
+*** Distribution of Chameleon using GNU Guix
+    <<sec:ug:guix>>
+    We provide [[http://guix.gnu.org/][Guix]] packages to install Chameleon with its dependencies
+    in a reproducible way on GNU/Linux systems. For MacOSX please refer
+    to the next sections about Brew or Spack packaging.
+    If you are "root" on the system you can install Guix and directly
+    use it to install the libraries. On supercomputers where you are not
+    root, you may still be able to use it if Docker or Singularity
+    are available on the machine because Chameleon can be packaged as
+    Docker/Singularity images with Guix.
+**** Installing Guix
+     Guix requires a running GNU/Linux system, GNU tar and Xz.
+     #+begin_src sh
+     gpg --keyserver pgp.mit.edu --recv-keys 3CE464558A84FDC69DB40CFB090B11993D9AEBB5
+     wget https://git.savannah.gnu.org/cgit/guix.git/plain/etc/guix-install.sh
+     chmod +x guix-install.sh
+     sudo ./guix-install.sh
+     #+end_src
+     The Chameleon packages are not official Guix packages. It is then
+     necessary to add a channel to get additional packages.  Create a
+     ~/.config/guix/channels.scm file with the following snippet:
+     #+begin_example
+     (cons (channel
+         (name 'guix-hpc-non-free)
+         (url "https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free.git"))
+       %default-channels)
+     #+end_example
+     Update guix package definition
+     #+begin_src sh
+     guix pull
+     #+end_src
+     Update new guix in the path
+     #+begin_src sh
+     PATH="$HOME/.config/guix/current/bin${PATH:+:}$PATH"
+     hash guix
+     #+end_src
+     For further shell sessions, add this to the ~/.bash_profile file
+     #+begin_example
+     export PATH="$HOME/.config/guix/current/bin${PATH:+:}$PATH"
+     export GUIX_LOCPATH="$HOME/.guix-profile/lib/locale"
+     #+end_example
+     Chameleon packages are now available
+     #+begin_src sh
+     guix search ^chameleon
+     #+end_src
+     Refer to the [[https://guix.gnu.org/manual/en/guix.html#Invoking-guix-package][official documentation of Guix]] to learn the basic
+     commands.
+**** Installing Chameleon with Guix
+     Standard Chameleon, last release
+     #+begin_src sh
+     guix install chameleon
+     #+end_src
+     Notice that there exist several build variants
+     - chameleon (default) : with starpu - with mpi - with OpenBlas
+     - chameleon-mkl-mt : default version but with Intel MKL multithreaded to replace OpenBlas
+     - chameleon-cuda : with starpu - with mpi - with cuda
+     - chameleon-cuda-mkl-mt : with starpu - with mpi - with cuda - with Intel MKL multithreaded to replace OpenBlas
+     - chameleon-simgrid : with starpu - with mpi - with simgrid
+     - chameleon-openmp : with openmp - without mpi
+     - chameleon-parsec : with parsec - without mpi
+     - chameleon-quark : with quark - without mpi
+     Change the version
+     #+begin_src sh
+     guix install chameleon --with-branch=chameleon=master
+     guix install chameleon --with-commit=chameleon=b31d7575fb7d9c0e1ba2d8ec633e16cb83778e8b
+     guix install chameleon --with-git-url=chameleon=https://gitlab.inria.fr/fpruvost/chameleon.git
+     guix install chameleon --with-git-url=chameleon=$HOME/git/chameleon
+     #+end_src
+     Notice also that default mpi is OpenMPI and default blas/lapack is
+     Openblas. This can be changed with a [[https://guix.gnu.org/manual/en/guix.html#Package-Transformation-Options][transformation option]].
+     Change some dependencies
+     #+begin_src sh
+     # install chameleon with intel mkl to replace openblas, nmad to replace openmpi and starpu with fxt
+     guix install chameleon --with-input=openblas=mkl --with-input=openmpi=nmad --with-input=starpu=starpu-fxt
+     #+end_src
+**** Generate a Chameleon Docker image with Guix
+     To install Chameleon and its dependencies within a docker image
+     (OpenMPI stack)
+     #+begin_src sh
+     docker_chameleon=`guix pack -f docker chameleon chameleon --with-branch=chameleon=master --with-input=openblas=mkl mkl starpu hwloc openmpi openssh slurm bash coreutils inetutils util-linux procps git grep tar sed gzip which gawk perl emacs-minimal vim gcc-toolchain make cmake pkg-config -S /bin=bin --entry-point=/bin/bash`
+     # Load the generated tarball as a docker image
+     docker_chameleon_tag=`docker load --input $docker_chameleon | grep "Loaded image: " | cut -d " " -f 3-`
+     # Change tag name, see the existing image name with "docker images" command, then change to a more simple name
+     docker tag $docker_chameleon_tag guix/chameleon-tmp
+     #+end_src
+     Create a Dockerfile inheriting from the image (renamed
+     =guix/chameleon-tmp= here):
+     #+begin_src sh :eval no :tangle Dockerfile :export none
+     FROM guix/chameleon-tmp
+     # Create a directory for user 1000
+     RUN mkdir -p /builds
+     RUN chown -R 1000 /builds
+     ENTRYPOINT ["/bin/bash", "-l"]
+     # Enter the image as user 1000 in /builds
+     USER 1000
+     WORKDIR /builds
+     ENV HOME /builds
+     #+end_src
+     Then create the final docker image from this docker file.
+     #+begin_src sh
+     docker build -t guix/chameleon .
+     #+end_src
+     Test the image
+     #+begin_src sh
+     docker run -it guix/chameleon
+     # test starpu
+     STARPU=`pkg-config --variable=prefix libstarpu`
+     mpiexec -np 4 $STARPU/lib/starpu/mpi/comm
+     # test chameleon
+     CHAMELEON=`pkg-config --variable=prefix chameleon`
+     mpiexec -np 2 $CHAMELEON/bin/chameleon_stesting -H -o gemm -P 2 -t 2 -m 2000 -n 2000 -k 2000
+     #+end_src
+**** Generate a Chameleon Singularity image with Guix
+     To package Chameleon and its dependencies within a singularity image
+     (OpenMPI stack)
+     #+begin_src sh
+     singularity_chameleon=`guix pack -f squashfs chameleon --with-branch=chameleon=master --with-input=openblas=mkl mkl starpu hwloc openmpi openssh slurm hdf5 zlib bash coreutils inetutils util-linux procps git grep tar sed gzip which gawk perl emacs-minimal vim gcc-toolchain make cmake pkg-config -S /bin=bin --entry-point=/bin/bash`
+     cp $singularity_chameleon chameleon-pack.gz.squashfs
+     # copy the singularity image on the supercomputer, e.g. 'supercomputer'
+     scp chameleon-pack.gz.squashfs supercomputer:
+     #+end_src
+     On a machine where Singularity is installed Chameleon can then be
+     called as follows
+     #+begin_src sh
+     # at least openmpi and singularity are required here, e.g. module add openmpi singularity
+     mpiexec -np 2 singularity exec chameleon-pack.gz.squashfs /bin/chameleon_stesting -H -o gemm -P 2 -t 2 -m 2000 -n 2000 -k 2000
+     #+end_src
+*** Distribution of Chameleon using Spack
+    <<sec:ug:spack>>
+**** Installing Spack
+     To get support to install a full distribution on Linux or MacOS X,
+     Chameleon plus dependencies, we encourage users to use
+     [[https://spack.io/][Spack]]. Please refer to our [[https://gitlab.inria.fr/solverstack/spack-repo/blob/master/README.org][Spack Repository]].
+     #+begin_src sh
+     git clone https://github.com/llnl/spack.git
+     export SPACK_ROOT=$PWD/spack
+     cd spack/
+     git checkout v0.16.0
+     . $SPACK_ROOT/share/spack/setup-env.sh
+     git clone https://gitlab.inria.fr/solverstack/spack-repo.git ./var/spack/repos/solverstack
+     spack repo add ./var/spack/repos/solverstack
+     #+end_src
+     Chameleon is then available
+     #+begin_src sh
+     spack info chameleon
+     spack spec chameleon
+     #+end_src
+     Refer to the [[https://spack.readthedocs.io/en/latest/basic_usage.html][official documentation of Spack]] to learn the basic
+     commands.
+**** Installing Chameleon with Spack
+     Standard Chameleon, last state on the 'master' branch
+     #+begin_src sh
+     spack install -v chameleon
+     # chameleon is installed here:
+     spack location -i chameleon
+     #+end_src
+     Notice that there exist several build variants
+     - chameleon (default) : with starpu - with mpi
+     - tune the build type (CMake) with build_type=RelWithDebInfo|Debug|Release
+     - enable/disable shared libraries with +/- shared
+     - enable/disable mpi with +/- mpi
+     - enable/disable cuda with +/- cuda
+     - enable/disable fxt with +/- fxt
+     - enable/disable simgrid with +/- simgrid
+     - +openmp~starpu : with openmp - without starpu
+     - +quark~starpu : with quark - without starpu
+     Change the version
+     #+begin_src sh
+     spack install -v chameleon@1.0.0
+     #+end_src
+     Notice also that default mpi is OpenMPI and default blas/lapack is
+     Openblas. This can be changed by adding some [[https://spack.readthedocs.io/en/latest/basic_usage.html#constraining-virtual-packages][constraints on
+     virtual packages]].
+     Change some dependencies
+     #+begin_src sh
+     # see lapack providers
+     spack providers lapack
+     # see mpi providers
+     spack providers mpi
+     # install chameleon with intel mkl to replace openblas
+     spack install -v chameleon ^intel-mkl
+     # install chameleon with nmad to replace openmpi
+     spack install -v chameleon ^nmad
+     #+end_src
+*** Distribution Brew for Mac OS X
+    <<sec:ug:brew>>
+    We provide some [[https://brew.sh/][brew]] packages here
+    https://gitlab.inria.fr/solverstack/brew-repo (under construction).
+*** Linking an external application with Chameleon libraries
+    <<sec:ug:link>>
+    Compilation and link with Chameleon libraries have been tested with
+    the GNU compiler suite ~gcc/gfortran~ and the Intel compiler suite
+    ~icc/ifort~.
+**** For CMake projects
+     A CHAMELEONConfig.cmake file is provided at installation, stored
+     in <prefix>/lib/cmake/chameleon, so that users of CMake projects
+     can find Chameleon through the variable CHAMELEON_ROOT (set it as
+     an environment or CMake variable).
+     #+begin_src
+     sudo apt-get update
+     sudo apt-get install -y libopenblas-dev liblapacke-dev libstarpu-dev
+     git clone --recursive https://gitlab.inria.fr/solverstack/chameleon.git
+     cd chameleon && mkdir -p build && cd build
+     CHAMELEON_ROOT=$PWD/install
+     cmake .. -DCMAKE_INSTALL_PREFIX=$CHAMELEON_ROOT && make -j5 install
+     # chameleon is installed in $CHAMELEON_ROOT
+     # if you work in a cmake project you can use the CHAMELEONConfig.cmake file
+     # installed under <prefix>/lib/cmake/chameleon/ by setting your
+     # CMAKE_PREFIX_PATH with the path of installation. In your cmake project, use
+     # find_package(CHAMELEON) and link your libraries and/or executables with the
+     # library target MORSE::CHAMELEON
+     #+end_src
+**** For non CMake projects
+     The compiler, linker flags that are necessary to build an
+     application using Chameleon are given through the [[https://www.freedesktop.org/wiki/Software/pkg-config/][pkg-config]]
+     mechanism.
+     #+begin_src
+     sudo apt-get update
+     sudo apt-get install -y libopenblas-dev liblapacke-dev libstarpu-dev
+     git clone --recursive https://gitlab.inria.fr/solverstack/chameleon.git
+     cd chameleon && mkdir -p build && cd build
+     CHAMELEON_ROOT=$PWD/install
+     cmake .. -DCMAKE_INSTALL_PREFIX=$CHAMELEON_ROOT && make -j5 install
+     # chameleon is installed in $CHAMELEON_ROOT
+     pkg-config --cflags chameleon
+     pkg-config --libs chameleon
+     pkg-config --libs --static chameleon
+     # use it in your configure/make
+     #+end_src
+     The .pc files required are located in the sub-directory
+     ~lib/pkgconfig~ of your Chameleon install directory.
+**** Static linking in C
+     Let's imagine you have a file ~main.c~ that you want to link with
+     Chameleon static libraries.  Let's consider
+     ~/home/yourname/install/chameleon~ is the install directory
+     of Chameleon containing sub-directories ~include/~ and
+     ~lib/~.  Here could be your compilation command with gcc
+     compiler:
+     #+begin_src
+     gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c
+     #+end_src
+     Now if you want to link your application with Chameleon static libraries, you
+     could do:
+     #+begin_src
+     gcc main.o -o main                                         \
+     /home/yourname/install/chameleon/lib/libchameleon.a        \
+     /home/yourname/install/chameleon/lib/libchameleon_starpu.a \
+     /home/yourname/install/chameleon/lib/libcoreblas.a         \
+     -lstarpu-1.2 -Wl,--no-as-needed -lmkl_intel_lp64           \
+     -lmkl_sequential -lmkl_core -lpthread -lm -lrt
+     #+end_src
+     As you can see in this example, we also link with some dynamic
+     libraries *starpu-1.2*, *Intel MKL* libraries (for
+     BLAS/LAPACK/CBLAS/LAPACKE), *pthread*, *m* (math) and *rt*. These
+     libraries will depend on the configuration of your Chameleon
+     build.  You can find these dependencies in .pc files we generate
+     during compilation and that are installed in the sub-directory
+     ~lib/pkgconfig~ of your Chameleon install directory.  Note also that
+     you could need to specify where to find these libraries with *-L*
+     option of your compiler/linker.
+     Before running your program, make sure that all shared libraries
+     paths your executable depends on are known.  Enter ~ldd main~
+     to check.  If some shared libraries paths are missing append them
+     in the LD_LIBRARY_PATH (for Linux systems) environment
+     variable (DYLD_LIBRARY_PATH on Mac).
+**** Dynamic linking in C
+     For dynamic linking (need to build Chameleon with CMake option
+     BUILD_SHARED_LIBS=ON) it is similar to static compilation/link but
+     instead of specifying path to your static libraries you indicate
+     the path to dynamic libraries with *-L* option and you give
+     the name of libraries with *-l* option like this:
+     #+begin_src
+     gcc main.o -o main \
+     -L/home/yourname/install/chameleon/lib \
+     -lchameleon -lchameleon_starpu -lcoreblas \
+     -lstarpu-1.2 -Wl,--no-as-needed -lmkl_intel_lp64 \
+     -lmkl_sequential -lmkl_core -lpthread -lm -lrt
+     #+end_src
+     Note that an update of your environment variable LD_LIBRARY_PATH
+     (DYLD_LIBRARY_PATH on Mac) with the path of the libraries could be
+     required before executing
+     #+begin_src
+     export LD_LIBRARY_PATH=path/to/libs:path/to/chameleon/lib
+     #+end_src
+ # # *** Build a Fortran program with Chameleon                         :noexport:
+ # #
+ # #     Chameleon provides a Fortran interface to user functions. Example:
+ # #     #+begin_src
+ # #     call chameleon_version(major, minor, patch) !or
+ # #     call CHAMELEON_VERSION(major, minor, patch)
+ # #     #+end_src
+ # #
+ # #     Build and link are very similar to the C case.
+ # #
+ # #     Compilation example:
+ # #     #+begin_src
+ # #     gfortran -o main.o -c main.f90
+ # #     #+end_src
+ # #
+ # #     Static linking example:
+ # #     #+begin_src
+ # #     gfortran main.o -o main                                    \
+ # #     /home/yourname/install/chameleon/lib/libchameleon.a        \
+ # #     /home/yourname/install/chameleon/lib/libchameleon_starpu.a \
+ # #     /home/yourname/install/chameleon/lib/libcoreblas.a         \
+ # #     -lstarpu-1.2 -Wl,--no-as-needed -lmkl_intel_lp64           \
+ # #     -lmkl_sequential -lmkl_core -lpthread -lm -lrt
+ # #     #+end_src
+ # #
+ # #     Dynamic linking example:
+ # #     #+begin_src
+ # #     gfortran main.o -o main                          \
+ # #     -L/home/yourname/install/chameleon/lib           \
+ # #     -lchameleon -lchameleon_starpu -lcoreblas        \
+ # #     -lstarpu-1.2 -Wl,--no-as-needed -lmkl_intel_lp64 \
+ # #     -lmkl_sequential -lmkl_core -lpthread -lm -lrt
+ # #     #+end_src
diff --git a/doc/user/chapters/introduction.org b/doc/user/chapters/introduction.org
new file mode 100644
index 0000000000000000000000000000000000000000..a1fbc023b9e9b934b45ea5a51a4d19c33cb8ea99
--- /dev/null
+++ b/doc/user/chapters/introduction.org
@@ -0,0 +1,310 @@
+# This file is part of the Chameleon User's Guide.
+# Copyright (C) 2018 Inria
+# See the file ../users_guide.org for copying conditions.
+*** MORSE project
+    #+NAME: fig:chameleon_header
+    #+ATTR_HTML: :align center
+    [[file:chameleon_header.png]]
+    Chameleon is a linear algebra software created jointly by several
+    research teams as part of the MORSE associate team: [[http://www.icl.utk.edu/][ICL]], [[https://www.inria.fr/en/][Inria]],
+    [[https://www.kaust.edu.sa/en][KAUST]], [[http://www.ucdenver.edu/pages/ucdwelcomepage.aspx][The University of Colorado Denver]].
+**** MORSE Objectives
+     When processor clock speeds flatlined in 2004, after more than
+     fifteen years of exponential increases, the era of near automatic
+     performance improvements that the HPC application community had
+     previously enjoyed came to an abrupt end.  To develop software that
+     will perform well on petascale and exascale systems with thousands
+     of nodes and millions of cores, the list of major challenges that
+     must now be confronted is formidable:
+     1) dramatic escalation in the costs of intrasystem communication
+        between processors and/or levels of memory hierarchy;
+     2) increased heterogeneity of the processing units (mixing CPUs,
+        GPUs, etc. in varying and unexpected design combinations);
+     3) high levels of parallelism and more complex constraints means
+        that cooperating processes must be dynamically and unpredictably
+        scheduled for asynchronous execution;
+     4) software will not run at scale without much better resilience to
+        faults and far more robustness; and
+     5) new levels of self-adaptivity will be required to enable
+        software to modulate process speed in order to satisfy limited
+        energy budgets.
+     The MORSE associate team will tackle the first three challenges in
+     an orchestrating work between research groups respectively
+     specialized in sparse linear algebra, dense linear algebra and
+     runtime systems.  The overall objective is to develop robust
+     linear algebra libraries relying on innovative runtime systems
+     that can fully benefit from the potential of those future
+     large-scale complex machines.  Challenges 4) and 5) will also be
+     investigated by the different teams in the context of other
+     partnerships, but they will not be the main focus of the associate
+     team as they are much more prospective.
+**** Research fields
+     The overall goal of the MORSE associate team is to enable advanced
+     numerical algorithms to be executed on a scalable unified runtime
+     system for exploiting the full potential of future exascale
+     machines.  We expect advances in three directions based first on
+     strong and close interactions between the runtime and numerical
+     linear algebra communities.  This initial activity will then
+     naturally expand to more focused but still joint research in both
+     fields.
+***** Fine interaction between linear algebra and runtime systems
+      On parallel machines, HPC applications need to take care of data
+      movement and consistency, which can be either explicitly managed
+      at the level of the application itself or delegated to a runtime
+      system.  We adopt the latter approach in order to better keep up
+      with hardware trends whose complexity is growing exponentially.
+      One major task in this project is to define a proper interface
+      between HPC applications and runtime systems in order to maximize
+      productivity and expressivity.  As mentioned in the next section,
+      a widely used approach consists in abstracting the application as
+      a DAG that the runtime system is in charge of scheduling.
+      Scheduling such a DAG over a set of heterogeneous processing
+      units introduces a lot of new challenges, such as predicting
+      accurately the execution time of each type of task over each kind
+      of unit, minimizing data transfers between memory banks,
+      performing data prefetching, etc.  Expected advances: In a
+      nutshell, a new runtime system API will be designed to allow
+      applications to provide scheduling hints to the runtime system
+      and to get real-time feedback about the consequences of
+      scheduling decisions.
+***** Runtime systems
+      A runtime environment is an intermediate layer between the system
+      and the application.  It provides low-level functionality not
+      provided by the system (such as scheduling or management of the
+      heterogeneity) and high-level features (such as performance
+      portability).  In the framework of this proposal, we will work on
+      the scalability of runtime environment. To achieve scalability it
+      is required to avoid all centralization.  Here, the main problem
+      is the scheduling of the tasks.  In many task-based runtime
+      environments the scheduler is centralized and becomes a
+      bottleneck as soon as too many cores are involved.  It is
+      therefore required to distribute the scheduling decision or to
+      compute a data distribution that imposes the mapping of tasks
+      using, for instance, the so-called ``owner-compute'' rule.
+      Expected advances: We will design runtime systems that enable an
+      efficient and scalable use of thousands of distributed multicore
+      nodes enhanced with accelerators.
+***** Linear algebra
+      Because of its central position in HPC and of the well understood
+      structure of its algorithms, dense linear algebra has often
+      pioneered new challenges that HPC had to face.  Again, dense
+      linear algebra has been in the vanguard of the new era of
+      petascale computing with the design of new algorithms that can
+      efficiently run on a multicore node with GPU accelerators. These
+      algorithms are called ``communication-avoiding'' since they have
+      been redesigned to limit the amount of communication between
+      processing units (and between the different levels of memory
+      hierarchy).  They are expressed through Directed Acyclic Graphs
+      (DAG) of fine-grained tasks that are dynamically
+      scheduled. Expected advances: First, we plan to investigate the
+      impact of these principles in the case of sparse applications
+      (whose algorithms are slightly more complicated but often rely on
+      dense kernels).  Furthermore, both in the dense and sparse cases,
+      the scalability on thousands of nodes is still limited; new
+      numerical approaches need to be found.  We will specifically
+      design sparse hybrid direct/iterative methods that represent a
+      promising approach.
+**** Research papers
+     Research papers about MORSE can be found [[http://icl.cs.utk.edu/projectsdev/morse/pubs/index.html][here]].
+*** Chameleon
+**** Chameleon software
+     The main purpose is to address the performance shortcomings of the
+     [[http://www.netlib.org/lapack/][LAPACK]] and [[http://www.netlib.org/scalapack/][ScaLAPACK]] libraries on multicore processors and
+     multi-socket systems of multicore processors and their inability
+     to efficiently utilize accelerators such as Graphics Processing
+     Units (GPUs).
+     Chameleon is a framework written in C which provides routines to
+     solve dense general systems of linear equations, symmetric
+     positive definite systems of linear equations and linear least
+     squares problems, using LU, Cholesky, QR and LQ factorizations.
+     Real arithmetic and complex arithmetic are supported in both
+     single precision and double precision.  It supports Linux and Mac
+     OS/X machines (mainly tested on Intel x86-64 and IBM Power
+     architectures).
+     Chameleon is based on the [[http://icl.cs.utk.edu/plasma/][PLASMA]] source code but is not limited to
+     shared-memory environment and can exploit multiple GPUs.
+     Chameleon is interfaced in a generic way with [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]], [[http://icl.utk.edu/parsec/][PaRSEC]],
+     [[http://icl.cs.utk.edu/quark/][QUARK]] runtime systems.  This feature allows to analyze in a
+     unified framework how sequential task-based algorithms behave
+     regarding different runtime systems implementations.  Using
+     Chameleon with *StarPU* or *PaRSEC* runtime systems allows to exploit
+     GPUs through kernels provided by [[https://developer.nvidia.com/cublas][cuBLAS]] and clusters of
+     interconnected nodes with distributed memory (using [[http://www.open-mpi.org/][MPI]]).
+     Computation of very large systems with dense matrices on a cluster
+     of nodes is still being experimented and stabilized.  It is not
+     expected to get stable performances with the current version using
+     MPI.
+**** PLASMA's design principles
+     Chameleon is originally based on [[http://icl.cs.utk.edu/plasma/][PLASMA]] so that design principles
+     are very similar.  The content of this section PLASMA's design
+     principles has been copied from the /Design principles/ section of
+     the PLASMA User's Guide.
+***** Tile Algorithms
+      Tile algorithms are based on the idea of processing the matrix by
+      square tiles of relatively small size, such that a tile fits
+      entirely in one of the cache levels associated with one core.
+      This way a tile can be loaded to the cache and processed
+      completely before being evicted back to the main memory.  Of the
+      three types of cache misses, *compulsory*, *capacity* and *conflict*,
+      the use of tile algorithms minimizes the number of capacity
+      misses, since each operation loads the amount of data that does
+      not ``overflow'' the cache.
+      For some operations such as matrix multiplication and Cholesky
+      factorization, translating the classic algorithm to the tile
+      algorithm is trivial.  In the case of matrix multiplication, the
+      tile algorithm is simply a product of applying the technique of
+      *loop tiling* to the canonical definition of three nested loops.
+      It is very similar for the Cholesky factorization.  The
+      *left-looking* definition of Cholesky factorization from LAPACK is
+      a loop with a sequence of calls to four routines: xSYRK
+      (symmetric *rank-k* update), xPOTRF (Cholesky factorization of a
+      small block on the diagonal), xGEMM (matrix multiplication) and
+      xTRSM (triangular solve).  If the xSYRK, xGEMM and xTRSM
+      operations are expressed with the canonical definition of three
+      nested loops and the technique of loop tiling is applied, the
+      tile algorithm results.  Since the algorithm is produced by
+      simple reordering of operations, neither the number of operations
+      nor numerical stability of the algorithm are affected.
+      The situation becomes slightly more complicated for LU and QR
+      factorizations, where the classic algorithms factorize an entire
+      panel of the matrix (a block of columns) at every step of the
+      algorithm.  One can observe, however, that the process of matrix
+      factorization is synonymous with introducing zeros in appropriate
+      places and a tile algorithm can be thought of as one that zeroes
+      one tile of the matrix at a time.  This process is referred to as
+      updating of a factorization or *incremental factorization*.  The
+      process is equivalent to factorizing the top tile of a panel,
+      then placing the upper triangle of the result on top of the tile
+      below and factorizing again, then moving to the next tile and so
+      on.  Here, the tile LU and QR algorithms perform slightly more
+      floating point operations and require slightly more memory for
+      auxiliary data.  Also, the tile LU factorization applies a
+      different pivoting pattern and, as a result, is less numerically
+      stable than classic LU with full pivoting.  Numerical stability
+      is not an issue in case of the tile QR, which relies on
+      orthogonal transformations (Householder reflections), which are
+      numerically stable.
+      #+CAPTION: Schematic illustration of the tile LU factorization (kernel names for real arithmetics in double precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+      #+NAME: fig:tile_lu
+      #+ATTR_HTML: :width 640px :align center
+      [[file:tile_lu.jpg]]
+***** Tile Data Layout
+      <<sec:tile>>
+      Tile layout is based on the idea of storing the matrix by square
+      tiles of relatively small size, such that each tile occupies a
+      continuous memory region.  This way a tile can be loaded to the
+      cache memory efficiently and the risk of evicting it from the
+      cache memory before it is completely processed is minimized.  Of
+      the three types of cache misses, *compulsory*, *capacity* and
+      *conflict*, the use of tile layout minimizes the number of conflict
+      misses, since a continuous region of memory will completely fill
+      out a /set-associative/ cache memory before an eviction can happen.
+      Also, from the standpoint of multithreaded execution, the
+      probability of *false sharing* is minimized.  It can only affect
+      the cache lines containing the beginning and the ending of a
+      tile.
+      In standard *cache-based* architecture, tiles continuously laid out
+      in memory maximize the profit from automatic prefetching.  Tile
+      layout is also beneficial in situations involving the use of
+      accelerators, where explicit communication of tiles through DMA
+      transfers is required, such as moving tiles between the system
+      memory and the local store in Cell B. E. or moving tiles between
+      the host memory and the device memory in GPUs.  In most
+      circumstances tile layout also minimizes the number of TLB misses
+      and conflicts to memory banks or partitions.  With the standard
+      (*column-major*) layout, access to each column of a tile is much
+      more likely to cause a conflict miss, a false sharing miss, a TLB
+      miss or a bank or partition conflict.  The use of the standard
+      layout for dense matrix operations is a performance minefield.
+      Although occasionally one can pass through it unscathed, the risk
+      of hitting a spot deadly to performance is very high.
+      Another property of the layout utilized in PLASMA is that it is
+      ``flat'', meaning that it does not involve a level of
+      indirection. Each tile stores a small square submatrix of the
+      main matrix in a *column-major* layout. In turn, the main matrix is
+      an arrangement of tiles immediately following one another in a
+      *column-major* layout.  The offset of each tile can be calculated
+      through address arithmetics and does not involve pointer
+      indirection.  Alternatively, a matrix could be represented as an
+      array of pointers to tiles, located anywhere in memory. Such
+      layout would be a radical and unjustifiable departure from LAPACK
+      and ScaLAPACK.  Flat tile layout is a natural progression from
+      LAPACK's *column-major* layout and ScaLAPACK's /block-cyclic/ layout.
+      Another related property of PLASMA's tile layout is that it
+      includes provisions for padding of tiles, i.e., the actual region
+      of memory designated for a tile can be larger than the memory
+      occupied by the actual data.  This allows to force a certain
+      alignment of tile boundaries, while using the flat organization
+      described in the previous paragraph.  The motivation is that, at
+      the price of small memory overhead, alignment of tile boundaries
+      may prove beneficial in multiple scenarios involving memory
+      systems of standard multicore processors, as well as
+      accelerators.  The issues that come into play are, again, the use
+      of TLBs and memory banks or partitions.
+      #+CAPTION: Schematic illustration of the tile layout with *column-major* order of tiles, *column-major* order of elements within tiles and (optional) padding for enforcing a certain alignment of tile boundaries, courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+      #+NAME: fig:tile_layout
+      #+ATTR_HTML: :width 640px :align center
+      [[file:tile_layout.jpg]]
+***** Dynamic Task Scheduling
+      Dynamic scheduling is the idea of assigning work to cores based
+      on the availability of data for processing at any given point in
+      time and is also referred to as *data-driven* scheduling.  The
+      concept is related closely to the idea of expressing computation
+      through a task graph, often referred to as the DAG (*Directed
+      Acyclic Graph*), and the flexibility of exploring the DAG at runtime.
+      Thus, to a large extent, dynamic scheduling is synonymous with
+      *runtime scheduling*.  An important concept here is the one of
+      the *critical path*, which defines the upper bound on the
+      achievable parallelism, and needs to be pursued at the maximum
+      speed.  This is in direct opposition to the *fork-and-join* or
+      *data-parallel* programming models, where artificial
+      synchronization points expose serial sections of the code, where
+      multiple cores are idle, while sequential processing takes place.
+      The use of dynamic scheduling introduces a *trade-off*, though.
+      The more dynamic (flexible) scheduling is, the more centralized
+      (and less scalable) the scheduling mechanism is.  For that
+      reason, currently PLASMA uses two scheduling mechanisms, one
+      which is fully dynamic and one where work is assigned statically
+      and dependency checks are done at runtime.
+      The first scheduling mechanism relies on unfolding a *sliding
+      window* of the task graph at runtime and scheduling work by
+      resolving data hazards: *Read After Write (RAW)*, *Write After Read
+      (WAR)* and *Write After Write (WAW)*, a technique analogous to
+      instruction scheduling in superscalar processors.  It also relies
+      on *work-stealing* for balancing the load among multiple cores.
+      The second scheduling mechanism relies on statically designating
+      a path through the execution space of the algorithm to each core
+      and following a cycle: transition to a task, wait for its
+      dependencies, execute it, update the overall progress.  Tasks are
+      identified by tuples and task transitions are done through
+      locally evaluated formulas.  Progress information can be
+      centralized, replicated or distributed (currently centralized).
+      #+CAPTION: A trace of the tile QR factorization executing on eight cores without any global synchronization points (kernel names for real arithmetics in single precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+      #+NAME: fig:trace_qr
+      #+ATTR_HTML: :width 640px :align center
+      [[file:trace_qr.jpg]]
diff --git a/doc/user/chapters/performances.org b/doc/user/chapters/performances.org
new file mode 100644
index 0000000000000000000000000000000000000000..cbf157482270056b9c66d3cba0ec35dd4475c911
--- /dev/null
+++ b/doc/user/chapters/performances.org
@@ -0,0 +1,65 @@
+# This file is part of the Chameleon User's Guide.
+# Copyright (C) 2020 Inria
+# See the file ../users_guide.org for copying conditions.
+Show performances on [[https://www.plafrim.fr/][PlaFRIM]] supercomputer.
+See [[https://www.plafrim.fr/hardware-documentation/][characteristics]] to get details about the hardware.
+See script ~tools/bench/plafrim/run.sh~ to get details about the environment (Guix, Slurm,
+etc) and the build.
+Chameleon is run this way:
+#+begin_src sh
+mpiexec -np $nmpi $CHAMELEON_BUILD/testing/chameleon_${precision}testing -o ${algorithm} -P $p -t $nthr -g $ngpu -m $m -n $n -k $k -b $b
+#+end_src
+- runtime : *starpu*
+- precision : *s* or *d* for single or double precision
+- algorithm : *gemm* or *potrf* or *geqrf_hqr*
+- nmpi = p x p
+- nthr : depends on the node
+- ngpu : depends on the node
+- m = n = k
+- b : depends on the node
+*** bora (36 CPUs) nodes
+    - nmpi = *1*, *4*, *9*
+    - 2D block cyclic parameters : PxQ = 1x1, 2x2 and 3x3
+    - Number of threads (t) = *34*, one CPU being dedicated for the
+      scheduler and one other for MPI communications
+    - Number of GPUs = *0*
+    - Tile Size (b) = *280*
+**** CPU times
+     #+CAPTION: Performances in CPU time of GEMM, POTRF and QR on bora nodes
+     #+NAME: fig:chameleon_plafrim_bora_time_openmpi
+     #+ATTR_HTML: :align center :width 75%
+     [[file:chameleon_plafrim_bora_time_openmpi.png]]
+**** GFLOP/s
+     #+CAPTION: Performances in GFlop/s of GEMM, POTRF and QR on bora nodes
+     #+NAME: fig:chameleon_plafrim_bora_perf_openmpi
+     #+ATTR_HTML: :align center :width 75%
+     [[file:chameleon_plafrim_bora_perf_openmpi.png]]
+*** sirocco [14-17] (32 CPUs + 2 GPUs V100) nodes
+    - nmpi = *1*
+    - 2D block cyclic parameters : PxQ = 1x1
+    - Number of threads (t) = *29*, one CPU being dedicated for the
+      scheduler and two others for the 2 GPUs
+    - Number of GPUs = *2*
+    - Tile Size (b) = *1600*
+     #+CAPTION: Performances in CPU time of GEMM, POTRF and QR on sirocco nodes
+     #+NAME: fig:chameleon_plafrim_sirocco
+     #+ATTR_HTML: :align center :width 75%
+     [[file:chameleon_plafrim_sirocco.png]]
+ # *** CPU times
+ #     #+CAPTION: Performances in CPU time of GEMM, POTRF and QR on sirocco nodes
+ #     #+NAME: fig:chameleon_plafrim_sirocco_openmpi_time
+ #     #+ATTR_HTML: :align center :height 1024
+ #     [[file:chameleon_plafrim_sirocco_openmpi_time.png]]
+ #
+ # *** GFLOP/s
+ #     #+CAPTION: Performances in GFlop/s of GEMM, POTRF and QR on sirocco nodes
+ #     #+NAME: fig:chameleon_plafrim_sirocco_openmpi_perf
+ #     #+ATTR_HTML: :align center :height 1024
+ #     [[file:chameleon_plafrim_sirocco_openmpi_perf.png]]
diff --git a/doc/orgmode/chapters/potri_async.png b/doc/user/chapters/potri_async.png
similarity index 100%
rename from doc/orgmode/chapters/potri_async.png
rename to doc/user/chapters/potri_async.png
diff --git a/doc/orgmode/chapters/tile_layout.jpg b/doc/user/chapters/tile_layout.jpg
similarity index 100%
rename from doc/orgmode/chapters/tile_layout.jpg
rename to doc/user/chapters/tile_layout.jpg
diff --git a/doc/orgmode/chapters/tile_layout.pdf b/doc/user/chapters/tile_layout.pdf
similarity index 100%
rename from doc/orgmode/chapters/tile_layout.pdf
rename to doc/user/chapters/tile_layout.pdf
diff --git a/doc/orgmode/chapters/tile_lu.jpg b/doc/user/chapters/tile_lu.jpg
similarity index 100%
rename from doc/orgmode/chapters/tile_lu.jpg
rename to doc/user/chapters/tile_lu.jpg
diff --git a/doc/orgmode/chapters/tile_lu.pdf b/doc/user/chapters/tile_lu.pdf
similarity index 100%
rename from doc/orgmode/chapters/tile_lu.pdf
rename to doc/user/chapters/tile_lu.pdf
diff --git a/doc/orgmode/chapters/trace_qr.jpg b/doc/user/chapters/trace_qr.jpg
similarity index 100%
rename from doc/orgmode/chapters/trace_qr.jpg
rename to doc/user/chapters/trace_qr.jpg
diff --git a/doc/orgmode/chapters/trace_qr.pdf b/doc/user/chapters/trace_qr.pdf
similarity index 100%
rename from doc/orgmode/chapters/trace_qr.pdf
rename to doc/user/chapters/trace_qr.pdf
diff --git a/doc/orgmode/chapters/using.org b/doc/user/chapters/using.org
similarity index 73%
rename from doc/orgmode/chapters/using.org
rename to doc/user/chapters/using.org
index 74be7d6be8537a9d3f0cafb28f42c7f32c73d6e1..6e640c2bda2405fb4b8634173df121250ec5635c 100644
--- a/doc/orgmode/chapters/using.org
+++ b/doc/user/chapters/using.org
@@ -1,253 +1,252 @@
 # This file is part of the Chameleon User's Guide.
 # Copyright (C) 2017 Inria
 # See the file ../users_guide.org for copying conditions.
-** Using Chameleon executables
-   Chameleon provides several test executables that are compiled and
-   linked with Chameleon's dependencies.  Instructions about the
-   arguments to give to executables are accessible thanks to the
-   option ~-[-]help~ or ~-[-]h~.  This set of binaries are separated into
-   three categories and can be found in three different directories:
-   * *example*: contains examples of API usage and more specifically the
-     sub-directory ~lapack_to_chameleon/~ provides a tutorial that explains
-     how to use Chameleon functionalities starting from a full LAPACK
-     code, see [[sec:tuto][Tutorial LAPACK to Chameleon]]
-   * *testing*: contains testing drivers to check numerical
-     correctness and assess performance of Chameleon linear algebra
-     routines with a wide range of parameters
-     #+begin_src
-     ./testing/chameleon_stesting -H -o gemm -t 2 -m 2000 -n 2000 -k 2000
-     #+end_src
-     To get the list of parameters, use the ~-h~ or ~--help~ option.
-     #+begin_src
-     ./testing/chameleon_stesting -h
-     #+end_src
-     Available algorithms for testing are:
-     * gels_hqr:  Linear least squares with general matrix using hierarchical reduction trees
-     * ormlq_hqr: Q application with hierarchical reduction trees (LQ)
-     * orglq_hqr: Q generation with hierarchical reduction trees (LQ)
-     * gelqf_hqr: General LQ factorization with hierachical reduction trees
-     * ormqr_hqr: Q application with hierarchical reduction trees (QR)
-     * orgqr_hqr: Q generation with hierarchical reduction trees (QR)
-     * geqrf_hqr: General QR factorization with hierachical reduction trees
-     * gels:      Linear least squares with general matrix
-     * ormlq:     Q application (LQ)
-     * orglq:     Q generation (LQ)
-     * gelqf:     General LQ factorization
-     * ormqr:     Q application (QR)
-     * orgqr:     Q generation (QR)
-     * geqrf:     General QR factorization
-     * gesv:      General linear system solve (LU without pivoting)
-     * getrs:     General triangular solve (LU without pivoting)
-     * getrf:     General factorization (LU without pivoting)
-     * potri:     Symmetric positive definite matrix inversion
-     * lauum:     Trianguilar in-place matrix-matrix computation for Cholesky inversion
-     * trtri:     Triangular matrix inversion
-     * posv:      Symmetric positive definite linear system solve (Cholesky)
-     * potrs:     Symmetric positive definite solve (Cholesky)
-     * potrf:     Symmetric positive definite factorization (Cholesky)
-     * trsm:      Triangular matrix solve
-     * trmm:      Triangular matrix-matrix multiply
-     * syr2k:     Symmetrix matrix-matrix rank 2k update
-     * syrk:      Symmetrix matrix-matrix rank k update
-     * symm:      Symmetric matrix-matrix multiply
-     * gemm:      General matrix-matrix multiply
-     * lascal:    General matrix scaling
-     * tradd:     Triangular matrix-matrix addition
-     * geadd:     General matrix-matrix addition
-     * lantr:     Triangular matrix norm
-     * lansy:     Symmetric matrix norm
-     * lange:     General matrix norm
-     * lacpy:     General matrix copy
-*** Execution trace using EZTrace
-    <<sec:trace_ezt>>
-    [[http://eztrace.gforge.inria.fr/support.html][EZTrace]] can be used by chameleon to generate traces. Two modules
-    are automatically generated as soon as EZTrace is detected on the
-    system. The first one (which is recommended) is the
-    ~chameleon_tcore~ module. It traces all the ~TCORE_...()~ functions
-    that are called by the codelets of all the runtime but PaRSEC. The
-    second one is the ~chameleon_core~ module which traces the lower
-    level ~CORE_...()~ functions. If using PaRSEC, you need to use this
-    module to generate the traces.
-    To generate traces with EZTrace, you need first to compile with
-    *-DBUILD_SHARED_LIBS=ON*. EZTrace is using weak symbols to overload
-    function calls with ld_preload and enable trace generation. Then,
-    either you install the ~libeztrace-*.so~ files into the EZTrace
-    install directory, or you can add the path of the modules to your
-    environement
-    #+begin_src
-    export EZTRACE_LIBRARY_PATH=/path/to/your/modules
-    #+end_src
+*** Using Chameleon executables
+    <<sec:usetesting>>
+    Chameleon provides several test executables that are compiled and
+    linked with Chameleon's dependencies.  Instructions about the
+    arguments to give to executables are accessible thanks to the
+    option ~-[-]help~ or ~-[-]h~.  This set of binaries is separated into
+    three categories and can be found in three different directories:
+    * *example*: contains examples of API usage and more specifically the
+      sub-directory ~lapack_to_chameleon/~ provides a tutorial that explains
+      how to use Chameleon functionalities starting from a full LAPACK
+      code, see [[sec:tuto][Tutorial LAPACK to Chameleon]]
+    * *testing*: contains testing drivers to check numerical
+      correctness and assess performance of Chameleon linear algebra
+      routines with a wide range of parameters
+      #+begin_src
+      ./testing/chameleon_stesting -H -o gemm -t 2 -m 2000 -n 2000 -k 2000
+      #+end_src
+      To get the list of parameters, use the ~-h~ or ~--help~ option.
+      #+begin_src
+      ./testing/chameleon_stesting -h
+      #+end_src
+      Available algorithms for testing are:
+      * gels_hqr:  Linear least squares with general matrix using hierarchical reduction trees
+      * ormlq_hqr: Q application with hierarchical reduction trees (LQ)
+      * orglq_hqr: Q generation with hierarchical reduction trees (LQ)
+      * gelqf_hqr: General LQ factorization with hierarchical reduction trees
+      * ormqr_hqr: Q application with hierarchical reduction trees (QR)
+      * orgqr_hqr: Q generation with hierarchical reduction trees (QR)
+      * geqrf_hqr: General QR factorization with hierarchical reduction trees
+      * gels:      Linear least squares with general matrix
+      * ormlq:     Q application (LQ)
+      * orglq:     Q generation (LQ)
+      * gelqf:     General LQ factorization
+      * ormqr:     Q application (QR)
+      * orgqr:     Q generation (QR)
+      * geqrf:     General QR factorization
+      * gesv:      General linear system solve (LU without pivoting)
+      * getrs:     General triangular solve (LU without pivoting)
+      * getrf:     General factorization (LU without pivoting)
+      * potri:     Symmetric positive definite matrix inversion
+      * lauum:     Triangular in-place matrix-matrix computation for Cholesky inversion
+      * trtri:     Triangular matrix inversion
+      * posv:      Symmetric positive definite linear system solve (Cholesky)
+      * potrs:     Symmetric positive definite solve (Cholesky)
+      * potrf:     Symmetric positive definite factorization (Cholesky)
+      * trsm:      Triangular matrix solve
+      * trmm:      Triangular matrix-matrix multiply
+      * syr2k:     Symmetric matrix-matrix rank 2k update
+      * syrk:      Symmetric matrix-matrix rank k update
+      * symm:      Symmetric matrix-matrix multiply
+      * gemm:      General matrix-matrix multiply
+      * lascal:    General matrix scaling
+      * tradd:     Triangular matrix-matrix addition
+      * geadd:     General matrix-matrix addition
+      * lantr:     Triangular matrix norm
+      * lansy:     Symmetric matrix norm
+      * lange:     General matrix norm
+      * lacpy:     General matrix copy
+**** Execution trace using EZTrace
+     <<sec:trace_ezt>>
+     [[http://eztrace.gforge.inria.fr/support.html][EZTrace]] can be used by chameleon to generate traces. Two modules
+     are automatically generated as soon as EZTrace is detected on the
+     system. The first one (which is recommended) is the
+     ~chameleon_tcore~ module. It traces all the ~TCORE_...()~ functions
+     that are called by the codelets of all the runtimes except PaRSEC. The
+     second one is the ~chameleon_core~ module which traces the lower
+     level ~CORE_...()~ functions. If using PaRSEC, you need to use this
+     module to generate the traces.
+     To generate traces with EZTrace, you need first to compile with
+     *-DBUILD_SHARED_LIBS=ON*. EZTrace is using weak symbols to overload
+     function calls with ld_preload and enable trace generation. Then,
+     either you install the ~libeztrace-*.so~ files into the EZTrace
+     install directory, or you can add the path of the modules to your
+     environment
+     #+begin_src
+     export EZTRACE_LIBRARY_PATH=/path/to/your/modules
+     #+end_src
+     To check if the modules are available you should have
+     #+begin_src
+     $ eztrace_avail
+     1	omp	Module for OpenMP parallel regions
+     2	pthread	Module for PThread synchronization functions (mutex, semaphore, spinlock, etc.)
+     3	stdio	Module for stdio functions (read, write, select, poll, etc.)
+     4	mpi	Module for MPI functions
+     5	memory	Module for memory functions (malloc, free, etc.)
+     6	papi	Module for PAPI Performance counters
+     128	chameleon_core	Module for Chameleon CORE functions
+     129	chameleon_tcore	Module for Chameleon TCORE functions
+     #+end_src
+     Then, you can restrict the modules used during the execution
+     #+begin_src
+     export EZTRACE_TRACE="mpi chameleon_tcore"
+     #+end_src
+     _The module ~mpi~ is required if you want to run in distributed._
+     The setup can be checked with ~eztrace_loaded~
+     #+begin_src
+     $ eztrace_loaded
+     4	mpi	Module for MPI functions
+     129	chameleon_tcore	Module for Chameleon TCORE functions
+     #+end_src
+     To generate the traces, you need to run your binary through
+     eztrace:
+     #+begin_src
+     eztrace ./chameleon_dtesting -o gemm -n 1000 -b 200
+     mpirun -np 4 eztrace ./chameleon_dtesting -o gemm -n 1000 -b 200 -P 2
+     #+end_src
+     Convert the binary files into a ~.trace~ file, and visualize it.
+     #+begin_src
+     eztrace_convert <username>_eztrace_log_rank_<[0-9]*>
+     vite eztrace_output.trace
+     #+end_src
+     For more information on EZTrace, you can follow the [[http://eztrace.gforge.inria.fr/support.html][support page]].
+**** Execution trace using StarPU/FxT
+     <<sec:trace_fxt>>
+     StarPU can generate its own trace log files by compiling it with
+     the ~--with-fxt~ option at the configure step (you may have to
+     specify the directory where you installed FxT by giving
+     ~--with-fxt=...~ instead of ~--with-fxt~ alone).  By doing so, traces
+     are generated after each execution of a program which uses StarPU
+     in the directory pointed by the [[http://starpu.gforge.inria.fr/doc/html/ExecutionConfigurationThroughEnvironmentVariables.html][STARPU_FXT_PREFIX]] environment
+     variable.
+     #+begin_example
+     export STARPU_FXT_PREFIX=/home/jdoe/fxt_files/
+     #+end_example
+     When executing a ~./testing/...~ Chameleon program, if it has been
+     enabled (StarPU compiled with FxT), the program will generate
+     trace files in the directory $STARPU_FXT_PREFIX.
+     Finally, to generate the trace file which can be opened with [[http://vite.gforge.inria.fr/][Vite]]
+     program, you can use the *starpu_fxt_tool* executable of StarPU.
+     This tool should be in the bin directory of StarPU's installation.
+     You can use it to generate the trace file like this:
+     #+begin_src
+     path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename
+     #+end_src
+     There is one file per MPI process (prof_filename_0,
+     prof_filename_1 ...).  To generate a trace of MPI programs you can
+     call it like this:
+     #+begin_src
+     path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename*
+     #+end_src
+     The trace file will be named paje.trace (use -o option to specify
+     an output name).  Alternatively, for non-MPI execution (only one
+     process and profiling file), you can set the environment
+     variable *STARPU_GENERATE_TRACE=1* to automatically generate the
+     paje trace file.
+**** Use simulation mode with StarPU-SimGrid
+     <<sec:simu>>
+     Simulation mode can be activated by setting the cmake option
+     CHAMELEON_SIMULATION to ON.  This mode allows you to simulate
+     execution of algorithms with StarPU compiled with [[http://simgrid.gforge.inria.fr/][SimGrid]].  To do
+     so, we provide some perfmodels in the simucore/perfmodels/
+     directory of Chameleon sources.  To use these perfmodels, please
+     set your *STARPU_HOME* environment variable to
+     ~path/to/your/chameleon_sources/simucore/perfmodels~.  Finally, you
+     need to set your *STARPU_HOSTNAME* environment variable to the name
+     of the machine to simulate.  Note that only *potrf* algorithm,
+     simple and double precisions, on /mirage/ and /sirocco/ machines, with
+     or without gpus, and for the following tile sizes (choose a size *N*
+     multiple of the tile size) are available for now
+     - /mirage/: 320, 960
+     - /sirocco/: 80, 440, 960, 1440, 1920
+     Database of models is subject to change.
-    To check if the modules are available you should have
-    #+begin_src
-    $ eztrace_avail
-    1	omp	Module for OpenMP parallel regions
-    2	pthread	Module for PThread synchronization functions (mutex, semaphore, spinlock, etc.)
-    3	stdio	Module for stdio functions (read, write, select, poll, etc.)
-    4	mpi	Module for MPI functions
-    5	memory	Module for memory functions (malloc, free, etc.)
-    6	papi	Module for PAPI Performance counters
-    128	chameleon_core	Module for Chameleon CORE functions
-    129	chameleon_tcore	Module for Chameleon TCORE functions
-    #+end_src
+     #+begin_example
+     export STARPU_HOME=/tmp/chameleon/simucore/perfmodels/
+     export STARPU_HOSTNAME=sirocco
+     ./testing/chameleon_dtesting -o potrf -t 22 -g 2 -n 14400 -b 1440
+     Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;time;gflops
+     0;dpotrf;22;2;1;1;0;1440;121;14400;14400;846930886;7.814116e-01;1.273889e+03
+     #+end_example
-    Then, you can restrict the modules used during the execution
-    #+begin_src
-    export EZTRACE_TRACE="mpi chameleon_tcore"
-    #+end_src
+**** Use out of core support with StarPU
+     <<sec:ooc>>
-    _The module ~mpi~ is required if you want to run in distributed._
+     If the matrix cannot fit in the main memory, StarPU can automatically evict
+     tiles to the disk. The following variables need to be set:
+     * *STARPU_DISK_SWAP* environment variable to a place where to store
+     evicted tiles, for example: ~STARPU_DISK_SWAP=/tmp~
+     * *STARPU_DISK_SWAP_BACKEND* environment variable to the I/O method,
+     for example: ~STARPU_DISK_SWAP_BACKEND=unistd_o_direct~
+     * *STARPU_LIMIT_CPU_MEM* environment variable to the amount of memory
+     that can be used in MBytes, for example: ~STARPU_LIMIT_CPU_MEM=1000~
-    The setup can be checked with ~eztrace_loaded~
-    #+begin_src
-    $ eztrace_loaded
-    4	mpi	Module for MPI functions
-    129	chameleon_tcore	Module for Chameleon TCORE functions
-    #+end_src
-    To generate the traces, you need to run your binary through
-    eztrace:
-    #+begin_src
-    eztrace ./chameleon_dtesting -o gemm -n 1000 -b 200
-    mpirun -np 4 eztrace ./chameleon_dtesting -o gemm -n 1000 -b 200 -P 2
-    #+end_src
+*** Tutorial LAPACK to Chameleon
+    <<sec:tuto>>
-    Convert the binary files into a ~.trace~ file, and visualize it.
+    Chameleon provides routines to solve dense general systems of
+    linear equations, symmetric positive definite systems of linear
+    equations and linear least squares problems, using LU, Cholesky, QR
+    and LQ factorizations.  Real arithmetic and complex arithmetic are
+    supported in both single precision and double precision.  Routines
+    that compute linear algebra are of the following form:
-    eztrace_convert <username>_eztrace_log_rank_<[0-9]*>
-    vite eztrace_output.trace
+    CHAMELEON_name[_Tile[_Async]]
-    For more information on EZTrace, you can follow the [[http://eztrace.gforge.inria.fr/support.html][support page]].
-*** Execution trace using StarPU/FxT
-    <<sec:trace_fxt>>
-    StarPU can generate its own trace log files by compiling it with
-    the ~--with-fxt~ option at the configure step (you can have to
-    specify the directory where you installed FxT by giving
-    ~--with-fxt=...~ instead of ~--with-fxt~ alone).  By doing so, traces
-    are generated after each execution of a program which uses StarPU
-    in the directory pointed by the [[http://starpu.gforge.inria.fr/doc/html/ExecutionConfigurationThroughEnvironmentVariables.html][STARPU_FXT_PREFIX]] environment
-    variable.
-    #+begin_example
-    export STARPU_FXT_PREFIX=/home/jdoe/fxt_files/
-    #+end_example
-    When executing a ~./testing/...~ Chameleon program, if it has been
-    enabled (StarPU compiled with FxT), the program will generate
-    trace files in the directory $STARPU_FXT_PREFIX.
-    Finally, to generate the trace file which can be opened with [[http://vite.gforge.inria.fr/][Vite]]
-    program, you can use the *starpu_fxt_tool* executable of StarPU.
-    This tool should be in the bin directory of StarPU's installation.
-    You can use it to generate the trace file like this:
+    * all user routines are prefixed with *CHAMELEON*
+    * in the pattern *CHAMELEON_name[_Tile[_Async]]*, /name/ follows the
+      BLAS/LAPACK naming scheme for algorithms (/e.g./ sgemm for general
+      matrix-matrix multiply in single precision)
+    * Chameleon provides three interface levels
+      * *CHAMELEON_name*: simplest interface, very close to CBLAS and
+        LAPACKE, matrices are given following the LAPACK data layout
+        (1-D array column-major).  It involves copy of data from LAPACK
+        layout to tile layout and conversely (to update LAPACK data),
+        see [[sec:tuto_step1][Step1]].
+      * *CHAMELEON_name_Tile*: the tile interface avoids copies between LAPACK
+        and tile layouts. It is the standard interface of Chameleon and
+        it should achieve better performance than the previous
+        simplest interface. The data are given through a specific
+        structure called a descriptor, see [[sec:tuto_step2][Step2]].
+      * *CHAMELEON_name_Tile_Async*: similar to the tile interface, it avoids
+        synchronization barrier normally called between *Tile* routines.
+        At the end of an *Async* function, completion of tasks is not
+        guaranteed and data are not necessarily up-to-date.  To ensure
+        that tasks have been all executed, a synchronization function
+        has to be called after the sequence of *Async* functions, see
+        [[sec:tuto_step4][Step4]].
+    CHAMELEON routine calls have to be preceded by
-    path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename
-    There is one file per mpi processus (prof_filename_0,
-    prof_filename_1 ...).  To generate a trace of mpi programs you can
-    call it like this:
+    to initialize CHAMELEON and the runtime system and followed by
-    path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename*
+    CHAMELEON_Finalize();
-    The trace file will be named paje.trace (use -o option to specify
-    an output name).  Alternatively, for non mpi execution (only one
-    processus and profiling file), you can set the environment
-    variable *STARPU_GENERATE_TRACE=1* to automatically generate the
-    paje trace file.
-*** Use simulation mode with StarPU-SimGrid
-    <<sec:simu>>
-    Simulation mode can be activated by setting the cmake option
-    CHAMELEON_SIMULATION to ON.  This mode allows you to simulate
-    execution of algorithms with StarPU compiled with [[http://simgrid.gforge.inria.fr/][SimGrid]].  To do
-    so, we provide some perfmodels in the simucore/perfmodels/
-    directory of Chameleon sources.  To use these perfmodels, please
-    set your *STARPU_HOME* environment variable to
-    ~path/to/your/chameleon_sources/simucore/perfmodels~.  Finally, you
-    need to set your *STARPU_HOSTNAME* environment variable to the name
-    of the machine to simulate.  Note that only *potrf* algorithm,
-    simple and double precisions, on /mirage/ and /sirocco/ machines, with
-    or without gpus, and for the following tile sizes (choose a size *N*
-    multiple of the tile size) are available for now
-    - /mirage/: 320, 960
-    - /sirocco/: 80, 440, 960, 1440, 1920
-    Database of models is subject to change.
-    #+begin_example
-    export STARPU_HOME=/tmp/chameleon/simucore/perfmodels/
-    export STARPU_HOSTNAME=sirocco
-    ./testing/chameleon_dtesting -o potrf -t 22 -g 2 -n 14400 -b 1440
-    Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;time;gflops
-    0;dpotrf;22;2;1;1;0;1440;121;14400;14400;846930886;7.814116e-01;1.273889e+03
-    #+end_example
-*** Use out of core support with StarPU
-    <<sec:ooc>>
-    If the matrix can not fit in the main memory, StarPU can automatically evict
-    tiles to the disk. The following variables need to be set:
-    * *STARPU_DISK_SWAP* environment variable to a place where to store
-    evicted tiles, for example: ~STARPU_DISK_SWAP=/tmp~
-    * *STARPU_DISK_SWAP_BACKEND* environment variable to the I/O method,
-    for example: ~STARPU_DISK_SWAP_BACKEND=unistd_o_direct~
-    * *STARPU_LIMIT_CPU_MEM* environment variable to the amount of memory
-    that can be used in MBytes, for example: ~STARPU_LIMIT_CPU_MEM=1000~
-** Chameleon API
-   Chameleon provides routines to solve dense general systems of
-   linear equations, symmetric positive definite systems of linear
-   equations and linear least squares problems, using LU, Cholesky, QR
-   and LQ factorizations.  Real arithmetic and complex arithmetic are
-   supported in both single precision and double precision.  Routines
-   that compute linear algebra are of the following form:
-   #+begin_src
-   CHAMELEON_name[_Tile[_Async]]
-   #+end_src
-   * all user routines are prefixed with *CHAMELEON*
-   * in the pattern *CHAMELEON_name[_Tile[_Async]]*, /name/ follows the
-     BLAS/LAPACK naming scheme for algorithms (/e.g./ sgemm for general
-     matrix-matrix multiply simple precision)
-   * Chameleon provides three interface levels
-     * *CHAMELEON_name*: simplest interface, very close to CBLAS and
-       LAPACKE, matrices are given following the LAPACK data layout
-       (1-D array column-major).  It involves copy of data from LAPACK
-       layout to tile layout and conversely (to update LAPACK data),
-       see [[sec:tuto_step1][Step1]].
-     * *CHAMELEON_name_Tile*: the tile interface avoid copies between LAPACK
-       and tile layouts. It is the standard interface of Chameleon and
-       it should achieved better performance than the previous
-       simplest interface. The data are given through a specific
-       structure called a descriptor, see [[sec:tuto_step2][Step2]].
-     * *CHAMELEON_name_Tile_Async*: similar to the tile interface, it avoids
-       synchonization barrier normally called between *Tile* routines.
-       At the end of an *Async* function, completion of tasks is not
-       guaranteed and data are not necessarily up-to-date.  To ensure
-       that tasks have been all executed, a synchronization function
-       has to be called after the sequence of *Async* functions, see
-       [[sec:tuto_step4][Step4]].
-   CHAMELEON routine calls have to be preceded from
-   #+begin_src
-   #+end_src
-   to initialize CHAMELEON and the runtime system and followed by
-   #+begin_src
-   CHAMELEON_Finalize();
-   #+end_src
-   to free some data and finalize the runtime and/or MPI.
-*** Tutorial LAPACK to Chameleon
-    <<sec:tuto>>
+    to free some data and finalize the runtime and/or MPI.
     This tutorial is dedicated to the API usage of Chameleon.  The
     idea is to start from a simple code and step by step explain how
diff --git a/doc/user/homepage.org b/doc/user/homepage.org
new file mode 100644
index 0000000000000000000000000000000000000000..4604ba0f1440cdb66cc0772e8d35b749cb44b758
--- /dev/null
+++ b/doc/user/homepage.org
@@ -0,0 +1,578 @@
+#+TITLE: Chameleon
+#+SUBTITLE: A dense linear algebra software for heterogeneous architectures
+#+EXPORT_FILE_NAME: index.html
+#+LANGUAGE:  en
+#+OPTIONS: H:3 num:t \n:nil @:t ::t |:t _:nil ^:nil -:t f:t *:t <:t
+#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil pri:nil tags:not-in-toc html-style:nil
+# #+HTML_LINK_HOME: https://solverstack.gitlabpages.inria.fr/chameleon/
+#+SETUPFILE: https://mfelsoci.gitlabpages.inria.fr/inria-org-html-themes/online-theme-readtheorginria.setup
+* Overview
+  <<sec:overview>>
+  Chameleon is a framework written in C which provides routines to
+  solve dense general systems of linear equations, symmetric positive
+  definite systems of linear equations and linear least squares
+  problems, using LU, Cholesky, QR and LQ factorizations. Real
+  arithmetic and complex arithmetic are supported in both single
+  precision and double precision. It supports Linux and Mac OS/X
+  machines (mainly tested on Intel x86-64 and IBM Power
+  architectures).  Chameleon is based on the PLASMA source code but
+  is not limited to shared-memory environment and can exploit
+  multiple GPUs. Chameleon is interfaced in a generic way with
+  StarPU, PaRSEC, QUARK, OpenMP runtime systems. This feature allows
+  to analyze in a unified framework how sequential task-based
+  algorithms behave regarding different runtime systems
+  implementations. Using Chameleon with StarPU or PaRSEC runtime
+  systems allows to exploit GPUs through kernels provided by cuBLAS
+  and clusters of interconnected nodes with distributed memory (using
+  MPI).
+  Main features:
+  * Written in C, Fortran interface, CMake build system
+  * Algorithms: GEMM, POTRF, GETRF, GEQRF, GESVD, ...
+  * Matrix forms: general, symmetric, triangular
+  * Precisions: single, double, complex, double complex
+  #+ATTR_HTML: :alt DAG of Chameleon image :title DAG of Chameleon :align center :width 600px
+  [[./chameleon.svg]]
+* News
+  <<sec:news>>
+#+INCLUDE: "news.org"
+* Download
+  <<sec:download>>
+  Depending on how much you need to tune the library installation we
+  propose several solutions.
+  1) You just want to have a try, to see if it can be installed well
+     on your system, what are the performances on simple cases, run
+     the examples, or simply use the latest stable version: we recommend
+     to use one of our packages, .deb ones for those who work on a
+     Linux Debian or Ubuntu distribution, Guix or Spack on other Linux
+     systems, Brew on macOS.
+  2) You want to use it but you need a change somewhere in the stack
+     like considering another version (git branch), change the default
+     BLAS/LAPACK or MPI, use your favorite compiler, modify some
+     sources: you may try with Guix or Spack because these package
+     managers allow to build from sources and thus many options can be
+     changed dynamically (command line), or directly build from source
+     with the native build system of the library (Makefile, GNU
+     autotools, CMake) following the procedures described in the
+     installation guide of the library, cf. [[sec:quickstartguide]].
+  3) You need a tool for reproducibility of your experiments: Guix is
+     recommended.
+  #+ATTR_HTML: :alt Table of download materials image :title Table of download materials :align center
+  | Git       | Release source | Debian/Ubuntu | [[https://brew.sh/][Brew]] (Mac) | [[https://guix.gnu.org/en/][Guix]] (Linux) | [[https://spack.readthedocs.io/en/latest/][Spack]] (Linux/Mac) |
+  |-----------+----------------+---------------+------------+--------------+-------------------|
+  | [[https://gitlab.inria.fr/solverstack/chameleon][Chameleon]] | [[https://gitlab.inria.fr/solverstack/chameleon/uploads/b299d6037d7636c6be16108c89bc2aab/chameleon-1.1.0.tar.gz][1.1.0]]          | [[https://gitlab.inria.fr/solverstack/chameleon/-/packages][packages]]      | [[https://gitlab.inria.fr/solverstack/brew-repo][brew-repo]]  | [[https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free][guix-repo]]    | [[https://gitlab.inria.fr/solverstack/spack-repo][spack-repo]]        |
+  Some packages are part of the official distribution and we just
+  provide the package name. For others we provide links where you can
+  find either a file to install or a package recipe that can be used
+  with Brew, Guix, Spack. If there is no package available for your
+  distribution please [[sec:contact][contact us]] and we will try to find a solution.
+  All these packages have been successfully installed and tested on
+  Unix systems: *Linux* (Debian testing, Ubuntu 20.04 LTS) and *macOS*
+  (Catalina).
+* Quick start guide
+  <<sec:quickstartguide>>
+  Here is a quick start guide for using Chameleon.  For more
+  information please refer to the [[sec:documentation][full documentation]].
+** Install
+   In the following we present quick examples of installation of the
+   [[sec:download][packages]].
+*** Release source installation with CMake
+**** Linux Ubuntu 20.04
+     Start by installing common development tools
+     #+begin_src sh :eval never-export
+     sudo apt-get update -y
+     sudo apt-get install -y git cmake build-essential gfortran python wget tar curl pkg-config
+     #+end_src
+     - CBLAS/LAPACKE is required (OpenBLAS, Intel MKL, BLIS/FLAME, IBM
+       ESSL + Reference LAPACK for cblas/lapacke interface)
+     - we recommend to install StarPU as runtime system with MPI
+       enabled and optionally CUDA/cuBLAS if enabled on your system
+     #+begin_src sh :eval never-export
+     sudo apt-get install -y libopenblas-dev liblapacke-dev libhwloc-dev libopenmpi-dev libstarpu-dev
+     #+end_src
+     Remarks:
+     - The pair ~libopenblas-dev liblapacke-dev~ can be replaced by
+       ~libmkl-dev~.
+     - One can also use lib blis and flame but be sure to install a
+       cblas and lapacke, from the Reference LAPACK (with CBLAS=ON,
+       LAPACKE=ON), linked to blis/flame.
+     - The lib essl (IBM) can also be used as BLAS/LAPACK with the
+       Reference LAPACK providing cblas/lapacke.
+     Then to install Chameleon from sources with CMake, proceed as follows
+     #+begin_src sh :eval never-export
+     wget https://gitlab.inria.fr/solverstack/chameleon/uploads/b299d6037d7636c6be16108c89bc2aab/chameleon-1.1.0.tar.gz
+     tar xvf chameleon-1.1.0.tar.gz
+     cd chameleon-1.1.0
+     # or clone from git repository
+     # git clone --recursive https://gitlab.inria.fr/solverstack/chameleon.git && cd chameleon
+     mkdir build && cd build
+     cmake .. -DCMAKE_INSTALL_PREFIX=$PWD/install
+     make -j5 install
+     #+end_src
+     See more examples in the [[sec:ug:cmake][user's guide]].
+**** macOS
+     Start by installing common development tools
+     #+begin_src sh :eval never-export
+     # install Homebrew if not already available
+     /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+     # install compiler and tools
+     brew install htop tmux gcc automake autoconf libtool doxygen emacs zlib bzip2 bison hwloc pkgconfig openblas openmpi
+     # gcc and g++ are missing (avoid using the clang version in /usr/bin)
+     ln -sf /usr/local/bin/gcc-11 /usr/local/bin/gcc
+     ln -sf /usr/local/bin/g++-11 /usr/local/bin/g++
+     # use pkg-config .pc files to detect some dependencies
+     export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:/usr/local/opt/openblas/lib/pkgconfig
+     # cmake checks blas.pc not openblas.pc
+     sudo cp /usr/local/opt/openblas/lib/pkgconfig/openblas.pc /usr/local/opt/openblas/lib/pkgconfig/blas.pc
+     #+end_src
+     - CBLAS/LAPACKE is required (OpenBLAS or Intel MKL)
+     - we recommend installing StarPU as the runtime system, with MPI
+       enabled
+     #+begin_src sh :eval never-export
+     # install last starpu release
+     cd $HOME
+     wget https://files.inria.fr/starpu/starpu-1.3.8/starpu-1.3.8.tar.gz
+     tar xvf starpu-1.3.8.tar.gz
+     cd starpu-1.3.8
+     ./configure
+     make -j5
+     sudo make install
+     #+end_src
+     Then to install Chameleon from sources with CMake, proceed as follows
+     #+begin_src sh :eval never-export
+     wget https://gitlab.inria.fr/solverstack/chameleon/uploads/b299d6037d7636c6be16108c89bc2aab/chameleon-1.1.0.tar.gz
+     tar xvf chameleon-1.1.0.tar.gz
+     cd chameleon-1.1.0
+     # or clone from git repository
+     # git clone --recursive https://gitlab.inria.fr/solverstack/chameleon.git && cd chameleon
+     mkdir build && cd build
+     # configure the build tree first: make needs the Makefiles generated by CMake
+     cmake .. -DCMAKE_INSTALL_PREFIX=$PWD/install
+     make -j5 install
+     #+end_src
+*** Debian/Ubuntu packages
+    Visit the [[https://gitlab.inria.fr/solverstack/chameleon/-/packages][packages]] page and download the appropriate file for your
+    system. Then in a shell terminal
+    #+begin_src sh :eval never-export
+    # to install one of our unofficial packages, for example on Ubuntu 20.04
+    curl https://gitlab.inria.fr/solverstack/chameleon/-/package_files/15259/download -o chameleon_1.1.0-1_amd64.deb
+    sudo apt-get install -y ./chameleon_1.1.0-1_amd64.deb
+    #+end_src
+    If there is no package available for your Debian/Ubuntu
+    distribution, please [[sec:contact][contact us]] and we will try to provide one.
+*** Brew packages
+    Brew packages for macOS are stored in our [[https://gitlab.inria.fr/solverstack/brew-repo][brew-repo]] git
+    repository. Please refer to the [[https://gitlab.inria.fr/solverstack/brew-repo/-/blob/master/README.org][README]] for installation
+    instructions.
+    Examples:
+    #+begin_src sh :eval never-export
+    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
+    git clone https://gitlab.inria.fr/solverstack/brew-repo.git
+    brew install --build-from-source ./brew-repo/starpu.rb
+    brew install --build-from-source ./brew-repo/chameleon.rb
+    #+end_src
+*** Guix packages
+    [[https://guix.gnu.org/][Guix]] requires a running GNU/Linux system, GNU tar and Xz.
+    #+begin_src sh :eval never-export
+    gpg --keyserver pgp.mit.edu --recv-keys 3CE464558A84FDC69DB40CFB090B11993D9AEBB5
+    wget https://git.savannah.gnu.org/cgit/guix.git/plain/etc/guix-install.sh
+    chmod +x guix-install.sh
+    sudo ./guix-install.sh
+    #+end_src
+    Guix packages are stored in our [[https://gitlab.inria.fr/guix-hpc/guix-hpc][guix-hpc]], and [[https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free][guix-hpc-non-free]]
+    (for versions with Intel MKL and/or CUDA) git repositories. Please
+    refer to the README to see how to add our package to the list of
+    Guix available packages (/i.e./ add a channel).
+    The package definitions in this [[https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free][repo]] extend those that come with
+    Guix.  To make them visible to the guix command-line tools, create
+    the ~~/.config/guix/channels.scm~ file with the following snippet to
+    request the guix-hpc channel:
+    #+begin_src scheme :eval never-export
+    (cons (channel
+            (name 'guix-hpc-non-free)
+            (url "https://gitlab.inria.fr/guix-hpc/guix-hpc-non-free.git"))
+          %default-channels)
+    #+end_src
+    That way, ~guix pull~ will systematically pull not only Guix, but
+    also Guix-HPC-non-free and Guix-HPC.
+    #+begin_src sh :eval never-export
+    guix pull
+    #+end_src
+    Then, to install the latest Chameleon release:
+    #+begin_src sh :eval never-export
+    guix install chameleon
+    #+end_src
+    See more examples in the [[sec:ug:guix][user's guide]].
+*** Spack packages
+    Spack packages for Linux or macOS are stored in our [[https://gitlab.inria.fr/solverstack/spack-repo][spack-repo]] git
+    repository. Please refer to the [[https://gitlab.inria.fr/solverstack/spack-repo/-/blob/master/README.org][README]] for installation
+    instructions.
+    Examples:
+    #+begin_src sh :eval never-export
+    # please read https://spack.readthedocs.io/en/latest/getting_started.html
+    git clone https://github.com/llnl/spack.git
+    cd spack
+    git checkout v0.16.2
+    . share/spack/setup-env.sh
+    # Currently spack provides openmpi v3 as default openmpi. Add the
+    # following in your etc/spack/defaults/packages.yaml in order to set
+    # openmpi 4.0.5 to be the default:
+    # openmpi:
+    #   version: [4.0.5]
+    git clone https://gitlab.inria.fr/solverstack/spack-repo.git
+    spack repo add spack-repo
+    spack install chameleon
+    # chameleon is installed here:
+    spack location -i chameleon
+    #+end_src
+    Spack exposes many build variants, so it is difficult to ensure
+    that every installation will succeed.
+    See more examples in the [[sec:ug:spack][user's guide]].
+** Linking
+   If you do not use CMake we provide a pkg-config file at
+   installation in the subdirectory ~lib/pkgconfig~.
+   #+begin_src sh :eval never-export
+   # lets CHAMELEON_ROOT be the installation path
+   # make chameleon.pc visible to pkg-config when installed in a non-standard prefix
+   export PKG_CONFIG_PATH=$CHAMELEON_ROOT/lib/pkgconfig:$PKG_CONFIG_PATH
+   pkg-config --cflags chameleon
+   pkg-config --libs chameleon
+   pkg-config --libs --static chameleon
+   #+end_src
+   If you build your project with CMake we provide a
+   ~CHAMELEONConfig.cmake~ file at installation, in the subdirectory
+   ~lib/cmake/~ of the installation. Example of ~CMakeLists.txt~ for
+   Chameleon
+   #+begin_src cmake :eval never-export
+   # must come first: sets policy defaults; CMP0074 (<PackageName>_ROOT) exists since CMake 3.12
+   cmake_minimum_required(VERSION 3.12)
+   project(CHAMELEON_EXAMPLE C Fortran)
+   # to be able to use CHAMELEON_ROOT env. var.
+   cmake_policy(SET CMP0074 NEW)
+   # look for CHAMELEON on the system
+   # Hint: use CHAMELEON_ROOT (env. var. or cmake var.) to the installation directory of
+   # CHAMELEON if not installed in a standard path
+   find_package(CHAMELEON REQUIRED)
+   # compile your example
+   add_executable(chameleon_example chameleon_example.c)
+   # link to chameleon through the imported target (carries include paths and dependencies)
+   target_link_libraries(chameleon_example PRIVATE CHAMELEON::chameleon)
+   #+end_src
+** Using
+   Considering that the ~bin/~ directory of the Chameleon installation
+   is in the ~PATH~, the testing executables can be used to check main
+   linear algebra operations such as 'gemm', 'potrf', 'getrf',
+   'geqrf', 'gels', etc
+   #+begin_src sh :eval never-export
+   chameleon_stesting -H -o gemm -t 2 -m 2000 -n 2000 -k 2000
+   #+end_src
+   See the options with
+   #+begin_src sh :eval never-export
+   chameleon_stesting -h
+   #+end_src
+   See the available linear algebra operations with
+   #+begin_src sh :eval never-export
+   chameleon_stesting -o help
+   #+end_src
+   Remarks:
+   - If using a multithreaded OpenBLAS, make sure to set
+     ~OPENBLAS_NUM_THREADS=1~ because Chameleon handles multithreading
+     directly
+   - same for Intel MKL, make sure to set ~MKL_NUM_THREADS=1~
+   Here is an example of linear system solving written in C through a
+   Cholesky factorization on a SPD matrix with *LAPACK format*
+   #+begin_src C :eval never-export
+   /*
+    * Solve A X = B with a Cholesky factorization, the matrices being given
+    * in LAPACK format (contiguous column-major arrays).
+    */
+   #include <chameleon.h>
+   #include <stdlib.h>
+   #include <string.h> /* memcpy */
+   int main(void)
+   {
+       size_t N; // matrix order
+       size_t NRHS; // number of RHS vectors
+       int NCPU; // number of cores to use
+       int NGPU; // number of gpus (cuda devices) to use
+       int UPLO = ChamUpper; // where is stored L
+       int major, minor, patch;
+       CHAMELEON_Version(&major, &minor, &patch);
+       /* Linear system parameters */
+       N    = 1000;
+       NRHS = 10;
+       /* Initialize the number of CPUs to be used with threads */
+       NCPU = 2;
+       NGPU = 0;
+       /* Initialize CHAMELEON with main parameters */
+       CHAMELEON_Init( NCPU, NGPU );
+       /*
+        * allocate memory for our data
+        *     - matrix A                   : size N x N
+        *     - set of RHS vectors B       : size N x NRHS
+        *     - set of solutions vectors X : size N x NRHS
+        */
+       double *A    = malloc( N * N    * sizeof(double) );
+       double *B    = malloc( N * NRHS * sizeof(double) );
+       double *X    = malloc( N * NRHS * sizeof(double) );
+       /* generate A matrix with random values such that it is spd */
+       CHAMELEON_dplgsy( (double)N, ChamUpperLower, N, A, N, 51 );
+       /* generate RHS */
+       CHAMELEON_dplrnt( N, NRHS, B, N, 5673 );
+       /* copy B in X before solving (the solve overwrites its RHS argument) */
+       memcpy(X, B, N*NRHS*sizeof(double));
+       /************************************************************/
+       /* solve the system AX = B using the Cholesky factorization */
+       /************************************************************/
+       /* Cholesky factorization:
+        * A is replaced by its factorization L or L^T depending on uplo */
+       CHAMELEON_dpotrf( UPLO, N, A, N );
+       /* Solve:
+        * B is stored in X on entry, X contains the result on exit.
+        * Forward and back substitutions
+        */
+       CHAMELEON_dpotrs(UPLO, N, NRHS, A, N, X, N);
+       /* deallocate data */
+       free(A);
+       free(B);
+       free(X);
+       /* Finalize CHAMELEON */
+       CHAMELEON_Finalize();
+       return EXIT_SUCCESS;
+   }
+   #+end_src
+   In this example the LAPACK matrix is internally converted into
+   Chameleon /tiled/ matrix format then task-based algorithms can be
+   called. The copy operation can be costly. Please consider learning
+   how to work directly with the Chameleon /tiled/ matrix format to get
+   faster executions and the ability to handle distributed matrices
+   over several machines. The user's data can be given in several ways
+   to fill the Chameleon /tiled/ matrix, see [[sec:tuto]].
+   Here is a simple example of linear system solving written in C through
+   a Cholesky factorization on a SPD matrix with *Chameleon format*
+   #+begin_src C :eval never-export
+   /*
+    * Solve A X = B with a Cholesky factorization, the matrices being stored
+    * directly in the Chameleon tiled format (descriptors).
+    */
+   #include <chameleon.h>
+   #include <stdlib.h>
+   int main(void)
+   {
+       size_t N; // matrix order
+       size_t NRHS; // number of RHS vectors
+       int NB; // tile size (number of rows/columns in a tile)
+       int NCPU; // number of cores to use
+       int NGPU; // number of gpus (cuda devices) to use
+       int UPLO = ChamUpper; // where is stored L
+       /* descriptors necessary for calling CHAMELEON tile interface  */
+       CHAM_desc_t *descA = NULL, *descB = NULL, *descX = NULL;
+       int major, minor, patch;
+       CHAMELEON_Version(&major, &minor, &patch);
+       /* Linear system parameters */
+       N    = 1000;
+       NRHS = 10;
+       /* Initialize the number of CPUs to be used with threads */
+       NCPU = 2;
+       NGPU = 0;
+       /* Initialize CHAMELEON with main parameters */
+       CHAMELEON_Init( NCPU, NGPU );
+       /* Ask CHAMELEON for the tile size in use; it is needed by the descriptors below */
+       CHAMELEON_Get( CHAMELEON_TILE_SIZE, &NB );
+       /*
+        * Initialize the structure required for CHAMELEON tile interface
+        * CHAM_desc_t is a structure wrapping your data allowing CHAMELEON to get
+        * pointers to tiles. A tile is a data subset of your matrix on which we
+        * apply some optimized CPU/GPU kernels.
+        * Notice that this routine suppose your matrix is a contiguous vector of
+        * data (1D array), as a data you would give to BLAS/LAPACK.
+        * Main arguments:
+        *     - descA is a pointer to a descriptor, you need to give the address
+        *     of this pointer
+        *     - if you want to give your allocated matrix give its address,
+        *     if not give a NULL pointer, the routine will allocate the memory
+        *     and you access the matrix data with descA->mat
+        *     - give the data type (ChamByte, ChamInteger, ChamRealFloat,
+        *     ChamRealDouble, ChamComplexFloat, ChamComplexDouble)
+        *     - number of rows in a block (tile)
+        *     - number of columns in a block (tile)
+        *     - number of elements in a block (tile)
+        * The other parameters are specific, use:
+        * CHAMELEON_Desc_Create( ... , 0, 0, number of rows, number of columns, 1, 1);
+        * Have a look to the documentation for details about these parameters.
+        */
+       CHAMELEON_Desc_Create(&descA,  NULL, ChamRealDouble,
+                         NB, NB,  NB*NB, N, N, 0, 0, N, N, 1, 1);
+       CHAMELEON_Desc_Create(&descB,  NULL, ChamRealDouble,
+                         NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+       CHAMELEON_Desc_Create(&descX,  NULL, ChamRealDouble,
+                         NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+       /* generate A matrix with random values such that it is spd */
+       CHAMELEON_dplgsy_Tile( (double)N, ChamUpperLower, descA, 51 );
+       /* generate RHS */
+       CHAMELEON_dplrnt_Tile( descB, 5673 );
+       /* copy B in X before solving */
+       CHAMELEON_dlacpy_Tile(ChamUpperLower, descB, descX);
+       /************************************************************/
+       /* solve the system AX = B using the Cholesky factorization */
+       /************************************************************/
+       /* Cholesky factorization:
+        * A is replaced by its factorization L or L^T depending on uplo */
+       CHAMELEON_dpotrf_Tile( UPLO, descA );
+       /* Solve:
+        * B is stored in X on entry, X contains the result on exit.
+        * Forward and back substitutions
+        */
+       CHAMELEON_dpotrs_Tile( UPLO, descA, descX );
+       /* deallocate data */
+       CHAMELEON_Desc_Destroy( &descA );
+       CHAMELEON_Desc_Destroy( &descB );
+       CHAMELEON_Desc_Destroy( &descX );
+       /* Finalize CHAMELEON */
+       CHAMELEON_Finalize();
+       return EXIT_SUCCESS;
+   }
+   #+end_src
+* Documentation
+  <<sec:documentation>>
+#+INCLUDE: "users_guide.org"
+* Tutorials
+  <<sec:tutorials>>
+  * [[sec:usetesting][Using Chameleon executables]]
+  * [[sec:tuto][Tutorial LAPACK to Chameleon]]
+  * [[https://fpruvost.gitlabpages.inria.fr/exa2pro/][Exa2pro: Chameleon usage on a supercomputer]]
+* Contact
+  <<sec:contact>>
+  If you have an account on [[https://gitlab.inria.fr/][gitlab inria]] please submit a [[https://gitlab.inria.fr/solverstack/chameleon/-/issues][new issue]].
+  If you don't have an account on [[https://gitlab.inria.fr/solverstack/chameleon/-/issues][gitlab inria]] you can send emails to
+  [[mailto:chameleon-issues@inria.fr][chameleon-issues@inria.fr]].
+  To get the news, register to the mailing list
+  [[https://sympa.inria.fr/sympa/info/chameleon-announce][chameleon-announce@inria.fr]] (click on "S'abonner" on the left
+  panel).
+* Get involved!
+* Authors
+ First, since the Chameleon library started as an extension of the
+ PLASMA library to support multiple runtime systems, all developers
+ of the PLASMA library are developers of the Chameleon library.
+ The following people contributed to the development of Chameleon:
+ * Emmanuel Agullo, PI
+ * Olivier Aumage
+ * Cedric Castagnede
+ * Terry Cojean
+ * Mathieu Faverge, PI
+ * Nathalie Furmento
+ * Reazul Hoque
+ * Hatem Ltaief
+ * Gregoire Pichon
+ * Florent Pruvost, PI
+ * Marc Sergent
+ * Guillaume Sylvand
+ * Samuel Thibault
+ * Stanimire Tomov
+ * Omar Zenati
+ If we forgot your name, please let us know so that we can fix that mistake.
+* Citing Chameleon
+Feel free to use the following publications to reference Chameleon:
+  * Original paper that initiated Chameleon and the principles:
+    - Agullo, Emmanuel and Augonnet, Cédric and Dongarra, Jack and
+      Ltaief, Hatem and Namyst, Raymond and Thibault, Samuel and Tomov,
+      Stanimire, *Faster, Cheaper, Better -- a Hybridization Methodology
+      to Develop Linear Algebra Software for GPUs*, /GPU Computing Gems/,
+      [[https://hal.inria.fr/inria-00547847][First Online: 17 December 2010]].
+  * Design of the QR algorithms:
+    - Agullo, Emmanuel and Augonnet, Cédric and Dongarra, Jack and
+      Faverge, Mathieu and Ltaief, Hatem and Thibault, Samuel and
+      Tomov, Stanimire, *QR Factorization on a Multicore Node Enhanced
+      with Multiple GPU Accelerators*, /25th IEEE International Parallel
+      & Distributed Processing Symposium/, [[https://hal.inria.fr/inria-00547614][First Online: 16 December
+      2010]].
+  * Design of the LU algorithms:
+    - Agullo, Emmanuel and Augonnet, Cédric and Dongarra, Jack and
+      Faverge, Mathieu and Langou, Julien and Ltaief, Hatem and Tomov,
+      Stanimire, *LU Factorization for Accelerator-based Systems*,
+      /9th ACS/IEEE International Conference on Computer Systems and
+      Applications (AICCSA 11)/, [[https://hal.inria.fr/hal-00654193][First Online: 21 December 2011]].
+  * Regarding distributed memory:
+    - Agullo, Emmanuel and Aumage, Olivier and Faverge, Mathieu and
+      Furmento, Nathalie and Pruvost, Florent and Sergent, Marc and
+      Thibault, Samuel, *Achieving High Performance on Supercomputers
+      with a Sequential Task-based Programming Model*, /Research Report/,
+      [[https://hal.inria.fr/hal-01332774][First Online: 16 June 2016]].
+* Licence
+  [[https://gitlab.inria.fr/solverstack/chameleon/-/blob/master/LICENCE.txt][LICENCE]]
diff --git a/doc/user/news.org b/doc/user/news.org
new file mode 100644
index 0000000000000000000000000000000000000000..488f4af81b1ac75014db45fcc45b3e54cbc41e7d
--- /dev/null
+++ b/doc/user/news.org
@@ -0,0 +1,6 @@
+** [2021-04-21] *Release 1.1.0* available
+   * See [[https://gitlab.inria.fr/solverstack/chameleon/-/releases/v1.1.0][changes]]
+   * [[https://gitlab.inria.fr/solverstack/chameleon/uploads/b299d6037d7636c6be16108c89bc2aab/chameleon-1.1.0.tar.gz][Download source tarball]]
+** [2021-04-06] Chameleon integration into FMR
+   Chameleon has been successfully integrated into the C++ Randomized
+   SVD library [[https://gitlab.inria.fr/compose/oldstack/fmr][FMR]].
diff --git a/doc/user/publish.el b/doc/user/publish.el
new file mode 100644
index 0000000000000000000000000000000000000000..7a2325217e6f73b15790b74a0cf8521e72d0f750
--- /dev/null
+++ b/doc/user/publish.el
@@ -0,0 +1,29 @@
+;; publish.el
+;; Emacs publish file for project.
+;; Run the following command to execute:
+;; emacs --batch --no-init-file --load publish.el --funcall org-publish-all
+;; Packages:
+(require 'package)
+(require 'ox-publish)
+(require 'org)
+(require 'htmlize)
+;; Emit CSS classes rather than inline styles for fontified source blocks.
+(setq org-html-htmlize-output-type 'css)
+(setq org-src-fontify-natively t)
+;; Enable the Babel languages used by the source blocks of the documents.
+(org-babel-do-load-languages
+ 'org-babel-load-languages
+ '(
+   (C . t)
+   (fortran . t)
+   (python . t)
+   (shell . t)
+  ))
+;; Single project: export every .org file of the current directory to HTML in place.
+(setq org-publish-project-alist
+      `(("homepage"
+         :base-directory "."
+         :base-extension "org"
+         :publishing-directory "."
+         :publishing-function org-html-publish-to-html
+         )))
diff --git a/doc/user/users_guide.org.in b/doc/user/users_guide.org.in
new file mode 100644
index 0000000000000000000000000000000000000000..106be47a5d30f8b5dbcea05d44ab830de9fa85b9
--- /dev/null
+++ b/doc/user/users_guide.org.in
@@ -0,0 +1,55 @@
+#+INCLUDE: "version.org"
+ This is the users guide to Chameleon.  The software ecosystem will be
+ presented, the installation instructions detailed and some usage
+ examples are presented.  To get more information about the
+ application programming interface, please refer to the [[https://solverstack.gitlabpages.inria.fr/chameleon/dev/index.html][doxygen
+ documentation]].
+** Version
+   This manual documents the usage of Chameleon *version {{{VERSION}}}*.
+   It was last updated on {{{UPDATED}}}.
+** Authors
+   * Inria
+   * University of Tennessee
+   * University of Colorado Denver
+   * King Abdullah University of Science and Technology
+** Copying
+   * Copyright \copy {{{UPDATED-YEAR}}} Inria
+   * Copyright \copy 2014 The University of Tennessee
+   * Copyright \copy 2014 King Abdullah University of Science and Technology
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+   - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+   - Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer listed
+     in this license in the documentation and/or other materials provided
+     with the distribution.
+   - Neither the name of the copyright holders nor the names of its
+     contributors may be used to endorse or promote products derived from
+     this software without specific prior written permission.
+   This software is provided by the copyright holders and contributors
+   "as is" and any express or implied warranties, including, but not
+   limited to, the implied warranties of merchantability and fitness for
+   a particular purpose are disclaimed.  In no event shall the copyright
+   owner or contributors be liable for any direct, indirect, incidental,
+   special, exemplary, or consequential damages (including, but not
+   limited to, procurement of substitute goods or services; loss of use,
+   data, or profits; or business interruption) however caused and on any
+   theory of liability, whether in contract, strict liability, or tort
+   (including negligence or otherwise) arising in any way out of the use
+   of this software, even if advised of the possibility of such damage.
+** Introduction to Chameleon
+ #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/introduction.org
+** Installing Chameleon
+   <<sec:ug:install>>
+ #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/installing.org
+** Using Chameleon
+   <<sec:ug:using>>
+ #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org
diff --git a/doc/orgmode/version.org.in b/doc/user/version.org.in
similarity index 100%
rename from doc/orgmode/version.org.in
rename to doc/user/version.org.in
diff --git a/tools/pages.sh b/tools/pages.sh
index f7be64c651dc28f187569243170dfabfafb8cd97..251e472a33ba9c53c0ede3d3bbf2402851d405a7 100755
--- a/tools/pages.sh
+++ b/tools/pages.sh
@@ -27,8 +27,8 @@ Rscript ${CHAMELEON_SRC_DIR}/tools/bench/jube/GenFigures.R
 # add the performance chapter to the doc. we need the performances files to add
 # this chapter that are not commited in the sources so that this chapter is not
 # here by default
-cat >> ${CHAMELEON_SRC_DIR}/doc/orgmode/users_guide.org.in <<EOF
-* Chameleon Performances on PlaFRIM
+cat >> ${CHAMELEON_SRC_DIR}/doc/user/users_guide.org.in <<EOF
+** Chameleon Performances on PlaFRIM
 Chameleon commit: *$commit_sha*.
 #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/performances.org
@@ -43,19 +43,15 @@ cd build-$VERSION
 make doc -j5
+## Copy files in public/ used as an artefact (zip archive) to upload on gitlab pages, see
+## Homepage: https://solverstack.gitlabpages.inria.fr/chameleon/index.html
+## API: https://solverstack.gitlabpages.inria.fr/chameleon/dev/index.html
 cd ..
-mv build-$VERSION/doc/orgmode          public/
-mv build-$VERSION/doc/doxygen/out/html public/doxygen
-mv tmp_fig/*.png public/
-cd public/
-if [ -f users_guide.html ]
-    ln -sfn users_guide.html index.html
-    echo -e "ERROR: missing users_guide.html file"
-    exit 1
-# Change the width of the page in the CSS file
-sed -i -e "s#max-width:800px#max-width:1800px#" org-html-themes/styles/readtheorg/css/readtheorg.css
+# create public/ and public/dev/ in one step; -p keeps the script re-runnable
+mkdir -p public/dev/
+# homepage and users guide (HTML pages and their images)
+cp build-$VERSION/doc/user/*.html public/
+cp build-$VERSION/doc/user/*.png public/
+cp build-$VERSION/doc/user/*.jpg public/
+cp build-$VERSION/doc/user/*.svg public/
+# doxygen API documentation
+cp -r build-$VERSION/doc/dev/html/* public/dev/
+# performance figures generated earlier in this script
+cp tmp_fig/* public/