diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 12a5bd615bf0ad7c3cf64ee91ea72f4d3c3f2611..7204f50084f5652a5926741da54246841f2d834d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -3,7 +3,7 @@
 # @copyright (c) 2009-2014 The University of Tennessee and The University
 #                          of Tennessee Research Foundation.
 #                          All rights reserved.
-# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2017 Inria. All rights reserved.
 # @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
 #
 ###
@@ -22,6 +22,7 @@
 # @author Cedric Castagnede
 # @author Emmanuel Agullo
 # @author Mathieu Faverge
+# @author Florent Pruvost
 # @date 13-07-2012
 #
 ###
@@ -34,8 +35,8 @@ cmake_minimum_required(VERSION 2.8)
 #
 ###############################################
 add_subdirectory(doxygen)
-add_subdirectory(texinfo)
-
+add_subdirectory(orgmode)
+#add_subdirectory(texinfo)
 ###
 ### END CMakeLists.txt
 ###
diff --git a/doc/orgmode/CMakeLists.txt b/doc/orgmode/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..124cbe2e20da88677854d44d090dd7c0287e444a
--- /dev/null
+++ b/doc/orgmode/CMakeLists.txt
@@ -0,0 +1,103 @@
+###
+#
+# @copyright (c) 2017 Inria. All rights reserved.
+#
+###
+#
+# @file CMakeLists.txt
+#
+# @project MORSE
+# MORSE is a software package provided by:
+#     Inria Bordeaux - Sud-Ouest,
+#     Univ. of Tennessee,
+#     King Abdullah University of Science and Technology
+#     Univ. of California Berkeley,
+#     Univ. of Colorado Denver.
+#
+# @version 1.0.0
+# @author Florent Pruvost
+# @date 25-08-2017
+#
+###
+
+cmake_minimum_required(VERSION 2.8)
+
+# Create the files version.org and users_guide.org
+# ------------------------------------------------
+configure_file("version.org.in"
+               "version.org"
+               @ONLY)
+configure_file("users_guide.org.in"
+               "users_guide.org"
+               @ONLY)
+
+set(FIGURES
+    tile_lu.pdf
+    tile_lu.jpg
+    tile_layout.pdf
+    tile_layout.jpg
+    trace_qr.pdf
+    trace_qr.jpg
+    potri_async.png
+    morse_header.png
+    )
+set(FIGURES_HTML
+    tile_lu.jpg
+    tile_layout.jpg
+    trace_qr.jpg
+    potri_async.png
+    morse_header.png
+    )
+
+foreach(_fig ${FIGURES})
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/figures/${_fig}
+                 ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+                 COPYONLY)
+endforeach()
+
+# Looking for emacs
+# -----------------
+find_program(EMACS_COMPILER emacs)
+
+if(EMACS_COMPILER)
+  # Add targets
+  # -----------
+  add_custom_command(OUTPUT users_guide.html
+                     COMMAND ${EMACS_COMPILER}
+                     ARGS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                          --batch
+                          -f
+                          org-html-export-to-html
+                          --kill
+                     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                     )
+  add_custom_command(OUTPUT users_guide.pdf
+                     COMMAND ${EMACS_COMPILER}
+                     ARGS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                          --batch
+                          -f
+                          org-latex-export-to-pdf
+                          --kill
+                     DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/users_guide.org
+                     )
+  add_custom_target(doc-html-users_guide ALL DEPENDS users_guide.html)
+  add_custom_target(doc-pdf-users_guide ALL DEPENDS users_guide.pdf)
+
+  # Installation
+  # ------------
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.html
+          DESTINATION share/chameleon/html)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.pdf
+          DESTINATION share/chameleon/pdf)
+  foreach(_fig ${FIGURES_HTML})
+    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_fig}
+            DESTINATION share/chameleon/html)
+  endforeach()
+
+else(EMACS_COMPILER)
+  message(STATUS "Looking for emacs - not found")
+endif(EMACS_COMPILER)
+
+###
+### END CMakeLists.txt
+###
diff --git a/doc/orgmode/chapters/configuration.org b/doc/orgmode/chapters/configuration.org
new file mode 100644
index 0000000000000000000000000000000000000000..8d963735bb19d672fd8ff7cb7649c03730b15205
--- /dev/null
+++ b/doc/orgmode/chapters/configuration.org
@@ -0,0 +1,366 @@
+@c -*-texinfo-*-
+
+@c This file is part of the MORSE Handbook.
+@c Copyright (C) 2017 Inria
+@c Copyright (C) 2014 The University of Tennessee
+@c Copyright (C) 2014 King Abdullah University of Science and Technology
+@c See the file ../chameleon.texi for copying conditions.
+
+@menu
+* Compilation configuration::
+* Dependencies detection::
+@c * Dependencies compilation::
+* Use FxT profiling through StarPU::
+* Use simulation mode with StarPU-SimGrid::
+* Use out of core support with StarPU::
+@end menu
+
+@node Compilation configuration
+@section Compilation configuration
+
+The following arguments can be given to the @command{cmake <path to source
+directory>} command.
+
+In this chapter, the following convention is used:
+@itemize @bullet
+@item
+@option{path} is a path in your filesystem,
+@item
+@option{var} is a string and the correct value or an example will be given,
+@item
+@option{trigger} is a CMake option and the correct value is @code{ON} or
+@code{OFF}.
+@end itemize
+
+Using CMake, there are several ways to give options:
+@enumerate
+@item directly as CMake command line arguments
+@item invoke @command{cmake <path to source directory>} once and then use
+@command{ccmake <path to source directory>} to edit options through a
+minimalist GUI (requires @samp{cmake-curses-gui} to be installed on a
+Linux system)
+@item invoke the @command{cmake-gui} command and fill in the information
+about the location of the sources and where to build the project; you then
+have access to options through a user-friendly Qt interface (requires
+@samp{cmake-qt-gui} to be installed on a Linux system)
+@end enumerate
+
+Example of configuration using the command line:
+@example
+cmake ~/chameleon/ -DCMAKE_BUILD_TYPE=Debug \
+                   -DCMAKE_INSTALL_PREFIX=~/install \
+                   -DCHAMELEON_USE_CUDA=ON \
+                   -DCHAMELEON_USE_MPI=ON \
+                   -DBLA_VENDOR=Intel10_64lp \
+                   -DSTARPU_DIR=~/install/starpu-1.1 \
+                   -DCHAMELEON_ENABLE_TRACING=ON
+@end example
+
+You can get the full list of options with the @option{-L[A][H]} option of
+the @command{cmake} command:
+@example
+cmake -LH <path to source directory>
+@end example
+
+@menu
+* General CMake options::
+* CHAMELEON options::
+@end menu
+
+@node General CMake options
+@subsection General CMake options
+
+@table @code
+
+@item -DCMAKE_INSTALL_PREFIX=@option{path} (default: @option{path=/usr/local})
+Install directory used by @code{make install}, where some headers and
+libraries will be copied.
+Write permissions on @option{path} are required during the
+@code{make install} step.
+
+@item -DCMAKE_BUILD_TYPE=@option{var} (default: @option{Release})
+Defines the build type and the compiler optimization level.
+The possible values for @option{var} are:
+@table @code
+@item empty
+@item Debug
+@item Release
+@item RelWithDebInfo
+@item MinSizeRel
+@end table
+
+@item -DBUILD_SHARED_LIBS=@option{trigger} (default: @option{OFF})
+Indicates whether CMake builds CHAMELEON as static (@option{OFF}) or
+shared (@option{ON}) libraries.
+
+@end table
+
+@node CHAMELEON options
+@subsection CHAMELEON options
+
+List of CHAMELEON options that can be enabled/disabled (value=@code{ON}
+or @code{OFF}):
+@table @code
+
+@item @option{-DCHAMELEON_SCHED_STARPU}=@option{trigger} (default: @code{ON})
+to link with the StarPU library (runtime system)
+
+@item @option{-DCHAMELEON_SCHED_QUARK}=@option{trigger} (default: @code{OFF})
+to link with the QUARK library (runtime system)
+
+@item @option{-DCHAMELEON_USE_CUDA}=@option{trigger} (default: @code{OFF})
+to link with the CUDA runtime (implementation paradigm for accelerated codes
+on GPUs) and the cuBLAS library (optimized BLAS kernels on GPUs); can only
+be used with StarPU
+
+@item @option{-DCHAMELEON_USE_MPI}=@option{trigger} (default: @code{OFF})
+to link with the MPI library (message passing implementation for the use of
+multiple nodes with distributed memory); can only be used with StarPU
+
+@item @option{-DCHAMELEON_ENABLE_TRACING}=@option{trigger} (default: @code{OFF})
+to enable trace generation during the execution of timing drivers.
+It requires StarPU to be linked with the FxT library (traces execution of
+kernels on workers).
+
+@item @option{-DCHAMELEON_SIMULATION}=@option{trigger} (default: @code{OFF})
+to enable simulation mode, meaning that CHAMELEON will not actually execute
+tasks; see details in section @ref{Use simulation mode with StarPU-SimGrid}.
+This option must be used with StarPU compiled with
+@uref{http://simgrid.gforge.inria.fr/, SimGrid}, which makes it possible to
+predict execution times on any architecture.
+This feature should be used to experiment with scheduler behavior and
+performance, not to produce solutions of linear systems.
+
+@item @option{-DCHAMELEON_ENABLE_DOCS}=@option{trigger} (default: @code{ON})
+to control the build of the documentation contained in the @file{docs/}
+sub-directory
+@item @option{-DCHAMELEON_ENABLE_EXAMPLE}=@option{trigger} (default: @code{ON})
+to control the build of the example executables (API usage)
+contained in the @file{example/} sub-directory
+@item @option{-DCHAMELEON_ENABLE_TESTING}=@option{trigger} (default: @code{ON})
+to control the build of the testing executables (numerical checks) contained
+in the @file{testing/} sub-directory
+@item @option{-DCHAMELEON_ENABLE_TIMING}=@option{trigger} (default: @code{ON})
+to control the build of the timing executables (performance checks)
+contained in the @file{timing/} sub-directory
+
+@item @option{-DCHAMELEON_PREC_S}=@option{trigger} (default: @code{ON})
+to enable the support of single arithmetic precision (float in C)
+@item @option{-DCHAMELEON_PREC_D}=@option{trigger} (default: @code{ON})
+to enable the support of double arithmetic precision (double in C)
+@item @option{-DCHAMELEON_PREC_C}=@option{trigger} (default: @code{ON})
+to enable the support of complex arithmetic precision (complex in C)
+@item @option{-DCHAMELEON_PREC_Z}=@option{trigger} (default: @code{ON})
+to enable the support of double complex arithmetic precision (double complex
+in C)
+
+@item @option{-DBLAS_VERBOSE}=@option{trigger} (default: @code{OFF})
+to make the BLAS library discovery verbose
+@item @option{-DLAPACK_VERBOSE}=@option{trigger} (default: @code{OFF})
+to make the LAPACK library discovery verbose (automatically enabled if
+@option{BLAS_VERBOSE=@code{ON}})
+@end table
+
+List of CHAMELEON options that need a specific value:
+@table @code
+@item @option{-DBLA_VENDOR=@option{var}} (default: @option{empty})
+The possible values for @option{var} are:
+@table @code
+@item empty
+@item all
+@item Intel10_64lp
+@item Intel10_64lp_seq
+@item ACML
+@item Apple
+@item Generic
+@item ...
+@end table
+to force CMake to find a specific BLAS library; see the full list of
+BLA_VENDOR values in @file{FindBLAS.cmake} in @file{cmake_modules/morse/find}.
+By default @option{BLA_VENDOR} is empty, so that CMake tries to detect all
+possible BLAS vendors, with a preference for Intel MKL.
+@end table
+
+List of CHAMELEON options that require a path:
+@table @code
+@item @option{-DLIBNAME_DIR=@option{path}} (default: empty)
+root directory of the LIBNAME library installation
+@item @option{-DLIBNAME_INCDIR=@option{path}} (default: empty)
+directory of the LIBNAME library headers installation
+@item @option{-DLIBNAME_LIBDIR=@option{path}} (default: empty)
+directory of the LIBNAME libraries (.so, .a, .dylib, etc.) installation
+@end table
+LIBNAME can be one of the following: BLAS - CBLAS - FXT - HWLOC -
+LAPACK - LAPACKE - QUARK - STARPU - TMG.
+See the paragraph about @ref{Dependencies detection} for details.
+
+Libraries detected with an official CMake module (see the module files in
+@file{CMAKE_ROOT/Modules/}):
+@itemize @bullet
+@item CUDA
+@item MPI
+@item Threads
+@end itemize
+
+Libraries detected with CHAMELEON CMake modules (see the module files in
+the @file{cmake_modules/morse/find/} directory of the CHAMELEON sources):
+@itemize @bullet
+@item BLAS
+@item CBLAS
+@item FXT
+@item HWLOC
+@item LAPACK
+@item LAPACKE
+@item QUARK
+@item STARPU
+@item TMG
+@end itemize
+
+
+@node Dependencies detection
+@section Dependencies detection
+You have different choices to detect dependencies on your system, either by
+setting some environment variables containing paths to the libraries and
+headers or by specifying them directly at CMake configure time.
+Different cases:
+@enumerate
+@item detection of dependencies through environment variables:
+  @itemize @bullet
+  @item the @env{LD_LIBRARY_PATH} environment variable should contain the
+list of paths where the libraries can be found:
+  @example
+  export @env{LD_LIBRARY_PATH}=$@env{LD_LIBRARY_PATH}:path/to/your/libs
+  @end example
+  @item the @env{INCLUDE} environment variable should contain the list of
+paths where the header files of the libraries can be found
+  @example
+  export @env{INCLUDE}=$@env{INCLUDE}:path/to/your/headers
+  @end example
+  @end itemize
+
+@item detection with user-given paths:
+  @itemize @bullet
+  @item you can specify the path at CMake configure time by invoking
+  @example
+  cmake <path to SOURCE_DIR> -DLIBNAME_DIR=path/to/your/lib
+  @end example
+  where LIBNAME stands for the name of the library to look for, for example
+  @example
+  cmake <path to SOURCE_DIR> -DSTARPU_DIR=path/to/starpudir \
+                             -DCBLAS_DIR= ...
+  @end example
+  @item it is also possible to specify header and library directories
+separately, for example
+  @example
+  cmake <path to SOURCE_DIR> \
+        -DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \
+        -DSTARPU_LIBDIR=path/to/libstarpu/lib
+  @end example
+  @item note that BLAS and LAPACK detection can be tedious, so we provide a
+verbose mode. Use @option{-DBLAS_VERBOSE=ON} or @option{-DLAPACK_VERBOSE=ON}
+to enable it.
+  @end itemize
+
+@end enumerate
+
+
+@c @node Dependencies compilation
+@c @section Dependencies compilation
+
+@node Use FxT profiling through StarPU
+@section Use FxT profiling through StarPU
+
+StarPU can generate its own trace log files by compiling it with the
+@option{--with-fxt}
+option at the configure step (you may have to specify the directory where
+you installed FxT by giving @option{--with-fxt=...} instead of
+@option{--with-fxt} alone).
+By doing so, traces are generated after each execution of a program which
+uses StarPU, in the directory pointed to by the @env{STARPU_FXT_PREFIX}
+environment variable. Example:
+@example
+export @env{STARPU_FXT_PREFIX}=/home/yourname/fxt_files/
+@end example
+
+When executing a @command{./timing/...} CHAMELEON program, if tracing has
+been enabled (StarPU compiled with FxT and
+@option{-DCHAMELEON_ENABLE_TRACING=ON}), you can give the option
+@option{--trace} to tell the program to generate trace log files.
+
+Finally, to generate the trace file which can be opened with the
+@uref{http://vite.gforge.inria.fr/, ViTE} program, you have to use the
+@command{starpu_fxt_tool} executable of StarPU.
+This tool should be in @file{path/to/your/install/starpu/bin}.
+You can use it to generate the trace file like this:
+@itemize @bullet
+@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename}
+
+There is one file per MPI process (prof_filename_0, prof_filename_1, ...).
+To generate a trace of MPI programs you can call it like this:
+@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i
+prof_filename*}
+
+The trace file will be named paje.trace (use the -o option to specify an
+output name).
+@end itemize
+
+Alternatively, one can also generate .paje trace files directly after the
+execution by setting @env{STARPU_GENERATE_TRACE=1}.
+
+@node Use simulation mode with StarPU-SimGrid
+@section Use simulation mode with StarPU-SimGrid
+
+Simulation mode can be enabled by setting the CMake option
+@option{-DCHAMELEON_SIMULATION=ON}.
+This mode allows you to simulate the execution of algorithms with StarPU
+compiled with @uref{http://simgrid.gforge.inria.fr/, SimGrid}.
+To do so, we provide some perfmodels in the @file{simucore/perfmodels/}
+directory of the CHAMELEON sources.
+To use these perfmodels, please set the following:
+@itemize @bullet
+@item the @env{STARPU_HOME} environment variable to:
+  @example
+  @code{<path to SOURCE_DIR>/simucore/perfmodels}
+  @end example
+@item the @env{STARPU_HOSTNAME} environment variable to the name of the
+machine to simulate. For example, on our platform (PlaFRIM) with GPUs at
+Inria Bordeaux:
+  @example
+  @env{STARPU_HOSTNAME}=mirage
+  @end example
+Note that only POTRF kernels with block sizes of 320 or 960 (single and
+double precision) on the mirage machine are available for now.
+The database of models is subject to change; it should be enriched in the
+near future.
+@end itemize
+
+@node Use out of core support with StarPU
+@section Use out of core support with StarPU
+
+If the matrix cannot fit in the main memory, StarPU can automatically evict
+tiles to the disk. The descriptors for the matrices which cannot fit in the
+main memory need to be created with @code{MORSE_Desc_Create_OOC}, so that
+MORSE does not force StarPU to keep them in the main memory.
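+
+As an illustration, here is a minimal sketch of creating such a descriptor.
+The values are hypothetical, and the argument list is assumed to be
+analogous to the in-core @code{MORSE_Desc_Create}, minus the user-allocated
+buffer:
+@example
+/* Assumption: same arguments as MORSE_Desc_Create without the
+   user data pointer, since evicted tiles live on disk. */
+MORSE_desc_t *descA = NULL;
+int NB = 320;      /* tile size (example value)    */
+int N  = 100000;   /* matrix order (example value) */
+MORSE_Desc_Create_OOC(&descA, MorseRealDouble,
+                      NB, NB, NB*NB, /* tile rows/columns/elements */
+                      N, N,          /* leading dimensions         */
+                      0, 0,          /* row and column offsets     */
+                      N, N,          /* submatrix size             */
+                      1, 1);         /* p x q process grid         */
+@end example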
+
+The following variables then need to be set:
+@itemize @bullet
+@item the @env{STARPU_DISK_SWAP} environment variable to a place where
+evicted tiles will be stored, for example:
+  @example
+  @env{STARPU_DISK_SWAP}=/tmp
+  @end example
+@item the @env{STARPU_DISK_SWAP_BACKEND} environment variable to the I/O
+method, for example:
+  @example
+  @env{STARPU_DISK_SWAP_BACKEND}=unistd_o_direct
+  @end example
+@item the @env{STARPU_LIMIT_CPU_MEM} environment variable to the maximum
+amount of main memory that can be used, in MBytes, for example:
+  @example
+  @env{STARPU_LIMIT_CPU_MEM}=1000
+  @end example
+@end itemize
diff --git a/doc/orgmode/chapters/installing.org b/doc/orgmode/chapters/installing.org
new file mode 100644
index 0000000000000000000000000000000000000000..745e6863a9f604a49f6497547135d276d076927d
--- /dev/null
+++ b/doc/orgmode/chapters/installing.org
@@ -0,0 +1,485 @@
+# This file is part of the Chameleon User's Guide.
+# Copyright (C) 2017 Inria
+# See the file ../users_guide.org for copying conditions.
+
+Chameleon is written in C and provides an interface callable from
+Fortran. It depends on a couple of external libraries that must be
+installed on the system.
+
+Chameleon can be built and installed by the standard means of CMake
+(http://www.cmake.org/). General information about CMake, as well as
+installation binaries and CMake source code, are available from
+http://www.cmake.org/cmake/resources/software.html.
+
+For help installing a full distribution (Chameleon + dependencies),
+we encourage users to use the morse branch of *Spack*.
+
+
+** Getting Chameleon
+   The latest official release tarballs of Chameleon sources are
+   available for download from
+   https://gitlab.inria.fr/solverstack/chameleon/tags.
+
+   The latest development snapshot is available on GitLab at
+   https://gitlab.inria.fr/solverstack/chameleon.
+
+** Chameleon prerequisites
+   To install Chameleon's libraries, header files, and executables, one
+   needs:
+   - CMake (version 2.8 minimum): the build system
+   - C and Fortran compilers: the GNU compiler suite, Clang, Intel or
+     IBM compilers can be used
+   - Python: to generate files in the different precisions
+   - external libraries: this depends on the configuration; by default
+     the required libraries are:
+     - StarPU: http://runtime.bordeaux.inria.fr/StarPU/
+     - CBLAS, LAPACKE: these are interfaces and there are several
+       providers that can be used with Chameleon
+       - Intel MKL, Netlib, OpenBLAS
+     - BLAS, LAPACK, TMGLIB: there are several providers that can be
+       used with Chameleon
+       - Eigen, Intel MKL, Netlib, OpenBLAS
+     - pthread (libpthread)
+     - math (libm)
+
+   Optional libraries:
+   - QUARK: http://icl.cs.utk.edu/quark/
+   - CUDA: https://developer.nvidia.com/cuda-downloads
+   - cuBLAS: comes with CUDA, http://docs.nvidia.com/cuda/cublas/
+   - MPI: OpenMPI, http://www.open-mpi.org/
+
+   These packages must be installed on the system before trying to
+   configure/build Chameleon. Please look at the distrib/ directory,
+   which gives some hints for the installation of dependencies for Unix
+   systems.
+
+   We give here some examples for a Debian system:
+   #+begin_src sh
+
+   # Update Debian packages list
+   sudo apt-get update
+   # Install Netlib BLAS, LAPACK, TMGLIB, CBLAS and LAPACKE suite
+   sudo apt-get install -y liblapack-dev liblapacke-dev
+   # Alternatively to Netlib, OpenBLAS can be used (faster kernels)
+   sudo apt-get install -y libopenblas-dev liblapacke-dev
+   # Install OpenMPI
+   sudo apt-get install -y libopenmpi-dev
+   # Install hwloc (used by StarPU or QUARK, already a dependency of OpenMPI)
+   sudo apt-get install -y libhwloc-dev
+   # Install FxT, useful to export some nice execution traces with StarPU
+   sudo apt-get install -y libfxt-dev
+   # Install CUDA and cuBLAS: only if you have a CUDA-compatible GPU
+   sudo apt-get install -y nvidia-cuda-toolkit nvidia-cuda-dev
+
+   # Install StarPU (with MPI and FxT enabled)
+   mkdir -p $HOME/install
+   cd $HOME/install
+   wget http://starpu.gforge.inria.fr/files/starpu-1.2.2/starpu-1.2.2.tar.gz
+   tar xvzf starpu-1.2.2.tar.gz
+   cd starpu-1.2.2/
+   ./configure --prefix=$HOME/install/starpu --disable-opencl --disable-cuda --with-fxt=/usr/lib/x86_64-linux-gnu/
+   make
+   make install
+   cd $HOME/install
+   rm starpu-1.2.2/ starpu-1.2.2.tar.gz -rf
+
+   # Install QUARK: to be used in place of StarPU
+   mkdir -p $HOME/install
+   cd $HOME/install
+   wget http://icl.cs.utk.edu/projectsfiles/quark/pubs/quark-0.9.0.tgz
+   tar xvzf quark-0.9.0.tgz
+   cd quark-0.9.0/
+   sed -i -e "s#prefix=\.\/install#prefix=$HOME/install/quark#g" make.inc
+   sed -i -e "s#CFLAGS=-O2#CFLAGS=-O2 -fPIC#g" make.inc
+   make
+   make install
+   cd $HOME/install
+   rm quark-0.9.0/ quark-0.9.0.tgz -rf
+
+   #+end_src
+
+*** Some details about dependencies
+**** BLAS implementation
+     [[http://www.netlib.org/blas/][BLAS]] (Basic Linear Algebra Subprograms) is a de facto standard
+     for basic linear algebra operations such as vector and matrix
+     multiplication. A FORTRAN implementation of BLAS is available
+     from Netlib, and a C implementation of BLAS is included in GSL
+     (GNU Scientific Library). Both of these are reference
+     implementations of BLAS; they are not optimized for modern
+     processor architectures and provide an order of magnitude lower
+     performance than optimized implementations. Highly optimized
+     implementations of BLAS are available from many hardware vendors,
+     such as Intel MKL, IBM ESSL and AMD ACML. Fast implementations
+     are also available as academic packages, such as ATLAS and
+     OpenBLAS. The standard interface to BLAS is the FORTRAN
+     interface.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference BLAS from NETLIB, OpenBLAS and Intel MKL.
+**** CBLAS
+     [[http://www.netlib.org/blas/#_cblas][CBLAS]] is a C language interface to BLAS. Most commercial and
+     academic implementations of BLAS also provide CBLAS. Netlib
+     provides a reference implementation of CBLAS on top of FORTRAN
+     BLAS (Netlib CBLAS). Since GSL is implemented in C, it naturally
+     provides CBLAS.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference CBLAS from NETLIB, OpenBLAS and Intel MKL.
+
+**** LAPACK implementation
+     [[http://www.netlib.org/lapack/][LAPACK]] (Linear Algebra PACKage) is a software library for
+     numerical linear algebra, a successor of LINPACK and EISPACK and
+     a predecessor of Chameleon. LAPACK provides routines for solving
+     linear systems of equations, linear least squares problems,
+     eigenvalue problems and singular value problems. Most commercial
+     and academic BLAS packages also provide some LAPACK routines.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference LAPACK from NETLIB, OpenBLAS and Intel MKL.
+
+**** LAPACKE
+     [[http://www.netlib.org/lapack/][LAPACKE]] is a C language interface to LAPACK (or CLAPACK). It is
+     produced by Intel in coordination with the LAPACK team and is
+     available in source code from Netlib in its original version
+     (Netlib LAPACKE) and from the Chameleon website in an extended
+     version (LAPACKE for Chameleon). In addition to implementing the
+     C interface, LAPACKE also provides routines which automatically
+     handle workspace allocation, making the use of LAPACK much more
+     convenient.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference LAPACKE from NETLIB, OpenBLAS and Intel MKL.
+
+**** libtmg
+     [[http://www.netlib.org/lapack/][libtmg]] is a component of the LAPACK library, containing routines
+     for the generation of input matrices for testing and timing of
+     LAPACK. The testing and timing suites of LAPACK require libtmg,
+     but not the library itself. Note that the LAPACK library can be
+     built and used without libtmg.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the reference TMGLIB from NETLIB, OpenBLAS and Intel MKL.
+
+**** QUARK
+     [[http://icl.cs.utk.edu/quark/][QUARK]] (QUeuing And Runtime for Kernels) provides a library that
+     enables the dynamic execution of tasks with data dependencies in
+     a multi-core, multi-socket, shared-memory environment. One of
+     the QUARK or StarPU runtime systems has to be enabled in order to
+     schedule tasks on the architecture. If QUARK is enabled then
+     StarPU is disabled, and conversely. Note that StarPU is enabled
+     by default. When Chameleon is linked with QUARK, neither CUDA
+     (for GPUs) nor MPI (distributed-memory environments) can be
+     exploited. You can use StarPU to do so.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the QUARK library 0.9.
+
+**** StarPU
+     [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] is a task programming library for hybrid architectures.
+     StarPU handles run-time concerns such as:
+     - task dependencies
+     - optimized heterogeneous scheduling
+     - optimized data transfers and replication between main memory
+       and discrete memories
+     - optimized cluster communications
+     StarPU can be used to benefit from GPUs and distributed-memory
+     environments. One of the QUARK or StarPU runtime systems has to
+     be enabled in order to schedule tasks on the architecture. If
+     StarPU is enabled then QUARK is disabled, and conversely. Note
+     that StarPU is enabled by default.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with the StarPU 1.1 and 1.2 releases.
+
+**** FxT
+     [[http://download.savannah.gnu.org/releases/fkt/][FxT]] stands for both FKT (Fast Kernel Tracing) and FUT (Fast User
+     Tracing). This library provides efficient support for recording
+     traces. Chameleon can trace kernel execution on the different
+     workers and produce .paje files if FxT is enabled. FxT can only
+     be used through StarPU, and StarPU must be compiled with FxT
+     enabled; see how to use this feature in the section
+     [[sec:trace][Execution tracing with StarPU]].
+
+     *Caution about the compatibility:* FxT should be compatible with
+     the version of StarPU used.
+**** hwloc
+     [[http://www.open-mpi.org/projects/hwloc/][hwloc]] (Portable Hardware Locality) is a software package for
+     accessing the topology of a multicore system, including
+     components like cores, sockets, caches and NUMA nodes. The
+     topology discovery library, ~hwloc~, is not mandatory for StarPU
+     but strongly recommended. It makes it possible to increase
+     performance and to perform some topology-aware scheduling.
+     ~hwloc~ is available in major distributions and for most OSes,
+     and can be downloaded from http://www.open-mpi.org/software/hwloc.
+
+**** pthread
+     The POSIX threads library is required to run Chameleon on
+     Unix-like systems. It is a standard component of any such system.
+
+*** Optional dependencies
+**** OpenMPI
+     [[http://www.open-mpi.org/][OpenMPI]] is an open source Message Passing Interface
+     implementation for execution on multiple nodes with distributed
+     memory. MPI can be enabled only if the runtime system chosen is
+     StarPU (default). To use MPI through StarPU, it is necessary to
+     compile StarPU with MPI enabled.
+
+     *Caution about the compatibility:* OpenMPI should be built with the
+     --enable-mpi-thread-multiple option.
+
+**** Nvidia CUDA Toolkit
+     The [[https://developer.nvidia.com/cuda-toolkit][Nvidia CUDA Toolkit]] provides a comprehensive development
+     environment for C and C++ developers building GPU-accelerated
+     applications. Chameleon can use a set of low-level optimized
+     kernels coming from cuBLAS to accelerate computations on GPUs.
+     The [[http://docs.nvidia.com/cuda/cublas/][cuBLAS]] library is an implementation of BLAS (Basic Linear
+     Algebra Subprograms) on top of the Nvidia CUDA runtime. cuBLAS
+     is normally distributed with the Nvidia CUDA Toolkit. CUDA/cuBLAS
+     can be enabled in Chameleon only if the runtime system chosen is
+     StarPU (default). To use CUDA through StarPU, it is necessary to
+     compile StarPU with CUDA enabled.
+
+     *Caution about the compatibility:* Chameleon has been mainly tested
+     with CUDA releases from versions 4 to 7.5. Your compiler must be
+     compatible with CUDA.
+
+** Distribution of Chameleon using Spack
+   For help installing a full distribution (Chameleon +
+   dependencies), we encourage users to use the morse branch of *Spack*.
+
+   Please read this documentation:
+   * [[http://morse.gforge.inria.fr/spack/spack.html][Spack Morse]]
+   * [[http://morse.gforge.inria.fr/spack/spack.html#orgd5b1afe][Section Chameleon]]
+
+*** Usage example for a simple distribution of Chameleon
+    #+begin_src sh
+    git clone https://github.com/solverstack/spack.git
+    . ./spack/share/spack/setup-env.sh
+    spack install -v chameleon
+    # chameleon is installed here:
+    `spack location -i chameleon`
+    #+end_src
+
+** Build and install Chameleon with CMake
+   Compilation of the Chameleon libraries and executables is done with
+   CMake (http://www.cmake.org/). This version has been tested with
+   CMake 3.5.1, but any version newer than 2.8 should be fine.
+
+   Here are the steps to configure, build, test and install:
+   1. configure:
+      #+begin_src
+      cmake path/to/chameleon -DOPTION1= -DOPTION2= ...
+      # see the "Options" section to get the list of options
+      # see the "Dependencies detection" section for details about library detection
+      #+end_src
+   2. build:
+      #+begin_src
+      make
+      # do not hesitate to use the -j[ncores] option to speed up the compilation
+      #+end_src
+   3. test (optional, requires CHAMELEON_ENABLE_TESTING=ON and/or
+      CHAMELEON_ENABLE_TIMING=ON):
+      #+begin_src
+      make test
+      # or
+      ctest
+      #+end_src
+   4. install (optional):
+      #+begin_src
+      make install
+      #+end_src
+      Do not forget to specify the install directory with
+      *-DCMAKE_INSTALL_PREFIX* at configure time:
+      #+begin_example
+      cmake /home/jdoe/chameleon -DCMAKE_INSTALL_PREFIX=/home/jdoe/install/chameleon
+      #+end_example
+      Note that the install process is optional. You are free to use
+      Chameleon binaries compiled in the build directory.
+*** Configuration options
+    You can optionally activate some options at CMake configure time
+    (like CUDA, MPI, ...) by invoking
+    ~cmake path/to/your/CMakeLists.txt -DOPTION1= -DOPTION2= ...~
+    #+begin_src
+    cmake /home/jdoe/chameleon/ -DCMAKE_BUILD_TYPE=Debug \
+                                -DCMAKE_INSTALL_PREFIX=/home/jdoe/install/ \
+                                -DCHAMELEON_USE_CUDA=ON \
+                                -DCHAMELEON_USE_MPI=ON \
+                                -DBLA_VENDOR=Intel10_64lp \
+                                -DSTARPU_DIR=/home/jdoe/install/starpu-1.2/ \
+                                -DCHAMELEON_ENABLE_TRACING=ON
+    #+end_src
+
+    You can get the full list of options with the *-L[A][H]* option of
+    the cmake command:
+    #+begin_src
+    cmake -LH /home/jdoe/chameleon/
+    #+end_src
+
+    You can also set the options through the *ccmake* interface.
+
+**** Native CMake options (non-exhaustive list)
+     * *CMAKE_BUILD_TYPE=Debug|Release|RelWithDebInfo|MinSizeRel*:
+       level of compiler optimization and debug information
+     * *CMAKE_INSTALL_PREFIX=path/to/your/install/dir*: where headers,
+       libraries, executables, etc., will be copied when invoking make
+       install
+     * *BUILD_SHARED_LIBS=ON|OFF*: indicates whether CMake builds
+       CHAMELEON as static (~OFF~) or shared (~ON~) libraries
+     * *CMAKE_C_COMPILER=gcc|icc|...*: to choose the C compiler
+       if several exist in the environment
+     * *CMAKE_Fortran_COMPILER=gfortran|ifort|...*: to choose the
+       Fortran compiler if several exist in the environment
+
+**** Related to specific modules (find_package) to find external libraries
+     * *BLA_VENDOR=All|Eigen|Open|Generic|Intel10_64lp|Intel10_64lp_seq*:
+       to use Intel MKL for example; see the list of BLA_VENDOR values
+       in FindBLAS.cmake in cmake_modules/morse/find
+     * *STARPU_DIR=path/to/root/starpu/install*, see [[sec:depdet][Dependencies
+       detection]]
+     * *STARPU_INCDIR=path/to/root/starpu/install/headers*, see
+       [[sec:depdet][Dependencies detection]]
+     * *STARPU_LIBDIR=path/to/root/starpu/install/libs*, see
+       [[sec:depdet][Dependencies detection]]
+     * List of packages that can be searched just like STARPU (with
+       _DIR, _INCDIR and _LIBDIR):
+       * *BLAS*, *CBLAS*, *EZTRACE*, *FXT*, *HWLOC*, *LAPACK*, *LAPACKE*, *QUARK*,
+         *SIMGRID*, *TMG*
+
+     Libraries detected with an official CMake module (see the module
+     files in CMAKE_ROOT/Modules/): CUDA - MPI - Threads.
+
+     Libraries detected with our CMake modules (see the module files
+     in the cmake_modules/morse_cmake/modules/find/ directory of the
+     Chameleon sources): BLAS - CBLAS - EZTRACE - FXT - HWLOC -
+     LAPACK - LAPACKE - QUARK - SIMGRID - STARPU - TMG.
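+
+     For instance, a hypothetical configure line combining a
+     BLA_VENDOR value with a custom StarPU install (the paths are
+     examples only):
+     #+begin_src sh
+     # assumption: a StarPU build is installed under $HOME/install/starpu
+     cmake /home/jdoe/chameleon -DBLA_VENDOR=Intel10_64lp_seq \
+                                -DSTARPU_DIR=$HOME/install/starpu
+     #+end_src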
+
+**** Chameleon specific options
+     * *CHAMELEON_SCHED_STARPU=ON|OFF* (default ON): to link with the
+       StarPU library (runtime system)
+     * *CHAMELEON_SCHED_QUARK=ON|OFF* (default OFF): to link with the
+       QUARK library (runtime system)
+     * *CHAMELEON_USE_MPI=ON|OFF* (default OFF): to link with the MPI
+       library (message passing implementation for the use of multiple
+       nodes with distributed memory); can only be used with StarPU
+     * *CHAMELEON_USE_CUDA=ON|OFF* (default OFF): to link with the CUDA
+       runtime (implementation paradigm for accelerated codes on GPUs)
+       and the cuBLAS library (optimized BLAS kernels on GPUs); can
+       only be used with StarPU
+     * *CHAMELEON_ENABLE_DOC=ON|OFF* (default OFF): to control the
+       build of the documentation contained in the doc/ sub-directory
+     * *CHAMELEON_ENABLE_EXAMPLE=ON|OFF* (default ON): to control the
+       build of the example executables (API usage) contained in the
+       example/ sub-directory
+     * *CHAMELEON_ENABLE_PRUNING_STATS=ON|OFF* (default OFF)
+     * *CHAMELEON_ENABLE_TESTING=ON|OFF* (default ON): to control the
+       build of the testing executables (numerical checks) contained
+       in the testing/ sub-directory
+     * *CHAMELEON_ENABLE_TIMING=ON|OFF* (default ON): to control the
+       build of the timing executables (performance checks) contained
+       in the timing/ sub-directory
+     * *CHAMELEON_ENABLE_TRACING=ON|OFF* (default OFF): to enable trace
+       generation during the execution of timing drivers. It requires
+       StarPU to be linked with the FxT library (traces execution of
+       kernels on workers), see also [[sec:trace][Execution tracing
+       with StarPU]].
+     * *CHAMELEON_SIMULATION=ON|OFF* (default OFF): to enable
+       simulation mode, meaning that CHAMELEON will not actually
+       execute tasks; see details in section [[sec:simu][Use
+       simulation mode with StarPU-SimGrid]]. This option must be used
+       with StarPU compiled with [[http://simgrid.gforge.inria.fr/][SimGrid]], which makes it possible to
+       predict execution times on any architecture. This feature
+       should be used to experiment with scheduler behavior and
+       performance, not to produce solutions of linear systems.
+
+*** Dependencies detection
+    <<sec:depdet>>
+    You have different choices to detect dependencies on your system,
+    either by setting some environment variables containing paths to
+    the libraries and headers or by specifying them directly at CMake
+    configure time. Different cases:
+
+    1) detection of dependencies through environment variables:
+       - LD_LIBRARY_PATH should contain the list of paths where the
+         libraries can be found:
+         #+begin_src
+         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:install/path/to/your/lib
+         #+end_src
+       - INCLUDE should contain the list of paths where the header
+         files of the libraries can be found
+         #+begin_src
+         export INCLUDE=$INCLUDE:install/path/to/your/headers
+         #+end_src
+    2) detection with user-given paths:
+       - you can specify the path at CMake configure time by invoking
+         ~cmake path/to/your/CMakeLists.txt -DLIB_DIR=path/to/your/lib~,
+         where LIB stands for the name of the library to look for
+         #+begin_src
+         cmake path/to/your/CMakeLists.txt -DSTARPU_DIR=path/to/starpudir \
+                                           -DCBLAS_DIR= ...
+         #+end_src
+       - it is also possible to specify header and library directories
+         separately
+         #+begin_src
+         cmake path/to/your/CMakeLists.txt -DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \
+                                           -DSTARPU_LIBDIR=path/to/libstarpu/lib
+         #+end_src
+       - note: BLAS and LAPACK detection can be tedious, so we provide
+         a verbose mode; set *-DBLAS_VERBOSE=ON* or *-DLAPACK_VERBOSE=ON*
+         to activate it
+    3) detection with custom environment variables: all variables like
+       _DIR, _INCDIR, _LIBDIR can be set as environment variables
+       instead of CMake options; they will be read
+    4) using pkg-config for libraries that provide .pc files
+       - update your *PKG_CONFIG_PATH* with the paths where the .pc
+         files of installed external libraries like hwloc, StarPU,
+         some BLAS/LAPACK, etc. can be found
+*** Execution tracing with StarPU
+    <<sec:trace>>
+    StarPU can generate its own trace log files by compiling it with
+    the ~--with-fxt~ option at the configure step (you may have to
+    specify the directory where you installed FxT by giving
+    ~--with-fxt=...~ instead of ~--with-fxt~ alone). By doing so, traces
+    are generated after each execution of a program which uses StarPU,
+    in the directory pointed to by the STARPU_FXT_PREFIX environment
+    variable.
+    #+begin_example
+    export STARPU_FXT_PREFIX=/home/jdoe/fxt_files/
+    #+end_example
+    When executing a ~./timing/...~ Chameleon program, if tracing has
+    been enabled (StarPU compiled with FxT and
+    *-DCHAMELEON_ENABLE_TRACING=ON*), you can give the option ~--trace~
+    to tell the program to generate trace log files.
+
+    Finally, to generate the trace file which can be opened with the
+    ViTE program (http://vite.gforge.inria.fr/), you can use the
+    *starpu_fxt_tool* executable of StarPU. This tool should be in
+    ~$STARPU_INSTALL_REPOSITORY/bin~. You can use it to generate the
+    trace file like this:
+    #+begin_src
+    path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename
+    #+end_src
+    There is one file per MPI process (prof_filename_0,
+    prof_filename_1, ...). To generate a trace of MPI programs you can
+    call it like this:
+    #+begin_src
+    path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename*
+    #+end_src
+    The trace file will be named paje.trace (use the -o option to
+    specify an output name). Alternatively, for non-MPI execution
+    (only one process and profiling file), you can set the environment
+    variable *STARPU_GENERATE_TRACE=1* to automatically generate the
+    paje trace file.
+
+*** Use simulation mode with StarPU-SimGrid
+    <<sec:simu>>
+    Simulation mode can be activated by setting the CMake option
+    CHAMELEON_SIMULATION to ON. This mode allows you to simulate the
+    execution of algorithms with StarPU compiled with SimGrid
+    (http://simgrid.gforge.inria.fr/). To do so, we provide some
+    perfmodels in the simucore/perfmodels/ directory of the Chameleon
+    sources. To use these perfmodels, please set your *STARPU_HOME*
+    environment variable to
+    ~path/to/your/chameleon_sources/simucore/perfmodels~. Finally, you
+    need to set your *STARPU_HOSTNAME* environment variable to the name
+    of the machine to simulate. For example: *STARPU_HOSTNAME=mirage*.
+    Note that only POTRF kernels with block sizes of 320 or 960
+    (single and double precision) on the mirage and sirocco machines
+    are available for now. The database of models is subject to
+    change.
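+
+    For instance, a hypothetical simulated run of the Cholesky timing
+    driver (machine name and block size taken from the perfmodels
+    described above; the driver options are documented in the chapter
+    on using Chameleon executables):
+    #+begin_src sh
+    # assumption: Chameleon was built with -DCHAMELEON_SIMULATION=ON
+    export STARPU_HOME=path/to/your/chameleon_sources/simucore/perfmodels
+    export STARPU_HOSTNAME=mirage
+    ./timing/time_dpotrf --n_range=9600:9600:1 --nb=960 --threads=8 --gpus=3
+    #+end_src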
diff --git a/doc/orgmode/chapters/introduction.org b/doc/orgmode/chapters/introduction.org
new file mode 100644
index 0000000000000000000000000000000000000000..ee33b7597b7ca6f517bb63362aa6b3a75cb8d258
--- /dev/null
+++ b/doc/orgmode/chapters/introduction.org
@@ -0,0 +1,302 @@
+# This file is part of the CHAMELEON User's Guide.
+# Copyright (C) 2017 Inria
+# See the file ../users_guide.org for copying conditions.
+** MORSE project
+   #+NAME: fig:morse_header
+   #+ATTR_HTML: :align center
+   [[file:morse_header.png]]
+*** MORSE Objectives
+    When processor clock speeds flatlined in 2004, after more than
+    fifteen years of exponential increases, the era of near automatic
+    performance improvements that the HPC application community had
+    previously enjoyed came to an abrupt end. To develop software that
+    will perform well on petascale and exascale systems with thousands
+    of nodes and millions of cores, the list of major challenges that
+    must now be confronted is formidable:
+    1) dramatic escalation in the costs of intrasystem communication
+       between processors and/or levels of memory hierarchy;
+    2) increased heterogeneity of the processing units (mixing CPUs,
+       GPUs, etc. in varying and unexpected design combinations);
+    3) high levels of parallelism and more complex constraints mean
+       that cooperating processes must be dynamically and unpredictably
+       scheduled for asynchronous execution;
+    4) software will not run at scale without much better resilience to
+       faults and far more robustness; and
+    5) new levels of self-adaptivity will be required to enable
+       software to modulate process speed in order to satisfy limited
+       energy budgets.
+    The MORSE associate team will tackle the first three challenges in
+    an orchestrated effort between research groups respectively
+    specialized in sparse linear algebra, dense linear algebra and
+    runtime systems. The overall objective is to develop robust linear
+    algebra libraries relying on innovative runtime systems that can
+    fully benefit from the potential of those future large-scale
+    complex machines. Challenges 4) and 5) will also be investigated
+    by the different teams in the context of other partnerships, but
+    they will not be the main focus of the associate team as they are
+    much more prospective.
+
+*** Research fields
+    The overall goal of the MORSE associate team is to enable advanced
+    numerical algorithms to be executed on a scalable unified runtime
+    system for exploiting the full potential of future exascale
+    machines. We expect advances in three directions, based first on
+    strong and close interactions between the runtime and numerical
+    linear algebra communities. This initial activity will then
+    naturally expand to more focused but still joint research in both
+    fields.
+
+**** Fine interaction between linear algebra and runtime systems
+     On parallel machines, HPC applications need to take care of data
+     movement and consistency, which can be either explicitly managed
+     at the level of the application itself or delegated to a runtime
+     system. We adopt the latter approach in order to better keep up
+     with hardware trends whose complexity is growing exponentially.
+     One major task in this project is to define a proper interface
+     between HPC applications and runtime systems in order to maximize
+     productivity and expressivity. As mentioned in the next section,
+     a widely used approach consists in abstracting the application as
+     a DAG that the runtime system is in charge of scheduling.
+     Scheduling such a DAG over a set of heterogeneous processing
+     units introduces a lot of new challenges, such as accurately
+     predicting the execution time of each type of task over each kind
+     of unit, minimizing data transfers between memory banks,
+     performing data prefetching, etc. Expected advances: In a
+     nutshell, a new runtime system API will be designed to allow
+     applications to provide scheduling hints to the runtime system
+     and to get real-time feedback about the consequences of
+     scheduling decisions.
+
+**** Runtime systems
+     A runtime environment is an intermediate layer between the system
+     and the application. It provides low-level functionality not
+     provided by the system (such as scheduling or management of
+     heterogeneity) and high-level features (such as performance
+     portability). In the framework of this proposal, we will work on
+     the scalability of runtime environments. To achieve scalability
+     it is required to avoid all centralization. Here, the main
+     problem is the scheduling of the tasks. In many task-based
+     runtime environments the scheduler is centralized and becomes a
+     bottleneck as soon as too many cores are involved. It is
+     therefore required to distribute the scheduling decision or to
+     compute a data distribution that imposes the mapping of tasks
+     using, for instance, the so-called ``owner-compute'' rule.
+     Expected advances: We will design runtime systems that enable an
+     efficient and scalable use of thousands of distributed multicore
+     nodes enhanced with accelerators.
+
+**** Linear algebra
+     Because of its central position in HPC and of the well understood
+     structure of its algorithms, dense linear algebra has often
+     pioneered new challenges that HPC had to face. Again, dense
+     linear algebra has been in the vanguard of the new era of
+     petascale computing with the design of new algorithms that can
+     efficiently run on a multicore node with GPU accelerators. These
+     algorithms are called ``communication-avoiding'' since they have
+     been redesigned to limit the amount of communication between
+     processing units (and between the different levels of memory
+     hierarchy). They are expressed through Directed Acyclic Graphs
+     (DAGs) of fine-grained tasks that are dynamically
+     scheduled. Expected advances: First, we plan to investigate the
+     impact of these principles in the case of sparse applications
+     (whose algorithms are slightly more complicated but often rely on
+     dense kernels). Furthermore, both in the dense and sparse cases,
+     the scalability on thousands of nodes is still limited; new
+     numerical approaches need to be found. We will specifically
+     design sparse hybrid direct/iterative methods that represent a
+     promising approach.
+
+*** Research papers
+    Research papers about MORSE can be found at
+    http://icl.cs.utk.edu/projectsdev/morse/pubs/index.html
+
+** CHAMELEON
+*** CHAMELEON software
+    The main purpose is to address the performance shortcomings of the
+    [[http://www.netlib.org/lapack/][LAPACK]] and [[http://www.netlib.org/scalapack/][ScaLAPACK]] libraries on multicore processors and
+    multi-socket systems of multicore processors, and their inability
+    to efficiently utilize accelerators such as Graphics Processing
+    Units (GPUs).
+
+    CHAMELEON is a framework written in C which provides routines to
+    solve dense general systems of linear equations, symmetric
+    positive definite systems of linear equations and linear least
+    squares problems, using LU, Cholesky, QR and LQ factorizations.
+    Real arithmetic and complex arithmetic are supported in both
+    single precision and double precision. It supports Linux and
+    Mac OS X machines (only tested on the Intel x86-64 architecture).
+
+    CHAMELEON is based on the [[http://icl.cs.utk.edu/plasma/][PLASMA]] source code but is not limited
+    to shared-memory environments and can exploit multiple GPUs.
+    CHAMELEON is interfaced in a generic way with both the [[http://icl.cs.utk.edu/quark/][QUARK]] and
+    [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] runtime systems. This feature makes it possible to
+    analyze, in a unified framework, how sequential task-based
+    algorithms behave on top of different runtime system
+    implementations. Using CHAMELEON with the [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]] runtime system
+    makes it possible to exploit GPUs through kernels provided by
+    [[https://developer.nvidia.com/cublas][cuBLAS]], and clusters of interconnected nodes with distributed
+    memory (using [[http://www.open-mpi.org/][MPI]]). Computation of very large systems with dense
+    matrices on a cluster of nodes is still being experimented with
+    and stabilized. It is not expected to get stable performance with
+    the current version using MPI.
+
+*** PLASMA's design principles
+    CHAMELEON is originally based on [[http://icl.cs.utk.edu/plasma/][PLASMA]], so the design principles
+    are very similar. The content of this section has been copied
+    from the /Design principles/ section of the PLASMA User's Guide.
+
+**** Tile Algorithms
+     Tile algorithms are based on the idea of processing the matrix by
+     square tiles of relatively small size, such that a tile fits
+     entirely in one of the cache levels associated with one core.
+     This way a tile can be loaded to the cache and processed
+     completely before being evicted back to the main memory. Of the
+     three types of cache misses, *compulsory*, *capacity* and *conflict*,
+     the use of tile algorithms minimizes the number of capacity
+     misses, since each operation loads the amount of data that does
+     not ``overflow'' the cache.
+
+     For some operations such as matrix multiplication and Cholesky
+     factorization, translating the classic algorithm to the tile
+     algorithm is trivial. In the case of matrix multiplication, the
+     tile algorithm is simply a product of applying the technique of
+     *loop tiling* to the canonical definition of three nested loops. It
+     is very similar for the Cholesky factorization. The *left-looking*
+     definition of Cholesky factorization from LAPACK is a loop with a
+     sequence of calls to four routines: xSYRK (symmetric *rank-k*
+     update), xPOTRF (Cholesky factorization of a small block on the
+     diagonal), xGEMM (matrix multiplication) and xTRSM (triangular
+     solve). If the xSYRK, xGEMM and xTRSM operations are expressed
+     with the canonical definition of three nested loops and the
+     technique of loop tiling is applied, the tile algorithm results.
+     Since the algorithm is produced by simple reordering of
+     operations, neither the number of operations nor the numerical
+     stability of the algorithm is affected.
+
+     The situation becomes slightly more complicated for LU and QR
+     factorizations, where the classic algorithms factorize an entire
+     panel of the matrix (a block of columns) at every step of the
+     algorithm. One can observe, however, that the process of matrix
+     factorization is synonymous with introducing zeros in appropriate
+     places, and a tile algorithm can be thought of as one that zeroes
+     one tile of the matrix at a time.
+     This process is referred to as
+     updating of a factorization or *incremental factorization*. The
+     process is equivalent to factorizing the top tile of a panel,
+     then placing the upper triangle of the result on top of the tile
+     below and factorizing again, then moving to the next tile and so
+     on. Here, the tile LU and QR algorithms perform slightly more
+     floating point operations and require slightly more memory for
+     auxiliary data. Also, the tile LU factorization applies a
+     different pivoting pattern and, as a result, is less numerically
+     stable than classic LU with partial pivoting. Numerical
+     stability is not an issue in the case of the tile QR, which
+     relies on orthogonal transformations (Householder reflections),
+     which are numerically stable.
+
+     #+CAPTION: Schematic illustration of the tile LU factorization (kernel names for real arithmetic in double precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+     #+NAME: fig:tile_lu
+     #+ATTR_HTML: :width 640px :align center
+     [[file:tile_lu.jpg]]
+
+**** Tile Data Layout
+     Tile layout is based on the idea of storing the matrix by square
+     tiles of relatively small size, such that each tile occupies a
+     contiguous memory region. This way a tile can be loaded to the
+     cache memory efficiently and the risk of evicting it from the
+     cache memory before it is completely processed is minimized. Of
+     the three types of cache misses, *compulsory*, *capacity* and
+     *conflict*, the use of tile layout minimizes the number of conflict
+     misses, since a contiguous region of memory will completely fill
+     out a /set-associative/ cache memory before an eviction can
+     happen. Also, from the standpoint of multithreaded execution, the
+     probability of *false sharing* is minimized. It can only
+     affect the cache lines containing the beginning and the ending of
+     a tile.
+
+     In standard *cache-based* architectures, tiles contiguously laid
+     out in memory maximize the profit from automatic prefetching.
+     Tile layout is also beneficial in situations involving the use of
+     accelerators, where explicit communication of tiles through DMA
+     transfers is required, such as moving tiles between the system
+     memory and the local store in Cell B.E. or moving tiles between
+     the host memory and the device memory in GPUs. In most
+     circumstances tile layout also minimizes the number of TLB misses
+     and conflicts to memory banks or partitions. With the standard
+     (*column-major*) layout, access to each column of a tile is much
+     more likely to cause a conflict miss, a false sharing miss, a TLB
+     miss or a bank or partition conflict. The use of the standard
+     layout for dense matrix operations is a performance minefield.
+     Although occasionally one can pass through it unscathed, the risk
+     of hitting a spot deadly to performance is very high.
+
+     Another property of the layout utilized in PLASMA is that it is
+     ``flat'', meaning that it does not involve a level of
+     indirection. Each tile stores a small square submatrix of the
+     main matrix in a *column-major* layout. In turn, the main matrix is
+     an arrangement of tiles immediately following one another in a
+     *column-major* layout. The offset of each tile can be calculated
+     through address arithmetic and does not involve pointer
+     indirection. Alternatively, a matrix could be represented as an
+     array of pointers to tiles, located anywhere in memory. Such a
+     layout would be a radical and unjustifiable departure from LAPACK
+     and ScaLAPACK.
+     Flat tile layout is a natural progression from
+     LAPACK's *column-major* layout and ScaLAPACK's
+     /block-cyclic/ layout.
+
+     Another related property of PLASMA's tile layout is that it
+     includes provisions for padding of tiles, i.e., the actual region
+     of memory designated for a tile can be larger than the memory
+     occupied by the actual data. This makes it possible to force a
+     certain alignment of tile boundaries, while using the flat
+     organization described in the previous paragraph. The motivation
+     is that, at the price of a small memory overhead, alignment of
+     tile boundaries may prove beneficial in multiple scenarios
+     involving memory systems of standard multicore processors, as
+     well as accelerators. The issues that come into play are, again,
+     the use of TLBs and memory banks or partitions.
+
+     #+CAPTION: Schematic illustration of the tile layout with *column-major* order of tiles, *column-major* order of elements within tiles and (optional) padding for enforcing a certain alignment of tile boundaries, courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+     #+NAME: fig:tile_layout
+     #+ATTR_HTML: :width 640px :align center
+     [[file:tile_layout.jpg]]
+
+**** Dynamic Task Scheduling
+
+     Dynamic scheduling is the idea of assigning work to cores based
+     on the availability of data for processing at any given point in
+     time and is also referred to as *data-driven* scheduling. The
+     concept is related closely to the idea of expressing computation
+     through a task graph, often referred to as the DAG (*Directed
+     Acyclic Graph*), and the flexibility of exploring the DAG at
+     runtime. Thus, to a large extent, dynamic scheduling is
+     synonymous with *runtime scheduling*. An important concept here is
+     the one of the *critical path*, which defines the upper bound on
+     the achievable parallelism, and needs to be pursued at the
+     maximum speed. This is in direct opposition to the *fork-and-join*
+     or *data-parallel* programming models, where artificial
+     synchronization points expose serial sections of the code, where
+     multiple cores are idle, while sequential processing takes place.
+     The use of dynamic scheduling introduces a *trade-off*, though.
+     The more dynamic (flexible) scheduling is, the more centralized
+     (and less scalable) the scheduling mechanism is. For that reason,
+     currently PLASMA uses two scheduling mechanisms, one which is
+     fully dynamic and one where work is assigned statically and
+     dependency checks are done at runtime.
+
+     The first scheduling mechanism relies on unfolding a *sliding
+     window* of the task graph at runtime and scheduling work by
+     resolving data hazards: *Read After Write (RAW)*, *Write After Read
+     (WAR)* and *Write After Write (WAW)*, a technique analogous to
+     instruction scheduling in superscalar processors. It also relies
+     on *work-stealing* for balancing the load among all cores.
+     The second scheduling mechanism relies on statically designating
+     a path through the execution space of the algorithm to each core
+     and following a cycle: transition to a task, wait for its
+     dependencies, execute it, update the overall progress. Tasks are
+     identified by tuples and task transitions are done through
+     locally evaluated formulas. Progress information can be
+     centralized, replicated or distributed (currently centralized).
+
+     #+CAPTION: A trace of the tile QR factorization executing on eight cores without any global synchronization points (kernel names for real arithmetic in single precision), courtesy of the [[http://icl.cs.utk.edu/plasma/][PLASMA]] team.
+     #+NAME: fig:trace_qr
+     #+ATTR_HTML: :width 640px :align center
+     [[file:trace_qr.jpg]]
diff --git a/doc/orgmode/chapters/using.org b/doc/orgmode/chapters/using.org
new file mode 100644
index 0000000000000000000000000000000000000000..4cfd8e6626b79a17baaf587765ffbeb527fb5cf0
--- /dev/null
+++ b/doc/orgmode/chapters/using.org
@@ -0,0 +1,1434 @@
+@c -*-texinfo-*-
+
+@c This file is part of the MORSE Handbook.
+@c Copyright (C) 2014 Inria
+@c Copyright (C) 2014 The University of Tennessee
+@c Copyright (C) 2014 King Abdullah University of Science and Technology
+@c See the file ../chameleon.texi for copying conditions.
+
+@menu
+* Using CHAMELEON executables::
+* Linking an external application with CHAMELEON libraries::
+* CHAMELEON API::
+@end menu
+
+@node Using CHAMELEON executables
+@section Using CHAMELEON executables
+
+CHAMELEON provides several test executables that are compiled and linked
+with CHAMELEON's stack of dependencies.
+Instructions about the arguments to give to the executables are available
+via the @option{-[-]help} or @option{-[-]h} option.
+This set of binaries is separated into three categories and can be found in
+three different directories:
+
+@itemize @bullet
+
+  @item example
+
+  contains examples of API usage; more specifically, the
+  sub-directory lapack_to_morse/ provides a tutorial that explains how to
+  use CHAMELEON functionalities starting from a full LAPACK code, see
+@ref{Tutorial LAPACK to CHAMELEON}
+
+  @item testing
+
+  contains testing drivers to check the numerical correctness of
+  CHAMELEON linear algebra routines with a wide range of parameters
+  @example
+  ./testing/stesting 4 1 LANGE 600 100 700
+  @end example
+  The first two arguments are the number of cores and GPUs to use.
+  The third one is the name of the algorithm to test.
+  The other arguments depend on the algorithm; here they correspond to the
+  number of rows, the number of columns and the leading dimension of the
+  problem.
+
+  The names of the algorithms available for testing are:
+  @itemize @bullet
+  @item LANGE: matrix norms (infinity, one, max, Frobenius)
+  @item GEMM: general matrix-matrix multiply
+  @item HEMM: Hermitian matrix-matrix multiply
+  @item HERK: Hermitian matrix-matrix rank k update
+  @item HER2K: Hermitian matrix-matrix rank 2k update
+  @item SYMM: symmetric matrix-matrix multiply
+  @item SYRK: symmetric matrix-matrix rank k update
+  @item SYR2K: symmetric matrix-matrix rank 2k update
+  @item PEMV: matrix-vector multiply with a pentadiagonal matrix
+  @item TRMM: triangular matrix-matrix multiply
+  @item TRSM: triangular solve, multiple right-hand sides
+  @item POSV: solve linear systems with a symmetric positive-definite matrix
+  @item GESV_INCPIV: solve linear systems with a general matrix
+  @item GELS: linear least squares with a general matrix
+  @end itemize
+
+  @item timing
+
+  contains timing drivers to assess the performance of CHAMELEON routines.
+  There are two sets of executables: those that do not use the tile
+interface and those that do (with _tile in the name of the executable).
+  Executables without the tile interface allocate data following LAPACK
+conventions, and these data can be given as arguments to CHAMELEON routines
+as you would do with LAPACK.
+ Executables with the tile interface directly generate the data in the tile
+format that CHAMELEON algorithms use to submit tasks to the runtime system.
+ Executables with the tile interface should be more efficient because no data
+copy from the LAPACK matrix layout to the tile matrix layout is necessary.
+ Calling example:
+ @example
+ ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320
+                      --threads=9 --gpus=3
+                      --nowarmup
+ @end example
+
+ List of the main options that can be used in timing:
+ @itemize @bullet
+ @item @option{--help}: show usage
+ @item @option{--threads}: number of CPU workers (default:
+@option{_SC_NPROCESSORS_ONLN})
+ @item @option{--gpus}: number of GPU workers (default: @option{0})
+ @item @option{--n_range=R}: range of N values, with
+@option{R=Start:Stop:Step}
+(default: @option{500:5000:500})
+ @item @option{--m=X}: dimension (M) of the matrices (default: @option{N})
+ @item @option{--k=X}: dimension (K) of the matrices (default: @option{1}),
+useful for the GEMM algorithm (K is the shared dimension and must be defined
+> 1 to consider matrices and not vectors)
+ @item @option{--nrhs=X}: number of right-hand sides (default: @option{1})
+ @item @option{--nb=X}: block/tile size (default: @option{128})
+ @item @option{--ib=X}: inner-blocking/IB size (default: @option{32})
+ @item @option{--niter=X}: number of iterations performed for each test
+(default: @option{1})
+ @item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ
+factorization; X is the size of each subdomain (default: @option{0})
+ @item @option{--[no]check}: check result (default: @option{nocheck})
+ @item @option{--[no]profile}: print profiling information (default:
+@option{noprofile})
+ @item @option{--[no]trace}: enable/disable trace generation (default:
+@option{notrace})
+ @item @option{--[no]dag}: enable/disable DAG generation (default:
+@option{nodag})
+ @item @option{--[no]inv}: check on inverse (default: @option{noinv})
+ @item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs
+(default: @option{0})
+ @end itemize
+
+ List of the timing algorithms available:
+ @itemize @bullet
+ @item LANGE: norms of matrices
+ @item GEMM: general matrix-matrix multiply
+ @item TRSM: triangular solve
+ @item POTRF: Cholesky factorization of a symmetric
+positive-definite matrix
+ @item POSV: solve linear systems with symmetric positive-definite matrix
+ @item GETRF_NOPIV: LU factorization of a general matrix
+using the tile LU algorithm without row pivoting
+ @item GESV_NOPIV: solve linear systems for a general matrix
+using the tile LU algorithm without row pivoting
+ @item GETRF_INCPIV: LU factorization of a general matrix
+using the tile LU algorithm with partial tile pivoting and row interchanges
+ @item GESV_INCPIV: solve linear systems for a general matrix
+using the tile LU algorithm with partial tile pivoting and row interchanges
+ @item GEQRF: QR factorization of a general matrix
+ @item GELS: solve overdetermined or underdetermined linear systems
+involving a general matrix using the QR or the LQ factorization
+ @end itemize
+
+@end itemize
+
+@node Linking an external application with CHAMELEON libraries
+@section Linking an external application with CHAMELEON libraries
+
+Compilation and link with CHAMELEON libraries have been tested with
+@strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}.
+
+@menu
+* Static linking in C::
+* Dynamic linking in C::
+* Build a Fortran program with CHAMELEON::
+@end menu
+
+@node Static linking in C
+@subsection Static linking in C
+
+Let us imagine you have a file main.c that you want to link with CHAMELEON
+static libraries.
+Let us consider @file{/home/yourname/install/chameleon} is the install
+directory of CHAMELEON containing the sub-directories @file{include/} and
+@file{lib/}.
+Your compilation command with the gcc compiler could be:
+@example
+gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c
+@end example
+
+Now if you want to link your application with CHAMELEON static libraries, you
+could do:
+@example
+gcc main.o -o main \
+/home/yourname/install/chameleon/lib/libchameleon.a \
+/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
+/home/yourname/install/chameleon/lib/libcoreblas.a \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+As you can see in this example, we also link with some dynamic libraries:
+@option{starpu-1.1}, @option{Intel MKL} libraries (for
+BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) and
+@option{rt}.
+These libraries depend on the configuration of your CHAMELEON build.
+You can find these dependencies in the .pc files we generate during
+compilation and that are installed in the sub-directory @file{lib/pkgconfig}
+of your CHAMELEON install directory.
+Note also that you may need to specify where to find these libraries with
+the @option{-L} option of your compiler/linker.
+
+Before running your program, make sure that all the shared library paths your
+executable depends on are known.
+Enter @code{ldd main} to check.
+If some shared library paths are missing, append them to the
+@env{LD_LIBRARY_PATH} (for Linux systems) environment variable
+(@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows).
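+
+If @command{pkg-config} is installed, the .pc files mentioned above can also
+be used to retrieve the link flags automatically.
+The following is only a sketch: it assumes the file is named
+@file{chameleon.pc}, which may differ in your installation.
+@example
+export PKG_CONFIG_PATH=/home/yourname/install/chameleon/lib/pkgconfig:$PKG_CONFIG_PATH
+gcc main.o -o main `pkg-config --libs --static chameleon`
+@end example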
+
+@node Dynamic linking in C
+@subsection Dynamic linking in C
+
+For dynamic linking (you need to build CHAMELEON with the CMake
+option @option{BUILD_SHARED_LIBS=ON}) the process is similar to the static
+compilation/link, but instead of specifying paths to your static libraries
+you indicate the path to the dynamic libraries with the @option{-L} option
+and you give the names of the libraries with the @option{-l} option, like
+this:
+@example
+gcc main.o -o main \
+-L/home/yourname/install/chameleon/lib \
+-lchameleon -lchameleon_starpu -lcoreblas \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+
+Note that an update of your environment variable
+@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows)
+with the path of the libraries could be required before executing, example:
+@example
+export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib
+@end example
+
+@node Build a Fortran program with CHAMELEON
+@subsection Build a Fortran program with CHAMELEON
+
+CHAMELEON provides a Fortran interface to user functions. Example:
+@example
+call morse_version(major, minor, patch) !or
+call MORSE_VERSION(major, minor, patch)
+@end example
+
+Build and link are very similar to the C case.
+
+Compilation example:
+@example
+gfortran -o main.o -c main.f90
+@end example
+
+Static linking example:
+@example
+gfortran main.o -o main \
+/home/yourname/install/chameleon/lib/libchameleon.a \
+/home/yourname/install/chameleon/lib/libchameleon_starpu.a \
+/home/yourname/install/chameleon/lib/libcoreblas.a \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+
+Dynamic linking example:
+@example
+gfortran main.o -o main \
+-L/home/yourname/install/chameleon/lib \
+-lchameleon -lchameleon_starpu -lcoreblas \
+-lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \
+-lmkl_sequential -lmkl_core -lpthread -lm -lrt
+@end example
+
+@node CHAMELEON API
+@section CHAMELEON API
+
+CHAMELEON provides routines to solve dense general systems of linear
+equations, symmetric positive definite systems of linear equations and linear
+least squares problems, using LU, Cholesky, QR and LQ factorizations.
+Real arithmetic and complex arithmetic are supported in both single precision
+and double precision.
+Routines that compute linear algebra are of the following form:
+@example
+MORSE_name[_Tile[_Async]]
+@end example
+@itemize @bullet
+@item all user routines are prefixed with @code{MORSE}
+@item @code{name} follows the BLAS/LAPACK naming scheme for algorithms
+(@emph{e.g.} sgemm for general matrix-matrix multiply in single precision)
+@item CHAMELEON provides three interface levels
+ @itemize @minus
+ @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE,
+matrices are given following the LAPACK data layout (1-D array column-major).
+It involves copies of data from the LAPACK layout to the tile layout and
+conversely (to update the LAPACK data), see @ref{Step1}.
+ @item @code{MORSE_name_Tile}: the tile interface avoids copies between LAPACK
+and tile layouts. It is the standard interface of CHAMELEON and it should
+achieve better performance than the previous, simplest interface. The data are
+given through a specific structure called a descriptor, see @ref{Step2}.
+ @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids
+the synchronization barrier normally called between @code{Tile} routines.
+At the end of an @code{Async} function, completion of the tasks is not
+guaranteed and data are not necessarily up-to-date.
+To ensure that all tasks have been executed, a synchronization function has to
+be called after the sequence of @code{Async} functions, see @ref{Step4}.
+ @end itemize
+@end itemize
+
+MORSE routine calls have to be preceded by
+@example
+MORSE_Init( NCPU, NGPU );
+@end example
+to initialize MORSE and the runtime system, and followed by
+@example
+MORSE_Finalize();
+@end example
+to free some data and finalize the runtime and/or MPI.
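+
+As a minimal sketch, a typical calling sequence therefore looks as follows
+(here @code{A} is an N-by-N symmetric positive definite matrix stored in
+LAPACK layout, and @code{MorseUpper} is assumed as the uplo constant):
+@verbatim
+/* Sketch: initialize, compute, finalize. */
+MORSE_Init( NCPU, NGPU );            /* start MORSE and the runtime system */
+MORSE_dpotrf( MorseUpper, N, A, N ); /* any MORSE routine(s) */
+MORSE_Finalize();                    /* free data, stop runtime and/or MPI */
+@end verbatim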
+
+@menu
+* Tutorial LAPACK to CHAMELEON::
+* List of available routines::
+@end menu
+
+@node Tutorial LAPACK to CHAMELEON
+@subsection Tutorial LAPACK to CHAMELEON
+
+This tutorial is dedicated to the API usage of CHAMELEON.
+The idea is to start from a simple code and explain, step by step, how to
+use CHAMELEON routines.
+The first step is a full BLAS/LAPACK code without any dependency on
+CHAMELEON, a code that most users should easily understand.
+Then, the different interfaces CHAMELEON provides are exposed, from the
+simplest API (step1) to more complicated ones (until step4).
+The way some important parameters are set is discussed in step5.
+step6 is an example of distributed computation with MPI.
+Finally, step7 shows how to let Chameleon initialize the user's data
+(matrices/vectors) in parallel.
+
+Source files can be found in the @file{example/lapack_to_morse/}
+directory.
+If the CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then the
+source files are compiled with the project libraries.
+The arithmetic precision is @code{double}.
+To execute a step @samp{X}, enter the following command:
+@example
+./step@samp{X} --option1 --option2 ...
+@end example
+Instructions about the arguments to give to the executables are accessible
+thanks to the option @option{-[-]help} or @option{-[-]h}.
+Note that default values exist for the options.
+
+For all steps, the program solves a linear system @math{Ax=B}.
+The matrix values are randomly generated, but in such a way that matrix
+@math{A} is symmetric positive definite, so that @math{A} can be factorized
+in an @math{LL^T} form using the Cholesky factorization.
+
+Let us comment on the different steps of the tutorial:
+@menu
+* Step0:: a simple Cholesky example using the C interface of
+BLAS/LAPACK
+* Step1:: introduces the LAPACK equivalent interface of Chameleon
+* Step2:: introduces the tile interface
+* Step3:: indicates how to give your own tile matrix to Chameleon
+* Step4:: introduces the tile async interface
+* Step5:: shows how to set some important parameters
+* Step6:: shows how to benefit from MPI in Chameleon
+* Step7:: shows how to let Chameleon initialize the user's matrix data
+@end menu
+
+@node Step0
+@subsubsection Step0
+
+The C interfaces of BLAS and LAPACK, that is, CBLAS and
+LAPACKE, are used to solve the system. The size of the system (matrix) and
+the number of right-hand sides can be given as arguments to the executable
+(be careful not to give huge numbers if you do not have an infinite amount
+of RAM!).
+As for every step, the correctness of the solution is checked by calculating
+the norm @math{||Ax-B||/(||A||||x||+||B||)}.
+The time spent in factorization+solve is recorded and, because we know
+exactly the number of operations of these algorithms, we deduce the number
+of operations that have been processed per second (in GFlops/s).
+The important part of the code that solves the problem is:
+@verbatim
+/* Cholesky factorization:
+ * A is replaced by its factorization L or L^T depending on uplo */
+LAPACKE_dpotrf( LAPACK_COL_MAJOR, 'U', N, A, N );
+/* Solve:
+ * B is stored in X on entry, X contains the result on exit.
+ * Forward ...
+ */
+cblas_dtrsm(
+    CblasColMajor,
+    CblasLeft,
+    CblasUpper,
+    CblasConjTrans,
+    CblasNonUnit,
+    N, NRHS, 1.0, A, N, X, N);
+/* ... and back substitution */
+cblas_dtrsm(
+    CblasColMajor,
+    CblasLeft,
+    CblasUpper,
+    CblasNoTrans,
+    CblasNonUnit,
+    N, NRHS, 1.0, A, N, X, N);
+@end verbatim
+
+@node Step1
+@subsubsection Step1
+
+This step introduces the simplest CHAMELEON interface, which is equivalent
+to CBLAS/LAPACKE.
+The code is very similar to step0 but instead of calling CBLAS/LAPACKE
+functions, we call the CHAMELEON equivalent functions.
+The solving code becomes:
+@verbatim
+/* Factorization: */
+MORSE_dpotrf( UPLO, N, A, N );
+/* Solve: */
+MORSE_dpotrs(UPLO, N, NRHS, A, N, X, N);
+@end verbatim
+The API is almost the same, so that it is easy to use for beginners.
+It is important to keep in mind that before any call to MORSE routines,
+@code{MORSE_Init} has to be invoked to initialize MORSE and the runtime
+system.
+Example:
+@verbatim
+MORSE_Init( NCPU, NGPU );
+@end verbatim
+After all MORSE calls have been done, a call to @code{MORSE_Finalize} is
+required to free some data and finalize the runtime and/or MPI.
+@verbatim
+MORSE_Finalize();
+@end verbatim
+We use MORSE routines with the LAPACK interface, which means the routines
+accept the same matrix format as LAPACK (1-D array column-major).
+Note that we copy the matrix to get it into our own tile structures; see
+details about this format in @ref{Tile Data Layout}.
+This means you can incur an overhead coming from the copies.
+
+@node Step2
+@subsubsection Step2
+
+This program is a copy of step1 but, instead of using the LAPACK interface,
+which leads to copying the LAPACK matrices inside the MORSE routines, we use
+the tile interface.
+We still use the standard matrix format, but we see how to use this matrix
+to create a MORSE descriptor, a structure wrapping the data on which we want
+to apply sequential task-based algorithms.
+The solving code becomes:
+@verbatim
+/* Factorization: */
+MORSE_dpotrf_Tile( UPLO, descA );
+/* Solve: */
+MORSE_dpotrs_Tile( UPLO, descA, descX );
+@end verbatim
+To use the tile interface, a specific structure @code{MORSE_desc_t} must be
+created.
+This can be achieved in different ways.
+@enumerate
+@item Use the existing function @code{MORSE_Desc_Create}: the matrix data
+are considered contiguous in memory, as in PLASMA (@ref{Tile Data Layout}).
+@item Use the existing function @code{MORSE_Desc_Create_OOC}: the matrix
+data are allocated on demand in memory, tile by tile, and possibly pushed
+to disk if they do not fit in memory.
+@item Use the existing function @code{MORSE_Desc_Create_User}: it is more
+flexible than @code{Desc_Create} because you can provide your own way to
+access the tile data, so that your tiles can be allocated wherever you want
+in memory, see the next paragraph @ref{Step3}.
+@item Create your own function to fill the descriptor.
+If you understand the meaning of each item of @code{MORSE_desc_t} well, you
+should be able to fill the structure correctly (good luck).
+@end enumerate
+
+In Step2, we use the first way to create the descriptor:
+@verbatim
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N,
+                  1, 1);
+@end verbatim
+
+@itemize @bullet
+
+@item @code{descA} is the descriptor to create.
+
+@item The second argument is a pointer to existing data.
+The existing data must follow the LAPACK/PLASMA matrix layout @ref{Tile Data
+Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create
+the descriptor.
+The @code{MORSE_Desc_Create_User} function can be used if you have data
+organized differently.
+This is discussed in the next paragraph @ref{Step3}.
+Giving a @code{NULL} pointer means you let the function allocate memory space.
+This requires copying your data into the memory allocated by
+@code{Desc_Create}.
+This can be done with
+@verbatim
+MORSE_Lapack_to_Tile(A, N, descA);
+@end verbatim
+
+@item The third argument of @code{Desc_Create} is the datatype (used for
+memory allocation).
+
+@item The fourth through eighth arguments stand for, respectively, the number
+of rows (@code{NB}) and columns (@code{NB}) in each tile, the total number of
+values in a tile (@code{NB*NB}), and the number of rows (@code{N}) and columns
+(@code{N}) in the entire matrix.
+
+@item The ninth through twelfth arguments stand for, respectively, the
+beginning row (@code{0}) and column (@code{0}) indexes of the submatrix and
+the number of rows (@code{N}) and columns (@code{N}) in the submatrix.
+These arguments are specific and used in precise cases.
+If you do not consider submatrices, just use @code{0, 0, NROWS, NCOLS}.
+
+@item The last two arguments are the parameters of the 2-D block-cyclic
+distribution grid, see
+@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK}.
+To be able to use another data distribution over the nodes, the
+@code{MORSE_Desc_Create_User} function should be used.
+A complete sketch of the resulting sequence is given after this list.
+
+@end itemize
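+
+Putting these pieces together, a possible sketch of a complete Step2-like
+sequence is shown below (this is an illustration, not the exact step2
+source; @code{A} and @code{X} are LAPACK-layout arrays holding the matrix
+and the right-hand sides):
+@verbatim
+/* Sketch: create descriptors, copy LAPACK data in, factorize and
+ * solve with the tile interface, copy the result back, clean up. */
+MORSE_desc_t *descA = NULL, *descX = NULL;
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1);
+MORSE_Desc_Create(&descX, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+MORSE_Lapack_to_Tile(A, N, descA);
+MORSE_Lapack_to_Tile(X, N, descX);
+MORSE_dpotrf_Tile( UPLO, descA );
+MORSE_dpotrs_Tile( UPLO, descA, descX );
+MORSE_Tile_to_Lapack(descX, X, N);
+MORSE_Desc_Destroy(&descA);
+MORSE_Desc_Destroy(&descX);
+@end verbatim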
+
+@node Step3
+@subsubsection Step3
+
+This program makes use of the same interface as Step2 (the tile interface)
+but does not allocate LAPACK matrices anymore, so that no copy between the
+LAPACK matrix layout and the tile matrix layout is necessary to call MORSE
+routines.
+To generate random right-hand sides you can use:
+@verbatim
+/* Allocate memory and initialize descriptor B */
+MORSE_Desc_Create(&descB, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, NRHS,
+                  0, 0, N, NRHS, 1, 1);
+/* generate RHS with random values */
+MORSE_dplrnt_Tile( descB, 5673 );
+@end verbatim
+
+The other important point is that it is possible to create a descriptor, the
+structure necessary to call MORSE efficiently, by giving your own pointers to
+tiles if your matrix is not organized as a 1-D column-major array.
+This can be achieved with the @code{MORSE_Desc_Create_User} routine.
+Here is an example:
+@verbatim
+MORSE_Desc_Create_User(&descA, matA, MorseRealDouble,
+                       NB, NB, NB*NB, N, N,
+                       0, 0, N, N, 1, 1,
+                       user_getaddr_arrayofpointers,
+                       user_getblkldd_arrayofpointers,
+                       user_getrankof_zero);
+@end verbatim
+The first arguments are the same as those of the @code{MORSE_Desc_Create}
+routine.
+The following arguments allow you to give pointers to functions that manage
+the access to tiles from the structure given as the second argument.
+Here, for example, @code{matA} is an array containing addresses of tiles, see
+the function @code{allocate_tile_matrix} defined in @file{step3.h}.
+The three functions you have to define for @code{Desc_Create_User} are:
+@itemize @bullet
+@item a function that returns the address of tile @math{A(m,n)}, m and n
+standing for the indexes of the tile in the global matrix. Let us consider
+a @math{4x4} matrix with @math{2x2} tiles; the matrix contains four tiles of
+indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)},
+@math{A(m=1,n=1)}
+@item a function that returns the leading dimension of tile @math{A(m,*)}
+@item a function that returns the MPI rank of tile @math{A(m,n)}
+@end itemize
+Examples of these functions are visible in @file{step3.h}; a simplified
+sketch is also given at the end of this step.
+Note that the way we define these functions is related to the tile matrix
+format and to the data distribution considered.
+This example should not be used with MPI since all tiles are assigned to
+process @code{0}, which means that a potentially large amount of data would
+be transferred between nodes.
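+
+For illustration only, sketches of the three callbacks could look as
+follows; the @code{MORSE_desc_t} member names used here (@code{mat},
+@code{lmt}, @code{mb}) are assumptions and may differ in your version, so
+refer to @file{step3.h} for the actual code:
+@verbatim
+/* Sketch: tiles stored as an array of pointers (column-major order
+ * of tiles); desc->mat holds that array, desc->lmt the number of
+ * tile rows (assumed member names). */
+static void *user_getaddr_arrayofpointers(const MORSE_desc_t *desc,
+                                          int m, int n)
+{
+    double **tiles = (double **)desc->mat;
+    return (void *)tiles[(size_t)n * desc->lmt + m];
+}
+static int user_getblkldd_arrayofpointers(const MORSE_desc_t *desc, int m)
+{
+    return desc->mb;  /* each tile is stored with leading dimension mb */
+}
+static int user_getrankof_zero(const MORSE_desc_t *desc, int m, int n)
+{
+    return 0;  /* every tile owned by MPI process 0: not for MPI runs */
+}
+@end verbatim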
+
+@node Step4
+@subsubsection Step4
+This program is a copy of step2 but instead of using the tile interface, it
+uses the tile async interface.
+The goal is to exhibit the runtime synchronization barriers.
+Keep in mind that when the tile interface is called, like
+@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual
+execution and termination of all tasks, is called to ensure the
+proper completion of the algorithm (i.e. data are up-to-date).
+The code shows how to exploit the async interface to pipeline subsequent
+algorithms so that fewer synchronizations are done.
+The code becomes:
+@verbatim
+/* Morse structure containing parameters and a structure to interact with
+ * the Runtime system */
+MORSE_context_t *morse;
+/* MORSE sequence uniquely identifies a set of asynchronous function calls
+ * sharing common exception handling */
+MORSE_sequence_t *sequence = NULL;
+/* MORSE request uniquely identifies each asynchronous function call */
+MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
+int status;
+
+...
+
+morse_sequence_create(morse, &sequence);
+
+/* Factorization: */
+MORSE_dpotrf_Tile_Async( UPLO, descA, sequence, &request );
+
+/* Solve: */
+MORSE_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);
+
+/* Synchronization barrier (the runtime ensures that all submitted tasks
+ * have been terminated) */
+RUNTIME_barrier(morse);
+/* Ensure that all data processed on the GPUs we are depending on are back
+ * in main memory */
+RUNTIME_desc_getoncpu(descA);
+RUNTIME_desc_getoncpu(descX);
+
+status = sequence->status;
+
+@end verbatim
+Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed
+without synchronization, so that some tasks of @code{dpotrf} and @code{dpotrs}
+can be executed concurrently, which could increase performance.
+The async interface is very similar to the tile one.
+It is only necessary to give two new objects, @code{MORSE_sequence_t} and
+@code{MORSE_request_t}, used to handle asynchronous function calls.
+
+@center @image{potri_async,13cm,8cm}
+POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization
+barriers, courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
+
+@node Step5
+@subsubsection Step5
+
+Step5 shows how to set some important parameters.
+This program is a copy of Step4 but some additional parameters are given by
+the user.
+The parameters that can be set are:
+@itemize @bullet
+@item number of threads
+@item number of GPUs
+
+The numbers of workers can be given as arguments to the executable with the
+@option{--threads=} and @option{--gpus=} options.
+It is important to notice that we assign one thread per GPU to optimize data
+transfer between main memory and device memory.
+The number of workers of each type, @code{CPU} and @code{CUDA}, must be given
+at @code{MORSE_Init}.
+@verbatim
+if ( iparam[IPARAM_THRDNBR] == -1 ) {
+    get_thread_count( &(iparam[IPARAM_THRDNBR]) );
+    /* reserve one thread per CUDA device to optimize memory transfers */
+    iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
+}
+NCPU = iparam[IPARAM_THRDNBR];
+NGPU = iparam[IPARAM_NCUDAS];
+
+/* initialize MORSE with main parameters */
+MORSE_Init( NCPU, NGPU );
+@end verbatim
+
+@item matrix size
+@item number of right-hand sides
+@item block (tile) size
+
+The problem size is given with the @option{--n=} and @option{--nrhs=} options.
+The tile size is given with the @option{--nb=} option.
+These parameters are required to create descriptors.
+The tile size @code{NB} is a key parameter for performance since it
+defines the granularity of tasks.
+If @code{NB} is too large compared to @code{N}, there are few tasks to
+schedule; if the number of workers is large, this limits parallelism.
+On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks),
+workers might not be fed correctly and the runtime system's operations
+could represent a substantial overhead.
+A trade-off has to be found depending on many parameters: problem size,
+algorithm (which drives the data dependencies), architecture (number of
+workers, workers speed, workers uniformity, memory bus speed).
+By default it is set to 128.
+Do not hesitate to play with this parameter and compare performance on your
+machine; a simple sweep, as sketched after this list, can help.
+
+@item inner-blocking size
+
+The inner-blocking size is given with the @option{--ib=} option.
+This parameter is used by kernels (optimized algorithms applied on tiles) to
+perform subsequent operations with a data block size that fits the cache of
+the workers.
+Parameters @code{NB} and @code{IB} can be given with the @code{MORSE_Set}
+function:
+@verbatim
+MORSE_Set(MORSE_TILE_SIZE,        iparam[IPARAM_NB] );
+MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
+@end verbatim
+@end itemize
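+
+To find a good tile size on a given machine, a simple sweep over a few
+@code{NB} values is often enough; for example (the values below are
+purely illustrative):
+@example
+for nb in 64 128 256 320; do
+  ./step5 --n=8000 --nb=$nb --ib=32 --threads=8
+done
+@end example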
+
+@node Step6
+@subsubsection Step6
+
+This program is a copy of Step5 with some additional parameters to be set for
+the data distribution.
+To use this program properly, MORSE must use the StarPU runtime system and
+the MPI option must be activated at configure time.
+The data distribution used here is 2-D block-cyclic, see for example
+@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for an
+explanation.
+The user can enter the parameters of the distribution grid at execution time
+with the @option{--p=} option.
+Example using OpenMPI on four nodes with one process per node:
+@example
+mpirun -np 4 ./step6 --n=10000 --nb=320 --ib=64 \
+       --threads=8 --gpus=2 --p=2
+@end example
+
+In this program we use the tile data layout from PLASMA, so that the call
+@verbatim
+MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
+                       NB, NB, NB*NB, N, N,
+                       0, 0, N, N,
+                       GRID_P, GRID_Q,
+                       morse_getaddr_ccrb,
+                       morse_getblkldd_ccrb,
+                       morse_getrankof_2d);
+@end verbatim
+is equivalent to the following call
+@verbatim
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N,
+                  0, 0, N, N,
+                  GRID_P, GRID_Q);
+@end verbatim
+the functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb} and
+@code{morse_getrankof_2d} being the ones used in @code{Desc_Create}.
+It is interesting to notice that the code is almost the same as Step5.
+The only additional information to give is the way tiles are distributed,
+through the third function given to @code{MORSE_Desc_Create_User}.
+Here, because we have made experiments only with a 2-D block-cyclic
+distribution, we have the parameters P and Q in the interface of
+@code{Desc_Create}, but they make sense only for a 2-D block-cyclic
+distribution, i.e. when the @code{morse_getrankof_2d} function is used.
+Of course they could be used with other distributions, no longer being the
+parameters of a 2-D block-cyclic grid but of another distribution.
+
+@node Step7
+@subsubsection Step7
+
+This program is a copy of step6 with some additional calls to
+build a matrix from within Chameleon using a function provided by the user.
+This can be seen as a replacement for functions like
+@code{MORSE_dplgsy_Tile()}, which can be used to fill the matrix with random
+data, @code{MORSE_dLapack_to_Tile()}, which fills the matrix with data stored
+in a LAPACK-like buffer, or @code{MORSE_Desc_Create_User()}, which can be
+used to describe an arbitrary tile matrix structure.
+In this example, the build callback functions are just wrappers around
+@code{CORE_xxx()} functions, so the output of the program step7 should be
+exactly the same as that of step6.
+The difference is that the function used to fill the tiles is provided by the
+user, and therefore this approach is much more flexible.
+
+The new function to understand is @code{MORSE_dbuild_Tile}, e.g.
+@verbatim
+struct data_pl data_A={(double)N, 51, N};
+MORSE_dbuild_Tile(MorseUpperLower, descA, (void*)&data_A,
+                  Morse_build_callback_plgsy);
+@end verbatim
+The idea here is to let Chameleon fill the matrix data in a task-based
+fashion (in parallel) by using a function given by the user.
+First, the user should define whether all the blocks must be entirely filled
+or just the upper/lower part, with, e.g. @code{MorseUpperLower}.
+We still rely on the same structure, @code{MORSE_desc_t}, which must be
+initialized with the proper parameters, by calling for example
+@code{MORSE_Desc_Create}.
+Then, an opaque pointer is used to let the user give some extra data used by
+their function.
+The last parameter is the pointer to the user's function.
+
+@node List of available routines
+@subsection List of available routines
+
+@menu
+* Auxiliary routines:: Init, Finalize, Version, etc.
+* Descriptor routines:: To handle descriptors
+* Options routines:: To set options
+* Sequences routines:: To manage asynchronous function calls
+* Linear Algebra routines:: Computational routines
+@end menu
+
+@node Auxiliary routines
+@subsubsection Auxiliary routines
+
+Report the MORSE version number.
+@verbatim
+int MORSE_Version (int *ver_major, int *ver_minor, int *ver_micro);
+@end verbatim
+
+Initialize MORSE: initialize some parameters, initialize the runtime and/or
+MPI.
+@verbatim
+int MORSE_Init (int nworkers, int ncudas);
+@end verbatim
+
+Finalize MORSE: free some data and finalize the runtime and/or MPI.
+@verbatim
+int MORSE_Finalize (void);
+@end verbatim
+
+Return the MPI rank of the calling process.
+@verbatim
+int MORSE_My_Mpi_Rank (void);
+@end verbatim
+
+Suspend the MORSE runtime from polling for new tasks, to avoid useless CPU
+consumption when no tasks have to be executed by the MORSE runtime system.
+@verbatim
+int MORSE_Pause (void);
+@end verbatim
+
+Symmetrical call to @code{MORSE_Pause}, used to resume the workers polling
+for new tasks.
+@verbatim
+int MORSE_Resume (void);
+@end verbatim
+
+Conversion from LAPACK layout to tile layout.
+@verbatim
+int MORSE_Lapack_to_Tile (void *Af77, int LDA, MORSE_desc_t *A);
+@end verbatim
+
+Conversion from tile layout to LAPACK layout.
+@verbatim
+int MORSE_Tile_to_Lapack (MORSE_desc_t *A, void *Af77, int LDA);
+@end verbatim
+
+@node Descriptor routines
+@subsubsection Descriptor routines
+
+@c /* Descriptor */
+Create matrix descriptor, internal function.
+@verbatim
+int MORSE_Desc_Create (MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
+                       int mb, int nb, int bsiz, int lm, int ln,
+                       int i, int j, int m, int n, int p, int q);
+@end verbatim
+
+Create matrix descriptor, user function.
+@verbatim
+int MORSE_Desc_Create_User(MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
+                           int mb, int nb, int bsiz, int lm, int ln,
+                           int i, int j, int m, int n, int p, int q,
+                           void* (*get_blkaddr)( const MORSE_desc_t*, int, int),
+                           int (*get_blkldd)( const MORSE_desc_t*, int ),
+                           int (*get_rankof)( const MORSE_desc_t*, int, int ));
+@end verbatim
+
+Destroy matrix descriptor.
+@verbatim
+int MORSE_Desc_Destroy (MORSE_desc_t **desc);
+@end verbatim
+
+Ensure that all data are up-to-date in main memory (even if some tasks have
+been processed on GPUs).
+@verbatim
+int MORSE_Desc_Getoncpu(MORSE_desc_t *desc);
+@end verbatim
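+
+For illustration, a typical descriptor life cycle combining these routines
+could look as follows (a sketch, reusing the conventions of the tutorial):
+@verbatim
+/* Sketch: create a descriptor, compute, fetch data back, destroy. */
+MORSE_desc_t *descA = NULL;
+MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
+                  NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1);
+/* ... submit computations on descA ... */
+MORSE_Desc_Getoncpu(descA);   /* make data up-to-date in main memory */
+MORSE_Desc_Destroy(&descA);
+@end verbatim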
+
+@node Options routines
+@subsubsection Options routines
+
+@c /* Options */
+Enable a MORSE feature.
+@verbatim
+int MORSE_Enable (MORSE_enum option);
+@end verbatim
+Features that can be enabled:
+@itemize @bullet
+@item @code{MORSE_WARNINGS}: printing of warning messages,
+@item @code{MORSE_ERRORS}: printing of error messages,
+@item @code{MORSE_AUTOTUNING}: autotuning for tile size and inner block size,
+@item @code{MORSE_PROFILING_MODE}: activate kernel profiling.
+@end itemize
+
+Disable a MORSE feature.
+@verbatim
+int MORSE_Disable (MORSE_enum option);
+@end verbatim
+Symmetric to @code{MORSE_Enable}.
+
+Set a MORSE parameter.
+@verbatim
+int MORSE_Set (MORSE_enum param, int value);
+@end verbatim
+Parameters that can be set:
+@itemize @bullet
+@item @code{MORSE_TILE_SIZE}: size of a matrix tile,
+@item @code{MORSE_INNER_BLOCK_SIZE}: size of a tile inner block,
+@item @code{MORSE_HOUSEHOLDER_MODE}: type of Householder trees (FLAT or TREE),
+@item @code{MORSE_HOUSEHOLDER_SIZE}: size of the groups in Householder trees,
+@item @code{MORSE_TRANSLATION_MODE}: related to
+@code{MORSE_Lapack_to_Tile}, see @file{ztile.c}.
+@end itemize
+
+Get the value of a MORSE parameter.
+@verbatim
+int MORSE_Get (MORSE_enum param, int *value);
+@end verbatim
+
+@node Sequences routines
+@subsubsection Sequences routines
+
+@c /* Sequences */
+Create a sequence.
+@verbatim
+int MORSE_Sequence_Create (MORSE_sequence_t **sequence);
+@end verbatim
+
+Destroy a sequence.
+@verbatim
+int MORSE_Sequence_Destroy (MORSE_sequence_t *sequence);
+@end verbatim
+
+Wait for the completion of a sequence.
+@verbatim
+int MORSE_Sequence_Wait (MORSE_sequence_t *sequence);
+@end verbatim
+
+@node Linear Algebra routines
+@subsubsection Linear Algebra routines
+
+The available routines computing linear algebra, of the form
+@code{MORSE_name[_Tile[_Async]]} (@code{name} follows the LAPACK naming
+scheme, see @uref{http://www.netlib.org/lapack/lug/node24.html}), are:
+
+@verbatim
+/** ********************************************************
+ * Declarations of computational functions (LAPACK layout)
+ **/
+
+int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT);
+
+int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS,
+                MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT,
+                MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K,
+                MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA,
+                MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta,
+                MORSE_Complex64_t *C, int LDC);
+
+int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT);
+
+int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                 MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                       MORSE_desc_t *descL, int *IPIV,
+                       MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA,
+                      MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA,
+                        MORSE_desc_t *descL, int *IPIV);
+
+int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA);
+
+int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS,
+                        MORSE_Complex64_t *A, int LDA,
+                        MORSE_desc_t *descL, int *IPIV,
+                        MORSE_Complex64_t *B, int LDB);
+
+int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS,
+                       MORSE_Complex64_t *A, int LDA,
+                       MORSE_Complex64_t *B, int LDB);
+
+#ifdef COMPLEX
+int MORSE_zhemm(MORSE_enum side,
MORSE_enum uplo, int M, int N, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, + MORSE_Complex64_t *C, int LDC); + +int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K, + double alpha, MORSE_Complex64_t *A, int LDA, + double beta, MORSE_Complex64_t *C, int LDC); + +int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, double beta, + MORSE_Complex64_t *C, int LDC); +#endif + +int MORSE_zlacpy(MORSE_enum uplo, int M, int N, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +double MORSE_zlange(MORSE_enum norm, int M, int N, + MORSE_Complex64_t *A, int LDA); + +#ifdef COMPLEX +double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA); +#endif + +double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA); + +double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag, + int M, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha, + MORSE_Complex64_t beta, MORSE_Complex64_t *A, int LDA); + +int MORSE_zlauum(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +#ifdef COMPLEX +int MORSE_zplghe( double bump, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA, + unsigned long long int seed ); +#endif + +int MORSE_zplgsy( MORSE_Complex64_t bump, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA, + unsigned long long int seed ); + +int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA, + unsigned long long int seed ); + +int MORSE_zposv(MORSE_enum uplo, int N, int NRHS, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_zpotrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zsytrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zpotri(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); + +int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +#if defined (PRECISION_c) || defined(PRECISION_z) +int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, + MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); +#endif + +int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, + MORSE_Complex64_t *C, int LDC); + +int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC); + +int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, + MORSE_Complex64_t *C, int LDC); + +int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + int N, int NRHS, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + int N, int NRHS, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descL, int *IPIV, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum 
uplo, + MORSE_enum transA, MORSE_enum diag, + int N, int NRHS, + MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); + +int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N, + MORSE_Complex64_t *A, int LDA); + +int MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); + +int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); + +int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K, + MORSE_Complex64_t *A, int LDA, + MORSE_desc_t *descT, + MORSE_Complex64_t *B, int LDB); + +int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K, + MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, + MORSE_Complex64_t *B, int LDB); + +/** ****************************************************** + * Declarations of computational functions (tile layout) + **/ + +int MORSE_zgelqf_Tile(MORSE_desc_t *A, MORSE_desc_t *T); + +int MORSE_zgelqs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); + +int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B); + +int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C); + +int MORSE_zgeqrf_Tile(MORSE_desc_t *A, MORSE_desc_t *T); + +int MORSE_zgeqrs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); + +int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, + MORSE_desc_t *B); + +int MORSE_zgesv_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); + +int MORSE_zgetrf_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV); + +int MORSE_zgetrf_nopiv_Tile(MORSE_desc_t *A); + +int MORSE_zgetrs_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, + MORSE_desc_t *B); + +int MORSE_zgetrs_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); + +#ifdef COMPLEX +int MORSE_zhemm_Tile(MORSE_enum side, MORSE_enum uplo, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C); + +int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans, + double alpha, MORSE_desc_t *A, + double beta, MORSE_desc_t *C); + +int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, double beta, MORSE_desc_t *C); +#endif + +int MORSE_zlacpy_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); + +double MORSE_zlange_Tile(MORSE_enum norm, MORSE_desc_t *A); + +#ifdef COMPLEX +double MORSE_zlanhe_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A); +#endif + +double MORSE_zlansy_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A); + +double MORSE_zlantr_Tile(MORSE_enum norm, MORSE_enum uplo, + MORSE_enum diag, MORSE_desc_t *A); + +int MORSE_zlaset_Tile(MORSE_enum uplo, MORSE_Complex64_t alpha, + MORSE_Complex64_t beta, MORSE_desc_t *A); + +int MORSE_zlauum_Tile(MORSE_enum uplo, MORSE_desc_t *A); + +#ifdef COMPLEX +int MORSE_zplghe_Tile(double bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed); +#endif + +int MORSE_zplgsy_Tile(MORSE_Complex64_t bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed ); + +int MORSE_zplrnt_Tile(MORSE_desc_t *A, unsigned long long int seed ); + +int MORSE_zposv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); + +int MORSE_zpotrf_Tile(MORSE_enum uplo, MORSE_desc_t *A); + +int MORSE_zsytrf_Tile(MORSE_enum uplo, MORSE_desc_t *A); + +int 
MORSE_zpotri_Tile(MORSE_enum uplo, MORSE_desc_t *A);
+
+int MORSE_zpotrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
+
+#if defined (PRECISION_c) || defined(PRECISION_z)
+int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
+#endif
+
+int MORSE_zsymm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B, MORSE_Complex64_t beta,
+                     MORSE_desc_t *C);
+
+int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_Complex64_t beta, MORSE_desc_t *C);
+
+int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans,
+                      MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                      MORSE_desc_t *B, MORSE_Complex64_t beta,
+                      MORSE_desc_t *C);
+
+int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_enum transA, MORSE_enum diag,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B);
+
+int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo,
+                     MORSE_enum transA, MORSE_enum diag,
+                     MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                     MORSE_desc_t *B);
+
+int MORSE_ztrsmpl_Tile(MORSE_desc_t *A, MORSE_desc_t *L,
+                       int *IPIV, MORSE_desc_t *B);
+
+int MORSE_ztrsmrv_Tile(MORSE_enum side, MORSE_enum uplo,
+                       MORSE_enum transA, MORSE_enum diag,
+                       MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                       MORSE_desc_t *B);
+
+int MORSE_ztrtri_Tile(MORSE_enum uplo, MORSE_enum diag, MORSE_desc_t *A);
+
+int MORSE_zunglq_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
+
+int MORSE_zungqr_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B);
+
+int MORSE_zunmlq_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A,
+                      MORSE_desc_t *T, MORSE_desc_t *B);
+
+int MORSE_zunmqr_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A,
+                      MORSE_desc_t *T, MORSE_desc_t *B);
+
+/** ****************************************
+ * Declarations of computational functions
+ * (tile layout, asynchronous execution)
+ **/
+
+int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A,
+                           MORSE_desc_t *T, MORSE_desc_t *B,
+                           MORSE_sequence_t *sequence,
+                           MORSE_request_t *request);
+
+int MORSE_zgemm_Tile_Async(MORSE_enum transA, MORSE_enum transB,
+                           MORSE_Complex64_t alpha, MORSE_desc_t *A,
+                           MORSE_desc_t *B, MORSE_Complex64_t beta,
+                           MORSE_desc_t *C, MORSE_sequence_t *sequence,
+                           MORSE_request_t *request);
+
+int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
+                            MORSE_desc_t *B,
+                            MORSE_sequence_t *sequence,
+                            MORSE_request_t *request);
+
+int MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                  int *IPIV, MORSE_desc_t *B,
+                                  MORSE_sequence_t *sequence,
+                                  MORSE_request_t *request);
+
+int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B,
+                                 MORSE_sequence_t *sequence,
+                                 MORSE_request_t *request);
+
+int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                   int *IPIV, MORSE_sequence_t *sequence,
+                                   MORSE_request_t *request);
+
+int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A,
+                                  MORSE_sequence_t *sequence,
+                                  MORSE_request_t *request);
+
+int MORSE_zgetrs_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L,
+                                   int *IPIV, MORSE_desc_t *B,
+                                   MORSE_sequence_t *sequence,
+                                   MORSE_request_t *request);
+
+int
MORSE_zgetrs_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#ifdef COMPLEX +int MORSE_zhemm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + double alpha, MORSE_desc_t *A, + double beta, MORSE_desc_t *C, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, double beta, MORSE_desc_t *C, + MORSE_sequence_t *sequence, + MORSE_request_t *request); +#endif + +int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#ifdef COMPLEX +int MORSE_zlanhe_Tile_Async(MORSE_enum norm, MORSE_enum uplo, + MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); +#endif + +int MORSE_zlansy_Tile_Async(MORSE_enum norm, MORSE_enum uplo, + MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlantr_Tile_Async(MORSE_enum norm, MORSE_enum uplo, + MORSE_enum diag, MORSE_desc_t *A, double *value, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlaset_Tile_Async(MORSE_enum uplo, MORSE_Complex64_t alpha, + MORSE_Complex64_t beta, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zlauum_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#ifdef COMPLEX +int MORSE_zplghe_Tile_Async(double bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed, + MORSE_sequence_t *sequence, + MORSE_request_t *request ); +#endif + +int MORSE_zplgsy_Tile_Async(MORSE_Complex64_t bump, MORSE_enum uplo, MORSE_desc_t *A, + unsigned long long int seed, + MORSE_sequence_t *sequence, + MORSE_request_t *request ); + +int MORSE_zplrnt_Tile_Async(MORSE_desc_t *A, unsigned long long int seed, + MORSE_sequence_t *sequence, + MORSE_request_t *request ); + +int MORSE_zposv_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zpotrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zsytrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zpotri_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zpotrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +#if defined (PRECISION_c) || defined(PRECISION_z) +int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); +#endif + +int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t 
alpha, MORSE_desc_t *A, + MORSE_Complex64_t beta, MORSE_desc_t *C, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_Complex64_t beta, + MORSE_desc_t *C, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrsmpl_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrsmrv_Tile_Async(MORSE_enum side, MORSE_enum uplo, + MORSE_enum transA, MORSE_enum diag, + MORSE_Complex64_t alpha, MORSE_desc_t *A, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_ztrtri_Tile_Async(MORSE_enum uplo, MORSE_enum diag, + MORSE_desc_t *A, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, + MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans, + MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans, + MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_desc_t *B, MORSE_sequence_t *sequence, + MORSE_request_t *request); + +@end verbatim + +@c -nofor_main diff --git a/doc/orgmode/figures/morse_header.png b/doc/orgmode/figures/morse_header.png new file mode 100644 index 0000000000000000000000000000000000000000..ada315a235dfd4ee4a35064e13ae0d680b480059 Binary files /dev/null and b/doc/orgmode/figures/morse_header.png differ diff --git a/doc/orgmode/figures/potri_async.png b/doc/orgmode/figures/potri_async.png new file mode 100644 index 0000000000000000000000000000000000000000..85ebe6ad9af3db6070cd898323400a8a584b7583 Binary files /dev/null and b/doc/orgmode/figures/potri_async.png differ diff --git a/doc/orgmode/figures/tile_layout.jpg b/doc/orgmode/figures/tile_layout.jpg new file mode 100644 index 0000000000000000000000000000000000000000..16a44b08afab7de2c15a75f200baf210c7fe6d3e Binary files /dev/null and b/doc/orgmode/figures/tile_layout.jpg differ diff --git a/doc/orgmode/figures/tile_layout.pdf b/doc/orgmode/figures/tile_layout.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f5df80dbe06de18346c1df6c14a20c6e1c24edd1 Binary files /dev/null and b/doc/orgmode/figures/tile_layout.pdf differ diff --git a/doc/orgmode/figures/tile_lu.jpg b/doc/orgmode/figures/tile_lu.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9da660ab607fae57cec55eb3c8ddc0512ea7fd62 Binary files /dev/null and b/doc/orgmode/figures/tile_lu.jpg differ diff --git a/doc/orgmode/figures/tile_lu.pdf b/doc/orgmode/figures/tile_lu.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c9b6df65197c83449c6335ebb1da393d92cd683f Binary files /dev/null and 
b/doc/orgmode/figures/tile_lu.pdf differ diff --git a/doc/orgmode/figures/trace_qr.jpg b/doc/orgmode/figures/trace_qr.jpg new file mode 100644 index 0000000000000000000000000000000000000000..92504d096fe829e3a0d9f2a296262c00cef3e792 Binary files /dev/null and b/doc/orgmode/figures/trace_qr.jpg differ diff --git a/doc/orgmode/figures/trace_qr.pdf b/doc/orgmode/figures/trace_qr.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e030af5e252dd26828aa156e17c6b1d240a534db Binary files /dev/null and b/doc/orgmode/figures/trace_qr.pdf differ diff --git a/doc/orgmode/morse.css b/doc/orgmode/morse.css new file mode 100644 index 0000000000000000000000000000000000000000..41adb20078f7e5b0af0af434fa51bdb361af022f --- /dev/null +++ b/doc/orgmode/morse.css @@ -0,0 +1,72 @@ +body { + padding: 2em 1em 2em 70px; + margin: 0; + font-family: sans-serif; + color: black; + background: white; + background-position: top left; + background-attachment: fixed; + background-repeat: no-repeat; +} +:link { color: #00C; background: transparent } +:visited { color: #609; background: transparent } +a:active { color: #C00; background: transparent } + +a:link img, a:visited img { border-style: none } + +a img { color: white; } +@media all { + a img { color: inherit; } +} + +th, td { + font-family: sans-serif; +} + +h1, h2, h3, h4, h5, h6 { text-align: left } +h1, h2, h3 { color: #005A9C; background: white } +h1 { font: 170% sans-serif } +h2 { font: 140% sans-serif } +h3 { font: 120% sans-serif } +h4 { font: bold 100% sans-serif } +h5 { font: italic 100% sans-serif } +h6 { font: small-caps 100% sans-serif } + +.hide { display: none } + +div.head { margin-bottom: 1em } +div.head h1 { margin-top: 2em; clear: both } +div.head table { margin-left: 2em; margin-top: 2em } + +p.copyright { font-size: small } +p.copyright small { font-size: small } + +@media screen { +a[href]:hover { background: #ffa } +} + +pre { margin-left: 2em } + +dt, dd { margin-top: 0; margin-bottom: 0 } +dt { font-weight: bold } + +pre, code { font-family: monospace } + +ul.toc, ol.toc { + list-style: disc; + list-style: none; +} + +@media aural { + h1, h2, h3 { stress: 20; richness: 90 } + .hide { speak: none } + p.copyright { volume: x-soft; speech-rate: x-fast } + dt { pause-before: 20% } + pre { speak-punctuation: code } +} + +/* +body { + background-image: url(); +} +*/ diff --git a/doc/orgmode/users_guide.org.in b/doc/orgmode/users_guide.org.in new file mode 100644 index 0000000000000000000000000000000000000000..517ea729681ccd54a7371bc66ca5cd33bc4bd795 --- /dev/null +++ b/doc/orgmode/users_guide.org.in @@ -0,0 +1,54 @@ +#+TITLE: CHAMELEON User's Guide +#+SUBTITLE: A dense linear algebra software for heterogeneous architectures +#+LANGUAGE: en +#+OPTIONS: H:3 num:t \n:nil @:t ::t |:t _:nil ^:nil -:t f:t *:t <:t +#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil pri:nil tags:not-in-toc html-style:nil +#+INCLUDE: "./version.org" +#+AUTHOR: version {{{VERSION}}} +* Version + This manual documents the usage of CHAMELEON *version {{{VERSION}}}*. + It was last updated on {{{UPDATED}}}. 
+* Authors + * Inria, + * University of Tennessee, + * University of Colorado Denver, + * King Abdullah University of Science and Technology +* Copying + + Copyright \copy 2017 Inria + + Copyright \copy 2014 The University of Tennessee + + Copyright \copy 2014 King Abdullah University of Science and Technology + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer listed + in this license in the documentation and/or other materials provided + with the distribution. + - Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + This software is provided by the copyright holders and contributors + "as is" and any express or implied warranties, including, but not + limited to, the implied warranties of merchantability and fitness for + a particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. +* Introduction to Chameleon +#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/introduction.org +* Installing Chameleon +#+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/installing.org +# #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/configuration.org +# #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org diff --git a/doc/orgmode/version.org.in b/doc/orgmode/version.org.in new file mode 100644 index 0000000000000000000000000000000000000000..4481ea26c10481ec5021c69c06ac1027704296e9 --- /dev/null +++ b/doc/orgmode/version.org.in @@ -0,0 +1,4 @@ +#+MACRO: UPDATED 25 August 2017 +#+MACRO: UPDATED-MONTH August 2017 +#+MACRO: EDITION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@ +#+MACRO: VERSION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@