diff --git a/READMEDEV.org b/READMEDEV.org index 486662318f704fa4113176ee322d308cf50ddbd3..7c7866be45d4b1bb6ba95f32b5083d2ba352e3c6 100644 --- a/READMEDEV.org +++ b/READMEDEV.org @@ -31,11 +31,11 @@ developers must follow and that should be read by contributors. *** Prerequisites To generate the documentation you need to have [[http://www.stack.nl/~dimitri/doxygen/][Doxygen]] and - [[https://www.gnu.org/software/texinfo/][Texinfo]] installed on your system. + [[https://orgmode.org/][org-mode]] installed on your system. For example, on Debian systems: #+begin_src sh - sudo apt install doxygen texinfo texlive texlive-latex-extra emacs + sudo apt install doxygen org-mode texlive texlive-latex-extra emacs #+end_src *** configure + make documentation diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 3f96ef056b0a3663480f4747e1a80f9f39707d83..dc2b7c8fcd9b450ea770147cede32614bbfd0803 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -35,7 +35,6 @@ cmake_minimum_required(VERSION 2.8) ############################################# add_subdirectory(doxygen) add_subdirectory(orgmode) -#add_subdirectory(texinfo) ### ### END CMakeLists.txt ### diff --git a/doc/texinfo/CMakeLists.txt b/doc/texinfo/CMakeLists.txt deleted file mode 100644 index 155151b895e1ee27e97a5fc395900081d0639781..0000000000000000000000000000000000000000 --- a/doc/texinfo/CMakeLists.txt +++ /dev/null @@ -1,132 +0,0 @@ -### -# -# @file CMakeLists.txt -# -# @copyright 2009-2014 The University of Tennessee and The University of -# Tennessee Research Foundation. All rights reserved. -# @copyright 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, -# Univ. Bordeaux. All rights reserved. -# -### -# -# @project MORSE -# MORSE is a software package provided by: -# Inria Bordeaux - Sud-Ouest, -# Univ. of Tennessee, -# King Abdullah Univesity of Science and Technology -# Univ. of California Berkeley, -# Univ. of Colorado Denver. 
-# -# @version 1.0.0 -# @author Cedric Castagnede -# @author Emmanuel Agullo -# @author Mathieu Faverge -# @author Florent Pruvost -# @date 2012-07-13 -# -### - -cmake_minimum_required(VERSION 2.8) - -# Create file version.texi -# ------------------------ -configure_file("version.texi.in" - "version.texi" - @ONLY) -configure_file("users_guide.texi.in" - "users_guide.texi" - @ONLY) - -set(FIGURES - tile_lu.pdf - tile_lu.jpg - tile_layout.pdf - tile_layout.jpg - trace_qr.pdf - trace_qr.jpg - potri_async.png - morse_header.png - ) -set(FIGURES_HTML - tile_lu.jpg - tile_layout.jpg - trace_qr.jpg - potri_async.png - morse_header.png - ) - -foreach(_fig ${FIGURES}) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/figures/${_fig} - ${CMAKE_CURRENT_BINARY_DIR}/${_fig} - COPYONLY) -endforeach() - -# Looking which version we can compile -# ------------------------------------ -FIND_PROGRAM(MAKEINFO_COMPILER makeinfo) -FIND_PROGRAM(TEXI2DVI_COMPILER texi2dvi) -FIND_PROGRAM(TEX_COMPILER tex) -FIND_PROGRAM(DOT_COMPILER dot) - -# Looking for makeinfo -# -------------------- -if(MAKEINFO_COMPILER) - # Add target - # ---------- - add_custom_command(OUTPUT users_guide.info - COMMAND ${MAKEINFO_COMPILER} - ARGS users_guide.texi - DEPENDS users_guide.texi.in - ) - add_custom_command(OUTPUT users_guide.html - COMMAND ${MAKEINFO_COMPILER} - ARGS --html - --no-split - --css-include=${CMAKE_CURRENT_SOURCE_DIR}/morse.css - users_guide.texi - DEPENDS users_guide.texi.in - ) - add_custom_target(doc-info-users_guide ALL DEPENDS users_guide.info) - add_custom_target(doc-html-users_guide ALL DEPENDS users_guide.html) - - # Installation - # ------------ - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.info - DESTINATION share/chameleon/info) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.html - DESTINATION share/chameleon/html) - foreach(_fig ${FIGURES_HTML}) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_fig} - DESTINATION share/chameleon/html) - endforeach() - 
-else(MAKEINFO_COMPILER) - message(STATUS "Looking for makeinfo - not found") -endif(MAKEINFO_COMPILER) - -# Looking for texi2dvi -# -------------------- -if(TEXI2DVI_COMPILER AND TEX_COMPILER) - # Add target - # ---------- - add_custom_command(OUTPUT users_guide.pdf - COMMAND ${TEXI2DVI_COMPILER} - ARGS --pdf - --batch - users_guide.texi - DEPENDS users_guide.texi.in - ) - add_custom_target(doc-pdf-users_guide ALL DEPENDS users_guide.pdf) - - # Installation - # ------------ - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/users_guide.pdf - DESTINATION share/chameleon/pdf) - -else() - message(STATUS "Looking for texi2dvi - not found") -endif() - -### -### END CMakeLists.txt -### diff --git a/doc/texinfo/chapters/configuration.texi b/doc/texinfo/chapters/configuration.texi deleted file mode 100644 index df23adb6f3088b931be4a876cc546b360ac78cc0..0000000000000000000000000000000000000000 --- a/doc/texinfo/chapters/configuration.texi +++ /dev/null @@ -1,360 +0,0 @@ -@c -*-texinfo-*- - -@c This file is part of the MORSE Handbook. -@c Copyright (C) 2017 Inria -@c Copyright (C) 2014 The University of Tennessee -@c Copyright (C) 2014 King Abdullah University of Science and Technology -@c See the file ../chameleon.texi for copying conditions. - -@menu -* Compilation configuration:: -* Dependencies detection:: -@c * Dependencies compilation:: -* Use FxT profiling through StarPU:: -* Use simulation mode with StarPU-SimGrid:: -* Use out of core support with StarPU:: -@end menu - -@c @code{} @option{} -@c @table @code -@c @item truc -@c @item muche -@c @item et zut -@c @c @end table - -@node Compilation configuration -@section Compilation configuration - -The following arguments can be given to the @command{cmake <path to source -directory>} script. 
- -In this chapter, the following convention is used: -@itemize @bullet -@item -@option{path} is a path in your filesystem, -@item -@option{var} is a string and the correct value or an example will be given, -@item -@option{trigger} is an CMake option and the correct value is @code{ON} or -@code{OFF}. -@end itemize - -Using CMake there are several ways to give options: -@enumerate -@item directly as CMake command line arguments -@item invoque @command{cmake <path to source directory>} once and then use -@command{ccmake <path to source directory>} to edit options through a -minimalist gui (required -@samp{cmake-curses-gui} installed on a Linux system) -@item invoque @command{cmake-gui} command and fill information about the -location of the sources and where to build the project, then you have -access to options through a user-friendly Qt interface (required -@samp{cmake-qt-gui} installed on a Linux system) -@end enumerate - -Example of configuration using the command line -@example -cmake ~/chameleon/ -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_INSTALL_PREFIX=~/install \ - -DCHAMELEON_USE_CUDA=ON \ - -DCHAMELEON_USE_MPI=ON \ - -DBLA_VENDOR=Intel10_64lp \ - -DSTARPU_DIR=~/install/starpu-1.1 \ - -DCHAMELEON_ENABLE_TRACING=ON -@end example - -You can get the full list of options with @option{-L[A][H]} options of -@command{cmake} command: -@example -cmake -LH <path to source directory> -@end example - -@menu -* General CMake options:: -* CHAMELEON options:: -@end menu - -@node General CMake options -@subsection General CMake options - -@table @code - -@item -DCMAKE_INSTALL_PREFIX=@option{path} (default:@option{path=/usr/local}) -Install directory used by @code{make install} where some headers and libraries -will be copied. -Permissions have to be granted to write onto @option{path} during @code{make -install} step. - -@item -DCMAKE_BUILD_TYPE=@option{var} (default: @option{Release}) -Define the build type and the compiler optimization level. 
-The possible values for @option{var} are: -@table @code -@item empty -@item Debug -@item Release -@item RelWithDebInfo -@item MinSizeRel -@end table - -@item -DBUILD_SHARED_LIBS=@option{trigger} (default:@option{OFF}) -Indicate wether or not CMake has to build CHAMELEON static (@option{OFF}) or -shared (@option{ON}) libraries. - -@end table - -@node CHAMELEON options -@subsection CHAMELEON options - -List of CHAMELEON options that can be enabled/disabled (value=@code{ON} -or @code{OFF}): -@table @code - -@item @option{-DCHAMELEON_SCHED_STARPU}=@option{trigger} (default: @code{ON}) -to link with StarPU library (runtime system) - -@item @option{-DCHAMELEON_SCHED_QUARK}=@option{trigger} (default: @code{OFF}) -to link with QUARK library (runtime system) - -@item @option{-DCHAMELEON_USE_CUDA}=@option{trigger} (default: @code{OFF}) -to link with CUDA runtime (implementation paradigm for accelerated codes on -GPUs) and cuBLAS library (optimized BLAS kernels on GPUs), can only be used with -StarPU - -@item @option{-DCHAMELEON_USE_MPI}=@option{trigger} (default: @code{OFF}) -to link with MPI library (message passing implementation for use of multiple -nodes with distributed memory), can only be used with StarPU - -@item @option{-DCHAMELEON_ENABLE_TRACING}=@option{trigger} (default: @code{OFF}) -to enable trace generation during execution of timing drivers. -It requires StarPU to be linked with FxT library (trace execution of kernels on workers). - -@item @option{-DCHAMELEON_SIMULATION=trigger} (default: @code{OFF}) -to enable simulation mode, means CHAMELEON will not really execute tasks, -see details in section @ref{Use simulation mode with StarPU-SimGrid}. -This option must be used with StarPU compiled with -@uref{http://simgrid.gforge.inria.fr/, SimGrid} allowing to guess the -execution time on any architecture. -This feature should be used to make experiments on the scheduler behaviors and -performances not to produce solutions of linear systems. 
- -@item @option{-DCHAMELEON_ENABLE_DOCS=trigger} (default: @code{ON}) -@item @option{-DCHAMELEON_ENABLE_EXAMPLE=trigger} (default: @code{ON}) -to control build of the examples executables (API usage) -@item @option{-DCHAMELEON_ENABLE_TESTING=trigger} (default: @code{ON}) -to control build of testing executables (numerical check) contained in -@item @option{-DCHAMELEON_ENABLE_TIMING=trigger} (default: @code{ON}) -to control build of timing executables (performances check) contained in - -@item @option{-DCHAMELEON_PREC_S=trigger} (default: @code{ON}) -to enable the support of simple arithmetic precision (float in C) -@item @option{-DCHAMELEON_PREC_D=trigger} (default: @code{ON}) -to enable the support of double arithmetic precision (double in C) -@item @option{-DCHAMELEON_PREC_C=trigger} (default: @code{ON}) -to enable the support of complex arithmetic precision (complex in C) -@item @option{-DCHAMELEON_PREC_Z=trigger} (default: @code{ON}) -to enable the support of double complex arithmetic precision (double complex -in C) - -@item @option{-DBLAS_VERBOSE=trigger} (default: @code{OFF}) -to make BLAS library discovery verbose -@item @option{-DLAPACK_VERBOSE=trigger} (default: @code{OFF}) -to make LAPACK library discovery verbose (automatically enabled if -@option{BLAS_VERBOSE=@code{ON}}) -@end table - -List of CHAMELEON options that needs a specific value: -@table @code -@item @option{-DBLA_VENDOR=@option{var}} (default: @option{empty}) -The possible values for @option{var} are: -@table @code -@item empty -@item all -@item Intel10_64lp -@item Intel10_64lp_seq -@item ACML -@item Apple -@item Generic -@item ... -@end table -to force CMake to find a specific BLAS library, see the full list of BLA_VENDOR -By default @option{BLA_VENDOR} is empty so that CMake tries to detect all -possible BLAS vendor with a preference for Intel MKL. 
-@end table - -List of CHAMELEON options which requires to give a path: -@table @code -@item @option{-DLIBNAME_DIR=@option{path}} (default: empty) -root directory of the LIBNAME library installation -@item @option{-DLIBNAME_INCDIR=@option{path}} (default: empty) -directory of the LIBNAME library headers installation -@item @option{-DLIBNAME_LIBDIR=@option{path}} (default: empty) -directory of the LIBNAME libraries (.so, .a, .dylib, etc) installation -@end table -LIBNAME can be one of the following: BLAS - CBLAS - FXT - HWLOC - -LAPACK - LAPACKE - QUARK - STARPU - TMG. -See paragraph about @ref{Dependencies detection} for details. - -Libraries detected with an official CMake module (see module files in -@itemize @bullet -@item CUDA -@item MPI -@item Threads -@end itemize - -Libraries detected with CHAMELEON cmake modules (see module files in -@itemize @bullet -@item BLAS -@item CBLAS -@item FXT -@item HWLOC -@item LAPACK -@item LAPACKE -@item QUARK -@item STARPU -@item TMG -@end itemize - - -@node Dependencies detection -@section Dependencies detection -You have different choices to detect dependencies on your system, either by -setting some environment variables containing paths to the libs and headers or -by specifying them directly at cmake configure. 
-Different cases : -@enumerate -@item detection of dependencies through environment variables: - @itemize @bullet - @item @env{LD_LIBRARY_PATH} environment variable should contain the list of -paths -where to find the libraries: - @example - export @env{LD_LIBRARY_PATH}=$@env{LD_LIBRARY_PATH}:path/to/your/libs - @end example - @item @env{INCLUDE} environment variable should contain the list of paths -where to find the header files of libraries - @example - export @env{INCLUDE}=$@env{INCLUDE}:path/to/your/headers - @end example - @end itemize - -@item detection with user's given paths: - @itemize @bullet - @item you can specify the path at cmake configure by invoking - @example - cmake <path to SOURCE_DIR> -DLIBNAME_DIR=path/to/your/lib - @end example - where LIB stands for the name of the lib to look for, example - @example - cmake <path to SOURCE_DIR> -DSTARPU_DIR=path/to/starpudir \ - -DCBLAS_DIR= ... - @end example - @item it is also possible to specify headers and library directories -separately, example - @example - cmake <path to SOURCE_DIR> \ - -DSTARPU_INCDIR=path/to/libstarpu/include/starpu/1.1 \ - -DSTARPU_LIBDIR=path/to/libstarpu/lib - @end example - @item Note BLAS and LAPACK detection can be tedious so that we provide a -verbose mode. Use @option{-DBLAS_VERBOSE=ON} or @option{-DLAPACK_VERBOSE=ON} to -enable it. - @end itemize - -@end enumerate - - -@c @node Dependencies compilation -@c @section Dependencies compilation - -@node Use FxT profiling through StarPU -@section Use FxT profiling through StarPU - -StarPU can generate its own trace log files by compiling it with the -@option{--with-fxt} -option at the configure step (you can have to specify the directory where you -installed FxT by giving @option{--with-fxt=...} instead of @option{--with-fxt} -alone). -By doing so, traces are generated after each execution of a program which uses -StarPU in the directory pointed by the @env{STARPU_FXT_PREFIX} environment -variable. 
Example: -@example -export @env{STARPU_FXT_PREFIX}=/home/yourname/fxt_files/ -@end example - -When executing a @command{./timing/...} CHAMELEON program, if it has been -enabled (StarPU compiled with FxT and @option{-DCHAMELEON_ENABLE_TRACING=ON}), you -can give the option @option{--trace} to tell the program to generate trace log -files. - -Finally, to generate the trace file which can be opened with -@uref{http://vite.gforge.inria.fr/, Vite} program, you have to use the -@command{starpu_fxt_tool} executable of StarPU. -You can use it to generate the trace file like this: -@itemize @bullet -@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename} - -There is one file per mpi processus (prof_filename_0, prof_filename_1 ...). -To generate a trace of mpi programs you can call it like this: -@item @command{path/to/your/install/starpu/bin/starpu_fxt_tool -i -prof_filename*} - -The trace file will be named paje.trace (use -o option to specify an output -name). -@end itemize - -Alternatively, one can also generate directly .paje trace files after the execution -by setting @env{STARPU_GENERATE_TRACE=1}. - -@node Use simulation mode with StarPU-SimGrid -@section Use simulation mode with StarPU-SimGrid - -Simulation mode can be enabled by setting the cmake option -@option{-DCHAMELEON_SIMULATION=ON}. -This mode allows you to simulate execution of algorithms with StarPU compiled -with @uref{http://simgrid.gforge.inria.fr/, SimGrid}. -directory of CHAMELEON sources. -To use these perfmodels, please set the following -@itemize @bullet -@item @env{STARPU_HOME} environment variable to: - @example - @code{<path to SOURCE_DIR>/simucore/perfmodels} - @end example -@item @env{STARPU_HOSTNAME} environment variable to the name of the machine to -simulate. 
For example, on our platform (PlaFRIM) with GPUs at Inria Bordeaux - @example - @env{STARPU_HOSTNAME}=mirage - @end example -Note that only POTRF kernels with block sizes of 320 or 960 (simple and double -precision) on mirage machine are available for now. -Database of models is subject to change, it should be enrich in a near future. -@end itemize - -@node Use out of core support with StarPU -@section Use out of core support with StarPU - -If the matrix can not fit in the main memory, StarPU can automatically evict -tiles to the disk. The descriptors for the matrices which can not fit in the -main memory need to be created with @code{MORSE_Desc_Create_OOC}, so that MORSE -does not force StarPU to keep it in the main memory. - -The following variables then need to be set: -@itemize @bullet -@item @env{STARPU_DISK_SWAP} environment variable to a place where to store -evicted tiles, for example: - @example - @env{STARPU_DISK_SWAP}=/tmp - @end example -@item @env{STARPU_DISK_SWAP_BACKEND} environment variable to the I/O method, -for example: - @example - @env{STARPU_DISK_SWAP_BACKEND}=unistd_o_direct - @end example -This will create a hierarchy of directory to store one file per tile. If that -poses problems, one can use the hdf5 I/O method which stores all tiles in a -single file. -@item @env{STARPU_LIMIT_CPU_MEM} environment variable to the amount of memory -that can be used in MBytes, for example: - @example - @env{STARPU_LIMIT_CPU_MEM}=1000 - @end example -@end itemize diff --git a/doc/texinfo/chapters/installing.texi b/doc/texinfo/chapters/installing.texi deleted file mode 100644 index f056c1ef319dfd9bf1b976aadbc01d0613773a47..0000000000000000000000000000000000000000 --- a/doc/texinfo/chapters/installing.texi +++ /dev/null @@ -1,330 +0,0 @@ -@c -*-texinfo-*- - -@c This file is part of the CHAMELEON Handbook. 
-@c Copyright (C) 2017 Inria -@c Copyright (C) 2014 The University of Tennessee -@c Copyright (C) 2014 King Abdullah University of Science and Technology -@c See the file ../chameleon.texi for copying conditions. - -@menu -* Downloading CHAMELEON:: -* Build process of CHAMELEON:: -@end menu - -CHAMELEON can be built and installed by the standard means of CMake -(@uref{http://www.cmake.org/}). -General information about CMake, as well as installation binaries and CMake -source code are available from -@uref{http://www.cmake.org/cmake/resources/software.html}. -The following chapter is intended to briefly remind how these tools can be used -to install CHAMELEON. - -@node Downloading CHAMELEON -@section Downloading CHAMELEON - -@menu -* Getting Sources:: -* Required dependencies:: -* Optional dependencies:: -@end menu - -@node Getting Sources -@subsection Getting Sources - -The latest official release tarballs of CHAMELEON sources are available for -download from -@uref{https://gforge.inria.fr/frs/download.php/file/34884/chameleon-0.9.1.tar.gz, chameleon-0.9.1}. - -The latest development snapshot is available on gitlab: -@uref{https://gitlab.inria.fr/solverstack/chameleon} - -@node Required dependencies -@subsection Required dependencies - -@menu -* a BLAS implementation:: -* CBLAS:: -* a LAPACK implementation:: -* LAPACKE:: -* libtmg:: -* QUARK:: -* StarPU:: -* hwloc:: -* pthread:: -@end menu - -@node a BLAS implementation -@subsubsection a BLAS implementation - -@uref{http://www.netlib.org/blas/, BLAS} (Basic Linear Algebra Subprograms), -are a de facto standard for basic linear algebra operations such as vector and -matrix multiplication. -FORTRAN implementation of BLAS is available from Netlib. -Also, C implementation of BLAS is included in GSL (GNU Scientific Library). 
-Both these implementations are reference implementation of BLAS, are not -optimized for modern processor architectures and provide an order of magnitude -lower performance than optimized implementations. -Highly optimized implementations of BLAS are available from many hardware -vendors, such as Intel MKL, IBM ESSL and AMD ACML. -Fast implementations are also available as academic packages, such as ATLAS and -OpenBLAS. -The standard interface to BLAS is the FORTRAN interface. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -the reference BLAS from NETLIB, OpenBLAS and Intel MKL. - -@node CBLAS -@subsubsection CBLAS - -@uref{http://www.netlib.org/blas/#_cblas, CBLAS} is a C language interface to -BLAS. -Most commercial and academic implementations of BLAS also provide CBLAS. -Netlib provides a reference implementation of CBLAS on top of FORTRAN BLAS -(Netlib CBLAS). -Since GSL is implemented in C, it naturally provides CBLAS. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -the reference CBLAS from NETLIB, OpenBLAS and Intel MKL. - -@node a LAPACK implementation -@subsubsection a LAPACK implementation - -@uref{http://www.netlib.org/lapack/, LAPACK} (Linear Algebra PACKage) is a -software library for numerical linear algebra, a successor of LINPACK and -EISPACK and a predecessor of CHAMELEON. -LAPACK provides routines for solving linear systems of equations, linear least -square problems, eigenvalue problems and singular value problems. -Most commercial and academic BLAS packages also provide some LAPACK routines. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -the reference LAPACK from NETLIB, OpenBLAS and Intel MKL. - -@node LAPACKE -@subsubsection LAPACKE - -@uref{http://www.netlib.org/lapack/, LAPACKE} is a C language interface to -LAPACK (or CLAPACK). 
-It is produced by Intel in coordination with the LAPACK team and is available -in source code from Netlib in its original version (Netlib LAPACKE) and from -CHAMELEON website in an extended version (LAPACKE for CHAMELEON). -In addition to implementing the C interface, LAPACKE also provides routines -which automatically handle workspace allocation, making the use of LAPACK much -more convenient. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -the reference LAPACKE from NETLIB, OpenBLAS and Intel MKL. - -@node libtmg -@subsubsection libtmg - -@uref{http://www.netlib.org/lapack/, libtmg} is a component of the LAPACK -library, containing routines for generation -of input matrices for testing and timing of LAPACK. -The testing and timing suites of LAPACK require libtmg, but not the library -itself. Note that the LAPACK library can be built and used without libtmg. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -the reference TMGLIB from NETLIB, OpenBLAS and Intel MKL. - -@node QUARK -@subsubsection QUARK - -@uref{http://icl.cs.utk.edu/quark/, QUARK} (QUeuing And Runtime for Kernels) -provides a library that enables the dynamic execution of tasks with data -dependencies in a multi-core, multi-socket, shared-memory environment. -One of QUARK or StarPU Runtime systems has to be enabled in order to schedule -tasks on the architecture. -If QUARK is enabled then StarPU is disabled and conversely. -Note StarPU is enabled by default. -When CHAMELEON is linked with QUARK, it is not possible to exploit neither -CUDA (for GPUs) nor MPI (distributed-memory environment). -You can use StarPU to do so. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -the QUARK library 0.9. - -@node StarPU -@subsubsection StarPU - -@uref{http://runtime.bordeaux.inria.fr/StarPU/, StarPU} is a task programming -library for hybrid architectures. 
-StarPU handles run-time concerns such as: -@itemize @bullet -@item Task dependencies -@item Optimized heterogeneous scheduling -@item Optimized data transfers and replication between main memory and discrete -memories -@item Optimized cluster communications -@end itemize -StarPU can be used to benefit from GPUs and distributed-memory environment. -One of QUARK or StarPU runtime system has to be enabled in order to schedule -tasks on the architecture. -If StarPU is enabled then QUARK is disabled and conversely. -Note StarPU is enabled by default. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -StarPU-1.1 and 1.2 releases. - -@node hwloc -@subsubsection hwloc - -@uref{http://www.open-mpi.org/projects/hwloc/, hwloc} (Portable Hardware -Locality) is a software package for accessing the topology of a multicore -system including components like: cores, sockets, caches and NUMA nodes. -@c The topology discovery library, @code{hwloc}, is not mandatory to use StarPU -@c but strongly recommended. -It allows to increase performance, and to perform some topology aware -scheduling. -@code{hwloc} is available in major distributions and for most OSes and can be -downloaded from @uref{http://www.open-mpi.org/software/hwloc}. - -@strong{Caution about the compatibility:} hwloc should be compatible with the -version of StarPU used. - -@node pthread -@subsubsection pthread - -POSIX threads library is required to run CHAMELEON on Unix-like systems. -It is a standard component of any such system. -@comment Windows threads are used on Microsoft Windows systems. - -@node Optional dependencies -@subsection Optional dependencies - -@menu -* OpenMPI:: -* Nvidia CUDA Toolkit:: -* FxT:: -@end menu - -@node OpenMPI -@subsubsection OpenMPI - -@uref{http://www.open-mpi.org/, OpenMPI} is an open source Message Passing -Interface implementation for execution on multiple nodes with -distributed-memory environment. 
-MPI can be enabled only if the runtime system chosen is StarPU (default). -To use MPI through StarPU, it is necessary to compile StarPU with MPI -enabled. - -@strong{Caution about the compatibility:} OpenMPI should be built with the ---enable-mpi-thread-multiple option. - -@node Nvidia CUDA Toolkit -@subsubsection Nvidia CUDA Toolkit - -@uref{https://developer.nvidia.com/cuda-toolkit, Nvidia CUDA Toolkit} provides -a -comprehensive development environment for C and C++ developers building -GPU-accelerated applications. -CHAMELEON can use a set of low level optimized kernels coming from cuBLAS to -accelerate computations on GPUs. -The @uref{http://docs.nvidia.com/cuda/cublas/, cuBLAS} library is an -implementation of BLAS (Basic Linear Algebra Subprograms) on top of the Nvidia -CUDA runtime. -cuBLAS is normaly distributed with Nvidia CUDA Toolkit. -CUDA/cuBLAS can be enabled in CHAMELEON only if the runtime system chosen -is StarPU (default). -To use CUDA through StarPU, it is necessary to compile StarPU with CUDA -enabled. - -@strong{Caution about the compatibility:} CHAMELEON has been mainly tested with -CUDA releases from versions 4 to 7.5. -Your compiler must be compatible with CUDA. - -@node FxT -@subsubsection FxT - -@uref{http://download.savannah.gnu.org/releases/fkt/, FxT} stands for both -FKT (Fast Kernel Tracing) and FUT (Fast User Tracing). -This library provides efficient support for recording traces. -CHAMELEON can trace kernels execution on the different workers and produce -.paje files if FxT is enabled. -FxT can only be used through StarPU and StarPU must be compiled with FxT -enabled, see how to use this feature here @ref{Use FxT profiling through -StarPU}. - -@strong{Caution about the compatibility:} FxT should be compatible with the -version of StarPU used. 
- -@node Build process of CHAMELEON -@section Build process of CHAMELEON - -@menu -* Setting up a build directory:: -* Configuring the project with best efforts:: -* Building:: -* Tests:: -* Installing:: -@end menu - -@node Setting up a build directory -@subsection Setting up a build directory - -The CHAMELEON build process requires CMake version 2.8.0 or higher and -working C and Fortran compilers. -Compilation and link with CHAMELEON libraries have been tested with -@strong{gcc/gfortran} and @strong{icc/ifort}. -On Unix-like operating systems, it also requires Make. -The CHAMELEON project can not be configured for an in-source build. -You will get an error message if you try to compile in-source. -Please clean the root of your project by deleting the generated - -@example -mkdir build -cd build -@end example - -@quotation -You can create a build directory from any location you would like. It can be a -sub-directory of the CHAMELEON base source directory or anywhere else. -@end quotation - -@node Configuring the project with best efforts -@subsection Configuring the project with best efforts - -@example -cmake <path to SOURCE_DIR> -DOPTION1= -DOPTION2= ... -@end example -stands -Details about options that are useful to give to @command{cmake <path to -SOURCE_DIR>} are given in @ref{Compilation configuration}. - -@node Building -@subsection Building - -@example -make [-j[ncores]] -@end example -do not hesitate to use @option{-j[ncores]} option to speedup the compilation - -@node Tests -@subsection Tests - -In order to make sure that CHAMELEON is working properly on the system, it is -also possible to run a test suite. 
- -@example -make check -@end example -or -@example -ctest -@end example - -@node Installing -@subsection Installing - -In order to install CHAMELEON at the location that was specified during -configuration: - -@example -make install -@end example -do not forget to specify the install directory with -@option{-DCMAKE_INSTALL_PREFIX} at cmake configure -@example -cmake <path to SOURCE_DIR> -DCMAKE_INSTALL_PREFIX=<path to INSTALL_DIR> -@end example -Note that the install process is optional. -You are free to use CHAMELEON binaries compiled in the build directory. diff --git a/doc/texinfo/chapters/introduction.texi b/doc/texinfo/chapters/introduction.texi deleted file mode 100644 index b94921f37cc78db466d9e7e40806d004f5b9e7c9..0000000000000000000000000000000000000000 --- a/doc/texinfo/chapters/introduction.texi +++ /dev/null @@ -1,379 +0,0 @@ -@c -*-texinfo-*- - -@c This file is part of the CHAMELEON User's Guide. -@c Copyright (C) 2014 Inria -@c Copyright (C) 2014 The University of Tennessee -@c Copyright (C) 2014 King Abdullah University of Science and Technology -@c See the file ../chameleon.texi for copying conditions. - -@menu -* MORSE project:: Discussion about MORSE project motivation -* CHAMELEON:: Outline of this specific MORSE subproject -@end menu - -@node MORSE project -@section MORSE project - -@ifnottex -@center @image{morse_header} -@end ifnottex - -@menu -* MORSE Objectives:: -* Research fields:: -* Research papers:: -@end menu - -@node MORSE Objectives -@subsection MORSE Objectives - -When processor clock speeds flatlined in 2004, after more than fifteen years -of exponential increases, the era of near automatic performance improvements -that the HPC application community had previously enjoyed came to an abrupt -end. 
-To develop software that will perform well on petascale and exascale systems -with thousands of nodes and millions of cores, the list of major challenges -that must now be confronted is formidable: -1) dramatic escalation in the costs of intrasystem communication between -processors and/or levels of memory hierarchy; -2) increased heterogeneity of the processing units (mixing CPUs, GPUs, etc. in -varying and unexpected design combinations); -3) high levels of parallelism and more complex constraints means that -cooperating processes must be dynamically and unpredictably scheduled for -asynchronous execution; -4) software will not run at scale without much better resilience to faults and -far more robustness; and -5) new levels of self-adaptivity will be required to enable software to -modulate process speed in order to satisfy limited energy budgets. -The MORSE associate team will tackle the first three challenges in a -orchestrating work between research groups respectively specialized in sparse -linear algebra, dense linear algebra and runtime systems. -The overall objective is to develop robust linear algebra libraries relying on -innovative runtime systems that can fully benefit from the potential of those -future large-scale complex machines. -Challenges 4) and 5) will also be investigated by the different teams in the -context of other partnerships, but they will not be the main focus of the -associate team as they are much more prospective. - -@node Research fields -@subsection Research fields - -The overall goal of the MORSE associate team is to enable advanced numerical -algorithms to be executed on a scalable unified runtime system for exploiting -the full potential of future exascale machines. -We expect advances in three directions based first on strong and closed -interactions between the runtime and numerical linear algebra communities. -This initial activity will then naturally expand to more focused but still -joint research in both fields. 
- -@menu -* Fine interaction between linear algebra and runtime systems:: -* Runtime systems:: -* Linear algebra:: -@end menu - -@node Fine interaction between linear algebra and runtime systems -@subsubsection Fine interaction between linear algebra and runtime systems - -On parallel machines, HPC applications need to take care of data movement and -consistency, which can be either explicitly managed at the level of the -application itself or delegated to a runtime system. -We adopt the latter approach in order to better keep up with hardware trends -whose complexity is growing exponentially. -One major task in this project is to define a proper interface between HPC -applications and runtime systems in order to maximize productivity and -expressivity. -As mentioned in the next section, a widely used approach consists in -abstracting the application as a DAG that the runtime system is in charge of -scheduling. -Scheduling such a DAG over a set of heterogeneous processing units introduces a -lot of new challenges, such as predicting accurately the execution time of each -type of task over each kind of unit, minimizing data transfers between memory -banks, performing data prefetching, etc. -Expected advances: In a nutshell, a new runtime system API will be designed to -allow applications to provide scheduling hints to the runtime system and to get -real-time feedback about the consequences of scheduling decisions. - -@node Runtime systems -@subsubsection Runtime systems - -A runtime environment is an intermediate layer between the system and the -application. -It provides low-level functionality not provided by the system (such as -scheduling or management of the heterogeneity) and high-level features (such as -performance portability). -In the framework of this proposal, we will work on the scalability of runtime -environment. To achieve scalability it is required to avoid all centralization. -Here, the main problem is the scheduling of the tasks. 
-In many task-based runtime environments the scheduler is centralized and -becomes a bottleneck as soon as too many cores are involved. -It is therefore required to distribute the scheduling decision or to compute a -data distribution that impose the mapping of task using, for instance the -so-called ``owner-compute'' rule. -Expected advances: We will design runtime systems that enable an efficient and -scalable use of thousands of distributed multicore nodes enhanced with -accelerators. - -@node Linear algebra -@subsubsection Linear algebra - -Because of its central position in HPC and of the well understood structure of -its algorithms, dense linear algebra has often pioneered new challenges that HPC -had to face. -Again, dense linear algebra has been in the vanguard of the new era of -petascale computing with the design of new algorithms that can efficiently run -on a multicore node with GPU accelerators. These algorithms are called -``communication-avoiding'' since they have been redesigned to limit the amount -of communication between processing units (and between the different levels of -memory hierarchy). -They are expressed through Direct Acyclic Graphs (DAG) of fine-grained tasks -that are dynamically scheduled. Expected advances: First, we plan to investigate -the impact of these principles in the case of sparse applications (whose -algorithms are slightly more complicated but often rely on dense kernels). -Furthermore, both in the dense and sparse cases, the scalability on thousands of -nodes is still limited; new numerical approaches need to be found. -We will specifically design sparse hybrid direct/iterative methods that -represent a promising approach. 
- -@node Research papers -@subsection Research papers - -Research papers about MORSE can be found at - -@uref{http://icl.cs.utk.edu/projectsdev/morse/pubs/index.html} - - -@node CHAMELEON -@section CHAMELEON - -@menu -* CHAMELEON software:: content and objectives -* PLASMA's design principles:: algorithmic and matrix layout -@end menu - -@node CHAMELEON software -@subsection CHAMELEON software - -The main purpose is to address the performance shortcomings of -the @uref{http://www.netlib.org/lapack/, LAPACK} -and @uref{http://www.netlib.org/scalapack/, ScaLAPACK} -libraries on multicore processors and multi-socket systems of multicore -processors and their inability to efficiently utilize accelerators such as -Graphics Processing Units (GPUs). - -CHAMELEON is a framework written in C which provides routines to solve dense -general systems of linear equations, symmetric positive definite systems of -linear equations and linear least squares problems, using LU, Cholesky, QR and -LQ factorizations. -Real arithmetic and complex arithmetic are supported in both single precision -and double precision. -It supports Linux and Mac OS/X machines (only tested on Intel x86-64 -architecture). - -CHAMELEON is based on @uref{http://icl.cs.utk.edu/plasma/, PLASMA} source -code but is not limited to shared-memory environment and can exploit -multiple GPUs. -CHAMELEON is interfaced in a generic way with both -@uref{http://icl.cs.utk.edu/quark/, QUARK} and -@uref{http://runtime.bordeaux.inria.fr/StarPU/, StarPU} runtime systems. -This feature allows to analyze in a unified framework how sequential -task-based algorithms behave regarding different runtime systems -implementations. -Using CHAMELEON with @uref{http://runtime.bordeaux.inria.fr/StarPU/, -StarPU} runtime system allows to exploit GPUs through -kernels provided by @uref{https://developer.nvidia.com/cublas, cuBLAS} -and clusters of interconnected nodes with distributed memory (using -@uref{http://www.open-mpi.org/, MPI}). 
Computation of very large -systems with dense matrices on a cluster of nodes is still being -experimented and stabilized. -It is not expected to get stable performances with the current version using -MPI. - - -@comment %////////////////////////////////////////////////////////// - -@node PLASMA's design principles -@subsection PLASMA's design principles - -CHAMELEON is originally based on @uref{http://icl.cs.utk.edu/plasma/, -PLASMA} so that design principles are very similar. -The content of this section @ref{PLASMA's design principles} has been copied -from the @samp{Design principles} section of the PLASMA User's Guide. - -@menu -* Tile Algorithms:: -* Tile Data Layout:: -* Dynamic Task Scheduling:: -@end menu - -@node Tile Algorithms -@subsubsection Tile Algorithms - -Tile algorithms are based on the idea of processing the matrix by square tiles -of relatively small size, such that a tile fits entirely in one of the cache -levels associated with one core. -This way a tile can be loaded to the cache and processed completely before being -evicted back to the main memory. -Of the three types of cache misses, @emph{compulsory}, @emph{capacity} -and @emph{conflict}, the use of tile algorithms minimizes the number of -capacity misses, since each operation loads the amount of data that does not -``overflow'' the cache. - -For some operations such as matrix multiplication and Cholesky factorization, -translating the classic algorithm to the tile algorithm is trivial. -In the case of matrix multiplication, the tile algorithm is simply a product of -applying the technique of @emph{loop tiling} to the canonical definition of -three nested loops. -It is very similar for the Cholesky factorization. 
-The @strong{left-looking} definition of Cholesky factorization from LAPACK is a -loop with a sequence of calls to four routines: xSYRK (symmetric -@strong{rank-k} update), xPOTRF (Cholesky factorization of a small block on the -diagonal), xGEMM (matrix multiplication) and xTRSM (triangular solve). -If the xSYRK, xGEMM and xTRSM operations are expressed with the canonical -definition of three nested loops and the technique of loop tiling is applied, -the tile algorithm results. -Since the algorithm is produced by simple reordering of operations, neither the -number of operations nor numerical stability of the algorithm are affected. - -The situation becomes slightly more complicated for LU and QR factorizations, -where the classic algorithms factorize an entire panel of the matrix (a block -of columns) at every step of the algorithm. -One can observe, however, that the process of matrix factorization is -synonymous with introducing zeros in approproate places and a tile algorithm -can be fought of as one that zeroes one tile of the matrix at a time. -This process is referred to as updating of a factorization or @emph{incremental -factorization}. -The process is equivalent to factorizing the top tile of a panel, then placing -the upper triangle of the result on top of the tile blow and factorizing again, -then moving to the next tile and so on. -Here, the tile LU and QR algorithms perform slightly more floating point -operations and require slightly more memory for auxiliary data. -Also, the tile LU factorization applies a different pivoting pattern and, as a -result, is less numerically stable than classic LU with full pivoting. -Numerical stability is not an issue in case of the tile QR, which relies on -orthogonal transformations (Householder reflections), which are numerically -stable. 
- -@center @image{tile_lu,7cm,7cm} - -Schematic illustration of the tile LU factorization (kernel names for -real arithmetics in double precision), courtesey of the -@uref{http://icl.cs.utk.edu/plasma/, PLASMA} team. - -@comment ////////////////////////////////////////////////////////// - -@node Tile Data Layout -@subsubsection Tile Data Layout - -Tile layout is based on the idea of storing the matrix by square tiles -of relatively small size, such that each tile occupies a continuous memory -region. -This way a tile can be loaded to the cache memory efficiently and the risk of -evicting it from the cache memory before it is completely processed is -minimized. -Of the three types of cache misses, @emph{compulsory}, @emph{capacity} and -@emph{conflict}, the use of tile layout minimizes the number of conflict -misses, since a continuous region of memory will completely fill out a -@strong{set-associative} cache memory before an eviction can happen. -Also, from the standpoint of multithreaded execution, the probability of -@emph{false sharing} is minimized. -It can only affect the cache lines containing the beginning and the ending of a -tile. - -In standard @strong{cache-based} architecture, tiles continously laid out in -memory maximize the profit from automatic prefetching. -Tile layout is also beneficial in situations involving the use of accelerators, -where explicit communication of tiles through DMA transfers is required, such as -moving tiles between the system memory and the local store in Cell B. E. or -moving tiles between the host memory and the device memory in GPUs. -In most circumstances tile layout also minimizes the number of TLB -misses and conflicts to memory banks or partitions. -With the standard (@strong{column-major}) layout, access to each column of -a tile is much more likely -to cause a conflict miss, a false sharing miss, a TLB miss or a bank -or partition conflict. 
-The use of the standard layout for dense matrix operations is a -performance minefield. -Although occasionally one can pass through it unscathed, the risk of hitting a -spot deadly to performance is very high. - -Another property of the layout utilized in PLASMA is that it is ``flat'', -meaning that it does not involve a level of indirection. Each tile stores a -small square submatrix of the main matrix in a @strong{column-major} layout. In -turn, the main matrix is an arrangement of tiles immediately following one -another in a @strong{column-major} layout. -The offset of each tile can be calculated through address arithmetics and -does not involve pointer indirection. -Alternatively, a matrix could be represented as an array of pointers to -tiles, located anywhere in memory. Such layout would be a radical -and unjustifiable departure from LAPACK and ScaLAPACK. -Flat tile layout is a natural progression from LAPACK's @strong{column-major} -layout and ScaLAPACK's @strong{block-cyclic} layout. - -Another related property of PLASMA's tile layout is that it includes -provisions for padding of tiles, i.e., the actual region of memory designated -for a tile can be larger than the memory occupied by the actual data. -This allows to force a certain alignment of tile boundaries, while using the -flat organization described in the previous paragraph. -The motivation is that, at the price of small memory overhead, alignment of -tile boundaries may prove benefivial in multiple scenarios involving -memory systems of standard multicore processors, as well as accelerators. -The issues that come into play are, again, the use of TLBs and memory banks or -partitions. 
- -@center @image{tile_layout,7cm,7cm} - -Schematic illustration of the tile layout with @strong{column-major} -order of tiles, @strong{column-major} order of elements within tiles and -(optional) padding for enforcing a certain alighment of tile bondaries, -courtesey of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team. - -@comment %////////////////////////////////////////////////////////// - -@node Dynamic Task Scheduling -@subsubsection Dynamic Task Scheduling - -Dynamic scheduling is the idea of assigning work to cores based on the -availability of data for processing at any given point in time and is also -referred to as @strong{@emph{data-driven}} scheduling. -The concept is related closely to the idea of expressing computation through a -task graph, often referred to as the DAG (@emph{Direct Acyclic Graph}), and -the flexibility exploring the DAG at runtime. -Thus, to a large extent, dynamic scheduling is synonymous with -@strong{@emph{runtime scheduling}}. -An important concept here is the one of the @emph{critical path}, which defines -the upper bound on the achievable parallelism, and needs to be pursued at the -maximum speed. -This is in direct opposition to the @strong{@emph{fork-and-join}} or -@strong{@emph{data-parallel}} programming models, where -artificial synchronization points expose serial sections of -the code, where multiple cores are idle, while sequential processing takes -place. -The use of dynamic scheduling introduces a @strong{trade-off}, though. -The more dynamic (flexible) scheduling is, the more centralized (and less -scalable) the scheduling mechanism is. -For that reason, currently PLASMA uses two scheduling -mechanisms, one which is fully dynamic and one where work is assigned -statically and dependency checks are done at runtime. 
- -The first scheduling mechanism relies on unfolding a @emph{sliding window} of -the task graph at runtime and scheduling work by resolving data hazards: -@emph{Read After Write~(RAW)}, @emph{Write After Read~(WAR)} and @emph{Write -After Write~(WAW)}, a technique analogous to instruction scheduling in -superscalar processors. -It also relies on @strong{@emph{work-stealing}} for balanding the -load among all multiple cores. -The second scheduling mechanism relies on statically designating a path through -the execution space of the algorithm to each core and following a -cycle: transition to a task, wait for its dependencies, execute it, update the -overall progress. -Task are identified by tuples and task transitions are done through locally -evaluated formulas. -Progress information can be centralized, replicated or distributed (currently -centralized). - -@center @image{trace_qr,12cm,5cm} - -A trace of the tile QR factorization executing on eight cores without -any global synchronization points (kernel names for real arithmetics in single -precision), courtesey of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team. diff --git a/doc/texinfo/chapters/using.texi b/doc/texinfo/chapters/using.texi deleted file mode 100644 index cf83f26e85cd2f7959a3b2ce81cfd96882469bc9..0000000000000000000000000000000000000000 --- a/doc/texinfo/chapters/using.texi +++ /dev/null @@ -1,1425 +0,0 @@ -@c -*-texinfo-*- - -@c This file is part of the MORSE Handbook. -@c Copyright (C) 2014 Inria -@c Copyright (C) 2014 The University of Tennessee -@c Copyright (C) 2014 King Abdullah University of Science and Technology -@c See the file ../chameleon.texi for copying conditions. 
- -@menu -* Using CHAMELEON executables:: -* Linking an external application with CHAMELEON libraries:: -* CHAMELEON API:: -@end menu - -@node Using CHAMELEON executables -@section Using CHAMELEON executables - -CHAMELEON provides several test executables that are compiled and link with -CHAMELEON stack of dependencies. -Instructions about the arguments to give to executables are accessible thanks -to the option @option{-[-]help} or @option{-[-]h}. -This set of binaries are separated into three categories and can be found in -three different directories: - -@itemize @bullet - - @item example - - contains examples of API usage and more specifically the - sub-directory lapack_to_morse/ provides a tutorial that explain how to use - CHAMELEON functionalities starting from a full LAPACK code, see -@ref{Tutorial LAPACK to CHAMELEON} - - @item testing - - contains testing drivers to check numerical correctness of - CHAMELEON linear algebra routines with a wide range of parameters - @example - ./testing/stesting 4 1 LANGE 600 100 700 - @end example - Two first arguments are the number of cores and gpus to use. - The third one is the name of the algorithm to test. - The other arguments depend on the algorithm, here it lies for the number of - rows, columns and leading dimension of the problem. 
- - Name of algorithms available for testing are: - @itemize @bullet - @item LANGE: norms of matrices Infinite, One, Max, Frobenius - @item GEMM: general matrix-matrix multiply - @item HEMM: hermitian matrix-matrix multiply - @item HERK: hermitian matrix-matrix rank k update - @item HER2K: hermitian matrix-matrix rank 2k update - @item SYMM: symmetric matrix-matrix multiply - @item SYRK: symmetric matrix-matrix rank k update - @item SYR2K: symmetric matrix-matrix rank 2k update - @item PEMV: matrix-vector multiply with pentadiagonal matrix - @item TRMM: triangular matrix-matrix multiply - @item TRSM: triangular solve, multiple rhs - @item POSV: solve linear systems with symmetric positive-definite matrix - @item GESV_INCPIV: solve linear systems with general matrix - @item GELS: linear least squares with general matrix - @end itemize - - @item timing - - contains timing drivers to assess performances of CHAMELEON routines. - There are two sets of executables, those who do not use the tile interface -and those who do (with _tile in the name of the executable). - Executables without tile interface allocates data following LAPACK -conventions and these data can be given as arguments to CHAMELEON routines -as you would do with LAPACK. - Executables with tile interface generate directly the data in the format - CHAMELEON tile algorithms used to submit tasks to the runtime system. - Executables with tile interface should be more performant because no data -copy from LAPACK matrix layout to tile matrix layout are necessary. 
- Calling example: - @example - ./timing/time_dpotrf --n_range=1000:10000:1000 --nb=320 - --threads=9 --gpus=3 - --nowarmup - @end example - - List of main options that can be used in timing: - @itemize @bullet - @item @option{--help}: show usage - @item Machine parameters - @itemize @bullet - @item @option{-t x, --threads=x}: Number of CPU workers (default: automatic detection through runtime) - @item @option{-g x, --gpus=x}: Number of GPU workers (default: @option{0}) - @item @option{-P x, --P=x}: Rows (P) in the PxQ process grid (deafult: @option{1}) - @item @option{--nocpu}: All GPU kernels are exclusively executed on GPUs (default: @option{0}) - @end itemize - @item Matrix parameters - @itemize @bullet - @item @option{-m x, --m=x, --M=x}: Dimension (M) of the matrices (default: @option{N}) - @item @option{-n x, --n=x, --N=x}: Dimension (N) of the matrices - @item @option{-N R, --n_range=R}: Range of N values to time with R=Start:Stop:Step (default: @option{500:5000:500}) - @item @option{-k x, --k=x, --K=x, --nrhs=x}: Dimension (K) of the matrices or number of right-hand size (default: @option{1}). This is useful for GEMM like algorithms (k is the shared dimension and must be defined >1 to consider matrices and not vectors) - @item @option{-b x, --nb=x}: NB size. (default: @option{320}) - @item @option{-i x, --ib=x}: IB size. (default: @option{32}) - @end itemize - @item Check/prints - @itemize @bullet - @item @option{--niter=x}: number of iterations performed for each test (default: @option{1}) - @item @option{-W, --nowarnings}: Do not show warnings - @item @option{-w, --nowarmup}: Cancel the warmup run to pre-load libraries - @item @option{-c, --check}: Check result - @item @option{-C, --inv}: Check on inverse - @item @option{--mode=x}: Change the xLATMS matrix mode generation for SVD/EVD (default: @option{4}). It must be between 0 and 20 included. 
- @end itemize - @item Profiling parameters - @itemize @bullet - @item @option{-T, --trace}: Enable trace generation - @item @option{--progress}: Display progress indicator - @item @option{-d, --dag}: Enable DAG generation. Generates a dot_dag_file.dot. - @item @option{-p, --profile}: Print profiling informations - @end itemize - @item HQR parameters - @itemize @bullet - @item @option{-a x, --qr_a=x, --rhblk=x}: Define the size of the local TS trees in housholder reduction trees for QR and LQ factorization. N is the size of each subdomain (default: @option{-1}) - @item @option{-l x, --llvl=x}: Tree used for low level reduction inside nodes (default: @option{-1}) - @item @option{-L x, --hlvl=x}: Tree used for high level reduction between nodes, only if P > 1 (default: @option{-1}). Possible values are -1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy. - @item @option{-D, --domino}: Enable the domino between upper and lower trees - @end itemize - @item Advanced options - @itemize @bullet - @item @option{--nobigmat}: Disable single large matrix allocation for multiple tiled allocations - @item @option{-s, --sync}: Enable synchronous calls in wrapper function such as POTRI - @item @option{-o, --ooc}: Enable out-of-core (available only with StarPU) - @item @option{-G, --gemm3m}: Use gemm3m complex method - @item @option{--bound}: Compare result to area bound - @end itemize - - List of timing algorithms available: - @itemize @bullet - @item LANGE: norms of matrices - @item GEMM: general matrix-matrix multiply - @item TRSM: triangular solve - @item POTRF: Cholesky factorization with a symmetric -positive-definite matrix - @item POSV: solve linear systems with symmetric positive-definite matrix - @item GETRF_NOPIV: LU factorization of a general matrix -using the tile LU algorithm without row pivoting - @item GESV_NOPIV: solve linear system for a general matrix -using the tile LU algorithm without row pivoting - @item GETRF_INCPIV: LU 
factorization of a general matrix -using the tile LU algorithm with partial tile pivoting with row interchanges - @item GESV_INCPIV: solve linear system for a general matrix -using the tile LU algorithm with partial tile pivoting with row interchanges -matrix - @item GEQRF: QR factorization of a general matrix - @item GELS: solves overdetermined or underdetermined linear systems -involving a general matrix using the QR or the LQ factorization - @end itemize - -@end itemize - -@node Linking an external application with CHAMELEON libraries -@section Linking an external application with CHAMELEON libraries - -Compilation and link with CHAMELEON libraries have been tested with -@strong{gcc/gfortran 4.8.1} and @strong{icc/ifort 14.0.2}. - -@menu -* Static linking in C:: -* Dynamic linking in C:: -* Build a Fortran program with CHAMELEON:: -@end menu - -@node Static linking in C -@subsection Static linking in C - -Lets imagine you have a file main.c that you want to link with CHAMELEON -static libraries. -Here could be your compilation command with gcc compiler: -@example -gcc -I/home/yourname/install/chameleon/include -o main.o -c main.c -@end example - -Now if you want to link your application with CHAMELEON static libraries, you -could do: -@example -gcc main.o -o main \ -/home/yourname/install/chameleon/lib/libchameleon.a \ -/home/yourname/install/chameleon/lib/libchameleon_starpu.a \ -/home/yourname/install/chameleon/lib/libcoreblas.a \ --lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \ --lmkl_sequential -lmkl_core -lpthread -lm -lrt -@end example -As you can see in this example, we also link with some dynamic libraries -@option{starpu-1.1}, @option{Intel MKL} libraries (for -BLAS/LAPACK/CBLAS/LAPACKE), @option{pthread}, @option{m} (math) and -@option{rt}. -These libraries will depend on the configuration of your CHAMELEON build. -You can find these dependencies in .pc files we generate during compilation and -CHAMELEON install directory. 
-Note also that you could need to specify where to find these libraries with -@option{-L} option of your compiler/linker. - -Before to run your program, make sure that all shared libraries paths your -executable depends on are known. -Enter @code{ldd main} to check. -If some shared libraries paths are missing append them in the -@env{LD_LIBRARY_PATH} (for Linux systems) environment variable -(@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows). - -@node Dynamic linking in C -@subsection Dynamic linking in C - -For dynamic linking (need to build CHAMELEON with CMake -option @option{BUILD_SHARED_LIBS=ON}) it is similar to static compilation/link -but instead of specifying path to your static libraries you indicate the path -to dynamic libraries with @option{-L} option and you give the name of libraries -with @option{-l} option like this: -@example -gcc main.o -o main \ --L/home/yourname/install/chameleon/lib \ --lchameleon -lchameleon_starpu -lcoreblas \ --lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \ --lmkl_sequential -lmkl_core -lpthread -lm -lrt -@end example - -Note that an update of your environment variable -@env{LD_LIBRARY_PATH} (@env{DYLD_LIBRARY_PATH} on Mac, @env{LIB} on Windows) -with the path of the libraries could be required before executing, example: -@example -export @env{LD_LIBRARY_PATH}=path/to/libs:path/to/chameleon/lib -@end example - -@node Build a Fortran program with CHAMELEON -@subsection Build a Fortran program with CHAMELEON - -CHAMELEON provides a Fortran interface to user functions. Example: -@example -call morse_version(major, minor, patch) !or -call MORSE_VERSION(major, minor, patch) -@end example - -Build and link are very similar to the C case. 
- -Compilation example: -@example -gfortran -o main.o -c main.c -@end example - -Static linking example: -@example -gfortran main.o -o main \ -/home/yourname/install/chameleon/lib/libchameleon.a \ -/home/yourname/install/chameleon/lib/libchameleon_starpu.a \ -/home/yourname/install/chameleon/lib/libcoreblas.a \ --lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \ --lmkl_sequential -lmkl_core -lpthread -lm -lrt -@end example - -Dynamic linking example: -@example -gfortran main.o -o main \ --L/home/yourname/install/chameleon/lib \ --lchameleon -lchameleon_starpu -lcoreblas \ --lstarpu-1.1 -Wl,--no-as-needed -lmkl_intel_lp64 \ --lmkl_sequential -lmkl_core -lpthread -lm -lrt -@end example - -@node CHAMELEON API -@section CHAMELEON API - -CHAMELEON provides routines to solve dense general systems of linear -equations, symmetric positive definite systems of linear equations and linear -least squares problems, using LU, Cholesky, QR and LQ factorizations. -Real arithmetic and complex arithmetic are supported in both single precision -and double precision. -Routines that compute linear algebra are of the folowing form: -@example -MORSE_name[_Tile[_Async]] -@end example -@itemize @bullet -@item all user routines are prefixed with @code{MORSE} -@item @code{name} follows BLAS/LAPACK naming scheme for algorithms -(@emph{e.g.} sgemm for general matrix-matrix multiply simple precision) -@item CHAMELEON provides three interface levels - @itemize @minus - @item @code{MORSE_name}: simplest interface, very close to CBLAS and LAPACKE, -matrices are given following the LAPACK data layout (1-D array column-major). -It involves copy of data from LAPACK layout to tile layout and conversely (to -update LAPACK data), see @ref{Step1}. - @item @code{MORSE_name_Tile}: the tile interface avoid copies between LAPACK -and tile layouts. It is the standard interface of CHAMELEON and it should -achieved better performance than the previous simplest interface. 
The data are -given through a specific structure called a descriptor, see @ref{Step2}. - @item @code{MORSE_name_Tile_Async}: similar to the tile interface, it avoids -synchonization barrier normally called between @code{Tile} routines. -At the end of an @code{Async} function, completion of tasks is not guarentee -and data are not necessarily up-to-date. -To ensure that tasks have been all executed a synchronization function has to -be called after the sequence of @code{Async} functions, see @ref{Step4}. - @end itemize -@end itemize - -MORSE routine calls have to be precede from -@example -MORSE_Init( NCPU, NGPU ); -@end example -to initialize MORSE and the runtime system and followed by -@example -MORSE_Finalize(); -@end example -to free some data and finalize the runtime and/or MPI. - -@menu -* Tutorial LAPACK to CHAMELEON:: -* List of available routines:: -@end menu - -@node Tutorial LAPACK to CHAMELEON -@subsection Tutorial LAPACK to CHAMELEON - -This tutorial is dedicated to the API usage of CHAMELEON. -The idea is to start from a simple code and step by step explain how to -use CHAMELEON routines. -The first step is a full BLAS/LAPACK code without dependencies to CHAMELEON, -a code that most users should easily understand. -Then, the different interfaces CHAMELEON provides are exposed, from the -simplest API (step1) to more complicated ones (until step4). -The way some important parameters are set is discussed in step5. -step6 is an example about distributed computation with MPI. -Finally step7 shows how to let Chameleon initialize user's data -(matrices/vectors) in parallel. - -directory. -If CMake option @option{CHAMELEON_ENABLE_EXAMPLE} is @option{ON} then source -files are compiled with the project libraries. -The arithmetic precision is @code{double}. -To execute a step @samp{X}, enter the following command: -@example -./step@samp{X} --option1 --option2 ... 
-@end example -Instructions about the arguments to give to executables are accessible thanks -to the option @option{-[-]help} or @option{-[-]h}. -Note there exist default values for options. - -For all steps, the program solves a linear system @math{Ax=B} -The matrix values are randomly generated but ensure that matrix @math{A} is -symmetric positive definite so that @math{A} can be factorized in a @math{LL^T} -form using the Cholesky factorization. - - -Lets comment the different steps of the tutorial -@menu -* Step0:: a simple Cholesky example using the C interface of -BLAS/LAPACK -* Step1:: introduces the LAPACK equivalent interface of Chameleon -* Step2:: introduces the tile interface -* Step3:: indicates how to give your own tile matrix to Chameleon -* Step4:: introduces the tile async interface -* Step5:: shows how to set some important parameters -* Step6:: introduces how to benefit from MPI in Chameleon -* Step7:: introduces how to let Chameleon initialize the user's matrix data -@end menu - -@node Step0 -@subsubsection Step0 - -The C interface of BLAS and LAPACK, that is, CBLAS and -LAPACKE, are used to solve the system. The size of the system (matrix) and the -number of right hand-sides can be given as arguments to the executable (be -careful not to give huge numbers if you do not have an infinite amount of RAM!). -As for every step, the correctness of the solution is checked by calculating -the norm @math{||Ax-B||/(||A||||x||+||B||)}. -The time spent in factorization+solve is recorded and, because we know exactly -the number of operations of these algorithms, we deduce the number of -operations that have been processed per second (in GFlops/s). -The important part of the code that solves the problem is: -@verbatim -/* Cholesky factorization: - * A is replaced by its factorization L or L^T depending on uplo */ -LAPACKE_dpotrf( LAPACK_COL_MAJOR, 'U', N, A, N ); -/* Solve: - * B is stored in X on entry, X contains the result on exit. - * Forward ... 
- */
-cblas_dtrsm(
- CblasColMajor,
- CblasLeft,
- CblasUpper,
- CblasConjTrans,
- CblasNonUnit,
- N, NRHS, 1.0, A, N, X, N);
-/* ... and back substitution */
-cblas_dtrsm(
- CblasColMajor,
- CblasLeft,
- CblasUpper,
- CblasNoTrans,
- CblasNonUnit,
- N, NRHS, 1.0, A, N, X, N);
-@end verbatim
-
-@node Step1
-@subsubsection Step1
-
-It introduces the simplest CHAMELEON interface which is equivalent to
-CBLAS/LAPACKE.
-The code is very similar to step0 but instead of calling CBLAS/LAPACKE
-functions, we call CHAMELEON equivalent functions.
-The solving code becomes:
-@verbatim
-/* Factorization: */
-MORSE_dpotrf( UPLO, N, A, N );
-/* Solve: */
-MORSE_dpotrs(UPLO, N, NRHS, A, N, X, N);
-@end verbatim
-The API is almost the same so that it is easy to use for beginners.
-It is important to keep in mind that before any call to MORSE routines,
-@code{MORSE_Init} has to be invoked to initialize MORSE and the runtime system.
-Example:
-@verbatim
-MORSE_Init( NCPU, NGPU );
-@end verbatim
-After all MORSE calls have been done, a call to @code{MORSE_Finalize} is
-required to free some data and finalize the runtime and/or MPI.
-@verbatim
-MORSE_Finalize();
-@end verbatim
-We use MORSE routines with the LAPACK interface which means the routines
-accept the same matrix format as LAPACK (1-D array column-major).
-Note that we copy the matrix to get it in our own tile structures, see details
-about this format here @ref{Tile Data Layout}.
-This means you can get an overhead coming from copies.
-
-@node Step2
-@subsubsection Step2
-
-This program is a copy of step1 but instead of using the LAPACK interface which
-leads to copying LAPACK matrices inside MORSE routines we use the tile interface.
-We will still use the standard matrix format but we will see how to give this
-matrix to create a MORSE descriptor, a structure wrapping data on which we want
-to apply sequential task-based algorithms. 
-The solving code becomes:
-@verbatim
-/* Factorization: */
-MORSE_dpotrf_Tile( UPLO, descA );
-/* Solve: */
-MORSE_dpotrs_Tile( UPLO, descA, descX );
-@end verbatim
-To use the tile interface, a specific structure @code{MORSE_desc_t} must be
-created.
-This can be achieved in different ways.
-@enumerate
-@item Use the existing function @code{MORSE_Desc_Create}: means the
-matrix data are considered contiguous in memory as it is considered in PLASMA
-(@ref{Tile Data Layout}).
-@item Use the existing function @code{MORSE_Desc_Create_OOC}: means the
-matrix data is allocated on-demand in memory tile by tile, and possibly pushed
-to disk if that does not fit memory.
-@item Use the existing function @code{MORSE_Desc_Create_User}: it is more
-flexible than @code{Desc_Create} because you can give your own way to access
-tile data so that your tiles can be allocated wherever you want in memory, see
-next paragraph @ref{Step3}.
-@item Create your own function to fill the descriptor.
-If you understand well the meaning of each item of @code{MORSE_desc_t}, you
-should be able to correctly fill the structure (good luck).
-@end enumerate
-
-In Step2, we use the first way to create the descriptor:
-@verbatim
-MORSE_Desc_Create(&descA, NULL, MorseRealDouble,
- NB, NB, NB*NB, N, N,
- 0, 0, N, N,
- 1, 1);
-@end verbatim
-
-@itemize @bullet
-
-@item @code{descA} is the descriptor to create.
-
-@item The second argument is a pointer to existing data.
-The existing data must follow LAPACK/PLASMA matrix layout @ref{Tile Data
-Layout} (1-D array column-major) if @code{MORSE_Desc_Create} is used to create
-the descriptor.
-The @code{MORSE_Desc_Create_User} function can be used if you have data
-organized differently.
-This is discussed in the next paragraph @ref{Step3}.
-Giving a @code{NULL} pointer means you let the function allocate memory space.
-This requires copying your data into the memory allocated by the
-@code{Desc_Create}. 
-This can be done with
-@verbatim
-MORSE_Lapack_to_Tile(A, N, descA);
-@end verbatim
-
-@item Third argument of @code{Desc_Create} is the datatype (used for memory
-allocation).
-
-@item Fourth argument until sixth argument stand for respectively, the number
-of rows (@code{NB}), columns (@code{NB}) in each tile, the total number of
-values in a tile (@code{NB*NB}), the number of rows (@code{N}), columns
-(@code{N}) in the entire matrix.
-
-@item Seventh argument until ninth argument stand for respectively, the
-beginning row (@code{0}), column (@code{0}) indexes of the submatrix and the
-number of rows (@code{N}), columns (@code{N}) in the submatrix.
-These arguments are specific and used in precise cases.
-If you do not consider submatrices, just use @code{0, 0, NROWS, NCOLS}.
-
-@item The two last arguments are the parameters of the 2-D block-cyclic distribution
-grid, see @uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK}.
-To be able to use other data distributions over the nodes, the
-@code{MORSE_Desc_Create_User} function should be used.
-
-@end itemize
-
-
-@node Step3
-@subsubsection Step3
-
-This program makes use of the same interface as Step2 (tile interface) but
-does not allocate LAPACK matrices anymore so that no copy between LAPACK matrix
-layout and tile matrix layout is necessary to call MORSE routines.
-To generate random right hand-sides you can use:
-@verbatim
-/* Allocate memory and initialize descriptor B */
-MORSE_Desc_Create(&descB, NULL, MorseRealDouble,
- NB, NB, NB*NB, N, NRHS,
- 0, 0, N, NRHS, 1, 1);
-/* generate RHS with random values */
-MORSE_dplrnt_Tile( descB, 5673 );
-@end verbatim
-
-The other important point is that it is possible to create a descriptor, the
-necessary structure to call MORSE efficiently, by giving your own pointer to
-tiles if your matrix is not organized as a 1-D array column-major.
-This can be achieved with the @code{MORSE_Desc_Create_User} routine. 
-Here is an example:
-@verbatim
-MORSE_Desc_Create_User(&descA, matA, MorseRealDouble,
- NB, NB, NB*NB, N, N,
- 0, 0, N, N, 1, 1,
- user_getaddr_arrayofpointers,
- user_getblkldd_arrayofpointers,
- user_getrankof_zero);
-@end verbatim
-The first arguments are the same as for the @code{MORSE_Desc_Create} routine.
-The following arguments allow you to give pointers to functions that manage the
-access to tiles from the structure given as second argument.
-Here for example, @code{matA} is an array containing addresses to tiles, see below.
-The three functions you have to define for @code{Desc_Create_User} are:
-@itemize @bullet
-@item a function that returns address of tile @math{A(m,n)}, m and n standing
-for the indexes of the tile in the global matrix. Let's consider a matrix
-@math{4x4} with tile size @math{2x2}, the matrix contains four tiles of
-indexes: @math{A(m=0,n=0)}, @math{A(m=0,n=1)}, @math{A(m=1,n=0)},
-@math{A(m=1,n=1)}
-@item a function that returns the leading dimension of tile @math{A(m,*)}
-@item a function that returns MPI rank of tile @math{A(m,n)}
-@end itemize
-Note that the way we define these functions is related to the tile matrix
-format and to the data distribution considered.
-This example should not be used with MPI since all tiles are assigned to
-process @code{0}, which means a large amount of data will be
-potentially transferred between nodes.
-
-@node Step4
-@subsubsection Step4
-This program is a copy of step2 but instead of using the tile interface, it
-uses the tile async interface.
-The goal is to exhibit the runtime synchronization barriers.
-Keep in mind that when the tile interface is called, like
-@code{MORSE_dpotrf_Tile}, a synchronization function, waiting for the actual
-execution and termination of all tasks, is called to ensure the
-proper completion of the algorithm (i.e. data are up-to-date).
-The code shows how to exploit the async interface to pipeline subsequent
-algorithms so that fewer synchronizations are done. 
-The code becomes:
-@verbatim
-/* Morse structure containing parameters and a structure to interact with
- * the Runtime system */
-MORSE_context_t *morse;
-/* MORSE sequence uniquely identifies a set of asynchronous function calls
- * sharing common exception handling */
-MORSE_sequence_t *sequence = NULL;
-/* MORSE request uniquely identifies each asynchronous function call */
-MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
-int status;
-
-...
-
-morse_sequence_create(morse, &sequence);
-
-/* Factorization: */
-MORSE_dpotrf_Tile_Async( UPLO, descA, sequence, &request );
-
-/* Solve: */
-MORSE_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);
-
-/* Synchronization barrier (the runtime ensures that all submitted tasks
- * have been terminated) */
-RUNTIME_barrier(morse);
-/* Ensure that all data processed on the gpus we are depending on are back
- * in main memory */
-RUNTIME_desc_getoncpu(descA);
-RUNTIME_desc_getoncpu(descX);
-
-status = sequence->status;
-
-@end verbatim
-Here the sequence of @code{dpotrf} and @code{dpotrs} algorithms is processed
-without synchronization so that some tasks of @code{dpotrf} and @code{dpotrs}
-can be concurrently executed which could increase performance.
-The async interface is very similar to the tile one.
-It is only necessary to give two new objects @code{MORSE_sequence_t} and
-@code{MORSE_request_t} used to handle asynchronous function calls.
-
-@center @image{potri_async,13cm,8cm}
-POTRI (POTRF, TRTRI, LAUUM) algorithm with and without synchronization
-barriers, courtesy of the @uref{http://icl.cs.utk.edu/plasma/, PLASMA} team.
-
-@node Step5
-@subsubsection Step5
-
-Step5 shows how to set some important parameters.
-This program is a copy of Step4 but some additional parameters are given by
-the user. 
-The parameters that can be set are:
-@itemize @bullet
-@item number of threads
-@item number of GPUs
-
-The number of workers can be given as argument to the executable with
-@option{--threads=} and @option{--gpus=} options.
-It is important to notice that we assign one thread per gpu to optimize data
-transfer between main memory and device memory.
-The number of workers of each type @code{CPU} and @code{CUDA} must be given at
-@code{MORSE_Init}.
-@verbatim
-if ( iparam[IPARAM_THRDNBR] == -1 ) {
- get_thread_count( &(iparam[IPARAM_THRDNBR]) );
- /* reserve one thread per cuda device to optimize memory transfers */
- iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
-}
-NCPU = iparam[IPARAM_THRDNBR];
-NGPU = iparam[IPARAM_NCUDAS];
-
-/* initialize MORSE with main parameters */
-MORSE_Init( NCPU, NGPU );
-@end verbatim
-
-@item matrix size
-@item number of right-hand sides
-@item block (tile) size
-
-The problem size is given with @option{--n=} and @option{--nrhs=} options.
-The tile size is given with option @option{--nb=}.
-These parameters are required to create descriptors.
-The tile size @code{NB} is a key parameter to get performance since it
-defines the granularity of tasks.
-If @code{NB} is too large compared to @code{N}, there are few tasks to
-schedule.
-If the number of workers is large, this limits parallelism.
-On the contrary, if @code{NB} is too small (@emph{i.e.} many small tasks),
-workers may not be correctly fed and the runtime system's operations
-could represent a substantial overhead.
-A trade-off has to be found depending on many parameters: problem size,
-algorithm (which drives data dependencies), architecture (number of workers,
-workers speed, workers uniformity, memory bus speed).
-By default it is set to 128.
-Do not hesitate to play with this parameter and compare performance on your
-machine.
-
-@item inner-blocking size
-
-The inner-blocking size is given with option @option{--ib=}. 
-This parameter is used by kernels (optimized algorithms applied on tiles) to -perform subsequent operations with data block-size that fits the cache of -workers. -Parameters @code{NB} and @code{IB} can be given with @code{MORSE_Set} function: -@verbatim -MORSE_Set(MORSE_TILE_SIZE, iparam[IPARAM_NB] ); -MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[IPARAM_IB] ); -@end verbatim -@end itemize - -@node Step6 -@subsubsection Step6 - -This program is a copy of Step5 with some additional parameters to be set for -the data distribution. -To use this program properly MORSE must use StarPU Runtime system and MPI -option must be activated at configure. -The data distribution used here is 2-D block-cyclic, see for example -@uref{http://www.netlib.org/scalapack/slug/node75.html, ScaLAPACK} for -explanation. -The user can enter the parameters of the distribution grid at execution with -@option{--p=} option. -Example using OpenMPI on four nodes with one process per node: -@example -mpirun -np 4 ./step6 --n=10000 --nb=320 --ib=64 \ - --threads=8 --gpus=2 --p=2 -@end example - -In this program we use the tile data layout from PLASMA so that the call -@verbatim -MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble, - NB, NB, NB*NB, N, N, - 0, 0, N, N, - GRID_P, GRID_Q, - morse_getaddr_ccrb, - morse_getblkldd_ccrb, - morse_getrankof_2d); -@end verbatim -is equivalent to the following call -@verbatim -MORSE_Desc_Create(&descA, NULL, MorseRealDouble, - NB, NB, NB*NB, N, N, - 0, 0, N, N, - GRID_P, GRID_Q); -@end verbatim -functions @code{morse_getaddr_ccrb}, @code{morse_getblkldd_ccrb}, -@code{morse_getrankof_2d} being used in @code{Desc_Create}. -It is interesting to notice that the code is almost the same as Step5. -The only additional information to give is the way tiles are distributed -through the third function given to @code{MORSE_Desc_Create_User}. 
-Here, because we have made experiments only with a 2-D block-cyclic
-distribution, we have parameters P and Q in the interface of @code{Desc_Create}
-but they make sense only for a 2-D block-cyclic distribution and thus when using the
-@code{morse_getrankof_2d} function.
-Of course it could be used with other distributions, being no longer the
-parameters of a 2-D block-cyclic grid but of another distribution.
-
-@node Step7
-@subsubsection Step7
-
-This program is a copy of step6 with some additional calls to
-build a matrix from within chameleon using a function provided by the user.
-This can be seen as a replacement of functions like @code{MORSE_dplgsy_Tile()} that can be used
-to fill the matrix with random data, @code{MORSE_dLapack_to_Tile()} to fill the matrix
-with data stored in a lapack-like buffer, or @code{MORSE_Desc_Create_User()} that can be used
-to describe an arbitrary tile matrix structure.
-In this example, the build callback functions are just wrappers around @code{CORE_xxx()} functions, so the output
-of the program step7 should be identical to that of step6.
-The difference is that the function used to fill the tiles is provided by the user,
-and therefore this approach is much more flexible.
-
-The new function to understand is @code{MORSE_dbuild_Tile}, e.g.
-@verbatim
-struct data_pl data_A={(double)N, 51, N};
-MORSE_dbuild_Tile(MorseUpperLower, descA, (void*)&data_A, Morse_build_callback_plgsy);
-@end verbatim
-The idea here is to let Chameleon fill the matrix data in a task-based fashion
-(parallel) by using a function given by the user.
-First, the user should define whether all the blocks must be entirely filled or just
-the upper/lower part with, e.g. @code{MorseUpperLower}.
-We still rely on the same structure @code{MORSE_desc_t} which must be
-initialized with the proper parameters, by calling for example
-@code{MORSE_Desc_Create}.
-Then, an opaque pointer is used to let the user give some extra data used by
-their function. 
-The last parameter is the pointer to the user's function.
-
-@node List of available routines
-@subsection List of available routines
-
-@menu
-* Auxiliary routines:: Init, Finalize, Version, etc.
-* Descriptor routines:: To handle descriptors
-* Options routines:: To set options
-* Sequences routines:: To manage asynchronous function calls
-* Linear Algebra routines:: Computational routines
-@end menu
-
-@node Auxiliary routines
-@subsubsection Auxiliary routines
-
-Reports MORSE version number.
-@verbatim
-int MORSE_Version (int *ver_major, int *ver_minor, int *ver_micro);
-@end verbatim
-
-Initialize MORSE: initialize some parameters, initialize the runtime and/or MPI.
-@verbatim
-int MORSE_Init (int nworkers, int ncudas);
-@end verbatim
-
-Finalize MORSE: free some data and finalize the runtime and/or MPI.
-@verbatim
-int MORSE_Finalize (void);
-@end verbatim
-
-Return the MPI rank of the calling process.
-@verbatim
-int MORSE_My_Mpi_Rank (void);
-@end verbatim
-
-Suspend MORSE runtime to poll for new tasks, to avoid useless CPU consumption when
-no tasks have to be executed by the MORSE runtime system.
-@verbatim
-int MORSE_Pause (void);
-@end verbatim
-
-Symmetrical call to MORSE_Pause, used to resume the workers polling for new tasks.
-@verbatim
-int MORSE_Resume (void);
-@end verbatim
-
-Conversion from LAPACK layout to tile layout.
-@verbatim
-int MORSE_Lapack_to_Tile (void *Af77, int LDA, MORSE_desc_t *A);
-@end verbatim
-
-Conversion from tile layout to LAPACK layout.
-@verbatim
-int MORSE_Tile_to_Lapack (MORSE_desc_t *A, void *Af77, int LDA);
-@end verbatim
-
-@node Descriptor routines
-@subsubsection Descriptor routines
-
-@c /* Descriptor */
-Create matrix descriptor, internal function.
-@verbatim
-int MORSE_Desc_Create (MORSE_desc_t **desc, void *mat, MORSE_enum dtyp,
-                       int mb, int nb, int bsiz, int lm, int ln,
-                       int i, int j, int m, int n, int p, int q);
-@end verbatim
-
-Create matrix descriptor, user function. 
-@verbatim -int MORSE_Desc_Create_User(MORSE_desc_t **desc, void *mat, MORSE_enum dtyp, - int mb, int nb, int bsiz, int lm, int ln, - int i, int j, int m, int n, int p, int q, - void* (*get_blkaddr)( const MORSE_desc_t*, int, int), - int (*get_blkldd)( const MORSE_desc_t*, int ), - int (*get_rankof)( const MORSE_desc_t*, int, int )); -@end verbatim - -Destroys matrix descriptor. -@verbatim -int MORSE_Desc_Destroy (MORSE_desc_t **desc); -@end verbatim - -Ensure that all data are up-to-date in main memory (even if some tasks have -been processed on GPUs) -@verbatim -int MORSE_Desc_Flush(MORSE_desc_t *desc, MORSE_sequence_t *sequence); -@end verbatim - -@node Options routines -@subsubsection Options routines - -@c /* Options */ -Enable MORSE feature. -@verbatim -int MORSE_Enable (MORSE_enum option); -@end verbatim -Feature to be enabled: -@itemize @bullet -@item @code{MORSE_WARNINGS}: printing of warning messages, -@item @code{MORSE_ERRORS}: printing of error messages, -@item @code{MORSE_AUTOTUNING}: autotuning for tile size and inner block size, -@item @code{MORSE_PROFILING_MODE}: activate kernels profiling. -@end itemize - -Disable MORSE feature. -@verbatim -int MORSE_Disable (MORSE_enum option); -@end verbatim -Symmetric to @code{MORSE_Enable}. - -Set MORSE parameter. -@verbatim -int MORSE_Set (MORSE_enum param, int value); -@end verbatim -Parameters to be set: -@itemize @bullet -@item @code{MORSE_TILE_SIZE}: size matrix tile, -@item @code{MORSE_INNER_BLOCK_SIZE}: size of tile inner block, -@item @code{MORSE_HOUSEHOLDER_MODE}: type of householder trees (FLAT or TREE), -@item @code{MORSE_HOUSEHOLDER_SIZE}: size of the groups in householder trees, -@item @code{MORSE_TRANSLATION_MODE}: related to the -@end itemize - -Get value of MORSE parameter. -@verbatim -int MORSE_Get (MORSE_enum param, int *value); -@end verbatim - -@node Sequences routines -@subsubsection Sequences routines - -@c /* Sequences */ -Create a sequence. 
-@verbatim -int MORSE_Sequence_Create (MORSE_sequence_t **sequence); -@end verbatim - -Destroy a sequence. -@verbatim -int MORSE_Sequence_Destroy (MORSE_sequence_t *sequence); -@end verbatim - -Wait for the completion of a sequence. -@verbatim -int MORSE_Sequence_Wait (MORSE_sequence_t *sequence); -@end verbatim - -@node Linear Algebra routines -@subsubsection Linear Algebra routines - -Routines computing linear algebra of the form -@code{MORSE_name[_Tile[_Async]]} (@code{name} follows LAPACK naming scheme, see -@uref{http://www.netlib.org/lapack/lug/node24.html} availables: - -@verbatim -/** - * Declarations of computational functions (LAPACK layout) - **/ -int MORSE_zgelqf(int M, int N, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT); - -int MORSE_zgelqs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); - -int MORSE_zgels(MORSE_enum trans, int M, int N, int NRHS, - MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, - MORSE_Complex64_t *B, int LDB); - -int MORSE_zgemm(MORSE_enum transA, MORSE_enum transB, int M, int N, int K, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, - MORSE_Complex64_t *C, int LDC); - -int MORSE_zgeqrf(int M, int N, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT); - -int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); - -int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descL, int *IPIV, - MORSE_Complex64_t *B, int LDB); - -int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descL, int *IPIV); - -int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA); - -int MORSE_zgetrs_incpiv(MORSE_enum trans, int N, int NRHS, - MORSE_Complex64_t *A, int 
LDA, - MORSE_desc_t *descL, int *IPIV, - MORSE_Complex64_t *B, int LDB); - -int MORSE_zgetrs_nopiv(MORSE_enum trans, int N, int NRHS, - MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -int MORSE_zhemm(MORSE_enum side, MORSE_enum uplo, int M, int N, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, - MORSE_Complex64_t *C, int LDC); - -int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K, - double alpha, MORSE_Complex64_t *A, int LDA, - double beta, MORSE_Complex64_t *C, int LDC); - -int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB, double beta, - MORSE_Complex64_t *C, int LDC); - -int MORSE_zlacpy(MORSE_enum uplo, int M, int N, - MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -double MORSE_zlange(MORSE_enum norm, int M, int N, - MORSE_Complex64_t *A, int LDA); - -double MORSE_zlanhe(MORSE_enum norm, MORSE_enum uplo, int N, - MORSE_Complex64_t *A, int LDA); - -double MORSE_zlansy(MORSE_enum norm, MORSE_enum uplo, int N, - MORSE_Complex64_t *A, int LDA); - -double MORSE_zlantr(MORSE_enum norm, MORSE_enum uplo, MORSE_enum diag, - int M, int N, MORSE_Complex64_t *A, int LDA); - -int MORSE_zlaset(MORSE_enum uplo, int M, int N, MORSE_Complex64_t alpha, - MORSE_Complex64_t beta, MORSE_Complex64_t *A, int LDA); - -int MORSE_zlauum(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); - -int MORSE_zplghe( double bump, MORSE_enum uplo, int N, - MORSE_Complex64_t *A, int LDA, - unsigned long long int seed ); - -int MORSE_zplgsy( MORSE_Complex64_t bump, MORSE_enum uplo, int N, - MORSE_Complex64_t *A, int LDA, - unsigned long long int seed ); - -int MORSE_zplrnt( int M, int N, MORSE_Complex64_t *A, int LDA, - unsigned long long int seed ); - -int MORSE_zposv(MORSE_enum uplo, int N, int NRHS, - MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - 
-int MORSE_zpotrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); - -int MORSE_zsytrf(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); - -int MORSE_zpotri(MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA); - -int MORSE_zpotrs(MORSE_enum uplo, int N, int NRHS, - MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -#if defined (PRECISION_c) || defined(PRECISION_z) -int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, - MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); -#endif - -int MORSE_zsymm(MORSE_enum side, MORSE_enum uplo, int M, int N, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, - MORSE_Complex64_t *C, int LDC); - -int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC); - -int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, - MORSE_Complex64_t *C, int LDC); - -int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - int N, int NRHS, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - int N, int NRHS, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -int MORSE_ztrsmpl(int N, int NRHS, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descL, int *IPIV, - MORSE_Complex64_t *B, int LDB); - -int MORSE_ztrsmrv(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - int N, int NRHS, - MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, - MORSE_Complex64_t *B, int LDB); - -int MORSE_ztrtri(MORSE_enum uplo, MORSE_enum diag, int N, - MORSE_Complex64_t *A, int LDA); - -int 
MORSE_zunglq(int M, int N, int K, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); - -int MORSE_zungqr(int M, int N, int K, MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT, MORSE_Complex64_t *B, int LDB); - -int MORSE_zunmlq(MORSE_enum side, MORSE_enum trans, int M, int N, int K, - MORSE_Complex64_t *A, int LDA, - MORSE_desc_t *descT, - MORSE_Complex64_t *B, int LDB); - -int MORSE_zunmqr(MORSE_enum side, MORSE_enum trans, int M, int N, int K, - MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descT, - MORSE_Complex64_t *B, int LDB); - -/** - * Declarations of computational functions (tile layout) - **/ -int MORSE_zgelqf_Tile(MORSE_desc_t *A, MORSE_desc_t *T); - -int MORSE_zgelqs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); - -int MORSE_zgels_Tile(MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B); - -int MORSE_zgemm_Tile(MORSE_enum transA, MORSE_enum transB, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C); - -int MORSE_zgeqrf_Tile(MORSE_desc_t *A, MORSE_desc_t *T); - -int MORSE_zgeqrs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); - -int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, - MORSE_desc_t *B); - -int MORSE_zgesv_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); - -int MORSE_zgetrf_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV); - -int MORSE_zgetrf_nopiv_Tile(MORSE_desc_t *A); - -int MORSE_zgetrs_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, - MORSE_desc_t *B); - -int MORSE_zgetrs_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); - -int MORSE_zhemm_Tile(MORSE_enum side, MORSE_enum uplo, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C); - -int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans, - double alpha, MORSE_desc_t *A, - double beta, MORSE_desc_t *C); - -int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans, - 
MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, double beta, MORSE_desc_t *C); - -int MORSE_zlacpy_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); - -double MORSE_zlange_Tile(MORSE_enum norm, MORSE_desc_t *A); - -double MORSE_zlanhe_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A); - -double MORSE_zlansy_Tile(MORSE_enum norm, MORSE_enum uplo, MORSE_desc_t *A); - -double MORSE_zlantr_Tile(MORSE_enum norm, MORSE_enum uplo, - MORSE_enum diag, MORSE_desc_t *A); - -int MORSE_zlaset_Tile(MORSE_enum uplo, MORSE_Complex64_t alpha, - MORSE_Complex64_t beta, MORSE_desc_t *A); - -int MORSE_zlauum_Tile(MORSE_enum uplo, MORSE_desc_t *A); - -int MORSE_zplghe_Tile(double bump, MORSE_enum uplo, MORSE_desc_t *A, - unsigned long long int seed); - -int MORSE_zplgsy_Tile(MORSE_Complex64_t bump, MORSE_enum uplo, MORSE_desc_t *A, - unsigned long long int seed ); - -int MORSE_zplrnt_Tile(MORSE_desc_t *A, unsigned long long int seed ); - -int MORSE_zposv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); - -int MORSE_zpotrf_Tile(MORSE_enum uplo, MORSE_desc_t *A); - -int MORSE_zsytrf_Tile(MORSE_enum uplo, MORSE_desc_t *A); - -int MORSE_zpotri_Tile(MORSE_enum uplo, MORSE_desc_t *A); - -int MORSE_zpotrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); - -#if defined (PRECISION_c) || defined(PRECISION_z) -int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); -#endif - -int MORSE_zsymm_Tile(MORSE_enum side, MORSE_enum uplo, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C); - -int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_Complex64_t beta, MORSE_desc_t *C); - -int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C); - -int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, 
- MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B); - -int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B); - -int MORSE_ztrsmpl_Tile(MORSE_desc_t *A, MORSE_desc_t *L, - int *IPIV, MORSE_desc_t *B); - -int MORSE_ztrsmrv_Tile(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B); - -int MORSE_ztrtri_Tile(MORSE_enum uplo, MORSE_enum diag, MORSE_desc_t *A); - -int MORSE_zunglq_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); - -int MORSE_zungqr_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); - -int MORSE_zunmlq_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, - MORSE_desc_t *T, MORSE_desc_t *B); - -int MORSE_zunmqr_Tile(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, - MORSE_desc_t *T, MORSE_desc_t *B); - -/** - * Declarations of computational functions - * (tile layout, asynchronous execution) - **/ -int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A, - MORSE_desc_t *T, MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgemm_Tile_Async(MORSE_enum transA, MORSE_enum transB, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_sequence_t *sequence, - MORSE_request_t *request) - -int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int 
MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, - int *IPIV, MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, - int *IPIV, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgetrs_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, - int *IPIV, MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zgetrs_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zhemm_Tile_Async(MORSE_enum side, MORSE_enum uplo, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, - double alpha, MORSE_desc_t *A, - double beta, MORSE_desc_t *C, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, double beta, MORSE_desc_t *C, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlanhe_Tile_Async(MORSE_enum norm, MORSE_enum uplo, - MORSE_desc_t *A, double *value, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlansy_Tile_Async(MORSE_enum norm, MORSE_enum uplo, - MORSE_desc_t *A, double *value, 
- MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlantr_Tile_Async(MORSE_enum norm, MORSE_enum uplo, - MORSE_enum diag, MORSE_desc_t *A, double *value, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlaset_Tile_Async(MORSE_enum uplo, MORSE_Complex64_t alpha, - MORSE_Complex64_t beta, MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zlauum_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zplghe_Tile_Async(double bump, MORSE_enum uplo, MORSE_desc_t *A, - unsigned long long int seed, - MORSE_sequence_t *sequence, - MORSE_request_t *request ); - -int MORSE_zplgsy_Tile_Async(MORSE_Complex64_t bump, MORSE_enum uplo, MORSE_desc_t *A, - unsigned long long int seed, - MORSE_sequence_t *sequence, - MORSE_request_t *request ); - -int MORSE_zplrnt_Tile_Async(MORSE_desc_t *A, unsigned long long int seed, - MORSE_sequence_t *sequence, - MORSE_request_t *request ); - -int MORSE_zposv_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zpotrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zsytrf_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zpotri_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zpotrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -#if defined (PRECISION_c) || defined(PRECISION_z) -int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, - MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); -#endif - -int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t 
*B, MORSE_Complex64_t beta, - MORSE_desc_t *C, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_Complex64_t beta, MORSE_desc_t *C, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_Complex64_t beta, - MORSE_desc_t *C, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_ztrsmpl_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_ztrsmrv_Tile_Async(MORSE_enum side, MORSE_enum uplo, - MORSE_enum transA, MORSE_enum diag, - MORSE_Complex64_t alpha, MORSE_desc_t *A, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_ztrtri_Tile_Async(MORSE_enum uplo, MORSE_enum diag, - MORSE_desc_t *A, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B, - MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans, - MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -int 
MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans, - MORSE_desc_t *A, MORSE_desc_t *T, - MORSE_desc_t *B, MORSE_sequence_t *sequence, - MORSE_request_t *request); - -@end verbatim - -@c -nofor_main diff --git a/doc/texinfo/figures/morse_header.png b/doc/texinfo/figures/morse_header.png deleted file mode 100644 index ada315a235dfd4ee4a35064e13ae0d680b480059..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/morse_header.png and /dev/null differ diff --git a/doc/texinfo/figures/potri_async.png b/doc/texinfo/figures/potri_async.png deleted file mode 100644 index 85ebe6ad9af3db6070cd898323400a8a584b7583..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/potri_async.png and /dev/null differ diff --git a/doc/texinfo/figures/tile_layout.jpg b/doc/texinfo/figures/tile_layout.jpg deleted file mode 100644 index 16a44b08afab7de2c15a75f200baf210c7fe6d3e..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/tile_layout.jpg and /dev/null differ diff --git a/doc/texinfo/figures/tile_layout.pdf b/doc/texinfo/figures/tile_layout.pdf deleted file mode 100644 index f5df80dbe06de18346c1df6c14a20c6e1c24edd1..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/tile_layout.pdf and /dev/null differ diff --git a/doc/texinfo/figures/tile_lu.jpg b/doc/texinfo/figures/tile_lu.jpg deleted file mode 100644 index 9da660ab607fae57cec55eb3c8ddc0512ea7fd62..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/tile_lu.jpg and /dev/null differ diff --git a/doc/texinfo/figures/tile_lu.pdf b/doc/texinfo/figures/tile_lu.pdf deleted file mode 100644 index c9b6df65197c83449c6335ebb1da393d92cd683f..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/tile_lu.pdf and /dev/null differ diff --git a/doc/texinfo/figures/trace_qr.jpg b/doc/texinfo/figures/trace_qr.jpg deleted file mode 100644 index 
92504d096fe829e3a0d9f2a296262c00cef3e792..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/trace_qr.jpg and /dev/null differ diff --git a/doc/texinfo/figures/trace_qr.pdf b/doc/texinfo/figures/trace_qr.pdf deleted file mode 100644 index e030af5e252dd26828aa156e17c6b1d240a534db..0000000000000000000000000000000000000000 Binary files a/doc/texinfo/figures/trace_qr.pdf and /dev/null differ diff --git a/doc/texinfo/morse.css b/doc/texinfo/morse.css deleted file mode 100644 index 41adb20078f7e5b0af0af434fa51bdb361af022f..0000000000000000000000000000000000000000 --- a/doc/texinfo/morse.css +++ /dev/null @@ -1,72 +0,0 @@ -body { - padding: 2em 1em 2em 70px; - margin: 0; - font-family: sans-serif; - color: black; - background: white; - background-position: top left; - background-attachment: fixed; - background-repeat: no-repeat; -} -:link { color: #00C; background: transparent } -:visited { color: #609; background: transparent } -a:active { color: #C00; background: transparent } - -a:link img, a:visited img { border-style: none } - -a img { color: white; } -@media all { - a img { color: inherit; } -} - -th, td { - font-family: sans-serif; -} - -h1, h2, h3, h4, h5, h6 { text-align: left } -h1, h2, h3 { color: #005A9C; background: white } -h1 { font: 170% sans-serif } -h2 { font: 140% sans-serif } -h3 { font: 120% sans-serif } -h4 { font: bold 100% sans-serif } -h5 { font: italic 100% sans-serif } -h6 { font: small-caps 100% sans-serif } - -.hide { display: none } - -div.head { margin-bottom: 1em } -div.head h1 { margin-top: 2em; clear: both } -div.head table { margin-left: 2em; margin-top: 2em } - -p.copyright { font-size: small } -p.copyright small { font-size: small } - -@media screen { -a[href]:hover { background: #ffa } -} - -pre { margin-left: 2em } - -dt, dd { margin-top: 0; margin-bottom: 0 } -dt { font-weight: bold } - -pre, code { font-family: monospace } - -ul.toc, ol.toc { - list-style: disc; - list-style: none; -} - -@media aural { - 
h1, h2, h3 { stress: 20; richness: 90 } - .hide { speak: none } - p.copyright { volume: x-soft; speech-rate: x-fast } - dt { pause-before: 20% } - pre { speak-punctuation: code } -} - -/* -body { - background-image: url(); -} -*/ diff --git a/doc/texinfo/users_guide.texi.in b/doc/texinfo/users_guide.texi.in deleted file mode 100644 index 79051a895a58462647682e37bd8986eb5d544ee2..0000000000000000000000000000000000000000 --- a/doc/texinfo/users_guide.texi.in +++ /dev/null @@ -1,150 +0,0 @@ -\input texinfo @c -*-texinfo-*- - -@c %**start of header -@setfilename users_guide.info -@settitle CHAMELEON User's Guide -@c %**end of header - -@include version.texi - -@c ############################################################################# - -@copying -Copyright @copyright{} 2017 Inria - -@noindent -Copyright @copyright{} 2014 The University of Tennessee - -@noindent -Copyright @copyright{} 2014 King Abdullah University of Science and Technology - -@quotation -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: -@itemize @bullet -@item -Redistributions of source code must retain the above copyright notice, this -list -of conditions and the following disclaimer. - -@item -Redistributions in binary form must reproduce the above copyright notice, this -list of conditions and the following disclaimer listed in this license in the -documentation and/or other materials provided with the distribution. - -@item -Neither the name of the copyright holders nor the names of its contributors may -be used to endorse or promote products derived from this software without -specific prior written permission. -@end itemize - -This software is provided by the copyright holders and contributors "as is" and -any express or implied warranties, including, but not limited to, the implied -warranties of merchantability and fitness for a particular purpose are -disclaimed. 
-In no event shall the copyright owner or contributors be liable for any direct, -indirect, incidental, special, exemplary, or consequential damages (including, -but not limited to, procurement of substitute goods or services; loss of use, -data, or profits; or business interruption) however caused and on any theory of -liability, whether in contract, strict liability, or tort (including negligence -or otherwise) arising in any way out of the use of this software, even if -advised of the possibility of such damage. -@end quotation -@end copying - -@c ############################################################################# - -@titlepage -@c @flushleft -@c @image{morse_header} -@c @end flushleft -@title CHAMELEON User's Guide -@subtitle Software of MORSE project - -@flushright -@strong{A dense linear algebra software for heterogeneous architectures} -@strong{Version @value{VERSION}} - -@strong{Inria} -@strong{University of Tennessee} -@strong{University of Colorado Denver} -@strong{King Abdullah University of Science and Technology} - -@end flushright - -@page -@vskip 0pt plus 1filll - -@insertcopying - -@end titlepage - -@c ############################################################################# - -@setchapternewpage odd -@dircategory Development -@direntry -* CHAMELEON: (chameleon). CHAMELEON User's Guide -@end direntry - - -@c @summarycontents -@contents -@page - -@ifnottex -@node Top -@top Preface - -This manual documents the usage of CHAMELEON version @value{VERSION}. -It was last updated on @value{UPDATED}. - -@insertcopying -@end ifnottex - -@comment -@comment When you add a new menu item, please keep the right hand -@comment aligned to the same column. Do not use tabs. This provides -@comment better formatting. 
-@comment -@menu -* Introduction:: Getting started -* Installing CHAMELEON:: How to configure, build and install CHAMELEON -* Configuring CHAMELEON:: How to configure CHAMELEON -* Using CHAMELEON:: How to run CHAMELEON application -@end menu - -@c --------------------------------------------------------------------- -@c Introduction to CHAMELEON -@c --------------------------------------------------------------------- - -@node Introduction -@chapter Introduction to CHAMELEON -@include @CMAKE_CURRENT_SOURCE_DIR@/chapters/introduction.texi - -@c --------------------------------------------------------------------- -@c Installing CHAMELEON -@c --------------------------------------------------------------------- - -@node Installing CHAMELEON -@chapter Installing CHAMELEON -@include @CMAKE_CURRENT_SOURCE_DIR@/chapters/installing.texi - -@c --------------------------------------------------------------------- -@c Configuration options -@c --------------------------------------------------------------------- - -@node Configuring CHAMELEON -@chapter Configuring CHAMELEON -@include @CMAKE_CURRENT_SOURCE_DIR@/chapters/configuration.texi - -@c --------------------------------------------------------------------- -@c Using CHAMELEON -@c --------------------------------------------------------------------- - -@node Using CHAMELEON -@chapter Using CHAMELEON -@include @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.texi - - -@bye diff --git a/doc/texinfo/version.texi.in b/doc/texinfo/version.texi.in deleted file mode 100644 index 4af718ef2db0e49e2646f7aa0f5bd3630dd0577f..0000000000000000000000000000000000000000 --- a/doc/texinfo/version.texi.in +++ /dev/null @@ -1,4 +0,0 @@ -@set UPDATED 30 January 2017 -@set UPDATED-MONTH January 2017 -@set EDITION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@ -@set VERSION @CHAMELEON_VERSION_MAJOR@.@CHAMELEON_VERSION_MINOR@.@CHAMELEON_VERSION_MICRO@