diff --git a/CMakeLists.txt b/CMakeLists.txt index b7288df348781310118754a1598614884950fae6..a35babdceb29a16e5a5f3dc1cd251e6a959b5dce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,31 +172,31 @@ endif() # Use intermediate variable since cmake_dependent_option doesn't have OR conditions set(CHAMELEON_ENABLE_MPI OFF CACHE INTERNAL "Tells if MPI might be supported by the runtime") -if ( CHAMELEON_SCHED_PARSEC OR CHAMELEON_SCHED_STARPU ) - set(CHAMELEON_ENABLE_MPI ON FORCE) +if ( CHAMELEON_SCHED_STARPU ) + set(CHAMELEON_ENABLE_MPI ON) endif() # Use intermediate variable since cmake_dependent_option doesn't have OR conditions set(CHAMELEON_ENABLE_CUDA OFF CACHE INTERNAL "Tells if CUDA might be supported by the runtime") if ( CHAMELEON_SCHED_PARSEC OR CHAMELEON_SCHED_STARPU ) - set(CHAMELEON_ENABLE_CUDA ON FORCE) + set(CHAMELEON_ENABLE_CUDA ON) endif() set(CHAMELEON_ENABLE_HIP OFF CACHE INTERNAL "Tells if HIP might be supported by the runtime") if ( CHAMELEON_SCHED_STARPU ) - set(CHAMELEON_ENABLE_HIP ON FORCE) + set(CHAMELEON_ENABLE_HIP ON) endif() # Additional options # ------------------ -# Enable the distributed interface (allowed only when StarPU or PaRSEC is enabled) +# Enable the distributed interface (allowed only when StarPU is enabled) # TODO: Default should be changed to ON/OFF when it will be ok cmake_dependent_option(CHAMELEON_USE_MPI "Enable distributed memory through MPI" OFF "CHAMELEON_ENABLE_MPI" OFF) if (CHAMELEON_ENABLE_MPI AND NOT CHAMELEON_USE_MPI) - message("-- ${BoldGreen}CHAMELEON_USE_MPI is set to OFF, turn it ON to use MPI (unsupported by Quark)${ColourReset}") + message("-- ${BoldGreen}CHAMELEON_USE_MPI is set to OFF, turn it ON to use MPI (only supported with StarPU)${ColourReset}") endif() cmake_dependent_option(CHAMELEON_USE_MIGRATE @@ -213,7 +213,7 @@ cmake_dependent_option(CHAMELEON_USE_CUDA "Enable CUDA kernels" OFF "CHAMELEON_ENABLE_CUDA" OFF) if (CHAMELEON_ENABLE_CUDA AND NOT CHAMELEON_USE_CUDA) - message("-- 
${BoldGreen}CHAMELEON_USE_CUDA is set to OFF, turn it ON to use CUDA (unsupported by Quark)${ColourReset}") + message("-- ${BoldGreen}CHAMELEON_USE_CUDA is set to OFF, turn it ON to use CUDA (unsupported by OpenMP and Quark)${ColourReset}") endif() cmake_dependent_option(CHAMELEON_USE_HIP_CUDA @@ -551,6 +551,11 @@ if(NOT CHAMELEON_SIMULATION) morse_export_imported_target(MPI MPI_C mpi chameleon) + if (CHAMELEON_SCHED STREQUAL "PARSEC") + message(FATAL_ERROR "CHAMELEON does not support PARSEC with MPI." + " To use PARSEC set CHAMELEON_USE_MPI=OFF.") + endif() + endif (CHAMELEON_USE_MPI) else (NOT CHAMELEON_SIMULATION) diff --git a/doc/user/chameleon.dot b/doc/user/chameleon.dot index 6cbc54d892d922b74c23952fe953084a9fe6f03c..99b65026d47ca04a583865193092336d72abc411 100644 --- a/doc/user/chameleon.dot +++ b/doc/user/chameleon.dot @@ -27,7 +27,6 @@ chameleon -> sched [lhead=cluster_runtime] sched:sched_starpu -> cuda [color=dimgrey] sched:sched_starpu -> mpi [color=dimgrey] sched:sched_parsec -> cuda [color=dimgrey] -sched:sched_parsec -> mpi [color=dimgrey] subgraph cluster_paradigm { label = "Paradigms" diff --git a/doc/user/chameleon.svg b/doc/user/chameleon.svg index 4a51dd62fcb01bd34df3e301af8ce6c7c6d61016..d1f9c6fc5d16720a383e7bc34f3fe10a5731e7bd 100644 --- a/doc/user/chameleon.svg +++ b/doc/user/chameleon.svg @@ -15,8 +15,8 @@ </g> <g id="clust2" class="cluster"> <title>cluster_paradigm</title> -<path fill="transparent" stroke="black" d="M191,-8C191,-8 317,-8 317,-8 323,-8 329,-14 329,-20 329,-20 329,-71 329,-71 329,-77 323,-83 317,-83 317,-83 191,-83 191,-83 185,-83 179,-77 179,-71 179,-71 179,-20 179,-20 179,-14 185,-8 191,-8"/> -<text text-anchor="middle" x="254" y="-67.8" font-family="Times,serif" font-size="14.00">Paradigms</text> +<path fill="transparent" stroke="black" d="M208,-8C208,-8 334,-8 334,-8 340,-8 346,-14 346,-20 346,-20 346,-71 346,-71 346,-77 340,-83 334,-83 334,-83 208,-83 208,-83 202,-83 196,-77 196,-71 196,-71 196,-20 196,-20 196,-14 
202,-8 208,-8"/> +<text text-anchor="middle" x="271" y="-67.8" font-family="Times,serif" font-size="14.00">Paradigms</text> </g> <g id="clust3" class="cluster"> <title>cluster_kernel</title> @@ -50,14 +50,14 @@ <!-- mpi --> <g id="node4" class="node"> <title>mpi</title> -<polygon fill="none" stroke="gold" stroke-width="2" points="241,-52 187,-52 187,-16 241,-16 241,-52"/> -<text text-anchor="start" x="197" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">MPI</text> +<polygon fill="none" stroke="gold" stroke-width="2" points="338,-52 284,-52 284,-16 338,-16 338,-52"/> +<text text-anchor="start" x="294" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">MPI</text> </g> <!-- chameleon->mpi --> -<g id="edge10" class="edge"> +<g id="edge9" class="edge"> <title>chameleon->mpi</title> -<path fill="none" stroke="dimgrey" d="M359.38,-174.94C355.97,-172.58 352.78,-169.94 350,-167 324.84,-140.35 347.29,-113.02 318,-91 296.72,-75 280.81,-92.5 259.5,-86.95"/> -<polygon fill="dimgrey" stroke="dimgrey" points="260.58,-83.61 250,-83 257.89,-90.07 260.58,-83.61"/> +<path fill="none" stroke="dimgrey" d="M358.3,-174.72C355.24,-172.43 352.42,-169.86 350,-167 340.22,-155.41 330.55,-122.87 323.28,-92.99"/> +<polygon fill="dimgrey" stroke="dimgrey" points="326.63,-91.92 320.92,-83 319.82,-93.54 326.63,-91.92"/> </g> <!-- cublas --> <g id="node5" class="node"> @@ -66,7 +66,7 @@ <text text-anchor="start" x="370.5" y="-114.8" font-family="Times,serif" font-weight="bold" font-size="14.00">cuBLAS</text> </g> <!-- chameleon->cublas --> -<g id="edge11" class="edge"> +<g id="edge10" class="edge"> <title>chameleon->cublas</title> <path fill="none" stroke="black" d="M401,-174.95C401,-166.3 401,-155.57 401,-145.79"/> <polygon fill="black" stroke="black" points="404.5,-145.71 401,-135.71 397.5,-145.71 404.5,-145.71"/> @@ -78,7 +78,7 @@ <text text-anchor="start" x="465.5" y="-114.8" font-family="Times,serif" font-weight="bold" 
font-size="14.00">LAPACKE</text> </g> <!-- chameleon->lapacke --> -<g id="edge9" class="edge"> +<g id="edge8" class="edge"> <title>chameleon->lapacke</title> <path fill="none" stroke="black" d="M436.21,-174.98C440.62,-172.47 444.98,-169.78 449,-167 459.25,-159.89 469.65,-151.03 478.55,-142.83"/> <polygon fill="black" stroke="black" points="481.07,-145.27 485.94,-135.86 476.27,-140.18 481.07,-145.27"/> @@ -90,7 +90,7 @@ <text text-anchor="start" x="481" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">CBLAS</text> </g> <!-- chameleon->cblas --> -<g id="edge8" class="edge"> +<g id="edge7" class="edge"> <title>chameleon->cblas</title> <path fill="none" stroke="black" d="M453.77,-190.72C494,-188.5 544.7,-182.59 558,-167 579.93,-141.31 570.04,-122.56 558,-91 553.46,-79.11 545.11,-68.14 536.52,-59.12"/> <polygon fill="black" stroke="black" points="538.9,-56.55 529.32,-52.03 533.98,-61.54 538.9,-56.55"/> @@ -98,41 +98,35 @@ <!-- cuda --> <g id="node3" class="node"> <title>cuda</title> -<polygon fill="none" stroke="gold" stroke-width="2" points="321,-52 259,-52 259,-16 321,-16 321,-52"/> -<text text-anchor="start" x="267" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">CUDA</text> +<polygon fill="none" stroke="gold" stroke-width="2" points="266,-52 204,-52 204,-16 266,-16 266,-52"/> +<text text-anchor="start" x="212" y="-31.3" font-family="Times,serif" font-weight="bold" font-size="14.00">CUDA</text> </g> <!-- sched->cuda --> <g id="edge2" class="edge"> <title>sched:sched_starpu->cuda</title> -<path fill="none" stroke="dimgrey" d="M273,-98.5C273,-86.18 276.02,-72.96 279.52,-61.79"/> -<polygon fill="dimgrey" stroke="dimgrey" points="282.88,-62.78 282.79,-52.19 276.25,-60.52 282.88,-62.78"/> +<path fill="none" stroke="dimgrey" d="M273,-98.5C273,-84.71 266.32,-71.33 258.57,-60.43"/> +<polygon fill="dimgrey" stroke="dimgrey" points="261.12,-58.01 252.22,-52.26 255.59,-62.3 261.12,-58.01"/> </g> <!-- sched->cuda --> <g id="edge4" 
class="edge"> <title>sched:sched_parsec->cuda</title> -<path fill="none" stroke="dimgrey" d="M131,-98.5C131,-71.83 226.63,-95.85 250,-83 259.7,-77.67 268.01,-68.97 274.52,-60.42"/> -<polygon fill="dimgrey" stroke="dimgrey" points="277.5,-62.26 280.39,-52.07 271.78,-58.23 277.5,-62.26"/> +<path fill="none" stroke="dimgrey" d="M131,-98.5C131,-67.65 164.74,-51.35 193.78,-43.02"/> +<polygon fill="dimgrey" stroke="dimgrey" points="195.01,-46.32 203.8,-40.4 193.24,-39.55 195.01,-46.32"/> </g> <!-- sched->mpi --> <g id="edge3" class="edge"> <title>sched:sched_starpu->mpi</title> -<path fill="none" stroke="dimgrey" d="M273,-98.5C273,-86.17 258.93,-91.49 250,-83 243.05,-76.4 236.45,-68.34 230.84,-60.73"/> -<polygon fill="dimgrey" stroke="dimgrey" points="233.38,-58.26 224.75,-52.11 227.66,-62.3 233.38,-58.26"/> -</g> -<!-- sched->mpi --> -<g id="edge5" class="edge"> -<title>sched:sched_parsec->mpi</title> -<path fill="none" stroke="dimgrey" d="M131,-98.5C131,-73.42 155.21,-56.91 177.47,-47.02"/> -<polygon fill="dimgrey" stroke="dimgrey" points="178.9,-50.22 186.82,-43.19 176.24,-43.75 178.9,-50.22"/> +<path fill="none" stroke="dimgrey" d="M273,-98.5C273,-84.71 279.68,-71.33 287.43,-60.43"/> +<polygon fill="dimgrey" stroke="dimgrey" points="290.41,-62.3 293.78,-52.26 284.88,-58.01 290.41,-62.3"/> </g> <!-- cublas->cuda --> -<g id="edge7" class="edge"> +<g id="edge6" class="edge"> <title>cublas->cuda</title> -<path fill="none" stroke="black" d="M373.95,-99.4C366.12,-94.28 357.63,-88.54 350,-83 339.46,-75.35 328.23,-66.55 318.37,-58.59"/> -<polygon fill="black" stroke="black" points="320.42,-55.75 310.45,-52.15 316,-61.18 320.42,-55.75"/> +<path fill="none" stroke="black" d="M362.41,-103.64C348.51,-99.28 332.64,-94.6 318,-91 299.12,-86.36 291.77,-92.83 275,-83 265.59,-77.49 257.41,-68.91 250.93,-60.52"/> +<polygon fill="black" stroke="black" points="253.72,-58.39 245.05,-52.31 248.03,-62.47 253.72,-58.39"/> </g> <!-- lapacke->cblas --> -<g id="edge6" class="edge"> +<g 
id="edge5" class="edge"> <title>lapacke->cblas</title> <path fill="none" stroke="black" d="M504.06,-99.22C504.72,-88.52 505.57,-74.55 506.32,-62.37"/> <polygon fill="black" stroke="black" points="509.82,-62.45 506.94,-52.26 502.84,-62.03 509.82,-62.45"/> diff --git a/doc/user/chapters/installing.org b/doc/user/chapters/installing.org index c28bee958a769afd6e0d4f857a53f1bb9b756f22..de4e616663832344647386f10e9ac4b481bc7e1f 100644 --- a/doc/user/chapters/installing.org +++ b/doc/user/chapters/installing.org @@ -208,7 +208,7 @@ Finally some packages or also available for [[sec:ug:debian][Debian/Ubuntu]] and ***** PaRSEC [[http://icl.utk.edu/parsec/][PaRSEC]] is a generic framework for architecture aware scheduling and management of micro-tasks on distributed many-core - heterogeneous architectures. It can be used with MPI and Cuda. + heterogeneous architectures. *Caution about the compatibility:* Chameleon is compatible with this version @@ -217,9 +217,9 @@ Finally some packages or also available for [[sec:ug:debian][Debian/Ubuntu]] and [[http://icl.cs.utk.edu/quark/][QUARK]] (QUeuing And Runtime for Kernels) provides a library that enables the dynamic execution of tasks with data dependencies in a multi-core, multi-socket, shared-memory environment. When - Chameleon is linked with QUARK, it is not possible to exploit + Chameleon is linked with QUARK or OpenMP, it is not possible to exploit neither CUDA (for GPUs) nor MPI (distributed-memory environment). - You can use PaRSEC or StarPU to do so. + You can use StarPU to do so. *Caution about the compatibility:* Chameleon has been mainly tested with the QUARK library coming from https://github.com/ecrc/quark. 
@@ -366,7 +366,6 @@ Finally some packages or also available for [[sec:ug:debian][Debian/Ubuntu]] and * *CHAMELEON_USE_MPI=ON|OFF* (default OFF): to link with MPI library (message passing implementation for use of multiple nodes with distributed memory), can only be used with StarPU - and PaRSEC * *CHAMELEON_USE_CUDA=ON|OFF* (default OFF): to link with CUDA runtime (implementation paradigm for accelerated codes on Nvidia GPUs) and cuBLAS library (optimized BLAS kernels on Nvidia GPUs), can only @@ -460,6 +459,9 @@ Finally some packages or also available for [[sec:ug:debian][Debian/Ubuntu]] and Note that PaRSEC and StarPU are only detected with pkg-config mechanism because it is always provided and this avoids errors. + The [[https://cmake.org/cmake/help/latest/variable/CMAKE_PREFIX_PATH.html#variable:CMAKE_PREFIX_PATH][CMAKE_PREFIX_PATH]] + can be used to indicate where dependencies are installed. + *** Distribution Debian :PROPERTIES: :CUSTOM_ID: doc-install-debian diff --git a/doc/user/chapters/introduction.org b/doc/user/chapters/introduction.org index 86821edecebffef514ab0b73bab7d8bc6b13cec4..8802d35c3fcc67219fde0666dbca5b567b2a81ed 100644 --- a/doc/user/chapters/introduction.org +++ b/doc/user/chapters/introduction.org @@ -139,11 +139,14 @@ Chameleon is based on the [[http://icl.cs.utk.edu/plasma/][PLASMA]] source code but is not limited to shared-memory environment and can exploit multiple GPUs. - Chameleon is interfaced in a generic way with [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]], [[http://icl.utk.edu/parsec/][PaRSEC]], - [[http://icl.cs.utk.edu/quark/][QUARK]] runtime systems. This feature allows to analyze in a - unified framework how sequential task-based algorithms behave + Chameleon is interfaced in a generic way with + [[http://runtime.bordeaux.inria.fr/StarPU/][StarPU]], + [[http://icl.utk.edu/parsec/][PaRSEC]], + [[https://www.openmp.org/][OpenMP]], + [[http://icl.cs.utk.edu/quark/][QUARK]] runtime systems. 
+ This feature allows to analyze in a unified framework how sequential task-based algorithms behave regarding different runtime systems implementations. Using - Chameleon with *StarPU* or *PaRSEC* runtime systems allows to exploit + Chameleon with *StarPU* runtime system allows to exploit GPUs through kernels provided by [[https://developer.nvidia.com/cublas][cuBLAS]] and clusters of interconnected nodes with distributed memory (using [[http://www.open-mpi.org/][MPI]]). Computation of very large systems with dense matrices on a cluster diff --git a/doc/user/homepage.org b/doc/user/homepage.org index f2a364273bce421c2f83f3dc7db968c10142d5cb..b00bde6dc1f35379bcf09b5efa6419279969347c 100644 --- a/doc/user/homepage.org +++ b/doc/user/homepage.org @@ -27,7 +27,7 @@ StarPU, PaRSEC, QUARK, OpenMP runtime systems. This feature allows to analyze in a unified framework how sequential task-based algorithms behave regarding different runtime systems - implementations. Using Chameleon with StarPU or PaRSEC runtime - systems allows to exploit GPUs through kernels provided by + implementations. Using Chameleon with StarPU runtime + system allows to exploit GPUs through kernels provided by cuBLAS and clusters of interconnected nodes with distributed memory (using MPI).