diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e01e26e32b69683fa226bb64ddc2b5e840a9fa28..270076d745be68d22de64561940f0b22fa4ed86f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,6 +14,7 @@ include: - .gitlab/docker.yml - .gitlab/build.yml - .gitlab/test_starpu.yml + - .gitlab/test_starpu_plafrim.yml - .gitlab/test_starpu_simgrid.yml - .gitlab/test_parsec.yml - .gitlab/test_quark.yml diff --git a/.gitlab/build.sh b/.gitlab/build.sh index 0d17fd1444511508505ac74072203612484ff18f..e631b37f26033fb772a37cc2bba77796a841d549 100755 --- a/.gitlab/build.sh +++ b/.gitlab/build.sh @@ -59,7 +59,7 @@ case $SYSTEM in esac # Compile -eval '${SCAN}cmake --build build-${VERSION} -j 4 > /dev/null' +eval '${SCAN}cmake --build build-${VERSION} -j ${CMAKE_BUILD_PARALLEL_LEVEL} > /dev/null' # Install cmake --install build-${VERSION} diff --git a/.gitlab/build.yml b/.gitlab/build.yml index e5d6743322aec15b494fc88129fb6f2672b926b3..2d449df9d37a7f6c3c927944d96c1497f02e010f 100644 --- a/.gitlab/build.yml +++ b/.gitlab/build.yml @@ -2,6 +2,8 @@ .build_script_common: stage: build extends: .only-master-mr + variables: + CMAKE_BUILD_PARALLEL_LEVEL: 4 artifacts: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week @@ -10,6 +12,8 @@ .build_script_linux: extends: .build_script_common + variables: + SYSTEM: linux script: - bash .gitlab/build.sh | tee ${LOGNAME}.log @@ -56,29 +60,31 @@ build_starpu: .build_script_guix: tags: ['plafrim'] extends: .build_script_common + variables: + SYSTEM: guix + CMAKE_BUILD_PARALLEL_LEVEL: 20 script: - guix describe -f channels - - guix shell --pure --preserve=SYSTEM --preserve=VERSION --preserve=LOGNAME --preserve=BUILD_OPTIONS - -D chameleon-${GUIX_CHAMELEON_VARIANT} coreutils bash-minimal -- + - guix shell --pure --preserve=SYSTEM --preserve=VERSION --preserve=LOGNAME --preserve=BUILD_OPTIONS --preserve=CMAKE_BUILD_PARALLEL_LEVEL + -D chameleon-${GPU_BACKEND} ${BLAS} coreutils bash-minimal -- bash .gitlab/build.sh | tee ${LOGNAME}.log 
build_starpu_cuda: extends: .build_script_guix variables: - SYSTEM: guix VERSION: starpu_cuda - GUIX_CHAMELEON_VARIANT: cuda + GPU_BACKEND: cuda + BLAS: "--with-input=openblas=intel-oneapi-mkl" LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}" - BUILD_OPTIONS: "-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=ON -DBLA_VENDOR=OpenBLAS" + BUILD_OPTIONS: "-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=OFF -DBLA_VENDOR=Intel10_64lp_seq" build_starpu_hip: extends: .build_script_guix variables: - SYSTEM: guix VERSION: starpu_hip - GUIX_CHAMELEON_VARIANT: hip + GPU_BACKEND: hip LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}" - BUILD_OPTIONS: "-DCHAMELEON_USE_HIP_ROC=ON -DCHAMELEON_USE_MPI=ON -DBLA_VENDOR=OpenBLAS -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran" + BUILD_OPTIONS: "-DCHAMELEON_USE_HIP_ROC=ON -DCHAMELEON_USE_MPI=OFF -DBLA_VENDOR=OpenBLAS -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran" build_starpu_simgrid: extends: .build_script_linux @@ -97,11 +103,13 @@ build_starpu_macosx: SYSTEM: macosx VERSION: starpu LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}" + CMAKE_BUILD_PARALLEL_LEVEL: 4 script: - bash .gitlab/build.sh | tee ${LOGNAME}.log artifacts: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week + when: always paths: - ${LOGNAME}.log cache: @@ -119,11 +127,13 @@ build_starpu_macosx: MSYSTEM: UCRT64 VERSION: starpu LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}" + CMAKE_BUILD_PARALLEL_LEVEL: 4 script: - bash -lc .gitlab/build.sh | tee "$env:LOGNAME.log" artifacts: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week + when: always paths: - ${LOGNAME}.log cache: diff --git a/.gitlab/common.yml b/.gitlab/common.yml index 9e679f1c4e1b4eabe97865e7d1ff3a9fbaa57244..f4e30eb4b89a16504b85edf3900a0740fb108427 100644 --- a/.gitlab/common.yml +++ b/.gitlab/common.yml @@ -37,5 +37,6 @@ variables: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week untracked: true + when: always reports: junit: ${LOGNAME}-junit.xml diff --git 
a/.gitlab/sbatch.sh b/.gitlab/sbatch.sh new file mode 100755 index 0000000000000000000000000000000000000000..d202d068699b939a985bbc120534955f41511ed6 --- /dev/null +++ b/.gitlab/sbatch.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# sbatch.sh : submit slurm jobs and wait for completion before exiting +set -x + +if [ $# -gt 0 ] +then + JOB_NAME=$1 +fi +JOB_NAME=${JOB_NAME:-chameleon} + +# to get kernels execution on both cpus and gpus +export STARPU_SCHED=random + +# execution commands +sbatch --wait \ + --job-name="$JOB_NAME" \ + --output="$JOB_NAME.out" \ + --nodes=1 \ + --exclusive --ntasks-per-node=1 --threads-per-core=1 \ + --constraint="$SLURM_CONSTRAINTS" \ + --time=01:00:00 \ + "$(dirname "$0")/test.sh" +# get the error code from the last command: sbatch --wait ... +err=$? + +cat "$JOB_NAME.out" + +# exit with the error code from the sbatch command +exit $err diff --git a/.gitlab/test.sh b/.gitlab/test.sh index 0a4a8639fa4a54d1121f149e57264ed3ad2338fa..27ba5484ba023eaec8bad5cf2018da8c4008b97f 100755 --- a/.gitlab/test.sh +++ b/.gitlab/test.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash - set -e set -x diff --git a/.gitlab/test_starpu.yml b/.gitlab/test_starpu.yml index efc3bd783aa0b64355500d6bb8a5fa4b233f5e59..1adc4997f75890d7fe557417a8b46855cecba0ce 100644 --- a/.gitlab/test_starpu.yml +++ b/.gitlab/test_starpu.yml @@ -1,5 +1,5 @@ --- -.test_starpu: +.test_script_starpu: extends: .test_script_linux needs: [build_starpu] variables: @@ -9,7 +9,7 @@ TESTS_RESTRICTION: "-R _${CATEGORY}_${PRECISION}|example -E sytrf|sysv" test_starpu_master: - extends: .test_starpu + extends: .test_script_starpu rules: - if: ($CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH =~ /^ci-.*$/) && $CI_PIPELINE_SOURCE != "schedule" parallel: @@ -18,7 +18,7 @@ test_starpu_master: CATEGORY: [shm, mpi] test_starpu_mr: - extends: .test_starpu + extends: .test_script_starpu rules: - if: ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME !~ /^notest-.*$/) parallel: @@ -43,6 
+43,7 @@ test_starpu_macosx: artifacts: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week + when: always paths: - ${LOGNAME}.log reports: @@ -69,6 +70,7 @@ test_starpu_macosx: artifacts: name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" expire_in: 1 week + when: always paths: - ${LOGNAME}.log reports: diff --git a/.gitlab/test_starpu_plafrim.yml b/.gitlab/test_starpu_plafrim.yml new file mode 100644 index 0000000000000000000000000000000000000000..c5178e6cac3910716b87ed85d3cb795413c0a0b9 --- /dev/null +++ b/.gitlab/test_starpu_plafrim.yml @@ -0,0 +1,46 @@ +--- +.test_script_starpu_plafrim: + tags: ['plafrim'] + stage: test + variables: + SYSTEM: guix + script: + - guix describe -f channels | tee guix-channels.scm + - guix shell --pure --preserve=SYSTEM --preserve=VERSION --preserve=LOGNAME --preserve=BUILD_OPTIONS --preserve=TESTS_RESTRICTION --preserve=SLURM_CONSTRAINTS --preserve=LD_PRELOAD + -D chameleon-${GPU_BACKEND} ${BLAS} slurm coreutils inetutils bash-minimal -- + bash .gitlab/sbatch.sh ${LOGNAME} | tee ${LOGNAME}.log + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + expire_in: 1 week + when: always + paths: + - guix-channels.scm + - ${LOGNAME}.log + reports: + junit: ${LOGNAME}-junit.xml + +.test_script_starpu_plafrim_cuda: + extends: .test_script_starpu_plafrim + needs: [build_starpu_cuda] + variables: + VERSION: starpu_cuda + GPU_BACKEND: cuda + BLAS: "--with-input=openblas=intel-oneapi-mkl" + LOGNAME: "chameleon-${SYSTEM}-${VERSION}" + BUILD_OPTIONS: "-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=OFF -DBLA_VENDOR=Intel10_64lp_seq" + SLURM_CONSTRAINTS: "sirocco" + LD_PRELOAD: "/usr/lib64/libcuda.so" + +test_starpu_cuda_master: + extends: .test_script_starpu_plafrim_cuda + rules: + - if: ($CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH =~ /^ci-.*$/) && $CI_PIPELINE_SOURCE != "schedule" + variables: + TESTS_RESTRICTION: "-R test_shm_gpu" + +test_starpu_cuda_mr: + extends: .test_script_starpu_plafrim_cuda + rules: + - if: 
($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME !~ /^notest-.*$/) + variables: + TESTS_RESTRICTION: "-R test_shm_gpu_d|test_shm_gpu_c" diff --git a/compute/pztradd.c b/compute/pztradd.c index 5c092c937be0d58683b8e2ce570e0c49e022d022..78d7a3d282e8a4d08bd57b91ffe2935fe337210e 100644 --- a/compute/pztradd.c +++ b/compute/pztradd.c @@ -29,16 +29,17 @@ /** * Parallel tile matrix-matrix multiplication - dynamic scheduling */ -void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, - CHAMELEON_Complex64_t beta, CHAM_desc_t *B, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +void chameleon_pztradd( cham_uplo_t uplo, cham_trans_t trans, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, + CHAMELEON_Complex64_t beta, CHAM_desc_t *B, + RUNTIME_sequence_t *sequence, + RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; int tempmm, tempnn, tempmn, tempnm; - int m, n; + int m, n, minmn; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -46,10 +47,12 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, } RUNTIME_options_init(&options, chamctxt, sequence, request); + minmn = chameleon_min( B->mt, B->nt ); + switch(uplo){ case ChamLower: if (trans == ChamNoTrans) { - for (n = 0; n < chameleon_min(B->mt,B->nt); n++) { + for (n = 0; n < minmn; n++) { tempnm = n == B->mt-1 ? B->m-n*B->mb : B->mb; tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; @@ -72,8 +75,8 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, } else { for (n = 0; n < chameleon_min(B->mt,B->nt); n++) { - tempnm = n == B->mt-1 ? B->m-n*B->mb : B->mb; - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + tempnm = n == B->mt-1 ? B->m - n * B->mb : B->mb; + tempnn = n == B->nt-1 ? 
B->n - n * B->nb : B->nb; INSERT_TASK_ztradd( &options, @@ -95,9 +98,9 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, break; case ChamUpper: if (trans == ChamNoTrans) { - for (m = 0; m < chameleon_min(B->mt,B->nt); m++) { - tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb; - tempmn = m == B->nt-1 ? B->n-m*B->nb : B->nb; + for (m = 0; m < minmn; m++) { + tempmm = m == B->mt-1 ? B->m - m * B->mb : B->mb; + tempmn = m == B->nt-1 ? B->n - m * B->nb : B->nb; INSERT_TASK_ztradd( &options, @@ -106,7 +109,7 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, beta, B(m, m)); for (n = m+1; n < B->nt; n++) { - tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; + tempnn = n == B->nt-1 ? B->n - n * B->nb : B->nb; INSERT_TASK_zgeadd( &options, diff --git a/compute/zgepdf_qr.c b/compute/zgepdf_qr.c index 35622025f4e1e051222f4f4515a08d0c952c9f19..d138b051773768788421ceb691c7c59d3e433617 100644 --- a/compute/zgepdf_qr.c +++ b/compute/zgepdf_qr.c @@ -82,10 +82,12 @@ int CHAMELEON_zgepdf_qr_Tile( int doqr, int optid, CHAM_desc_t *A1, CHAM_desc_t *TS1, CHAM_desc_t *TT1, CHAM_desc_t *Q1, CHAM_desc_t *A2, CHAM_desc_t *TS2, CHAM_desc_t *TT2, CHAM_desc_t *Q2 ) { - CHAM_context_t *chamctxt; + CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; - RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; - int status; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + CHAM_desc_t D1, *D1ptr = NULL; + CHAM_desc_t D2, *D2ptr = NULL; + int status; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -94,16 +96,36 @@ int CHAMELEON_zgepdf_qr_Tile( int doqr, int optid, } chameleon_sequence_create( chamctxt, &sequence ); +#if defined(CHAMELEON_COPY_DIAG) + { + int n = A1->n; + chameleon_zdesc_copy_and_restrict( A1, &D1, A1->m, n ); + D1ptr = &D1; + chameleon_zdesc_copy_and_restrict( A2, &D2, A2->m, n ); + D2ptr = &D2; + } +#endif + chameleon_pzgepdf_qr( 1, doqr, optid, qrtreeT, qrtreeB, - A1, TS1, 
TT1, D1ptr, Q1, + A2, TS2, TT2, D2ptr, Q2, sequence, &request ); CHAMELEON_Desc_Flush( Q1, sequence ); CHAMELEON_Desc_Flush( Q2, sequence ); + if ( D1ptr != NULL ) { + CHAMELEON_Desc_Flush( D1ptr, sequence ); + CHAMELEON_Desc_Flush( D2ptr, sequence ); + } chameleon_sequence_wait( chamctxt, sequence ); status = sequence->status; chameleon_sequence_destroy( chamctxt, sequence ); + + if ( D1ptr != NULL ) { + chameleon_desc_destroy( D1ptr ); + chameleon_desc_destroy( D2ptr ); + } + return status; } diff --git a/coreblas/compute/core_zlascal.c b/coreblas/compute/core_zlascal.c index 4355e893f0fbf106606661f0a87c27f4151c8634..801977297fcc6d9f49d8468a6e5a2e294e2ccd4e 100644 --- a/coreblas/compute/core_zlascal.c +++ b/coreblas/compute/core_zlascal.c @@ -27,9 +27,10 @@ * * @ingroup CORE_CHAMELEON_Complex64_t * - * CORE_zlascal scales a two-dimensional matrix A. As opposite to - * CORE_zlascl(), no checks is performed to prevent under/overflow. This should - * have been done at higher level. + * CORE_zlascal scales a two-dimensional matrix A. + * As opposed to CORE_zlascl(), no check is performed to prevent + * under/overflow, and scaling by a complex value is possible. Any such check + * should have been done at a higher level. * ******************************************************************************* * diff --git a/coreblas/compute/core_ztradd.c b/coreblas/compute/core_ztradd.c index 924ec0fca06c4649b92519cc667cc1a92e1cf50c..de868b032af9b440abf2616d3027c4e6825d988e 100644 --- a/coreblas/compute/core_ztradd.c +++ b/coreblas/compute/core_ztradd.c @@ -146,8 +146,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, M, N, 0., 0., B, LDB ); } else if ( beta != (CHAMELEON_Complex64_t)1. 
) { - LAPACKE_zlascl_work( LAPACK_COL_MAJOR, chameleon_lapack_const(uplo), - 0, 0, 1., beta, M, N, B, LDB ); + CORE_zlascal( uplo, M, N, beta, B, LDB ); } /** diff --git a/runtime/starpu/codelets/codelet_map.c b/runtime/starpu/codelets/codelet_map.c index 7461c3f5aed2697acffb09623fc35e9f34224350..0e9120cd3bbe8c0511000213cbe57b5899e5d955 100644 --- a/runtime/starpu/codelets/codelet_map.c +++ b/runtime/starpu/codelets/codelet_map.c @@ -228,6 +228,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, int exec = 0; int i, readonly = 1; size_t clargs_size = 0; + uint32_t where = 0; void (*callback)(void*); if ( ( ndata < 0 ) || ( ndata > 3 ) ) { @@ -275,6 +276,17 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, (data[i].desc)->get_blktile( data[i].desc, m, n ) ); } + /* Where to execute */ + if ( op_fcts->cpufunc ) { + where |= STARPU_CPU; + } + if ( op_fcts->cudafunc ) { + where |= STARPU_CUDA; + } + if ( op_fcts->hipfunc ) { + where |= STARPU_HIP; + } + /* Insert the task */ switch( ndata ) { case 1: @@ -291,6 +303,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_EXECUTE_WHERE, where, STARPU_NAME, cl_name, 0 ); break; @@ -310,6 +323,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_EXECUTE_WHERE, where, STARPU_NAME, cl_name, 0 ); break; @@ -330,6 +344,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_EXECUTE_WHERE, where, STARPU_NAME, cl_name, 0 ); break; diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index b03e2443c8a4fb53517b7d617020f373de76ff6d..438968e0af89c1c3378e00cb22e03235200e7fb8 100644 --- 
a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -144,6 +144,7 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, int accessC; int exec = 0; const char *cl_name = "zgemm_Astat"; + uint32_t where = cl_zgemm.where; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -188,6 +189,13 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, B->get_blktile( B, Bm, Bn ), C->get_blktile( C, Cm, Cn ) ); + /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */ +#if defined(PRECISION_z) || defined(PRECISION_c) + if ( k == 1 ) { + where = STARPU_CPU; + } +#endif + /* Insert the task */ rt_starpu_insert_task( &cl_zgemm, @@ -204,6 +212,7 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), STARPU_NAME, cl_name, + STARPU_EXECUTE_WHERE, where, 0 ); } @@ -214,7 +223,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { - if ( alpha == 0. ) { + if ( alpha == (CHAMELEON_Complex64_t)0. ) { INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn ); return; @@ -225,6 +234,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, int accessC; int exec = 0; const char *cl_name = "zgemm"; + uint32_t where = cl_zgemm.where; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -249,7 +259,8 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, callback = options->profiling ? cl_zgemm_callback : NULL; /* Reduce the C access if needed */ - accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : + (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? 
STARPU_COMMUTE : 0)); /* Refine name */ cl_name = chameleon_codelet_name( cl_name, 3, @@ -257,6 +268,13 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, B->get_blktile( B, Bm, Bn ), C->get_blktile( C, Cm, Cn ) ); + /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */ +#if defined(PRECISION_z) || defined(PRECISION_c) + if ( k == 1 ) { + where = STARPU_CPU; + } +#endif + /* Insert the task */ rt_starpu_insert_task( &cl_zgemm, @@ -274,5 +292,6 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, STARPU_EXECUTE_ON_WORKER, options->workerid, STARPU_POSSIBLY_PARALLEL, options->parallel, STARPU_NAME, cl_name, + STARPU_EXECUTE_WHERE, where, 0 ); } diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c index 977c8676e3acad779f2c1f9e27ec59c92026e3d1..da2c1ff59c2d9a4a6d8cff6cd15b5eef45966e18 100644 --- a/runtime/starpu/control/runtime_descriptor_ipiv.c +++ b/runtime/starpu/control/runtime_descriptor_ipiv.c @@ -96,15 +96,15 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m ) return (void*)(*handle); } - const CHAM_desc_t *A = ipiv->desc; - int owner = A->get_rankof( A, m, m ); int ncols = (mm == (ipiv->mt-1)) ? 
ipiv->m - mm * ipiv->mb : ipiv->mb; starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) ); #if defined(CHAMELEON_USE_MPI) { - int64_t tag = ipiv->mpitag_ipiv + mm; + const CHAM_desc_t *A = ipiv->desc; + int owner = A->get_rankof( A, m, m ); + int64_t tag = ipiv->mpitag_ipiv + mm; starpu_mpi_data_register( *handle, tag, owner ); } #endif /* defined(CHAMELEON_USE_MPI) */ @@ -173,15 +173,15 @@ void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int m ) return (void*)(*handle); } - const CHAM_desc_t *A = ipiv->desc; - int owner = A->get_rankof( A, m, m ); int ncols = ipiv->mb; starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) ); #if defined(CHAMELEON_USE_MPI) { - int64_t tag = ipiv->mpitag_perm + mm; + const CHAM_desc_t *A = ipiv->desc; + int owner = A->get_rankof( A, m, m ); + int64_t tag = ipiv->mpitag_perm + mm; starpu_mpi_data_register( *handle, tag, owner ); } #endif /* defined(CHAMELEON_USE_MPI) */ @@ -202,15 +202,15 @@ void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m ) return (void*)(*handle); } - const CHAM_desc_t *A = ipiv->desc; - int owner = A->get_rankof( A, m, m ); int ncols = ipiv->mb; starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) ); #if defined(CHAMELEON_USE_MPI) { - int64_t tag = ipiv->mpitag_invp + mm; + const CHAM_desc_t *A = ipiv->desc; + int owner = A->get_rankof( A, m, m ); + int64_t tag = ipiv->mpitag_invp + mm; starpu_mpi_data_register( *handle, tag, owner ); } #endif /* defined(CHAMELEON_USE_MPI) */ @@ -303,6 +303,7 @@ void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence, (void)sequence; (void)ipiv; (void)m; + (void)A; } void RUNTIME_ipiv_gather( const RUNTIME_sequence_t *sequence, @@ -312,14 +313,13 @@ void RUNTIME_ipiv_gather( const RUNTIME_sequence_t *sequence, int64_t mb = desc->mb; int64_t tag = chameleon_starpu_tag_book( (int64_t)(desc->mt) ); int rank = CHAMELEON_Comm_rank(); - int owner = rank; int m; for (m = 0; m < mt; m++, ipiv += mb) 
{ starpu_data_handle_t ipiv_src = RUNTIME_ipiv_getaddr( desc, m ); #if defined(CHAMELEON_USE_MPI) - owner = starpu_mpi_data_get_rank( ipiv_src ); + int owner = starpu_mpi_data_get_rank( ipiv_src ); if ( node != owner ) { starpu_mpi_tag_t tag = starpu_mpi_data_get_tag( ipiv_src ); diff --git a/testing/testing_zcheck_aux.c b/testing/testing_zcheck_aux.c index 9ea85852eb5ad53b05fc5772d3db14a67b018e8f..98ec027e8ea26ac3140d4269421e97f8d89c79f4 100644 --- a/testing/testing_zcheck_aux.c +++ b/testing/testing_zcheck_aux.c @@ -453,6 +453,15 @@ int check_zsum_std( run_arg_list_t *args, cham_uplo_t uplo, cham_trans_t trans, run_arg_add_double( args, "||B||", Binitnorm ); run_arg_add_double( args, "||R||", Rnorm ); + if ( alpha != 0. ) { + Anorm = Anorm * cabs(alpha); + } + if ( beta != 0. ) { + Binitnorm = Binitnorm * cabs(beta); + } + + result = Rnorm / (max(Anorm, Binitnorm) * eps); + /* Verifies if the result is inside a threshold */ if ( isnan(Rnorm) || isinf(Rnorm) || isnan(result) || isinf(result) || (result > 10.0) ) { info_solution = 1; diff --git a/testing/testing_zsysv.c b/testing/testing_zsysv.c index 9de378761a97377e5e2981f028efee6c2960f8c4..4ae24d4fec3a861b10716f5f241c0c7b66fc62bf 100644 --- a/testing/testing_zsysv.c +++ b/testing/testing_zsysv.c @@ -80,7 +80,7 @@ testing_zsysv_desc( run_arg_list_t *args, int check ) /* Check the factorization */ descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); - CHAMELEON_zplgsy_Tile( (double)N, uplo, descA0, seedA ); + CHAMELEON_zplgsy_Tile( (double)N, ChamUpperLower, descA0, seedA ); hres += check_zxxtrf( args, ChamSymmetric, uplo, descA0, descA ); diff --git a/testing/testing_zsytrf.c b/testing/testing_zsytrf.c index d48e91c9d0ca0996ab705424822adca8f37cfbc7..0bc75fa82f1e5e967bc6a94c4aaaa91ad5f76901 100644 --- a/testing/testing_zsytrf.c +++ b/testing/testing_zsytrf.c @@ -63,7 +63,7 @@ testing_zsytrf_desc( run_arg_list_t *args, int check ) /* Checks the factorisation and residue */ if ( check ) { CHAM_desc_t 
*descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE ); - CHAMELEON_zplgsy_Tile( (double)N, uplo, descA0, seedA ); + CHAMELEON_zplgsy_Tile( (double)N, ChamUpperLower, descA0, seedA ); hres += check_zxxtrf( args, ChamSymmetric, uplo, descA0, descA ); diff --git a/testing/testing_ztradd.c b/testing/testing_ztradd.c index e80d8f3ad4685d5eb0a9a64337c8cd9b8842659c..7ea8b8e7b614474c3b31ea6d479612a11d775873 100644 --- a/testing/testing_ztradd.c +++ b/testing/testing_ztradd.c @@ -241,7 +241,7 @@ testing_t test_ztradd; const char *ztradd_params[] = { "mtxfmt", "nb", "trans", "uplo", "m", "n", "lda", "ldb", "alpha", "beta", "seedA", "seedB", NULL }; const char *ztradd_output[] = { NULL }; -const char *ztradd_outchk[] = { "RETURN", NULL }; +const char *ztradd_outchk[] = { "||A||", "||B||", "||R||", "RETURN", NULL }; /** * @brief Testing registration function