diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e01e26e32b69683fa226bb64ddc2b5e840a9fa28..270076d745be68d22de64561940f0b22fa4ed86f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -14,6 +14,7 @@ include:
   - .gitlab/docker.yml
   - .gitlab/build.yml
   - .gitlab/test_starpu.yml
+  - .gitlab/test_starpu_plafrim.yml
   - .gitlab/test_starpu_simgrid.yml
   - .gitlab/test_parsec.yml
   - .gitlab/test_quark.yml
diff --git a/.gitlab/build.sh b/.gitlab/build.sh
index 0d17fd1444511508505ac74072203612484ff18f..e631b37f26033fb772a37cc2bba77796a841d549 100755
--- a/.gitlab/build.sh
+++ b/.gitlab/build.sh
@@ -59,7 +59,7 @@ case $SYSTEM in
 esac
 
 # Compile
-eval '${SCAN}cmake --build build-${VERSION} -j 4 > /dev/null'
+eval '${SCAN}cmake --build build-${VERSION} -j ${CMAKE_BUILD_PARALLEL_LEVEL} > /dev/null'
 
 # Install
 cmake --install build-${VERSION}
diff --git a/.gitlab/build.yml b/.gitlab/build.yml
index e5d6743322aec15b494fc88129fb6f2672b926b3..2d449df9d37a7f6c3c927944d96c1497f02e010f 100644
--- a/.gitlab/build.yml
+++ b/.gitlab/build.yml
@@ -2,6 +2,8 @@
 .build_script_common:
   stage: build
   extends: .only-master-mr
+  variables:
+    CMAKE_BUILD_PARALLEL_LEVEL: 4
   artifacts:
     name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
     expire_in: 1 week
@@ -10,6 +12,8 @@
 
 .build_script_linux:
   extends: .build_script_common
+  variables:
+    SYSTEM: linux
   script:
     - bash .gitlab/build.sh | tee ${LOGNAME}.log
 
@@ -56,29 +60,31 @@ build_starpu:
 .build_script_guix:
   tags: ['plafrim']
   extends: .build_script_common
+  variables:
+    SYSTEM: guix
+    CMAKE_BUILD_PARALLEL_LEVEL: 20
   script:
     - guix describe -f channels
-    - guix shell --pure --preserve=SYSTEM --preserve=VERSION --preserve=LOGNAME --preserve=BUILD_OPTIONS
-                 -D chameleon-${GUIX_CHAMELEON_VARIANT} coreutils bash-minimal --
+    - guix shell --pure --preserve=SYSTEM --preserve=VERSION --preserve=LOGNAME --preserve=BUILD_OPTIONS --preserve=CMAKE_BUILD_PARALLEL_LEVEL
+                 -D chameleon-${GPU_BACKEND} ${BLAS} coreutils bash-minimal --
                  bash .gitlab/build.sh | tee ${LOGNAME}.log
 
 build_starpu_cuda:
   extends: .build_script_guix
   variables:
-    SYSTEM: guix
     VERSION: starpu_cuda
-    GUIX_CHAMELEON_VARIANT: cuda
+    GPU_BACKEND: cuda
+    BLAS: "--with-input=openblas=intel-oneapi-mkl"
     LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}"
-    BUILD_OPTIONS: "-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=ON -DBLA_VENDOR=OpenBLAS"
+    BUILD_OPTIONS: "-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=OFF -DBLA_VENDOR=Intel10_64lp_seq"
 
 build_starpu_hip:
   extends: .build_script_guix
   variables:
-    SYSTEM: guix
     VERSION: starpu_hip
-    GUIX_CHAMELEON_VARIANT: hip
+    GPU_BACKEND: hip
     LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}"
-    BUILD_OPTIONS: "-DCHAMELEON_USE_HIP_ROC=ON -DCHAMELEON_USE_MPI=ON -DBLA_VENDOR=OpenBLAS -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran"
+    BUILD_OPTIONS: "-DCHAMELEON_USE_HIP_ROC=ON -DCHAMELEON_USE_MPI=OFF -DBLA_VENDOR=OpenBLAS -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran"
 
 build_starpu_simgrid:
   extends: .build_script_linux
@@ -97,11 +103,13 @@ build_starpu_macosx:
     SYSTEM: macosx
     VERSION: starpu
     LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}"
+    CMAKE_BUILD_PARALLEL_LEVEL: 4
   script:
     - bash .gitlab/build.sh | tee ${LOGNAME}.log
   artifacts:
     name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
     expire_in: 1 week
+    when: always
     paths:
       - ${LOGNAME}.log
   cache:
@@ -119,11 +127,13 @@ build_starpu_macosx:
     MSYSTEM: UCRT64
     VERSION: starpu
     LOGNAME: "chameleon-build-${SYSTEM}-${VERSION}"
+    CMAKE_BUILD_PARALLEL_LEVEL: 4
   script:
     - bash -lc .gitlab/build.sh | tee "$env:LOGNAME.log"
   artifacts:
     name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
     expire_in: 1 week
+    when: always
     paths:
       - ${LOGNAME}.log
   cache:
diff --git a/.gitlab/common.yml b/.gitlab/common.yml
index 9e679f1c4e1b4eabe97865e7d1ff3a9fbaa57244..f4e30eb4b89a16504b85edf3900a0740fb108427 100644
--- a/.gitlab/common.yml
+++ b/.gitlab/common.yml
@@ -37,5 +37,6 @@ variables:
     name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
     expire_in: 1 week
     untracked: true
+    when: always
     reports:
       junit: ${LOGNAME}-junit.xml
diff --git a/.gitlab/sbatch.sh b/.gitlab/sbatch.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d202d068699b939a985bbc120534955f41511ed6
--- /dev/null
+++ b/.gitlab/sbatch.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# sbatch.sh : submit test.sh as a slurm job and wait for completion before exiting
+# Usage: sbatch.sh [JOB_NAME]   (default: $JOB_NAME if set, else "chameleon")
+# Environment: SLURM_CONSTRAINTS is forwarded to sbatch --constraint
+set -x
+
+if [ $# -gt 0 ]; then JOB_NAME=$1; fi
+JOB_NAME=${JOB_NAME:-chameleon}
+
+# to get kernels execution on both cpus and gpus
+export STARPU_SCHED=random
+
+# submit test.sh located next to this script; the job log goes to $JOB_NAME.out
+sbatch --wait \
+       --job-name="$JOB_NAME" \
+       --output="$JOB_NAME.out" \
+       --nodes=1 \
+       --exclusive --ntasks-per-node=1 --threads-per-core=1 \
+       --constraint="$SLURM_CONSTRAINTS" \
+       --time=01:00:00 \
+       "$(dirname "$0")/test.sh"
+# save the exit code of sbatch --wait before another command overwrites $?
+err=$?
+
+# replay the job log in the CI output (quoted: the name may contain spaces)
+cat "$JOB_NAME.out"
+
+# exit with the error code from the sbatch command
+exit $err
diff --git a/.gitlab/test.sh b/.gitlab/test.sh
index 0a4a8639fa4a54d1121f149e57264ed3ad2338fa..27ba5484ba023eaec8bad5cf2018da8c4008b97f 100755
--- a/.gitlab/test.sh
+++ b/.gitlab/test.sh
@@ -1,5 +1,4 @@
 #!/usr/bin/env bash
-
 set -e
 set -x
 
diff --git a/.gitlab/test_starpu.yml b/.gitlab/test_starpu.yml
index efc3bd783aa0b64355500d6bb8a5fa4b233f5e59..1adc4997f75890d7fe557417a8b46855cecba0ce 100644
--- a/.gitlab/test_starpu.yml
+++ b/.gitlab/test_starpu.yml
@@ -1,5 +1,5 @@
 ---
-.test_starpu:
+.test_script_starpu:
   extends: .test_script_linux
   needs: [build_starpu]
   variables:
@@ -9,7 +9,7 @@
     TESTS_RESTRICTION: "-R _${CATEGORY}_${PRECISION}|example -E sytrf|sysv"
 
 test_starpu_master:
-  extends: .test_starpu
+  extends: .test_script_starpu
   rules:
     - if: ($CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH =~ /^ci-.*$/) && $CI_PIPELINE_SOURCE != "schedule"
   parallel:
@@ -18,7 +18,7 @@ test_starpu_master:
         CATEGORY: [shm, mpi]
 
 test_starpu_mr:
-  extends: .test_starpu
+  extends: .test_script_starpu
   rules:
     - if: ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME !~ /^notest-.*$/)
   parallel:
@@ -43,6 +43,7 @@ test_starpu_macosx:
   artifacts:
     name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
     expire_in: 1 week
+    when: always
     paths:
       - ${LOGNAME}.log
     reports:
@@ -69,6 +70,7 @@ test_starpu_macosx:
   artifacts:
     name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
     expire_in: 1 week
+    when: always
     paths:
       - ${LOGNAME}.log
     reports:
diff --git a/.gitlab/test_starpu_plafrim.yml b/.gitlab/test_starpu_plafrim.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c5178e6cac3910716b87ed85d3cb795413c0a0b9
--- /dev/null
+++ b/.gitlab/test_starpu_plafrim.yml
@@ -0,0 +1,46 @@
+---
+.test_script_starpu_plafrim:  # hidden template: run the tests through slurm on plafrim
+  stage: test
+  tags: [plafrim]
+  variables:
+    SYSTEM: guix
+  script:
+    - guix describe -f channels | tee guix-channels.scm
+    - guix shell --pure --preserve=SYSTEM --preserve=VERSION --preserve=LOGNAME --preserve=BUILD_OPTIONS --preserve=TESTS_RESTRICTION --preserve=SLURM_CONSTRAINTS --preserve=LD_PRELOAD
+                 -D chameleon-${GPU_BACKEND} ${BLAS} slurm coreutils inetutils bash-minimal --
+                 bash .gitlab/sbatch.sh ${LOGNAME} | tee ${LOGNAME}.log
+  artifacts:
+    name: '$CI_JOB_NAME-$CI_COMMIT_REF_SLUG'
+    when: always  # upload the logs of failed jobs too
+    expire_in: 1 week
+    paths:
+      - guix-channels.scm
+      - ${LOGNAME}.log
+    reports:
+      junit: ${LOGNAME}-junit.xml
+
+.test_script_starpu_plafrim_cuda:  # CUDA flavour of the plafrim test template
+  needs: [build_starpu_cuda]
+  extends: .test_script_starpu_plafrim
+  variables:
+    VERSION: starpu_cuda
+    GPU_BACKEND: cuda
+    BLAS: '--with-input=openblas=intel-oneapi-mkl'
+    LOGNAME: 'chameleon-${SYSTEM}-${VERSION}'
+    BUILD_OPTIONS: '-DCHAMELEON_USE_CUDA=ON -DCHAMELEON_USE_MPI=OFF -DBLA_VENDOR=Intel10_64lp_seq'
+    SLURM_CONSTRAINTS: 'sirocco'
+    LD_PRELOAD: '/usr/lib64/libcuda.so'
+
+test_starpu_cuda_master:  # default branch and ci-* branches, except scheduled pipelines
+  extends: .test_script_starpu_plafrim_cuda
+  variables:
+    TESTS_RESTRICTION: '-R test_shm_gpu'
+  rules:
+    - if: ($CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH || $CI_COMMIT_BRANCH =~ /^ci-.*$/) && $CI_PIPELINE_SOURCE != "schedule"
+
+test_starpu_cuda_mr:  # merge requests, unless the source branch is named notest-*
+  extends: .test_script_starpu_plafrim_cuda
+  variables:
+    TESTS_RESTRICTION: '-R test_shm_gpu_d|test_shm_gpu_c'
+  rules:
+    - if: ($CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_SOURCE_BRANCH_NAME !~ /^notest-.*$/)
diff --git a/compute/pztradd.c b/compute/pztradd.c
index 5c092c937be0d58683b8e2ce570e0c49e022d022..78d7a3d282e8a4d08bd57b91ffe2935fe337210e 100644
--- a/compute/pztradd.c
+++ b/compute/pztradd.c
@@ -29,16 +29,17 @@
 /**
  *  Parallel tile matrix-matrix multiplication - dynamic scheduling
  */
-void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans,
-                   CHAMELEON_Complex64_t alpha, CHAM_desc_t *A,
-                   CHAMELEON_Complex64_t beta,  CHAM_desc_t *B,
-                   RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
+void chameleon_pztradd( cham_uplo_t uplo, cham_trans_t trans,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *B,
+                        RUNTIME_sequence_t *sequence,
+                        RUNTIME_request_t  *request )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_option_t options;
 
     int tempmm, tempnn, tempmn, tempnm;
-    int m, n;
+    int m, n, minmn;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -46,10 +47,12 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans,
     }
     RUNTIME_options_init(&options, chamctxt, sequence, request);
 
+    minmn = chameleon_min( B->mt, B->nt );
+
     switch(uplo){
     case ChamLower:
         if (trans == ChamNoTrans) {
-            for (n = 0; n < chameleon_min(B->mt,B->nt); n++) {
+            for (n = 0; n < minmn; n++) {
                 tempnm = n == B->mt-1 ? B->m-n*B->mb : B->mb;
                 tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
 
@@ -72,8 +75,8 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans,
         }
         else {
             for (n = 0; n < chameleon_min(B->mt,B->nt); n++) {
-                tempnm = n == B->mt-1 ? B->m-n*B->mb : B->mb;
-                tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
+                tempnm = n == B->mt-1 ? B->m - n * B->mb : B->mb;
+                tempnn = n == B->nt-1 ? B->n - n * B->nb : B->nb;
 
                 INSERT_TASK_ztradd(
                     &options,
@@ -95,9 +98,9 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans,
         break;
     case ChamUpper:
         if (trans == ChamNoTrans) {
-            for (m = 0; m < chameleon_min(B->mt,B->nt); m++) {
-                tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb;
-                tempmn = m == B->nt-1 ? B->n-m*B->nb : B->nb;
+            for (m = 0; m < minmn; m++) {
+                tempmm = m == B->mt-1 ? B->m - m * B->mb : B->nb;
+                tempmn = m == B->nt-1 ? B->n - m * B->nb : B->nb;
 
                 INSERT_TASK_ztradd(
                     &options,
@@ -106,7 +109,7 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans,
                     beta,  B(m, m));
 
                 for (n = m+1; n < B->nt; n++) {
-                    tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
+                    tempnn = n == B->nt-1 ? B->n - n * B->nb : B->nb;
 
                     INSERT_TASK_zgeadd(
                         &options,
diff --git a/compute/zgepdf_qr.c b/compute/zgepdf_qr.c
index 35622025f4e1e051222f4f4515a08d0c952c9f19..d138b051773768788421ceb691c7c59d3e433617 100644
--- a/compute/zgepdf_qr.c
+++ b/compute/zgepdf_qr.c
@@ -82,10 +82,12 @@ int CHAMELEON_zgepdf_qr_Tile( int doqr, int optid,
                               CHAM_desc_t *A1, CHAM_desc_t *TS1, CHAM_desc_t *TT1, CHAM_desc_t *Q1,
                               CHAM_desc_t *A2, CHAM_desc_t *TS2, CHAM_desc_t *TT2, CHAM_desc_t *Q2 )
 {
-    CHAM_context_t *chamctxt;
+    CHAM_context_t     *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
-    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
-    int status;
+    RUNTIME_request_t   request = RUNTIME_REQUEST_INITIALIZER;
+    CHAM_desc_t         D1, *D1ptr = NULL;
+    CHAM_desc_t         D2, *D2ptr = NULL;
+    int                 status;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -94,16 +96,36 @@ int CHAMELEON_zgepdf_qr_Tile( int doqr, int optid,
     }
     chameleon_sequence_create( chamctxt, &sequence );
 
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int n = A1->n;
+        chameleon_zdesc_copy_and_restrict( A1, &D1, A1->m, n );
+        D1ptr = &D1;
+        chameleon_zdesc_copy_and_restrict( A2, &D2, A2->m, n );
+        D2ptr = &D2;
+    }
+#endif
+
     chameleon_pzgepdf_qr( 1, doqr, optid, qrtreeT, qrtreeB,
-                          A1, TS1, TT1, NULL, Q1,
-                          A2, TS2, TT2, NULL, Q2,
+                          A1, TS1, TT1, D1ptr, Q1,
+                          A2, TS2, TT2, D2ptr, Q2,
                           sequence, &request );
 
     CHAMELEON_Desc_Flush( Q1, sequence );
     CHAMELEON_Desc_Flush( Q2, sequence );
 
+    if ( D1ptr != NULL ) {
+        CHAMELEON_Desc_Flush( D1ptr, sequence );
+        CHAMELEON_Desc_Flush( D2ptr, sequence );
+    }
     chameleon_sequence_wait( chamctxt, sequence );
     status = sequence->status;
     chameleon_sequence_destroy( chamctxt, sequence );
+
+    if ( D1ptr != NULL ) {
+        chameleon_desc_destroy( D1ptr );
+        chameleon_desc_destroy( D2ptr );
+    }
+
     return status;
 }
diff --git a/coreblas/compute/core_zlascal.c b/coreblas/compute/core_zlascal.c
index 4355e893f0fbf106606661f0a87c27f4151c8634..801977297fcc6d9f49d8468a6e5a2e294e2ccd4e 100644
--- a/coreblas/compute/core_zlascal.c
+++ b/coreblas/compute/core_zlascal.c
@@ -27,9 +27,10 @@
  *
  * @ingroup CORE_CHAMELEON_Complex64_t
  *
- *  CORE_zlascal scales a two-dimensional matrix A. As opposite to
- *  CORE_zlascl(), no checks is performed to prevent under/overflow. This should
- *  have been done at higher level.
+ *  CORE_zlascal scales a two-dimensional matrix A.
+ *  As opposed to CORE_zlascl(), no check is performed to prevent
+ *  under/overflow (this must be handled at a higher level), and scaling by a
+ *  complex value is possible.
  *
  *******************************************************************************
  *
diff --git a/coreblas/compute/core_ztradd.c b/coreblas/compute/core_ztradd.c
index 924ec0fca06c4649b92519cc667cc1a92e1cf50c..de868b032af9b440abf2616d3027c4e6825d988e 100644
--- a/coreblas/compute/core_ztradd.c
+++ b/coreblas/compute/core_ztradd.c
@@ -146,8 +146,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
                              M, N, 0., 0., B, LDB );
     }
     else if ( beta != (CHAMELEON_Complex64_t)1. ) {
-        LAPACKE_zlascl_work( LAPACK_COL_MAJOR, chameleon_lapack_const(uplo),
-                             0, 0, 1., beta, M, N, B, LDB );
+        CORE_zlascal( uplo, M, N, beta, B, LDB );
     }
 
     /**
diff --git a/runtime/starpu/codelets/codelet_map.c b/runtime/starpu/codelets/codelet_map.c
index 7461c3f5aed2697acffb09623fc35e9f34224350..0e9120cd3bbe8c0511000213cbe57b5899e5d955 100644
--- a/runtime/starpu/codelets/codelet_map.c
+++ b/runtime/starpu/codelets/codelet_map.c
@@ -228,6 +228,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options,
     int                   exec    = 0;
     int                   i, readonly = 1;
     size_t                clargs_size = 0;
+    uint32_t              where       = 0;
     void (*callback)(void*);
 
     if ( ( ndata < 0 ) || ( ndata > 3 ) ) {
@@ -275,6 +276,17 @@ void INSERT_TASK_map( const RUNTIME_option_t *options,
                                           (data[i].desc)->get_blktile( data[i].desc, m, n ) );
     }
 
+    /* Where to execute */
+    if ( op_fcts->cpufunc ) {
+        where |= STARPU_CPU;
+    }
+    if ( op_fcts->cudafunc ) {
+        where |= STARPU_CUDA;
+    }
+    if ( op_fcts->hipfunc ) {
+        where |= STARPU_HIP;
+    }
+
     /* Insert the task */
     switch( ndata ) {
     case 1:
@@ -291,6 +303,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options,
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
+            STARPU_EXECUTE_WHERE,     where,
             STARPU_NAME,              cl_name,
             0 );
         break;
@@ -310,6 +323,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options,
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
+            STARPU_EXECUTE_WHERE,     where,
             STARPU_NAME,              cl_name,
             0 );
         break;
@@ -330,6 +344,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options,
             STARPU_PRIORITY,          options->priority,
             STARPU_CALLBACK,          callback,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
+            STARPU_EXECUTE_WHERE,     where,
             STARPU_NAME,              cl_name,
             0 );
         break;
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index b03e2443c8a4fb53517b7d617020f373de76ff6d..438968e0af89c1c3378e00cb22e03235200e7fb8 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -144,6 +144,7 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
     int                      accessC;
     int                      exec    = 0;
     const char              *cl_name = "zgemm_Astat";
+    uint32_t                 where   = cl_zgemm.where;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -188,6 +189,13 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                                       B->get_blktile( B, Bm, Bn ),
                                       C->get_blktile( C, Cm, Cn ) );
 
+    /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    if ( k == 1 ) {
+        where = STARPU_CPU;
+    }
+#endif
+
     /* Insert the task */
     rt_starpu_insert_task(
         &cl_zgemm,
@@ -204,6 +212,7 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_NODE,   A->get_rankof(A, Am, An),
         STARPU_NAME,              cl_name,
+        STARPU_EXECUTE_WHERE,     where,
         0 );
 }
 
@@ -214,7 +223,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                                                      const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
-    if ( alpha == 0. ) {
+    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
         INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                              beta, C, Cm, Cn );
         return;
@@ -225,6 +234,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
     int                      accessC;
     int                      exec = 0;
     const char              *cl_name = "zgemm";
+    uint32_t                 where   = cl_zgemm.where;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -249,7 +259,8 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
     callback = options->profiling ? cl_zgemm_callback : NULL;
 
     /* Reduce the C access if needed */
-    accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
+    accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W :
+        (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0));
 
     /* Refine name */
     cl_name = chameleon_codelet_name( cl_name, 3,
@@ -257,6 +268,13 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                                       B->get_blktile( B, Bm, Bn ),
                                       C->get_blktile( C, Cm, Cn ) );
 
+    /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    if ( k == 1 ) {
+        where = STARPU_CPU;
+    }
+#endif
+
     /* Insert the task */
     rt_starpu_insert_task(
         &cl_zgemm,
@@ -274,5 +292,6 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
         STARPU_POSSIBLY_PARALLEL, options->parallel,
         STARPU_NAME,              cl_name,
+        STARPU_EXECUTE_WHERE,     where,
         0 );
 }
diff --git a/runtime/starpu/control/runtime_descriptor_ipiv.c b/runtime/starpu/control/runtime_descriptor_ipiv.c
index 977c8676e3acad779f2c1f9e27ec59c92026e3d1..da2c1ff59c2d9a4a6d8cff6cd15b5eef45966e18 100644
--- a/runtime/starpu/control/runtime_descriptor_ipiv.c
+++ b/runtime/starpu/control/runtime_descriptor_ipiv.c
@@ -96,15 +96,15 @@ void *RUNTIME_ipiv_getaddr( const CHAM_ipiv_t *ipiv, int m )
         return (void*)(*handle);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int owner = A->get_rankof( A, m, m );
     int ncols = (mm == (ipiv->mt-1)) ? ipiv->m - mm * ipiv->mb : ipiv->mb;
 
     starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) );
 
 #if defined(CHAMELEON_USE_MPI)
     {
-        int64_t tag = ipiv->mpitag_ipiv + mm;
+        const CHAM_desc_t *A     = ipiv->desc;
+        int                owner = A->get_rankof( A, m, m );
+        int64_t            tag   = ipiv->mpitag_ipiv + mm;
         starpu_mpi_data_register( *handle, tag, owner );
     }
 #endif /* defined(CHAMELEON_USE_MPI) */
@@ -173,15 +173,15 @@ void *RUNTIME_perm_getaddr( const CHAM_ipiv_t *ipiv, int m )
         return (void*)(*handle);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int owner = A->get_rankof( A, m, m );
     int ncols = ipiv->mb;
 
     starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) );
 
 #if defined(CHAMELEON_USE_MPI)
     {
-        int64_t tag = ipiv->mpitag_perm + mm;
+        const CHAM_desc_t *A     = ipiv->desc;
+        int                owner = A->get_rankof( A, m, m );
+        int64_t            tag   = ipiv->mpitag_perm + mm;
         starpu_mpi_data_register( *handle, tag, owner );
     }
 #endif /* defined(CHAMELEON_USE_MPI) */
@@ -202,15 +202,15 @@ void *RUNTIME_invp_getaddr( const CHAM_ipiv_t *ipiv, int m )
         return (void*)(*handle);
     }
 
-    const CHAM_desc_t *A = ipiv->desc;
-    int owner = A->get_rankof( A, m, m );
     int ncols = ipiv->mb;
 
     starpu_vector_data_register( handle, -1, (uintptr_t)NULL, ncols, sizeof(int) );
 
 #if defined(CHAMELEON_USE_MPI)
     {
-        int64_t tag = ipiv->mpitag_invp + mm;
+        const CHAM_desc_t *A     = ipiv->desc;
+        int                owner = A->get_rankof( A, m, m );
+        int64_t            tag   = ipiv->mpitag_invp + mm;
         starpu_mpi_data_register( *handle, tag, owner );
     }
 #endif /* defined(CHAMELEON_USE_MPI) */
@@ -303,6 +303,7 @@ void RUNTIME_perm_flushk( const RUNTIME_sequence_t *sequence,
     (void)sequence;
     (void)ipiv;
     (void)m;
+    (void)A;
 }
 
 void RUNTIME_ipiv_gather( const RUNTIME_sequence_t *sequence,
@@ -312,14 +313,13 @@ void RUNTIME_ipiv_gather( const RUNTIME_sequence_t *sequence,
     int64_t mb   = desc->mb;
     int64_t tag  = chameleon_starpu_tag_book( (int64_t)(desc->mt) );
     int     rank = CHAMELEON_Comm_rank();
-    int     owner = rank;
     int     m;
 
     for (m = 0; m < mt; m++, ipiv += mb) {
         starpu_data_handle_t ipiv_src = RUNTIME_ipiv_getaddr( desc, m );
 
 #if defined(CHAMELEON_USE_MPI)
-        owner = starpu_mpi_data_get_rank( ipiv_src );
+        int owner = starpu_mpi_data_get_rank( ipiv_src );
         if ( node != owner ) {
             starpu_mpi_tag_t tag = starpu_mpi_data_get_tag( ipiv_src );
 
diff --git a/testing/testing_zcheck_aux.c b/testing/testing_zcheck_aux.c
index 9ea85852eb5ad53b05fc5772d3db14a67b018e8f..98ec027e8ea26ac3140d4269421e97f8d89c79f4 100644
--- a/testing/testing_zcheck_aux.c
+++ b/testing/testing_zcheck_aux.c
@@ -453,6 +453,15 @@ int check_zsum_std( run_arg_list_t *args, cham_uplo_t uplo, cham_trans_t trans,
     run_arg_add_double( args, "||B||", Binitnorm );
     run_arg_add_double( args, "||R||", Rnorm );
 
+    if ( alpha != 0. ) {
+        Anorm = Anorm * cabs(alpha);
+    }
+    if ( beta != 0. ) {
+        Binitnorm = Binitnorm * cabs(beta);
+    }
+
+    result = Rnorm / (max(Anorm, Binitnorm) * eps);
+
     /* Verifies if the result is inside a threshold */
     if ( isnan(Rnorm) || isinf(Rnorm) || isnan(result) || isinf(result) || (result > 10.0) ) {
         info_solution = 1;
diff --git a/testing/testing_zsysv.c b/testing/testing_zsysv.c
index 9de378761a97377e5e2981f028efee6c2960f8c4..4ae24d4fec3a861b10716f5f241c0c7b66fc62bf 100644
--- a/testing/testing_zsysv.c
+++ b/testing/testing_zsysv.c
@@ -80,7 +80,7 @@ testing_zsysv_desc( run_arg_list_t *args, int check )
 
         /* Check the factorization */
         descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE );
-        CHAMELEON_zplgsy_Tile( (double)N, uplo, descA0, seedA );
+        CHAMELEON_zplgsy_Tile( (double)N, ChamUpperLower, descA0, seedA );
 
         hres += check_zxxtrf( args, ChamSymmetric, uplo, descA0, descA );
 
diff --git a/testing/testing_zsytrf.c b/testing/testing_zsytrf.c
index d48e91c9d0ca0996ab705424822adca8f37cfbc7..0bc75fa82f1e5e967bc6a94c4aaaa91ad5f76901 100644
--- a/testing/testing_zsytrf.c
+++ b/testing/testing_zsytrf.c
@@ -63,7 +63,7 @@ testing_zsytrf_desc( run_arg_list_t *args, int check )
     /* Checks the factorisation and residue */
     if ( check ) {
         CHAM_desc_t *descA0 = CHAMELEON_Desc_Copy( descA, CHAMELEON_MAT_ALLOC_TILE );
-        CHAMELEON_zplgsy_Tile( (double)N, uplo, descA0, seedA );
+        CHAMELEON_zplgsy_Tile( (double)N, ChamUpperLower, descA0, seedA );
 
         hres += check_zxxtrf( args, ChamSymmetric, uplo, descA0, descA );
 
diff --git a/testing/testing_ztradd.c b/testing/testing_ztradd.c
index e80d8f3ad4685d5eb0a9a64337c8cd9b8842659c..7ea8b8e7b614474c3b31ea6d479612a11d775873 100644
--- a/testing/testing_ztradd.c
+++ b/testing/testing_ztradd.c
@@ -241,7 +241,7 @@ testing_t   test_ztradd;
 const char *ztradd_params[] = { "mtxfmt", "nb",    "trans", "uplo",  "m",     "n", "lda",
                                 "ldb",    "alpha", "beta",  "seedA", "seedB", NULL };
 const char *ztradd_output[] = { NULL };
-const char *ztradd_outchk[] = { "RETURN", NULL };
+const char *ztradd_outchk[] = {  "||A||", "||B||", "||R||", "RETURN", NULL };
 
 /**
  * @brief Testing registration function