diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3892d9b94325f20b38d2f8a1b13099786e466d8c..8f89d65c47f19efebacccf92390318f33c9af3bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,8 @@
 #  @author Alycia Lisito
 #  @author Loris Lucido
 #  @author Nathan Précigout
-#  @date 2024-03-11
+#  @author Abel Calluaud
+#  @date 2024-07-17
 #
 ###
 cmake_minimum_required(VERSION 3.5)
@@ -260,6 +261,8 @@ if (CHAMELEON_ENABLE_TESTING)
     message("-- ${BoldGreen}CHAMELEON_ENABLE_TESTING is set to ON, turn it OFF to avoid building testing${ColourReset}")
 endif()
 
+option(CHAMELEON_DEBUG_GERED  "Enable GERED debug"  OFF)
+
 # Option to activate or not simulation mode (use Simgrid through StarPU)
 # ----------------------------------------------------------------------
 cmake_dependent_option(CHAMELEON_SIMULATION
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 6dfa83891f9f2ded3dfb3252e5365d375508f3c6..cc25fb7d233c7206ced2d9e5fc59766e1bef25fe 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -27,7 +27,8 @@
 #  @author Alycia Lisito
 #  @author Loris Lucido
 #  @author Matthieu Kuhn
-#  @date 2024-04-03
+#  @author Ana Hourcau
+#  @date 2024-07-17
 #
 ###
 
@@ -193,10 +194,12 @@ set(ZSRC
     ##################
     # MIXED PRECISION
     ##################
+    pzhered.c
     pzlag2c.c
     pzgered.c
     pzgerst.c
     ###
+    zhered.c
     zgered.c
     zgerst.c
     #zcgels.c
diff --git a/compute/pzgered.c b/compute/pzgered.c
index c1624db1ba06c0f43a7d84ea485a089d33965ade..1051ee91f45370bba6cefbdb21d3b17a6862c99f 100644
--- a/compute/pzgered.c
+++ b/compute/pzgered.c
@@ -13,7 +13,8 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> z d
  *
  */
@@ -28,8 +29,8 @@
 
 static inline void
 chameleon_pzgered_frb( cham_uplo_t uplo,
-                        CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt,
-                        RUNTIME_option_t *options )
+                       CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt,
+                       RUNTIME_option_t *options )
 {
     double alpha = 1.0;
     double beta  = 0.0;
@@ -233,21 +234,17 @@ void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A,
 
         for(n = nmin; n < nmax; n++) {
             CHAM_tile_t *tile = A->get_blktile( A, m, n );
-            if ( tile->rank == A->myrank ) {
-                int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb;
-
-                /* Get the frobenius norm of the tile A( m, n ) */
-                lnorm = ((double*)((Wcol.get_blktile( &Wcol, m, n ))->mat))[0];
-
-                /*
-                 * u_{high} = 1e-16 (later should be application accuraccy)
-                 * u_{low} = 1e-8
-                 * ||A_{i,j}||_F  < u_{high} * || A ||_F / (nt * u_{low})
-                 * ||A_{i,j}||_F  < threshold / u_{low}
-                 */
-                INSERT_TASK_zgered( &options, threshold, lnorm,
-                                     tempmm, tempnn, A( m, n ) );
-            }
+
+            int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb;
+
+            /*
+                * u_{high} = 1e-16 (later should be application accuracy)
+                * u_{low} = 1e-8
+                * ||A_{i,j}||_F  < u_{high} * || A ||_F / (nt * u_{low})
+                * ||A_{i,j}||_F  < threshold / u_{low}
+                */
+            INSERT_TASK_zgered( &options, threshold,
+                                tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) );
         }
     }
 
diff --git a/compute/pzhered.c b/compute/pzhered.c
new file mode 100644
index 0000000000000000000000000000000000000000..97e171b3f11dd6a40ccb44e7d2f85ee41324b62f
--- /dev/null
+++ b/compute/pzhered.c
@@ -0,0 +1,288 @@
+/**
+ *
+ * @file pzhered.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zhered parallel algorithm
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Ana Hourcau
+ * @date 2024-07-17
+ * @precisions normal z -> z d
+ *
+ */
+// ALLOC_WS :  A->mb
+// ALLOC_WS :  A->nb
+// WS_ADD :  A->mb + A->nb
+#include "control/common.h"
+#include <coreblas/lapacke.h>
+
+#define A(m, n) A, (m), (n)
+#define W(desc, m, n) (desc), (m), (n)
+
+/**
+ * Submits the tasks computing the Frobenius norm of each stored tile of A and
+ * of the full matrix, A being Hermitian when trans is ChamConjTrans (zhessq on
+ * the diagonal tiles) and symmetric otherwise (zsyssq).
+ *
+ * Wnorm holds one accumulator per tile of A. An off-diagonal tile A(m, n) is
+ * accumulated in both Wnorm(m, n) and Wnorm(n, m), so that row m of Wnorm
+ * covers the whole row m of the full matrix. Welt is the reduction workspace:
+ * once the tasks have run, Welt(p, q) holds the norm of A for each (p, q) of
+ * the A->p x A->q grid.
+ */
+static inline void
+chameleon_pzhered_frb( cham_trans_t trans, cham_uplo_t uplo,
+                       CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt,
+                       RUNTIME_option_t *options )
+{
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    int m, n;
+    int MT = A->mt;
+    int NT = A->nt;
+    int M  = A->m;
+    int N  = A->n;
+    int P  = Welt->p;
+    int Q  = Welt->q;
+
+    /* Initialize workspaces for tile norms */
+    for (m = 0; m < Wnorm->mt; m++) {
+        for (n = 0; n < NT; n++) {
+            INSERT_TASK_dlaset(
+                options,
+                ChamUpperLower, Wnorm->mb, Wnorm->nb,
+                alpha, beta,
+                W(Wnorm, m, n));
+        }
+    }
+
+    /* Initialize the reduction workspace */
+    for (m = 0; m < Welt->mt; m++) {
+        for (n = 0; n < Welt->nt; n++) {
+            INSERT_TASK_dlaset(
+                options,
+                ChamUpperLower, Welt->mb, Welt->nb,
+                alpha, beta,
+                W(Welt, m, n));
+        }
+    }
+
+    /**
+     * Step 1:
+     *  For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) )
+     */
+    for (m = 0; m < MT; m++) {
+        int nmin = (uplo == ChamUpper) ? m : 0;
+        int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT;
+
+        int tempmm = (m == (MT - 1)) ? M - m * A->mb : A->mb;
+
+        for (n = nmin; n < nmax; n++) {
+            int tempnn = (n == (NT - 1)) ? N - n * A->nb : A->nb;
+
+            if (n == m) {
+                if ( trans == ChamConjTrans ) {
+                    INSERT_TASK_zhessq(
+                        options, ChamEltwise, uplo, tempmm,
+                        A(m, n), W( Wnorm, m, n) );
+                }
+                else {
+                    INSERT_TASK_zsyssq(
+                        options, ChamEltwise, uplo, tempmm,
+                        A(m, n), W( Wnorm, m, n) );
+                }
+            }
+            else {
+                INSERT_TASK_zgessq(
+                    options, ChamEltwise, tempmm, tempnn,
+                    A(m, n), W( Wnorm, m, n ));
+                INSERT_TASK_zgessq(
+                    options, ChamEltwise, tempmm, tempnn,
+                    A(m, n), W( Wnorm, n, m ));
+            }
+        }
+    }
+
+    for(m = 0; m < MT; m++) {
+        /* Wnorm and Welt are distinct descriptors: start from column 0, or the
+         * tiles of the first Q columns never reach the global norm */
+        for(n = 0; n < NT; n++) {
+            INSERT_TASK_dplssq(
+                options, ChamEltwise, 1, 1, W( Wnorm, m, n), W( Welt, m, n%Q) );
+        }
+
+        /**
+         * Step 2:
+         *  For each j, W(m, j) = reduce( W( Welt, m, 0..Q-1) )
+         */
+        for(n = 1; n < Q; n++) {
+            INSERT_TASK_dplssq(
+                options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
+        }
+    }
+
+    /**
+     * Step 3:
+     *  For m in 0..P-1, Welt(m, 0) = reduce( Welt(m + k*P, 0) )
+     */
+    for(m = P; m < MT; m++) {
+        INSERT_TASK_dplssq(
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
+    }
+
+    /**
+     * Step 4:
+     *  Welt(0, 0) = reduce( Welt(0..P-1, 0) )
+     */
+    for(m = 1; m < P; m++) {
+        INSERT_TASK_dplssq(
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
+    }
+
+    /* Compute the norm of each tile, and the full norm */
+    for (m = 0; m < MT; m++) {
+        int nmin = (uplo == ChamUpper) ? m : 0;
+        int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT;
+
+        for (n = nmin; n < nmax; n++) {
+            /* Compute the final norm of the tile */
+            INSERT_TASK_dplssq2(
+                options, 1, W( Wnorm, m, n ) );
+        }
+    }
+    INSERT_TASK_dplssq2(
+        options, 1, W( Welt, 0, 0) );
+
+    /**
+     * Broadcast the result
+     */
+    for (m = 0; m < A->p; m++) {
+        for (n = 0; n < A->q; n++) {
+            if ((m != 0) || (n != 0)) {
+                INSERT_TASK_dlacpy(
+                    options,
+                    ChamUpperLower, 1, 1,
+                    W(Welt, 0, 0), W(Welt, m, n));
+            }
+        }
+    }
+}
+
+/**
+ * Computes ||A||_F, derives the conversion threshold from it and submits a zgered task on each stored tile of A.
+ */
+void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A,
+                        RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+    CHAM_desc_t Wcol;
+    CHAM_desc_t Welt;
+    double gnorm, lnorm, threshold, eps;
+
+    int workmt, worknt;
+    int m, n;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS)
+    {
+        return;
+    }
+    RUNTIME_options_init(&options, chamctxt, sequence, request);
+
+    workmt = chameleon_max(A->mt, A->p);
+    worknt = chameleon_max(A->nt, A->q);
+
+    RUNTIME_options_ws_alloc(&options, 1, 0);
+
+    /* Workspace storing the norm of each tile of A */
+    chameleon_desc_init(&Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
+                        A->mt * 2, A->nt, 0, 0, A->mt * 2, A->nt, A->p, A->q,
+                        NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg);
+
+    /* Workspace used to reduce the tile norms into the global frobenius norm */
+    chameleon_desc_init(&Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
+                        workmt * 2, worknt, 0, 0, workmt * 2, worknt, A->p, A->q,
+                        NULL, NULL, NULL, NULL);
+
+    chameleon_pzhered_frb( trans, uplo, A, &Wcol, &Welt, &options );
+
+    CHAMELEON_Desc_Flush(&Wcol, sequence);
+    CHAMELEON_Desc_Flush(&Welt, sequence);
+    CHAMELEON_Desc_Flush(A, sequence);
+
+    RUNTIME_sequence_wait(chamctxt, sequence);
+
+    gnorm = *((double *)Welt.get_blkaddr(&Welt, A->myrank / A->q, A->myrank % A->q));
+    chameleon_desc_destroy(&Welt);
+
+    /**
+     * Select the accuracy: machine epsilon when prec is negative, prec otherwise
+     */
+    if (prec < 0.)
+    {
+#if !defined(CHAMELEON_SIMULATION)
+        eps = LAPACKE_dlamch_work('e');
+#else
+#if defined(PRECISION_z) || defined(PRECISION_d)
+        eps = 1.e-15;
+#else
+        eps = 1.e-7;
+#endif
+#endif
+    }
+    else
+    {
+        eps = prec;
+    }
+    threshold = (eps * gnorm) / (double)(chameleon_min(A->mt, A->nt));
+
+#if defined(CHAMELEON_DEBUG_GERED)
+    fprintf(stderr,
+            "[%2d] The norm of A is:           %e\n"
+            "[%2d] The requested precision is: %e\n"
+            "[%2d] The computed threshold is:  %e\n",
+            A->myrank, gnorm,
+            A->myrank, eps,
+            A->myrank, threshold);
+#endif
+    for (m = 0; m < A->mt; m++)
+    {
+        int tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
+        int nmin = (uplo == ChamUpper) ? m : 0;
+        int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, A->nt) : A->nt;
+
+        for (n = nmin; n < nmax; n++)
+        {
+            CHAM_tile_t *tile = A->get_blktile(A, m, n);
+
+            int tempnn = (n == (A->nt - 1)) ? A->n - n * A->nb : A->nb;
+
+            /*
+             * u_{high} = 1e-16 (later should be application accuracy)
+             * u_{low} = 1e-8
+             * ||A_{i,j}||_F  < u_{high} * || A ||_F / (nt * u_{low})
+             * ||A_{i,j}||_F  < threshold / u_{low}
+             * NOTE(review): `tile` and `lnorm` look unused in this function - confirm and drop
+             */
+            INSERT_TASK_zgered( &options, threshold,
+                                tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) );
+        }
+    }
+
+    CHAMELEON_Desc_Flush(A, sequence);
+    RUNTIME_sequence_wait(chamctxt, sequence);
+
+    chameleon_desc_destroy(&Wcol);
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, chamctxt);
+}
diff --git a/compute/zgered.c b/compute/zgered.c
index f3783ad3446449be36eb5e0b9f0015ef2a249c60..a58e8f145fa459861008e8600852e9c06878c40f 100644
--- a/compute/zgered.c
+++ b/compute/zgered.c
@@ -13,7 +13,7 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @date 2024-07-17
  * @precisions normal z -> z d
  *
  */
@@ -166,6 +166,15 @@ int CHAMELEON_zgered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t
         return CHAMELEON_SUCCESS;
     }
 
+    if ( precision < 0. ) {
+        char *algostr = chameleon_getenv( "CHAMELEON_GERED_ACC" );
+        if ( algostr == NULL ) {
+            precision = 1e-12;
+        }
+        else {
+            precision = strtod( algostr, NULL );
+        }
+    }
     chameleon_pzgered( uplo, precision, A, sequence, request );
 
     return CHAMELEON_SUCCESS;
diff --git a/compute/zhered.c b/compute/zhered.c
new file mode 100644
index 0000000000000000000000000000000000000000..32e5c81570689af3e95b69bf65d5129195d76b1d
--- /dev/null
+++ b/compute/zhered.c
@@ -0,0 +1,182 @@
+/**
+ *
+ * @file zhered.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zhered wrappers
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Ana Hourcau
+ * @date 2024-07-17
+ * @precisions normal z -> z d
+ *
+ */
+#include "control/common.h"
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile
+ *
+ * @brief Reduces the precision of the tiles of a symmetric or Hermitian matrix
+ * when their norm allows it (mixed precision storage).
+ *
+ * This is the synchronous version of CHAMELEON_zhered_Tile_Async().  It
+ * operates on matrices stored by tiles with tiles of potentially different
+ * precisions.  All matrices are passed through descriptors.  All dimensions are
+ * taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in] uplo
+ *          = ChamUpper: Upper triangle of A is stored;
+ *          = ChamLower: Lower triangle of A is stored.
+ *
+ * @param[in] precision
+ *          The accuracy used to compute the conversion threshold of the tiles:
+ *          threshold = precision * ||A||_F / min( A->mt, A->nt ).
+ *          If negative, the value of the CHAMELEON_GERED_ACC environment
+ *          variable is used when it is set, and 1e-12 otherwise.
+ *
+ * @param[in,out] A
+ *          On entry, the symmetric or Hermitian matrix A. Only the tiles of the
+ *          part of A selected by uplo are referenced.
+ *          On exit, the referenced tiles may have been converted to a lower
+ *          precision.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval CHAMELEON_ERR_NOT_INITIALIZED if CHAMELEON has not been initialized
+ * @retval the status of the internal sequence otherwise
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhered_Tile_Async
+ * @sa CHAMELEON_zgered_Tile
+ * @sa CHAMELEON_zgered_Tile_Async
+ * @sa CHAMELEON_zgerst_Tile
+ * @sa CHAMELEON_zgerst_Tile_Async
+ *
+ */
+int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double precision, CHAM_desc_t *A )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    int status;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    CHAMELEON_zhered_Tile_Async( uplo, precision, A, sequence, &request );
+
+    CHAMELEON_Desc_Flush( A, sequence );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile_Async
+ *
+ * @brief Reduces the precision of the tiles of a symmetric or Hermitian matrix
+ * when their norm allows it. Sequence-based variant of CHAMELEON_zhered_Tile().
+ *
+ * It operates on matrices stored by tiles with tiles of potentially different
+ * precisions.  All dimensions are taken from the descriptor.  Note that the
+ * underlying chameleon_pzhered() waits on the sequence before returning.
+ *
+ *******************************************************************************
+ *
+ * @param[in] uplo
+ *          Part of A that is processed (ChamUpper or ChamLower triangle).
+ *
+ * @param[in] precision
+ *          Requested accuracy. If negative, it is read from the
+ *          CHAMELEON_GERED_ACC environment variable, or set to 1e-12 if unset.
+ *
+ * @param[in,out] A
+ *          The matrix whose tiles may be converted; square tiles are required.
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ * @sa CHAMELEON_zhered_Tile
+ */
+int CHAMELEON_zhered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t *A,
+                                 RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL sequence");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL request");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if (sequence->status == CHAMELEON_SUCCESS) {
+        request->status = CHAMELEON_SUCCESS;
+    }
+    else {
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED);
+    }
+
+    /* Check descriptors for correctness */
+    if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) {
+        chameleon_error("CHAMELEON_zhered_Tile_Async", "invalid descriptor");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+    /* Check input arguments */
+    if (A->nb != A->mb) {
+        chameleon_error("CHAMELEON_zhered_Tile_Async", "only square tiles supported");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+
+    /*
+     * Quick return
+     */
+    if ( chameleon_max( A->m, A->n ) == 0 ) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    /* A negative precision selects the CHAMELEON_GERED_ACC value, or 1e-12 */
+    if ( precision < 0. ) {
+        char *algostr = chameleon_getenv( "CHAMELEON_GERED_ACC" );
+        if ( algostr == NULL ) {
+            precision = 1e-12;
+        }
+        else {
+            precision = strtod( algostr, NULL );
+        }
+    }
+    chameleon_pzhered( ChamConjTrans, uplo, precision, A, sequence, request );
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/control/compute_z.h b/control/compute_z.h
index 645018f833f835cccf50ea9f25858d5410e8fce2..088e03140baff5b167727931a6fb9e6b7a1641f0 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -22,7 +22,8 @@
  * @author Alycia Lisito
  * @author Matthieu Kuhn
  * @author Lionel Eyraud-Dubois
- * @date 2023-09-08
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> c d s
  *
  */
@@ -81,6 +82,8 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64
 #if defined(PRECISION_z) || defined(PRECISION_d)
 void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
+void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A,
+                        RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzgerst( cham_uplo_t uplo, CHAM_desc_t *A,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 #endif
diff --git a/coreblas/compute/global.c b/coreblas/compute/global.c
index 0c0ad0e769c11b6dc759996169b82c9640c00e09..54deb11cfc1915cc1c515a86b950e9bd4d385e16 100644
--- a/coreblas/compute/global.c
+++ b/coreblas/compute/global.c
@@ -11,14 +11,14 @@
  *
  * @brief Chameleon global coreblas variables and functions
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Jakub Kurzak
  * @author Piotr Luszczek
  * @author Florent Pruvost
  * @author Guillaume Sylvand
  * @author Mathieu Faverge
  * @author Alycia Lisito
- * @date 2022-02-22
+ * @date 2024-07-17
  *
  */
 #include "coreblas.h"
@@ -58,6 +58,8 @@ void __coreblas_kernel_trace( const char *func, ... )
         size += snprintf( output+size, len-size, "%s%s",
                           first ? "" : ", ",
                           tile->name );
+        size += snprintf( output+size, len-size, " / %p",
+                          CHAM_tile_get_ptr( tile ) );
         first = 0;
     }
     va_end( va_list );
diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h
index 5d667cca39e1fe42eb61d29257ac45e38e2f3075..3f33260f4436ec195323874d3c13b9b44d2c62e4 100644
--- a/include/chameleon/chameleon_z.h
+++ b/include/chameleon/chameleon_z.h
@@ -23,7 +23,8 @@
  * @author Florent Pruvost
  * @author Alycia Lisito
  * @author Matthieu Kuhn
- * @date 2024-04-03
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> c d s
  *
  */
@@ -168,10 +169,9 @@ int CHAMELEON_zplrnk_Tile(int K, CHAM_desc_t *C, unsigned long long int seedA, u
 int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
 int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B);
 int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
-#if defined(PRECISION_z) || defined(PRECISION_d)
 int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A );
+int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A );
 int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A );
-#endif
 int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
 int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
 int CHAMELEON_zpotrimm_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C);
@@ -249,10 +249,9 @@ int CHAMELEON_zplrnk_Tile_Async(int K, CHAM_desc_t *C, unsigned long long int se
 int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-#if defined(PRECISION_z) || defined(PRECISION_d)
 int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+int CHAMELEON_zhered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
-#endif
 int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in
index 49885a8993781dfd8e0454862f91792c96e21688..c274a28841066c66ba54536d8e29a8e909104fdc 100644
--- a/include/chameleon/config.h.in
+++ b/include/chameleon/config.h.in
@@ -11,13 +11,15 @@
  *
  * @brief Chameleon configuration file
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
  * @author Philippe Virouleau
  * @author Raphael Boucherie
  * @author Loris Lucido
- * @date 2023-01-30
+ * @author Abel Calluaud
+ * @author Alycia Lisito
+ * @date 2024-07-17
  *
  */
 #ifndef CHAMELEON_CONFIG_H_HAS_BEEN_INCLUDED
@@ -79,6 +81,9 @@
 /* chameleon compute */
 #cmakedefine CHAMELEON_COPY_DIAG
 
+/* Debug options */
+#cmakedefine CHAMELEON_DEBUG_GERED
+
 /* Define the maximum batch size for kernels using it */
 #define CHAMELEON_BATCH_SIZE @CHAMELEON_BATCH_SIZE@
 
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index b330ec7d840bb3136f8575e240bed5b8a9bc5847..795ebd2d186f9c1e88a44ab6312d40583b1a4d5d 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -24,7 +24,8 @@
  * @author Alycia Lisito
  * @author Romain Peressoni
  * @author Matthieu Kuhn
- * @date 2023-09-11
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> c d s
  *
  */
@@ -79,8 +80,9 @@ void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options,
                          const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *T, int Tm, int Tn );
 void INSERT_TASK_zgered( const RUNTIME_option_t *options,
-                         double threshold, double Anorm, int m, int n,
-                         const CHAM_desc_t *A, int Am, int An );
+                         double threshold, int m, int n,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *Wnorm, int Wnm, int Wnn );
 void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
                          int m, int n,
                          const CHAM_desc_t *A, int Am, int An );
diff --git a/runtime/openmp/codelets/codelet_zgered.c b/runtime/openmp/codelets/codelet_zgered.c
index 19e6f9118969c74540a6a729af02f56f47ea47c6..20b0c191205ec8fe333c556943bac9d520ddc5f1 100644
--- a/runtime/openmp/codelets/codelet_zgered.c
+++ b/runtime/openmp/codelets/codelet_zgered.c
@@ -11,24 +11,28 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> d
  *
  */
 #include "chameleon_openmp.h"
 
 void INSERT_TASK_zgered( const RUNTIME_option_t *options,
-                         double threshold, double Anorm, int m, int n,
-                         const CHAM_desc_t *A, int Am, int An )
+                         double threshold, int m, int n,
+                         const CHAM_desc_t *A,     int Am,  int An,
+                         const CHAM_desc_t *Wnorm, int Wnm, int Wnn )
 {
     fprintf( stderr, "WARNING: gered kernel is not available with OpenMP\n" );
 
     (void)options;
     (void)threshold;
-    (void)Anorm;
     (void)m;
     (void)n;
     (void)A;
     (void)Am;
     (void)An;
+    (void)Wnorm;
+    (void)Wnm;
+    (void)Wnn;
 }
diff --git a/runtime/parsec/codelets/codelet_zgered.c b/runtime/parsec/codelets/codelet_zgered.c
index dcc20888b04936244f2e6ddade9ad3932a3b8413..338a7b5ff34c0f2a3f7b7bab193bd1aa4c049bd5 100644
--- a/runtime/parsec/codelets/codelet_zgered.c
+++ b/runtime/parsec/codelets/codelet_zgered.c
@@ -11,24 +11,28 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> d
  *
  */
 #include "chameleon_parsec.h"
 
 void INSERT_TASK_zgered( const RUNTIME_option_t *options,
-                         double threshold, double Anorm, int m, int n,
-                         const CHAM_desc_t *A, int Am, int An )
+                         double threshold, int m, int n,
+                         const CHAM_desc_t *A,     int Am,  int An,
+                         const CHAM_desc_t *Wnorm, int Wnm, int Wnn )
 {
     fprintf( stderr, "WARNING: gered kernel is not available with PaRSEC\n" );
 
     (void)options;
     (void)threshold;
-    (void)Anorm;
     (void)m;
     (void)n;
     (void)A;
     (void)Am;
     (void)An;
+    (void)Wnorm;
+    (void)Wnm;
+    (void)Wnn;
 }
diff --git a/runtime/quark/codelets/codelet_zgered.c b/runtime/quark/codelets/codelet_zgered.c
index 773bd7cd94dd1e20f57ef0c0f577a5bc98d68d33..b07695f70e0ebb9a10669965efa2917383f693d4 100644
--- a/runtime/quark/codelets/codelet_zgered.c
+++ b/runtime/quark/codelets/codelet_zgered.c
@@ -11,24 +11,28 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> d
  *
  */
 #include "chameleon_quark.h"
 
 void INSERT_TASK_zgered( const RUNTIME_option_t *options,
-                         double threshold, double Anorm, int m, int n,
-                         const CHAM_desc_t *A, int Am, int An )
+                         double threshold, int m, int n,
+                         const CHAM_desc_t *A,     int Am,  int An,
+                         const CHAM_desc_t *Wnorm, int Wnm, int Wnn )
 {
     fprintf( stderr, "WARNING: gered kernel is not available with Quark\n" );
 
     (void)options;
     (void)threshold;
-    (void)Anorm;
     (void)m;
     (void)n;
     (void)A;
     (void)Am;
     (void)An;
+    (void)Wnorm;
+    (void)Wnm;
+    (void)Wnn;
 }
diff --git a/runtime/starpu/codelets/codelet_zgered.c b/runtime/starpu/codelets/codelet_zgered.c
index a6f8cab2804921a580f33344eae60fb88d57b744..fe1c4927ef525aa24dd53a6b83f22d3c5e9959f4 100644
--- a/runtime/starpu/codelets/codelet_zgered.c
+++ b/runtime/starpu/codelets/codelet_zgered.c
@@ -13,7 +13,8 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> d
  *
  */
@@ -22,24 +23,36 @@
 #include "runtime_codelet_zc.h"
 #include "runtime_codelet_z.h"
 
-//#define CHAMELEON_DEBUG_GERED
-
 void INSERT_TASK_zgered( const RUNTIME_option_t *options,
-                         double threshold, double Anorm, int m, int n,
-                         const CHAM_desc_t *A, int Am, int An )
+                         double threshold, int m, int n,
+                         const CHAM_desc_t *A,     int Am,  int An,
+                         const CHAM_desc_t *Wnorm, int Wnm, int Wnn )
 {
     CHAM_tile_t          *tileA;
-    double                u_low;
+    double                u_low, lnorm;
     int64_t               mm, nn;
-#if defined(CHAMELEON_USE_MPI)
-    int                   tag;
-#endif
+    int                   tag = -1;
     starpu_data_handle_t *handleAin;
     starpu_data_handle_t  handleAout;
 
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A, Am, An);
-    CHAMELEON_END_ACCESS_DECLARATION;
+    /*
+     * Collect the norm of the tile on all nodes to do the data conversion
+     * if owned, and only the new data registration if not owned
+     */
+    {
+        starpu_data_handle_t handleNorm = RTBLKADDR( Wnorm, ChamDouble, Wnm, Wnn );
+        CHAM_tile_t         *tileNorm;
+
+#if defined(CHAMELEON_USE_MPI)
+        starpu_mpi_get_data_on_all_nodes_detached( options->sequence->comm, handleNorm );
+#endif
+        starpu_data_acquire( handleNorm, STARPU_R );
+
+        tileNorm = cti_handle_get( handleNorm );
+        lnorm = ((double *)(tileNorm->mat))[0];
+
+        starpu_data_release( handleNorm );
+    }
 
     /* Get the Input handle */
     mm = Am + (A->i / A->mb);
@@ -47,8 +60,6 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
     handleAin = A->schedopt;
     handleAin += ((int64_t)A->lmt) * nn + mm;
 
-    assert( *handleAin != NULL );
-
     /*
      * Lets convert the tile precision based on the following criteria:
      *
@@ -56,10 +67,14 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
      * ||A_{i,j}||_F  < u_{high} * || A ||_F / nt *  1/ u_{low}
      * ||A_{i,j}||_F  < threshold / u_{low}
      */
-
     tileA = A->get_blktile( A, Am, An );
+
 #if defined(CHAMELEON_USE_MPI)
-    tag = starpu_mpi_data_get_tag( *handleAin );
+    /* Back up the MPI tag */
+    if (A->myrank == tileA->rank)
+    {
+        tag = starpu_mpi_data_get_tag( *handleAin );
+    }
 #endif /* defined(CHAMELEON_USE_MPI) */
 
 #if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500)
@@ -69,15 +84,16 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
          * Check for half precision
          */
         u_low = 1.e-4;
-        if ( Anorm < (threshold / u_low) ) {
+        if ( lnorm < (threshold / u_low) )
+        {
 #if defined(CHAMELEON_DEBUG_GERED)
             fprintf( stderr,
                      "[%2d] Convert the tile ( %d, %d ) to half precision\n",
-                     A->myrank, Am, An );
+                    A->myrank, Am, An);
 #endif
             starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf );
 
-            rt_starpu_insert_task(
+            rt_shm_starpu_insert_task(
                 &cl_dlag2h,
                 STARPU_VALUE,    &m,                 sizeof(int),
                 STARPU_VALUE,    &n,                 sizeof(int),
@@ -90,14 +106,22 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
 #endif
                 0);
 
-            starpu_data_unregister_submit( *handleAin );
+            starpu_data_unregister_no_coherency( *handleAin );
             *handleAin = handleAout;
             tileA->flttype = ChamComplexHalf;
-#if defined(CHAMELEON_USE_MPI)
             starpu_mpi_data_register( handleAout, tag, tileA->rank );
-#endif
-            return;
         }
+        else
+        {
+            tileA->flttype = ChamComplexHalf;
+            if (*handleAin != NULL)
+            {
+                starpu_data_unregister_no_coherency(*handleAin);
+                *handleAin = NULL;
+            }
+        }
+        return;
+
     }
 #endif
 #endif
@@ -110,33 +134,44 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
 #else
     u_low = 1e-8;
 #endif
-    if ( Anorm < (threshold / u_low) ) {
+    if ( lnorm < (threshold / u_low) )
+    {
 #if defined(CHAMELEON_DEBUG_GERED)
         fprintf( stderr,
                  "[%2d] Convert the tile ( %d, %d ) to single precision\n",
                  A->myrank, Am, An );
 #endif
-        starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat );
+        if (A->myrank == tileA->rank)
+        {
+            starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat );
 
-        rt_starpu_insert_task(
-            &cl_zlag2c,
-            STARPU_VALUE,    &m,                 sizeof(int),
-            STARPU_VALUE,    &n,                 sizeof(int),
-            STARPU_R,        *handleAin,
-            STARPU_W,         handleAout,
-            STARPU_PRIORITY,  options->priority,
-            STARPU_EXECUTE_ON_WORKER, options->workerid,
+            rt_shm_starpu_insert_task(
+                &cl_zlag2c,
+                STARPU_VALUE,    &m,                 sizeof(int),
+                STARPU_VALUE,    &n,                 sizeof(int),
+                STARPU_R,        *handleAin,
+                STARPU_W,         handleAout,
+                STARPU_PRIORITY,  options->priority,
+                STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-            STARPU_NAME, "zlag2c",
+                STARPU_NAME, "zlag2c",
 #endif
-            0);
+                0);
 
-        starpu_data_unregister_submit( *handleAin );
-        *handleAin = handleAout;
-        tileA->flttype = ChamComplexFloat;
-#if defined(CHAMELEON_USE_MPI)
-        starpu_mpi_data_register( *handleAin, tag, tileA->rank );
-#endif
+            starpu_data_unregister_no_coherency( *handleAin );
+            *handleAin = handleAout;
+            tileA->flttype = ChamComplexFloat;
+            starpu_mpi_data_register( *handleAin, tag, tileA->rank );
+        }
+        else
+        {
+            tileA->flttype = ChamComplexFloat;
+            if (*handleAin != NULL)
+            {
+                starpu_data_unregister_no_coherency(*handleAin);
+                *handleAin = NULL;
+            }
+        }
         return;
     }
 }
diff --git a/runtime/starpu/codelets/codelet_zgerst.c b/runtime/starpu/codelets/codelet_zgerst.c
index 7aca89b00fd3731ee07d46c8fd7ecd236abb6b26..9a5c825f149c171dd2ad14f812d6bab7ed926546 100644
--- a/runtime/starpu/codelets/codelet_zgerst.c
+++ b/runtime/starpu/codelets/codelet_zgerst.c
@@ -11,7 +11,8 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2023-07-06
+ * @author Ana Hourcau
+ * @date 2024-07-17
  * @precisions normal z -> d
  *
  */
@@ -20,28 +21,17 @@
 #include "runtime_codelet_zc.h"
 #include "runtime_codelet_z.h"
 
-//#define CHAMELEON_DEBUG_GERST
-
 void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
                          int m, int n,
                          const CHAM_desc_t *A, int Am, int An )
 {
     CHAM_tile_t          *tileA;
     int64_t               mm, nn;
-#if defined(CHAMELEON_USE_MPI)
-    int                   tag;
-#endif
+    int                   tag = -1;
     starpu_data_handle_t *handleAin;
     starpu_data_handle_t  handleAout;
 
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A, Am, An);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
     tileA = A->get_blktile( A, Am, An );
-    if ( tileA->flttype == ChamComplexDouble ) {
-        return;
-    }
 
     /* Get the Input handle */
     mm = Am + (A->i / A->mb);
@@ -49,7 +39,36 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
     handleAin = A->schedopt;
     handleAin += ((int64_t)A->lmt) * nn + mm;
 
-    assert( *handleAin != NULL );
+    if ( tileA->flttype == ChamComplexDouble ) {
+        starpu_data_handle_t *copy = handleAin;
+
+        /* Remove first copy */
+        copy += ((int64_t)A->lmt * (int64_t)A->lnt);
+        if ( *copy ) {
+            starpu_data_unregister_no_coherency( *copy );
+            *copy = NULL;
+        }
+
+        /* Remove second copy */
+        copy += ((int64_t)A->lmt * (int64_t)A->lnt);
+        if ( *copy ) {
+            starpu_data_unregister_no_coherency( *copy );
+            *copy = NULL;
+        }
+
+        return;
+    }
+
+    if (A->myrank != tileA->rank)
+    {
+        tileA->flttype = ChamComplexDouble;
+        if (*handleAin != NULL)
+        {
+            starpu_data_unregister_no_coherency(*handleAin);
+            *handleAin = NULL;
+        }
+        return;
+    }
 
 #if defined(CHAMELEON_USE_MPI)
     tag = starpu_mpi_data_get_tag( *handleAin );
@@ -64,12 +83,13 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
      * Restore from half precision
      */
     case ChamComplexHalf:
-#if defined(CHAMELEON_DEBUG_GERST)
+        assert( options->withcuda );
+#if defined(CHAMELEON_DEBUG_GERED)
         fprintf( stderr,
                  "[%2d] Convert back the tile ( %d, %d ) from half precision\n",
                  A->myrank, Am, An );
 #endif
-        rt_starpu_insert_task(
+        rt_shm_starpu_insert_task(
             &cl_hlag2d,
             STARPU_VALUE,    &m,                 sizeof(int),
             STARPU_VALUE,    &n,                 sizeof(int),
@@ -86,12 +106,12 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
 #endif
 
     case ChamComplexFloat:
-#if defined(CHAMELEON_DEBUG_GERST)
+#if defined(CHAMELEON_DEBUG_GERED)
         fprintf( stderr,
                  "[%2d] Convert back the tile ( %d, %d ) from half precision\n",
                  A->myrank, Am, An );
 #endif
-        rt_starpu_insert_task(
+        rt_shm_starpu_insert_task(
             &cl_clag2z,
             STARPU_VALUE,    &m,                 sizeof(int),
             STARPU_VALUE,    &n,                 sizeof(int),
@@ -109,10 +129,8 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
         fprintf( stderr, "ERROR: Unknonw input datatype" );
     }
 
-    starpu_data_unregister_submit( *handleAin );
+    starpu_data_unregister_no_coherency( *handleAin );
     *handleAin = handleAout;
     tileA->flttype = ChamComplexDouble;
-#if defined(CHAMELEON_USE_MPI)
     starpu_mpi_data_register( handleAout, tag, tileA->rank );
-#endif
 }
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index cbe083b3e49a4c002fda07a9feeda7db6c360cae..e00b75badbc9a0108f55c4b5a54b6f7160e5b11c 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -20,7 +20,7 @@
  * @author Raphael Boucherie
  * @author Samuel Thibault
  * @author Loris Lucido
- * @date 2024-03-16
+ * @date 2024-07-17
  *
  */
 #include "chameleon_starpu.h"
@@ -432,6 +432,26 @@ void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
     /* Get the correct starpu_handle */
     ptrtile += shift;
 
+    /* Invalidate copies on write access */
+    if ( access & ChamW ) {
+        starpu_data_handle_t *copy = ptrtile;
+        assert( fltshift == 0 );
+
+        /* Remove first copy */
+        copy += ((int64_t)A->lmt * (int64_t)A->lnt);
+        if ( *copy ) {
+            starpu_data_unregister_no_coherency( *copy );
+            *copy = NULL;
+        }
+
+        /* Remove second copy */
+        copy += ((int64_t)A->lmt * (int64_t)A->lnt);
+        if ( *copy ) {
+            starpu_data_unregister_no_coherency( *copy );
+            *copy = NULL;
+        }
+    }
+
     if ( *ptrtile != NULL ) {
         return (void*)(*ptrtile);
     }
@@ -440,7 +460,7 @@ void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
     int myrank = A->myrank;
     int owner  = A->get_rankof( A, m, n );
 
-    if ( myrank == owner ) {
+    if ( (myrank == owner) && (shift == 0) ) {
         if ( (tile->format & CHAMELEON_TILE_HMAT) ||
              (tile->mat != NULL) )
         {
@@ -476,6 +496,8 @@ void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
         starpu_data_handle_t *totile = ptrtile;
 
         fromtile += ((int64_t)A->lmt) * nn + mm;
+        assert( fromtile != totile );
+        assert( tile->flttype != flttype );
         if ( *fromtile != NULL ) {
             insert_task_convert( options, tile->m, tile->n, tile->flttype, *fromtile, flttype, *totile );
         }
diff --git a/runtime/starpu/include/cham_tile_interface.h b/runtime/starpu/include/cham_tile_interface.h
index 8abc48abcabd665bec47975bef768bc21850d8b4..5dc7672d8c90c4127cdf956f0e1bb8d4e718634a 100644
--- a/runtime/starpu/include/cham_tile_interface.h
+++ b/runtime/starpu/include/cham_tile_interface.h
@@ -9,10 +9,11 @@
  *
  * @brief Header to describe the Chameleon tile interface in StarPU
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Gwenole Lucas
- * @date 2022-02-22
+ * @author Ana Hourcau
+ * @date 2024-07-17
  *
  */
 #ifndef _cham_tile_interface_h_
@@ -53,6 +54,20 @@ cti_interface_get( starpu_cham_tile_interface_t *interface )
     return &(interface->tile);
 }
 
+static inline CHAM_tile_t *
+cti_handle_get( starpu_data_handle_t handle )
+{
+    starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *)
+        starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM );
+
+#ifdef STARPU_DEBUG
+    STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID,
+                       "Error. The given data is not a cham_tile." );
+#endif
+
+    return &(cham_tile_interface->tile);
+}
+
 void starpu_cham_tile_interface_init();
 void starpu_cham_tile_interface_fini();
 
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index b795b4c79454e65ad9e22ca9a37b124bfe6c734a..41949dfbb7c345050a5260b47646276c7af57002 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -20,7 +20,7 @@
  * @author Loris Lucido
  * @author Terry Cojean
  * @author Matthieu Kuhn
- * @date 2024-03-16
+ * @date 2024-07-17
  *
  */
 #ifndef _chameleon_starpu_h_
@@ -149,6 +149,14 @@ void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
 
 #endif
 
+#if defined(CHAMELEON_RUNTIME_SYNC)
+#define rt_shm_starpu_insert_task( _codelet_, ... )                         \
+    starpu_insert_task( (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
+#else
+#define rt_shm_starpu_insert_task( _codelet_, ... )                         \
+    starpu_insert_task( (_codelet_), ##__VA_ARGS__ )
+#endif
+
 /*
  * Enable codelets names
  */
diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c
index 352d7bd288833a3ffcb0e0d04f7cf06a6f96fe2f..89904548b70ed8ecced9b07fe76c9d0c541e66fb 100644
--- a/runtime/starpu/interface/cham_tile_interface.c
+++ b/runtime/starpu/interface/cham_tile_interface.c
@@ -13,7 +13,9 @@
  * @author Mathieu Faverge
  * @author Gwenole Lucas
  * @author Samuel Thibault
- * @date 2023-08-22
+ * @author Abel Calluaud
+ * @author Ana Hourcau
+ * @date 2024-07-17
  *
  */
 #include "chameleon_starpu.h"
@@ -77,20 +79,6 @@ cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface  _
 }
 #endif
 
-static inline CHAM_tile_t *
-cti_handle_get( starpu_data_handle_t handle )
-{
-    starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *)
-        starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM );
-
-#ifdef STARPU_DEBUG
-    STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID,
-                       "Error. The given data is not a cham_tile." );
-#endif
-
-    return &(cham_tile_interface->tile);
-}
-
 int
 cti_handle_get_m( starpu_data_handle_t handle )
 {