diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h
index b7281f13566b0c9a4ed274d7a9be762a5c0f3cc9..39e741a66ba4d6b10582acfdaf1fad8aa85f34c7 100644
--- a/include/chameleon/tasks.h
+++ b/include/chameleon/tasks.h
@@ -11,11 +11,11 @@
  *
  * @brief Chameleon elementary tasks main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Cedric Augonnet
  * @author Florent Pruvost
- * @date 2022-02-22
+ * @date 2023-07-06
  *
  */
 #ifndef _chameleon_tasks_h_
@@ -100,6 +100,13 @@ void INSERT_TASK_map( const RUNTIME_option_t *options,
                       cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An,
                       cham_unary_operator_t op_fct, void *op_args );
 
+void INSERT_TASK_hgemm( const RUNTIME_option_t *options,
+                        cham_trans_t transA, cham_trans_t transB,
+                        int m, int n, int k, int nb,
+                        CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                  const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Real16_t beta,  const CHAM_desc_t *C, int Cm, int Cn );
+
 #include "chameleon/tasks_z.h"
 #include "chameleon/tasks_d.h"
 #include "chameleon/tasks_c.h"
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 3e818990583537b7daf58f2d0208174ac9238523..c6e600f17bdd10e26f41fbb2592627dbefb66cf1 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -252,6 +252,7 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
 
 set(CODELETS_SRC
   codelets/codelet_convert.c
+  codelets/codelet_hgemm.c
   ${CODELETS_SRC}
   )
 
diff --git a/runtime/starpu/codelets/codelet_hgemm.c b/runtime/starpu/codelets/codelet_hgemm.c
new file mode 100644
index 0000000000000000000000000000000000000000..a595bf990cd336e50aee8c541954ef5591c5ed8b
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_hgemm.c
@@ -0,0 +1,250 @@
+/**
+ *
+ * @file starpu/codelet_hgemm.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon hgemm StarPU codelet
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-06
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelets.h"
+
+CHAMELEON_CL_CB( hgemm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */
+
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_hgemm_cuda_func( void *descr[], void *cl_arg )
+{
+    struct cl_hgemm_args_s *clargs = (struct cl_hgemm_args_s *)cl_arg;
+    cublasHandle_t          handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+
+    CUDA_hgemm(
+        clargs->transA, clargs->transB,
+        clargs->m, clargs->n, clargs->k,
+        (CHAMELEON_Real16_t*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (CHAMELEON_Real16_t*)&(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_hgemm_hip_func( void *descr[], void *cl_arg )
+{
+    struct cl_hgemm_args_s *clargs = (struct cl_hgemm_args_s *)cl_arg;
+    hipblasHandle_t         handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+
+    HIP_hgemm(
+        clargs->transA, clargs->transB,
+        clargs->m, clargs->n, clargs->k,
+        (CHAMELEON_Real16_t*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (CHAMELEON_Real16_t*)&(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+
+    return;
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( hgemm, NULL, cl_hgemm_hip_func, STARPU_HIP_ASYNC )
+#else
+CODELETS( hgemm, NULL, cl_hgemm_cuda_func, STARPU_CUDA_ASYNC )
+#endif
+
+void INSERT_TASK_hgemm_Astat( const RUNTIME_option_t *options,
+                              cham_trans_t transA, cham_trans_t transB,
+                              int m, int n, int k, int nb,
+                              CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                        const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Real16_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* if ( alpha == 0. ) { */
+    /*     INSERT_TASK_hlascal( options, ChamUpperLower, m, n, nb, */
+    /*                          beta, C, Cm, Cn ); */
+    /*     return; */
+    /* } */
+
+    struct cl_hgemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec    = 0;
+    char                    *cl_name = "hgemm_Astat";
+
+    /* Handle cache */
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+     /* Check A as write, since it will be the owner of the computation */
+    CHAMELEON_ACCESS_W(A, Am, An);
+    CHAMELEON_ACCESS_R(B, Bm, Bn);
+     /* Check C as read, since it will be used in a reduction */
+    CHAMELEON_ACCESS_R(C, Cm, Cn);
+    exec = __chameleon_need_exec;
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_hgemm_args_s ) );
+        clargs->transA = transA;
+        clargs->transB = transB;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->k      = k;
+        clargs->alpha  = alpha;
+        clargs->beta   = beta;
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_hgemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    if ( beta == 0. ) {
+        accessC = STARPU_W;
+    }
+#if defined(HAVE_STARPU_MPI_REDUX)
+    else if ( beta == 1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+    else {
+        accessC = STARPU_RW;
+    }
+
+    /* Refine name */
+    cl_name = chameleon_codelet_name( cl_name, 3,
+                                      A->get_blktile( A, Am, An ),
+                                      B->get_blktile( B, Bm, Bn ),
+                                      C->get_blktile( C, Cm, Cn ) );
+
+    /* Insert the task */
+    rt_starpu_insert_task(
+        &cl_hgemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_hgemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An),
+        STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn),
+        accessC,  RTBLKADDR(C, ChamRealHalf, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_NODE,   A->get_rankof(A, Am, An),
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              cl_name,
+#endif
+        0 );
+}
+
+void INSERT_TASK_hgemm( const RUNTIME_option_t *options,
+                        cham_trans_t transA, cham_trans_t transB,
+                        int m, int n, int k, int nb,
+                        CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                  const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Real16_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* if ( alpha == 0. ) { */
+    /*     INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, */
+    /*                          beta, C, Cm, Cn ); */
+    /*     return; */
+    /* } */
+
+    struct cl_hgemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec = 0;
+    char                    *cl_name = "hgemm";
+
+    /* Handle cache */
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_R(A, Am, An);
+    CHAMELEON_ACCESS_R(B, Bm, Bn);
+    CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_hgemm_args_s ) );
+        clargs->transA = transA;
+        clargs->transB = transB;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->k      = k;
+        clargs->alpha  = alpha;
+        clargs->beta   = beta;
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_hgemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
+
+    /* Refine name */
+    cl_name = chameleon_codelet_name( cl_name, 3,
+                                      A->get_blktile( A, Am, An ),
+                                      B->get_blktile( B, Bm, Bn ),
+                                      C->get_blktile( C, Cm, Cn ) );
+
+    /* Insert the task */
+    rt_starpu_insert_task(
+        &cl_hgemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_hgemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An),
+        STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn),
+        accessC,  RTBLKADDR(C, ChamRealHalf, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              cl_name,
+#endif
+        0 );
+}
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index 36dda081062b9840531622c0c46e4de29a1eaaf1..ee4817fe4cc35abf5a73baffad8b224fcf79452a 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -11,7 +11,7 @@
  *
  * @brief Chameleon StarPU descriptor routines
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
@@ -20,7 +20,7 @@
  * @author Raphael Boucherie
  * @author Samuel Thibault
  * @author Loris Lucido
- * @date 2023-01-30
+ * @date 2023-07-06
  *
  */
 #include "chameleon_starpu.h"
@@ -101,6 +101,7 @@ void RUNTIME_desc_create( CHAM_desc_t *desc )
 {
     int64_t lmt = desc->lmt;
     int64_t lnt = desc->lnt;
+    size_t  nbtiles = lmt * lnt;
 
     desc->occurences = 1;
 
@@ -108,8 +109,12 @@ void RUNTIME_desc_create( CHAM_desc_t *desc )
      * Allocate starpu_handle_t array (handlers are initialized on the fly when
      * discovered by any algorithm to save space)
      */
-    desc->schedopt = (void*)calloc(lnt*lmt,sizeof(starpu_data_handle_t));
-    assert(desc->schedopt);
+    if ( cham_is_mixed( desc->dtyp ) ) {
+        nbtiles *= 3;
+    }
+
+    desc->schedopt = (void*)calloc( nbtiles, sizeof(starpu_data_handle_t) );
+    assert( desc->schedopt );
 
 #if !defined(CHAMELEON_SIMULATION)
 #if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
@@ -160,7 +165,7 @@ void RUNTIME_desc_create( CHAM_desc_t *desc )
      */
     {
         chameleon_starpu_tag_init();
-        desc->mpitag = chameleon_starpu_tag_book( (int64_t)lnt * (int64_t)lmt );
+        desc->mpitag = chameleon_starpu_tag_book( nbtiles );
 
         if ( desc->mpitag == -1 ) {
             chameleon_fatal_error("RUNTIME_desc_create", "Can't pursue computation since no more tags are available");
@@ -181,41 +186,45 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc )
      * If this is the last descriptor using the matrix, we release the handle
      * and unregister the GPU data
      */
-    if ( desc->occurences == 0 ) {
-        starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
-        int lmt = desc->lmt;
-        int lnt = desc->lnt;
-        int m, n;
-
-        for (n = 0; n < lnt; n++) {
-            for (m = 0; m < lmt; m++)
-            {
-                if ( *handle != NULL ) {
-                    starpu_data_unregister(*handle);
-                    *handle = NULL;
-                }
-                handle++;
-            }
+    if ( desc->occurences > 0 ) {
+        return;
+    }
+
+    starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
+    int64_t lmt = desc->lmt;
+    int64_t lnt = desc->lnt;
+    int64_t nbtiles = lmt * lnt;
+    int64_t m;
+
+    if ( cham_is_mixed( desc->dtyp ) ) {
+        nbtiles *= 3;
+    }
+
+    for (m = 0; m < nbtiles; m++, handle++)
+    {
+        if ( *handle != NULL ) {
+            starpu_data_unregister(*handle);
+            *handle = NULL;
         }
+    }
 
 #if !defined(CHAMELEON_SIMULATION)
 #if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
-        if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
+    if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
+    {
+        /* Unmap the pinned memory associated to the matrix */
+        if (gpuHostUnregister(desc->mat) != gpuSuccess)
         {
-            /* Unmap the pinned memory associated to the matrix */
-            if (gpuHostUnregister(desc->mat) != gpuSuccess)
-            {
-                chameleon_warning("RUNTIME_desc_destroy(StarPU)",
-                                  "gpuHostUnregister failed to unregister the "
-                                  "pinned memory associated to the matrix");
-            }
+            chameleon_warning("RUNTIME_desc_destroy(StarPU)",
+                              "gpuHostUnregister failed to unregister the "
+                              "pinned memory associated to the matrix");
         }
+    }
 #endif
 #endif
-        chameleon_starpu_tag_release( desc->mpitag );
+    chameleon_starpu_tag_release( desc->mpitag );
 
-        free( desc->schedopt );
-    }
+    free( desc->schedopt );
 }
 
 /**
@@ -335,24 +344,37 @@ void RUNTIME_desc_flush( const CHAM_desc_t        *desc,
 void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence,
                          const CHAM_desc_t *A, int m, int n )
 {
+    int local, i, imax = 1;
     int64_t mm = m + (A->i / A->mb);
     int64_t nn = n + (A->j / A->nb);
-    int64_t shift = ((int64_t)A->lmt) * nn + mm;
+    int64_t shift   = ((int64_t)A->lmt) * nn + mm;
+    int64_t nbtiles = ((int64_t)(A->lmt)) * ((int64_t)(A->lnt));
     starpu_data_handle_t *handle = A->schedopt;
     handle += shift;
 
-    if (*handle == NULL) {
-        return;
-    }
+    local = chameleon_desc_islocal( A, m, n );
+
+    if ( cham_is_mixed( A->dtyp ) ) {
+        imax = 3;
+     }
+
+    for( i=0; i<imax; i++ ) {
+        starpu_data_handle_t *handlebis;
+
+        handlebis = handle + i * nbtiles;
+
+        if ( *handlebis == NULL ) {
+            continue;
+        }
 
 #if defined(CHAMELEON_USE_MPI)
-    starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle );
+        starpu_mpi_cache_flush( MPI_COMM_WORLD, *handlebis );
 #endif
 
-    if ( chameleon_desc_islocal( A, m, n ) ) {
-        chameleon_starpu_data_wont_use( *handle );
+        if ( local ) {
+            chameleon_starpu_data_wont_use( *handlebis );
+        }
     }
-
     (void)sequence;
 }
 
@@ -443,3 +465,71 @@ void *RUNTIME_data_getaddr( const CHAM_desc_t *A, int m, int n )
     assert( *ptrtile );
     return (void*)(*ptrtile);
 }
+
+void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
+                                           cham_access_t access, cham_flttype_t flttype,
+                                           const CHAM_desc_t *A, int m, int n )
+{
+    int64_t mm = m + (A->i / A->mb);
+    int64_t nn = n + (A->j / A->nb);
+
+    CHAM_tile_t *tile = A->get_blktile( A, m, n );
+    starpu_data_handle_t *ptrtile = A->schedopt;
+
+    int     fltshift = (cham_get_arith( tile->flttype ) - cham_get_arith( flttype ) + 3 ) % 3;
+    int64_t shift = (int64_t)fltshift * ((int64_t)A->lmt * (int64_t)A->lnt);
+    shift = shift + ((int64_t)A->lmt) * nn + mm;
+
+    /* Get the correct starpu_handle */
+    ptrtile += shift;
+
+    if ( *ptrtile != NULL ) {
+        return (void*)(*ptrtile);
+    }
+
+    int home_node = -1;
+    int myrank = A->myrank;
+    int owner  = A->get_rankof( A, m, n );
+
+    if ( myrank == owner ) {
+        if ( (tile->format & CHAMELEON_TILE_HMAT) ||
+             (tile->mat != NULL) )
+        {
+            home_node = STARPU_MAIN_RAM;
+        }
+    }
+
+    starpu_cham_tile_register( ptrtile, home_node, tile, flttype );
+
+#if defined(HAVE_STARPU_DATA_SET_OOC_FLAG)
+    if ( A->ooc == 0 ) {
+        starpu_data_set_ooc_flag( *ptrtile, 0 );
+    }
+#endif
+
+#if defined(HAVE_STARPU_DATA_SET_COORDINATES)
+    starpu_data_set_coordinates( *ptrtile, 3, m, n, cham_get_arith( flttype ) );
+#endif
+
+#if defined(CHAMELEON_USE_MPI)
+    starpu_mpi_data_register( *ptrtile, A->mpitag + shift, owner );
+#endif /* defined(CHAMELEON_USE_MPI) */
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    fprintf( stderr, "%s - %p registered with tag %ld\n",
+             tile->name, *ptrtile, A->mpitag + shift );
+#endif
+    assert( *ptrtile );
+
+    /* Submit the data conversion */
+    if (( fltshift != 0 ) && (access & ChamR) && (owner == myrank) ) {
+        starpu_data_handle_t *fromtile = A->schedopt;
+        starpu_data_handle_t *totile = ptrtile;
+
+        fromtile += ((int64_t)A->lmt) * nn + mm;
+        if ( *fromtile != NULL ) {
+            insert_task_convert( options, tile->m, tile->n, tile->flttype, *fromtile, flttype, *totile );
+        }
+    }
+    return (void*)(*ptrtile);
+}
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 8e77d14a9dea84153c6b0672d63cecb3c71ed077..8d421ddbb5234d073ba9ea71b3c97ff3046ff9fc 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -116,6 +116,10 @@ static inline int cham_to_starpu_access( cham_access_t accessA ) {
     return accessA;
 }
 
+void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
+                                           cham_access_t access, cham_flttype_t flttype,
+                                           const CHAM_desc_t *A, int m, int n );
+
 /*
  * MPI Redefinitions
  */
diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h
index f89b772939ee8155540a99e5c8ef7d8cb0154827..9c9af1b6bd7c451b3522682b20652816183ea5a7 100644
--- a/runtime/starpu/include/runtime_codelets.h
+++ b/runtime/starpu/include/runtime_codelets.h
@@ -151,6 +151,17 @@
 #endif
 
 CODELETS_HEADER(map);
+CODELETS_HEADER(hgemm);
+
+struct cl_hgemm_args_s {
+    cham_trans_t transA;
+    cham_trans_t transB;
+    int m;
+    int n;
+    int k;
+    CHAMELEON_Real16_t alpha;
+    CHAMELEON_Real16_t beta;
+};
 
 void
 insert_task_convert( const RUNTIME_option_t *options,
diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c
index 27eb801cb266a0038e77b19e04e729b68b3270f4..1e837e1b0ca4c4a5e8418e9acf86dcfb71029851 100644
--- a/runtime/starpu/interface/cham_tile_interface.c
+++ b/runtime/starpu/interface/cham_tile_interface.c
@@ -672,6 +672,8 @@ starpu_cham_tile_register( starpu_data_handle_t *handleptr,
             .tilesize   = tile->m * tile->n * elemsize,
         };
     memcpy( &(cham_tile_interface.tile), tile, sizeof( CHAM_tile_t ) );
+    /* Overwrite the flttype in case it comes from a data conversion */
+    cham_tile_interface.tile.flttype = flttype;
 
     if ( tile->format & CHAMELEON_TILE_FULLRANK ) {
         cham_tile_interface.allocsize = tile->m * tile->n * elemsize;