diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index bc972bcf54562388b6c091049c21d972b79cadf9..bd0823a6f425e600243e83e1e51794d70209f08b 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -22,7 +22,9 @@
  * @author Gwenole Lucas
  * @author Philippe Swartvagher
  * @author Lucas Nesi
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @author Terry Cojean
+ * @date 2023-01-30
  * @precisions normal z -> c d s
  *
  */
@@ -61,7 +63,7 @@ cl_zgemm_cpu_func( void *descr[], void *cl_arg )
                  clargs->beta,  tileC );
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void
 cl_zgemm_cuda_func( void *descr[], void *cl_arg )
 {
@@ -90,12 +92,48 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg )
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_zgemm_hip_func( void *descr[], void *cl_arg )
+{
+    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
+    hipblasHandle_t          handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+
+    HIP_zgemm(
+        clargs->transA, clargs->transB,
+        clargs->m, clargs->n, clargs->k,
+        (hipDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (hipDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+
+    return;
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zgemm, cl_zgemm_cpu_func, cl_zgemm_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                               cham_trans_t transA, cham_trans_t transB,
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index b7f70ca6868e5a5e8c1543c5ff1c51204be7ec60..cff474bbf6c75e4fdb1108895d6ee76e8a72f0e8 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -57,7 +57,7 @@ cl_zhemm_cpu_func( void *descr[], void *cl_arg )
                  clargs->beta,  tileC );
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void
 cl_zhemm_cuda_func( void *descr[], void *cl_arg )
 {
@@ -86,12 +86,46 @@ cl_zhemm_cuda_func( void *descr[], void *cl_arg )
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_zhemm_hip_func( void *descr[], void *cl_arg )
+{
+    struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg;
+    hipblasHandle_t         handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+
+    HIP_zhemm(
+        clargs->side, clargs->uplo,
+        clargs->m, clargs->n,
+        (hipDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (hipDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zhemm, cl_zhemm_cpu_func, cl_zhemm_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
                               cham_side_t side, cham_uplo_t uplo,
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 38babbe5a230611d595f4716c170f1a5618ad420..fa102852615bac660a9ea9f40367faa10cfc4f3a 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -48,7 +48,7 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg)
                 n, k, alpha, tileA, tileB, beta, tileC);
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 {
     cublasHandle_t handle = starpu_cublas_get_local_handle();
@@ -75,13 +75,46 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
                  &beta,  tileC->mat, tileC->ld,
                  handle );
 }
-#endif /* CHAMELEON_USE_CUDA */
+#endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void cl_zher2k_hip_func(void *descr[], void *cl_arg)
+{
+    hipblasHandle_t handle = starpu_hipblas_get_local_handle();
+    cham_uplo_t uplo;
+    cham_trans_t trans;
+    int n;
+    int k;
+    hipDoubleComplex alpha;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    double beta;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
+
+    HIP_zher2k( uplo, trans,
+                 n, k,
+                 &alpha, tileA->mat, tileA->ld,
+                         tileB->mat, tileB->ld,
+                 &beta,  tileC->mat, tileC->ld,
+                 handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS(zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC)
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zher2k, cl_zher2k_cpu_func, cl_zher2k_hip_func, STARPU_HIP_ASYNC )
+#else
+CODELETS( zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 /**
  *
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 894f0b0269754ba9e0c0171efda6abdaa59b34b0..29301b83097c8b61383c00861777c35917ac27cd 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -74,12 +74,38 @@ cl_zherk_cuda_func(void *descr[], void *cl_arg)
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_zherk_hip_func(void *descr[], void *cl_arg)
+{
+    hipblasHandle_t handle = starpu_hipblas_get_local_handle();
+    struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileC = cti_interface_get(descr[1]);
+
+    HIP_zherk(
+        clargs->uplo, clargs->trans, clargs->n, clargs->k,
+        &(clargs->alpha),
+        tileA->mat, tileA->ld,
+        &(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zherk, cl_zherk_cpu_func, cl_zherk_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_zherk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 9fad20e13bdc75a10cde4bffb998343d71503fdb..54750f0b3f7de2f76083163b9a881db77d8066e3 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -58,7 +58,7 @@ cl_zsymm_cpu_func( void *descr[], void *cl_arg )
                  clargs->beta,  tileC );
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void
 cl_zsymm_cuda_func( void *descr[], void *cl_arg )
 {
@@ -87,12 +87,46 @@ cl_zsymm_cuda_func( void *descr[], void *cl_arg )
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_zsymm_hip_func( void *descr[], void *cl_arg )
+{
+    struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg;
+    hipblasHandle_t         handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+
+    HIP_zsymm(
+        clargs->side, clargs->uplo,
+        clargs->m, clargs->n,
+        (hipDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (hipDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zsymm, cl_zsymm_cpu_func, cl_zsymm_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
                               cham_side_t side, cham_uplo_t uplo,
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index af47c87edf546b3a4d3005081c4acff8d61161fe..7d5ce81aadff62dd3866bee1a4ee9ecc63b354a0 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -48,7 +48,7 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
                  n, k, alpha, tileA, tileB, beta, tileC);
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 {
     cublasHandle_t handle = starpu_cublas_get_local_handle();
@@ -75,13 +75,46 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
                  &beta,  tileC->mat, tileC->ld,
                  handle );
 }
-#endif /* CHAMELEON_USE_CUDA */
+#endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void cl_zsyr2k_hip_func(void *descr[], void *cl_arg)
+{
+    hipblasHandle_t handle = starpu_hipblas_get_local_handle();
+    cham_uplo_t uplo;
+    cham_trans_t trans;
+    int n;
+    int k;
+    hipDoubleComplex alpha;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    hipDoubleComplex beta;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+    tileC = cti_interface_get(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
+
+    HIP_zsyr2k( uplo, trans,
+                 n, k,
+                 &alpha, tileA->mat, tileA->ld,
+                         tileB->mat, tileB->ld,
+                 &beta,  tileC->mat, tileC->ld,
+                 handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS(zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC)
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_hip_func, STARPU_HIP_ASYNC )
+#else
+CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 /**
  *
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index 8d8dcdbe752e256a40ec8c914378dbc6356ba91f..d9b7e84fd00336b445a05eeac648957f61eb51e7 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -75,12 +75,38 @@ cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_zsyrk_hip_func(void *descr[], void *cl_arg)
+{
+    hipblasHandle_t handle = starpu_hipblas_get_local_handle();
+    struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileC;
+
+    tileA = cti_interface_get(descr[0]);
+    tileC = cti_interface_get(descr[1]);
+
+    HIP_zsyrk(
+        clargs->uplo, clargs->trans, clargs->n, clargs->k,
+        (hipDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        (hipDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index e51b81ab7b9b45028b7f63aa00f1643be0a948c6..3bb14d38bef1119024293da7ecdfb6f58a67c12f 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -53,7 +53,7 @@ cl_ztrmm_cpu_func(void *descr[], void *cl_arg)
                  clargs->m, clargs->n, clargs->alpha, tileA, tileB );
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void
 cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 {
@@ -74,12 +74,38 @@ cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_ztrmm_hip_func(void *descr[], void *cl_arg)
+{
+    struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg;
+    hipblasHandle_t         handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+
+    HIP_ztrmm(
+        clargs->side, clargs->uplo, clargs->transA, clargs->diag,
+        clargs->m, clargs->n,
+        (hipDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 0196649684a4652ff5686bc060aad7e7879a1f2b..19e83c01a0010bcd1d495932ab7c67e1ac3dc139 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -20,7 +20,9 @@
  * @author Lucas Barros de Assis
  * @author Florent Pruvost
  * @author Gwenole Lucas
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @author Terry Cojean
+ * @date 2023-01-30
  * @precisions normal z -> c d s
  *
  */
@@ -54,7 +56,7 @@ cl_ztrsm_cpu_func(void *descr[], void *cl_arg)
                  clargs->m, clargs->n, clargs->alpha, tileA, tileB );
 }
 
-#ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUDA)
 static void
 cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 {
@@ -75,12 +77,40 @@ cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
         handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_ztrsm_hip_func(void *descr[], void *cl_arg)
+{
+    struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg;
+    hipblasHandle_t         handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+
+    HIP_ztrsm(
+        clargs->side, clargs->uplo, clargs->transA, clargs->diag,
+        clargs->m, clargs->n,
+        (hipDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        handle );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
+
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
+
+#if defined(CHAMELEON_USE_HIP)
+CODELETS_GPU( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_hip_func, STARPU_HIP_ASYNC )
+#else
 CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,