diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 2b4d0a1db651921f94fe509cd5976800e69c162a..0968de66d35b777f2efbbc2d75ebfdbc9ccdb475 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -24,7 +24,9 @@
 #  @author Florent Pruvost
 #  @author Samuel Thibault
 #  @author Matthieu Kuhn
-#  @date 2022-02-22
+#  @author Loris Lucido
+#  @author Terry Cojean
+#  @date 2023-01-30
 #
 ###
 cmake_minimum_required(VERSION 3.1)
@@ -147,6 +149,13 @@ if ( STARPU_FOUND )
     endif()
   endif()
 
+  if (CHAMELEON_USE_HIP AND NOT CHAMELEON_SIMULATION)
+    check_function_exists(starpu_hipblas_get_local_handle HAVE_STARPU_HIPBLAS_GET_LOCAL_HANDLE)
+    if ( NOT HAVE_STARPU_HIPBLAS_GET_LOCAL_HANDLE )
+      message(FATAL_ERROR "The detected StarPU library does not include hipblas support. Please disable CHAMELEON_USE_HIP_{CUDA,ROC} or provide a StarPU library with hipblas support.")
+    endif()
+  endif()
+
   morse_cmake_required_unset()
 endif ( STARPU_FOUND )
 
@@ -268,6 +277,9 @@ if (NOT CHAMELEON_SIMULATION)
   if (CHAMELEON_USE_CUDA)
     add_dependencies(chameleon_starpu cudablas_include)
   endif()
+  if (CHAMELEON_USE_HIP)
+    add_dependencies(chameleon_starpu hipblas_include)
+  endif()
 endif()
 
 target_include_directories(chameleon_starpu PUBLIC
@@ -294,6 +306,9 @@ if (NOT CHAMELEON_SIMULATION)
   if(CHAMELEON_USE_CUDA)
     target_link_libraries(chameleon_starpu PRIVATE cudablas)
   endif(CHAMELEON_USE_CUDA)
+  if(CHAMELEON_USE_HIP)
+    target_link_libraries(chameleon_starpu PRIVATE hipblas)
+  endif(CHAMELEON_USE_HIP)
 endif(NOT CHAMELEON_SIMULATION)
 target_link_libraries(chameleon_starpu PRIVATE MORSE::M)
 
@@ -307,6 +322,9 @@ if (NOT CHAMELEON_SIMULATION)
   if (CHAMELEON_USE_CUDA)
     target_link_libraries(chameleon_starpu PUBLIC CUDA::CUBLAS)
   endif()
+  if (CHAMELEON_USE_HIP)
+    target_link_libraries(chameleon_starpu PUBLIC HIP::HIPBLAS)
+  endif()
 endif()
 
 # export target
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index c83ab1283339f214c91c61f7643d75874e67658e..e2cdb399f401872d4712cb7894d350155994c098 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -16,6 +16,7 @@
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
+ * @author Terry Cojean
  * @date 2022-02-22
  *
  */
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index 5a36b8a6f0a2643f148b09b90b900b9a323caa27..d527cd56b307ad2dc033bf7ecc878a26cddff523 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -19,7 +19,9 @@
  * @author Philippe Swartvagher
  * @author Samuel Thibault
  * @author Matthieu Kuhn
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @author Terry Cojean
+ * @date 2023-01-30
  *
  */
 #include "chameleon_starpu.h"
@@ -221,8 +223,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
                                              );
 #endif
 
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA)
     starpu_cublas_init();
+#elif defined(CHAMELEON_USE_HIP)
+    starpu_hipblas_init();
+#endif
 #endif
 
     starpu_cham_tile_interface_init();
@@ -249,6 +255,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt )
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
     starpu_cublas_shutdown();
 #endif
+#if defined(CHAMELEON_USE_HIP) && !defined(CHAMELEON_SIMULATION)
+    starpu_hipblas_shutdown();
+#endif
 
 #if defined(CHAMELEON_USE_MPI)
     starpu_mpi_shutdown();
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index 72dd1b81d81b6830edc807e5fdb4a48c9276bd90..385adc0df81a1a87de81971fe3590853b5214287 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -19,7 +19,8 @@
  * @author Guillaume Sylvand
  * @author Raphael Boucherie
  * @author Samuel Thibault
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @date 2023-01-30
  *
  */
 #include "chameleon_starpu.h"
@@ -138,6 +139,26 @@ void RUNTIME_free( void  *ptr,
 #endif
 }
 
+#if defined(CHAMELEON_USE_CUDA)
+
+#define gpuError_t              cudaError_t
+#define gpuHostRegister         cudaHostRegister
+#define gpuHostUnregister       cudaHostUnregister
+#define gpuHostRegisterPortable cudaHostRegisterPortable
+#define gpuSuccess              cudaSuccess
+#define gpuGetErrorString       cudaGetErrorString
+
+#elif defined(CHAMELEON_USE_HIP)
+
+#define gpuError_t              hipError_t
+#define gpuHostRegister         hipHostRegister
+#define gpuHostUnregister       hipHostUnregister
+#define gpuHostRegisterPortable hipHostRegisterPortable
+#define gpuSuccess              hipSuccess
+#define gpuGetErrorString       hipGetErrorString
+
+#endif
+
 /**
  *  Create data descriptor
  */
@@ -155,25 +176,27 @@ void RUNTIME_desc_create( CHAM_desc_t *desc )
     desc->schedopt = (void*)calloc(lnt*lmt,sizeof(starpu_data_handle_t));
     assert(desc->schedopt);
 
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
     /*
-     * Register allocated memory as CUDA pinned memory
+     * Register allocated memory as GPU pinned memory
      */
     if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
     {
         int64_t eltsze = CHAMELEON_Element_Size(desc->dtyp);
         size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze;
-        cudaError_t rc;
+        gpuError_t rc;
 
         /* Register the matrix as pinned memory */
-        rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable );
-        if ( rc != cudaSuccess )
+        rc = gpuHostRegister( desc->mat, size, gpuHostRegisterPortable );
+        if ( rc != gpuSuccess )
         {
             /* Disable the unregister as register failed */
             desc->register_mat = 0;
-            chameleon_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc ));
+            chameleon_warning("RUNTIME_desc_create(StarPU): gpuHostRegister - ", gpuGetErrorString( rc ));
         }
     }
+#endif
 #endif
 
     if (desc->ooc) {
@@ -247,18 +270,20 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc )
             }
         }
 
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
         if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
         {
             /* Unmap the pinned memory associated to the matrix */
-            if (cudaHostUnregister(desc->mat) != cudaSuccess)
+            if (gpuHostUnregister(desc->mat) != gpuSuccess)
             {
                 chameleon_warning("RUNTIME_desc_destroy(StarPU)",
-                                  "cudaHostUnregister failed to unregister the "
+                                  "gpuHostUnregister failed to unregister the "
                                   "pinned memory associated to the matrix");
             }
         }
-#endif /* defined(CHAMELEON_USE_CUDA) */
+#endif
+#endif
 
         free(desc->schedopt);
     }
diff --git a/runtime/starpu/control/runtime_workspace.c b/runtime/starpu/control/runtime_workspace.c
index fb209146e3c7e44b4b129379bf23536620322cf9..ad9b5cd9fd0d0abb8f7760b9a66c0936473f1281 100644
--- a/runtime/starpu/control/runtime_workspace.c
+++ b/runtime/starpu/control/runtime_workspace.c
@@ -15,11 +15,26 @@
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Florent Pruvost
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @date 2023-01-30
  *
  */
 #include "chameleon_starpu.h"
 
+#if defined(CHAMELEON_USE_CUDA)
+#define GPU_WORKER_TYPE STARPU_CUDA_WORKER
+#define gpuMallocHost   cudaMallocHost
+#define gpuMalloc       cudaMalloc
+#define gpuFreeHost     cudaFreeHost
+#define gpuFree         cudaFree
+#elif defined(CHAMELEON_USE_HIP)
+#define GPU_WORKER_TYPE STARPU_HIP_WORKER
+#define gpuMallocHost   hipMallocHost
+#define gpuMalloc       hipMalloc
+#define gpuFreeHost     hipFreeHost
+#define gpuFree         hipFree
+#endif
+
 static void RUNTIME_allocate_workspace_on_workers(void *arg)
 {
     struct chameleon_starpu_ws_s *workspace = arg;
@@ -28,9 +43,10 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg)
 
     int id = starpu_worker_get_id();
 
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
     type = starpu_worker_get_type(id);
-    if (type == STARPU_CUDA_WORKER)
+    if ( type == GPU_WORKER_TYPE )
     {
         int memory_location = workspace->memory_location;
 
@@ -39,14 +55,15 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg)
             /* Use pinned memory because the kernel is very likely
              * to transfer these data between the CPU and the GPU.
              * */
-            cudaMallocHost(&workspace->workspaces[id], workspace->size);
+            gpuMallocHost( &workspace->workspaces[id], workspace->size );
         }
         else {
             /* Allocate on the device */
-            cudaMalloc(&workspace->workspaces[id], workspace->size);
+            gpuMalloc( &workspace->workspaces[id], workspace->size );
         }
     }
     else
+#endif
 #endif
     {
         /* This buffer should only be used within the CPU kernel, so
@@ -54,7 +71,7 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg)
         workspace->workspaces[id] = malloc(workspace->size);
     }
 
-    assert(workspace->workspaces[id]);
+    assert( workspace->workspaces[id] );
 }
 
 
@@ -65,24 +82,26 @@ static void RUNTIME_free_workspace_on_workers(void *arg)
     (void)type;
     int id = starpu_worker_get_id();
 
-#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
-    type = starpu_worker_get_type(id);
-    if (type == STARPU_CUDA_WORKER)
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
+    type = starpu_worker_get_type( id );
+    if ( type == GPU_WORKER_TYPE )
     {
         int memory_location = workspace->memory_location;
 
         if (memory_location == CHAMELEON_HOST_MEM)
         {
-            cudaFreeHost(workspace->workspaces[id]);
+            gpuFreeHost( workspace->workspaces[id] );
         }
         else {
-            cudaFree(workspace->workspaces[id]);
+            gpuFree( workspace->workspaces[id] );
         }
     }
     else
+#endif
 #endif
     {
-        free(workspace->workspaces[id]);
+        free( workspace->workspaces[id] );
     }
 
     workspace->workspaces[id] = NULL;
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index e2d964ccc98ae105c4bd79ceac832f77c8ea89e2..e92e23387f1f20912e3d28f910047c2d9629dfd4 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -17,7 +17,9 @@
  * @author Florent Pruvost
  * @author Philippe Swartvagher
  * @author Samuel Thibault
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @author Terry Cojean
+ * @date 2023-01-30
  *
  */
 #ifndef _chameleon_starpu_h_
@@ -71,6 +73,14 @@
 #include <starpu_cublas_v2.h>
 #endif
 
+#if defined(CHAMELEON_USE_HIP) && !defined(CHAMELEON_SIMULATION)
+#include <starpu_scheduler.h>
+#include <starpu_hip.h>
+
+#include <hipblas/hipblas.h>
+#include <starpu_hipblas.h>
+#endif
+
 #if defined(CHAMELEON_SIMULATION)
 # if !defined(STARPU_SIMGRID)
 #  error "Starpu was not built with simgrid support (--enable-simgrid). Can not run Chameleon with simulation support."
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 82afe7b42cea6f37c0bd2ede6aeaf0041048fa34..673d8c5d7a6ea287ed2fa4e948c6d8820f35a00a 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -17,7 +17,8 @@
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Alycia Lisito
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @date 2023-01-30
  * @precisions normal z -> c d s
  *
  */
@@ -31,11 +32,17 @@
 #if !defined(CHAMELEON_SIMULATION)
 #include "coreblas/coreblas_z.h"
 #include "coreblas/coreblas_ztile.h"
+
 #if defined(CHAMELEON_USE_CUDA)
 #include "cudablas.h"
 #endif
+
+#if defined(CHAMELEON_USE_HIP)
+#include "hipblas.h"
 #endif
 
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
 /*
  * BLAS 1 functions
  */
diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h
index d5c923355299277f32747c76f5534bf3edbd5ce0..00808fc52fec20078744d3603bec4cb2be680a24 100644
--- a/runtime/starpu/include/runtime_codelets.h
+++ b/runtime/starpu/include/runtime_codelets.h
@@ -16,7 +16,8 @@
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
- * @date 2022-02-22
+ * @author Loris Lucido
+ * @date 2023-01-30
  *
  */
 #ifndef _runtime_codelets_h_
@@ -25,14 +26,31 @@
 #include "chameleon/config.h"
 #include "runtime_codelet_profile.h"
 
-//#undef STARPU_CUDA_ASYNC
-#ifdef STARPU_CUDA_ASYNC
+#if defined(STARPU_CUDA_ASYNC)
 #define CODELET_CUDA_FLAGS(flags) .cuda_flags = {(flags)},
 #else
 #define CODELET_CUDA_FLAGS(flags)
 #endif
 
-#define CODELETS_ALL(cl_name, cpu_func_name, cuda_func_name, _original_location_, cuda_flags) \
+#if defined(STARPU_HIP_ASYNC)
+#define CODELET_HIP_FLAGS(flags) .hip_flags = {(flags)},
+#else
+#define CODELET_HIP_FLAGS(flags)
+#endif
+
+#if defined(CHAMELEON_USE_CUDA)
+#define CODELET_GPU_FIELDS( gpu_func_name, gpu_flags )                 \
+        CODELET_CUDA_FLAGS( gpu_flags )                                \
+        .cuda_func = ((gpu_func_name)),
+#elif defined(CHAMELEON_USE_HIP)
+#define CODELET_GPU_FIELDS( gpu_func_name, gpu_flags )                 \
+        CODELET_HIP_FLAGS( gpu_flags )                                 \
+        .hip_funcs = {(gpu_func_name)},
+#else
+#define CODELET_GPU_FIELDS( gpu_func_name, gpu_flags )
+#endif
+
+#define CODELETS_ALL(cl_name, cpu_func_name, gpu_func_name, _original_location_, gpu_flags) \
     struct starpu_perfmodel cl_##cl_name##_fake = {                     \
         .type   = STARPU_HISTORY_BASED,                                 \
         .symbol = "fake_"#cl_name                                       \
@@ -46,8 +64,7 @@
     struct starpu_codelet cl_##cl_name = {                              \
         .where     = (_original_location_),                             \
         .cpu_func  = ((cpu_func_name)),                                 \
-        CODELET_CUDA_FLAGS(cuda_flags)                                  \
-        .cuda_func = ((cuda_func_name)),                                \
+        CODELET_GPU_FIELDS( gpu_func_name, gpu_flags )                  \
         .nbuffers  = STARPU_VARIABLE_NBUFFERS,                          \
         .model     = &cl_##cl_name##_model,                             \
         .name      = #cl_name                                           \
@@ -77,8 +94,13 @@
     CODELETS_ALL( name, cpu_func_name, NULL, STARPU_CPU, 0 )
 #endif
 
-#define CODELETS_GPU(name, cpu_func_name, cuda_func_name, cuda_flags) \
-    CODELETS_ALL( name, cpu_func_name, cuda_func_name, STARPU_CPU  | STARPU_CUDA, cuda_flags )
+#if defined(CHAMELEON_USE_HIP)
+#define CODELETS_GPU(name, cpu_func_name, gpu_func_name, gpu_flags) \
+    CODELETS_ALL( name, cpu_func_name, gpu_func_name, STARPU_CPU | STARPU_HIP, gpu_flags )
+#else
+#define CODELETS_GPU(name, cpu_func_name, gpu_func_name, gpu_flags) \
+    CODELETS_ALL( name, cpu_func_name, gpu_func_name, STARPU_CPU | STARPU_CUDA, gpu_flags )
+#endif
 
 #define CODELETS_ALL_HEADER(name)                            \
      CHAMELEON_CL_CB_HEADER(name);                           \
@@ -89,29 +111,35 @@
      void cl_##name##_restore_where(void)
 
 #if defined(CHAMELEON_SIMULATION)
-#if defined(CHAMELEON_USE_CUDA)
-#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \
-    CODELETS_GPU(name, (starpu_cpu_func_t) 1, (starpu_cuda_func_t) 1, cuda_flags)
+
+#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP)
+#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \
+    CODELETS_GPU(name, (starpu_cpu_func_t) 1, (starpu_cuda_func_t) 1, gpu_flags)
 
 #define CODELETS_HEADER(name)  CODELETS_ALL_HEADER(name)
 #else
-#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \
+#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \
     CODELETS_CPU(name, (starpu_cpu_func_t) 1)
 
 #define CODELETS_HEADER(name)  CODELETS_ALL_HEADER(name)
 #endif
-#elif defined(CHAMELEON_USE_CUDA)
-#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \
-    CODELETS_GPU(name, cpu_func_name, cuda_func_name, cuda_flags)
+
+#else /* defined(CHAMELEON_SIMULATION) */
+
+#if defined(CHAMELEON_USE_CUDA) //|| defined(CHAMELEON_USE_HIP)
+#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \
+    CODELETS_GPU(name, cpu_func_name, gpu_func_name, gpu_flags)
 
 #define CODELETS_HEADER(name)  CODELETS_ALL_HEADER(name)
 #else
-#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \
+#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \
     CODELETS_CPU(name, cpu_func_name)
 
 #define CODELETS_HEADER(name)  CODELETS_ALL_HEADER(name)
 #endif
 
+#endif
+
 CODELETS_HEADER(map);
 
 #endif /* _runtime_codelets_h_ */