diff --git a/CMakeLists.txt b/CMakeLists.txt
index 13b5ea5fa3dc09b606549c1cc0d9289b2a7ccab6..27bb9597f36a7e5ecbf7a686c9241788cca6378f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,7 +17,7 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Cedric Castagnede
 #  @author Emmanuel Agullo
 #  @author Mathieu Faverge
@@ -30,7 +30,7 @@
 #  @author Alycia Lisito
 #  @author Loris Lucido
 #  @author Nathan Précigout
-#  @date 2023-01-30
+#  @date 2023-07-04
 #
 ###
 cmake_minimum_required(VERSION 3.3)
@@ -45,7 +45,7 @@ endif()
 
 # set project version number
 set(CHAMELEON_VERSION_MAJOR 1)
-set(CHAMELEON_VERSION_MINOR 2)
+set(CHAMELEON_VERSION_MINOR 3)
 set(CHAMELEON_VERSION_MICRO 0)
 set(CHAMELEON_VERSION "${CHAMELEON_VERSION_MAJOR}.${CHAMELEON_VERSION_MINOR}.${CHAMELEON_VERSION_MICRO}")
 
diff --git a/ChangeLog b/ChangeLog
index 3ed3e67b1a16d6fae49130dc78ec8bf325d1a8d5..a3b5120ddae5a526f4f5db287c7a85a5f5e2d9e9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+chameleon-1.3.0
+------------------------------------------------------------------------
+ - types: add support for half precision arithmetic into the data descriptors
+ - cuda: add half precision conversion kernels, and variants of the gemm kernels (hgemm, and gemmex)
+
 chameleon-1.2.0
 ------------------------------------------------------------------------
  - NEW: Add support for AMD GPUs throug hipcublas or hip-rocm kernels
diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py
index 02af5df38a90e034eebb9438238440267a1a3355..df41dd109b68835ae2f07759d0bb81b758bfe075 100644
--- a/cmake_modules/local_subs.py
+++ b/cmake_modules/local_subs.py
@@ -6,12 +6,12 @@
  @copyright 2019-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
                       Univ. Bordeaux. All rights reserved.
 
- @version 1.2.0
+ @version 1.3.0
  @author Mathieu Faverge
  @author Florent Pruvost
  @author Nathalie Furmento
  @author Alycia Lisito
- @date 2022-02-22
+ @date 2023-07-04
 
 """
 _extra_blas = [
@@ -39,6 +39,10 @@ _extra_blas = [
     ('',                     'slatm1',               'dlatm1',               'slatm1',               'dlatm1'              ),
     ('',                     'sgenm2',               'dgenm2',               'cgenm2',               'zgenm2'              ),
     ('',                     'slag2c_fake',          'dlag2z_fake',          'slag2c',               'dlag2z'              ),
+    ('',                     'slag2h',               'dlag2h',               'slag2h',               'dlag2h'              ),
+    ('',                     'hlag2s',               'hlag2d',               'hlag2s',               'hlag2d'              ),
+    ('',                     'slag2h',               'dlag2h',               'clag2x',               'zlag2x'              ),
+    ('',                     'hlag2s',               'hlag2d',               'xlag2c',               'xlag2z'              ),
     ('',                     'sgepdf',               'dgepdf',               'cgepdf',               'zgepdf'              ),
     ('',                     'scesca',               'dcesca',               'ccesca',               'zcesca'              ),
     ('',                     'sgesum',               'dgesum',               'cgesum',               'zgesum'              ),
@@ -73,6 +77,9 @@ subs = {
         ('int',                  'float',                'double',               'CHAMELEON_Complex32_t', r'\bCHAMELEON_Complex64_t'),
         ('ChamPattern',          'ChamRealFloat',        'ChamRealDouble',       'ChamComplexFloat',    r'\bChamComplexDouble' ),
         ('ChamPattern',          'ChamRealFloat',        'ChamRealDouble',       'ChamRealFloat',       r'\bChamRealDouble'    ),
+        ('ChamPattern',          'ChamRealFloat',        'ChamRealFloat',        'ChamComplexFloat',    r'\bChamComplexFloat'  ),
+        ('ChamPattern',          'ChamRealFloat',        'ChamRealFloat',        'ChamRealFloat',       r'\bChamRealFloat'     ),
+        ('ChamPattern',          'ChamRealHalf',         'ChamRealHalf',         'ChamComplexHalf',     r'\bChamComplexHalf'   ),
         ('int',                  'float',                'double',               'complex32',             'complex64'          ),
         ('Int',                  'Float',                'Double',               'Complex32',             'Complex64'          ),
         ('Int',                  'HMAT_SIMPLE_PRECISION','HMAT_DOUBLE_PRECISION','HMAT_SIMPLE_COMPLEX',   'HMAT_DOUBLE_COMPLEX'),
diff --git a/cmake_modules/morse_cmake b/cmake_modules/morse_cmake
index b9e9ec80030a349e1c768f8aec2879d1a6ed28a9..332bf7de07c7eaf85cf370236d5a209b83a58dd7 160000
--- a/cmake_modules/morse_cmake
+++ b/cmake_modules/morse_cmake
@@ -1 +1 @@
-Subproject commit b9e9ec80030a349e1c768f8aec2879d1a6ed28a9
+Subproject commit 332bf7de07c7eaf85cf370236d5a209b83a58dd7
diff --git a/control/auxiliary.c b/control/auxiliary.c
index 1d1b9147fe4e006c6557b07d225fe14130a7c754..d299d8fa8b47f9c5e2eb04f6a66c62c8dbafdc78 100644
--- a/control/auxiliary.c
+++ b/control/auxiliary.c
@@ -11,7 +11,7 @@
  *
  * @brief Chameleon auxiliary routines
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Jakub Kurzak
  * @author Piotr Luszczek
  * @author Emmanuel Agullo
@@ -19,7 +19,7 @@
  * @author Florent Pruvost
  * @author Guillaume Sylvand
  * @author Mathieu Faverge
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  ***
  *
@@ -187,13 +187,17 @@ int CHAMELEON_Version(int *ver_major, int *ver_minor, int *ver_micro)
  * @retval Element size in bytes
  *
  */
-int CHAMELEON_Element_Size(int type)
+int CHAMELEON_Element_Size( cham_flttype_t type )
 {
-    switch(type) {
+    switch( cham_get_flttype(type) ) {
         case ChamByte:          return          1;
-        case ChamInteger:       return   sizeof(int);
+        case ChamInteger16:     return   sizeof(int16_t);
+        case ChamInteger32:     return   sizeof(int32_t);
+        case ChamInteger64:     return   sizeof(int64_t);
+        case ChamRealHalf:      return   2;
         case ChamRealFloat:     return   sizeof(float);
         case ChamRealDouble:    return   sizeof(double);
+        case ChamComplexHalf:   return   4;
         case ChamComplexFloat:  return 2*sizeof(float);
         case ChamComplexDouble: return 2*sizeof(double);
         default: chameleon_fatal_error("CHAMELEON_Element_Size", "undefined type");
diff --git a/control/control.c b/control/control.c
index 62762469f26f788eb215811a663355b0489ef616..3c9ba6a9259f83f1857449e6f6c429a2ecaa396e 100644
--- a/control/control.c
+++ b/control/control.c
@@ -11,7 +11,7 @@
  *
  * @brief Chameleon control routines
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Jakub Kurzak
  * @author Mathieu Faverge
  * @author Cedric Castagnede
@@ -20,7 +20,7 @@
  * @author Samuel Thibault
  * @author Philippe Swartvagher
  * @author Loris Lucido
- * @date 2023-01-30
+ * @date 2023-07-04
  *
  ***
  *
@@ -123,6 +123,7 @@ int __chameleon_initpar(int ncpus, int ngpus, int nthreads_per_worker)
     }
 #endif
 
+    chamctxt->ncudas = ngpus;
     return RUNTIME_init( chamctxt, ncpus, ngpus, nthreads_per_worker );
 }
 
diff --git a/control/descriptor.c b/control/descriptor.c
index 1005a04cff833e3b38bb8134cfa5b47ebef40fd5..c5e4ca2d23ebd0507c752c8b0852ad6edbe1ee82 100644
--- a/control/descriptor.c
+++ b/control/descriptor.c
@@ -11,14 +11,14 @@
  *
  * @brief Chameleon descriptors routines
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Guillaume Sylvand
  * @author Raphael Boucherie
  * @author Samuel Thibault
- * @date 2022-12-13
+ * @date 2023-07-04
  *
  ***
  *
@@ -91,6 +91,7 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc )
 void chameleon_desc_init_tiles( CHAM_desc_t *desc, blkrankof_fct_t rankof )
 {
     CHAM_tile_t *tile;
+    int8_t flttype = cham_get_flttype( desc->dtyp );
     int ii, jj;
 
     assert( rankof != chameleon_getrankof_tile );
@@ -101,7 +102,7 @@ void chameleon_desc_init_tiles( CHAM_desc_t *desc, blkrankof_fct_t rankof )
         for( ii=0; ii<desc->lmt; ii++, tile++ ) {
             int rank = rankof( desc, ii, jj );
             tile->format  = CHAMELEON_TILE_FULLRANK;
-            tile->flttype = (int8_t)(desc->dtyp);
+            tile->flttype = flttype;
             tile->rank    = rank;
             tile->m       = ii == desc->lmt-1 ? desc->lm - ii * desc->mb : desc->mb;
             tile->n       = jj == desc->lnt-1 ? desc->ln - jj * desc->nb : desc->nb;
@@ -368,6 +369,8 @@ void chameleon_desc_destroy( CHAM_desc_t *desc )
  */
 int chameleon_desc_check(const CHAM_desc_t *desc)
 {
+    cham_flttype_t flttype;
+
     if (desc == NULL) {
         chameleon_error("chameleon_desc_check", "NULL descriptor");
         return CHAMELEON_ERR_NOT_INITIALIZED;
@@ -376,11 +379,16 @@ int chameleon_desc_check(const CHAM_desc_t *desc)
         chameleon_error("chameleon_desc_check", "NULL matrix pointer");
         return CHAMELEON_ERR_UNALLOCATED;
     }
-    if (desc->dtyp != ChamInteger &&
-        desc->dtyp != ChamRealFloat &&
-        desc->dtyp != ChamRealDouble &&
-        desc->dtyp != ChamComplexFloat &&
-        desc->dtyp != ChamComplexDouble  ) {
+
+    flttype = cham_get_flttype( desc->dtyp );
+    if ( (flttype != ChamInteger       ) &&
+         (flttype != ChamRealHalf      ) &&
+         (flttype != ChamRealFloat     ) &&
+         (flttype != ChamRealDouble    ) &&
+         (flttype != ChamComplexHalf   ) &&
+         (flttype != ChamComplexFloat  ) &&
+         (flttype != ChamComplexDouble ) )
+    {
         chameleon_error("chameleon_desc_check", "invalid matrix type");
         return CHAMELEON_ERR_ILLEGAL_VALUE;
     }
@@ -435,8 +443,11 @@ CHAMELEON_Desc_SubMatrix( CHAM_desc_t *descA, int i, int j, int m, int n )
  *
  * @param[in] dtyp
  *          Data type of the matrix:
+ *          @arg ChamInteger:       integer (i),
+ *          @arg ChamRealHalf:      half precision real (H),
  *          @arg ChamRealFloat:     single precision real (S),
  *          @arg ChamRealDouble:    double precision real (D),
+ *          @arg ChamComplexHalf:   half precision complex (),
  *          @arg ChamComplexFloat:  single precision complex (C),
  *          @arg ChamComplexDouble: double precision complex (Z).
  *
diff --git a/control/descriptor.h b/control/descriptor.h
index cff3b76b9af50744245e6cde721b65f78788f125..96bf0bf0bc7f264f44beea28660de358c35a7a90 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -11,7 +11,7 @@
  *
  * @brief Chameleon descriptor header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Jakub Kurzak
  * @author Mathieu Faverge
  * @author Cedric Castagnede
@@ -19,7 +19,7 @@
  * @author Guillaume Sylvand
  * @author Raphael Boucherie
  * @author Samuel Thibault
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  */
 #ifndef _chameleon_descriptor_h_
@@ -76,7 +76,7 @@ int          chameleon_desc_check    ( const CHAM_desc_t *desc );
 /**
  *  Internal function to return address of block (m,n) with m,n = block indices
  */
-inline static CHAM_tile_t *chameleon_desc_gettile(const CHAM_desc_t *A, int m, int n)
+inline static CHAM_tile_t *chameleon_desc_gettile(const CHAM_desc_t *A, int m, int n )
 {
     size_t mm = m + A->i / A->mb;
     size_t nn = n + A->j / A->nb;
diff --git a/gpucublas/compute/CMakeLists.txt b/gpucublas/compute/CMakeLists.txt
index d7a745c0489c3364398267f64513737a9348d9a3..5f2987d984f4953d479665b25b494c1d2cf6a213 100644
--- a/gpucublas/compute/CMakeLists.txt
+++ b/gpucublas/compute/CMakeLists.txt
@@ -17,14 +17,17 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Florent Pruvost
 #  @author Guillaume Sylvand
 #  @author Mathieu Faverge
-#  @date 2022-02-22
+#  @date 2023-07-04
 #
 ###
 
+# To define CMAKE_CUDA_COMPILER
+cmake_minimum_required(VERSION 3.18)
+
 # Generate the chameleon sources for all possible precisions
 # ------------------------------------------------------
 set(GPUCUBLAS_SRCS_GENERATED "")
@@ -53,6 +56,28 @@ set(ZSRC
     cuda_zunmqrt.c
     )
 
+# Add CUDA kernel if compiler and toolkit are available
+# -----------------------------------------------------
+include(CheckLanguage)
+check_language(CUDA)
+
+if(CMAKE_CUDA_COMPILER)
+  enable_language(CUDA)
+  find_package(CUDAToolkit)
+else()
+  message(STATUS "CUDA language is not supported")
+endif()
+
+if (CUDAToolkit_FOUND)
+  include(SetCMakeCudaArchitectures)
+
+  set(ZSRC
+    ${ZSRC}
+    cuda_zlag2c.cu
+    cuda_dlag2h.cu
+  )
+endif()
+
 # Former MAGMA files that are no longer supported
 # if( CHAMELEON_USE_MAGMA )
 #   set(ZSRC
@@ -77,9 +102,14 @@ precisions_rules_py(
 
 set(GPUCUBLAS_SRCS
   ${GPUCUBLAS_SRCS_GENERATED}
+  cuda_hgemm.c
+  cuda_gemmex.c
   cudaglobal.c
   )
 
+# Need to use CXX compiler to have the __half support and access to cublasHgemm()
+set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX )
+
 # Force generation of sources
 # ---------------------------
 add_custom_target(gpucublas_sources ALL SOURCES ${GPUCUBLAS_SRCS})
@@ -102,6 +132,12 @@ set_property(TARGET gpucublas PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}
 target_link_libraries(gpucublas PRIVATE coreblas CUDA::CUBLAS)
 target_link_libraries(gpucublas PUBLIC MORSE::M)
 
+set_target_properties(gpucublas PROPERTIES
+  CUDA_SEPARABLE_COMPILATION OFF)
+
+#target_include_directories( gpucublas PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+#target_link_libraries( gpucublas PRIVATE CUDA::cublas CUDA::cudart )
+
 # export target coreblas
 install(EXPORT gpucublasTargets
         NAMESPACE CHAMELEON::
diff --git a/gpucublas/compute/cuda_dlag2h.cu b/gpucublas/compute/cuda_dlag2h.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8136c248fd9ff2c02a4910b251f282acdbe4cc77
--- /dev/null
+++ b/gpucublas/compute/cuda_dlag2h.cu
@@ -0,0 +1,290 @@
+/**
+ *
+ * @file cuda_dlag2h.cu
+ *
+ * @copyright 2023-2023 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_dlag2h GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mark Gates
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ * @precisions normal d -> d s
+ *
+ * This file is an adaptation of the MAGMA zlag2c.cu, clag2z.cu, hlag2s.cu, and slag2h.cu
+ *
+ */
+#include "gpucublas.h"
+#include <coreblas/lapacke.h>
+
+#if CUDA_VERSION < 7500
+#error "This file should not be included as the half precision floats are not supported."
+#endif
+
+#define BLK_X 64
+#define BLK_Y 32
+
+__device__  int cuda_dlag2h_flag = 0;
+
+/*
+ * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
+ * Each block has BLK_X threads.
+ * Each thread loops across one row, updating BLK_Y entries.
+ */
+__global__
+void cuda_dlag2h_kernel(
+    int m, int n,
+    const double *A, int lda,
+    CHAMELEON_Real16_t *HA, int ldha,
+    double rmax )
+{
+    int ind = blockIdx.x * BLK_X + threadIdx.x;
+    int iby = blockIdx.y * BLK_Y;
+
+    /* check if full block-column */
+    bool full = (iby + BLK_Y <= n);
+    double tmp;
+
+    /* do only rows inside matrix */
+    if ( ind > m ) {
+        return;
+    }
+
+    A  += ind + iby*lda;
+    HA += ind + iby*ldha;
+    if ( full ) {
+        /* full block-column */
+#pragma unroll
+        for( int j=0; j < BLK_Y; ++j ) {
+            tmp = A[j*lda];
+
+            if ( fabs(tmp) > rmax ) {
+                cuda_dlag2h_flag = 1;
+            }
+
+            HA[j*ldha] = __double2half(tmp);
+        }
+    }
+    else {
+        /* partial block-column */
+        for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
+            tmp = A[j*lda];
+            if ( fabs(tmp) > rmax ) {
+                cuda_dlag2h_flag = 1;
+            }
+
+            HA[j*ldha] = __double2half(tmp);
+        }
+    }
+}
+
+
+/**
+ *
+ * @ingroup CUDA_CHAMELEON_Complex64_t
+ *
+ * CUDA_dlag2h converts a double-real matrix, A, to a half-real matrix,
+ * HA.
+ *
+ * RMAX is the overflow for the single-real arithmetic.  DLAG2H checks that
+ * all the entries of A are between -RMAX and RMAX. If not, the conversion is
+ * aborted and a magma_dlag2h_flag is raised.
+ *
+ *  @param[in] m
+ *          The number of lines of the matrix A.  m >= 0.
+ *
+ *  @param[in] n
+ *          The number of columns of the matrix A.  n >= 0.
+ *
+ *  @param[in] A
+ *          On entry, the lda-by-n coefficient matrix A.
+ *
+ *  @param[in] lda
+ *          The leading dimension of the array A.  LDA >= max(1,m).
+ *
+ *  @param[out] HA
+ *          On exit, if INFO=0, the ldha-by-n coefficient matrix HA;
+ *          if INFO > 0, the content of HA is unspecified.
+ *
+ *  @param[in] ldha
+ *          The leading dimension of the array HA.  LDHA >= max(1,m).
+ *
+ *  @param[out]
+ *    -     = 0:  successful exit.
+ *    -     < 0:  if INFO = -i, the i-th argument had an illegal value
+ *    -     = 1:  an entry of the matrix A is greater than the SINGLE PRECISION
+ *                overflow threshold, in this case, the content
+ *                of HA on exit is unspecified.
+ *
+ *  @param[in] handle
+ *          Cublas handle to execute in.
+ *
+ **/
+extern "C" int
+CUDA_dlag2h( int m, int n,
+             const double *A, int lda,
+             CHAMELEON_Real16_t *HA, int ldha,
+             cublasHandle_t handle )
+{
+    cudaStream_t stream;
+    double       rmax;
+
+    if ( m < 0 ) {
+        return -1;
+    }
+    else if ( n < 0 ) {
+        return -2;
+    }
+    else if ( lda < chameleon_max(1,m) ) {
+        return -4;
+    }
+    else if ( ldha < chameleon_max(1,m) ) {
+        return -6;
+    }
+
+    /* quick return */
+    if ( m == 0 || n == 0 ) {
+        return 0;
+    }
+
+    dim3 threads( BLK_X, 1 );
+    dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
+
+    /*
+     * There is no lapackf77_hlamch, please visit:
+     * https://blogs.mathworks.com/cleve/2017/05/08/half-precision-16-bit-floating-point-arithmetic/
+     */
+    rmax = 65504.;
+
+    cublasGetStream( handle, &stream );
+
+    cuda_dlag2h_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, HA, ldha, rmax );
+
+    return 0;
+}
+
+/*
+ * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
+ * Each block has BLK_X threads.
+ * Each thread loops across one row, updating BLK_Y entries.
+ */
+__global__
+void cuda_hlag2d_kernel(
+    int m, int n,
+    const CHAMELEON_Real16_t *HA, int ldha,
+    double *A, int lda )
+{
+    int ind = blockIdx.x * BLK_X + threadIdx.x;
+    int iby = blockIdx.y * BLK_Y;
+
+    /* check if full block-column */
+    bool full = (iby + BLK_Y <= n);
+
+    /* do only rows inside matrix */
+    if ( ind > m ) {
+        return;
+    }
+
+    A  += ind + iby*lda;
+    HA += ind + iby*ldha;
+    if ( full ) {
+        // full block-column
+#pragma unroll
+        for( int j=0; j < BLK_Y; ++j ) {
+#if defined(PRECISION_zc)
+            A[j*ldha] = make_double( HA[j*ldha].x, HA[j*ldha].y );
+#else
+            A[j*ldha] = HA[j*ldha];
+#endif
+        }
+    }
+    else {
+        // partial block-column
+        for( int j=0; (j < BLK_Y) && (iby+j) < n; ++j ) {
+#if defined(PRECISION_zc)
+            A[j*ldha] = make_double( HA[j*ldha].x, HA[j*ldha].y );
+#else
+            A[j*ldha] = HA[j*ldha];
+#endif
+        }
+    }
+}
+
+
+/**
+ *
+ * @ingroup CUDA_CHAMELEON_Complex64_t
+ *
+ * CUDA_hlag2d converts a half-real matrix, HA, to a double-real matrix,
+ * A.
+ *
+ *  Note that while it is possible to overflow while converting from double to
+ *  single, it is not possible to overflow when converting from single to
+ *  double.
+ *
+ *  @param[in] m
+ *          The number of lines of the matrix A and HA.  m >= 0.
+ *
+ *  @param[in] n
+ *          The number of columns of the matrix A and HA.  n >= 0.
+ *
+ *  @param[in] HA
+ *          On entry, the lda-by-n coefficient matrix HA.
+ *
+ *  @param[in] ldha
+ *          The leading dimension of the array HA.  ldha >= max(1,m).
+ *
+ *  @param[out] A
+ *          On exit, the lda-by-n coefficient matrix A.
+ *
+ *  @param[in] lda
+ *          The leading dimension of the array A. lda >= max(1,m).
+ *
+ *  @param[out]
+ *    -     = 0:  successful exit.
+ *    -     < 0:  if INFO = -i, the i-th argument had an illegal value
+ *
+ *  @param[in] handle
+ *          Cublas handle to execute in.
+ *
+ **/
+extern "C" int
+CUDA_hlag2d( int m, int n,
+             const CHAMELEON_Real16_t *HA, int ldha,
+             double      *A,  int lda,
+             cublasHandle_t handle )
+{
+    cudaStream_t stream;
+
+    if ( m < 0 ) {
+        return -1;
+    }
+    else if ( n < 0 ) {
+        return -2;
+    }
+    else if ( ldha < chameleon_max(1,m) ) {
+        return -4;
+    }
+    else if ( lda < chameleon_max(1,m) ) {
+        return -6;
+    }
+
+    /* quick return */
+    if ( (m == 0) || (n == 0) ) {
+        return 0;
+    }
+
+    dim3 threads( BLK_X, 1 );
+    dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
+
+    cublasGetStream( handle, &stream );
+    cuda_hlag2d_kernel<<< grid, threads, 0, stream >>> ( m, n, HA, ldha, A, lda );
+
+    return 0;
+}
diff --git a/gpucublas/compute/cuda_gemmex.c b/gpucublas/compute/cuda_gemmex.c
new file mode 100644
index 0000000000000000000000000000000000000000..c384018e900fc88a8939bac3f042c972f0dc367c
--- /dev/null
+++ b/gpucublas/compute/cuda_gemmex.c
@@ -0,0 +1,43 @@
+/**
+ *
+ * @file cuda_gemmex.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_gemmex GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpucublas.h"
+
+int
+CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
+             int m, int n, int k,
+             const void *alpha,
+             const void *A, int lda, cham_flttype_t Atype,
+             const void *B, int ldb, cham_flttype_t Btype,
+             const void *beta,
+             void *C, int ldc, cham_flttype_t Ctype,
+             cublasHandle_t handle )
+{
+    cublasStatus_t rc;
+
+    rc = cublasGemmEx( handle,
+                       chameleon_cublas_const(transa), chameleon_cublas_const(transb),
+                       m, n, k,
+                       CUBLAS_VALUE(alpha), A, lda, chameleon_cublas_dtype( Atype ),
+                                            B, ldb, chameleon_cublas_dtype( Btype ),
+                       CUBLAS_VALUE(beta),  C, ldc, chameleon_cublas_dtype( Ctype ),
+                       chameleon_cublas_ctype( Ctype ),
+                       CUBLAS_GEMM_DEFAULT );
+
+    assert( rc == CUBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
diff --git a/gpucublas/compute/cuda_hgemm.c b/gpucublas/compute/cuda_hgemm.c
new file mode 100644
index 0000000000000000000000000000000000000000..02f465a6a2b8efbc5cfb5551c3266d30736591e7
--- /dev/null
+++ b/gpucublas/compute/cuda_hgemm.c
@@ -0,0 +1,42 @@
+/**
+ *
+ * @file cuda_hgemm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_hgemm GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpucublas.h"
+
+extern "C" int
+CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
+            int m, int n, int k,
+            const CHAMELEON_Real16_t *alpha,
+            const CHAMELEON_Real16_t *A, int lda,
+            const CHAMELEON_Real16_t *B, int ldb,
+            const CHAMELEON_Real16_t *beta,
+            CHAMELEON_Real16_t *C, int ldc,
+            cublasHandle_t handle )
+{
+    cublasStatus_t rc;
+
+    rc = cublasHgemm( handle,
+                      (cublasOperation_t)chameleon_cublas_const(transa),
+                      (cublasOperation_t)chameleon_cublas_const(transb),
+                      m, n, k,
+                      CUBLAS_VALUE(alpha), A, lda,
+                                           B, ldb,
+                      CUBLAS_VALUE(beta),  C, ldc);
+
+    assert( rc == CUBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
diff --git a/gpucublas/compute/cuda_zlag2c.cu b/gpucublas/compute/cuda_zlag2c.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cf3f35062fbe336c1c8f72119c659ba99e4693da
--- /dev/null
+++ b/gpucublas/compute/cuda_zlag2c.cu
@@ -0,0 +1,304 @@
+/**
+ *
+ * @file cuda_zlag2c.cu
+ *
+ * @copyright 2023-2023 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_zlag2c GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mark Gates
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ * @precisions mixed zc -> ds
+ *
+ * This file is an adaptation of the MAGMA zlag2c.cu and clag2z files.
+ *
+ */
+#include "gpucublas.h"
+#include <coreblas/lapacke.h>
+
+#define BLK_X 64
+#define BLK_Y 32
+
+__device__  int cuda_zlag2c_flag = 0;
+
+/*
+ * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
+ * Each block has BLK_X threads.
+ * Each thread loops across one row, updating BLK_Y entries.
+ */
+__global__
+void cuda_zlag2c_kernel(
+    int m, int n,
+    const cuDoubleComplex *A,  int lda,
+    cuFloatComplex        *SA, int ldsa,
+    double rmax )
+{
+    cuDoubleComplex tmp;
+
+    int ind = blockIdx.x * BLK_X + threadIdx.x;
+    int iby = blockIdx.y * BLK_Y;
+
+    /* check if full block-column */
+    bool full = (iby + BLK_Y <= n);
+
+    /* do only rows inside matrix */
+    if ( ind > m ) {
+        return;
+    }
+
+    A  += ind + iby*lda;
+    SA += ind + iby*ldsa;
+    if ( full ) {
+        /* full block-column */
+#pragma unroll
+        for( int j=0; j < BLK_Y; ++j ) {
+            tmp = A[j*lda];
+
+            if (
+#if defined(PRECISION_zc)
+                (fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax)
+#else
+                (fabs(tmp) > rmax)
+#endif
+                )
+            {
+                cuda_zlag2c_flag = 1;
+            }
+
+#if defined(PRECISION_zc)
+            SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y);
+#else
+            SA[j*ldsa] = tmp;
+#endif
+        }
+    }
+    else {
+        /* partial block-column */
+        for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
+            tmp = A[j*lda];
+            if (
+#if defined(PRECISION_zc)
+                (fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax)
+#else
+                (fabs(tmp) > rmax)
+#endif
+                )
+            {
+                cuda_zlag2c_flag = 1;
+            }
+
+#if defined(PRECISION_zc)
+            SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y);
+#else
+            SA[j*ldsa] = tmp;
+#endif
+        }
+    }
+}
+
+
+/**
+ *
+ * @ingroup CUDA_CHAMELEON_Complex64_t
+ *
+ * CUDA_zlag2c converts a double-complex matrix, A, to a single-complex matrix,
+ * SA.
+ *
+ * RMAX is the overflow for the single-complex arithmetic.  ZLAG2C checks that
+ * all the entries of A are between -RMAX and RMAX. If not, the conversion is
+ * aborted and a magma_zlag2c_flag is raised.
+ *
+ *  @param[in] m
+ *          The number of lines of the matrix A.  m >= 0.
+ *
+ *  @param[in] n
+ *          The number of columns of the matrix A.  n >= 0.
+ *
+ *  @param[in] A
+ *          On entry, the lda-by-n coefficient matrix A.
+ *
+ *  @param[in] lda
+ *          The leading dimension of the array A.  LDA >= max(1,m).
+ *
+ *  @param[out] SA
+ *          On exit, if INFO=0, the ldsa-by-n coefficient matrix SA;
+ *          if INFO > 0, the content of SA is unspecified.
+ *
+ *  @param[in] ldsa
+ *          The leading dimension of the array SA.  LDSA >= max(1,m).
+ *
+ *  @param[out]
+ *    -     = 0:  successful exit.
+ *    -     < 0:  if INFO = -i, the i-th argument had an illegal value
+ *    -     = 1:  an entry of the matrix A is greater than the COMPLEX
+ *                overflow threshold, in this case, the content
+ *                of SA on exit is unspecified.
+ *
+ *  @param[in] handle
+ *          Cublas handle to execute in.
+ *
+ **/
+extern "C" int
+CUDA_zlag2c( int m, int n,
+             const cuDoubleComplex *A, int lda,
+             cuFloatComplex *SA, int ldsa,
+             cublasHandle_t handle )
+{
+    cudaStream_t stream;
+    double       rmax;
+
+    if ( m < 0 ) {
+        return -1;
+    }
+    else if ( n < 0 ) {
+        return -2;
+    }
+    else if ( lda < chameleon_max(1,m) ) {
+        return -4;
+    }
+    else if ( ldsa < chameleon_max(1,m) ) {
+        return -6;
+    }
+
+    /* quick return */
+    if ( m == 0 || n == 0 ) {
+        return 0;
+    }
+
+    dim3 threads( BLK_X, 1 );
+    dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
+
+    rmax = LAPACKE_slamch_work( 'O' );
+    cublasGetStream( handle, &stream );
+
+    cuda_zlag2c_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, SA, ldsa, rmax );
+
+    return 0;
+}
+
+/*
+ * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
+ * Each block has BLK_X threads.
+ * Each thread loops across one row, updating BLK_Y entries.
+ */
+__global__
+void cuda_clag2z_kernel(
+    int m, int n,
+    const cuFloatComplex *SA, int ldsa,
+    cuDoubleComplex       *A, int lda )
+{
+    int ind = blockIdx.x * BLK_X + threadIdx.x;
+    int iby = blockIdx.y * BLK_Y;
+
+    /* check if full block-column */
+    bool full = (iby + BLK_Y <= n);
+
+    /* do only rows inside matrix */
+    if ( ind > m ) {
+        return;
+    }
+
+    A  += ind + iby*lda;
+    SA += ind + iby*ldsa;
+    if ( full ) {
+        // full block-column
+#pragma unroll
+        for( int j=0; j < BLK_Y; ++j ) {
+#if defined(PRECISION_zc)
+            A[j*ldsa] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y );
+#else
+            A[j*ldsa] = SA[j*ldsa];
+#endif
+        }
+    }
+    else {
+        // partial block-column
+        for( int j=0; (j < BLK_Y) && (iby+j) < n; ++j ) {
+#if defined(PRECISION_zc)
+            A[j*ldsa] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y );
+#else
+            A[j*ldsa] = SA[j*ldsa];
+#endif
+        }
+    }
+}
+
+
+/**
+ *
+ * @ingroup CUDA_CHAMELEON_Complex64_t
+ *
+ * CUDA_clag2z converts a single-complex matrix, SA, to a double-complex matrix,
+ * A.
+ *
+ *  Note that while it is possible to overflow while converting from double to
+ *  single, it is not possible to overflow when converting from single to
+ *  double.
+ *
+ *  @param[in] m
+ *          The number of lines of the matrix A and SA.  m >= 0.
+ *
+ *  @param[in] n
+ *          The number of columns of the matrix A and SA.  n >= 0.
+ *
+ *  @param[in] SA
+ *          On entry, the lda-by-n coefficient matrix SA.
+ *
+ *  @param[in] ldsa
+ *          The leading dimension of the array SA.  ldsa >= max(1,m).
+ *
+ *  @param[out] A
+ *          On exit, the lda-by-n coefficient matrix A.
+ *
+ *  @param[in] lda
+ *          The leading dimension of the array A. lda >= max(1,m).
+ *
+ *  @param[out]
+ *    -     = 0:  successful exit.
+ *    -     < 0:  if INFO = -i, the i-th argument had an illegal value
+ *
+ *  @param[in] handle
+ *          Cublas handle to execute in.
+ *
+ **/
+extern "C" int
+CUDA_clag2z( int m, int n,
+             const cuFloatComplex *SA, int ldsa,
+             cuDoubleComplex      *A,  int lda,
+             cublasHandle_t handle )
+{
+    cudaStream_t stream;
+
+    if ( m < 0 ) {
+        return -1;
+    }
+    else if ( n < 0 ) {
+        return -2;
+    }
+    else if ( ldsa < chameleon_max(1,m) ) {
+        return -4;
+    }
+    else if ( lda < chameleon_max(1,m) ) {
+        return -6;
+    }
+
+    /* quick return */
+    if ( (m == 0) || (n == 0) ) {
+        return 0;
+    }
+
+    dim3 threads( BLK_X, 1 );
+    dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
+
+    cublasGetStream( handle, &stream );
+    cuda_clag2z_kernel<<< grid, threads, 0, stream >>> ( m, n, SA, ldsa, A, lda );
+
+    return 0;
+}
diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h
index 8a9deb977ccae2c998f3b1390c5085febaee9104..8e7d4c3afa4d0165e2961c7791ae2dd2f12b3871 100644
--- a/gpucublas/include/gpucublas.h
+++ b/gpucublas/include/gpucublas.h
@@ -11,11 +11,11 @@
  *
  * @brief Chameleon GPU kernels main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
  * @author Nathalie Furmento
- * @date 2022-02-22
+ * @date 2023-07-04
  * @precisions normal z -> c d s
  *
  */
@@ -35,6 +35,9 @@
 
 #include <cuda.h>
 #include <cuComplex.h>
+#if CUDA_VERSION >= 7500
+#include <cuda_fp16.h>
+#endif
 
 #include <cublas_v2.h>
 
@@ -58,6 +61,54 @@ BEGIN_C_DECLS
 #include "gpucublas/gpucublas_c.h"
 #include "gpucublas/gpucublas_s.h"
 
+int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
+                int m, int n, int k,
+                const CHAMELEON_Real16_t *alpha,
+                const CHAMELEON_Real16_t *A, int lda,
+                const CHAMELEON_Real16_t *B, int ldb,
+                const CHAMELEON_Real16_t *beta,
+                CHAMELEON_Real16_t *C, int ldc,
+                cublasHandle_t handle );
+
+int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
+                 int m, int n, int k,
+                 const void *alpha,
+                 const void *A, int lda, cham_flttype_t Atype,
+                 const void *B, int ldb, cham_flttype_t Btype,
+                 const void *beta,
+                 void *C, int ldc, cham_flttype_t Ctype,
+                 cublasHandle_t handle );
+
+static inline cublasComputeType_t
+chameleon_cublas_ctype( cham_flttype_t flttype ) {
+
+    switch ( flttype ) {
+    case ChamRealHalf      : return CUBLAS_COMPUTE_16F;
+    case ChamRealFloat     : return CUBLAS_COMPUTE_32F;
+    case ChamRealDouble    : return CUBLAS_COMPUTE_64F;
+    case ChamComplexFloat  : return CUBLAS_COMPUTE_32F;
+    case ChamComplexDouble : return CUBLAS_COMPUTE_64F;
+    default:
+        fprintf( stderr, "chameleon_cublas_ctype(): Incorrect flttype\n" );
+        exit(1);
+    }
+}
+
+static inline cudaDataType_t
+chameleon_cublas_dtype( cham_flttype_t flttype ) {
+
+    switch ( flttype ) {
+    case ChamRealHalf      : return CUDA_R_16F;
+    case ChamRealFloat     : return CUDA_R_32F;
+    case ChamRealDouble    : return CUDA_R_64F;
+    case ChamComplexFloat  : return CUDA_C_32F;
+    case ChamComplexDouble : return CUDA_C_64F;
+    default:
+        fprintf( stderr, "chameleon_cublas_dtype(): Incorrect flttype\n" );
+        exit(1);
+    }
+}
+
 END_C_DECLS
 
 /**
diff --git a/gpuhipblas/compute/CMakeLists.txt b/gpuhipblas/compute/CMakeLists.txt
index bd7ac4540ccad44d08140014a6af1af43df248b5..cd3db9e9e0d0f951d03d331ed14bb8c0840cfcc5 100644
--- a/gpuhipblas/compute/CMakeLists.txt
+++ b/gpuhipblas/compute/CMakeLists.txt
@@ -17,12 +17,12 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Florent Pruvost
 #  @author Guillaume Sylvand
 #  @author Mathieu Faverge
 #  @author Loris Lucido
-#  @date 2023-01-30
+#  @date 2023-07-04
 #
 ###
 
@@ -47,6 +47,7 @@ precisions_rules_py(
 
 set(GPUHIPBLAS_SRCS
   ${GPUHIPBLAS_SRCS_GENERATED}
+  hip_hgemm.c
   hipglobal.c
   )
 
diff --git a/gpuhipblas/compute/hip_hgemm.c b/gpuhipblas/compute/hip_hgemm.c
new file mode 100644
index 0000000000000000000000000000000000000000..12d7feac2bf453a72614e7a5271898f51b461879
--- /dev/null
+++ b/gpuhipblas/compute/hip_hgemm.c
@@ -0,0 +1,41 @@
+/**
+ *
+ * @file hip_hgemm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon hip_hgemm GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpuhipblas.h"
+
+int
+HIP_hgemm( cham_trans_t transa, cham_trans_t transb,
+           int m, int n, int k,
+           const CHAMELEON_Real16_t *alpha,
+           const CHAMELEON_Real16_t *A, int lda,
+           const CHAMELEON_Real16_t *B, int ldb,
+           const CHAMELEON_Real16_t *beta,
+           CHAMELEON_Real16_t *C, int ldc,
+           hipblasHandle_t handle )
+{
+    hipblasStatus_t rc;
+
+    rc = hipblasHgemm( handle,
+                       chameleon_hipblas_const(transa), chameleon_hipblas_const(transb),
+                       m, n, k,
+                       HIPBLAS_VALUE(alpha), A, lda,
+                                             B, ldb,
+                       HIPBLAS_VALUE(beta),  C, ldc );
+
+    assert( rc == HIPBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
diff --git a/gpuhipblas/include/gpuhipblas.h b/gpuhipblas/include/gpuhipblas.h
index 572abcf9677fa3caf1e16f8c6a68a64f6e202cb9..705084958bc28c6c66f8d062f675d96cee90bb9d 100644
--- a/gpuhipblas/include/gpuhipblas.h
+++ b/gpuhipblas/include/gpuhipblas.h
@@ -11,12 +11,12 @@
  *
  * @brief Chameleon GPU kernels main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
  * @author Nathalie Furmento
  * @author Loris Lucido
- * @date 2023-01-30
+ * @date 2023-07-04
  * @precisions normal z -> c d s
  *
  */
@@ -36,6 +36,7 @@
 
 #include <hip/hip_runtime.h>
 #include <hip/hip_complex.h>
+#include <hip/hip_fp16.h>
 
 #include <hipblas/hipblas.h>
 
@@ -59,6 +60,15 @@ BEGIN_C_DECLS
 #include "gpuhipblas/gpuhipblas_c.h"
 #include "gpuhipblas/gpuhipblas_s.h"
 
+int HIP_hgemm( cham_trans_t transa, cham_trans_t transb,
+               int m, int n, int k,
+               const CHAMELEON_Real16_t *alpha,
+               const CHAMELEON_Real16_t *A, int lda,
+               const CHAMELEON_Real16_t *B, int ldb,
+               const CHAMELEON_Real16_t *beta,
+               CHAMELEON_Real16_t *C, int ldc,
+               hipblasHandle_t handle );
+
 END_C_DECLS
 
 /**
diff --git a/include/chameleon.h b/include/chameleon.h
index 77b6544f0af5e0e599baf8575cd8755b5e6c5f27..b6ff6913c9f74c6960d4615bfeaf82ec878001c8 100644
--- a/include/chameleon.h
+++ b/include/chameleon.h
@@ -11,13 +11,13 @@
  *
  * @brief Chameleon main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Cedric Augonnet
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Philippe Virouleau
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  */
 #ifndef _chameleon_h_
@@ -120,7 +120,7 @@ int CHAMELEON_Lapack_to_Tile( void *Af77, int LDA, CHAM_desc_t *A ) __attribute_
 int CHAMELEON_Tile_to_Lapack( CHAM_desc_t *A, void *Af77, int LDA ) __attribute__((deprecated("Please refer to CHAMELEON_Desc2Lap() instead")));
 
 /* Descriptor */
-int CHAMELEON_Element_Size(int type);
+int CHAMELEON_Element_Size( cham_flttype_t type );
 
 int CHAMELEON_Desc_Create_User( CHAM_desc_t **desc, void *mat, cham_flttype_t dtyp, int mb, int nb, int bsiz,
                                 int lm, int ln, int i, int j, int m, int n, int p, int q,
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index cff1f56529ce56ce1ddf52e81bac66e8c5f219d2..fe50552f405fdef0e5e985d3ebcdcdd39e49b4d5 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -11,14 +11,14 @@
  *
  * @brief Chameleon global constants
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Alycia Lisito
  * @author Terry Cojean
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  */
 #ifndef _chameleon_constants_h_
@@ -36,15 +36,74 @@
 /**
  * @brief Matrix floating point arithmetic
  */
+typedef enum chameleon_arithmetic_e {
+    Cham8      = 0,
+    ChamHalf   = 1,
+    ChamSingle = 2,
+    ChamDouble = 3,
+} cham_arithmetic_t;
+
+#define CHAM_ARITHMETIC_MASK 0b11
+
+typedef enum chameleon_ftype_e {
+    ChamInt     = 0,
+    ChamReal    = 1,
+    ChamComplex = 2,
+} cham_ftype_t;
+
+#define CHAM_FTYPE_MASK 0b1100
+#define CHAM_MIXED_MASK 0b10000
+
+#define cham_get_arith( _ftype_ )   ( (_ftype_) & CHAM_ARITHMETIC_MASK )
+#define cham_get_ftype( _ftype_ )   (( (_ftype_) & CHAM_FTYPE_MASK ) >> 2 )
+#define cham_get_flttype( _ftype_ ) ( (_ftype_) & (CHAM_FTYPE_MASK | CHAM_ARITHMETIC_MASK) )
+#define cham_is_mixed( _ftype_ )    ( (_ftype_) & CHAM_MIXED_MASK )
+
+#define cham_clean_mixed( _ftype_ )    ( (_ftype_) & ~CHAM_MIXED_MASK )
+
+#define CHAMELEON_FLTTYPE( _ftype_, _arithmetic_ ) ( ((_ftype_) << 2) | (_arithmetic_) )
+
 typedef enum chameleon_flttype_e {
-    ChamByte          = 0,
-    ChamInteger       = 1,
-    ChamRealFloat     = 2,
-    ChamRealDouble    = 3,
-    ChamComplexFloat  = 4,
-    ChamComplexDouble = 5,
+    ChamByte               = CHAMELEON_FLTTYPE( ChamInt,     Cham8      ),
+    ChamInteger16          = CHAMELEON_FLTTYPE( ChamInt,     ChamHalf   ),
+    ChamInteger            = CHAMELEON_FLTTYPE( ChamInt,     ChamSingle ),
+    ChamInteger32          = CHAMELEON_FLTTYPE( ChamInt,     ChamSingle ),
+    ChamInteger64          = CHAMELEON_FLTTYPE( ChamInt,     ChamDouble ),
+    ChamRealHalf           = CHAMELEON_FLTTYPE( ChamReal,    ChamHalf   ),
+    ChamRealFloat          = CHAMELEON_FLTTYPE( ChamReal,    ChamSingle ),
+    ChamRealDouble         = CHAMELEON_FLTTYPE( ChamReal,    ChamDouble ),
+    ChamComplexHalf        = CHAMELEON_FLTTYPE( ChamComplex, ChamHalf   ),
+    ChamComplexFloat       = CHAMELEON_FLTTYPE( ChamComplex, ChamSingle ),
+    ChamComplexDouble      = CHAMELEON_FLTTYPE( ChamComplex, ChamDouble ),
+    ChamRealDoubleMixed    = ChamRealDouble    | CHAM_MIXED_MASK,
+    ChamComplexDoubleMixed = ChamComplexDouble | CHAM_MIXED_MASK,
 } cham_flttype_t;
 
+#define ChamComplexSingle ChamComplexFloat
+#define ChamRealSingle    ChamRealFloat
+
+#define ChamConvert( in, out ) ( cham_clean_mixed(in) | (cham_clean_mixed(out) << 5) )
+
+#define ChamConvertComplexDoubleToDouble ChamConvert( ChamComplexDouble, ChamComplexDouble )
+#define ChamConvertComplexDoubleToSingle ChamConvert( ChamComplexDouble, ChamComplexSingle )
+#define ChamConvertComplexDoubleToHalf   ChamConvert( ChamComplexDouble, ChamComplexHalf   )
+#define ChamConvertComplexSingleToDouble ChamConvert( ChamComplexSingle, ChamComplexDouble )
+#define ChamConvertComplexSingleToSingle ChamConvert( ChamComplexSingle, ChamComplexSingle )
+#define ChamConvertComplexSingleToHalf   ChamConvert( ChamComplexSingle, ChamComplexHalf   )
+#define ChamConvertComplexHalfToDouble   ChamConvert( ChamComplexHalf,   ChamComplexDouble )
+#define ChamConvertComplexHalfToSingle   ChamConvert( ChamComplexHalf,   ChamComplexSingle )
+#define ChamConvertComplexHalfToHalf     ChamConvert( ChamComplexHalf,   ChamComplexHalf   )
+
+#define ChamConvertRealDoubleToDouble ChamConvert( ChamRealDouble, ChamRealDouble )
+#define ChamConvertRealDoubleToSingle ChamConvert( ChamRealDouble, ChamRealSingle )
+#define ChamConvertRealDoubleToHalf   ChamConvert( ChamRealDouble, ChamRealHalf   )
+#define ChamConvertRealSingleToDouble ChamConvert( ChamRealSingle, ChamRealDouble )
+#define ChamConvertRealSingleToSingle ChamConvert( ChamRealSingle, ChamRealSingle )
+#define ChamConvertRealSingleToHalf   ChamConvert( ChamRealSingle, ChamRealHalf   )
+#define ChamConvertRealHalfToDouble   ChamConvert( ChamRealHalf,   ChamRealDouble )
+#define ChamConvertRealHalfToSingle   ChamConvert( ChamRealHalf,   ChamRealSingle )
+#define ChamConvertRealHalfToHalf     ChamConvert( ChamRealHalf,   ChamRealHalf   )
+
 /**
  * @brief Matrix tile storage
  */
diff --git a/include/chameleon/runtime_struct.h b/include/chameleon/runtime_struct.h
index 59a2ec3cca1b2b92cf8100bca84297f0806afa1d..4b172782507b5b43e17fb4f3672b6aebf13a7c9d 100644
--- a/include/chameleon/runtime_struct.h
+++ b/include/chameleon/runtime_struct.h
@@ -11,13 +11,13 @@
  *
  * @brief Runtime structures
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Philippe Virouleau
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  */
 #ifndef _chameleon_runtime_struct_h_
@@ -85,6 +85,7 @@ typedef struct runtime_option_s {
     int                 priority;  /**< Define the submitted task priority                       */
     int                 workerid;  /**< Define the prefered worker id to perform the tasks       */
     int                 forcesub;  /**< Force task submission if true                            */
+    int                 withcuda;  /**< Flag to know if cuda is enabled/disabled                 */
     size_t              ws_wsize;  /**< Define the worker workspace size                         */
     size_t              ws_hsize;  /**< Define the host workspace size for hybrid CPU/GPU kernel */
     void               *ws_worker; /**< Pointer to the worker workspace (structure)              */
diff --git a/include/chameleon/types.h b/include/chameleon/types.h
index a174db70eda2f8dca2b1e38e7aae4726a31a8fad..4d46a65fe4733414806d2589d6dc26a80a3b8e55 100644
--- a/include/chameleon/types.h
+++ b/include/chameleon/types.h
@@ -11,14 +11,14 @@
  *
  * @brief Chameleon basic datatypes header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
  * @author Lucas Barros de Assis
  * @author Thomas Mijieux
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  */
 #ifndef _chameleon_types_h_
@@ -104,6 +104,17 @@ typedef int8_t cham_bool_t;
     #endif
 #endif /* CHAMELEON_COMPLEX_CPP */
 
+/**
+ * Half precision on GPUs
+ */
+#if defined(__cplusplus)
+typedef __half CHAMELEON_Real16_t;
+#else
+/* use short for cuda older than 7.5 and non-cuda files
+ * corresponding routines would not work anyway since there is no half precision */
+typedef short  CHAMELEON_Real16_t;
+#endif
+
 /**
  *  CHAMELEON Deprecated attribute
  */
diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c
index 6ade7b962cf17ec170c6e35465086b57583e7bdd..c70f1aa74517beb2edd41d30ebdafb52adc9a0d0 100644
--- a/runtime/starpu/control/runtime_options.c
+++ b/runtime/starpu/control/runtime_options.c
@@ -11,12 +11,12 @@
  *
  * @brief Chameleon StarPU options routines
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
  * @author Florent Pruvost
- * @date 2022-02-22
+ * @date 2023-07-04
  *
  */
 #include "chameleon_starpu.h"
@@ -32,6 +32,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->priority  = RUNTIME_PRIORITY_MIN;
     options->workerid  = (schedopt == NULL) ? -1 : schedopt->workerid;
     options->forcesub  = 0;
+    options->withcuda  = (chamctxt->ncudas > 0);
     options->ws_wsize  = 0;
     options->ws_hsize  = 0;
     options->ws_worker = NULL;