Commit 0036dfaa authored by Mathieu Faverge

gpucublas: Add dlag2h and zlag2c cuda kernels originated from Magma

parent 50590d59
Merge request !395: Introduce half-precision conversion and gemm kernels for GPUs
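
As context for the diff below, here is a minimal, hypothetical host-side sketch of how the new real-precision conversion kernels could be chained; the wrapper name roundtrip_d_h and the device buffers dA/dHA are illustrative, assumed to be allocated by the caller, and error handling is omitted:

    #include <cublas_v2.h>
    #include "gpucublas.h"

    /* Round-trip a device-resident m-by-n double matrix through half precision. */
    static void roundtrip_d_h( int m, int n, double *dA, int lda,
                               CHAMELEON_Real16_t *dHA, int ldha,
                               cublasHandle_t handle )
    {
        /* double -> half: entries with |a| > 65504 raise the kernel overflow flag */
        CUDA_dlag2h( m, n, dA, lda, dHA, ldha, handle );

        /* half -> double: this direction cannot overflow */
        CUDA_hlag2d( m, n, dHA, ldha, dA, lda, handle );
    }

Both calls launch asynchronously on the stream attached to the cuBLAS handle.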
@@ -39,6 +39,10 @@ _extra_blas = [
('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ),
('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ),
('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ),
('', 'slag2h', 'dlag2h', 'slag2h', 'dlag2h' ),
('', 'hlag2s', 'hlag2d', 'hlag2s', 'hlag2d' ),
('', 'slag2h', 'dlag2h', 'clag2x', 'zlag2x' ),
('', 'hlag2s', 'hlag2d', 'xlag2c', 'xlag2z' ),
('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ),
('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ),
('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ),
...
@@ -17,14 +17,17 @@
# Univ. of California Berkeley,
# Univ. of Colorado Denver.
#
-# @version 1.2.0
+# @version 1.3.0
# @author Florent Pruvost
# @author Guillaume Sylvand
# @author Mathieu Faverge
-# @date 2022-02-22
+# @date 2023-07-04
#
###
# To define CMAKE_CUDA_COMPILER
cmake_minimum_required(VERSION 3.18)
# Generate the chameleon sources for all possible precisions
# ------------------------------------------------------
set(GPUCUBLAS_SRCS_GENERATED "")
@@ -53,6 +56,217 @@ set(ZSRC
cuda_zunmqrt.c
)
# Add CUDA kernel if compiler and toolkit are available
# -----------------------------------------------------
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
else()
message(STATUS "CUDA language is not supported")
endif()
if (CUDAToolkit_FOUND)
set(CMAKE_CUDA_STANDARD 11)
# Define the architectures (inspired from the MAGMA project)
set( CHAMELEON_CUDA_TARGETS "Kepler Maxwell Pascal Volta Ampere" CACHE STRING "CUDA architectures to compile for; one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, Hopper, or valid sm_[0-9][0-9]" )
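# CHAMELEON_CUDA_TARGETS is a cached STRING, so it can be narrowed at configure
# time, e.g. (hypothetical invocation): cmake -DCHAMELEON_CUDA_TARGETS="Ampere" ..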
# NVCC options for the different cards
# sm_xx is binary, compute_xx is PTX for forward compatibility
# MIN_ARCH is the lowest requested version
if(WIN32)
# Disable separable compilation on Windows because object linking list
# becomes too long when building multiple archs and MSVC throws errors
set(CUDA_SEPARABLE_COMPILATION OFF)
else()
set(CUDA_SEPARABLE_COMPILATION ON)
endif()
set(__cuda_architectures)
# Architectures by names
# ----------------------
if (CHAMELEON_CUDA_TARGETS MATCHES Fermi)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_20" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Kepler)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_30 sm_35 sm_37" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Maxwell)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_50" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Pascal)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_60" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Volta)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_70" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Turing)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_75" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Ampere)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_80" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Hopper)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_90" )
endif()
# Architectures versions
# ----------------------
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_20) AND (CUDA_VERSION VERSION_LESS "8.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 200 )
endif()
list(APPEND __cuda_architectures 20)
message( STATUS " compile for CUDA arch 2.0 (Fermi)" )
endif()
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_30) AND (CUDA_VERSION VERSION_LESS "10.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 300 )
endif()
list(APPEND __cuda_architectures 30)
message( STATUS " compile for CUDA arch 3.0 (Kepler)" )
endif()
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_35) AND (CUDA_VERSION VERSION_LESS "11.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 300 )
endif()
list(APPEND __cuda_architectures 35)
message( STATUS " compile for CUDA arch 3.5 (Kepler)" )
endif()
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_50) AND (CUDA_VERSION VERSION_LESS "11.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 500 )
endif()
list(APPEND __cuda_architectures 50)
message( STATUS " compile for CUDA arch 5.0 (Maxwell)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_52)
if (NOT MIN_ARCH)
set( MIN_ARCH 520 )
endif()
list(APPEND __cuda_architectures 52)
message( STATUS " compile for CUDA arch 5.2 (Maxwell)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_53)
if (NOT MIN_ARCH)
set( MIN_ARCH 530 )
endif()
list(APPEND __cuda_architectures 53)
message( STATUS " compile for CUDA arch 5.3 (Maxwell)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_60)
if (NOT MIN_ARCH)
set( MIN_ARCH 600 )
endif()
list(APPEND __cuda_architectures 60)
message( STATUS " compile for CUDA arch 6.0 (Pascal)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_61)
if (NOT MIN_ARCH)
set( MIN_ARCH 610 )
endif()
list(APPEND __cuda_architectures 61)
message( STATUS " compile for CUDA arch 6.1 (Pascal)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_62)
if (NOT MIN_ARCH)
set( MIN_ARCH 620 )
endif()
list(APPEND __cuda_architectures 62)
message( STATUS " compile for CUDA arch 6.2 (Pascal)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_70)
if (NOT MIN_ARCH)
set( MIN_ARCH 700 )
endif()
list(APPEND __cuda_architectures 70)
message( STATUS " compile for CUDA arch 7.0 (Volta)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_71)
if (NOT MIN_ARCH)
set( MIN_ARCH 710 )
endif()
list(APPEND __cuda_architectures 71)
message( STATUS " compile for CUDA arch 7.1 (Volta)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_75)
if (NOT MIN_ARCH)
set( MIN_ARCH 750 )
endif()
list(APPEND __cuda_architectures 75)
message( STATUS " compile for CUDA arch 7.5 (Turing)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_80)
if (NOT MIN_ARCH)
set( MIN_ARCH 800 )
endif()
list(APPEND __cuda_architectures 80)
message( STATUS " compile for CUDA arch 8.0 (Ampere)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_90)
if (NOT MIN_ARCH)
set( MIN_ARCH 900 )
endif()
list(APPEND __cuda_architectures 90)
message( STATUS " compile for CUDA arch 9.0 (Hopper)" )
endif()
if (NOT MIN_ARCH)
message( FATAL_ERROR "CHAMELEON_CUDA_TARGETS must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, Hopper, or valid sm_[0-9][0-9]" )
endif()
# Remove extra
# ------------
if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
list(REMOVE_ITEM __cuda_architectures "20" "21")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
list(REMOVE_ITEM __cuda_architectures "20" "21")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
list(REMOVE_ITEM __cuda_architectures "30" "32")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
list(REMOVE_ITEM __cuda_architectures "35" "50")
endif()
set(CMAKE_CUDA_ARCHITECTURES "${__cuda_architectures}")
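# For example, with CUDA >= 11 and CHAMELEON_CUDA_TARGETS="Volta Ampere", the
# names expand to sm_70 sm_80 and CMAKE_CUDA_ARCHITECTURES becomes "70;80".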
set(ZSRC
${ZSRC}
cuda_zlag2c.cu
cuda_dlag2h.cu
)
endif()
# Former MAGMA files that are no longer supported
# if( CHAMELEON_USE_MAGMA )
# set(ZSRC
@@ -102,6 +316,12 @@ set_property(TARGET gpucublas PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}
target_link_libraries(gpucublas PRIVATE coreblas CUDA::CUBLAS)
target_link_libraries(gpucublas PUBLIC MORSE::M)
set_target_properties(gpucublas PROPERTIES
CUDA_SEPARABLE_COMPILATION OFF)
#target_include_directories( gpucublas PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
#target_link_libraries( gpucublas PRIVATE CUDA::cublas CUDA::cudart )
# export target coreblas
install(EXPORT gpucublasTargets
NAMESPACE CHAMELEON::
...
/**
*
* @file cuda_dlag2h.cu
*
* @copyright 2023-2023 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_dlag2h GPU kernel
*
* @version 1.3.0
* @author Mark Gates
* @author Mathieu Faverge
* @date 2023-07-04
* @precisions normal d -> d s
*
* This file is an adaptation of the MAGMA zlag2c.cu, clag2z.cu, hlag2s.cu, and slag2h.cu
*
*/
#include "gpucublas.h"
#include <coreblas/lapacke.h>
#if CUDA_VERSION < 7500
#error "This file should not be compiled as half precision floats are not supported by this CUDA version."
#endif
#define BLK_X 64
#define BLK_Y 32
__device__ int cuda_dlag2h_flag = 0;
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
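* For example, m = 1000 and n = 100 give a ceil(1000/64) x ceil(100/32) = 16 x 4
* grid of 64-thread blocks.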
*/
__global__
void cuda_dlag2h_kernel(
int m, int n,
const double *A, int lda,
CHAMELEON_Real16_t *HA, int ldha,
double rmax )
{
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
double tmp;
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
HA += ind + iby*ldha;
if ( full ) {
/* full block-column */
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
tmp = A[j*lda];
if ( fabs(tmp) > rmax ) {
cuda_dlag2h_flag = 1;
}
HA[j*ldha] = __double2half(tmp);
}
}
else {
/* partial block-column */
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
tmp = A[j*lda];
if ( fabs(tmp) > rmax ) {
cuda_dlag2h_flag = 1;
}
HA[j*ldha] = __double2half(tmp);
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_dlag2h converts a double-real matrix, A, to a half-real matrix,
* HA.
*
* RMAX is the overflow threshold of the half precision arithmetic. DLAG2H checks
* that all the entries of A are between -RMAX and RMAX. If not, the conversion is
* aborted and the cuda_dlag2h_flag is raised.
*
* @param[in] m
* The number of rows of the matrix A. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A. n >= 0.
*
* @param[in] A
* On entry, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. LDA >= max(1,m).
*
* @param[out] HA
* On exit, if INFO=0, the ldha-by-n coefficient matrix HA;
* if INFO > 0, the content of HA is unspecified.
*
* @param[in] ldha
* The leading dimension of the array HA. LDHA >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
* - = 1: an entry of the matrix A is greater than the HALF PRECISION
* overflow threshold; in this case, the content
* of HA on exit is unspecified.
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_dlag2h( int m, int n,
const double *A, int lda,
CHAMELEON_Real16_t *HA, int ldha,
cublasHandle_t handle )
{
cudaStream_t stream;
double rmax;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( lda < chameleon_max(1,m) ) {
return -4;
}
else if ( ldha < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( m == 0 || n == 0 ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
/*
* There is no lapackf77_hlamch, please visit:
* https://blogs.mathworks.com/cleve/2017/05/08/half-precision-16-bit-floating-point-arithmetic/
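* The largest finite IEEE half precision value is 65504 = (2 - 2^-10) * 2^15,
* which is used as RMAX below.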
*/
rmax = 65504.;
cublasGetStream( handle, &stream );
cuda_dlag2h_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, HA, ldha, rmax );
return 0;
}
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
*/
__global__
void cuda_hlag2d_kernel(
int m, int n,
const CHAMELEON_Real16_t *HA, int ldha,
double *A, int lda )
{
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
HA += ind + iby*ldha;
if ( full ) {
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
A[j*lda] = HA[j*ldha];
}
}
else {
// partial block-column
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
A[j*lda] = HA[j*ldha];
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_hlag2d converts a half-real matrix, HA, to a double-real matrix,
* A.
*
* Note that while it is possible to overflow when converting from double to
* half, it is not possible to overflow when converting from half to
* double.
*
* @param[in] m
* The number of rows of the matrix A and HA. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A and HA. n >= 0.
*
* @param[in] HA
* On entry, the ldha-by-n coefficient matrix HA.
*
* @param[in] ldha
* The leading dimension of the array HA. ldha >= max(1,m).
*
* @param[out] A
* On exit, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. lda >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_hlag2d( int m, int n,
const CHAMELEON_Real16_t *HA, int ldha,
double *A, int lda,
cublasHandle_t handle )
{
cudaStream_t stream;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( ldha < chameleon_max(1,m) ) {
return -4;
}
else if ( lda < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( (m == 0) || (n == 0) ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
cublasGetStream( handle, &stream );
cuda_hlag2d_kernel<<< grid, threads, 0, stream >>> ( m, n, HA, ldha, A, lda );
return 0;
}
/**
*
* @file cuda_zlag2c.cu
*
* @copyright 2023-2023 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_zlag2c GPU kernel
*
* @version 1.3.0
* @author Mark Gates
* @author Mathieu Faverge
* @date 2023-07-04
* @precisions mixed zc -> ds
*
* This file is an adaptation of the MAGMA zlag2c.cu and clag2z.cu files.
*
*/
#include "gpucublas.h"
#include <coreblas/lapacke.h>
#define BLK_X 64
#define BLK_Y 32
__device__ int cuda_zlag2c_flag = 0;
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
*/
__global__
void cuda_zlag2c_kernel(
int m, int n,
const cuDoubleComplex *A, int lda,
cuFloatComplex *SA, int ldsa,
double rmax )
{
cuDoubleComplex tmp;
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
SA += ind + iby*ldsa;
if ( full ) {
/* full block-column */
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
tmp = A[j*lda];
if (
#if defined(PRECISION_zc)
(fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax)
#else
(fabs(tmp) > rmax)
#endif
)
{
cuda_zlag2c_flag = 1;
}
#if defined(PRECISION_zc)
SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y);
#else
SA[j*ldsa] = tmp;
#endif
}
}
else {
/* partial block-column */
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
tmp = A[j*lda];
if (
#if defined(PRECISION_zc)
(fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax)
#else
(fabs(tmp) > rmax)
#endif
)
{
cuda_zlag2c_flag = 1;
}
#if defined(PRECISION_zc)
SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y);
#else
SA[j*ldsa] = tmp;
#endif
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_zlag2c converts a double-complex matrix, A, to a single-complex matrix,
* SA.
*
* RMAX is the overflow threshold of the single precision complex arithmetic.
* ZLAG2C checks that all the entries of A are between -RMAX and RMAX. If not,
* the conversion is aborted and the cuda_zlag2c_flag is raised.
*
* @param[in] m
* The number of rows of the matrix A. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A. n >= 0.
*
* @param[in] A
* On entry, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. LDA >= max(1,m).
*
* @param[out] SA
* On exit, if INFO=0, the ldsa-by-n coefficient matrix SA;
* if INFO > 0, the content of SA is unspecified.
*
* @param[in] ldsa
* The leading dimension of the array SA. LDSA >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
* - = 1: an entry of the matrix A is greater than the SINGLE PRECISION
* overflow threshold; in this case, the content
* of SA on exit is unspecified.
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_zlag2c( int m, int n,
const cuDoubleComplex *A, int lda,
cuFloatComplex *SA, int ldsa,
cublasHandle_t handle )
{
cudaStream_t stream;
double rmax;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( lda < chameleon_max(1,m) ) {
return -4;
}
else if ( ldsa < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( m == 0 || n == 0 ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
rmax = LAPACKE_slamch_work( 'O' );
cublasGetStream( handle, &stream );
cuda_zlag2c_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, SA, ldsa, rmax );
return 0;
}
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
*/
__global__
void cuda_clag2z_kernel(
int m, int n,
const cuFloatComplex *SA, int ldsa,
cuDoubleComplex *A, int lda )
{
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
SA += ind + iby*ldsa;
if ( full ) {
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
#if defined(PRECISION_zc)
A[j*lda] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y );
#else
A[j*lda] = SA[j*ldsa];
#endif
}
}
else {
// partial block-column
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
#if defined(PRECISION_zc)
A[j*lda] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y );
#else
A[j*lda] = SA[j*ldsa];
#endif
}
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_clag2z converts a single-complex matrix, SA, to a double-complex matrix,
* A.
*
* Note that while it is possible to overflow while converting from double to
* single, it is not possible to overflow when converting from single to
* double.
*
* @param[in] m
* The number of rows of the matrix A and SA. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A and SA. n >= 0.
*
* @param[in] SA
* On entry, the ldsa-by-n coefficient matrix SA.
*
* @param[in] ldsa
* The leading dimension of the array SA. ldsa >= max(1,m).
*
* @param[out] A
* On exit, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. lda >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_clag2z( int m, int n,
const cuFloatComplex *SA, int ldsa,
cuDoubleComplex *A, int lda,
cublasHandle_t handle )
{
cudaStream_t stream;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( ldsa < chameleon_max(1,m) ) {
return -4;
}
else if ( lda < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( (m == 0) || (n == 0) ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
cublasGetStream( handle, &stream );
cuda_clag2z_kernel<<< grid, threads, 0, stream >>> ( m, n, SA, ldsa, A, lda );
return 0;
}
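
A similar, equally hypothetical sketch for the complex pair defined in cuda_zlag2c.cu (device buffers dA/dSA and the cuBLAS handle are again assumed to be set up by the caller):

    #include <cublas_v2.h>
    #include "gpucublas.h"

    /* Demote a device-resident m-by-n double-complex matrix and promote it back. */
    static void roundtrip_z_c( int m, int n, cuDoubleComplex *dA, int lda,
                               cuFloatComplex *dSA, int ldsa,
                               cublasHandle_t handle )
    {
        /* double-complex -> single-complex: overflow raises the kernel flag */
        CUDA_zlag2c( m, n, dA, lda, dSA, ldsa, handle );

        /* single-complex -> double-complex: cannot overflow */
        CUDA_clag2z( m, n, dSA, ldsa, dA, lda, handle );
    }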