diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index 2e8cb71068602c648bfbf02c6fb0ac187e20fd5c..df41dd109b68835ae2f07759d0bb81b758bfe075 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -39,6 +39,10 @@ _extra_blas = [ ('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ), ('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ), ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), + ('', 'slag2h', 'dlag2h', 'slag2h', 'dlag2h' ), + ('', 'hlag2s', 'hlag2d', 'hlag2s', 'hlag2d' ), + ('', 'slag2h', 'dlag2h', 'clag2x', 'zlag2x' ), + ('', 'hlag2s', 'hlag2d', 'xlag2c', 'xlag2z' ), ('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ), ('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ), ('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ), diff --git a/gpucublas/compute/CMakeLists.txt b/gpucublas/compute/CMakeLists.txt index d7a745c0489c3364398267f64513737a9348d9a3..d06b4792883a9541e66a37d259bec6a20032a31e 100644 --- a/gpucublas/compute/CMakeLists.txt +++ b/gpucublas/compute/CMakeLists.txt @@ -17,14 +17,17 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Florent Pruvost # @author Guillaume Sylvand # @author Mathieu Faverge -# @date 2022-02-22 +# @date 2023-07-04 # ### +# To define CMAKE_CUDA_COMPILER +cmake_minimum_required(VERSION 3.18) + # Generate the chameleon sources for all possible precisions # ------------------------------------------------------ set(GPUCUBLAS_SRCS_GENERATED "") @@ -53,6 +56,217 @@ set(ZSRC cuda_zunmqrt.c ) +# Add CUDA kernel if compiler and toolkit are available +# ----------------------------------------------------- +include(CheckLanguage) +check_language(CUDA) + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit) +else() + message(STATUS "CUDA language is not supported") +endif() + +if (CUDAToolkit_FOUND) + set(CMAKE_CUDA_STANDARD 11) + + # Define the architectures (inspired from the MAGMA project) + set( CHAMELEON_CUDA_TARGETS "Kepler Maxwell Pascal Volta Ampere" CACHE STRING "CUDA architectures to compile for; one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, Hopper, or valid sm_[0-9][0-9]" ) + + # NVCC options for the different cards + # sm_xx is binary, compute_xx is PTX for forward compatibility + # MIN_ARCH is the lowest requested version + + if(WIN32) + # Disable separable compilation on Windows because object linking list + # becomes too long when building multiple archs and MSVC throws errors + set(CUDA_SEPARABLE_COMPILATION OFF) + else() + set(CUDA_SEPARABLE_COMPILATION ON) + endif() + + set(__cuda_architectures) + + # Architectures by names + # ---------------------- + if (CHAMELEON_CUDA_TARGETS MATCHES Fermi) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_20" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Kepler) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_30 sm_35 sm_37" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Maxwell) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_50" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Pascal) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_60" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Volta) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_70" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Turing) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_75" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Ampere) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_80" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES Hopper) + set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_90" ) + endif() + + # Architectures versions + # ---------------------- + if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_20) AND (CUDA_VERSION VERSION_LESS "8.0") ) + if (NOT MIN_ARCH) + set( MIN_ARCH 200 ) + endif() + list(APPEND __cuda_architectures 20) + message( STATUS " compile for CUDA arch 2.0 (Fermi)" ) + endif() + + if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_30) AND (CUDA_VERSION VERSION_LESS "10.0") ) + if (NOT MIN_ARCH) + set( MIN_ARCH 300 ) + endif() + list(APPEND __cuda_architectures 30) + message( STATUS " compile for CUDA arch 3.0 (Kepler)" ) + endif() + + if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_35) AND (CUDA_VERSION VERSION_LESS "11.0") ) + if (NOT MIN_ARCH) + set( MIN_ARCH 300 ) + endif() + list(APPEND __cuda_architectures 35) + message( STATUS " compile for CUDA arch 3.5 (Kepler)" ) + endif() + + if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_50) AND (CUDA_VERSION VERSION_LESS "11.0") ) + if (NOT MIN_ARCH) + set( MIN_ARCH 500 ) + endif() + list(APPEND __cuda_architectures 50) + message( STATUS " compile for CUDA arch 5.0 (Maxwell)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_52) + if (NOT MIN_ARCH) + set( MIN_ARCH 520 ) + endif() + list(APPEND __cuda_architectures 52) + message( STATUS " compile for CUDA arch 5.2 (Maxwell)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_53) + if (NOT MIN_ARCH) + set( MIN_ARCH 530 ) + endif() + list(APPEND __cuda_architectures 53) + message( STATUS " compile for CUDA arch 5.3 (Maxwell)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_60) + if (NOT MIN_ARCH) + set( MIN_ARCH 600 ) + endif() + list(APPEND __cuda_architectures 60) + message( STATUS " compile for CUDA arch 6.0 (Pascal)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_61) + if (NOT MIN_ARCH) + set( MIN_ARCH 610 ) + endif() + list(APPEND __cuda_architectures 61) + message( STATUS " compile for CUDA arch 6.1 (Pascal)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_62) + if (NOT MIN_ARCH) + set( MIN_ARCH 620 ) + endif() + list(APPEND __cuda_architectures 62) + message( STATUS " compile for CUDA arch 6.2 (Pascal)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_70) + if (NOT MIN_ARCH) + set( MIN_ARCH 700 ) + endif() + list(APPEND __cuda_architectures 70) + message( STATUS " compile for CUDA arch 7.0 (Volta)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_71) + if (NOT MIN_ARCH) + set( MIN_ARCH 710 ) + endif() + list(APPEND __cuda_architectures 71) + message( STATUS " compile for CUDA arch 7.1 (Volta)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_75) + if (NOT MIN_ARCH) + set( MIN_ARCH 750 ) + endif() + list(APPEND __cuda_architectures 75) + message( STATUS " compile for CUDA arch 7.5 (Turing)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_80) + if (NOT MIN_ARCH) + set( MIN_ARCH 800 ) + endif() + list(APPEND __cuda_architectures 80) + message( STATUS " compile for CUDA arch 8.0 (Ampere)" ) + endif() + + if (CHAMELEON_CUDA_TARGETS MATCHES sm_90) + if (NOT MIN_ARCH) + set( MIN_ARCH 900 ) + endif() + list(APPEND __cuda_architectures 90) + message( STATUS " compile for CUDA arch 9.0 (Hopper)" ) + endif() + + if (NOT MIN_ARCH) + message( FATAL_ERROR "CHAMELEON_CUDA_TARGETS must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, or valid sm_[0-9][0-9]" ) + endif() + + # Remove extra + # ------------ + if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0") + list(REMOVE_ITEM __cuda_architectures "20" "21") + endif() + + if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") + list(REMOVE_ITEM __cuda_architectures "20" "21") + endif() + + if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") + list(REMOVE_ITEM __cuda_architectures "30" "32") + endif() + + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0") + list(REMOVE_ITEM __cuda_architectures "35" "50") + endif() + + set(CMAKE_CUDA_ARCHITECTURES "${__cuda_architectures}") + + set(ZSRC + ${ZSRC} + cuda_zlag2c.cu + cuda_dlag2h.cu + ) +endif() + # Former MAGMA files that are no longer supported # if( CHAMELEON_USE_MAGMA ) # set(ZSRC @@ -102,6 +316,12 @@ set_property(TARGET gpucublas PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX} target_link_libraries(gpucublas PRIVATE coreblas CUDA::CUBLAS) target_link_libraries(gpucublas PUBLIC MORSE::M) +set_target_properties(gpucublas PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF) + +#target_include_directories( gpucublas PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +#target_link_libraries( gpucublas PRIVATE CUDA::cublas CUDA::cudart ) + # export target coreblas install(EXPORT gpucublasTargets NAMESPACE CHAMELEON:: diff --git a/gpucublas/compute/cuda_dlag2h.cu b/gpucublas/compute/cuda_dlag2h.cu new file mode 100644 index 0000000000000000000000000000000000000000..8136c248fd9ff2c02a4910b251f282acdbe4cc77 --- /dev/null +++ b/gpucublas/compute/cuda_dlag2h.cu @@ -0,0 +1,290 @@ +/** + * + * @file cuda_dlag2h.cu + * + * @copyright 2023-2023 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_dlag2h GPU kernel + * + * @version 1.3.0 + * @author Mark Gates + * @author Mathieu Faverge + * @date 2023-07-04 + * @precisions normal d -> d s + * + * This file is an adaptation of the MAGMA zlag2c.cu, clag2z.cu, hlag2s.cu, and slag2h.cu + * + */ +#include "gpucublas.h" +#include <coreblas/lapacke.h> + +#if CUDA_VERSION < 7500 +#error "This file should not be included as the half precision floats are not supported." +#endif + +#define BLK_X 64 +#define BLK_Y 32 + +__device__ int cuda_dlag2h_flag = 0; + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_dlag2h_kernel( + int m, int n, + const double *A, int lda, + CHAMELEON_Real16_t *HA, int ldha, + double rmax ) +{ + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + double tmp; + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + HA += ind + iby*ldha; + if ( full ) { + /* full block-column */ +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { + tmp = A[j*lda]; + + if ( fabs(tmp) > rmax ) { + cuda_dlag2h_flag = 1; + } + + HA[j*ldha] = __double2half(tmp); + } + } + else { + /* partial block-column */ + for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) { + tmp = A[j*lda]; + if ( fabs(tmp) > rmax ) { + cuda_dlag2h_flag = 1; + } + + HA[j*ldha] = __double2half(tmp); + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_dlag2h converts a double-real matrix, A, to a half-real matrix, + * HA. + * + * RMAX is the overflow for the single-real arithmetic. DLAG2H checks that + * all the entries of A are between -RMAX and RMAX. If not, the conversion is + * aborted and a magma_dlag2h_flag is raised. + * + * @param[in] m + * The number of lines of the matrix A. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A. n >= 0. + * + * @param[in] A + * On entry, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. LDA >= max(1,m). + * + * @param[out] HA + * On exit, if INFO=0, the ldha-by-n coefficient matrix HA; + * if INFO > 0, the content of HA is unspecified. + * + * @param[in] ldha + * The leading dimension of the array HA. LDHA >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * - = 1: an entry of the matrix A is greater than the SINGLE PRECISION + * overflow threshold, in this case, the content + * of HA on exit is unspecified. + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_dlag2h( int m, int n, + const double *A, int lda, + CHAMELEON_Real16_t *HA, int ldha, + cublasHandle_t handle ) +{ + cudaStream_t stream; + double rmax; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( lda < chameleon_max(1,m) ) { + return -4; + } + else if ( ldha < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( m == 0 || n == 0 ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + /* + * There is no lapackf77_hlamch, please visit: + * https://blogs.mathworks.com/cleve/2017/05/08/half-precision-16-bit-floating-point-arithmetic/ + */ + rmax = 65504.; + + cublasGetStream( handle, &stream ); + + cuda_dlag2h_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, HA, ldha, rmax ); + + return 0; +} + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_hlag2d_kernel( + int m, int n, + const CHAMELEON_Real16_t *HA, int ldha, + double *A, int lda ) +{ + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + HA += ind + iby*ldha; + if ( full ) { + // full block-column +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { +#if defined(PRECISION_zc) + A[j*ldha] = make_double( HA[j*ldha].x, HA[j*ldha].y ); +#else + A[j*ldha] = HA[j*ldha]; +#endif + } + } + else { + // partial block-column + for( int j=0; (j < BLK_Y) && (iby+j) < n; ++j ) { +#if defined(PRECISION_zc) + A[j*ldha] = make_double( HA[j*ldha].x, HA[j*ldha].y ); +#else + A[j*ldha] = HA[j*ldha]; +#endif + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_hlag2d converts a half-real matrix, HA, to a double-real matrix, + * A. + * + * Note that while it is possible to overflow while converting from double to + * single, it is not possible to overflow when converting from single to + * double. + * + * @param[in] m + * The number of lines of the matrix A and HA. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A and HA. n >= 0. + * + * @param[in] HA + * On entry, the lda-by-n coefficient matrix HA. + * + * @param[in] ldha + * The leading dimension of the array HA. ldha >= max(1,m). + * + * @param[out] A + * On exit, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_hlag2d( int m, int n, + const CHAMELEON_Real16_t *HA, int ldha, + double *A, int lda, + cublasHandle_t handle ) +{ + cudaStream_t stream; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( ldha < chameleon_max(1,m) ) { + return -4; + } + else if ( lda < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( (m == 0) || (n == 0) ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + cublasGetStream( handle, &stream ); + cuda_hlag2d_kernel<<< grid, threads, 0, stream >>> ( m, n, HA, ldha, A, lda ); + + return 0; +} diff --git a/gpucublas/compute/cuda_zlag2c.cu b/gpucublas/compute/cuda_zlag2c.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf3f35062fbe336c1c8f72119c659ba99e4693da --- /dev/null +++ b/gpucublas/compute/cuda_zlag2c.cu @@ -0,0 +1,304 @@ +/** + * + * @file cuda_zlag2c.cu + * + * @copyright 2023-2023 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_zlag2c GPU kernel + * + * @version 1.3.0 + * @author Mark Gates + * @author Mathieu Faverge + * @date 2023-07-04 + * @precisions mixed zc -> ds + * + * This file is an adaptation of the MAGMA zlag2c.cu and clag2z files. + * + */ +#include "gpucublas.h" +#include <coreblas/lapacke.h> + +#define BLK_X 64 +#define BLK_Y 32 + +__device__ int cuda_zlag2c_flag = 0; + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_zlag2c_kernel( + int m, int n, + const cuDoubleComplex *A, int lda, + cuFloatComplex *SA, int ldsa, + double rmax ) +{ + cuDoubleComplex tmp; + + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + SA += ind + iby*ldsa; + if ( full ) { + /* full block-column */ +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { + tmp = A[j*lda]; + + if ( +#if defined(PRECISION_zc) + (fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax) +#else + (fabs(tmp) > rmax) +#endif + ) + { + cuda_zlag2c_flag = 1; + } + +#if defined(PRECISION_zc) + SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y); +#else + SA[j*ldsa] = tmp; +#endif + } + } + else { + /* partial block-column */ + for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) { + tmp = A[j*lda]; + if ( +#if defined(PRECISION_zc) + (fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax) +#else + (fabs(tmp) > rmax) +#endif + ) + { + cuda_zlag2c_flag = 1; + } + +#if defined(PRECISION_zc) + SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y); +#else + SA[j*ldsa] = tmp; +#endif + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_zlag2c converts a double-complex matrix, A, to a single-complex matrix, + * SA. + * + * RMAX is the overflow for the single-complex arithmetic. ZLAG2C checks that + * all the entries of A are between -RMAX and RMAX. If not, the conversion is + * aborted and a magma_zlag2c_flag is raised. + * + * @param[in] m + * The number of lines of the matrix A. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A. n >= 0. + * + * @param[in] A + * On entry, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. LDA >= max(1,m). + * + * @param[out] SA + * On exit, if INFO=0, the ldsa-by-n coefficient matrix SA; + * if INFO > 0, the content of SA is unspecified. + * + * @param[in] ldsa + * The leading dimension of the array SA. LDSA >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * - = 1: an entry of the matrix A is greater than the COMPLEX + * overflow threshold, in this case, the content + * of SA on exit is unspecified. + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_zlag2c( int m, int n, + const cuDoubleComplex *A, int lda, + cuFloatComplex *SA, int ldsa, + cublasHandle_t handle ) +{ + cudaStream_t stream; + double rmax; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( lda < chameleon_max(1,m) ) { + return -4; + } + else if ( ldsa < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( m == 0 || n == 0 ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + rmax = LAPACKE_slamch_work( 'O' ); + cublasGetStream( handle, &stream ); + + cuda_zlag2c_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, SA, ldsa, rmax ); + + return 0; +} + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_clag2z_kernel( + int m, int n, + const cuFloatComplex *SA, int ldsa, + cuDoubleComplex *A, int lda ) +{ + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + SA += ind + iby*ldsa; + if ( full ) { + // full block-column +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { +#if defined(PRECISION_zc) + A[j*ldsa] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y ); +#else + A[j*ldsa] = SA[j*ldsa]; +#endif + } + } + else { + // partial block-column + for( int j=0; (j < BLK_Y) && (iby+j) < n; ++j ) { +#if defined(PRECISION_zc) + A[j*ldsa] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y ); +#else + A[j*ldsa] = SA[j*ldsa]; +#endif + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_clag2z converts a single-complex matrix, SA, to a double-complex matrix, + * A. + * + * Note that while it is possible to overflow while converting from double to + * single, it is not possible to overflow when converting from single to + * double. + * + * @param[in] m + * The number of lines of the matrix A and SA. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A and SA. n >= 0. + * + * @param[in] SA + * On entry, the lda-by-n coefficient matrix SA. + * + * @param[in] ldsa + * The leading dimension of the array SA. ldsa >= max(1,m). + * + * @param[out] A + * On exit, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_clag2z( int m, int n, + const cuFloatComplex *SA, int ldsa, + cuDoubleComplex *A, int lda, + cublasHandle_t handle ) +{ + cudaStream_t stream; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( ldsa < chameleon_max(1,m) ) { + return -4; + } + else if ( lda < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( (m == 0) || (n == 0) ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + cublasGetStream( handle, &stream ); + cuda_clag2z_kernel<<< grid, threads, 0, stream >>> ( m, n, SA, ldsa, A, lda ); + + return 0; +}