Commit 0036dfaa authored by Mathieu Faverge

gpucublas: Add dlag2h and zlag2c cuda kernels originated from Magma

parent 50590d59
Merge request !395: Introduce half-precision conversion and gemm kernels for GPUs
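
As context for the diff below, here is a minimal, hypothetical host-side sketch of how the new real-precision conversion kernels could be chained; the wrapper name roundtrip_d_h and the device buffers dA/dHA are illustrative, assumed to be allocated by the caller, and error handling is omitted:

    #include <cublas_v2.h>
    #include "gpucublas.h"

    /* Round-trip a device-resident m-by-n double matrix through half precision. */
    static void roundtrip_d_h( int m, int n, double *dA, int lda,
                               CHAMELEON_Real16_t *dHA, int ldha,
                               cublasHandle_t handle )
    {
        /* double -> half: entries with |a| > 65504 raise the kernel overflow flag */
        CUDA_dlag2h( m, n, dA, lda, dHA, ldha, handle );

        /* half -> double: this direction cannot overflow */
        CUDA_hlag2d( m, n, dHA, ldha, dA, lda, handle );
    }

Both calls launch asynchronously on the stream attached to the cuBLAS handle.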
@@ -39,6 +39,10 @@ _extra_blas = [
('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ),
('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ),
('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ),
('', 'slag2h', 'dlag2h', 'slag2h', 'dlag2h' ),
('', 'hlag2s', 'hlag2d', 'hlag2s', 'hlag2d' ),
('', 'slag2h', 'dlag2h', 'clag2x', 'zlag2x' ),
('', 'hlag2s', 'hlag2d', 'xlag2c', 'xlag2z' ),
('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ),
('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ),
('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ),
...
@@ -17,14 +17,17 @@
# Univ. of California Berkeley,
# Univ. of Colorado Denver.
#
-# @version 1.2.0
+# @version 1.3.0
# @author Florent Pruvost
# @author Guillaume Sylvand
# @author Mathieu Faverge
-# @date 2022-02-22
+# @date 2023-07-04
#
###
# To define CMAKE_CUDA_COMPILER
cmake_minimum_required(VERSION 3.18)
# Generate the chameleon sources for all possible precisions
# ------------------------------------------------------
set(GPUCUBLAS_SRCS_GENERATED "")
@@ -53,6 +56,217 @@ set(ZSRC
cuda_zunmqrt.c
)
# Add CUDA kernel if compiler and toolkit are available
# -----------------------------------------------------
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
else()
message(STATUS "CUDA language is not supported")
endif()
if (CUDAToolkit_FOUND)
set(CMAKE_CUDA_STANDARD 11)
# Define the architectures (inspired from the MAGMA project)
set( CHAMELEON_CUDA_TARGETS "Kepler Maxwell Pascal Volta Ampere" CACHE STRING "CUDA architectures to compile for; one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, Hopper, or valid sm_[0-9][0-9]" )
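# CHAMELEON_CUDA_TARGETS is a cached STRING, so it can be narrowed at configure
# time, e.g. (hypothetical invocation): cmake -DCHAMELEON_CUDA_TARGETS="Ampere" ..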
# NVCC options for the different cards
# sm_xx is binary, compute_xx is PTX for forward compatibility
# MIN_ARCH is the lowest requested version
if(WIN32)
# Disable separable compilation on Windows because object linking list
# becomes too long when building multiple archs and MSVC throws errors
set(CUDA_SEPARABLE_COMPILATION OFF)
else()
set(CUDA_SEPARABLE_COMPILATION ON)
endif()
set(__cuda_architectures)
# Architectures by names
# ----------------------
if (CHAMELEON_CUDA_TARGETS MATCHES Fermi)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_20" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Kepler)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_30 sm_35 sm_37" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Maxwell)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_50" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Pascal)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_60" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Volta)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_70" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Turing)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_75" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Ampere)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_80" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES Hopper)
set( CHAMELEON_CUDA_TARGETS "${CHAMELEON_CUDA_TARGETS} sm_90" )
endif()
# Architectures versions
# ----------------------
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_20) AND (CUDA_VERSION VERSION_LESS "8.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 200 )
endif()
list(APPEND __cuda_architectures 20)
message( STATUS " compile for CUDA arch 2.0 (Fermi)" )
endif()
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_30) AND (CUDA_VERSION VERSION_LESS "10.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 300 )
endif()
list(APPEND __cuda_architectures 30)
message( STATUS " compile for CUDA arch 3.0 (Kepler)" )
endif()
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_35) AND (CUDA_VERSION VERSION_LESS "11.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 300 )
endif()
list(APPEND __cuda_architectures 35)
message( STATUS " compile for CUDA arch 3.5 (Kepler)" )
endif()
if ( (CHAMELEON_CUDA_TARGETS MATCHES sm_50) AND (CUDA_VERSION VERSION_LESS "11.0") )
if (NOT MIN_ARCH)
set( MIN_ARCH 500 )
endif()
list(APPEND __cuda_architectures 50)
message( STATUS " compile for CUDA arch 5.0 (Maxwell)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_52)
if (NOT MIN_ARCH)
set( MIN_ARCH 520 )
endif()
list(APPEND __cuda_architectures 52)
message( STATUS " compile for CUDA arch 5.2 (Maxwell)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_53)
if (NOT MIN_ARCH)
set( MIN_ARCH 530 )
endif()
list(APPEND __cuda_architectures 53)
message( STATUS " compile for CUDA arch 5.3 (Maxwell)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_60)
if (NOT MIN_ARCH)
set( MIN_ARCH 600 )
endif()
list(APPEND __cuda_architectures 60)
message( STATUS " compile for CUDA arch 6.0 (Pascal)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_61)
if (NOT MIN_ARCH)
set( MIN_ARCH 610 )
endif()
list(APPEND __cuda_architectures 61)
message( STATUS " compile for CUDA arch 6.1 (Pascal)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_62)
if (NOT MIN_ARCH)
set( MIN_ARCH 620 )
endif()
list(APPEND __cuda_architectures 62)
message( STATUS " compile for CUDA arch 6.2 (Pascal)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_70)
if (NOT MIN_ARCH)
set( MIN_ARCH 700 )
endif()
list(APPEND __cuda_architectures 70)
message( STATUS " compile for CUDA arch 7.0 (Volta)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_71)
if (NOT MIN_ARCH)
set( MIN_ARCH 710 )
endif()
list(APPEND __cuda_architectures 71)
message( STATUS " compile for CUDA arch 7.1 (Volta)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_75)
if (NOT MIN_ARCH)
set( MIN_ARCH 750 )
endif()
list(APPEND __cuda_architectures 75)
message( STATUS " compile for CUDA arch 7.5 (Turing)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_80)
if (NOT MIN_ARCH)
set( MIN_ARCH 800 )
endif()
list(APPEND __cuda_architectures 80)
message( STATUS " compile for CUDA arch 8.0 (Ampere)" )
endif()
if (CHAMELEON_CUDA_TARGETS MATCHES sm_90)
if (NOT MIN_ARCH)
set( MIN_ARCH 900 )
endif()
list(APPEND __cuda_architectures 90)
message( STATUS " compile for CUDA arch 9.0 (Hopper)" )
endif()
if (NOT MIN_ARCH)
message( FATAL_ERROR "CHAMELEON_CUDA_TARGETS must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, Hopper, or valid sm_[0-9][0-9]" )
endif()
# Remove extra
# ------------
if(CUDA_VERSION VERSION_GREATER_EQUAL "8.0")
list(REMOVE_ITEM __cuda_architectures "20" "21")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
list(REMOVE_ITEM __cuda_architectures "20" "21")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0")
list(REMOVE_ITEM __cuda_architectures "30" "32")
endif()
if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0")
list(REMOVE_ITEM __cuda_architectures "35" "50")
endif()
set(CMAKE_CUDA_ARCHITECTURES "${__cuda_architectures}")
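# For example, with CUDA >= 11 and CHAMELEON_CUDA_TARGETS="Volta Ampere", the
# names expand to sm_70 sm_80 and CMAKE_CUDA_ARCHITECTURES becomes "70;80".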
set(ZSRC
${ZSRC}
cuda_zlag2c.cu
cuda_dlag2h.cu
)
endif()
# Former MAGMA files that are no longer supported
# if( CHAMELEON_USE_MAGMA )
# set(ZSRC
@@ -102,6 +316,12 @@ set_property(TARGET gpucublas PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}
target_link_libraries(gpucublas PRIVATE coreblas CUDA::CUBLAS)
target_link_libraries(gpucublas PUBLIC MORSE::M)
set_target_properties(gpucublas PROPERTIES
CUDA_SEPARABLE_COMPILATION OFF)
#target_include_directories( gpucublas PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
#target_link_libraries( gpucublas PRIVATE CUDA::cublas CUDA::cudart )
# export target coreblas
install(EXPORT gpucublasTargets
NAMESPACE CHAMELEON::
...
/**
*
* @file cuda_dlag2h.cu
*
* @copyright 2023-2023 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_dlag2h GPU kernel
*
* @version 1.3.0
* @author Mark Gates
* @author Mathieu Faverge
* @date 2023-07-04
* @precisions normal d -> d s
*
* This file is an adaptation of the MAGMA zlag2c.cu, clag2z.cu, hlag2s.cu, and slag2h.cu
*
*/
#include "gpucublas.h"
#include <coreblas/lapacke.h>
#if CUDA_VERSION < 7500
#error "This file should not be compiled as half precision floats are not supported by this CUDA version."
#endif
#define BLK_X 64
#define BLK_Y 32
__device__ int cuda_dlag2h_flag = 0;
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
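* For example, m = 1000 and n = 100 give a ceil(1000/64) x ceil(100/32) = 16 x 4
* grid of 64-thread blocks.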
*/
__global__
void cuda_dlag2h_kernel(
int m, int n,
const double *A, int lda,
CHAMELEON_Real16_t *HA, int ldha,
double rmax )
{
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
double tmp;
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
HA += ind + iby*ldha;
if ( full ) {
/* full block-column */
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
tmp = A[j*lda];
if ( fabs(tmp) > rmax ) {
cuda_dlag2h_flag = 1;
}
HA[j*ldha] = __double2half(tmp);
}
}
else {
/* partial block-column */
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
tmp = A[j*lda];
if ( fabs(tmp) > rmax ) {
cuda_dlag2h_flag = 1;
}
HA[j*ldha] = __double2half(tmp);
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_dlag2h converts a double-real matrix, A, to a half-real matrix,
* HA.
*
* RMAX is the overflow threshold of the half precision arithmetic. DLAG2H checks
* that all the entries of A are between -RMAX and RMAX. If not, the conversion is
* aborted and the cuda_dlag2h_flag is raised.
*
* @param[in] m
* The number of rows of the matrix A. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A. n >= 0.
*
* @param[in] A
* On entry, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. LDA >= max(1,m).
*
* @param[out] HA
* On exit, if INFO=0, the ldha-by-n coefficient matrix HA;
* if INFO > 0, the content of HA is unspecified.
*
* @param[in] ldha
* The leading dimension of the array HA. LDHA >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
* - = 1: an entry of the matrix A is greater than the HALF PRECISION
* overflow threshold; in this case, the content
* of HA on exit is unspecified.
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_dlag2h( int m, int n,
const double *A, int lda,
CHAMELEON_Real16_t *HA, int ldha,
cublasHandle_t handle )
{
cudaStream_t stream;
double rmax;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( lda < chameleon_max(1,m) ) {
return -4;
}
else if ( ldha < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( m == 0 || n == 0 ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
/*
* There is no lapackf77_hlamch, please visit:
* https://blogs.mathworks.com/cleve/2017/05/08/half-precision-16-bit-floating-point-arithmetic/
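* The largest finite IEEE half precision value is 65504 = (2 - 2^-10) * 2^15,
* which is used as RMAX below.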
*/
rmax = 65504.;
cublasGetStream( handle, &stream );
cuda_dlag2h_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, HA, ldha, rmax );
return 0;
}
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
*/
__global__
void cuda_hlag2d_kernel(
int m, int n,
const CHAMELEON_Real16_t *HA, int ldha,
double *A, int lda )
{
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
HA += ind + iby*ldha;
if ( full ) {
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
A[j*lda] = HA[j*ldha];
}
}
else {
// partial block-column
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
A[j*lda] = HA[j*ldha];
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_hlag2d converts a half-real matrix, HA, to a double-real matrix,
* A.
*
* Note that while it is possible to overflow when converting from double to
* half, it is not possible to overflow when converting from half to
* double.
*
* @param[in] m
* The number of rows of the matrix A and HA. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A and HA. n >= 0.
*
* @param[in] HA
* On entry, the ldha-by-n coefficient matrix HA.
*
* @param[in] ldha
* The leading dimension of the array HA. ldha >= max(1,m).
*
* @param[out] A
* On exit, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. lda >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_hlag2d( int m, int n,
const CHAMELEON_Real16_t *HA, int ldha,
double *A, int lda,
cublasHandle_t handle )
{
cudaStream_t stream;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( ldha < chameleon_max(1,m) ) {
return -4;
}
else if ( lda < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( (m == 0) || (n == 0) ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
cublasGetStream( handle, &stream );
cuda_hlag2d_kernel<<< grid, threads, 0, stream >>> ( m, n, HA, ldha, A, lda );
return 0;
}
/**
*
* @file cuda_zlag2c.cu
*
* @copyright 2023-2023 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_zlag2c GPU kernel
*
* @version 1.3.0
* @author Mark Gates
* @author Mathieu Faverge
* @date 2023-07-04
* @precisions mixed zc -> ds
*
* This file is an adaptation of the MAGMA zlag2c.cu and clag2z.cu files.
*
*/
#include "gpucublas.h"
#include <coreblas/lapacke.h>
#define BLK_X 64
#define BLK_Y 32
__device__ int cuda_zlag2c_flag = 0;
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
*/
__global__
void cuda_zlag2c_kernel(
int m, int n,
const cuDoubleComplex *A, int lda,
cuFloatComplex *SA, int ldsa,
double rmax )
{
cuDoubleComplex tmp;
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
SA += ind + iby*ldsa;
if ( full ) {
/* full block-column */
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
tmp = A[j*lda];
if (
#if defined(PRECISION_zc)
(fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax)
#else
(fabs(tmp) > rmax)
#endif
)
{
cuda_zlag2c_flag = 1;
}
#if defined(PRECISION_zc)
SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y);
#else
SA[j*ldsa] = tmp;
#endif
}
}
else {
/* partial block-column */
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
tmp = A[j*lda];
if (
#if defined(PRECISION_zc)
(fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax)
#else
(fabs(tmp) > rmax)
#endif
)
{
cuda_zlag2c_flag = 1;
}
#if defined(PRECISION_zc)
SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y);
#else
SA[j*ldsa] = tmp;
#endif
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_zlag2c converts a double-complex matrix, A, to a single-complex matrix,
* SA.
*
* RMAX is the overflow threshold of the single precision complex arithmetic.
* ZLAG2C checks that all the entries of A are between -RMAX and RMAX. If not,
* the conversion is aborted and the cuda_zlag2c_flag is raised.
*
* @param[in] m
* The number of rows of the matrix A. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A. n >= 0.
*
* @param[in] A
* On entry, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. LDA >= max(1,m).
*
* @param[out] SA
* On exit, if INFO=0, the ldsa-by-n coefficient matrix SA;
* if INFO > 0, the content of SA is unspecified.
*
* @param[in] ldsa
* The leading dimension of the array SA. LDSA >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
* - = 1: an entry of the matrix A is greater than the SINGLE PRECISION
* overflow threshold; in this case, the content
* of SA on exit is unspecified.
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_zlag2c( int m, int n,
const cuDoubleComplex *A, int lda,
cuFloatComplex *SA, int ldsa,
cublasHandle_t handle )
{
cudaStream_t stream;
double rmax;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( lda < chameleon_max(1,m) ) {
return -4;
}
else if ( ldsa < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( m == 0 || n == 0 ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
rmax = LAPACKE_slamch_work( 'O' );
cublasGetStream( handle, &stream );
cuda_zlag2c_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, SA, ldsa, rmax );
return 0;
}
/*
* Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks.
* Each block has BLK_X threads.
* Each thread loops across one row, updating BLK_Y entries.
*/
__global__
void cuda_clag2z_kernel(
int m, int n,
const cuFloatComplex *SA, int ldsa,
cuDoubleComplex *A, int lda )
{
int ind = blockIdx.x * BLK_X + threadIdx.x;
int iby = blockIdx.y * BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind >= m ) {
return;
}
A += ind + iby*lda;
SA += ind + iby*ldsa;
if ( full ) {
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j ) {
#if defined(PRECISION_zc)
A[j*lda] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y );
#else
A[j*lda] = SA[j*ldsa];
#endif
}
}
else {
// partial block-column
for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) {
#if defined(PRECISION_zc)
A[j*lda] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y );
#else
A[j*lda] = SA[j*ldsa];
#endif
}
}
}
}
/**
*
* @ingroup CUDA_CHAMELEON_Complex64_t
*
* CUDA_clag2z converts a single-complex matrix, SA, to a double-complex matrix,
* A.
*
* Note that while it is possible to overflow while converting from double to
* single, it is not possible to overflow when converting from single to
* double.
*
* @param[in] m
* The number of rows of the matrix A and SA. m >= 0.
*
* @param[in] n
* The number of columns of the matrix A and SA. n >= 0.
*
* @param[in] SA
* On entry, the ldsa-by-n coefficient matrix SA.
*
* @param[in] ldsa
* The leading dimension of the array SA. ldsa >= max(1,m).
*
* @param[out] A
* On exit, the lda-by-n coefficient matrix A.
*
* @param[in] lda
* The leading dimension of the array A. lda >= max(1,m).
*
* @return
* - = 0: successful exit.
* - < 0: if INFO = -i, the i-th argument had an illegal value
*
* @param[in] handle
* Cublas handle to execute in.
*
**/
extern "C" int
CUDA_clag2z( int m, int n,
const cuFloatComplex *SA, int ldsa,
cuDoubleComplex *A, int lda,
cublasHandle_t handle )
{
cudaStream_t stream;
if ( m < 0 ) {
return -1;
}
else if ( n < 0 ) {
return -2;
}
else if ( ldsa < chameleon_max(1,m) ) {
return -4;
}
else if ( lda < chameleon_max(1,m) ) {
return -6;
}
/* quick return */
if ( (m == 0) || (n == 0) ) {
return 0;
}
dim3 threads( BLK_X, 1 );
dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) );
cublasGetStream( handle, &stream );
cuda_clag2z_kernel<<< grid, threads, 0, stream >>> ( m, n, SA, ldsa, A, lda );
return 0;
}
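
A similar, equally hypothetical sketch for the complex pair defined in cuda_zlag2c.cu (device buffers dA/dSA and the cuBLAS handle are again assumed to be set up by the caller):

    #include <cublas_v2.h>
    #include "gpucublas.h"

    /* Demote a device-resident m-by-n double-complex matrix and promote it back. */
    static void roundtrip_z_c( int m, int n, cuDoubleComplex *dA, int lda,
                               cuFloatComplex *dSA, int ldsa,
                               cublasHandle_t handle )
    {
        /* double-complex -> single-complex: overflow raises the kernel flag */
        CUDA_zlag2c( m, n, dA, lda, dSA, ldsa, handle );

        /* single-complex -> double-complex: cannot overflow */
        CUDA_clag2z( m, n, dSA, ldsa, dA, lda, handle );
    }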