Commit 73117bfb authored by PRUVOST Florent's avatar PRUVOST Florent

update to cublas interface v2

parent 1b4504ef
......@@ -50,13 +50,12 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
//#if defined(CHAMELEON_USE_CUBLAS_V2)
//#include <cublas_v2.h>
//#else
//#include <cublas.h>
//#endif
#if defined(CHAMELEON_USE_CUBLAS_V2)
#include <cublas_v2.h>
#else
#include <cublas.h>
#endif
#endif
#if defined(CHAMELEON_USE_OPENCL)
#include <OpenCL/cl.h>
......
......@@ -105,6 +105,84 @@ static void cl_zgemm_cpu_func(void *descr[], void *cl_arg)
}
#ifdef CHAMELEON_USE_CUDA
#if defined(CHAMELEON_USE_CUBLAS_V2)
static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum transA;
MORSE_enum transB;
int m;
int n;
int k;
cuDoubleComplex alpha;
const cuDoubleComplex *A;
int lda;
const cuDoubleComplex *B;
int ldb;
cuDoubleComplex beta;
cuDoubleComplex *C;
int ldc;
A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasOperation_t cublasTransA;
if (transA == MorseNoTrans){
cublasTransA = CUBLAS_OP_N;
}else if(transA == MorseTrans){
cublasTransA = CUBLAS_OP_T;
}else if(transA == MorseConjTrans){
cublasTransA = CUBLAS_OP_C;
}else{
fprintf(stderr, "Error in cl_zgemm_cuda_func: bad transA parameter %d\n", transA);
}
cublasOperation_t cublasTransB;
if (transB == MorseNoTrans){
cublasTransB = CUBLAS_OP_N;
}else if(transB == MorseTrans){
cublasTransB = CUBLAS_OP_T;
}else if(transB == MorseConjTrans){
cublasTransB = CUBLAS_OP_C;
}else{
fprintf(stderr, "Error in cl_zgemm_cuda_func: bad transB parameter %d\n", transB);
}
stat = cublasZgemm(handle,
cublasTransA, cublasTransB,
m, n, k,
(const cuDoubleComplex *) &alpha, A, lda,
B, ldb,
(const cuDoubleComplex *) &beta, C, ldc);
if (stat != CUBLAS_STATUS_SUCCESS){
printf ("cublasZgemm failed");
cublasDestroy(handle);
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasDestroy(handle);
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
return;
}
#else /* CHAMELEON_USE_CUBLAS_V2 */
static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum transA;
......@@ -135,6 +213,7 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
alpha, A, lda,
B, ldb,
beta, C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
#ifndef STARPU_CUDA_ASYNC
......@@ -143,7 +222,8 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
return;
}
#endif
#endif /* CHAMELEON_USE_CUBLAS_V2 */
#endif /* CHAMELEON_USE_CUDA */
/*
* Codelet definition
......
......@@ -153,6 +153,61 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
#if defined(CHAMELEON_USE_MAGMA)
#if defined(CHAMELEON_USE_CUBLAS_V2)
magma_int_t
magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
magma_int_t M, magma_int_t N,
magmaDoubleComplex *A, magma_int_t LDA,
magmaDoubleComplex *B, magma_int_t LDB)
{
int i, j;
magmaDoubleComplex *cola, *colb;
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
if (M < 0) {
return -1;
}
if (N < 0) {
return -2;
}
if ( (LDA < max(1,M)) && (M > 0) ) {
return -5;
}
if ( (LDB < max(1,M)) && (M > 0) ) {
return -7;
}
if (side == MagmaLeft){
for(i=0; i<N; i++){
cola = A + i*LDA;
colb = B + i*LDB;
cublasZcopy(handle, i+1, cola, 1, colb, 1);
}
}else{
for(i=0; i<N; i++){
cola = A + i*LDA;
colb = B + i*LDB;
cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
}
}
cublasDestroy(handle);
return MAGMA_SUCCESS;
}
#else /* CHAMELEON_USE_CUBLAS_V2 */
magma_int_t
magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
magma_int_t M, magma_int_t N,
......@@ -191,7 +246,7 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
return MAGMA_SUCCESS;
}
#endif /* CHAMELEON_USE_CUBLAS_V2 */
magma_int_t
magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
......
......@@ -102,6 +102,81 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg)
}
#ifdef CHAMELEON_USE_CUDA
#if defined(CHAMELEON_USE_CUBLAS_V2)
static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum side;
MORSE_enum uplo;
int M;
int N;
cuDoubleComplex alpha;
const cuDoubleComplex *A;
int LDA;
const cuDoubleComplex *B;
int LDB;
cuDoubleComplex beta;
cuDoubleComplex *C;
int LDC;
A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasSideMode_t cublasSide;
if (side == MorseLeft){
cublasSide = CUBLAS_SIDE_LEFT;
}else if (side == MorseRight){
cublasSide = CUBLAS_SIDE_RIGHT;
}else{
fprintf(stderr, "Error in cl_zhemm_cuda_func: bad side parameter %d\n", side);
}
cublasFillMode_t cublasUplo;
if (uplo == MorseUpper){
cublasUplo = CUBLAS_FILL_MODE_UPPER;
}else if(uplo == MorseLower){
cublasUplo = CUBLAS_FILL_MODE_LOWER;
}else if(uplo == MorseUpperLower){
cublasUplo = 0;
}else{
fprintf(stderr, "Error in cl_zhemm_cuda_func: bad uplo parameter %d\n", uplo);
}
stat = cublasZhemm(handle,
cublasSide, cublasUplo,
M, N,
(const cuDoubleComplex *) &alpha, A, LDA,
B, LDB,
(const cuDoubleComplex *) &beta, C, LDC);
if (stat != CUBLAS_STATUS_SUCCESS){
printf ("cublasZhemm failed");
cublasDestroy(handle);
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasDestroy(handle);
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
return;
}
#else /* CHAMELEON_USE_CUBLAS_V2 */
static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum side;
......@@ -138,7 +213,8 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
return;
}
#endif
#endif /* CHAMELEON_USE_CUBLAS_V2 */
#endif /* CHAMELEON_USE_CUDA */
/*
* Codelet definition
......
......@@ -97,6 +97,81 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg)
}
#ifdef CHAMELEON_USE_CUDA
#if defined(CHAMELEON_USE_CUBLAS_V2)
static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
MORSE_enum trans;
int n;
int k;
cuDoubleComplex alpha;
const cuDoubleComplex *A;
int lda;
const cuDoubleComplex *B;
int ldb;
double beta;
cuDoubleComplex *C;
int ldc;
A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasFillMode_t cublasUplo;
if (uplo == MorseUpper){
cublasUplo = CUBLAS_FILL_MODE_UPPER;
}else if(uplo == MorseLower){
cublasUplo = CUBLAS_FILL_MODE_LOWER;
}else if(uplo == MorseUpperLower){
cublasUplo = 0;
}else{
fprintf(stderr, "Error in cl_zher2k_cuda_func: bad uplo parameter %d\n", uplo);
}
cublasOperation_t cublasTrans;
if (trans == MorseNoTrans){
cublasTrans = CUBLAS_OP_N;
}else if(trans == MorseTrans){
cublasTrans = CUBLAS_OP_T;
}else if(trans == MorseConjTrans){
cublasTrans = CUBLAS_OP_C;
}else{
fprintf(stderr, "Error in cl_zher2k_cuda_func: bad trans parameter %d\n", trans);
}
stat = cublasZher2k( handle, cublasUplo, cublasTrans,
n, k, (const cuDoubleComplex *) &alpha, A, lda, B, ldb,
(const double *) &beta, C, ldc);
if (stat != CUBLAS_STATUS_SUCCESS){
printf ("cublasZher2k failed");
cublasDestroy(handle);
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasDestroy(handle);
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
return;
}
#else /* CHAMELEON_USE_CUBLAS_V2 */
static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
......@@ -129,7 +204,8 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
return;
}
#endif
#endif /* CHAMELEON_USE_CUBLAS_V2 */
#endif /* CHAMELEON_USE_CUDA */
/*
* Codelet definition
......
......@@ -93,6 +93,80 @@ static void cl_zherk_cpu_func(void *descr[], void *cl_arg)
}
#ifdef CHAMELEON_USE_CUDA
#if defined(CHAMELEON_USE_CUBLAS_V2)
static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
MORSE_enum trans;
int n;
int k;
double alpha;
const cuDoubleComplex *A;
int lda;
double beta;
cuDoubleComplex *C;
int ldc;
A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasFillMode_t cublasUplo;
if (uplo == MorseUpper){
cublasUplo = CUBLAS_FILL_MODE_UPPER;
}else if(uplo == MorseLower){
cublasUplo = CUBLAS_FILL_MODE_LOWER;
}else if(uplo == MorseUpperLower){
cublasUplo = 0;
}else{
fprintf(stderr, "Error in cl_zherk_cuda_func: bad uplo parameter %d\n", uplo);
}
cublasOperation_t cublasTrans;
if (trans == MorseNoTrans){
cublasTrans = CUBLAS_OP_N;
}else if(trans == MorseTrans){
cublasTrans = CUBLAS_OP_T;
}else if(trans == MorseConjTrans){
cublasTrans = CUBLAS_OP_C;
}else{
fprintf(stderr, "Error in cl_zherk_cuda_func: bad trans parameter %d\n", trans);
}
stat = cublasZherk(handle,
cublasUplo, cublasTrans,
n, k,
(const double *) &alpha, A, lda,
(const double *) &beta, C, ldc);
if (stat != CUBLAS_STATUS_SUCCESS){
printf ("cublasZherk failed");
cublasDestroy(handle);
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasDestroy(handle);
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
return;
}
#else /* CHAMELEON_USE_CUBLAS_V2 */
static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
......@@ -125,7 +199,8 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
return;
}
#endif
#endif /* CHAMELEON_USE_CUBLAS_V2 */
#endif /* CHAMELEON_USE_CUDA */
/*
* Codelet definition
......
......@@ -102,6 +102,81 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg)
}
#ifdef CHAMELEON_USE_CUDA
#if defined(CHAMELEON_USE_CUBLAS_V2)
static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum side;
MORSE_enum uplo;
int M;
int N;
cuDoubleComplex alpha;
const cuDoubleComplex *A;
int LDA;
const cuDoubleComplex *B;
int LDB;
cuDoubleComplex beta;
cuDoubleComplex *C;
int LDC;
A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasSideMode_t cublasSide;
if (side == MorseLeft){
cublasSide = CUBLAS_SIDE_LEFT;
}else if (side == MorseRight){
cublasSide = CUBLAS_SIDE_RIGHT;
}else{
fprintf(stderr, "Error in cl_zsymm_cuda_func: bad side parameter %d\n", side);
}
cublasFillMode_t cublasUplo;
if (uplo == MorseUpper){
cublasUplo = CUBLAS_FILL_MODE_UPPER;
}else if(uplo == MorseLower){
cublasUplo = CUBLAS_FILL_MODE_LOWER;
}else if(uplo == MorseUpperLower){
cublasUplo = 0;
}else{
fprintf(stderr, "Error in cl_zsymm_cuda_func: bad uplo parameter %d\n", uplo);
}
stat = cublasZsymm(handle,
cublasSide, cublasUplo,
M, N,
(const cuDoubleComplex *) &alpha, A, LDA,
B, LDB,
(const cuDoubleComplex *) &beta, C, LDC);
if (stat != CUBLAS_STATUS_SUCCESS){
printf ("cublasZsymm failed");
cublasDestroy(handle);
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasDestroy(handle);
#ifndef STARPU_CUDA_ASYNC
cudaStreamSynchronize( stream );
#endif
return;
}
#else /* CHAMELEON_USE_CUBLAS_V2 */
static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum side;
......@@ -138,7 +213,8 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
return;
}
#endif
#endif /* CHAMELEON_USE_CUBLAS_V2 */
#endif /* CHAMELEON_USE_CUDA */
/*
* Codelet definition
......
......@@ -97,6 +97,81 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
}
#ifdef CHAMELEON_USE_CUDA
#if defined(CHAMELEON_USE_CUBLAS_V2)
static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
MORSE_enum trans;
int n;
int k;
cuDoubleComplex alpha;
const cuDoubleComplex *A;
int lda;
const cuDoubleComplex *B;
int ldb;
cuDoubleComplex beta;
cuDoubleComplex *C;
int ldc;
A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("CUBLAS initialization failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
CUstream stream = starpu_cuda_get_local_stream();
stat = cublasSetStream(handle, stream);
if (stat != CUBLAS_STATUS_SUCCESS) {
printf ("cublasSetStream failed\n");
assert( stat == CUBLAS_STATUS_SUCCESS );
}
cublasFillMode_t cublasUplo;
if (uplo == MorseUpper){
cublasUplo = CUBLAS_FILL_MODE_UPPER;
}else if(uplo == MorseLower){
cublasUplo = CUBLAS_FILL_MODE_LOWER;
}else if(uplo == MorseUpperLower){