Commit 91e7e27a authored by PRUVOST Florent

add cudablas library to make calls to cuda kernels (magma here, cublas will...

Add a cudablas library that wraps the calls to CUDA kernels (MAGMA for now, cuBLAS will follow); the runtime/starpu codelets no longer call MAGMA directly.
parent 7baf178b
......@@ -789,6 +789,9 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/")
)
link_directories(${MAGMA_LIBRARY_DIRS})
endif()
if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
list(APPEND CHAMELEON_DEP -lcudablas)
endif()
list(APPEND CHAMELEON_DEP
-lcoreblas
......@@ -842,6 +845,16 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/")
#------------------------------------------------------------------------------
###############################################################################
# Cudablas library (CUDA kernels)      #
########################################
if(CHAMELEON_USE_CUDA)
add_subdirectory(cudablas)
endif()
#------------------------------------------------------------------------------
###############################################################################
# Main library #
################
......
......@@ -81,8 +81,10 @@
#ifndef LAPACK_NAME
#define LAPACK_NAME(a, b) lapackef77_##a
#endif
#include "coreblas/include/lapacke.h"
#include "coreblas/include/coreblas.h"
#if defined(CHAMELEON_USE_CUDA)
#include "cudablas/include/cudablas.h"
#endif
#include "morse.h"
......
......@@ -69,6 +69,10 @@ endif()
if(NOT CHAMELEON_SIMULATION)
if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
list(APPEND libs_for_examples
cudablas)
endif()
if(CHAMELEON_USE_CUDA)
list(APPEND libs_for_examples
${CUDA_LIBRARIES}
......
......@@ -69,6 +69,10 @@ unset(libs_for_step0)
if(NOT CHAMELEON_SIMULATION)
if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
list(APPEND libs_for_ltm
cudablas)
endif()
if(CHAMELEON_USE_CUDA)
list(APPEND libs_for_ltm
${CUDA_LIBRARIES}
......
......@@ -188,6 +188,11 @@ add_dependencies(chameleon_starpu
control_include
runtime_starpu_include
)
if (CHAMELEON_USE_CUDA)
add_dependencies(chameleon_starpu
cudablas_include
)
endif()
# installation
# ------------
......
......@@ -3,24 +3,25 @@
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2014 Inria. All rights reserved.
* @copyright (c) 2012-2015 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @file codelet_zpotrf.c
* @file codelet_zcallback.c
*
* MAGMA codelets kernel
* MAGMA is a software package provided by Univ. of Tennessee,
* MORSE codelets kernel
* MORSE is a software package provided by Univ. of Tennessee,
* Univ. of California Berkeley and Univ. of Colorado Denver,
* and INRIA Bordeaux Sud-Ouest
*
* @version 2.3.1
* @author Mathieu Faverge
* @author Cedric Augonnet
* @date 2011-06-01
* @author Florent Pruvost
* @date 2015-09-16
* @precisions normal z -> c d s
*
**/
......
......@@ -150,154 +150,6 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
}
#if defined(CHAMELEON_USE_MAGMA)
/**
 * Blocked factorization of the m-by-n device matrix da (presumably the LQ
 * factorization, going by the name and by the call to CORE_zgelqt), processed
 * in row panels of at most nb rows.
 *
 * For each panel the routine copies the panel to the host buffer v, factors
 * it there with CORE_zgelqt, sends the panel and its T factor back to the
 * device, and applies the resulting block reflector to the rows below with
 * magma_zlarfb_gpu. The update of the rows beyond the next panel is delayed
 * by one iteration (see old_i / old_ib).
 *
 * The routine fetches the StarPU local CUDA stream itself and installs it as
 * the cuBLAS kernel stream before doing any work.
 *
 * Arguments, as they are used by the code below:
 *   m, n      dimensions of da.
 *   nb        panel height (blocking size).
 *   da, ldda  device matrix and its leading dimension; updated in place.
 *   v, ldv    host buffer receiving each panel of da.
 *   dt, lddt  device T factors; the factor of the panel starting at row i is
 *             stored at dt_ref(0,i).
 *   t, ldt    host T buffer; zeroed on entry and reused for every panel.
 *   dd        device buffer for the diagonal tile split off the panel
 *             (ldd is used as its leading dimension as well).
 *   d, ldd    host counterpart of dd.
 *   tau       host scalars; the panel starting at row i uses tau+i.
 *   hwork     host workspace handed to CORE_zgelqt; hwork[0] is set to 1
 *             when min(m,n) == 0.
 *   dwork     device workspace handed to magma_zlarfb_gpu with leading
 *             dimension m.
 *
 * Returns MAGMA_SUCCESS, or -1 if m < 0, -2 if n < 0, -4 if ldda < max(1,m).
 *
 * NOTE(review): the return values of cudaMemset, cublasGetMatrix,
 * cublasSetMatrix and of the MAGMA/CORE calls are all ignored.
 */
magma_int_t
magma_zgelqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
magmaDoubleComplex *da, magma_int_t ldda,
magmaDoubleComplex *v, magma_int_t ldv,
magmaDoubleComplex *dt, magma_int_t lddt,
magmaDoubleComplex *t, magma_int_t ldt,
magmaDoubleComplex *dd,
magmaDoubleComplex *d, magma_int_t ldd,
magmaDoubleComplex *tau,
magmaDoubleComplex *hwork,
magmaDoubleComplex *dwork)
{
/* Column-major element addressing helpers: x_ref(row, col) */
#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1))
#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1))
/* old_i / old_ib remember the previous panel; they are only read when i > 0.
   They are assigned at the end of every iteration for which i + ib < m, which
   appears to always hold for an iteration that is followed by another one
   (assuming nb > 0) -- TODO confirm */
int i, k, ib, lddwork, old_i, old_ib, rows, cols;
double _Complex one=1.;
CUstream stream;
/* run the cuBLAS calls on the stream of the current StarPU CUDA worker */
stream = starpu_cuda_get_local_stream();
cublasSetKernelStream( stream );
/* argument checks: the negative value identifies the offending argument */
if (m < 0) {
return -1;
} else if (n < 0) {
return -2;
} else if (ldda < max(1,m)) {
return -4;
}
k = min(m,n);
/* quick return for an empty matrix */
if (k == 0) {
hwork[0] = *((magmaDoubleComplex*) &one);
return MAGMA_SUCCESS;
}
lddwork= m;
/* lower parts of little T must be zero: memset to 0 for simplicity
   (host copy t and device copy dt, nb*n elements each) */
memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
/* copy first panel of A on the host */
cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
da_ref(0, 0), ldda,
v, ldv);
/* Use blocked code initially */
for (i = 0; i < k; i += nb) {
ib = min(k-i, nb);
if (i+nb >= m) ib = min(m-i, nb);
cols = n-i;
if (i > 0){
/* copy panel of A from device to host (whole rows, starting at column 0) */
cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
da_ref(i, 0), ldda,
v, ldv);
/* Delayed update: apply the previous panel's block reflector, from the
   right, to the rows old_i+2*old_ib .. m-1 that were left out of the
   look-ahead update of the previous iteration */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, n-old_i, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, lddwork);
}
/* merge the diagonal tile saved in dd back into the previous panel of da */
magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream);
}
/* Factor the current panel on the host; this also forms the triangular
   factor T of the block reflector
   H = H'(i+ib-1) . . . H(i+1) H(i) */
CORE_zgelqt(ib, cols, ib,
(double _Complex*) v_ref(0,i), ldv,
(double _Complex*) t_ref(0,0), ldt,
(double _Complex*) tau+i,
(double _Complex*) hwork);
if ( i + ib < m ){
/* put 0s in the lower triangular part of a panel (and 1s on the
diagonal); copy the lower triangular in d */
CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
(double _Complex*) v_ref(0,i), ldv,
(double _Complex*) d, ldd);
/* copy from host to device a tile diag */
cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
d, ldd, dd, ldd );
}
/* Send the triangular factor T to the GPU */
cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
t_ref(0,0), ldt, dt_ref(0,i), lddt );
/* A panel (with zeros in lower tri of its diag) is ready to be used
in input of zlarfb_gpu: we send the panel to the gpu */
cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
v_ref(0,i), ldv, da_ref(i,i), ldda );
if (i + ib < m) {
/* look-ahead: update only the next panel (at most ib rows) now */
if (i+2*ib < m){
rows = ib;
}
else{
rows = m-i-ib;
}
/* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
/* remember this panel for the delayed update of the next iteration */
old_i = i;
old_ib = ib;
/* last iteration of the loop: there is no next iteration to perform the
   delayed update, so do it here */
if (i+nb >= k){
/* Apply H' to A(i+2*ib:m, i:n) from the right */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, lddwork);
}
/* merge the diagonal tile saved in dd back into this panel of da
   (same call as the one made at the top of the loop for i > 0) */
magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream);
}
}
}
#undef da_ref
#undef v_ref
#undef dt_ref
#undef t_ref
return MAGMA_SUCCESS;
} /* magma_zgelqt_gpu */
static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
......@@ -307,6 +159,7 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
int lda, ldt;
CUstream stream;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
......@@ -326,11 +179,15 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
h_W = h_TAU + max(m,n);
h_D = h_W + ib*ib;
magma_zgelqt_gpu( m, n, ib,
d_A, lda, h_A, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W);
stream = starpu_cuda_get_local_stream();
cublasSetKernelStream( stream );
CUDA_zgelqt(
m, n, ib,
d_A, lda, h_A, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
......
This diff is collapsed.
......@@ -146,9 +146,6 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
cuDoubleComplex *dL, *dD, *dA;
int lddl, lddd, ldda;
int info = 0;
int ret;
/*
* hwork => nb*nb
*/
......@@ -157,21 +154,9 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda);
/* The kernel is just using the inverted part or nothing */
if ( lddl >= 2*ib ) {
dL += ib;
ret = magma_zgessm_gpu( MagmaColMajor, m, n, k, ib,
IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
}
else {
ret = magma_zgessm_gpu( MagmaColMajor, m, n, k, ib,
IPIV, NULL, 1, dD, lddd, dA, ldda, &info );
}
if (ret != MAGMA_SUCCESS) {
fprintf(stderr, "Error in MAGMA: %d\n", ret);
exit(-1);
}
CUDA_zgessm(
MagmaColMajor, m, n, k, ib,
IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
cudaThreadSynchronize();
......
......@@ -214,12 +214,13 @@ static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
hL = NULL;
}
magma_zgetrf_incpiv_gpu( MagmaColMajor, m, n, ib,
hA, lda, dA, lda,
hL, ib, dL, ldl,
IPIV,
dwork, lda,
&info );
CUDA_zgetrf_incpiv(
MagmaColMajor, m, n, ib,
hA, lda, dA, lda,
hL, ib, dL, ldl,
IPIV,
dwork, lda,
&info );
cudaThreadSynchronize();
}
......
......@@ -136,7 +136,7 @@ static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
magma_zgetrf_nopiv_gpu( m, n, dA, lda, &info );
CUDA_zgetrf_nopiv( m, n, dA, lda, &info );
cudaThreadSynchronize();
}
#endif
......
......@@ -77,7 +77,6 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg)
static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
int ret;
int info = 0;
int N;
cuDoubleComplex *A;
......@@ -85,12 +84,7 @@ static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
ret = magma_zlauum_gpu( uplo, N, A, LDA, &info);
if (ret != MAGMA_SUCCESS) {
fprintf(stderr, "Error in MAGMA: %d\n", ret);
exit(-1);
}
CUDA_zlauum( uplo, N, A, LDA, &info);
cudaThreadSynchronize();
return;
}
......
......@@ -89,8 +89,6 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
/* cuDoubleComplex *hA; */
int lda;
int iinfo;
int ret;
int info = 0;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
......@@ -107,14 +105,7 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
// exit(-1);
// }
ret = magma_zpotrf_gpu(
uplo,
n, A, lda, &info);
/* hA, stream );*/
if (ret != MAGMA_SUCCESS) {
fprintf(stderr, "Error in MAGMA: %d\n", ret);
exit(-1);
}
CUDA_zpotrf( uplo, n, A, lda, &info);
cudaThreadSynchronize();
// cudaStreamDestroy( stream[1] );
......
......@@ -202,7 +202,7 @@ static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
dL1 += ib;
}
magma_zssssm_gpu(
CUDA_zssssm(
MagmaColMajor, m1, n1, m2, n2, k, ib,
dA1, lda1, dA2, lda2,
dL1, ldl1, dL2, ldl2,
......
......@@ -90,19 +90,11 @@ static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
cuDoubleComplex *A;
int LDA;
int iinfo;
int ret;
int info = 0;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
ret = magma_ztrtri_gpu( uplo, diag,
N, A, LDA, &info);
if (ret != MAGMA_SUCCESS) {
fprintf(stderr, "Error in MAGMA: %d\n", ret);
exit(-1);
}
CUDA_ztrtri( uplo, diag, N, A, LDA, &info);
cudaThreadSynchronize();
return;
}
......
......@@ -171,159 +171,6 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
}
#if defined(CHAMELEON_USE_MAGMA)
/**
 * Blocked "ts" LQ-type factorization of a pair of device tiles: the diagonal
 * tiles of da1 together with the m-by-n tile da2, processed in row panels of
 * at most nb rows.
 *
 * For each panel the routine copies the diagonal tile of da1 (into d) and the
 * row panel of da2 (into a2) to the host, factors them with CORE_ztslqt,
 * sends the results and the T factor back to the device, and updates the
 * following rows with magma_ztsmlq_gpu. The update of the rows beyond the
 * next panel is delayed by one iteration (see old_i / old_ib).
 *
 * Arguments, as they are used by the code below:
 *   m, n        dimensions of da2; m is also the extent walked on da1.
 *   nb          panel height (blocking size).
 *   da1, ldda1  device tile A1; its diagonal ib-by-ib blocks are read and
 *               written back, and its lower rows are updated.
 *   da2, ldda2  device tile A2; updated in place.
 *   a2, lda2    host buffer receiving each row panel of da2.
 *   dt, lddt    device T factors; the factor of the panel starting at row i
 *               is stored at dt_ref(0,i).
 *   t, ldt      host T buffer; zeroed on entry and reused for every panel.
 *   dd          not referenced by this routine.
 *   d, ldd      host buffer holding the current diagonal tile of A1.
 *   tau         host scalars handed to CORE_ztslqt.
 *   hwork       host workspace for CORE_ztslqt; hwork[0] is set to 1 when
 *               min(m,n) == 0.
 *   dwork       device workspace for magma_ztsmlq_gpu, split in two parts:
 *               dwork (leading dimension m) and dwork + nb*m (leading
 *               dimension nb).
 *   stream      CUDA stream forwarded to magma_ztsmlq_gpu.
 *
 * Returns MAGMA_SUCCESS, or -1 if m < 0, -2 if n < 0, -4 if ldda2 < max(1,m).
 *
 * NOTE(review): the return values of cudaMemset, cublasGetMatrix,
 * cublasSetMatrix and of the MAGMA/CORE calls are all ignored.
 * NOTE(review): tau is passed to CORE_ztslqt without an offset, so every
 * panel writes to the same location, unlike magma_zgelqt_gpu which uses
 * tau+i -- confirm this is intended.
 */
magma_int_t
magma_ztslqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
magmaDoubleComplex *da1, magma_int_t ldda1,
magmaDoubleComplex *da2, magma_int_t ldda2,
magmaDoubleComplex *a2, magma_int_t lda2,
magmaDoubleComplex *dt, magma_int_t lddt,
magmaDoubleComplex *t, magma_int_t ldt,
magmaDoubleComplex *dd,
magmaDoubleComplex *d, magma_int_t ldd,
magmaDoubleComplex *tau,
magmaDoubleComplex *hwork,
magmaDoubleComplex *dwork,
CUstream stream)
{
/* Column-major element addressing helpers: x_ref(row, col).
   a2_ref, t_ref and d_ref are defined but not used below. */
#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1))
#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1))
#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1))
#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1))
#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1))
#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1))
/* old_i / old_ib remember the previous panel; they are only read when i > 0.
   They are assigned at the end of every iteration for which i + ib < m, which
   appears to always hold for an iteration that is followed by another one
   (assuming nb > 0) -- TODO confirm */
int i, k, lddwork, old_i, old_ib, rows, cols;
int ib;
double _Complex one=1.;
/* argument checks: the negative value identifies the offending argument */
if (m < 0) {
return -1;
} else if (n < 0) {
return -2;
} else if (ldda2 < max(1,m)) {
return -4;
}
k = min(m,n);
/* quick return for an empty matrix */
if (k == 0) {
hwork[0] = *((magmaDoubleComplex*) &one);
return MAGMA_SUCCESS;
}
lddwork= m;
/* lower parts of little T must be zero: memset all to 0 for simplicity
   (host copy t and device copy dt, nb*n elements each) */
memset(t, 0, nb*n*sizeof(magmaDoubleComplex));
cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
/* from here on k is the height of the first panel */
k = min(m, nb); // m can be lower than IB
/* copy the first diag tile of A1 from device to host: da1 -> d */
cublasGetMatrix(k, k, sizeof(magmaDoubleComplex),
da1_ref(0, 0), ldda1,
d, ldd);
/* copy first panel of A2 from device to host: da2 -> a2 */
cublasGetMatrix(k, n, sizeof(magmaDoubleComplex),
da2_ref(0, 0), ldda2,
a2, lda2);
/* This is only blocked code for now */
for (i = 0; i < m; i += nb) {
ib = min(m-i, nb);
cols = n;
/* Fetch the next panel (diagonal block of A1 into d, row panel of A2
   into a2) to the host, and apply the tsmlq update of the previous panel
   to the rows that have not been updated yet */
if (i>0) {
/* copy the diag tile of A1 from device to host: da1 -> d */
cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
da1_ref(i, i), ldda1,
d, ldd);
/* copy panel of A2 from device to host: da2 -> a2 */
cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex),
da2_ref(i, 0), ldda2,
a2, lda2);
/* Delayed update: apply the previous panel's reflectors, from the right
   (MagmaRight, MagmaConjTrans), to the rows old_i+2*old_ib .. m-1 that
   were left out of the look-ahead update of the previous iteration */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_ztsmlq_gpu( MagmaRight, MagmaConjTrans,
rows, old_ib, rows, cols, old_ib, old_ib,
da1_ref(old_i+2*old_ib, old_i), ldda1,
da2_ref(old_i+2*old_ib, 0), ldda2,
da2_ref(old_i, 0), ldda2,
dt_ref(0, old_i), lddt,
dwork, lddwork,
dwork + nb * lddwork, nb,
stream );
}
}
/* compute LQ factorization of the panel of A2 ib x cols
   (together with the ib x ib diagonal tile held in d) */
CORE_ztslqt(ib, cols, ib,
(double _Complex*) d, ldd,
(double _Complex*) a2, lda2,
(double _Complex*) t, ldt,
(double _Complex*) tau,
(double _Complex*) hwork);
/* Send the panel from A2 back to the GPU */
cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
a2, lda2,
da2_ref(i, 0), ldda2);
/* Send the triangular factor T (host buffer t) to the GPU */
cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
t, ldt,
dt_ref(0, i), lddt);
/* get back the diag tile in A1 from host to device: d -> da1 */
cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
d, ldd,
da1_ref(i, i), ldda1);
/* tsmlq update on one panel forward (look ahead 1) */
if (i + ib < m) {
/* update only the next panel (at most ib rows) now */
if (i+2*ib < m){
rows = ib;
}
else{
rows = m-i-ib;
}
/* Apply H' to A(i+ib:i+2*ib,i:n) from the right */
magma_ztsmlq_gpu( MagmaRight, MagmaConjTrans,
rows, ib, rows, cols, ib, ib,
da1_ref(i+ib, i), ldda1,
da2_ref(i+ib, 0), ldda2,
da2_ref(i, 0), ldda2,
dt_ref(0, i), lddt,
dwork, lddwork,
dwork + nb * lddwork, nb,
stream );
/* remember this panel for the delayed update of the next iteration */
old_i = i;
old_ib = ib;
}
}
#undef da1_ref
#undef da2_ref
#undef a2_ref
#undef t_ref
#undef dt_ref
#undef d_ref
return MAGMA_SUCCESS;
} /* magma_ztslqt_gpu */
static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
......@@ -353,12 +200,13 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
h_D = h_W + ib*m;
stream = starpu_cuda_get_local_stream();
magma_ztslqt_gpu( m, n, ib,
d_A1, lda1, d_A2, lda2,
h_A2, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
CUDA_ztslqt(
m, n, ib,
d_A1, lda1, d_A2, lda2,
h_A2, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
......
......@@ -213,135 +213,6 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg)
}
#if defined(CHAMELEON_USE_MAGMA)
magma_int_t
magma_ztsmlq_gpu( magma_side_t side, magma_trans_t trans,
magma_int_t M1, magma_int_t N1,
magma_int_t M2, magma_int_t N2,
magma_int_t K, magma_int_t IB,
magmaDoubleComplex *A1, magma_int_t LDA1,
magmaDoubleComplex *A2, magma_int_t LDA2,
const magmaDoubleComplex *V, magma_int_t LDV,
const magmaDoubleComplex *T, magma_int_t LDT,
magmaDoubleComplex *WORK, magma_int_t LDWORK,
magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
CUstream stream)
{
int i, i1, i3;
int NW;
int kb;
int ic = 0;
int jc = 0;
int mi = M1;
int ni = N1;
/* Check input arguments */
if ((side != MagmaLeft) && (side != MagmaRight)) {
return -1;
}
/* NW is the minimum dimension of WORK */
if (side == MagmaLeft) {
NW = IB;
}
else {
NW = N1;
}
if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
return -2;
}
if (M1 < 0) {
return -3;
}
if (N1 < 0) {
return -4;
}
if ( (M2 < 0) ||
( (M2 != M1) && (side == MagmaRight) ) ){
return -5;
}
if ( (N2 < 0) ||
( (N2 != N1) && (side == MagmaLeft) ) ){
return -6;
}
if ((K < 0) ||
( (side == MagmaLeft) && (K > M1) ) ||
( (side == MagmaRight) && (K > N1) ) ) {
return -7;
}
if (IB < 0) {
return -8;
}
if (LDA1 < max(1,M1)){
return -10;
}
if (LDA2 < max(1,M2)){
return -12;
}
if (LDV < max(1,K)){
return -14;
}
if (LDT < max(1,IB)){
return -16;
}
if (LDWORK < max(1,NW)){