From 91e7e27a2b84b306dab4e5a21122adf5c00884a6 Mon Sep 17 00:00:00 2001 From: Florent Pruvost <florent.pruvost@inria.fr> Date: Thu, 17 Sep 2015 08:16:19 +0000 Subject: [PATCH] add cudablas library to make calls to cuda kernels (magma here, cublas will follow), no more calls to magma in runtime/starpu codelets --- CMakeLists.txt | 13 + control/common.h | 4 +- example/basic_zposv/CMakeLists.txt | 4 + example/lapack_to_morse/CMakeLists.txt | 4 + runtime/starpu/CMakeLists.txt | 5 + runtime/starpu/codelets/codelet_zcallback.c | 11 +- runtime/starpu/codelets/codelet_zgelqt.c | 163 +---- runtime/starpu/codelets/codelet_zgeqrt.c | 283 +------- runtime/starpu/codelets/codelet_zgessm.c | 21 +- .../starpu/codelets/codelet_zgetrf_incpiv.c | 13 +- .../starpu/codelets/codelet_zgetrf_nopiv.c | 2 +- runtime/starpu/codelets/codelet_zlauum.c | 8 +- runtime/starpu/codelets/codelet_zpotrf.c | 11 +- runtime/starpu/codelets/codelet_zssssm.c | 2 +- runtime/starpu/codelets/codelet_ztrtri.c | 10 +- runtime/starpu/codelets/codelet_ztslqt.c | 166 +---- runtime/starpu/codelets/codelet_ztsmlq.c | 132 +--- runtime/starpu/codelets/codelet_ztsmqr.c | 684 +----------------- runtime/starpu/codelets/codelet_ztsqrt.c | 192 +---- runtime/starpu/codelets/codelet_ztstrf.c | 15 +- runtime/starpu/codelets/codelet_zunmlq.c | 113 +-- runtime/starpu/codelets/codelet_zunmqr.c | 107 +-- testing/CMakeLists.txt | 5 + timing/CMakeLists.txt | 7 +- 24 files changed, 114 insertions(+), 1861 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3784f1019..323a92151 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -789,6 +789,9 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/") ) link_directories(${MAGMA_LIBRARY_DIRS}) endif() + if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA) + list(APPEND CHAMELEON_DEP -lcudablas) + endif() list(APPEND CHAMELEON_DEP -lcoreblas @@ -842,6 +845,16 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/") #------------------------------------------------------------------------------ + ############################################################################### + # Cudablas library (kernels for CUDAs) # + ######################################## + + if(CHAMELEON_USE_CUDA) + add_subdirectory(cudablas) + endif() + #------------------------------------------------------------------------------ + + ############################################################################### # Main library # ################ diff --git a/control/common.h b/control/common.h index 539e5e1cc..283286e8b 100644 --- a/control/common.h +++ b/control/common.h @@ -81,8 +81,10 @@ #ifndef LAPACK_NAME #define LAPACK_NAME(a, b) lapackef77_##a #endif -#include "coreblas/include/lapacke.h" #include "coreblas/include/coreblas.h" +#if defined(CHAMELEON_USE_CUDA) +#include "cudablas/include/cudablas.h" +#endif #include "morse.h" diff --git a/example/basic_zposv/CMakeLists.txt b/example/basic_zposv/CMakeLists.txt index e545f7581..ec479cc05 100644 --- a/example/basic_zposv/CMakeLists.txt +++ b/example/basic_zposv/CMakeLists.txt @@ -69,6 +69,10 @@ endif() if(NOT CHAMELEON_SIMULATION) + if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA) + list(APPEND libs_for_examples + cudablas) + endif() if(CHAMELEON_USE_CUDA) list(APPEND libs_for_examples ${CUDA_LIBRARIES} diff --git a/example/lapack_to_morse/CMakeLists.txt b/example/lapack_to_morse/CMakeLists.txt index e531b183e..b40720e51 100644 --- a/example/lapack_to_morse/CMakeLists.txt +++ b/example/lapack_to_morse/CMakeLists.txt @@ -69,6 +69,10 @@ unset(libs_for_step0) if(NOT CHAMELEON_SIMULATION) + if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA) + list(APPEND libs_for_ltm + cudablas) + endif() if(CHAMELEON_USE_CUDA) list(APPEND libs_for_ltm ${CUDA_LIBRARIES} diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 6063f65d3..fbb9b504d 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -188,6 +188,11 @@ add_dependencies(chameleon_starpu control_include runtime_starpu_include ) +if (CHAMELEON_USE_CUDA) + add_dependencies(chameleon_starpu + cudablas_include + ) +endif() # installation # ------------ diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 3fcf9d85b..3d7bce440 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -3,24 +3,25 @@ * @copyright (c) 2009-2014 The University of Tennessee and The University * of Tennessee Research Foundation. * All rights reserved. - * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. * **/ /** * - * @file codelet_zpotrf.c + * @file codelet_zcallback.c * - * MAGMA codelets kernel - * MAGMA is a software package provided by Univ. of Tennessee, + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, * Univ. of California Berkeley and Univ. of Colorado Denver, * and INRIA Bordeaux Sud-Ouest * * @version 2.3.1 * @author Mathieu Faverge * @author Cedric Augonnet - * @date 2011-06-01 + * @author Florent Pruvost + * @date 2015-09-16 * @precisions normal z -> c d s * **/ diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c index 665e88e95..f268f6363 100644 --- a/runtime/starpu/codelets/codelet_zgelqt.c +++ b/runtime/starpu/codelets/codelet_zgelqt.c @@ -150,154 +150,6 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg) } #if defined(CHAMELEON_USE_MAGMA) -magma_int_t -magma_zgelqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, - magmaDoubleComplex *da, magma_int_t ldda, - magmaDoubleComplex *v, magma_int_t ldv, - magmaDoubleComplex *dt, magma_int_t lddt, - magmaDoubleComplex *t, magma_int_t ldt, - magmaDoubleComplex *dd, - magmaDoubleComplex *d, magma_int_t ldd, - magmaDoubleComplex *tau, - magmaDoubleComplex *hwork, - magmaDoubleComplex *dwork) -{ -#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) -#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) -#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1)) -#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1)) - - int i, k, ib, lddwork, old_i, old_ib, rows, cols; - double _Complex one=1.; - CUstream stream; - - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); - - if (m < 0) { - return -1; - } else if (n < 0) { - return -2; - } else if (ldda < max(1,m)) { - return -4; - } - - k = min(m,n); - if (k == 0) { - hwork[0] = *((magmaDoubleComplex*) &one); - return MAGMA_SUCCESS; - } - - lddwork= m; - - /* lower parts of little T must be zero: memset to 0 for simplicity */ - memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); - cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); - - /* copy first panel of A on the host */ - cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex), - da_ref(0, 0), ldda, - v, ldv); - - /* Use blocked code initially */ - for (i = 0; i < k; i += nb) { - - ib = min(k-i, nb); - if (i+nb >= m) ib = min(m-i, nb); - cols = n-i; - - if (i > 0){ - - /* copy panel of A from device to host */ - cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex), - da_ref(i, 0), ldda, - v, ldv); - - /* Apply H' to A(i+2*ib:m, i:n) from the right */ - rows = m-old_i-2*old_ib; - if (rows > 0){ - magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, - rows, n-old_i, old_ib, - da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, - da_ref(old_i+2*old_ib, old_i), ldda, - dwork, lddwork); - } - - /* copy the lower diag tile into d_A */ - magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib, - dd, ldd, da_ref(old_i, old_i), ldda, stream); - - } - - /* Form the triangular factor of the block reflector on the host - H = H'(i+ib-1) . . . H(i+1) H(i) */ - CORE_zgelqt(ib, cols, ib, - (double _Complex*) v_ref(0,i), ldv, - (double _Complex*) t_ref(0,0), ldt, - (double _Complex*) tau+i, - (double _Complex*) hwork); - - if ( i + ib < m ){ - /* put 0s in the lower triangular part of a panel (and 1s on the - diagonal); copy the lower triangular in d */ - CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib), - (double _Complex*) v_ref(0,i), ldv, - (double _Complex*) d, ldd); - - /* copy from host to device a tile diag */ - cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex), - d, ldd, dd, ldd ); - } - - /* Send the triangular factor T to the GPU */ - cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex), - t_ref(0,0), ldt, dt_ref(0,i), lddt ); - - /* A panel (with zeros in lower tri of its diag) is ready to be used - in input of zlarfb_gpu: we send the panel to the gpu */ - cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex), - v_ref(0,i), ldv, da_ref(i,i), ldda ); - - if (i + ib < m) { - - if (i+2*ib < m){ - rows = ib; - } - else{ - rows = m-i-ib; - } - /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */ - magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, - rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), - lddt, da_ref(i+ib,i), ldda, dwork, lddwork); - old_i = i; - old_ib = ib; - if (i+nb >= k){ - /* Apply H' to A(i+2*ib:m, i:n) from the right */ - rows = m-old_i-2*old_ib; - if (rows > 0){ - magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, - rows, cols, old_ib, - da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, - da_ref(old_i+2*old_ib, old_i), ldda, - dwork, lddwork); - } - /* copy the upper diag tile into d_A */ - magma_zgemerge_gpu(MagmaRight, MagmaUnit, old_ib, old_ib, - dd, ldd, da_ref(old_i, old_i), ldda, stream); - } - } - - } - -#undef da_ref -#undef v_ref -#undef dt_ref -#undef t_ref - - return MAGMA_SUCCESS; -} /* magma_zgelqt_gpu */ - static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -307,6 +159,7 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU; cuDoubleComplex *d_A, *d_T, *d_D, *d_W; int lda, ldt; + CUstream stream; starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work); @@ -326,11 +179,15 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) h_W = h_TAU + max(m,n); h_D = h_W + ib*ib; - magma_zgelqt_gpu( m, n, ib, - d_A, lda, h_A, ib, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W); + stream = starpu_cuda_get_local_stream(); + cublasSetKernelStream( stream ); + + CUDA_zgelqt( + m, n, ib, + d_A, lda, h_A, ib, + d_T, ldt, h_T, ib, + d_D, h_D, ib, h_TAU, + h_W, d_W, stream); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c index 7257676cd..880ca82ef 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -152,274 +152,6 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_MAGMA) - -#if defined(CHAMELEON_USE_CUBLAS_V2) -magma_int_t -magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag, - magma_int_t M, magma_int_t N, - magmaDoubleComplex *A, magma_int_t LDA, - magmaDoubleComplex *B, magma_int_t LDB, - CUstream stream) -{ - int i, j; - magmaDoubleComplex *cola, *colb; - cublasHandle_t handle; - cublasStatus_t stat; - - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - assert( stat == CUBLAS_STATUS_SUCCESS ); - } - - stat = cublasSetStream(handle, stream); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("cublasSetStream failed\n"); - assert( stat == CUBLAS_STATUS_SUCCESS ); - } - - if (M < 0) { - return -1; - } - if (N < 0) { - return -2; - } - if ( (LDA < max(1,M)) && (M > 0) ) { - return -5; - } - if ( (LDB < max(1,M)) && (M > 0) ) { - return -7; - } - - if (side == MagmaLeft){ - for(i=0; i<N; i++){ - cola = A + i*LDA; - colb = B + i*LDB; -// cublasZcopy(handle, i+1, cola, 1, colb, 1); - cudaMemcpyAsync(colb , cola, - (i+1)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); - } - }else{ - for(i=0; i<N; i++){ - cola = A + i*LDA; - colb = B + i*LDB; -// cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1); - cudaMemcpyAsync(colb+i , cola+i, - (M-i)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); - } - } - - cublasDestroy(handle); - - return MAGMA_SUCCESS; -} -#else /* CHAMELEON_USE_CUBLAS_V2 */ -magma_int_t -magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag, - magma_int_t M, magma_int_t N, - magmaDoubleComplex *A, magma_int_t LDA, - magmaDoubleComplex *B, magma_int_t LDB, - CUstream stream) -{ - int i, j; - magmaDoubleComplex *cola, *colb; - - if (M < 0) { - return -1; - } - if (N < 0) { - return -2; - } - if ( (LDA < max(1,M)) && (M > 0) ) { - return -5; - } - if ( (LDB < max(1,M)) && (M > 0) ) { - return -7; - } - - if (side == MagmaLeft){ - for(i=0; i<N; i++){ - cola = A + i*LDA; - colb = B + i*LDB; -// cublasZcopy(i+1, cola, 1, colb, 1); - cudaMemcpyAsync(colb , cola, - (i+1)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); - } - }else{ - for(i=0; i<N; i++){ - cola = A + i*LDA; - colb = B + i*LDB; -// cublasZcopy(M-i, cola + i, 1, colb + i, 1); - cudaMemcpyAsync(colb+i , cola+i, - (M-i)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); - } - } - - return MAGMA_SUCCESS; -} -#endif /* CHAMELEON_USE_CUBLAS_V2 */ - -magma_int_t -magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, - magmaDoubleComplex *da, magma_int_t ldda, - magmaDoubleComplex *v, magma_int_t ldv, - magmaDoubleComplex *dt, magma_int_t lddt, - magmaDoubleComplex *t, magma_int_t ldt, - magmaDoubleComplex *dd, - magmaDoubleComplex *d, magma_int_t ldd, - magmaDoubleComplex *tau, - magmaDoubleComplex *hwork, - magmaDoubleComplex *dwork) -{ -#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) -#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) -#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1)) -#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1)) - - int i, k, ib, lddwork, old_i, old_ib, rows, cols; - double _Complex one=1.; - CUstream stream; - - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); -// int lwkopt = n * nb; -// hwork[0] = *((magmaDoubleComplex*) &lwkopt); - - if (m < 0) { - return -1; - } else if (n < 0) { - return -2; - } else if (ldda < max(1,m)) { - return -4; - } - - k = min(m,n); - if (k == 0) { - hwork[0] = *((magmaDoubleComplex*) &one); - return MAGMA_SUCCESS; - } - - lddwork= k; - - /* lower parts of little T must be zero: memset to 0 for simplicity */ - memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex)); - cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream); - - /* copy first panel of A on the host */ -// cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex), -// da_ref(0, 0), ldda, -// v, ldv); - cudaMemcpy( v, da_ref(0,0), - m*min(nb,n)*sizeof(magmaDoubleComplex), - cudaMemcpyDeviceToHost); - - /* Use blocked code initially */ - for (i = 0; i < k; i += nb) { - - ib = min(k-i, nb); - if (i+nb>=n) ib = min(n-i, nb); - rows = m-i; - - if (i>0){ - - /* copy panel of A from device to host */ -// cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex), -// da_ref(0, i), ldda, -// v, ldv); - /* copy panel of A from device to host */ - cudaMemcpy( v, da_ref(0,i), - m*ib*sizeof(magmaDoubleComplex), - cudaMemcpyDeviceToHost); - - /* Apply H' to A(i:m,i+2*ib:n) from the left */ - cols = n-old_i-2*old_ib; - if (cols > 0){ - magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, - m-old_i, cols, old_ib, - da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, - da_ref(old_i, old_i+2*old_ib), ldda, - dwork, cols); - } - - /* copy the upper diag tile into d_A */ - magma_zgemerge_gpu(MagmaLeft, MagmaUnit, old_ib, old_ib, - dd, ldd, da_ref(old_i, old_i), ldda, stream); - - } - - /* Form the triangular factor of the block reflector on the host - H = H(i) H(i+1) . . . H(i+ib-1) */ - CORE_zgeqrt(rows, ib, ib, - (double _Complex*) v_ref(i,0), ldv, - (double _Complex*) t_ref(0,0), ldt, - (double _Complex*) tau+i, - (double _Complex*) hwork); - - if ( i + ib < n ){ - /* put 0s in the upper triangular part of a panel (and 1s on the - diagonal); copy the upper triangular in d */ - CORE_zgesplit(MorseLeft, MorseUnit, min(rows,ib), ib, - (double _Complex*) v_ref(i,0), ldv, - (double _Complex*) d, ldd); - - /* copy from host to device a tile diag */ - cublasSetMatrix( min(rows,ib), ib, sizeof(magmaDoubleComplex), - d, ldd, dd, ldd ); - } - - /* Send the triangular factor T to the GPU */ - cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex), - t_ref(0,0), ldt, dt_ref(0,i), lddt ); - - /* A panel (with zeros in upper tri of its diag) is ready to be used - in input of zlarfb_gpu: we send the panel to the gpu */ - cublasSetMatrix( rows, ib, sizeof(magmaDoubleComplex), - v_ref(i,0), ldv, da_ref(i,i), ldda ); - - if (i + ib < n) { - - if (i+2*ib < n){ - cols = ib; - } - else{ - cols = n-i-ib; - } - /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ - magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, - rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), - lddt, da_ref(i,i+ib), ldda, dwork, cols); - old_i = i; - old_ib = ib; - if (i+nb>=k){ - /* Apply H' to A(i:m,i+2*ib:n) from the left */ - cols = n-old_i-2*old_ib; - if (cols > 0){ - magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, - rows, cols, old_ib, - da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, - da_ref(old_i, old_i+2*old_ib), ldda, - dwork, cols); - } - /* copy the upper diag tile into d_A */ - magma_zgemerge_gpu(MagmaLeft, MagmaUnit, old_ib, old_ib, - dd, ldd, da_ref(old_i, old_i), ldda, stream); - } - } - - } - -#undef da_ref -#undef v_ref -#undef dt_ref -#undef t_ref - - return MAGMA_SUCCESS; -} /* magma_zgeqrt_gpu */ - static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -429,6 +161,7 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU; cuDoubleComplex *d_A, *d_T, *d_D, *d_W; int lda, ldt; + CUstream stream; starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work); @@ -448,11 +181,15 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) h_W = h_TAU + max(m,n); h_D = h_W + ib*ib; - magma_zgeqrt_gpu( m, n, ib, - d_A, lda, h_A, m, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W); + stream = starpu_cuda_get_local_stream(); + cublasSetKernelStream( stream ); + + CUDA_zgeqrt( + m, n, ib, + d_A, lda, h_A, m, + d_T, ldt, h_T, ib, + d_D, h_D, ib, h_TAU, + h_W, d_W, stream); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index a63c803d0..7a74296e5 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -146,9 +146,6 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *dL, *dD, *dA; int lddl, lddd, ldda; int info = 0; - - int ret; - /* * hwork => nb*nb */ @@ -157,21 +154,9 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg) dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda); - /* The kernel is just using the inverted part or nothing */ - if ( lddl >= 2*ib ) { - dL += ib; - ret = magma_zgessm_gpu( MagmaColMajor, m, n, k, ib, - IPIV, dL, lddl, dD, lddd, dA, ldda, &info ); - } - else { - ret = magma_zgessm_gpu( MagmaColMajor, m, n, k, ib, - IPIV, NULL, 1, dD, lddd, dA, ldda, &info ); - } - - if (ret != MAGMA_SUCCESS) { - fprintf(stderr, "Error in MAGMA: %d\n", ret); - exit(-1); - } + CUDA_zgessm( + MagmaColMajor, m, n, k, ib, + IPIV, dL, lddl, dD, lddd, dA, ldda, &info ); cudaThreadSynchronize(); diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c index 489a48665..acd0ee798 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c @@ -214,12 +214,13 @@ static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg) hL = NULL; } - magma_zgetrf_incpiv_gpu( MagmaColMajor, m, n, ib, - hA, lda, dA, lda, - hL, ib, dL, ldl, - IPIV, - dwork, lda, - &info ); + CUDA_zgetrf_incpiv( + MagmaColMajor, m, n, ib, + hA, lda, dA, lda, + hL, ib, dL, ldl, + IPIV, + dwork, lda, + &info ); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c index 3db007331..0792adaba 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c @@ -136,7 +136,7 @@ static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo); dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - magma_zgetrf_nopiv_gpu( m, n, dA, lda, &info ); + CUDA_zgetrf_nopiv( m, n, dA, lda, &info ); cudaThreadSynchronize(); } #endif diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index 0c3ba14bd..2e206cc13 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -77,7 +77,6 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) static void cl_zlauum_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; - int ret; int info = 0; int N; cuDoubleComplex *A; @@ -85,12 +84,7 @@ static void cl_zlauum_cuda_func(void *descr[], void *cl_arg) A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA); - ret = magma_zlauum_gpu( uplo, N, A, LDA, &info); - if (ret != MAGMA_SUCCESS) { - fprintf(stderr, "Error in MAGMA: %d\n", ret); - exit(-1); - } - + CUDA_zlauum( uplo, N, A, LDA, &info); cudaThreadSynchronize(); return; } diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 939a9e1da..705c81f77 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -89,8 +89,6 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg) /* cuDoubleComplex *hA; */ int lda; int iinfo; - - int ret; int info = 0; A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); @@ -107,14 +105,7 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg) // exit(-1); // } - ret = magma_zpotrf_gpu( - uplo, - n, A, lda, &info); -/* hA, stream );*/ - if (ret != MAGMA_SUCCESS) { - fprintf(stderr, "Error in MAGMA: %d\n", ret); - exit(-1); - } + CUDA_zpotrf( uplo, n, A, lda, &info); cudaThreadSynchronize(); // cudaStreamDestroy( stream[1] ); diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c index 019daf9f1..6837607a8 100644 --- a/runtime/starpu/codelets/codelet_zssssm.c +++ b/runtime/starpu/codelets/codelet_zssssm.c @@ -202,7 +202,7 @@ static void cl_zssssm_cuda_func(void *descr[], void *cl_arg) dL1 += ib; } - magma_zssssm_gpu( + CUDA_zssssm( MagmaColMajor, m1, n1, m2, n2, k, ib, dA1, lda1, dA2, lda2, dL1, ldl1, dL2, ldl2, diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 4f7145c59..20f6ff44c 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -90,19 +90,11 @@ static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *A; int LDA; int iinfo; - - int ret; int info = 0; A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo); - ret = magma_ztrtri_gpu( uplo, diag, - N, A, LDA, &info); - if (ret != MAGMA_SUCCESS) { - fprintf(stderr, "Error in MAGMA: %d\n", ret); - exit(-1); - } - + CUDA_ztrtri( uplo, diag, N, A, LDA, &info); cudaThreadSynchronize(); return; } diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c index 6957cce31..264d50c51 100644 --- a/runtime/starpu/codelets/codelet_ztslqt.c +++ b/runtime/starpu/codelets/codelet_ztslqt.c @@ -171,159 +171,6 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg) } #if defined(CHAMELEON_USE_MAGMA) - -magma_int_t -magma_ztslqt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, - magmaDoubleComplex *da1, magma_int_t ldda1, - magmaDoubleComplex *da2, magma_int_t ldda2, - magmaDoubleComplex *a2, magma_int_t lda2, - magmaDoubleComplex *dt, magma_int_t lddt, - magmaDoubleComplex *t, magma_int_t ldt, - magmaDoubleComplex *dd, - magmaDoubleComplex *d, magma_int_t ldd, - magmaDoubleComplex *tau, - magmaDoubleComplex *hwork, - magmaDoubleComplex *dwork, - CUstream stream) -{ -#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1)) -#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1)) -#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1)) -#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1)) -#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1)) -#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1)) - - int i, k, lddwork, old_i, old_ib, rows, cols; - int ib; - double _Complex one=1.; - - if (m < 0) { - return -1; - } else if (n < 0) { - return -2; - } else if (ldda2 < max(1,m)) { - return -4; - } - - k = min(m,n); - if (k == 0) { - hwork[0] = *((magmaDoubleComplex*) &one); - return MAGMA_SUCCESS; - } - - lddwork= m; - - /* lower parts of little T must be zero: memset all to 0 for simplicity */ - memset(t, 0, nb*n*sizeof(magmaDoubleComplex)); - cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex)); - - k = min(m, nb); // m can be lower than IB - /* copy the first diag tile of A1 from device to host: da1 -> d */ - cublasGetMatrix(k, k, sizeof(magmaDoubleComplex), - da1_ref(0, 0), ldda1, - d, ldd); - - /* copy first panel of A2 from device to host: da2 -> a2 */ - cublasGetMatrix(k, n, sizeof(magmaDoubleComplex), - da2_ref(0, 0), ldda2, - a2, lda2); - - /* This is only blocked code for now */ - for (i = 0; i < m; i += nb) { - - ib = min(m-i, nb); - cols = n; - - /* Send the next panel (diagonal block of A1 & block column of A2) - to the CPU (in work_a1 and work_a2) and apply tsmqr update on the - remaining non updated panels */ - if (i>0) { - - /* copy the diag tile of A1 from device to host: da1 -> d */ - cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex), - da1_ref(i, i), ldda1, - d, ldd); - - /* copy panel of A2 from device to host: da2 -> a2 */ - cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex), - da2_ref(i, 0), ldda2, - a2, lda2); - - /* Apply H' to A(i+2*ib:m,i:n) from the left */ - rows = m-old_i-2*old_ib; - if (rows > 0){ - magma_ztsmlq_gpu( MagmaRight, MagmaConjTrans, - rows, old_ib, rows, cols, old_ib, old_ib, - da1_ref(old_i+2*old_ib, old_i), ldda1, - da2_ref(old_i+2*old_ib, 0), ldda2, - da2_ref(old_i, 0), ldda2, - dt_ref(0, old_i), lddt, - dwork, lddwork, - dwork + nb * lddwork, nb, - stream ); - } - - } - - /* compute LQ factorization of the panel of A2 ib x cols */ - CORE_ztslqt(ib, cols, ib, - (double _Complex*) d, ldd, - (double _Complex*) a2, lda2, - (double _Complex*) t, ldt, - (double _Complex*) tau, - (double _Complex*) hwork); - - /* Send the panel from A2 back to the GPU */ - cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex), - a2, lda2, - da2_ref(i, 0), ldda2); - - /* Send the triangular factor T from hwork to the GPU */ - cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), - t, ldt, - dt_ref(0, i), lddt); - - /* get back the diag tile in A1 from host to device: d -> da1 */ - cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), - d, ldd, - da1_ref(i, i), ldda1); - - /* tsmlq update on one panel forward (look ahead 1) */ - if (i + ib < m) { - - if (i+2*ib < m){ - rows = ib; - } - else{ - rows = m-i-ib; - } - - /* Apply H' to A(i+ib:i+2*ib,i:n) from the right */ - magma_ztsmlq_gpu( MagmaRight, MagmaConjTrans, - rows, ib, rows, cols, ib, ib, - da1_ref(i+ib, i), ldda1, - da2_ref(i+ib, 0), ldda2, - da2_ref(i, 0), ldda2, - dt_ref(0, i), lddt, - dwork, lddwork, - dwork + nb * lddwork, nb, - stream ); - old_i = i; - old_ib = ib; - } - } - -#undef da1_ref -#undef da2_ref -#undef a2_ref -#undef t_ref -#undef dt_ref -#undef d_ref - - return MAGMA_SUCCESS; - -} /* magma_ztslqt_gpu */ - static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -353,12 +200,13 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) h_D = h_W + ib*m; stream = starpu_cuda_get_local_stream(); - magma_ztslqt_gpu( m, n, ib, - d_A1, lda1, d_A2, lda2, - h_A2, ib, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W, stream); + CUDA_ztslqt( + m, n, ib, + d_A1, lda1, d_A2, lda2, + h_A2, ib, + d_T, ldt, h_T, ib, + d_D, h_D, ib, h_TAU, + h_W, d_W, stream); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c index 03b74668e..0b78ac57d 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq.c +++ b/runtime/starpu/codelets/codelet_ztsmlq.c @@ -213,135 +213,6 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg) } #if defined(CHAMELEON_USE_MAGMA) - -magma_int_t -magma_ztsmlq_gpu( magma_side_t side, magma_trans_t trans, - magma_int_t M1, magma_int_t N1, - magma_int_t M2, magma_int_t N2, - magma_int_t K, magma_int_t IB, - magmaDoubleComplex *A1, magma_int_t LDA1, - magmaDoubleComplex *A2, magma_int_t LDA2, - const magmaDoubleComplex *V, magma_int_t LDV, - const magmaDoubleComplex *T, magma_int_t LDT, - magmaDoubleComplex *WORK, magma_int_t LDWORK, - magmaDoubleComplex *WORKC, magma_int_t LDWORKC, - CUstream stream) -{ - int i, i1, i3; - int NW; - int kb; - int ic = 0; - int jc = 0; - int mi = M1; - int ni = N1; - - /* Check input arguments */ - if ((side != MagmaLeft) && (side != MagmaRight)) { - return -1; - } - - /* NW is the minimum dimension of WORK */ - if (side == MagmaLeft) { - NW = IB; - } - else { - NW = N1; - } - - if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { - return -2; - } - if (M1 < 0) { - return -3; - } - if (N1 < 0) { - return -4; - } - if ( (M2 < 0) || - ( (M2 != M1) && (side == MagmaRight) ) ){ - return -5; - } - if ( (N2 < 0) || - ( (N2 != N1) && (side == MagmaLeft) ) ){ - return -6; - } - if ((K < 0) || - ( (side == MagmaLeft) && (K > M1) ) || - ( (side == MagmaRight) && (K > N1) ) ) { - return -7; - } - if (IB < 0) { - return -8; - } - if (LDA1 < max(1,M1)){ - return -10; - } - if (LDA2 < max(1,M2)){ - return -12; - } - if (LDV < max(1,K)){ - return -14; - } - if (LDT < max(1,IB)){ - return -16; - } - if (LDWORK < max(1,NW)){ - return -18; - } - - /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) - return MAGMA_SUCCESS; - - if (((side == MagmaLeft) && (trans == MagmaNoTrans)) - || ((side == MagmaRight) && (trans != MagmaNoTrans))) { - i1 = 0; - i3 = IB; - } - else { - i1 = ((K-1) / IB)*IB; - i3 = -IB; - } - - if (trans == MagmaNoTrans) { - trans = MagmaConjTrans; - } - else { - trans = MagmaNoTrans; - } - - for(i = i1; (i > -1) && (i < K); i += i3) { - kb = min(IB, K-i); - - if (side == MagmaLeft) { - /* - * H or H' is applied to C(i:m,1:n) - */ - mi = M1 - i; - ic = i; - } - else { - /* - * H or H' is applied to C(1:m,i:n) - */ - ni = N1 - i; - jc = i; - } - - /* - * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) - */ - magma_zparfb_gpu( side, trans, MagmaForward, MagmaRowwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, stream ); - } - return MAGMA_SUCCESS; -} - static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -380,7 +251,7 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) stream = starpu_cuda_get_local_stream(); cublasSetKernelStream( stream ); - magma_ztsmlq_gpu( side, trans, m1, n1, m2, n2, k, ib, + CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, W, ldwork, WC, ldworkc, stream ); @@ -388,7 +259,6 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) cudaStreamSynchronize( stream ); #endif } - #endif /* diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index 46a586181..56dee7ab5 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -242,683 +242,6 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_MAGMA) - -#if defined(CHAMELEON_USE_CUBLAS_V2) -magma_int_t -magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, - magma_direct_t direct, magma_storev_t storev, - magma_int_t M1, magma_int_t N1, - magma_int_t M2, magma_int_t N2, - magma_int_t K, magma_int_t L, - magmaDoubleComplex *A1, magma_int_t LDA1, - magmaDoubleComplex *A2, magma_int_t LDA2, - const magmaDoubleComplex *V, magma_int_t LDV, - const magmaDoubleComplex *T, magma_int_t LDT, - magmaDoubleComplex *WORK, magma_int_t LDWORK, - magmaDoubleComplex *WORKC, magma_int_t LDWORKC, - CUstream stream) - -{ -#if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); - cuDoubleComplex zone = make_cuDoubleComplex(1.0, 0.0); - cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0); -#else - double zzero = 0.0; - double zone = 1.0; - double mzone = -1.0; -#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ - - int j; - magma_trans_t transW; - magma_trans_t transA2; - cublasHandle_t handle; - cublasStatus_t stat; - cublasOperation_t cublasTrans; - cublasOperation_t cublasTransW; - cublasOperation_t cublasTransA2; - - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - assert( stat == CUBLAS_STATUS_SUCCESS ); - } - - stat = cublasSetStream(handle, stream); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("cublasSetStream failed\n"); - assert( stat == CUBLAS_STATUS_SUCCESS ); - } - - if (trans == MagmaNoTrans){ - cublasTrans = CUBLAS_OP_N; - }else if(trans == MagmaTrans){ - cublasTrans = CUBLAS_OP_T; - }else if(trans == MagmaConjTrans){ - cublasTrans = CUBLAS_OP_C; - }else{ - fprintf(stderr, "Error in magma_zparfb_gpu: bad trans parameter %d\n", trans); - } - - /* Check input arguments */ - if ((side != MagmaLeft) && (side != MagmaRight)) { - return -1; - } - if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { - return -2; - } - if ((direct != MagmaForward) && (direct != MagmaBackward)) { - return -3; - } - if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) { - return -4; - } - if (M1 < 0) { - return -5; - } - if (N1 < 0) { - return -6; - } - if ((M2 < 0) || - ( (side == MagmaRight) && (M1 != M2) ) ) { - return -7; - } - if ((N2 < 0) || - ( (side == MagmaLeft) && (N1 != N2) ) ) { - return -8; - } - if (K < 0) { - return -9; - } - - /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) - return MAGMA_SUCCESS; - - if (direct == MagmaForward) { - - if (side == MagmaLeft) { - - /* - * Column or Rowwise / Forward / Left - * ---------------------------------- - * - * Form H * A or H' * A where A = ( A1 ) - * ( A2 ) - */ - - /* - * W = A1 + V' * A2: - * W = A1 - * W = W + V' * A2 - * - */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), - K * sizeof(cuDoubleComplex), N1, - cudaMemcpyDeviceToDevice, stream ); - - transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; - transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; - - if (transW == MagmaNoTrans){ - cublasTransW = CUBLAS_OP_N; - }else if(transW == MagmaTrans){ - cublasTransW = CUBLAS_OP_T; - }else if(transW == MagmaConjTrans){ - cublasTransW = CUBLAS_OP_C; - }else{ - fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW); - } - if (transA2 == MagmaNoTrans){ - cublasTransA2 = CUBLAS_OP_N; - }else if(transA2 == MagmaTrans){ - cublasTransA2 = CUBLAS_OP_T; - }else if(transA2 == MagmaConjTrans){ - cublasTransA2 = CUBLAS_OP_C; - }else{ - fprintf(stderr, "Error in magma_zparfb_gpu: bad transA2 parameter %d\n", transA2); - } - - cublasZgemm(handle, cublasTransW, CUBLAS_OP_N, - K, N1, M2, - (const cuDoubleComplex *) &zone, - (const cuDoubleComplex*)V /* K*M2 */, LDV, - (const cuDoubleComplex*)A2 /* M2*N1 */, LDA2, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); - - if (WORKC == NULL) { - /* W = op(T) * W */ - cublasZtrmm( handle, - CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, - cublasTrans, CUBLAS_DIAG_NON_UNIT, - K, N2, - (const cuDoubleComplex *) &zone, - (const cuDoubleComplex*)T, LDT, - (cuDoubleComplex*)WORK, LDWORK, - (cuDoubleComplex*)WORK, LDWORK); - - - /* A1 = A1 - W = A1 - op(T) * W */ - for(j = 0; j < N1; j++) { - cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex*)(WORK + LDWORK*j), 1, - (cuDoubleComplex*)(A1 + LDA1*j), 1); - } - - /* A2 = A2 - op(V) * W */ - cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N, - M2, N2, K, - (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex*)V /* M2*K */, LDV, - (const cuDoubleComplex*)WORK /* K*N2 */, LDWORK, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); - - } else { - /* Wc = V * op(T) */ - cublasZgemm( handle, cublasTransA2, cublasTrans, - M2, K, K, - (const cuDoubleComplex *) &zone, V, LDV, - T, LDT, - (const cuDoubleComplex *) &zzero, WORKC, LDWORKC ); - - /* A1 = A1 - opt(T) * W */ - cublasZgemm( handle, cublasTrans, CUBLAS_OP_N, - K, N1, K, - (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex *)T, LDT, - (const cuDoubleComplex *)WORK, LDWORK, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex*)A1, LDA1 ); - - /* A2 = A2 - Wc * W */ - cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, - M2, N2, K, - (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex *)WORKC, LDWORKC, - (const cuDoubleComplex *)WORK, LDWORK, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex *)A2, LDA2 ); - } - } - else { - /* - * Column or Rowwise / Forward / Right - * ----------------------------------- - * - * Form H * A or H' * A where A = ( A1 A2 ) - * - */ - - /* - * W = A1 + A2 * V': - * W = A1 - * W = W + A2 * V' - * - */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), - M1 * sizeof(cuDoubleComplex), K, - cudaMemcpyDeviceToDevice, stream ); - - transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; - transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; - - if (transW == MagmaNoTrans){ - cublasTransW = CUBLAS_OP_N; - }else if(transW == MagmaTrans){ - cublasTransW = CUBLAS_OP_T; - }else if(transW == MagmaConjTrans){ - cublasTransW = CUBLAS_OP_C; - }else{ - fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW); - } - if (transA2 == MagmaNoTrans){ - cublasTransA2 = CUBLAS_OP_N; - }else if(transA2 == MagmaTrans){ - cublasTransA2 = CUBLAS_OP_T; - }else if(transA2 == MagmaConjTrans){ - cublasTransA2 = CUBLAS_OP_C; - }else{ - fprintf(stderr, "Error in magma_zparfb_gpu: bad transA2 parameter %d\n", transA2); - } - - cublasZgemm(handle, CUBLAS_OP_N, cublasTransW, - M1, K, N2, - (const cuDoubleComplex *) &zone, - (const cuDoubleComplex*)A2 /* M1*N2 */, LDA2, - (const cuDoubleComplex*)V /* N2*K */, LDV, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex*)WORK /* M1*K */, LDWORK); - - if (WORKC == NULL) { - /* W = W * op(T) */ - cublasZtrmm( handle, - CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, - cublasTrans, CUBLAS_DIAG_NON_UNIT, - M2, K, - (const cuDoubleComplex *) &zone, - (const cuDoubleComplex*)T, LDT, - (cuDoubleComplex*)WORK, LDWORK, - (cuDoubleComplex*)WORK, LDWORK); - - - /* A1 = A1 - W = A1 - W * op(T) */ - for(j = 0; j < K; j++) { - cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex*)(WORK + LDWORK*j), 1, - (cuDoubleComplex*)(A1 + LDA1*j), 1); - } - - /* A2 = A2 - W * op(V) */ - cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2, - M2, N2, K, - (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex*)WORK /* M2*K */, LDWORK, - (const cuDoubleComplex*)V /* K*N2 */, LDV, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex*)A2 /* M2*N2 */, LDA2); - - } else { - /* A1 = A1 - W * opt(T) */ - cublasZgemm( handle, CUBLAS_OP_N, cublasTrans, - M1, K, K, - (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex *)WORK, LDWORK, - (const cuDoubleComplex *)T, LDT, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex *)A1, LDA1 ); - - /* Wc = op(T) * V */ - cublasZgemm( handle, cublasTrans, cublasTransA2, - K, N2, K, - (const cuDoubleComplex *) &zone, - (const cuDoubleComplex *)T, LDT, - (const cuDoubleComplex *)V, LDV, - (const cuDoubleComplex *) &zzero, - (cuDoubleComplex *)WORKC, LDWORKC ); - - /* A2 = A2 - W * Wc */ - cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, - M2, N2, K, - (const cuDoubleComplex *) &mzone, - (const cuDoubleComplex *)WORK, LDWORK, - (const cuDoubleComplex *)WORKC, LDWORKC, - (const cuDoubleComplex *) &zone, - (cuDoubleComplex *)A2, LDA2 ); - } - } - } - else { - fprintf(stderr, "Not implemented (Backward / Left or Right)"); - return MAGMA_ERR_NOT_SUPPORTED; - } - - cublasDestroy(handle); - - return MAGMA_SUCCESS; -} -#else /* CHAMELEON_USE_CUBLAS_V2 */ -magma_int_t -magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, - magma_direct_t direct, magma_storev_t storev, - magma_int_t M1, magma_int_t N1, - magma_int_t M2, magma_int_t N2, - magma_int_t K, magma_int_t L, - magmaDoubleComplex *A1, magma_int_t LDA1, - magmaDoubleComplex *A2, magma_int_t LDA2, - const magmaDoubleComplex *V, magma_int_t LDV, - const magmaDoubleComplex *T, magma_int_t LDT, - magmaDoubleComplex *WORK, magma_int_t LDWORK, - magmaDoubleComplex *WORKC, magma_int_t LDWORKC, - CUstream stream) -{ -#if defined(PRECISION_z) || defined(PRECISION_c) - cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); - cuDoubleComplex zone = make_cuDoubleComplex(1.0, 0.0); - cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0); -#else - double zzero = 0.0; - double zone = 1.0; - double mzone = -1.0; -#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ - - int j; - magma_trans_t transW; - magma_trans_t transA2; - - /* Check input arguments */ - if ((side != MagmaLeft) && (side != MagmaRight)) { - return -1; - } - if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { - return -2; - } - if ((direct != MagmaForward) && (direct != MagmaBackward)) { - return -3; - } - if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) { - return -4; - } - if (M1 < 0) { - return -5; - } - if (N1 < 0) { - return -6; - } - if ((M2 < 0) || - ( (side == MagmaRight) && (M1 != M2) ) ) { - return -7; - } - if ((N2 < 0) || - ( (side == MagmaLeft) && (N1 != N2) ) ) { - return -8; - } - if (K < 0) { - return -9; - } - - /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) - return MAGMA_SUCCESS; - - if (direct == MagmaForward) { - - if (side == MagmaLeft) { - - /* - * Column or Rowwise / Forward / Left - * ---------------------------------- - * - * Form H * A or H' * A where A = ( A1 ) - * ( A2 ) - */ - - /* - * W = A1 + V' * A2: - * W = A1 - * W = W + V' * A2 - * - */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), - K * sizeof(cuDoubleComplex), N1, - cudaMemcpyDeviceToDevice, stream ); - - transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; - transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; - - cublasZgemm(morse_lapack_const(transW), morse_lapack_const(MagmaNoTrans), - K, N1, M2, - zone, - (cuDoubleComplex*)V /* K*M2 */, LDV, - (cuDoubleComplex*)A2 /* M2*N1 */, LDA2, - zone, - (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); - - if (WORKC == NULL) { - /* W = op(T) * W */ - cublasZtrmm( morse_lapack_const(MagmaLeft), morse_lapack_const(MagmaUpper), - morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit), - K, N2, - zone, - (cuDoubleComplex*)T, LDT, - (cuDoubleComplex*)WORK, LDWORK); - - - /* A1 = A1 - W = A1 - op(T) * W */ - for(j = 0; j < N1; j++) { - cublasZaxpy(K, mzone, - (cuDoubleComplex*)(WORK + LDWORK*j), 1, - (cuDoubleComplex*)(A1 + LDA1*j), 1); - } - - /* A2 = A2 - op(V) * W */ - cublasZgemm(morse_lapack_const(transA2), morse_lapack_const(MagmaNoTrans), - M2, N2, K, - mzone, - (cuDoubleComplex*)V /* M2*K */, LDV, - (cuDoubleComplex*)WORK /* K*N2 */, LDWORK, - zone, - (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); - - } else { - /* Wc = V * op(T) */ - cublasZgemm( morse_lapack_const(transA2), morse_lapack_const(trans), - M2, K, K, - zone, V, LDV, - T, LDT, - zzero, WORKC, LDWORKC ); - - /* A1 = A1 - opt(T) * W */ - cublasZgemm( morse_lapack_const(trans), morse_lapack_const(MagmaNoTrans), - K, N1, K, - mzone, T, LDT, - WORK, LDWORK, - zone, A1, LDA1 ); - - /* A2 = A2 - Wc * W */ - cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans), - M2, N2, K, - mzone, WORKC, LDWORKC, - WORK, LDWORK, - zone, A2, LDA2 ); - } - } - else { - /* - * Column or Rowwise / Forward / Right - * ----------------------------------- - * - * Form H * A or H' * A where A = ( A1 A2 ) - * - */ - - /* - * W = A1 + A2 * V': - * W = A1 - * W = W + A2 * V' - * - */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), - M1 * sizeof(cuDoubleComplex), K, - cudaMemcpyDeviceToDevice, stream ); - - transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; - transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; - - cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transW), - M1, K, N2, - zone, - (cuDoubleComplex*)A2 /* M1*N2 */, LDA2, - (cuDoubleComplex*)V /* N2*K */, LDV, - zone, - (cuDoubleComplex*)WORK /* M1*K */, LDWORK); - - if (WORKC == NULL) { - /* W = W * op(T) */ - cublasZtrmm( morse_lapack_const(MagmaRight), morse_lapack_const(MagmaUpper), - morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit), - M2, K, - zone, - (cuDoubleComplex*)T, LDT, - (cuDoubleComplex*)WORK, LDWORK); - - - /* A1 = A1 - W = A1 - W * op(T) */ - for(j = 0; j < K; j++) { - cublasZaxpy(M1, mzone, - (cuDoubleComplex*)(WORK + LDWORK*j), 1, - (cuDoubleComplex*)(A1 + LDA1*j), 1); - } - - /* A2 = A2 - W * op(V) */ - cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transA2), - M2, N2, K, - mzone, - (cuDoubleComplex*)WORK /* M2*K */, LDWORK, - (cuDoubleComplex*)V /* K*N2 */, LDV, - zone, - (cuDoubleComplex*)A2 /* M2*N2 */, LDA2); - - } else { - /* A1 = A1 - W * opt(T) */ - cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(trans), - M1, K, K, - mzone, WORK, LDWORK, - T, LDT, - zone, A1, LDA1 ); - - /* Wc = op(T) * V */ - cublasZgemm( morse_lapack_const(trans), morse_lapack_const(transA2), - K, N2, K, - zone, T, LDT, - V, LDV, - zzero, WORKC, LDWORKC ); - - /* A2 = A2 - W * Wc */ - cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans), - M2, N2, K, - mzone, WORK, LDWORK, - WORKC, LDWORKC, - zone, A2, LDA2 ); - } - } - } - else { - fprintf(stderr, "Not implemented (Backward / Left or Right)"); - return MAGMA_ERR_NOT_SUPPORTED; - } - - return MAGMA_SUCCESS; -} -#endif /* CHAMELEON_USE_CUBLAS_V2 */ - -magma_int_t -magma_ztsmqr_gpu( magma_side_t side, magma_trans_t trans, - magma_int_t M1, magma_int_t N1, - magma_int_t M2, magma_int_t N2, - magma_int_t K, magma_int_t IB, - magmaDoubleComplex *A1, magma_int_t LDA1, - magmaDoubleComplex *A2, magma_int_t LDA2, - const magmaDoubleComplex *V, magma_int_t LDV, - const magmaDoubleComplex *T, magma_int_t LDT, - magmaDoubleComplex *WORK, magma_int_t LDWORK, - magmaDoubleComplex *WORKC, magma_int_t LDWORKC, - CUstream stream) -{ - int i, i1, i3; - int NQ, NW; - int kb; - int ic = 0; - int jc = 0; - int mi = M1; - int ni = N1; - - /* Check input arguments */ - if ((side != MagmaLeft) && (side != MagmaRight)) { - return -1; - } - - /* NQ is the order of Q */ - if (side == MagmaLeft) { - NQ = M2; - NW = IB; - } - else { - NQ = N2; - NW = M1; - } - - if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { - return -2; - } - if (M1 < 0) { - return -3; - } - if (N1 < 0) { - return -4; - } - if ( (M2 < 0) || - ( (M2 != M1) && (side == MagmaRight) ) ){ - return -5; - } - if ( (N2 < 0) || - ( (N2 != N1) && (side == MagmaLeft) ) ){ - return -6; - } - if ((K < 0) || - ( (side == MagmaLeft) && (K > M1) ) || - ( (side == MagmaRight) && (K > N1) ) ) { - return -7; - } - if (IB < 0) { - return -8; - } - if (LDA1 < max(1,M1)){ - return -10; - } - if (LDA2 < max(1,M2)){ - return -12; - } - if (LDV < max(1,NQ)){ - return -14; - } - if (LDT < max(1,IB)){ - return -16; - } - if (LDWORK < max(1,NW)){ - return -18; - } - - /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) - return MAGMA_SUCCESS; - - if (((side == MagmaLeft) && (trans != MagmaNoTrans)) - || ((side == MagmaRight) && (trans == MagmaNoTrans))) { - i1 = 0; - i3 = IB; - } - else { - i1 = ((K-1) / IB)*IB; - i3 = -IB; - } - - for(i = i1; (i > -1) && (i < K); i += i3) { - kb = min(IB, K-i); - - if (side == MagmaLeft) { - /* - * H or H' is applied to C(i:m,1:n) - */ - mi = M1 - i; - ic = i; - } - else { - /* - * H or H' is applied to C(1:m,i:n) - */ - ni = N1 - i; - jc = i; - } - /* - * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) - */ - magma_zparfb_gpu( side, trans, MagmaForward, MagmaColumnwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + LDV*i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, stream ); - } - return MAGMA_SUCCESS; -} - static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -957,9 +280,10 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) stream = starpu_cuda_get_local_stream(); cublasSetKernelStream( stream ); - magma_ztsmqr_gpu( side, trans, m1, n1, m2, n2, k, ib, - A1, lda1, A2, lda2, V, ldv, T, ldt, - W, ldwork, WC, ldworkc, stream ); + CUDA_ztsmqr( + side, trans, m1, n1, m2, n2, k, ib, + A1, lda1, A2, lda2, V, ldv, T, ldt, + W, ldwork, WC, ldworkc, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c index 8fd3383be..8fd166263 100644 --- a/runtime/starpu/codelets/codelet_ztsqrt.c +++ b/runtime/starpu/codelets/codelet_ztsqrt.c @@ -160,183 +160,7 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg) CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); } - #if defined(CHAMELEON_USE_MAGMA) - -magma_int_t -magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, - magmaDoubleComplex *da1, magma_int_t ldda1, - magmaDoubleComplex *da2, magma_int_t ldda2, - magmaDoubleComplex *a2, magma_int_t lda2, - magmaDoubleComplex *dt, magma_int_t lddt, - magmaDoubleComplex *t, magma_int_t ldt, - magmaDoubleComplex *dd, - magmaDoubleComplex *d, magma_int_t ldd, - magmaDoubleComplex *tau, - magmaDoubleComplex *hwork, - magmaDoubleComplex *dwork, - CUstream stream) -{ -#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1)) -#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1)) -#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1)) -#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1)) -#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1)) -#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1)) - - int i, k, lddwork, old_i, old_ib, rows, cols; - int ib; - double _Complex one=1.; -// int lwkopt = n * nb; -// hwork[0] = *((magmaDoubleComplex*) &lwkopt); - - if (m < 0) { - return -1; - } else if (n < 0) { - return -2; - } else if (ldda2 < max(1,m)) { - return -4; - } - - k = min(m,n); - if (k == 0) { - hwork[0] = *((magmaDoubleComplex*) &one); - return MAGMA_SUCCESS; - } - - lddwork= nb; - - /* lower parts of little T must be zero: memset all to 0 for simplicity */ - memset(t, 0, nb*nb*sizeof(magmaDoubleComplex)); - cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex)); - - /* copy the first diag tile of A1 from device to host: da1 -> d */ - cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex), - da1_ref(0, 0), ldda1, - d, ldd); -// cudaMemcpy( d, da1_ref(0,0), -// nb*nb*sizeof(cuDoubleComplex), -// cudaMemcpyDeviceToHost ); - - /* copy first panel of A2 from device to host: da2 -> a2 */ -// cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex), -// da2_ref(0, 0), ldda2, -// a2, lda2); - cudaMemcpy( a2, da2_ref(0, 0), - m*nb*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToHost ); - - /* This is only blocked code for now */ - for (i = 0; i < n; i += nb) { - - ib = min(n-i, nb); - rows = m; - - /* Send the next panel (diagonal block of A1 & block column of A2) - to the CPU (in work_a1 and work_a2) and apply tsmqr update on the - remaining non updated panels */ - if (i>0) { - - /* copy the diag tile of A1 from device to host: da1 -> d */ - cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex), - da1_ref(i, i), ldda1, - d, ldd); -// cudaMemcpy( d, da1_ref(i,i), -// ib*ib*sizeof(cuDoubleComplex), -// cudaMemcpyDeviceToHost ); - - /* copy panel of A2 from device to host: da2 -> a2 */ - cublasGetMatrix(rows, ib, sizeof(magmaDoubleComplex), - da2_ref(0, i), ldda2, - a2, lda2); -// cudaMemcpy( a2, da2_ref(0,i), -// rows*ib*sizeof(cuDoubleComplex), -// cudaMemcpyDeviceToHost ); - - /* Apply H' to A(i:m,i+2*ib:n) from the left */ - cols = n-old_i-2*old_ib; - if (cols > 0){ - magma_ztsmqr_gpu( MagmaLeft, MagmaConjTrans, - old_ib, cols, rows, cols, old_ib, old_ib, - da1_ref(old_i, old_i+2*old_ib), ldda1, - da2_ref(0, old_i+2*old_ib), ldda2, - da2_ref(0, old_i), ldda2, - dt_ref(0, old_i), lddt, - dwork, old_ib, - dwork + old_ib * cols, rows, - stream ); - } - - } - - /* compute QR factorization of the panel of A2 rows x ib */ - CORE_ztsqrt(rows, ib, ib, - (double _Complex*) d, ldd, - (double _Complex*) a2, lda2, - (double _Complex*) t, ldt, - (double _Complex*) tau, - (double _Complex*) hwork); - - /* Send the panel from A2 back to the GPU */ - cublasSetMatrix(rows, ib, sizeof(magmaDoubleComplex), - a2, lda2, - da2_ref(0, i), ldda2); -// cudaMemcpy( da2_ref(0,i), a2, -// rows*ib*sizeof(cuDoubleComplex), -// cudaMemcpyHostToDevice ); - - /* Send the triangular factor T from hwork to the GPU */ - cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), - t, ldt, - dt_ref(0, i), lddt); -// cudaMemcpy( dt_ref(0,i), t, -// ib*ib*sizeof(cuDoubleComplex), -// cudaMemcpyHostToDevice ); - - /* get back the diag tile in A1 from host to device: d -> da1 */ - cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), - d, ldd, - da1_ref(i, i), ldda1); -// cudaMemcpy( da1_ref(i, i), d, -// ib*ib*sizeof(cuDoubleComplex), -// cudaMemcpyHostToDevice ); - - /* tsmqr update on one panel forward (look ahead 1) */ - if (i + ib < n) { - - if (i+2*ib < n){ - cols = ib; - } - else{ - cols = n-i-ib; - } - - /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ - magma_ztsmqr_gpu( MagmaLeft, MagmaConjTrans, - ib, cols, rows, cols, ib, ib, - da1_ref(i, i+ib), ldda1, - da2_ref(0, i+ib), ldda2, - da2_ref(0, i), ldda2, - dt_ref(0, i), lddt, - dwork, ib, - dwork + ib * cols, rows, - stream ); - old_i = i; - old_ib = ib; - } - } - -#undef da1_ref -#undef da2_ref -#undef a2_ref -#undef t_ref -#undef dt_ref -#undef d_ref - - return MAGMA_SUCCESS; - -} /* magma_ztsqrt_gpu */ - static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -366,18 +190,16 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) h_D = h_W + ib*n; stream = starpu_cuda_get_local_stream(); - magma_ztsqrt2_gpu( m, n, ib, - d_A1, lda1, d_A2, lda2, - h_A2, lda2, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W, stream); - + CUDA_ztsqrt( + m, n, ib, + d_A1, lda1, d_A2, lda2, + h_A2, lda2, + d_T, ldt, h_T, ib, + d_D, h_D, ib, h_TAU, + h_W, d_W, stream); cudaThreadSynchronize(); } - #endif - /* * Codelet definition */ diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c index e76985b02..cc02ea0f1 100644 --- a/runtime/starpu/codelets/codelet_ztstrf.c +++ b/runtime/starpu/codelets/codelet_ztstrf.c @@ -247,13 +247,14 @@ static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg) /* Initialize L to 0 */ memset(hL, 0, ldl*nb*sizeof(cuDoubleComplex)); - magma_ztstrf_gpu( MagmaColMajor, m, n, ib, nb, - hU, ldu, dU, ldu, - hA, lda, dA, lda, - hL, ldl, dL, ldl, - ipiv, - hw, ldwork, dw, lda, - &info ); + CUDA_ztstrf( + MagmaColMajor, m, n, ib, nb, + hU, ldu, dU, ldu, + hA, lda, dA, lda, + hL, ldl, dL, ldl, + ipiv, + hw, ldwork, dw, lda, + &info ); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index 7dbc938e2..c612cd15c 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -185,114 +185,6 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) } #if defined(CHAMELEON_USE_MAGMA) - -magma_int_t -magma_zunmlqt_gpu( magma_side_t side, magma_trans_t trans, - magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB, - const magmaDoubleComplex *A, magma_int_t LDA, - const magmaDoubleComplex *T, magma_int_t LDT, - magmaDoubleComplex *C, magma_int_t LDC, - magmaDoubleComplex *WORK, magma_int_t LDWORK ) -{ - int i, kb; - int i1, i3; - int nq, nw; - int ic = 0; - int jc = 0; - int ni = N; - int mi = M; - - /* Check input arguments */ - if ((side != MagmaLeft) && (side != MagmaRight)) { - return -1; - } - /* - * NQ is the order of Q and NW is the minimum dimension of WORK - */ - if (side == MagmaLeft) { - nq = M; - nw = N; - } - else { - nq = N; - nw = M; - } - - if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { - return -2; - } - if (M < 0) { - return -3; - } - if (N < 0) { - return -4; - } - if ((K < 0) || (K > nq)) { - return -5; - } - if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { - return -6; - } - if ((LDA < max(1,K)) && (K > 0)) { - return -8; - } - if ((LDC < max(1,M)) && (M > 0)) { - return -12; - } - if ((LDWORK < max(1,nw)) && (nw > 0)) { - return -14; - } - - /* Quick return */ - if ((M == 0) || (N == 0) || (K == 0)) - return MAGMA_SUCCESS; - - if (((side == MagmaLeft) && (trans == MagmaNoTrans)) - || ((side == MagmaRight) && (trans != MagmaNoTrans))) { - i1 = 0; - i3 = IB; - } - else { - i1 = ( ( K-1 ) / IB )*IB; - i3 = -IB; - } - - if( trans == MorseNoTrans) { - trans = MorseConjTrans; - } - else { - trans = MorseNoTrans; - } - - for(i = i1; (i >- 1) && (i < K); i+=i3 ) { - kb = min(IB, K-i); - - if (side == MagmaLeft) { - /* - * H or H' is applied to C(i:m,1:n) - */ - mi = M - i; - ic = i; - } - else { - /* - * H or H' is applied to C(1:m,i:n) - */ - ni = N - i; - jc = i; - } - - magma_zlarfb_gpu( side, trans, MagmaForward, MagmaRowwise, - mi, ni, kb, - A + LDA * i + i, LDA, - T + LDT * i, LDT, - C + LDC * jc + ic, LDC, - WORK, LDWORK); - } - return MAGMA_SUCCESS; -} - - static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -314,8 +206,9 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - magma_zunmlqt_gpu( side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork ); + CUDA_zunmlqt( + side, trans, m, n, k, ib, + A, lda, T, ldt, C, ldc, WORK, ldwork ); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index dff4ca7f9..0074e5c38 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -209,108 +209,6 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) } #if defined(CHAMELEON_USE_MAGMA) - -magma_int_t -magma_zunmqrt_gpu( magma_side_t side, magma_trans_t trans, - magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB, - const magmaDoubleComplex *A, magma_int_t LDA, - const magmaDoubleComplex *T, magma_int_t LDT, - magmaDoubleComplex *C, magma_int_t LDC, - magmaDoubleComplex *WORK, magma_int_t LDWORK ) -{ - int i, kb; - int i1, i3; - int nq, nw; - int ic = 0; - int jc = 0; - int ni = N; - int mi = M; - - /* Check input arguments */ - if ((side != MagmaLeft) && (side != MagmaRight)) { - return -1; - } - /* - * NQ is the order of Q and NW is the minimum dimension of WORK - */ - if (side == MagmaLeft) { - nq = M; - nw = N; - } - else { - nq = N; - nw = M; - } - - if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { - return -2; - } - if (M < 0) { - return -3; - } - if (N < 0) { - return -4; - } - if ((K < 0) || (K > nq)) { - return -5; - } - if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { - return -6; - } - if ((LDA < max(1,nq)) && (nq > 0)) { - return -8; - } - if ((LDC < max(1,M)) && (M > 0)) { - return -12; - } - if ((LDWORK < max(1,nw)) && (nw > 0)) { - return -14; - } - - /* Quick return */ - if ((M == 0) || (N == 0) || (K == 0)) - return MAGMA_SUCCESS; - - if (((side == MagmaLeft) && (trans != MagmaNoTrans)) - || ((side == MagmaRight) && (trans == MagmaNoTrans))) { - i1 = 0; - i3 = IB; - } - else { - i1 = ( ( K-1 ) / IB )*IB; - i3 = -IB; - } - - for(i = i1; (i >- 1) && (i < K); i+=i3 ) { - kb = min(IB, K-i); - - if (side == MagmaLeft) { - /* - * H or H' is applied to C(i:m,1:n) - */ - mi = M - i; - ic = i; - } - else { - /* - * H or H' is applied to C(1:m,i:n) - */ - ni = N - i; - jc = i; - } - - magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, - mi, ni, kb, - A + LDA * i + i, LDA, - T + LDT * i, LDT, - C + LDC * jc + ic, LDC, - WORK, LDWORK); - } - - return MAGMA_SUCCESS; -} - - static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -332,8 +230,9 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - magma_zunmqrt_gpu( side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork ); + CUDA_zunmqrt( + side, trans, m, n, k, ib, + A, lda, T, ldt, C, ldc, WORK, ldwork ); cudaThreadSynchronize(); } diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index bb3c5b717..e8316ef5c 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -123,6 +123,11 @@ endif() if(NOT CHAMELEON_SIMULATION) + if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA) + list(APPEND libs_for_tests + cudablas + ) + endif() if(CHAMELEON_USE_CUDA) list(APPEND libs_for_tests ${CUDA_LIBRARIES} diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt index 9d8727434..c15b1e223 100644 --- a/timing/CMakeLists.txt +++ b/timing/CMakeLists.txt @@ -169,6 +169,11 @@ endif() if(NOT CHAMELEON_SIMULATION) + if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA) + list(APPEND libs_for_timings + cudablas + ) + endif() if(CHAMELEON_USE_CUDA) list(APPEND libs_for_timings ${CUDA_LIBRARIES} @@ -185,7 +190,7 @@ if(NOT CHAMELEON_SIMULATION) list(APPEND libs_for_timings coreblas ${LAPACKE_LIBRARIES} - ${CBLAS_LIBRARIES} + ${CBLAS_LIBRARIES} ${LAPACK_SEQ_LIBRARIES} ${BLAS_SEQ_LIBRARIES} ${HWLOC_LIBRARIES} -- GitLab