diff --git a/cudablas/CMakeLists.txt b/cudablas/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8d434c5acde1a1ba2fce5b06d0e7e0b423a623a --- /dev/null +++ b/cudablas/CMakeLists.txt @@ -0,0 +1,35 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# @file CMakeLists.txt +# +# @project MORSE +# MORSE is a software package provided by: +# Inria Bordeaux - Sud-Ouest, +# Univ. of Tennessee, +# King Abdullah Univesity of Science and Technology +# Univ. of California Berkeley, +# Univ. of Colorado Denver. +# +# @version 0.9.0 +# @author Cedric Castagnede +# @author Emmanuel Agullo +# @author Mathieu Faverge +# @date 13-07-2012 +# +### + +add_subdirectory(include) +add_subdirectory(compute) +#add_subdirectory(eztrace_module) + +### +### END CMakeLists.txt +### diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..fea59161b9009f47830cfdac2552df152aefa3d6 --- /dev/null +++ b/cudablas/compute/CMakeLists.txt @@ -0,0 +1,69 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# @file CMakeLists.txt +# +# @project MORSE +# MORSE is a software package provided by: +# Inria Bordeaux - Sud-Ouest, +# Univ. of Tennessee, +# King Abdullah Univesity of Science and Technology +# Univ. of California Berkeley, +# Univ. of Colorado Denver. +# +# @author Florent Pruvost +# @date 16-09-2015 +# +### + +# Generate the morse sources for all possible precisions +# ------------------------------------------------------ +set(CUDABLAS_SRCS_GENERATED "") +set(ZSRC + cuda_zgelqt.c + #cuda_zgemerge.c + cuda_zgeqrt.c + cuda_zgessm.c + cuda_zgetrf.c + cuda_zlauum.c + #cuda_zparfb.c + cuda_zpotrf.c + cuda_zssssm.c + cuda_ztrtri.c + cuda_ztslqt.c + cuda_ztsmlq.c + cuda_ztsmqr.c + cuda_ztsqrt.c + cuda_ztstrf.c + cuda_zunmlqt.c + cuda_zunmqrt.c + ) + +precisions_rules_py(CUDABLAS_SRCS_GENERATED "${ZSRC}" + PRECISIONS "${CHAMELEON_PRECISION}") + +set(CUDABLAS_SRCS + ${CUDABLAS_SRCS_GENERATED} + ) + +# Compile step +# ------------ +add_library(cudablas ${CUDABLAS_SRCS}) +add_dependencies(cudablas cudablas_include) +set_property(TARGET cudablas PROPERTY LINKER_LANGUAGE Fortran) + +# installation +# ------------ +install(TARGETS cudablas + DESTINATION lib) + +### +### END CMakeLists.txt +### diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c new file mode 100644 index 0000000000000000000000000000000000000000..c000530185406e40af79d966cf1a357a11d70bc0 --- /dev/null +++ b/cudablas/compute/cuda_zgelqt.c @@ -0,0 +1,172 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zgelqt.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zgelqt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da, magma_int_t ldda, + magmaDoubleComplex *v, magma_int_t ldv, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream) +{ +#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) +#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) +#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1)) +#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1)) + + int i, k, ib, lddwork, old_i, old_ib, rows, cols; + double _Complex one=1.; + + if (m < 0) { + return -1; + } else if (n < 0) { + return -2; + } else if (ldda < max(1,m)) { + return -4; + } + + k = min(m,n); + if (k == 0) { + hwork[0] = *((magmaDoubleComplex*) &one); + return MAGMA_SUCCESS; + } + + lddwork= m; + + /* lower parts of little T must be zero: memset to 0 for simplicity */ + memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); + cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); + + /* copy first panel of A on the host */ + cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex), + da_ref(0, 0), ldda, + v, ldv); + + /* Use blocked code initially */ + for (i = 0; i < k; i += nb) { + + ib = min(k-i, nb); + if (i+nb >= m) ib = min(m-i, nb); + cols = n-i; + + if (i > 0){ + + /* copy panel of A from device to host */ + cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex), + da_ref(i, 0), ldda, + v, ldv); + + /* Apply H' to A(i+2*ib:m, i:n) from the right */ + rows = m-old_i-2*old_ib; + if (rows > 0){ + magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, + rows, n-old_i, old_ib, + da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, + da_ref(old_i+2*old_ib, old_i), ldda, + dwork, lddwork); + } + + /* copy the lower diag tile into d_A */ + CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib, + dd, ldd, da_ref(old_i, old_i), ldda, stream); + + } + + /* Form the triangular factor of the block reflector on the host + H = H'(i+ib-1) . . . H(i+1) H(i) */ + CORE_zgelqt(ib, cols, ib, + (double _Complex*) v_ref(0,i), ldv, + (double _Complex*) t_ref(0,0), ldt, + (double _Complex*) tau+i, + (double _Complex*) hwork); + + if ( i + ib < m ){ + /* put 0s in the lower triangular part of a panel (and 1s on the + diagonal); copy the lower triangular in d */ + CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib), + (double _Complex*) v_ref(0,i), ldv, + (double _Complex*) d, ldd); + + /* copy from host to device a tile diag */ + cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex), + d, ldd, dd, ldd ); + } + + /* Send the triangular factor T to the GPU */ + cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex), + t_ref(0,0), ldt, dt_ref(0,i), lddt ); + + /* A panel (with zeros in lower tri of its diag) is ready to be used + in input of zlarfb_gpu: we send the panel to the gpu */ + cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex), + v_ref(0,i), ldv, da_ref(i,i), ldda ); + + if (i + ib < m) { + + if (i+2*ib < m){ + rows = ib; + } + else{ + rows = m-i-ib; + } + /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */ + magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, + rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), + lddt, da_ref(i+ib,i), ldda, dwork, lddwork); + old_i = i; + old_ib = ib; + if (i+nb >= k){ + /* Apply H' to A(i+2*ib:m, i:n) from the right */ + rows = m-old_i-2*old_ib; + if (rows > 0){ + magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise, + rows, cols, old_ib, + da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, + da_ref(old_i+2*old_ib, old_i), ldda, + dwork, lddwork); + } + /* copy the upper diag tile into d_A */ + CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib, + dd, ldd, da_ref(old_i, old_i), ldda, stream); + } + } + + } + +#undef da_ref +#undef v_ref +#undef dt_ref +#undef t_ref + + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c new file mode 100644 index 0000000000000000000000000000000000000000..2d5a5d53d021d3c27803745fe2ad8d4194c6a3c5 --- /dev/null +++ b/cudablas/compute/cuda_zgemerge.c @@ -0,0 +1,137 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zmerge.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUBLAS_V2) +int CUDA_zgemerge( + magma_side_t side, magma_diag_t diag, + magma_int_t M, magma_int_t N, + magmaDoubleComplex *A, magma_int_t LDA, + magmaDoubleComplex *B, magma_int_t LDB, + CUstream stream) +{ + int i, j; + magmaDoubleComplex *cola, *colb; + cublasHandle_t handle; + cublasStatus_t stat; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("CUBLAS initialization failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + stat = cublasSetStream(handle, stream); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("cublasSetStream failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + if (M < 0) { + return -1; + } + if (N < 0) { + return -2; + } + if ( (LDA < max(1,M)) && (M > 0) ) { + return -5; + } + if ( (LDB < max(1,M)) && (M > 0) ) { + return -7; + } + + if (side == MagmaLeft){ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(handle, i+1, cola, 1, colb, 1); + cudaMemcpyAsync(colb , cola, + (i+1)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + }else{ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1); + cudaMemcpyAsync(colb+i , cola+i, + (M-i)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + } + + cublasDestroy(handle); + + return MORSE_SUCCESS; +} +#else /* CHAMELEON_USE_CUBLAS_V2 */ +int CUDA_zgemerge( + magma_side_t side, magma_diag_t diag, + magma_int_t M, magma_int_t N, + magmaDoubleComplex *A, magma_int_t LDA, + magmaDoubleComplex *B, magma_int_t LDB, + CUstream stream) +{ + int i, j; + magmaDoubleComplex *cola, *colb; + + if (M < 0) { + return -1; + } + if (N < 0) { + return -2; + } + if ( (LDA < max(1,M)) && (M > 0) ) { + return -5; + } + if ( (LDB < max(1,M)) && (M > 0) ) { + return -7; + } + + if (side == MagmaLeft){ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(i+1, cola, 1, colb, 1); + cudaMemcpyAsync(colb , cola, + (i+1)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + }else{ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(M-i, cola + i, 1, colb + i, 1); + cudaMemcpyAsync(colb+i , cola+i, + (M-i)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + } + + return MORSE_SUCCESS; +} +#endif /* CHAMELEON_USE_CUBLAS_V2 */ +#endif diff --git a/cudablas/compute/cuda_zgeqrt.c b/cudablas/compute/cuda_zgeqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..e8478ed1c13325d6e9cdfbd215fb36be40cffb8c --- /dev/null +++ b/cudablas/compute/cuda_zgeqrt.c @@ -0,0 +1,292 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zgeqrt.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zgeqrt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da, magma_int_t ldda, + magmaDoubleComplex *v, magma_int_t ldv, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream) +{ +#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) +#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) +#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1)) +#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1)) + + int i, k, ib, lddwork, old_i, old_ib, rows, cols; + double _Complex one=1.; + +// int lwkopt = n * nb; +// hwork[0] = *((magmaDoubleComplex*) &lwkopt); + + if (m < 0) { + return -1; + } else if (n < 0) { + return -2; + } else if (ldda < max(1,m)) { + return -4; + } + + k = min(m,n); + if (k == 0) { + hwork[0] = *((magmaDoubleComplex*) &one); + return MAGMA_SUCCESS; + } + + lddwork= k; + + /* lower parts of little T must be zero: memset to 0 for simplicity */ + memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex)); + cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream); + + /* copy first panel of A on the host */ +// cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex), +// da_ref(0, 0), ldda, +// v, ldv); + cudaMemcpy( v, da_ref(0,0), + m*min(nb,n)*sizeof(magmaDoubleComplex), + cudaMemcpyDeviceToHost); + + /* Use blocked code initially */ + for (i = 0; i < k; i += nb) { + + ib = min(k-i, nb); + if (i+nb>=n) ib = min(n-i, nb); + rows = m-i; + + if (i>0){ + + /* copy panel of A from device to host */ +// cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex), +// da_ref(0, i), ldda, +// v, ldv); + /* copy panel of A from device to host */ + cudaMemcpy( v, da_ref(0,i), + m*ib*sizeof(magmaDoubleComplex), + cudaMemcpyDeviceToHost); + + /* Apply H' to A(i:m,i+2*ib:n) from the left */ + cols = n-old_i-2*old_ib; + if (cols > 0){ + magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, + m-old_i, cols, old_ib, + da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, + da_ref(old_i, old_i+2*old_ib), ldda, + dwork, cols); + } + + /* copy the upper diag tile into d_A */ + CUDA_zgemerge(MagmaLeft, MagmaUnit, old_ib, old_ib, + dd, ldd, da_ref(old_i, old_i), ldda, stream); + + } + + /* Form the triangular factor of the block reflector on the host + H = H(i) H(i+1) . . . H(i+ib-1) */ + CORE_zgeqrt(rows, ib, ib, + (double _Complex*) v_ref(i,0), ldv, + (double _Complex*) t_ref(0,0), ldt, + (double _Complex*) tau+i, + (double _Complex*) hwork); + + if ( i + ib < n ){ + /* put 0s in the upper triangular part of a panel (and 1s on the + diagonal); copy the upper triangular in d */ + CORE_zgesplit(MorseLeft, MorseUnit, min(rows,ib), ib, + (double _Complex*) v_ref(i,0), ldv, + (double _Complex*) d, ldd); + + /* copy from host to device a tile diag */ + cublasSetMatrix( min(rows,ib), ib, sizeof(magmaDoubleComplex), + d, ldd, dd, ldd ); + } + + /* Send the triangular factor T to the GPU */ + cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex), + t_ref(0,0), ldt, dt_ref(0,i), lddt ); + + /* A panel (with zeros in upper tri of its diag) is ready to be used + in input of zlarfb_gpu: we send the panel to the gpu */ + cublasSetMatrix( rows, ib, sizeof(magmaDoubleComplex), + v_ref(i,0), ldv, da_ref(i,i), ldda ); + + if (i + ib < n) { + + if (i+2*ib < n){ + cols = ib; + } + else{ + cols = n-i-ib; + } + /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ + magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, + rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i), + lddt, da_ref(i,i+ib), ldda, dwork, cols); + old_i = i; + old_ib = ib; + if (i+nb>=k){ + /* Apply H' to A(i:m,i+2*ib:n) from the left */ + cols = n-old_i-2*old_ib; + if (cols > 0){ + magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, + rows, cols, old_ib, + da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt, + da_ref(old_i, old_i+2*old_ib), ldda, + dwork, cols); + } + /* copy the upper diag tile into d_A */ + CUDA_zgemerge(MagmaLeft, MagmaUnit, old_ib, old_ib, + dd, ldd, da_ref(old_i, old_i), ldda, stream); + } + } + + } + +#undef da_ref +#undef v_ref +#undef dt_ref +#undef t_ref + + return MORSE_SUCCESS; +} + +#if defined(CHAMELEON_USE_CUBLAS_V2) +int CUDA_zgemerge( + magma_side_t side, magma_diag_t diag, + magma_int_t M, magma_int_t N, + magmaDoubleComplex *A, magma_int_t LDA, + magmaDoubleComplex *B, magma_int_t LDB, + CUstream stream) +{ + int i, j; + magmaDoubleComplex *cola, *colb; + cublasHandle_t handle; + cublasStatus_t stat; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("CUBLAS initialization failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + stat = cublasSetStream(handle, stream); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("cublasSetStream failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + if (M < 0) { + return -1; + } + if (N < 0) { + return -2; + } + if ( (LDA < max(1,M)) && (M > 0) ) { + return -5; + } + if ( (LDB < max(1,M)) && (M > 0) ) { + return -7; + } + + if (side == MagmaLeft){ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(handle, i+1, cola, 1, colb, 1); + cudaMemcpyAsync(colb , cola, + (i+1)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + }else{ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1); + cudaMemcpyAsync(colb+i , cola+i, + (M-i)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + } + + cublasDestroy(handle); + + return MORSE_SUCCESS; +} +#else /* CHAMELEON_USE_CUBLAS_V2 */ +int CUDA_zgemerge( + magma_side_t side, magma_diag_t diag, + magma_int_t M, magma_int_t N, + magmaDoubleComplex *A, magma_int_t LDA, + magmaDoubleComplex *B, magma_int_t LDB, + CUstream stream) +{ + int i, j; + magmaDoubleComplex *cola, *colb; + + if (M < 0) { + return -1; + } + if (N < 0) { + return -2; + } + if ( (LDA < max(1,M)) && (M > 0) ) { + return -5; + } + if ( (LDB < max(1,M)) && (M > 0) ) { + return -7; + } + + if (side == MagmaLeft){ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(i+1, cola, 1, colb, 1); + cudaMemcpyAsync(colb , cola, + (i+1)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + }else{ + for(i=0; i<N; i++){ + cola = A + i*LDA; + colb = B + i*LDB; +// cublasZcopy(M-i, cola + i, 1, colb + i, 1); + cudaMemcpyAsync(colb+i , cola+i, + (M-i)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice, stream); + } + } + + return MORSE_SUCCESS; +} +#endif /* CHAMELEON_USE_CUBLAS_V2 */ +#endif diff --git a/cudablas/compute/cuda_zgessm.c b/cudablas/compute/cuda_zgessm.c new file mode 100644 index 0000000000000000000000000000000000000000..d8c0198ec0675259b6a128450943f04918aa6e9d --- /dev/null +++ b/cudablas/compute/cuda_zgessm.c @@ -0,0 +1,57 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zgessm.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) +int CUDA_zgessm( + char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, + magma_int_t *ipiv, + cuDoubleComplex *dL1, magma_int_t lddl1, + cuDoubleComplex *dL, magma_int_t lddl, + cuDoubleComplex *dA, magma_int_t ldda, + magma_int_t *info) +{ + int ret; + /* The kernel is just using the inverted part or nothing */ + if ( lddl1 >= 2*ib ) { + dL1 += ib; + ret = magma_zgessm_gpu( storev, m, n, k, ib, + ipiv, dL1, lddl1, dL, lddl, dA, ldda, info ); + } + else { + ret = magma_zgessm_gpu( storev, m, n, k, ib, + ipiv, NULL, 1, dL, lddl, dA, ldda, info ); + } + + if (ret != MAGMA_SUCCESS) { + fprintf(stderr, "Error in MAGMA: %d\n", ret); + exit(-1); + } + + return MORSE_SUCCESS; +} +#endif +#endif diff --git a/cudablas/compute/cuda_zgetrf.c b/cudablas/compute/cuda_zgetrf.c new file mode 100644 index 0000000000000000000000000000000000000000..dc6d18431c5bf3a6603489d7a9ae5d1e93982dd6 --- /dev/null +++ b/cudablas/compute/cuda_zgetrf.c @@ -0,0 +1,54 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zgetrf.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) +int CUDA_zgetrf_incpiv( + char storev, magma_int_t m, magma_int_t n, magma_int_t ib, + cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, + cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, + magma_int_t *ipiv, + cuDoubleComplex *dwork, magma_int_t lddwork, + magma_int_t *info) +{ + magma_zgetrf_incpiv_gpu( storev, m, n, ib, + hA, ldha, dA, ldda, + hL, ldhl, dL, lddl, + ipiv, + dwork, lddwork, + info ); + return MORSE_SUCCESS; +} +#endif +int CUDA_zgetrf_nopiv( + magma_int_t m, magma_int_t n, + cuDoubleComplex *dA, magma_int_t ldda, + magma_int_t *info) +{ + magma_zgetrf_nopiv_gpu( m, n, dA, ldda, info ); + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zlauum.c b/cudablas/compute/cuda_zlauum.c new file mode 100644 index 0000000000000000000000000000000000000000..2e0467c182863ca3ed1f0b1137aa44c027bb5f72 --- /dev/null +++ b/cudablas/compute/cuda_zlauum.c @@ -0,0 +1,40 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zlauum.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zlauum( + char uplo, magma_int_t n, + cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) +{ + int ret; + ret = magma_zlauum_gpu( uplo, n, dA, ldda, info); + if (ret != MAGMA_SUCCESS) { + fprintf(stderr, "Error in MAGMA: %d\n", ret); + exit(-1); + } + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c new file mode 100644 index 0000000000000000000000000000000000000000..ce1104d0971c893eee9266d69bda7f5b313dec30 --- /dev/null +++ b/cudablas/compute/cuda_zparfb.c @@ -0,0 +1,581 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zparfb.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +#if defined(CHAMELEON_USE_CUBLAS_V2) +int CUDA_zparfb( + magma_side_t side, magma_trans_t trans, + magma_direct_t direct, magma_storev_t storev, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t L, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream) + +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); + cuDoubleComplex zone = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0); +#else + double zzero = 0.0; + double zone = 1.0; + double mzone = -1.0; +#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + + int j; + magma_trans_t transW; + magma_trans_t transA2; + cublasHandle_t handle; + cublasStatus_t stat; + cublasOperation_t cublasTrans; + cublasOperation_t cublasTransW; + cublasOperation_t cublasTransA2; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("CUBLAS initialization failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + stat = cublasSetStream(handle, stream); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("cublasSetStream failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + if (trans == MagmaNoTrans){ + cublasTrans = CUBLAS_OP_N; + }else if(trans == MagmaTrans){ + cublasTrans = CUBLAS_OP_T; + }else if(trans == MagmaConjTrans){ + cublasTrans = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad trans parameter %d\n", trans); + } + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if ((direct != MagmaForward) && (direct != MagmaBackward)) { + return -3; + } + if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) { + return -4; + } + if (M1 < 0) { + return -5; + } + if (N1 < 0) { + return -6; + } + if ((M2 < 0) || + ( (side == MagmaRight) && (M1 != M2) ) ) { + return -7; + } + if ((N2 < 0) || + ( (side == MagmaLeft) && (N1 != N2) ) ) { + return -8; + } + if (K < 0) { + return -9; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + return MAGMA_SUCCESS; + + if (direct == MagmaForward) { + + if (side == MagmaLeft) { + + /* + * Column or Rowwise / Forward / Left + * ---------------------------------- + * + * Form H * A or H' * A where A = ( A1 ) + * ( A2 ) + */ + + /* + * W = A1 + V' * A2: + * W = A1 + * W = W + V' * A2 + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), N1, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + + if (transW == MagmaNoTrans){ + cublasTransW = CUBLAS_OP_N; + }else if(transW == MagmaTrans){ + cublasTransW = CUBLAS_OP_T; + }else if(transW == MagmaConjTrans){ + cublasTransW = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW); + } + if (transA2 == MagmaNoTrans){ + cublasTransA2 = CUBLAS_OP_N; + }else if(transA2 == MagmaTrans){ + cublasTransA2 = CUBLAS_OP_T; + }else if(transA2 == MagmaConjTrans){ + cublasTransA2 = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2); + } + + cublasZgemm(handle, cublasTransW, CUBLAS_OP_N, + K, N1, M2, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)V /* K*M2 */, LDV, + (const cuDoubleComplex*)A2 /* M2*N1 */, LDA2, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); + + if (WORKC == NULL) { + /* W = op(T) * W */ + cublasZtrmm( handle, + CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, + cublasTrans, CUBLAS_DIAG_NON_UNIT, + K, N2, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - op(T) * W */ + for(j = 0; j < N1; j++) { + cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - op(V) * W */ + cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)V /* M2*K */, LDV, + (const cuDoubleComplex*)WORK /* K*N2 */, LDWORK, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); + + } else { + /* Wc = V * op(T) */ + cublasZgemm( handle, cublasTransA2, cublasTrans, + M2, K, K, + (const cuDoubleComplex *) &zone, V, LDV, + T, LDT, + (const cuDoubleComplex *) &zzero, WORKC, LDWORKC ); + + /* A1 = A1 - opt(T) * W */ + cublasZgemm( handle, cublasTrans, CUBLAS_OP_N, + K, N1, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)T, LDT, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)A1, LDA1 ); + + /* A2 = A2 - Wc * W */ + cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)WORKC, LDWORKC, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex *)A2, LDA2 ); + } + } + else { + /* + * Column or Rowwise / Forward / Right + * ----------------------------------- + * + * Form H * A or H' * A where A = ( A1 A2 ) + * + */ + + /* + * W = A1 + A2 * V': + * W = A1 + * W = W + A2 * V' + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + M1 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + + if (transW == MagmaNoTrans){ + cublasTransW = CUBLAS_OP_N; + }else if(transW == MagmaTrans){ + cublasTransW = CUBLAS_OP_T; + }else if(transW == MagmaConjTrans){ + cublasTransW = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW); + } + if (transA2 == MagmaNoTrans){ + cublasTransA2 = CUBLAS_OP_N; + }else if(transA2 == MagmaTrans){ + cublasTransA2 = CUBLAS_OP_T; + }else if(transA2 == MagmaConjTrans){ + cublasTransA2 = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2); + } + + cublasZgemm(handle, CUBLAS_OP_N, cublasTransW, + M1, K, N2, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)A2 /* M1*N2 */, LDA2, + (const cuDoubleComplex*)V /* N2*K */, LDV, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)WORK /* M1*K */, LDWORK); + + if (WORKC == NULL) { + /* W = W * op(T) */ + cublasZtrmm( handle, + CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, + cublasTrans, CUBLAS_DIAG_NON_UNIT, + M2, K, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - W * op(T) */ + for(j = 0; j < K; j++) { + cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - W * op(V) */ + cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)WORK /* M2*K */, LDWORK, + (const cuDoubleComplex*)V /* K*N2 */, LDV, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)A2 /* M2*N2 */, LDA2); + + } else { + /* A1 = A1 - W * opt(T) */ + cublasZgemm( handle, CUBLAS_OP_N, cublasTrans, + M1, K, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *)T, LDT, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex *)A1, LDA1 ); + + /* Wc = op(T) * V */ + cublasZgemm( handle, cublasTrans, cublasTransA2, + K, N2, K, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex *)T, LDT, + (const cuDoubleComplex *)V, LDV, + (const cuDoubleComplex *) &zzero, + (cuDoubleComplex *)WORKC, LDWORKC ); + + /* A2 = A2 - W * Wc */ + cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *)WORKC, LDWORKC, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex *)A2, LDA2 ); + } + } + } + else { + fprintf(stderr, "Not implemented (Backward / Left or Right)"); + return MAGMA_ERR_NOT_SUPPORTED; + } + + cublasDestroy(handle); + + return MORSE_SUCCESS; +} +#else /* CHAMELEON_USE_CUBLAS_V2 */ +magma_int_t +CUDA_zparfb(magma_side_t side, magma_trans_t trans, + magma_direct_t direct, magma_storev_t storev, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t L, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream) +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); + cuDoubleComplex zone = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0); +#else + double zzero = 0.0; + double zone = 1.0; + double mzone = -1.0; +#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + + int j; + magma_trans_t transW; + magma_trans_t transA2; + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if ((direct != MagmaForward) && (direct != MagmaBackward)) { + return -3; + } + if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) { + return -4; + } + if (M1 < 0) { + return -5; + } + if (N1 < 0) { + return -6; + } + if ((M2 < 0) || + ( (side == MagmaRight) && (M1 != M2) ) ) { + return -7; + } + if ((N2 < 0) || + ( (side == MagmaLeft) && (N1 != N2) ) ) { + return -8; + } + if (K < 0) { + return -9; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + return MAGMA_SUCCESS; + + if (direct == MagmaForward) { + + if (side == MagmaLeft) { + + /* + * Column or Rowwise / Forward / Left + * ---------------------------------- + * + * Form H * A or H' * A where A = ( A1 ) + * ( A2 ) + */ + + /* + * W = A1 + V' * A2: + * W = A1 + * W = W + V' * A2 + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), N1, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + + cublasZgemm(morse_lapack_const(transW), morse_lapack_const(MagmaNoTrans), + K, N1, M2, + zone, + (cuDoubleComplex*)V /* K*M2 */, LDV, + (cuDoubleComplex*)A2 /* M2*N1 */, LDA2, + zone, + (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); + + if (WORKC == NULL) { + /* W = op(T) * W */ + cublasZtrmm( morse_lapack_const(MagmaLeft), morse_lapack_const(MagmaUpper), + morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit), + K, N2, + zone, + (cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - op(T) * W */ + for(j = 0; j < N1; j++) { + cublasZaxpy(K, mzone, + (cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - op(V) * W */ + cublasZgemm(morse_lapack_const(transA2), morse_lapack_const(MagmaNoTrans), + M2, N2, K, + mzone, + (cuDoubleComplex*)V /* M2*K */, LDV, + (cuDoubleComplex*)WORK /* K*N2 */, LDWORK, + zone, + (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); + + } else { + /* Wc = V * op(T) */ + cublasZgemm( morse_lapack_const(transA2), morse_lapack_const(trans), + M2, K, K, + zone, V, LDV, + T, LDT, + zzero, WORKC, LDWORKC ); + + /* A1 = A1 - opt(T) * W */ + cublasZgemm( morse_lapack_const(trans), morse_lapack_const(MagmaNoTrans), + K, N1, K, + mzone, T, LDT, + WORK, LDWORK, + zone, A1, LDA1 ); + + /* A2 = A2 - Wc * W */ + cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans), + M2, N2, K, + mzone, WORKC, LDWORKC, + WORK, LDWORK, + zone, A2, LDA2 ); + } + } + else { + /* + * Column or Rowwise / Forward / Right + * ----------------------------------- + * + * Form H * A or H' * A where A = ( A1 A2 ) + * + */ + + /* + * W = A1 + A2 * V': + * W = A1 + * W = W + A2 * V' + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + M1 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + + cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transW), + M1, K, N2, + zone, + (cuDoubleComplex*)A2 /* M1*N2 */, LDA2, + (cuDoubleComplex*)V /* N2*K */, LDV, + zone, + (cuDoubleComplex*)WORK /* M1*K */, LDWORK); + + if (WORKC == NULL) { + /* W = W * op(T) */ + cublasZtrmm( morse_lapack_const(MagmaRight), morse_lapack_const(MagmaUpper), + morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit), + M2, K, + zone, + (cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - W * op(T) */ + for(j = 0; j < K; j++) { + cublasZaxpy(M1, mzone, + (cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - W * op(V) */ + cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transA2), + M2, N2, K, + mzone, + (cuDoubleComplex*)WORK /* M2*K */, LDWORK, + (cuDoubleComplex*)V /* K*N2 */, LDV, + zone, + (cuDoubleComplex*)A2 /* M2*N2 */, LDA2); + + } else { + /* A1 = A1 - W * opt(T) */ + cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(trans), + M1, K, K, + mzone, WORK, LDWORK, + T, LDT, + zone, A1, LDA1 ); + + /* Wc = op(T) * V */ + cublasZgemm( morse_lapack_const(trans), morse_lapack_const(transA2), + K, N2, K, + zone, T, LDT, + V, LDV, + zzero, WORKC, LDWORKC ); + + /* A2 = A2 - W * Wc */ + cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans), + M2, N2, K, + mzone, WORK, LDWORK, + WORKC, LDWORKC, + zone, A2, LDA2 ); + } + } + } + else { + fprintf(stderr, "Not implemented (Backward / Left or Right)"); + return MAGMA_ERR_NOT_SUPPORTED; + } + + return MORSE_SUCCESS; +} +#endif /* CHAMELEON_USE_CUBLAS_V2 */ +#endif diff --git a/cudablas/compute/cuda_zpotrf.c b/cudablas/compute/cuda_zpotrf.c new file mode 100644 index 0000000000000000000000000000000000000000..0c4b608fbec70a9c7ca6a4eb19c43077fc3077e8 --- /dev/null +++ b/cudablas/compute/cuda_zpotrf.c @@ -0,0 +1,43 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zpotrf.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zpotrf( + magma_uplo_t uplo, magma_int_t n, + magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) +{ + int ret; + ret = magma_zpotrf_gpu( + uplo, + n, dA, ldda, info); +/* hA, stream );*/ + if (ret != MAGMA_SUCCESS) { + fprintf(stderr, "Error in MAGMA: %d\n", ret); + exit(-1); + } + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zssssm.c b/cudablas/compute/cuda_zssssm.c new file mode 100644 index 0000000000000000000000000000000000000000..86349db0dd218ffdad0c23b1c60c7ee44af41c55 --- /dev/null +++ b/cudablas/compute/cuda_zssssm.c @@ -0,0 +1,46 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zssssm.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) +int CUDA_zssssm( + magma_storev_t storev, magma_int_t m1, magma_int_t n1, + magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, + magmaDoubleComplex *dA1, magma_int_t ldda1, + magmaDoubleComplex *dA2, magma_int_t ldda2, + magmaDoubleComplex *dL1, magma_int_t lddl1, + magmaDoubleComplex *dL2, magma_int_t lddl2, + magma_int_t *IPIV, magma_int_t *info) +{ + magma_zssssm_gpu( + storev, m1, n1, m2, n2, k, ib, + dA1, ldda1, dA2, ldda2, + dL1, lddl1, dL2, lddl2, + IPIV, info); + return MORSE_SUCCESS; +} +#endif +#endif diff --git a/cudablas/compute/cuda_ztrtri.c b/cudablas/compute/cuda_ztrtri.c new file mode 100644 index 0000000000000000000000000000000000000000..e10b207af9e3a78ca21b9dceaa6727fed97cb4e5 --- /dev/null +++ b/cudablas/compute/cuda_ztrtri.c @@ -0,0 +1,40 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_ztrtri.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_ztrtri( + magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, + magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) +{ + int ret; + ret = magma_ztrtri_gpu( uplo, diag, n, dA, ldda, info); + if (ret != MAGMA_SUCCESS) { + fprintf(stderr, "Error in MAGMA: %d\n", ret); + exit(-1); + } + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c new file mode 100644 index 0000000000000000000000000000000000000000..5fb1e4e92fc34846411a7b898f7c53948176a859 --- /dev/null +++ b/cudablas/compute/cuda_ztslqt.c @@ -0,0 +1,181 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_ztslqt.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_ztslqt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da1, magma_int_t ldda1, + magmaDoubleComplex *da2, magma_int_t ldda2, + magmaDoubleComplex *a2, magma_int_t lda2, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream) +{ +#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1)) +#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1)) +#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1)) +#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1)) +#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1)) +#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1)) + + int i, k, lddwork, old_i, old_ib, rows, cols; + int ib; + double _Complex one=1.; + + if (m < 0) { + return -1; + } else if (n < 0) { + return -2; + } else if (ldda2 < max(1,m)) { + return -4; + } + + k = min(m,n); + if (k == 0) { + hwork[0] = *((magmaDoubleComplex*) &one); + return MAGMA_SUCCESS; + } + + lddwork= m; + + /* lower parts of little T must be zero: memset all to 0 for simplicity */ + memset(t, 0, nb*n*sizeof(magmaDoubleComplex)); + cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex)); + + k = min(m, nb); // m can be lower than IB + /* copy the first diag tile of A1 from device to host: da1 -> d */ + cublasGetMatrix(k, k, sizeof(magmaDoubleComplex), + da1_ref(0, 0), ldda1, + d, ldd); + + /* copy first panel of A2 from device to host: da2 -> a2 */ + cublasGetMatrix(k, n, sizeof(magmaDoubleComplex), + da2_ref(0, 0), ldda2, + a2, lda2); + + /* This is only blocked code for now */ + for (i = 0; i < m; i += nb) { + + ib = min(m-i, nb); + cols = n; + + /* Send the next panel (diagonal block of A1 & block column of A2) + to the CPU (in work_a1 and work_a2) and apply tsmqr update on the + remaining non updated panels */ + if (i>0) { + + /* copy the diag tile of A1 from device to host: da1 -> d */ + cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex), + da1_ref(i, i), ldda1, + d, ldd); + + /* copy panel of A2 from device to host: da2 -> a2 */ + cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex), + da2_ref(i, 0), ldda2, + a2, lda2); + + /* Apply H' to A(i+2*ib:m,i:n) from the left */ + rows = m-old_i-2*old_ib; + if (rows > 0){ + CUDA_ztsmlq( + MagmaRight, MagmaConjTrans, + rows, old_ib, rows, cols, old_ib, old_ib, + da1_ref(old_i+2*old_ib, old_i), ldda1, + da2_ref(old_i+2*old_ib, 0), ldda2, + da2_ref(old_i, 0), ldda2, + dt_ref(0, old_i), lddt, + dwork, lddwork, + dwork + nb * lddwork, nb, + stream ); + } + + } + + /* compute LQ factorization of the panel of A2 ib x cols */ + CORE_ztslqt(ib, cols, ib, + (double _Complex*) d, ldd, + (double _Complex*) a2, lda2, + (double _Complex*) t, ldt, + (double _Complex*) tau, + (double _Complex*) hwork); + + /* Send the panel from A2 back to the GPU */ + cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex), + a2, lda2, + da2_ref(i, 0), ldda2); + + /* Send the triangular factor T from hwork to the GPU */ + cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), + t, ldt, + dt_ref(0, i), lddt); + + /* get back the diag tile in A1 from host to device: d -> da1 */ + cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), + d, ldd, + da1_ref(i, i), ldda1); + + /* tsmlq update on one panel forward (look ahead 1) */ + if (i + ib < m) { + + if (i+2*ib < m){ + rows = ib; + } + else{ + rows = m-i-ib; + } + + /* Apply H' to A(i+ib:i+2*ib,i:n) from the right */ + CUDA_ztsmlq( + MagmaRight, MagmaConjTrans, + rows, ib, rows, cols, ib, ib, + da1_ref(i+ib, i), ldda1, + da2_ref(i+ib, 0), ldda2, + da2_ref(i, 0), ldda2, + dt_ref(0, i), lddt, + dwork, lddwork, + dwork + nb * lddwork, nb, + stream ); + old_i = i; + old_ib = ib; + } + } + +#undef da1_ref +#undef da2_ref +#undef a2_ref +#undef t_ref +#undef dt_ref +#undef d_ref + + return MORSE_SUCCESS; + +} +#endif diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c new file mode 100644 index 0000000000000000000000000000000000000000..67eb42a1412c158b15564cd37c68cbbac2b5d88e --- /dev/null +++ b/cudablas/compute/cuda_ztsmlq.c @@ -0,0 +1,156 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_ztsmlq.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_ztsmlq( + magma_side_t side, magma_trans_t trans, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t IB, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream) +{ + int i, i1, i3; + int NW; + int kb; + int ic = 0; + int jc = 0; + int mi = M1; + int ni = N1; + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + + /* NW is the minimum dimension of WORK */ + if (side == MagmaLeft) { + NW = IB; + } + else { + NW = N1; + } + + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if (M1 < 0) { + return -3; + } + if (N1 < 0) { + return -4; + } + if ( (M2 < 0) || + ( (M2 != M1) && (side == MagmaRight) ) ){ + return -5; + } + if ( (N2 < 0) || + ( (N2 != N1) && (side == MagmaLeft) ) ){ + return -6; + } + if ((K < 0) || + ( (side == MagmaLeft) && (K > M1) ) || + ( (side == MagmaRight) && (K > N1) ) ) { + return -7; + } + if (IB < 0) { + return -8; + } + if (LDA1 < max(1,M1)){ + return -10; + } + if (LDA2 < max(1,M2)){ + return -12; + } + if (LDV < max(1,K)){ + return -14; + } + if (LDT < max(1,IB)){ + return -16; + } + if (LDWORK < max(1,NW)){ + return -18; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + return MAGMA_SUCCESS; + + if (((side == MagmaLeft) && (trans == MagmaNoTrans)) + || ((side == MagmaRight) && (trans != MagmaNoTrans))) { + i1 = 0; + i3 = IB; + } + else { + i1 = ((K-1) / IB)*IB; + i3 = -IB; + } + + if (trans == MagmaNoTrans) { + trans = MagmaConjTrans; + } + else { + trans = MagmaNoTrans; + } + + for(i = i1; (i > -1) && (i < K); i += i3) { + kb = min(IB, K-i); + + if (side == MagmaLeft) { + /* + * H or H' is applied to C(i:m,1:n) + */ + mi = M1 - i; + ic = i; + } + else { + /* + * H or H' is applied to C(1:m,i:n) + */ + ni = N1 - i; + jc = i; + } + + /* + * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) + */ + CUDA_zparfb( + side, trans, MagmaForward, MagmaRowwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + i, LDV, + T + LDT*i, LDT, + WORK, LDWORK, WORKC, LDWORKC, stream ); + } + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c new file mode 100644 index 0000000000000000000000000000000000000000..de9c05dc986d7593c0ecf505325762301d788aac --- /dev/null +++ b/cudablas/compute/cuda_ztsmqr.c @@ -0,0 +1,704 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_ztsmqr.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_ztsmqr( + magma_side_t side, magma_trans_t trans, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t IB, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream) +{ + int i, i1, i3; + int NQ, NW; + int kb; + int ic = 0; + int jc = 0; + int mi = M1; + int ni = N1; + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + + /* NQ is the order of Q */ + if (side == MagmaLeft) { + NQ = M2; + NW = IB; + } + else { + NQ = N2; + NW = M1; + } + + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if (M1 < 0) { + return -3; + } + if (N1 < 0) { + return -4; + } + if ( (M2 < 0) || + ( (M2 != M1) && (side == MagmaRight) ) ){ + return -5; + } + if ( (N2 < 0) || + ( (N2 != N1) && (side == MagmaLeft) ) ){ + return -6; + } + if ((K < 0) || + ( (side == MagmaLeft) && (K > M1) ) || + ( (side == MagmaRight) && (K > N1) ) ) { + return -7; + } + if (IB < 0) { + return -8; + } + if (LDA1 < max(1,M1)){ + return -10; + } + if (LDA2 < max(1,M2)){ + return -12; + } + if (LDV < max(1,NQ)){ + return -14; + } + if (LDT < max(1,IB)){ + return -16; + } + if (LDWORK < max(1,NW)){ + return -18; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + return MAGMA_SUCCESS; + + if (((side == MagmaLeft) && (trans != MagmaNoTrans)) + || ((side == MagmaRight) && (trans == MagmaNoTrans))) { + i1 = 0; + i3 = IB; + } + else { + i1 = ((K-1) / IB)*IB; + i3 = -IB; + } + + for(i = i1; (i > -1) && (i < K); i += i3) { + kb = min(IB, K-i); + + if (side == MagmaLeft) { + /* + * H or H' is applied to C(i:m,1:n) + */ + mi = M1 - i; + ic = i; + } + else { + /* + * H or H' is applied to C(1:m,i:n) + */ + ni = N1 - i; + jc = i; + } + /* + * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) + */ + CUDA_zparfb( + side, trans, MagmaForward, MagmaColumnwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + LDV*i, LDV, + T + LDT*i, LDT, + WORK, LDWORK, WORKC, LDWORKC, stream ); + } + return MORSE_SUCCESS; +} + +#if defined(CHAMELEON_USE_CUBLAS_V2) +int CUDA_zparfb( + magma_side_t side, magma_trans_t trans, + magma_direct_t direct, magma_storev_t storev, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t L, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream) + +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); + cuDoubleComplex zone = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0); +#else + double zzero = 0.0; + double zone = 1.0; + double mzone = -1.0; +#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + + int j; + magma_trans_t transW; + magma_trans_t transA2; + cublasHandle_t handle; + cublasStatus_t stat; + cublasOperation_t cublasTrans; + cublasOperation_t cublasTransW; + cublasOperation_t cublasTransA2; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("CUBLAS initialization failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + stat = cublasSetStream(handle, stream); + if (stat != CUBLAS_STATUS_SUCCESS) { + printf ("cublasSetStream failed\n"); + assert( stat == CUBLAS_STATUS_SUCCESS ); + } + + if (trans == MagmaNoTrans){ + cublasTrans = CUBLAS_OP_N; + }else if(trans == MagmaTrans){ + cublasTrans = CUBLAS_OP_T; + }else if(trans == MagmaConjTrans){ + cublasTrans = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad trans parameter %d\n", trans); + } + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if ((direct != MagmaForward) && (direct != MagmaBackward)) { + return -3; + } + if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) { + return -4; + } + if (M1 < 0) { + return -5; + } + if (N1 < 0) { + return -6; + } + if ((M2 < 0) || + ( (side == MagmaRight) && (M1 != M2) ) ) { + return -7; + } + if ((N2 < 0) || + ( (side == MagmaLeft) && (N1 != N2) ) ) { + return -8; + } + if (K < 0) { + return -9; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + return MAGMA_SUCCESS; + + if (direct == MagmaForward) { + + if (side == MagmaLeft) { + + /* + * Column or Rowwise / Forward / Left + * ---------------------------------- + * + * Form H * A or H' * A where A = ( A1 ) + * ( A2 ) + */ + + /* + * W = A1 + V' * A2: + * W = A1 + * W = W + V' * A2 + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), N1, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + + if (transW == MagmaNoTrans){ + cublasTransW = CUBLAS_OP_N; + }else if(transW == MagmaTrans){ + cublasTransW = CUBLAS_OP_T; + }else if(transW == MagmaConjTrans){ + cublasTransW = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW); + } + if (transA2 == MagmaNoTrans){ + cublasTransA2 = CUBLAS_OP_N; + }else if(transA2 == MagmaTrans){ + cublasTransA2 = CUBLAS_OP_T; + }else if(transA2 == MagmaConjTrans){ + cublasTransA2 = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2); + } + + cublasZgemm(handle, cublasTransW, CUBLAS_OP_N, + K, N1, M2, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)V /* K*M2 */, LDV, + (const cuDoubleComplex*)A2 /* M2*N1 */, LDA2, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); + + if (WORKC == NULL) { + /* W = op(T) * W */ + cublasZtrmm( handle, + CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, + cublasTrans, CUBLAS_DIAG_NON_UNIT, + K, N2, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - op(T) * W */ + for(j = 0; j < N1; j++) { + cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - op(V) * W */ + cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)V /* M2*K */, LDV, + (const cuDoubleComplex*)WORK /* K*N2 */, LDWORK, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); + + } else { + /* Wc = V * op(T) */ + cublasZgemm( handle, cublasTransA2, cublasTrans, + M2, K, K, + (const cuDoubleComplex *) &zone, V, LDV, + T, LDT, + (const cuDoubleComplex *) &zzero, WORKC, LDWORKC ); + + /* A1 = A1 - opt(T) * W */ + cublasZgemm( handle, cublasTrans, CUBLAS_OP_N, + K, N1, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)T, LDT, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)A1, LDA1 ); + + /* A2 = A2 - Wc * W */ + cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)WORKC, LDWORKC, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex *)A2, LDA2 ); + } + } + else { + /* + * Column or Rowwise / Forward / Right + * ----------------------------------- + * + * Form H * A or H' * A where A = ( A1 A2 ) + * + */ + + /* + * W = A1 + A2 * V': + * W = A1 + * W = W + A2 * V' + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + M1 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + + if (transW == MagmaNoTrans){ + cublasTransW = CUBLAS_OP_N; + }else if(transW == MagmaTrans){ + cublasTransW = CUBLAS_OP_T; + }else if(transW == MagmaConjTrans){ + cublasTransW = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW); + } + if (transA2 == MagmaNoTrans){ + cublasTransA2 = CUBLAS_OP_N; + }else if(transA2 == MagmaTrans){ + cublasTransA2 = CUBLAS_OP_T; + }else if(transA2 == MagmaConjTrans){ + cublasTransA2 = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2); + } + + cublasZgemm(handle, CUBLAS_OP_N, cublasTransW, + M1, K, N2, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)A2 /* M1*N2 */, LDA2, + (const cuDoubleComplex*)V /* N2*K */, LDV, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)WORK /* M1*K */, LDWORK); + + if (WORKC == NULL) { + /* W = W * op(T) */ + cublasZtrmm( handle, + CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, + cublasTrans, CUBLAS_DIAG_NON_UNIT, + M2, K, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - W * op(T) */ + for(j = 0; j < K; j++) { + cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - W * op(V) */ + cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex*)WORK /* M2*K */, LDWORK, + (const cuDoubleComplex*)V /* K*N2 */, LDV, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex*)A2 /* M2*N2 */, LDA2); + + } else { + /* A1 = A1 - W * opt(T) */ + cublasZgemm( handle, CUBLAS_OP_N, cublasTrans, + M1, K, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *)T, LDT, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex *)A1, LDA1 ); + + /* Wc = op(T) * V */ + cublasZgemm( handle, cublasTrans, cublasTransA2, + K, N2, K, + (const cuDoubleComplex *) &zone, + (const cuDoubleComplex *)T, LDT, + (const cuDoubleComplex *)V, LDV, + (const cuDoubleComplex *) &zzero, + (cuDoubleComplex *)WORKC, LDWORKC ); + + /* A2 = A2 - W * Wc */ + cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, + M2, N2, K, + (const cuDoubleComplex *) &mzone, + (const cuDoubleComplex *)WORK, LDWORK, + (const cuDoubleComplex *)WORKC, LDWORKC, + (const cuDoubleComplex *) &zone, + (cuDoubleComplex *)A2, LDA2 ); + } + } + } + else { + fprintf(stderr, "Not implemented (Backward / Left or Right)"); + return MAGMA_ERR_NOT_SUPPORTED; + } + + cublasDestroy(handle); + + return MORSE_SUCCESS; +} +#else /* CHAMELEON_USE_CUBLAS_V2 */ +int CUDA_zparfb( + magma_side_t side, magma_trans_t trans, + magma_direct_t direct, magma_storev_t storev, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t L, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream) +{ +#if defined(PRECISION_z) || defined(PRECISION_c) + cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); + cuDoubleComplex zone = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0); +#else + double zzero = 0.0; + double zone = 1.0; + double mzone = -1.0; +#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + + int j; + magma_trans_t transW; + magma_trans_t transA2; + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if ((direct != MagmaForward) && (direct != MagmaBackward)) { + return -3; + } + if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) { + return -4; + } + if (M1 < 0) { + return -5; + } + if (N1 < 0) { + return -6; + } + if ((M2 < 0) || + ( (side == MagmaRight) && (M1 != M2) ) ) { + return -7; + } + if ((N2 < 0) || + ( (side == MagmaLeft) && (N1 != N2) ) ) { + return -8; + } + if (K < 0) { + return -9; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + return MAGMA_SUCCESS; + + if (direct == MagmaForward) { + + if (side == MagmaLeft) { + + /* + * Column or Rowwise / Forward / Left + * ---------------------------------- + * + * Form H * A or H' * A where A = ( A1 ) + * ( A2 ) + */ + + /* + * W = A1 + V' * A2: + * W = A1 + * W = W + V' * A2 + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), N1, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + + cublasZgemm(morse_lapack_const(transW), morse_lapack_const(MagmaNoTrans), + K, N1, M2, + zone, + (cuDoubleComplex*)V /* K*M2 */, LDV, + (cuDoubleComplex*)A2 /* M2*N1 */, LDA2, + zone, + (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); + + if (WORKC == NULL) { + /* W = op(T) * W */ + cublasZtrmm( morse_lapack_const(MagmaLeft), morse_lapack_const(MagmaUpper), + morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit), + K, N2, + zone, + (cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - op(T) * W */ + for(j = 0; j < N1; j++) { + cublasZaxpy(K, mzone, + (cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - op(V) * W */ + cublasZgemm(morse_lapack_const(transA2), morse_lapack_const(MagmaNoTrans), + M2, N2, K, + mzone, + (cuDoubleComplex*)V /* M2*K */, LDV, + (cuDoubleComplex*)WORK /* K*N2 */, LDWORK, + zone, + (cuDoubleComplex*)A2 /* m2*N2 */, LDA2); + + } else { + /* Wc = V * op(T) */ + cublasZgemm( morse_lapack_const(transA2), morse_lapack_const(trans), + M2, K, K, + zone, V, LDV, + T, LDT, + zzero, WORKC, LDWORKC ); + + /* A1 = A1 - opt(T) * W */ + cublasZgemm( morse_lapack_const(trans), morse_lapack_const(MagmaNoTrans), + K, N1, K, + mzone, T, LDT, + WORK, LDWORK, + zone, A1, LDA1 ); + + /* A2 = A2 - Wc * W */ + cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans), + M2, N2, K, + mzone, WORKC, LDWORKC, + WORK, LDWORK, + zone, A2, LDA2 ); + } + } + else { + /* + * Column or Rowwise / Forward / Right + * ----------------------------------- + * + * Form H * A or H' * A where A = ( A1 A2 ) + * + */ + + /* + * W = A1 + A2 * V': + * W = A1 + * W = W + A2 * V' + * + */ + cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), + M1 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; + transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; + + cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transW), + M1, K, N2, + zone, + (cuDoubleComplex*)A2 /* M1*N2 */, LDA2, + (cuDoubleComplex*)V /* N2*K */, LDV, + zone, + (cuDoubleComplex*)WORK /* M1*K */, LDWORK); + + if (WORKC == NULL) { + /* W = W * op(T) */ + cublasZtrmm( morse_lapack_const(MagmaRight), morse_lapack_const(MagmaUpper), + morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit), + M2, K, + zone, + (cuDoubleComplex*)T, LDT, + (cuDoubleComplex*)WORK, LDWORK); + + + /* A1 = A1 - W = A1 - W * op(T) */ + for(j = 0; j < K; j++) { + cublasZaxpy(M1, mzone, + (cuDoubleComplex*)(WORK + LDWORK*j), 1, + (cuDoubleComplex*)(A1 + LDA1*j), 1); + } + + /* A2 = A2 - W * op(V) */ + cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transA2), + M2, N2, K, + mzone, + (cuDoubleComplex*)WORK /* M2*K */, LDWORK, + (cuDoubleComplex*)V /* K*N2 */, LDV, + zone, + (cuDoubleComplex*)A2 /* M2*N2 */, LDA2); + + } else { + /* A1 = A1 - W * opt(T) */ + cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(trans), + M1, K, K, + mzone, WORK, LDWORK, + T, LDT, + zone, A1, LDA1 ); + + /* Wc = op(T) * V */ + cublasZgemm( morse_lapack_const(trans), morse_lapack_const(transA2), + K, N2, K, + zone, T, LDT, + V, LDV, + zzero, WORKC, LDWORKC ); + + /* A2 = A2 - W * Wc */ + cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans), + M2, N2, K, + mzone, WORK, LDWORK, + WORKC, LDWORKC, + zone, A2, LDA2 ); + } + } + } + else { + fprintf(stderr, "Not implemented (Backward / Left or Right)"); + return MAGMA_ERR_NOT_SUPPORTED; + } + + return MORSE_SUCCESS; +} +#endif /* CHAMELEON_USE_CUBLAS_V2 */ +#endif diff --git a/cudablas/compute/cuda_ztsqrt.c b/cudablas/compute/cuda_ztsqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..8750e5a6bfd4e50d1a509d29a7cd3dd5662f2be8 --- /dev/null +++ b/cudablas/compute/cuda_ztsqrt.c @@ -0,0 +1,203 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_ztsqrt.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_ztsqrt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da1, magma_int_t ldda1, + magmaDoubleComplex *da2, magma_int_t ldda2, + magmaDoubleComplex *a2, magma_int_t lda2, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream) +{ +#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1)) +#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1)) +#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1)) +#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1)) +#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1)) +#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1)) + + int i, k, lddwork, old_i, old_ib, rows, cols; + int ib; + double _Complex one=1.; +// int lwkopt = n * nb; +// hwork[0] = *((magmaDoubleComplex*) &lwkopt); + + if (m < 0) { + return -1; + } else if (n < 0) { + return -2; + } else if (ldda2 < max(1,m)) { + return -4; + } + + k = min(m,n); + if (k == 0) { + hwork[0] = *((magmaDoubleComplex*) &one); + return MAGMA_SUCCESS; + } + + lddwork= nb; + + /* lower parts of little T must be zero: memset all to 0 for simplicity */ + memset(t, 0, nb*nb*sizeof(magmaDoubleComplex)); + cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex)); + + /* copy the first diag tile of A1 from device to host: da1 -> d */ + cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex), + da1_ref(0, 0), ldda1, + d, ldd); +// cudaMemcpy( d, da1_ref(0,0), +// nb*nb*sizeof(cuDoubleComplex), +// cudaMemcpyDeviceToHost ); + + /* copy first panel of A2 from device to host: da2 -> a2 */ +// cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex), +// da2_ref(0, 0), ldda2, +// a2, lda2); + cudaMemcpy( a2, da2_ref(0, 0), + m*nb*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost ); + + /* This is only blocked code for now */ + for (i = 0; i < n; i += nb) { + + ib = min(n-i, nb); + rows = m; + + /* Send the next panel (diagonal block of A1 & block column of A2) + to the CPU (in work_a1 and work_a2) and apply tsmqr update on the + remaining non updated panels */ + if (i>0) { + + /* copy the diag tile of A1 from device to host: da1 -> d */ + cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex), + da1_ref(i, i), ldda1, + d, ldd); +// cudaMemcpy( d, da1_ref(i,i), +// ib*ib*sizeof(cuDoubleComplex), +// cudaMemcpyDeviceToHost ); + + /* copy panel of A2 from device to host: da2 -> a2 */ + cublasGetMatrix(rows, ib, sizeof(magmaDoubleComplex), + da2_ref(0, i), ldda2, + a2, lda2); +// cudaMemcpy( a2, da2_ref(0,i), +// rows*ib*sizeof(cuDoubleComplex), +// cudaMemcpyDeviceToHost ); + + /* Apply H' to A(i:m,i+2*ib:n) from the left */ + cols = n-old_i-2*old_ib; + if (cols > 0){ + CUDA_ztsmqr( + MagmaLeft, MagmaConjTrans, + old_ib, cols, rows, cols, old_ib, old_ib, + da1_ref(old_i, old_i+2*old_ib), ldda1, + da2_ref(0, old_i+2*old_ib), ldda2, + da2_ref(0, old_i), ldda2, + dt_ref(0, old_i), lddt, + dwork, old_ib, + dwork + old_ib * cols, rows, + stream ); + } + + } + + /* compute QR factorization of the panel of A2 rows x ib */ + CORE_ztsqrt(rows, ib, ib, + (double _Complex*) d, ldd, + (double _Complex*) a2, lda2, + (double _Complex*) t, ldt, + (double _Complex*) tau, + (double _Complex*) hwork); + + /* Send the panel from A2 back to the GPU */ + cublasSetMatrix(rows, ib, sizeof(magmaDoubleComplex), + a2, lda2, + da2_ref(0, i), ldda2); +// cudaMemcpy( da2_ref(0,i), a2, +// rows*ib*sizeof(cuDoubleComplex), +// cudaMemcpyHostToDevice ); + + /* Send the triangular factor T from hwork to the GPU */ + cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), + t, ldt, + dt_ref(0, i), lddt); +// cudaMemcpy( dt_ref(0,i), t, +// ib*ib*sizeof(cuDoubleComplex), +// cudaMemcpyHostToDevice ); + + /* get back the diag tile in A1 from host to device: d -> da1 */ + cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), + d, ldd, + da1_ref(i, i), ldda1); +// cudaMemcpy( da1_ref(i, i), d, +// ib*ib*sizeof(cuDoubleComplex), +// cudaMemcpyHostToDevice ); + + /* tsmqr update on one panel forward (look ahead 1) */ + if (i + ib < n) { + + if (i+2*ib < n){ + cols = ib; + } + else{ + cols = n-i-ib; + } + + /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ + CUDA_ztsmqr( + MagmaLeft, MagmaConjTrans, + ib, cols, rows, cols, ib, ib, + da1_ref(i, i+ib), ldda1, + da2_ref(0, i+ib), ldda2, + da2_ref(0, i), ldda2, + dt_ref(0, i), lddt, + dwork, ib, + dwork + ib * cols, rows, + stream ); + old_i = i; + old_ib = ib; + } + } + +#undef da1_ref +#undef da2_ref +#undef a2_ref +#undef t_ref +#undef dt_ref +#undef d_ref + + return MORSE_SUCCESS; + +} +#endif diff --git a/cudablas/compute/cuda_ztstrf.c b/cudablas/compute/cuda_ztstrf.c new file mode 100644 index 0000000000000000000000000000000000000000..1632ba706bd266e01dfef1af586f88d9e6bca6bf --- /dev/null +++ b/cudablas/compute/cuda_ztstrf.c @@ -0,0 +1,48 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_ztstrf.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_ztstrf( + char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, + cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, + cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, + cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, + magma_int_t *ipiv, + cuDoubleComplex *hwork, magma_int_t ldhwork, + cuDoubleComplex *dwork, magma_int_t lddwork, + magma_int_t *info) +{ + + magma_ztstrf_gpu( storev, m, n, ib, nb, + hU, ldhu, dU, lddu, + hA, ldha, dA, ldda, + hL, ldhl, dL, lddl, + ipiv, + hwork, ldhwork, dwork, lddwork, + info ); + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zunmlqt.c b/cudablas/compute/cuda_zunmlqt.c new file mode 100644 index 0000000000000000000000000000000000000000..5268acea872e5d73644e3efe53b0cc9f26b4f8dd --- /dev/null +++ b/cudablas/compute/cuda_zunmlqt.c @@ -0,0 +1,133 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zunmlqt.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zunmlqt( + magma_side_t side, magma_trans_t trans, + magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB, + const magmaDoubleComplex *A, magma_int_t LDA, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *C, magma_int_t LDC, + magmaDoubleComplex *WORK, magma_int_t LDWORK ) +{ + int i, kb; + int i1, i3; + int nq, nw; + int ic = 0; + int jc = 0; + int ni = N; + int mi = M; + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + /* + * NQ is the order of Q and NW is the minimum dimension of WORK + */ + if (side == MagmaLeft) { + nq = M; + nw = N; + } + else { + nq = N; + nw = M; + } + + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if (M < 0) { + return -3; + } + if (N < 0) { + return -4; + } + if ((K < 0) || (K > nq)) { + return -5; + } + if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { + return -6; + } + if ((LDA < max(1,K)) && (K > 0)) { + return -8; + } + if ((LDC < max(1,M)) && (M > 0)) { + return -12; + } + if ((LDWORK < max(1,nw)) && (nw > 0)) { + return -14; + } + + /* Quick return */ + if ((M == 0) || (N == 0) || (K == 0)) + return MAGMA_SUCCESS; + + if (((side == MagmaLeft) && (trans == MagmaNoTrans)) + || ((side == MagmaRight) && (trans != MagmaNoTrans))) { + i1 = 0; + i3 = IB; + } + else { + i1 = ( ( K-1 ) / IB )*IB; + i3 = -IB; + } + + if( trans == MorseNoTrans) { + trans = MorseConjTrans; + } + else { + trans = MorseNoTrans; + } + + for(i = i1; (i >- 1) && (i < K); i+=i3 ) { + kb = min(IB, K-i); + + if (side == MagmaLeft) { + /* + * H or H' is applied to C(i:m,1:n) + */ + mi = M - i; + ic = i; + } + else { + /* + * H or H' is applied to C(1:m,i:n) + */ + ni = N - i; + jc = i; + } + + magma_zlarfb_gpu( side, trans, MagmaForward, MagmaRowwise, + mi, ni, kb, + A + LDA * i + i, LDA, + T + LDT * i, LDT, + C + LDC * jc + ic, LDC, + WORK, LDWORK); + } + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zunmqrt.c b/cudablas/compute/cuda_zunmqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..786d0184119cbb7bbd827a0e33071cbedfda0391 --- /dev/null +++ b/cudablas/compute/cuda_zunmqrt.c @@ -0,0 +1,127 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zunmqrt.c + * + * MORSE cudablas kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zunmqrt( + magma_side_t side, magma_trans_t trans, + magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB, + const magmaDoubleComplex *A, magma_int_t LDA, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *C, magma_int_t LDC, + magmaDoubleComplex *WORK, magma_int_t LDWORK ) +{ + int i, kb; + int i1, i3; + int nq, nw; + int ic = 0; + int jc = 0; + int ni = N; + int mi = M; + + /* Check input arguments */ + if ((side != MagmaLeft) && (side != MagmaRight)) { + return -1; + } + /* + * NQ is the order of Q and NW is the minimum dimension of WORK + */ + if (side == MagmaLeft) { + nq = M; + nw = N; + } + else { + nq = N; + nw = M; + } + + if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) { + return -2; + } + if (M < 0) { + return -3; + } + if (N < 0) { + return -4; + } + if ((K < 0) || (K > nq)) { + return -5; + } + if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) { + return -6; + } + if ((LDA < max(1,nq)) && (nq > 0)) { + return -8; + } + if ((LDC < max(1,M)) && (M > 0)) { + return -12; + } + if ((LDWORK < max(1,nw)) && (nw > 0)) { + return -14; + } + + /* Quick return */ + if ((M == 0) || (N == 0) || (K == 0)) + return MAGMA_SUCCESS; + + if (((side == MagmaLeft) && (trans != MagmaNoTrans)) + || ((side == MagmaRight) && (trans == MagmaNoTrans))) { + i1 = 0; + i3 = IB; + } + else { + i1 = ( ( K-1 ) / IB )*IB; + i3 = -IB; + } + + for(i = i1; (i >- 1) && (i < K); i+=i3 ) { + kb = min(IB, K-i); + + if (side == MagmaLeft) { + /* + * H or H' is applied to C(i:m,1:n) + */ + mi = M - i; + ic = i; + } + else { + /* + * H or H' is applied to C(1:m,i:n) + */ + ni = N - i; + jc = i; + } + + magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, + mi, ni, kb, + A + LDA * i + i, LDA, + T + LDT * i, LDT, + C + LDC * jc + ic, LDC, + WORK, LDWORK); + } + + return MORSE_SUCCESS; +} +#endif diff --git a/cudablas/include/CMakeLists.txt b/cudablas/include/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9635e1f953ce9a4a86f51043b911a395d25147b6 --- /dev/null +++ b/cudablas/include/CMakeLists.txt @@ -0,0 +1,58 @@ +### +# +# @copyright (c) 2009-2014 The University of Tennessee and The University +# of Tennessee Research Foundation. +# All rights reserved. +# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. +# +### +# +# @file CMakeLists.txt +# +# @project MORSE +# MORSE is a software package provided by: +# Inria Bordeaux - Sud-Ouest, +# Univ. of Tennessee, +# King Abdullah Univesity of Science and Technology +# Univ. of California Berkeley, +# Univ. of Colorado Denver. +# +# @author Florent Pruvost +# @date 16-09-2015 +# +### + +# Generate header files +# --------------------- +set(CUDABLAS_HDRS_GENERATED "") +set(ZHDR + cudablas_z.h +) +precisions_rules_py(CUDABLAS_HDRS_GENERATED "${ZHDR}" + PRECISIONS "s;d;c;z;zc;ds" ) + +# Define the list of headers +# -------------------------- +set(CUDABLAS_HDRS + cudablas.h + ${CUDABLAS_HDRS_GENERATED} + ) + +# Force generation of headers +# --------------------------- +add_custom_target(cudablas_include ALL SOURCES ${CUDABLAS_HDRS}) + +set(HDR_INSTALL "cudablas.h") +foreach( hdr_file ${CUDABLAS_HDRS_GENERATED} ) + list(APPEND HDR_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/${hdr_file}) +endforeach() + +# installation +# ------------ +install(FILES ${HDR_INSTALL} + DESTINATION include/chameleon/cudablas/include) + +### +### END CMakeLists.txt +### diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h new file mode 100644 index 0000000000000000000000000000000000000000..ab0e6eee738dc71b3bdb391d49c436f283d43de1 --- /dev/null +++ b/cudablas/include/cudablas.h @@ -0,0 +1,81 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cudablas.h + * + * MORSE cudablas headers + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#ifndef _CUDA_BLAS_H_ +#define _CUDA_BLAS_H_ + +#include <stdio.h> +#include <math.h> +#include <string.h> + +#if defined(CHAMELEON_USE_CUDA) +#include <cuda.h> +#include <cuComplex.h> +#if defined(CHAMELEON_USE_MAGMA) +#include <magma.h> +#endif +/** **************************************************************************** + * CUDA BLAS headers + **/ +#include "cudablas/include/cudablas_z.h" +#include "cudablas/include/cudablas_d.h" +#include "cudablas/include/cudablas_c.h" +#include "cudablas/include/cudablas_s.h" +#endif + +/** **************************************************************************** + * MORSE types and constants + **/ +#include "morse_types.h" +#include "morse_struct.h" +#include "morse_constants.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************* + * Global utilities + **/ +#ifndef max +#define max(a, b) ((a) > (b) ? (a) : (b)) +#endif +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef roundup +#define roundup(a, b) (b <= 0) ? (a) : (((a) + (b)-1) & ~((b)-1)) +#endif + +/** **************************************************************************** + * LAPACK Constants + **/ +extern char *morse_lapack_constants[]; +#define morse_lapack_const(morse_const) morse_lapack_constants[morse_const][0] + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h new file mode 100644 index 0000000000000000000000000000000000000000..c63fa090111d8342aa12490a0ee3bca13079ceee --- /dev/null +++ b/cudablas/include/cudablas_z.h @@ -0,0 +1,197 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2015 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cudablas_z.h + * + * MORSE cudablas headers + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#ifndef _MORSE_CUDA_ZBLAS_H_ +#define _MORSE_CUDA_ZBLAS_H_ + +#define COMPLEX + +#ifdef __cplusplus +extern "C" { +#endif + +/** **************************************************************************** + * Declarations of cuda kernels - alphabetical order + **/ +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zgelqt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da, magma_int_t ldda, + magmaDoubleComplex *v, magma_int_t ldv, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream); +int CUDA_zgemerge( + magma_side_t side, magma_diag_t diag, + magma_int_t M, magma_int_t N, + magmaDoubleComplex *A, magma_int_t LDA, + magmaDoubleComplex *B, magma_int_t LDB, + CUstream stream); +int CUDA_zgeqrt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da, magma_int_t ldda, + magmaDoubleComplex *v, magma_int_t ldv, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream); +int CUDA_zgessm( + char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, + magma_int_t *ipiv, + cuDoubleComplex *dL1, magma_int_t lddl1, + cuDoubleComplex *dL, magma_int_t lddl, + cuDoubleComplex *dA, magma_int_t ldda, + magma_int_t *info); +int CUDA_zgetrf_incpiv( + char storev, magma_int_t m, magma_int_t n, magma_int_t ib, + cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, + cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, + magma_int_t *ipiv, + cuDoubleComplex *dwork, magma_int_t lddwork, + magma_int_t *info); +int CUDA_zgetrf_nopiv( + magma_int_t m, magma_int_t n, + cuDoubleComplex *dA, magma_int_t ldda, + magma_int_t *info); +int CUDA_zlauum( + char uplo, magma_int_t n, + cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_zparfb( + magma_side_t side, magma_trans_t trans, + magma_direct_t direct, magma_storev_t storev, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t L, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream); +int CUDA_zpotrf( + magma_uplo_t uplo, magma_int_t n, + magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_zssssm( + magma_storev_t storev, magma_int_t m1, magma_int_t n1, + magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, + magmaDoubleComplex *dA1, magma_int_t ldda1, + magmaDoubleComplex *dA2, magma_int_t ldda2, + magmaDoubleComplex *dL1, magma_int_t lddl1, + magmaDoubleComplex *dL2, magma_int_t lddl2, + magma_int_t *IPIV, magma_int_t *info); +int CUDA_ztrtri( + magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, + magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_ztslqt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da1, magma_int_t ldda1, + magmaDoubleComplex *da2, magma_int_t ldda2, + magmaDoubleComplex *a2, magma_int_t lda2, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream); +int CUDA_ztsmlq( + magma_side_t side, magma_trans_t trans, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t IB, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream); +int CUDA_ztsmqr( + magma_side_t side, magma_trans_t trans, + magma_int_t M1, magma_int_t N1, + magma_int_t M2, magma_int_t N2, + magma_int_t K, magma_int_t IB, + magmaDoubleComplex *A1, magma_int_t LDA1, + magmaDoubleComplex *A2, magma_int_t LDA2, + const magmaDoubleComplex *V, magma_int_t LDV, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *WORK, magma_int_t LDWORK, + magmaDoubleComplex *WORKC, magma_int_t LDWORKC, + CUstream stream); +int CUDA_ztsqrt( + magma_int_t m, magma_int_t n, magma_int_t nb, + magmaDoubleComplex *da1, magma_int_t ldda1, + magmaDoubleComplex *da2, magma_int_t ldda2, + magmaDoubleComplex *a2, magma_int_t lda2, + magmaDoubleComplex *dt, magma_int_t lddt, + magmaDoubleComplex *t, magma_int_t ldt, + magmaDoubleComplex *dd, + magmaDoubleComplex *d, magma_int_t ldd, + magmaDoubleComplex *tau, + magmaDoubleComplex *hwork, + magmaDoubleComplex *dwork, + CUstream stream); +int CUDA_ztstrf( + char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, + cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, + cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, + cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, + magma_int_t *ipiv, + cuDoubleComplex *hwork, magma_int_t ldhwork, + cuDoubleComplex *dwork, magma_int_t lddwork, + magma_int_t *info); +int CUDA_zunmlqt( + magma_side_t side, magma_trans_t trans, + magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB, + const magmaDoubleComplex *A, magma_int_t LDA, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *C, magma_int_t LDC, + magmaDoubleComplex *WORK, magma_int_t LDWORK ); +int CUDA_zunmqrt( + magma_side_t side, magma_trans_t trans, + magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB, + const magmaDoubleComplex *A, magma_int_t LDA, + const magmaDoubleComplex *T, magma_int_t LDT, + magmaDoubleComplex *C, magma_int_t LDC, + magmaDoubleComplex *WORK, magma_int_t LDWORK ); +#endif + +#ifdef __cplusplus +} +#endif + +#undef COMPLEX + +#endif