diff --git a/coreblas/compute/core_zgetrf_panel.c b/coreblas/compute/core_zgetrf_panel.c index f3467b1a3f750b0d87263fa2acda337ac3c00820..f96dcd5129514f5d8d0479dffd0b58e78d4e9a03 100644 --- a/coreblas/compute/core_zgetrf_panel.c +++ b/coreblas/compute/core_zgetrf_panel.c @@ -12,7 +12,7 @@ * @version 1.3.0 * @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-09-11 * @precisions normal z -> c d s * */ @@ -20,6 +20,7 @@ #include "coreblas.h" static const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; +static const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; /** ****************************************************************************** @@ -83,13 +84,18 @@ static const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; * */ int -CORE_zgetrf_panel_diag( int m, int n, int h, int m0, +CORE_zgetrf_panel_diag( int m, int n, int h, int m0, int ib, CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *U, int ldu, int *IPIV, CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv ) { CHAMELEON_Complex64_t *subA = A + lda * h + h; CHAMELEON_Complex64_t alpha; + /* Compute localn, the number of columns to update inside the block */ + int localnt = chameleon_ceil( chameleon_min(m, n), ib ); + int localn = (( h / ib ) == (localnt-1)) ? n - ( h / ib ) * ib : ib; + if ( h > 0 ) { CHAMELEON_Complex64_t *L, *pivrow; @@ -110,6 +116,12 @@ CORE_zgetrf_panel_diag( int m, int n, int h, int m0, A + h-1, lda ); } + if( (U != NULL) && (h%ib != 0 ) ) + { + cblas_zcopy( n, pivrow, 1, + U + ((h-1)%ib), ldu ); + } + /* Store the pivot */ IPIV[h-1] = pivot + 1; @@ -136,11 +148,30 @@ CORE_zgetrf_panel_diag( int m, int n, int h, int m0, if ( h < chameleon_min( m, n ) ) { /* Applying the update */ - cblas_zgeru(CblasColMajor, m-h, n-h, - CBLAS_SADDR(mzone), - L, 1, - pivrow + h, 1, - subA, lda ); + if ( (h % ib) != 0 ) { + /* We apply a BLAS-2 operation with a single outer-product */ + cblas_zgeru( CblasColMajor, m-h, localn-(h%ib), + CBLAS_SADDR(mzone), + L, 1, + pivrow + h, 1, + subA, lda ); + } + else { + /* We apply a BLAS-3 operation with a matrix-matrix product */ + const CHAMELEON_Complex64_t *L = A + (h-ib) * lda + h; + const CHAMELEON_Complex64_t *subU = U + h * ldu; + + LAPACKE_zlacpy_work( + LAPACK_COL_MAJOR, 'A', ib, n - h, + subU, ldu, A + h * lda + (h-ib), lda ); + + cblas_zgemm( CblasColMajor, + (CBLAS_TRANSPOSE)CblasNoTrans, (CBLAS_TRANSPOSE)CblasNoTrans, + m - h, n - h, ib, + CBLAS_SADDR(mzone), L, lda, + subU, ldu, + CBLAS_SADDR( zone), subA, lda ); + } } else { return 0; @@ -225,13 +256,18 @@ CORE_zgetrf_panel_diag( int m, int n, int h, int m0, * */ int -CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, +CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, int ib, CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *U, int ldu, CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv ) { CHAMELEON_Complex64_t *subA = A + lda * h; CHAMELEON_Complex64_t alpha; + /* Compute the number of columns to update inside the block */ + int localnt = chameleon_ceil( n, ib ); + int localn = (( h / ib ) == (localnt-1)) ? n - ( h / ib ) * ib : ib; + if ( h != 0 ) { CHAMELEON_Complex64_t *L, *pivrow; @@ -266,11 +302,26 @@ CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, */ if ( h < n ) { /* Applying the update */ - cblas_zgeru(CblasColMajor, m, n-h, - CBLAS_SADDR(mzone), - L, 1, - pivrow + h, 1, - subA, lda ); + if ( (h % ib) != 0 ) { + /* We apply a BLAS-2 operation with a single outer-product */ + cblas_zgeru( CblasColMajor, m, localn-(h%ib), + CBLAS_SADDR(mzone), + L, 1, + pivrow + h, 1, + subA, lda ); + } + else { + /* We apply a BLAS-3 operation with a matrix-matrix product */ + const CHAMELEON_Complex64_t *L = A + (h-ib) * lda; + const CHAMELEON_Complex64_t *subU = U + h * ldu; + + cblas_zgemm( CblasColMajor, + (CBLAS_TRANSPOSE)CblasNoTrans, (CBLAS_TRANSPOSE)CblasNoTrans, + m, n - h, ib, + CBLAS_SADDR(mzone), L, lda, + subU, ldu, + CBLAS_SADDR( zone), subA, lda ); + } } else { return 0; diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index e5e68e2989f11f56517714e2692adb4ab95771a9..79247f791dd8a2079a71d2899dc1b99d7931ddc7 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -22,7 +22,7 @@ * @author Cedric Castagnede * @author Florent Pruvost * @author Matthieu Kuhn - * @date 2023-08-31 + * @date 2023-09-11 * @precisions normal z -> c d s * */ @@ -87,12 +87,14 @@ int CORE_zgetrf_incpiv(int M, int N, int IB, int CORE_zgetrf_nopiv(int M, int N, int IB, CHAMELEON_Complex64_t *A, int LDA, int *INFO); -int CORE_zgetrf_panel_diag( int m, int n, int h, int m0, +int CORE_zgetrf_panel_diag( int m, int n, int h, int m0, int ib, CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *U, int ldu, int *IPIV, CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv); -int CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, +int CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, int ib, CHAMELEON_Complex64_t *A, int lda, + CHAMELEON_Complex64_t *U, int ldu, CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv ); int CORE_zgetrf_reclap(int M, int N, CHAMELEON_Complex64_t *A, int LDA,