diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 40baddc598df9da3ce04b58a54b7fe645435b564..d458f074e25ef600048366ec650673df488694ef 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -53,6 +53,7 @@ set(ZSRC core_zgetrf.c core_zgetrf_incpiv.c core_zgetrf_nopiv.c + core_zgetrf_panel.c core_zgram.c core_zhe2ge.c core_zherfb.c diff --git a/coreblas/compute/core_zgetrf_panel.c b/coreblas/compute/core_zgetrf_panel.c new file mode 100644 index 0000000000000000000000000000000000000000..2ec4b23a8f3e2f7cb37ff08f0721d96f1ed6dd9e --- /dev/null +++ b/coreblas/compute/core_zgetrf_panel.c @@ -0,0 +1,299 @@ +/** + * + * @file core_zgetrf_panel.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zgetrf with partial pivoting CPU kernel + * + * @version 1.2.0 + * @author Mathieu Faverge + * @author Matthieu Kuhn + * @date 2022-02-22 + * @precisions normal z -> c d s + * + */ +#include "coreblas/lapacke.h" +#include "coreblas.h" + +static const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; + +/** + ****************************************************************************** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zgetrf_panel_diag computes the LU factorization of a single column on + * the diagonal block. Works in combination with CORE_zgetrf_panel_offdiag() to + * perform the full LU factorization with partial pivoting of a single column in + * a panel. + * + ******************************************************************************* + * + * @param[in] m + * The number of rows of the matrix A. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A. n >= 0. + * + * @param[in] h + * The index of the column to factorize in the matrix A. + * + * @param[in,out] A + * On entry, the matrix A where column h-1 needs to be factorized, and + * pivot for column h needs to be selected. + * The lower pentagonal part m-by-(h-1) are the L factors of the LU + * decomposition of A for the first (h-1) columns/rows. + * The upper pentagonal part (h-1)-by-n are the U factors of the LU + * decomposition of A for the first (h-1) columns/rows. + * On exit, A is factorized up to the (h-1)^th column/row and the + * remaining part is updated. The pivot of column h is selected. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[in,out] IPIV + * On entry, the pivot array of size min(m,n) with the first h-2 columns initialized. + * On exit, IPIV[h-1] is updated with the selected pivot for the previous column. + * + * @param[in,out] nextpiv + * On entry, the allocated and initialized CHAM_piv_t structure to + * store the information related to pivot at stage h. + * On exit, the diagrow is initialized with the diagonal h row from A, + * and the pivot row is updated if a better pivot is found for step h. + * + * @param[in] prevpiv + * Holds the CHAM_piv_t structure of the previous (h-1) step with the + * (h-1) diagonal row, and the pivot selected row for (h-1) step. + * Not referenced if h is equal to 0. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ +int +CORE_zgetrf_panel_diag( int m, int n, int h, int m0, + CHAMELEON_Complex64_t *A, int lda, + int *IPIV, CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv ) +{ + CHAMELEON_Complex64_t *subA = A + lda * h + h; + CHAMELEON_Complex64_t alpha; + + if ( h > 0 ) { + CHAMELEON_Complex64_t *L, *pivrow; + + int pivot_blkm0 = prevpiv->blkm0; + int pivot_blkidx = prevpiv->blkidx; + int pivot = pivot_blkm0 + pivot_blkidx; + + pivrow = (CHAMELEON_Complex64_t *)(prevpiv->pivrow); + + /* + * Copy the previously selected row to the right place if it's not the original + * diagonal row or if does not belong to the diagonal block + */ + if( (pivot_blkm0 != m0) || + ( (pivot_blkm0 == m0) && (pivot_blkidx != (h-1)) ) ) + { + cblas_zcopy( n, pivrow, 1, + A + h-1, lda ); + } + + /* Store the pivot */ + IPIV[h-1] = pivot + 1; + + /* + * Copy the diagonal row in place of the selected one (previous pivot) to the right place if: + * - we own the previous pivot + * - pivot index was not last step index + */ + if( (pivot_blkm0 == m0 ) && + (pivot_blkidx != h-1) ) + { + cblas_zcopy( n, prevpiv->diagrow, 1, + A + pivot_blkidx, lda ); + } + + /* Take the pointer to the element below the current diagonal element */ + L = A + lda * (h-1) + h; + + if ( h < m ) { + /* Scaling off-diagonal terms */ + alpha = (CHAMELEON_Complex64_t)1. / pivrow[h-1]; + cblas_zscal( m-h, CBLAS_SADDR( alpha ), L, 1 ); + } + + /* + * h is compared only to n, because if we are on the last column of a + * tile, m might be much smaller than n, and still we need to apply + * the geru call. If this is the diagonal tile, we will just look for + * the next maximum for nothing. + */ + if ( h < n ) { + /* Applying the update */ + cblas_zgeru(CblasColMajor, m-h, n-h, + CBLAS_SADDR(mzone), + L, 1, + pivrow + h, 1, + subA, lda ); + } + else { + return 0; + } + } + + /* Looking for the local max index */ + { + CHAMELEON_Complex64_t *next_pivrow = (CHAMELEON_Complex64_t *)(nextpiv->pivrow); + int index; + + index = cblas_izamax( m-h, subA, 1 ); + alpha = subA[index]; + + if ( cabs(alpha) > cabs(next_pivrow[h]) ) { + /* We save this index as a potential pivot for the next column of the panel */ + nextpiv->blkm0 = m0; /* block index */ + nextpiv->blkidx = index + h; /* index in local numbering */ + + /* Save the row holding the selected pivot */ + cblas_zcopy( n, A + index + h, lda, + next_pivrow, 1 ); + } + } + + /* Store current diagonal row (in full) into pivot structure */ + cblas_zcopy( n, A + h, lda, + nextpiv->diagrow, 1 ); + return 0; +} + +/** + ****************************************************************************** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zgetrf_panel_offdiag computes the LU factorization of a single column on + * an off-diagonal block. Works in combination with CORE_zgetrf_panel_diag() to + * perform the full LU factorization with partial pivoting of a single column in + * a panel. + * + ******************************************************************************* + * + * @param[in] m + * The number of rows of the matrix A. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A. n >= 0. + * + * @param[in] h + * The index of the column to factorize in the matrix A. + * + * @param[in,out] A + * On entry, the matrix A where column h-1 needs to be factorized, and + * pivot for column h needs to be selected. + * The lower pentagonal part m-by-(h-1) are the L factors of the LU + * decomposition of A for the first (h-1) columns/rows. + * The upper pentagonal part (h-1)-by-n are the U factors of the LU + * decomposition of A for the first (h-1) columns/rows. + * On exit, A is factorized up to the (h-1)^th column/row and the + * remaining part is updated. The pivot of column h is selected. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[in,out] nextpiv + * On entry, the allocated and initialized CHAM_piv_t structure to + * store the information related to pivot at stage h. + * On exit, the diagrow is initialized with the diagonal h row from A, + * and the pivot row is updated if a better pivot is found for step h. + * + * @param[in] prevpiv + * Holds the CHAM_piv_t structure of the previous (h-1) step with the + * (h-1) diagonal row, and the pivot selected row for (h-1) step. + * Not referenced if h is equal to 0. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ +int +CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, + CHAMELEON_Complex64_t *A, int lda, + CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv ) +{ + CHAMELEON_Complex64_t *subA = A + lda * h; + CHAMELEON_Complex64_t alpha; + + if ( h != 0 ) { + CHAMELEON_Complex64_t *L, *pivrow; + + int pivot_blkm0 = prevpiv->blkm0; + int pivot_blkidx = prevpiv->blkidx; + + pivrow = (CHAMELEON_Complex64_t *)(prevpiv->pivrow); + + /* + * Copy the diagonal row in place of the selected one (previous pivot) to the right place if: + * - the previous pivot block was this one + * - it was not the previous row ?? + */ + if( pivot_blkm0 == m0 ) + { + cblas_zcopy( n, prevpiv->diagrow, 1, + A + pivot_blkidx, lda ); + } + + /* Take the pointer to the element below the current diagonal element */ + L = A + lda * (h-1); + + /* Scaling off-diagonal terms */ + alpha = (CHAMELEON_Complex64_t)1. / pivrow[h-1]; + cblas_zscal( m, CBLAS_SADDR( alpha ), L, 1 ); + + /* + * h is compared only to n, because if we are on the last column of a + * tile, m might be much smaller than n, and still we need to apply + * the geru call. If this is the diagonal tile, we will just look for + * the next maximum for nothing. + */ + if ( h < n ) { + /* Applying the update */ + cblas_zgeru(CblasColMajor, m, n-h, + CBLAS_SADDR(mzone), + L, 1, + pivrow + h, 1, + subA, lda ); + } + else { + return 0; + } + } + + /* Looking for the local max index */ + { + CHAMELEON_Complex64_t *next_pivrow = (CHAMELEON_Complex64_t *) nextpiv->pivrow; + int index; + + index = cblas_izamax( m, subA, 1 ); + alpha = subA[index]; + + if ( cabs(alpha) > cabs(next_pivrow[h]) ) + { + /* We save this index as a potential pivot for the next column of the panel */ + nextpiv->blkm0 = m0; /* block index */ + nextpiv->blkidx = index; /* index in local numbering */ + + /* Save the row holding the selected pivot */ + cblas_zcopy( n, A + index, lda, + next_pivrow, 1 ); + } + } + return 0; +} diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 050a7968769a5c37a138faee18ef0a577ee6be19..2b5f36eb8bf436ba3f4cf71bafb3ab47d88c78ad 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -86,6 +86,13 @@ int CORE_zgetrf_incpiv(int M, int N, int IB, int CORE_zgetrf_nopiv(int M, int N, int IB, CHAMELEON_Complex64_t *A, int LDA, int *INFO); +int CORE_zgetrf_panel_diag( int m, int n, int h, int m0, + CHAMELEON_Complex64_t *A, int lda, + int *IPIV, + CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv); +int CORE_zgetrf_panel_offdiag( int m, int n, int h, int m0, + CHAMELEON_Complex64_t *A, int lda, + CHAM_pivot_t *nextpiv, const CHAM_pivot_t *prevpiv ); int CORE_zgetrf_reclap(int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, int *info); diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index a6175854e0a3dcdf4841e2d5a482d2856ef2bfb2..2b324ef54d9ef40113498052627b4a06714c7a2b 100644 --- a/include/chameleon/struct.h +++ b/include/chameleon/struct.h @@ -35,6 +35,16 @@ BEGIN_C_DECLS #define CHAMELEON_TILE_DESC (1 << 1) #define CHAMELEON_TILE_HMAT (1 << 2) +/** + * @brief CHAMELEON structure to hold pivot informations for the LU factorization with partial pivoting + */ +typedef struct chameleon_pivot_s { + int blkm0; /**> The row index of the first row in the tile where the pivot has been selected */ + int blkidx; /**> The relative row index in the tile where the pivot has been selected */ + void *pivrow; /**> The copy of the row with the selected pivot */ + void *diagrow; /**> The copy of the diagonal row to permute */ +} CHAM_pivot_t; + typedef struct chameleon_tile_s { #if defined(CHAMELEON_KERNELS_TRACE) char *name;