Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 945e4dec authored by HOURCAU Ana's avatar HOURCAU Ana Committed by Mathieu Faverge
Browse files

compute: Adding hered algorithms for reduction on hermitian matrices

parent 46ba65f9
No related branches found
No related tags found
1 merge request!474Mixed Precision / Fix small issues in the conversion kernels for a better propagation of the information in distributed
......@@ -27,7 +27,8 @@
# @author Alycia Lisito
# @author Loris Lucido
# @author Matthieu Kuhn
# @date 2024-04-03
# @author Ana Hourcau
# @date 2024-07-17
#
###
......@@ -193,10 +194,12 @@ set(ZSRC
##################
# MIXED PRECISION
##################
pzhered.c
pzlag2c.c
pzgered.c
pzgerst.c
###
zhered.c
zgered.c
zgerst.c
#zcgels.c
......
/**
*
* @file pzhered.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zhered parallel algorithm
*
* @version 1.3.0
* @author Mathieu Faverge
* @author Ana Hourcau
* @date 2024-07-17
* @precisions normal z -> z d
*
*/
// ALLOC_WS : A->mb
// ALLOC_WS : A->nb
// WS_ADD : A->mb + A->nb
#include "control/common.h"
#include <coreblas/lapacke.h>
#define A(m, n) A, (m), (n)
#define W(desc, m, n) (desc), (m), (n)
static inline void
chameleon_pzhered_frb( cham_trans_t trans, cham_uplo_t uplo,
CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt,
RUNTIME_option_t *options )
{
double alpha = 1.0;
double beta = 0.0;
int m, n;
int MT = A->mt;
int NT = A->nt;
int M = A->m;
int N = A->n;
int P = Welt->p;
int Q = Welt->q;
/* Initialize workspaces for tile norms */
for (m = 0; m < Wnorm->mt; m++)
{
for (n = 0; n < NT; n++)
{
INSERT_TASK_dlaset(
options,
ChamUpperLower, Wnorm->mb, Wnorm->nb,
alpha, beta,
W(Wnorm, m, n));
}
}
/* Initialize workspaces */
for (m = 0; m < Welt->mt; m++)
{
for (n = 0; n < Welt->nt; n++)
{
INSERT_TASK_dlaset(
options,
ChamUpperLower, Welt->mb, Welt->nb,
alpha, beta,
W(Welt, m, n));
}
}
/**
* Step 1:
* For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) )
*/
for (m = 0; m < MT; m++)
{
int nmin = (uplo == ChamUpper) ? m : 0;
int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT;
int tempmm = (m == (MT - 1)) ? M - m * A->mb : A->mb;
for (n = nmin; n < nmax; n++)
{
int tempnn = (n == (NT - 1)) ? N - n * A->nb : A->nb;
if (n == m)
{
if ( trans == ChamConjTrans ) {
INSERT_TASK_zhessq(
options, ChamEltwise, uplo, tempmm,
A(m, n), W( Wnorm, m, n) );
}
else {
INSERT_TASK_zsyssq(
options, ChamEltwise, uplo, tempmm,
A(m, n), W( Wnorm, m, n) );
}
}
else
{
INSERT_TASK_zgessq(
options, ChamEltwise, tempmm, tempnn,
A(m, n), W( Wnorm, m, n ));
INSERT_TASK_zgessq(
options, ChamEltwise, tempmm, tempnn,
A(m, n), W( Wnorm, n, m ));
}
}
}
for(m = 0; m < MT; m++) {
for(n = Q; n < NT; n++) {
INSERT_TASK_dplssq(
options, ChamEltwise, 1, 1, W( Wnorm, m, n), W( Welt, m, n%Q) );
}
/**
* Step 2:
* For each j, W(m, j) = reduce( W( Welt, m, 0..Q-1) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_dplssq(
options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
}
}
/**
* Step 3:
* For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) )
*/
for(m = P; m < MT; m++) {
INSERT_TASK_dplssq(
options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
}
/**
* Step 4:
* For each i, Welt(i, n) = max( Welt(0..P-1, n) )
*/
for(m = 1; m < P; m++) {
INSERT_TASK_dplssq(
options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
}
/* Compute the norm of each tile, and the full norm */
for (m = 0; m < MT; m++)
{
int nmin = (uplo == ChamUpper) ? m : 0;
int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT;
for (n = nmin; n < nmax; n++)
{
/* Compute the final norm of the tile */
INSERT_TASK_dplssq2(
options, 1, W( Wnorm, m, n ) );
}
}
INSERT_TASK_dplssq2(
options, 1, W( Welt, 0, 0) );
/**
* Broadcast the result
*/
for (m = 0; m < A->p; m++)
{
for (n = 0; n < A->q; n++)
{
if ((m != 0) || (n != 0))
{
INSERT_TASK_dlacpy(
options,
ChamUpperLower, 1, 1,
W(Welt, 0, 0), W(Welt, m, n));
}
}
}
}
/**
*
*/
void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
CHAM_desc_t Wcol;
CHAM_desc_t Welt;
double gnorm, lnorm, threshold, eps;
int workmt, worknt;
int m, n;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS)
{
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
workmt = chameleon_max(A->mt, A->p);
worknt = chameleon_max(A->nt, A->q);
RUNTIME_options_ws_alloc(&options, 1, 0);
/* Matrix to store the norm of each element */
chameleon_desc_init(&Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
A->mt * 2, A->nt, 0, 0, A->mt * 2, A->nt, A->p, A->q,
NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg);
/* Matrix to compute the global frobenius norm */
chameleon_desc_init(&Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
workmt * 2, worknt, 0, 0, workmt * 2, worknt, A->p, A->q,
NULL, NULL, NULL, NULL);
chameleon_pzhered_frb( trans, uplo, A, &Wcol, &Welt, &options );
CHAMELEON_Desc_Flush(&Wcol, sequence);
CHAMELEON_Desc_Flush(&Welt, sequence);
CHAMELEON_Desc_Flush(A, sequence);
RUNTIME_sequence_wait(chamctxt, sequence);
gnorm = *((double *)Welt.get_blkaddr(&Welt, A->myrank / A->q, A->myrank % A->q));
chameleon_desc_destroy(&Welt);
/**
* Reduce the precision of the tiles if possible
*/
if (prec < 0.)
{
#if !defined(CHAMELEON_SIMULATION)
eps = LAPACKE_dlamch_work('e');
#else
#if defined(PRECISION_z) || defined(PRECISION_d)
eps = 1.e-15;
#else
eps = 1.e-7;
#endif
#endif
}
else
{
eps = prec;
}
threshold = (eps * gnorm) / (double)(chameleon_min(A->mt, A->nt));
#if defined(CHAMELEON_DEBUG_GERED)
fprintf(stderr,
"[%2d] The norm of A is: %e\n"
"[%2d] The requested precision is: %e\n"
"[%2d] The computed threshold is: %e\n",
A->myrank, gnorm,
A->myrank, eps,
A->myrank, threshold);
#endif
for (m = 0; m < A->mt; m++)
{
int tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
int nmin = (uplo == ChamUpper) ? m : 0;
int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, A->nt) : A->nt;
for (n = nmin; n < nmax; n++)
{
CHAM_tile_t *tile = A->get_blktile(A, m, n);
int tempnn = (n == (A->nt - 1)) ? A->n - n * A->nb : A->nb;
/*
* u_{high} = 1e-16 (later should be application accuracy)
* u_{low} = 1e-8
* ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low})
* ||A_{i,j}||_F < threshold / u_{low}
*/
INSERT_TASK_zgered( &options, threshold,
tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) );
}
}
CHAMELEON_Desc_Flush(A, sequence);
RUNTIME_sequence_wait(chamctxt, sequence);
chameleon_desc_destroy(&Wcol);
RUNTIME_options_ws_free(&options);
RUNTIME_options_finalize(&options, chamctxt);
}
/**
*
* @file zhered.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zhered wrappers
*
* @version 1.3.0
* @author Mathieu Faverge
* @author Ana Hourcau
* @date 2024-07-17
* @precisions normal z -> z d
*
*/
#include "control/common.h"
/**
********************************************************************************
*
* @ingroup CHAMELEON_Complex64_t_Tile
*
* @brief Computes the Cholesky factorization of a symmetric positive definite
* or Hermitian positive definite matrix with mixed precision.
*
* This is the synchronous version of CHAMELEON_zheredinit_Tile_Async(). It
* operates on matrices stored by tiles with tiles of potentially different
* precisions. All matrices are passed through descriptors. All dimensions are
* taken from the descriptors.
*
*******************************************************************************
*
* @param[in] uplo
* = ChamUpper: Upper triangle of A is stored;
* = ChamLower: Lower triangle of A is stored.
*
* @param[in] A
* On entry, the symmetric positive definite (or Hermitian) matrix A.
* If uplo = ChamUpper, the leading N-by-N upper triangular part of A
* contains the upper triangular part of the matrix A, and the strictly lower triangular
* part of A is not referenced.
* If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower
* triangular part of the matrix A, and the strictly upper triangular part of A is not
* referenced.
* On exit, if return value = 0, the factor U or L from the Cholesky factorization
* A = U^H*U or A = L*L^H.
*
*******************************************************************************
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval >0 if i, the leading minor of order i of A is not positive definite, so the
* factorization could not be completed, and the solution has not been computed.
*
*******************************************************************************
*
* @sa CHAMELEON_zhered
* @sa CHAMELEON_zhered_Tile_Async
* @sa CHAMELEON_cpotrfmp_Tile
* @sa CHAMELEON_dpotrfmp_Tile
* @sa CHAMELEON_spotrfmp_Tile
* @sa CHAMELEON_zpotrs_Tile
*
*/
int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double precision, CHAM_desc_t *A )
{
CHAM_context_t *chamctxt;
RUNTIME_sequence_t *sequence = NULL;
RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
int status;
chamctxt = chameleon_context_self();
if (chamctxt == NULL) {
chameleon_fatal_error("CHAMELEON_zheredinit_Tile", "CHAMELEON not initialized");
return CHAMELEON_ERR_NOT_INITIALIZED;
}
chameleon_sequence_create( chamctxt, &sequence );
CHAMELEON_zhered_Tile_Async( uplo, precision, A, sequence, &request );
CHAMELEON_Desc_Flush( A, sequence );
chameleon_sequence_wait( chamctxt, sequence );
status = sequence->status;
chameleon_sequence_destroy( chamctxt, sequence );
return status;
}
/**
********************************************************************************
*
* @ingroup CHAMELEON_Complex64_t_Tile_Async
*
* @brief Computes the Cholesky factorization of a symmetric positive definite
* or Hermitian positive definite matrix with mixed precision.
*
* This is the non-blocking equivalent of CHAMELEON_zhered_Tile(). It
* operates on matrices stored by tiles with tiles of potentially different
* precisions. All matrices are passed through descriptors. All dimensions are
* taken from the descriptors. It may return before the computation is
* finished. This function allows for pipelining operations at runtime.
*
*******************************************************************************
*
* @param[in] sequence
* Identifies the sequence of function calls that this call belongs to
* (for completion checks and exception handling purposes).
*
* @param[out] request
* Identifies this function call (for exception handling purposes).
*
*******************************************************************************
*
* @sa CHAMELEON_zhered
* @sa CHAMELEON_zhered_Tile
* @sa CHAMELEON_cpotrfmp_Tile_Async
* @sa CHAMELEON_dpotrfmp_Tile_Async
* @sa CHAMELEON_spotrfmp_Tile_Async
* @sa CHAMELEON_zpotrs_Tile_Async
*
*/
int CHAMELEON_zhered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
chamctxt = chameleon_context_self();
if (chamctxt == NULL) {
chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "CHAMELEON not initialized");
return CHAMELEON_ERR_NOT_INITIALIZED;
}
if (sequence == NULL) {
chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL sequence");
return CHAMELEON_ERR_UNALLOCATED;
}
if (request == NULL) {
chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL request");
return CHAMELEON_ERR_UNALLOCATED;
}
/* Check sequence status */
if (sequence->status == CHAMELEON_SUCCESS) {
request->status = CHAMELEON_SUCCESS;
}
else {
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED);
}
/* Check descriptors for correctness */
if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) {
chameleon_error("CHAMELEON_zhered_Tile_Async", "invalid descriptor");
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
}
/* Check input arguments */
if (A->nb != A->mb) {
chameleon_error("CHAMELEON_zhered_Tile_Async", "only square tiles supported");
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
}
/*
* Quick return
*/
if ( chameleon_max( A->m, A->n ) == 0 ) {
return CHAMELEON_SUCCESS;
}
chameleon_pzhered( ChamConjTrans, uplo, precision, A, sequence, request );
return CHAMELEON_SUCCESS;
}
......@@ -22,7 +22,8 @@
* @author Alycia Lisito
* @author Matthieu Kuhn
* @author Lionel Eyraud-Dubois
* @date 2023-09-08
* @author Ana Hourcau
* @date 2024-07-17
* @precisions normal z -> c d s
*
*/
......@@ -81,6 +82,8 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64
#if defined(PRECISION_z) || defined(PRECISION_d)
void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
void chameleon_pzgerst( cham_uplo_t uplo, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
#endif
......
......@@ -170,6 +170,7 @@ int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B);
int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A );
int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A );
int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A );
int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A);
......@@ -249,6 +250,7 @@ int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequen
int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
int CHAMELEON_zhered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment