/**
*
* @file pzlatms.c
*
* @copyright 2012-2021 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
* @copyright 2016-2020 KAUST. All rights reserved.
*
***
*
* @brief Chameleon zlatms parallel algorithm
*
* @version 1.1.0
* @author Mathieu Faverge
* @date 2020-11-19
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#include <stdlib.h>
#include <math.h>
#if !defined(CHAMELEON_SIMULATION)
#include <coreblas/random.h>
#include <coreblas.h>
#endif
#define A(m, n) A, m, n
/*
 * Static variable used to know how to access the tile data within the kernel.
 * This assumes that only one runtime is enabled at a time.
 */
static RUNTIME_id_t zlatms_runtime_id = RUNTIME_SCHED_STARPU;
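/**
 * Kernel submitted through INSERT_TASK_map() to copy the prescribed values D,
 * given through op_args, onto the diagonal of a diagonal tile of A.
 */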
static inline int
zlaset_diag( const CHAM_desc_t *descA,
cham_uplo_t uplo, int m, int n,
CHAM_tile_t *tileA, void *op_args )
{
CHAMELEON_Complex64_t *A;
const double *D = (const double *)op_args;
int tempmm = m == descA->mt-1 ? descA->m-m*descA->mb : descA->mb;
int tempnn = n == descA->nt-1 ? descA->n-n*descA->nb : descA->nb;
int minmn = chameleon_min( tempmm, tempnn );
int lda, i;
if ( zlatms_runtime_id == RUNTIME_SCHED_PARSEC ) {
A = (CHAMELEON_Complex64_t*)tileA;
lda = descA->get_blkldd( descA, m );
}
else {
A = tileA->mat;
lda = tileA->ld;
}
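    /* The map operation is restricted to diagonal tiles */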
assert( m == n );
/* Shift to the values corresponding to the tile */
D += m * descA->mb;
for( i=0; i<minmn; i++, D++ ) {
*A = *D;
A += lda + 1;
}
(void)uplo;
return 0;
}
/**
 * Parallel generation of a random matrix with prescribed singular values
 * (or eigenvalues): A = U * D * V, where U and V are random unitary matrices
 */
void chameleon_pzlatms( cham_dist_t idist, unsigned long long int seed, cham_sym_t sym,
double *D, int mode, double cond, double dmax, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
CHAM_desc_t descU, descV;
CHAM_desc_t *descDptr = NULL;
CHAM_desc_t descTS, descTT, descD;
libhqr_matrix_t mat;
libhqr_tree_t qrtree;
cham_trans_t trans = ChamConjTrans;
int n, ib, alloc_d = 0;
int kt = chameleon_min( A->mt, A->nt );
int minmn = chameleon_min( A->m, A->n );
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
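    /* Inner blocking size used by the QR/LQ kernels */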
ib = CHAMELEON_IB;
zlatms_runtime_id = chamctxt->scheduler;
RUNTIME_options_init(&options, chamctxt, sequence, request);
    /* Start by initializing A to zero */
chameleon_pzlaset( ChamUpperLower, 0., 0., A, sequence, request );
/* Check if we need to perform UDV or UDU' */
if ( sym == ChamNonsymPosv ) {
trans = ChamNoTrans;
}
else if ( sym == ChamSymPosv ) {
trans = ChamTrans;
}
#if !defined(CHAMELEON_SIMULATION)
/* Compute the diagonal D */
{
int irsign, rc;
/* Check what to do with the sign in dlatm1 */
#if defined(PRECISION_z) || defined(PRECISION_c)
irsign = ( sym == ChamHermGeev );
#else
irsign = ( sym == ChamHermGeev ) || ( sym == ChamSymPosv );
#endif
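        /* Allocate D if the user did not provide prescribed values */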
if ( D == NULL ) {
D = malloc( minmn * sizeof(double) );
alloc_d = 1;
}
rc = CORE_dlatm1( mode, cond, irsign, idist, seed, D, minmn );
if ( rc != CHAMELEON_SUCCESS ) {
            chameleon_error( "CHAMELEON_zlatms", "Could not generate D: dlatm1 failed" );
CHAMELEON_Desc_Flush( A, sequence );
CHAMELEON_Sequence_Wait( sequence );
if ( alloc_d ) {
free( D );
}
return;
}
/* Shift the seed for future use to generate the random unitary matrices */
seed = CORE_rnd64_jump( 2 * minmn, seed );
/* Scale by dmax */
if ( ( mode != 0 ) && ( abs(mode) != 6 ) )
{
int imax = cblas_idamax( minmn, D, 1 );
            /* D may contain negative eigenvalues: scale with respect to the entry of largest magnitude */
            double temp = fabs( D[imax] );
if ( temp > 0. ) {
cblas_dscal( minmn, dmax / temp, D, 1 );
}
else {
chameleon_error( "CHAMELEON_zlatms", "Could not scale D (max sing. value is 0.)" );
CHAMELEON_Desc_Flush( A, sequence );
CHAMELEON_Sequence_Wait( sequence );
                if ( alloc_d ) {
                    free( D );
                }
                /* D cannot be used, abort the generation */
                return;
            }
}
}
#endif
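    /* In simulation mode, the numerical kernels are not executed, so D is not actually generated */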
/* Copy D to the diagonal of A */
for (n = 0; n < kt; n++) {
INSERT_TASK_map(
&options,
ChamUpperLower, A(n, n),
zlaset_diag, D );
}
/**
* Generate A = U D V
*
     * U and V are random unitary matrices.
     * V = U^t (resp. U^h) if A is symmetric (resp. Hermitian).
*
*/
/* Generate U and apply it */
{
/* U is of size A->m by min(A->m, A->n) */
chameleon_zdesc_copy_and_restrict( A, &descU, A->m, minmn );
chameleon_pzplrnt( &descU, descU.m, 0, 0, seed, sequence, request );
/* Shift the seed to generate the next random unitary matrix */
#if !defined(CHAMELEON_SIMULATION)
seed = CORE_rnd64_jump( 2 * minmn * A->m, seed );
#endif
/* Apply a QR factorization */
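        /* Describe the tiled matrix to libhqr so it can build the reduction trees */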
mat.mt = descU.mt;
mat.nt = descU.nt;
mat.nodes = descU.p * descU.q;
mat.p = descU.p;
libhqr_init_hqr( &qrtree, LIBHQR_QR, &mat,
-1, /*low level tree */
-1, /* high level tree */
-1, /* TS tree size */
mat.p, /* High level size */
-1, /* Domino */
0 /* TSRR (unstable) */ );
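        /* Optional workspace holding a copy of the diagonal tiles of U for the QR kernels */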
#if defined(CHAMELEON_COPY_DIAG)
chameleon_zdesc_copy_and_restrict( A, &descD, A->m, minmn );
descDptr = &descD;
#endif
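        /* Temporary descriptors for the TS and TT reduction factors of the hierarchical QR */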
chameleon_desc_init( &descTS, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, ib, descU.nb, ib * descU.nb,
ib * descU.mt, descU.nb * descU.nt, 0, 0,
ib * descU.mt, descU.nb * descU.nt, descU.p, descU.q,
NULL, NULL, NULL );
chameleon_desc_init( &descTT, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, ib, descU.nb, ib * descU.nb,
ib * descU.mt, descU.nb * descU.nt, 0, 0,
ib * descU.mt, descU.nb * descU.nt, descU.p, descU.q,
NULL, NULL, NULL );
/* U <= qr(U) */
chameleon_pzgeqrf_param( 1, kt, &qrtree, &descU,
&descTS, &descTT, descDptr, sequence, request );
/* A <= U * D */
chameleon_pzunmqr_param( 0, &qrtree, ChamLeft, ChamNoTrans,
&descU, A, &descTS, &descTT, descDptr, sequence, request );
if ( trans != ChamNoTrans ) {
/* A <= (U * D) * U' */
chameleon_pzunmqr_param( 0, &qrtree, ChamRight, trans,
&descU, A, &descTS, &descTT, descDptr, sequence, request );
}
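        /* Flush the data written by the factorization, wait for completion, and free the temporaries */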
CHAMELEON_Desc_Flush( &descU, sequence );
CHAMELEON_Desc_Flush( &descTS, sequence );
CHAMELEON_Desc_Flush( &descTT, sequence );
if ( descDptr != NULL ) {
CHAMELEON_Desc_Flush( descDptr, sequence );
}
CHAMELEON_Desc_Flush( A, sequence );
CHAMELEON_Sequence_Wait( sequence );
chameleon_desc_destroy( &descU );
chameleon_desc_destroy( &descTS );
chameleon_desc_destroy( &descTT );
if ( descDptr != NULL ) {
chameleon_desc_destroy( descDptr );
}
libhqr_finalize( &qrtree );
}
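    /*
     * In the symmetric and Hermitian cases, U^t or U^h has already been
     * applied on the right, so V is generated only in the nonsymmetric case.
     */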
/* Generate V and apply it */
if ( trans == ChamNoTrans )
{
/* V is of size min(A->m, A->n) by A->n */
chameleon_zdesc_copy_and_restrict( A, &descV, minmn, A->n );
chameleon_pzplrnt( &descV, descV.m, 0, 0, seed, sequence, request );
        /* Apply an LQ factorization */
mat.mt = descV.mt;
mat.nt = descV.nt;
mat.nodes = descV.p * descV.q;
mat.p = descV.q;
libhqr_init_hqr( &qrtree, LIBHQR_LQ, &mat,
-1, /*low level tree */
-1, /* high level tree */
-1, /* TS tree size */
mat.p, /* High level size */
-1, /* Domino */
0 /* TSRR (unstable) */ );
#if defined(CHAMELEON_COPY_DIAG)
chameleon_zdesc_copy_and_restrict( A, &descD, minmn, A->n );
descDptr = &descD;
#endif
chameleon_desc_init( &descTS, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, ib, descV.nb, ib * descV.nb,
ib * descV.mt, descV.nb * descV.nt, 0, 0,
ib * descV.mt, descV.nb * descV.nt, descV.p, descV.q,
NULL, NULL, NULL );
chameleon_desc_init( &descTT, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, ib, descV.nb, ib * descV.nb,
ib * descV.mt, descV.nb * descV.nt, 0, 0,
ib * descV.mt, descV.nb * descV.nt, descV.p, descV.q,
NULL, NULL, NULL );
/* V <= qr(V) */
chameleon_pzgelqf_param( 1, &qrtree, &descV,
&descTS, &descTT, descDptr, sequence, request );
/* A <= (U * D) * V */
chameleon_pzunmlq_param( 0, &qrtree, ChamRight, ChamNoTrans,
&descV, A, &descTS, &descTT, descDptr, sequence, request );
CHAMELEON_Desc_Flush( &descV, sequence );
CHAMELEON_Desc_Flush( &descTS, sequence );
CHAMELEON_Desc_Flush( &descTT, sequence );
if ( descDptr != NULL ) {
CHAMELEON_Desc_Flush( descDptr, sequence );
}
CHAMELEON_Desc_Flush( A, sequence );
CHAMELEON_Sequence_Wait( sequence );
chameleon_desc_destroy( &descV );
chameleon_desc_destroy( &descTS );
chameleon_desc_destroy( &descTT );
if ( descDptr != NULL ) {
chameleon_desc_destroy( descDptr );
}
libhqr_finalize( &qrtree );
}
RUNTIME_options_finalize(&options, chamctxt);
if ( alloc_d ) {
free( D );
}
(void)descD;
}