-
Mathieu Faverge authoredMathieu Faverge authored
pzlange.c 15.42 KiB
/**
*
* @file pzlange.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zlange parallel algorithm
*
* @version 1.0.0
* @comment This file has been automatically generated
* from Plasma 2.6.0 for CHAMELEON 1.0.0
* @author Emmanuel Agullo
* @author Mathieu Faverge
* @author Florent Pruvost
* @date 2014-07-21
* @precisions normal z -> s d c
*
*/
//ALLOC_WS : A->mb
//ALLOC_WS : A->nb
//WS_ADD : A->mb + A->nb
#include "control/common.h"
#define A( m, n ) A, (m), (n)
#define W( desc, m, n ) (desc), (m), (n)
static inline void
chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
CHAM_desc_t *Wcol, CHAM_desc_t *Welt,
RUNTIME_option_t *options )
{
int m, n;
int minMNT = chameleon_min( A->mt, A->nt );
int minMN = chameleon_min( A->m, A->n );
int MT = (uplo == ChamUpper) ? minMNT : A->mt;
int NT = (uplo == ChamLower) ? minMNT : A->nt;
int M = (uplo == ChamUpper) ? minMN : A->m;
int N = (uplo == ChamLower) ? minMN : A->n;
int P = Welt->p;
int Q = Welt->q;
/**
* Step 1:
* For j in [1,P], W(i, n) = reduce( A(i+k*P, n) )
*/
for(n = 0; n < NT; n++) {
int mmin = ( uplo == ChamLower ) ? n : 0;
int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT;
int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
for(m = mmin; m < mmax; m++) {
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
int ldam = BLKLDD( A, m );
if ( (n == m) && (uplo != ChamUpperLower) ) {
INSERT_TASK_ztrasm(
options,
ChamColumnwise, uplo, diag, tempmm, tempnn,
A(m, n), ldam, W( Wcol, m, n ) );
}
else {
INSERT_TASK_dzasum(
options,
ChamColumnwise, ChamUpperLower, tempmm, tempnn,
A(m, n), ldam, W( Wcol, m, n ) );
}
if ( m >= P ) {
INSERT_TASK_daxpy(
options, tempnn, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m%P, n ), 1 );
}
}
/**
* Step 2:
* For each i, W(i, n) = reduce( W(0..P-1, n) )
*/
for(m = 1; m < P; m++) {
INSERT_TASK_daxpy(
options, tempnn, 1.,
W( Wcol, m, n ), 1,
W( Wcol, 0, n ), 1 );
}
INSERT_TASK_dlange(
options,
ChamMaxNorm, 1, tempnn, A->nb,
W( Wcol, 0, n ), 1,
W( Welt, 0, n ) );
}
/**
* Step 3:
* For n in 0..Q-1, W(m, n) = max( W(m, n..nt[Q] ) )
*/
for(n = Q; n < NT; n++) {
INSERT_TASK_dlange_max(
options,
W( Welt, 0, n ),
W( Welt, 0, n%Q ) );
}
/**
* Step 4:
* For each i, Welt(i, n) = max( Welt(0..P-1, n) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_dlange_max(
options,
W( Welt, 0, n ),
W( Welt, 0, 0 ) );
}
}
static inline void
chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
CHAM_desc_t *Wcol, CHAM_desc_t *Welt,
RUNTIME_option_t *options)
{
int m, n;
int minMNT = chameleon_min( A->mt, A->nt );
int minMN = chameleon_min( A->m, A->n );
int MT = (uplo == ChamUpper) ? minMNT : A->mt;
int NT = (uplo == ChamLower) ? minMNT : A->nt;
int M = (uplo == ChamUpper) ? minMN : A->m;
int N = (uplo == ChamLower) ? minMN : A->n;
int P = Welt->p;
int Q = Welt->q;
/**
* Step 1:
* For j in [1,Q], Wcol(m, j) = reduce( A(m, j+k*Q) )
*/
for(m = 0; m < MT; m++) {
int nmin = ( uplo == ChamUpper ) ? m : 0;
int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT;
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
int ldam = BLKLDD( A, m );
for(n = nmin; n < nmax; n++) {
int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
if ( (n == m) && (uplo != ChamUpperLower) ) {
INSERT_TASK_ztrasm(
options,
ChamRowwise, uplo, diag, tempmm, tempnn,
A(m, n), ldam, W( Wcol, m, n) );
}
else {
INSERT_TASK_dzasum(
options,
ChamRowwise, ChamUpperLower, tempmm, tempnn,
A(m, n), ldam, W( Wcol, m, n) );
}
if ( n >= Q ) {
INSERT_TASK_daxpy(
options, tempmm, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m, n%Q ), 1 );
}
}
/**
* Step 2:
* For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_daxpy(
options, tempmm, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m, 0 ), 1 );
}
INSERT_TASK_dlange(
options,
ChamMaxNorm, tempmm, 1, A->nb,
W( Wcol, m, 0), 1, W( Welt, m, 0));
}
/**
* Step 3:
* For m in 0..P-1, Welt(m, n) = max( Wcol(m..mt[P], n ) )
*/
for(m = P; m < MT; m++) {
INSERT_TASK_dlange_max(
options,
W( Welt, m, 0), W( Welt, m%P, 0) );
}
/**
* Step 4:
* For each i, Welt(i, n) = max( Welt(0..P-1, n) )
*/
for(m = 1; m < P; m++) {
INSERT_TASK_dlange_max(
options,
W( Welt, m, 0), W( Welt, 0, 0) );
}
}
static inline void
chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_desc_t *Welt,
RUNTIME_option_t *options)
{
int m, n;
int minMNT = chameleon_min( A->mt, A->nt );
int minMN = chameleon_min( A->m, A->n );
int MT = (uplo == ChamUpper) ? minMNT : A->mt;
int NT = (uplo == ChamLower) ? minMNT : A->nt;
int M = (uplo == ChamUpper) ? minMN : A->m;
int N = (uplo == ChamLower) ? minMN : A->n;
int P = Welt->p;
int Q = Welt->q;
/**
* Step 1:
* For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) )
*/
for(m = 0; m < MT; m++) {
int nmin = ( uplo == ChamUpper ) ? m : 0;
int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT;
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
int ldam = BLKLDD( A, m );
for(n = nmin; n < nmax; n++) {
int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
if ( (n == m) && (uplo != ChamUpperLower) ) {
INSERT_TASK_zlantr(
options,
ChamMaxNorm, uplo, diag, tempmm, tempnn, A->nb,
A(m, n), ldam, W( Welt, m, n));
}
else {
INSERT_TASK_zlange(
options,
ChamMaxNorm, tempmm, tempnn, A->nb,
A(m, n), ldam, W( Welt, m, n ));
}
if ( n >= Q ) {
INSERT_TASK_dlange_max(
options,
W( Welt, m, n ),
W( Welt, m, n%Q ) );
}
}
/**
* Step 2:
* For each j, W(m, j) = reduce( Welt(m, 0..Q-1) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_dlange_max(
options,
W( Welt, m, n ),
W( Welt, m, 0 ) );
}
}
/**
* Step 3:
* For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) )
*/
for(m = P; m < MT; m++) {
INSERT_TASK_dlange_max(
options,
W( Welt, m, 0 ),
W( Welt, m%P, 0 ) );
}
/**
* Step 4:
* For each i, Welt(i, n) = max( Welt(0..P-1, n) )
*/
for(m = 1; m < P; m++) {
INSERT_TASK_dlange_max(
options,
W( Welt, m, 0 ),
W( Welt, 0, 0 ) );
}
}
static inline void
chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_desc_t *Welt,
RUNTIME_option_t *options)
{
int m, n;
int minMNT = chameleon_min( A->mt, A->nt );
int minMN = chameleon_min( A->m, A->n );
int MT = (uplo == ChamUpper) ? minMNT : A->mt;
int NT = (uplo == ChamLower) ? minMNT : A->nt;
int M = (uplo == ChamUpper) ? minMN : A->m;
int N = (uplo == ChamLower) ? minMN : A->n;
int P = Welt->p;
int Q = Welt->q;
/**
* Step 1:
* For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) )
*/
for(m = 0; m < MT; m++) {
int nmin = ( uplo == ChamUpper ) ? m : 0;
int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT;
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
int ldam = BLKLDD( A, m );
for(n = nmin; n < nmax; n++) {
int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
if ( (n == m) && (uplo != ChamUpperLower) ) {
INSERT_TASK_ztrssq(
options,
uplo, diag, tempmm, tempnn,
A(m, n), ldam, W( Welt, m, n) );
}
else {
INSERT_TASK_zgessq(
options,
tempmm, tempnn,
A(m, n), ldam, W( Welt, m, n) );
}
if ( n >= Q ) {
INSERT_TASK_dplssq(
options, W( Welt, m, n), W( Welt, m, n%Q) );
}
}
/**
* Step 2:
* For each j, W(m, j) = reduce( Welt(m, 0..Q-1) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_dplssq(
options, W( Welt, m, n), W( Welt, m, 0) );
}
}
/**
* Step 3:
* For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) )
*/
for(m = P; m < MT; m++) {
INSERT_TASK_dplssq(
options, W( Welt, m, 0), W( Welt, m%P, 0) );
}
/**
* Step 4:
* For each i, Welt(i, n) = max( Welt(0..P-1, n) )
*/
for(m = 1; m < P; m++) {
INSERT_TASK_dplssq(
options, W( Welt, m, 0), W( Welt, 0, 0) );
}
INSERT_TASK_dplssq2(
options, W( Welt, 0, 0) );
}
/**
*
*/
void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag,
CHAM_desc_t *A, double *result,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
CHAM_desc_t Wcol;
CHAM_desc_t Welt;
double alpha = 0.0;
double beta = 0.0;
int workmt, worknt;
int m, n, wcol_init = 0;
chamctxt = chameleon_context_self();
if ( sequence->status != CHAMELEON_SUCCESS ) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
*result = 0.0;
workmt = chameleon_max( A->mt, A->p );
worknt = chameleon_max( A->nt, A->q );
switch ( norm ) {
case ChamOneNorm:
RUNTIME_options_ws_alloc( &options, 1, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 1, A->nb, A->nb,
workmt, worknt * A->nb, 0, 0, workmt, worknt * A->nb, A->p, A->q,
NULL, NULL, NULL );
wcol_init = 1;
/*
* Use the global allocator for Welt, otherwise flush may free the data before the result is read.
*/
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
A->p, worknt, 0, 0, A->p, worknt, A->p, A->q,
NULL, NULL, NULL );
break;
/*
* ChamInfNorm
*/
case ChamInfNorm:
RUNTIME_options_ws_alloc( &options, A->mb, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
NULL, NULL, NULL );
wcol_init = 1;
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
workmt, A->q, 0, 0, workmt, A->q, A->p, A->q,
NULL, NULL, NULL );
break;
/*
* ChamFrobeniusNorm
*/
case ChamFrobeniusNorm:
RUNTIME_options_ws_alloc( &options, 1, 0 );
alpha = 1.;
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2,
workmt*2, worknt, 0, 0, workmt*2, worknt, A->p, A->q,
NULL, NULL, NULL );
break;
/*
* ChamMaxNorm
*/
case ChamMaxNorm:
default:
RUNTIME_options_ws_alloc( &options, 1, 0 );
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
workmt, worknt, 0, 0, workmt, worknt, A->p, A->q,
NULL, NULL, NULL );
}
/* Initialize workspaces */
if ( (norm == ChamInfNorm) ||
(norm == ChamOneNorm) )
{
/* Initialize Wcol tile */
for(m = 0; m < Wcol.mt; m++) {
for(n = 0; n < Wcol.nt; n++) {
INSERT_TASK_dlaset(
&options,
ChamUpperLower, Wcol.mb, Wcol.nb,
alpha, beta,
W( &Wcol, m, n ), Wcol.mb );
}
}
}
for(m = 0; m < Welt.mt; m++) {
for(n = 0; n < Welt.nt; n++) {
INSERT_TASK_dlaset(
&options,
ChamUpperLower, Welt.mb, Welt.nb,
alpha, beta,
W( &Welt, m, n ), Welt.mb );
}
}
switch ( norm ) {
case ChamOneNorm:
chameleon_pzlange_one( uplo, diag, A, &Wcol, &Welt, &options );
CHAMELEON_Desc_Flush( &Wcol, sequence );
break;
case ChamInfNorm:
chameleon_pzlange_inf( uplo, diag, A, &Wcol, &Welt, &options );
CHAMELEON_Desc_Flush( &Wcol, sequence );
break;
case ChamFrobeniusNorm:
chameleon_pzlange_frb( uplo, diag, A, &Welt, &options );
break;
case ChamMaxNorm:
default:
chameleon_pzlange_max( uplo, diag, A, &Welt, &options );
}
/**
* Broadcast the result
*/
for(m = 0; m < A->p; m++) {
for(n = 0; n < A->q; n++) {
if ( (m != 0) || (n != 0) ) {
INSERT_TASK_dlacpy(
&options,
ChamUpperLower, 1, 1, 1,
W( &Welt, 0, 0 ), 1, W( &Welt, m, n ), 1);
}
}
}
if ( wcol_init ) {
CHAMELEON_Desc_Flush( &Wcol, sequence );
}
CHAMELEON_Desc_Flush( &Welt, sequence );
CHAMELEON_Desc_Flush( A, sequence );
RUNTIME_sequence_wait( chamctxt, sequence );
*result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ));
if ( wcol_init ) {
chameleon_desc_destroy( &Wcol );
}
chameleon_desc_destroy( &Welt );
RUNTIME_options_ws_free(&options);
RUNTIME_options_finalize(&options, chamctxt);
}