Newer
Older
/**
*
* @file pzgetrf.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zgetrf parallel algorithm
*
Matthieu Kuhn
committed
* @version 1.3.0
* @author Omar Zenati
* @author Mathieu Faverge
* @author Emmanuel Agullo
* @author Matthieu Kuhn
* @author Alycia Lisito

Mathieu Faverge
committed
* @date 2024-03-16
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
Matthieu Kuhn
committed
#define A(m,n) A, m, n
#define U(m,n) &(ws->U), m, n
#define Up(m,n) &(ws->Up), m, n
/*
* All the functions below are panel factorization variant.
* The parameters are:
* @param[inout] ws
* The data structure associated to the algorithm that holds all extra
* information that may be needed for LU factorization
*
* @param[inout] A
* The descriptor of the full matrix A (not just the panel)
*
* @param[inout] ipiv
* The descriptor of the pivot array associated to A.
* @param[in] k
* The index of the column to factorize
*
* @param[inout] options
* The runtime options data structure to pass through all insert_task calls.
*/
static inline void
chameleon_pzgetrf_panel_facto_nopiv( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
{
const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0;
int m, tempkm, tempkn, tempmm;
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
/*
* Algorithm per block without pivoting
*/
INSERT_TASK_zgetrf_nopiv(
options,
tempkm, tempkn, ws->ib, A->mb,
A(k, k), 0);
for (m = k+1; m < A->mt; m++) {
tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
INSERT_TASK_ztrsm(
options,
ChamRight, ChamUpper, ChamNoTrans, ChamNonUnit,
tempmm, tempkn, A->mb,
zone, A(k, k),
A(m, k) );
}
}
static inline void
chameleon_pzgetrf_panel_facto_nopiv_percol( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
{
int m, h;
int tempkm, tempkn, tempmm, minmn;
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
minmn = chameleon_min( tempkm, tempkn );
/*
* Algorithm per column without pivoting
*/
for(h=0; h<minmn; h++){
Matthieu Kuhn
committed
INSERT_TASK_zgetrf_nopiv_percol_diag(
options, tempkm, tempkn, h,
A( k, k ), U( k, k ), A->mb * k );
for (m = k+1; m < A->mt; m++) {
tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
Matthieu Kuhn
committed
INSERT_TASK_zgetrf_nopiv_percol_trsm(
options, tempmm, tempkn, h,
A( m, k ), U( k, k ) );
}
}
RUNTIME_data_flush( options->sequence, U(k, k) );
}
static inline void
chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
{
int m, h;
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
minmn = chameleon_min( tempkm, tempkn );
/* Update the number of column */
ipiv->n = minmn;
/*
* Algorithm per column with pivoting
*/
for (h=0; h<=minmn; h++){
INSERT_TASK_zgetrf_percol_diag(
options,
A(k, k),
ipiv );
for (m = k+1; m < A->mt; m++) {
tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
INSERT_TASK_zgetrf_percol_offdiag(
options,
A(m, k),
ipiv );
}
/* Reduce globally (between MPI processes) */
INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn );
}
/* Flush temporary data used for the pivoting */
INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/*
* Factorization of panel k - dynamic scheduling - batched version / stock
*/
static inline void
chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
{
int m, h;
int tempkm, tempkn, tempmm, minmn;
void **clargs = malloc( sizeof(char *) );
memset( clargs, 0, sizeof(char *) );
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
minmn = chameleon_min( tempkm, tempkn );
/* Update the number of column */
ipiv->n = minmn;
/*
* Algorithm per column with pivoting (no recursion)
*/
/* Iterate on current panel column */
/* Since index h scales column h-1, we need to iterate up to minmn (included) */
for ( h = 0; h <= minmn; h++ ) {
INSERT_TASK_zgetrf_percol_diag( options, tempkm, tempkn, h, k * A->mb, A(k, k), ipiv );
for ( m = k+1; m < A->mt; m++ ) {
tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
INSERT_TASK_zgetrf_panel_offdiag_batched( options, tempmm, tempkn, h, m * A->mb,
(void *)ws, A(m, k), clargs, ipiv );
}
INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv );
INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn );
}
free( clargs );
/* Flush temporary data used for the pivoting */
INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
static inline void
chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
{
int m, h, b, nbblock;
if ( ! ws->involved ) {
return;
}
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
minmn = chameleon_min( tempkm, tempkn );
/* Update the number of column */
ipiv->n = minmn;
nbblock = chameleon_ceil( minmn, ws->ib );
/*
* Algorithm per column with pivoting
*/
for (b=0; b<nbblock; b++){
int hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib;
for (h=0; h<hmax; h++){
int j = h + b * ws->ib;
INSERT_TASK_zgetrf_blocked_diag(
options,
A(k, k), Up(k, k),
ipiv );
for (m = k+1; m < A->mt; m++) {
tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
INSERT_TASK_zgetrf_blocked_offdiag(
options,
A(m, k), Up(k, k),
ipiv );
}
assert( j <= minmn );
/* Reduce globally (between MPI processes) */
INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn );
if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) {
INSERT_TASK_zgetrf_blocked_trsm(
options,
Up(k, k),
ipiv );
}
}
}
RUNTIME_data_flush( options->sequence, Up(k, k) );
/* Flush temporary data used for the pivoting */
INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
/*
* Factorization of panel k - dynamic scheduling - batched version / stock
*/
static inline void
chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
{
int m, h, b, nbblock, hmax, j;
int tempkm, tempkn, tempmm, minmn;
void **clargs = malloc( sizeof(char *) );
memset( clargs, 0, sizeof(char *) );
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
minmn = chameleon_min( tempkm, tempkn );
/* Update the number of column */
ipiv->n = minmn;
nbblock = chameleon_ceil( minmn, ws->ib );
/*
* Algorithm per column with pivoting (no recursion)
*/
/* Iterate on current panel column */
/* Since index h scales column h-1, we need to iterate up to minmn (included) */
for ( b = 0; b < nbblock; b++ ) {
hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib;
for ( h = 0; h < hmax; h++ ) {
j = h + b * ws->ib;
for ( m = k; m < A->mt; m++ ) {
tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb;
INSERT_TASK_zgetrf_panel_blocked_batched( options, tempmm, tempkn, j, m * A->mb,
(void *)ws, A(m, k), Up(k, k), clargs, ipiv );
}
INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, k,
Up(k, k), clargs, ipiv );
assert( j <= minmn );
/* Reduce globally (between MPI processes) */
INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn );
if ( (b < (nbblock-1)) && (h == hmax-1) ) {
INSERT_TASK_zgetrf_blocked_trsm(
options,
ws->ib, tempkn, b * ws->ib + hmax, ws->ib,
Up(k, k),
ipiv );
}
}
}
free( clargs );
/* Flush temporary data used for the pivoting */
INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k );
RUNTIME_ipiv_flushk( options->sequence, ipiv, A->myrank );
static inline void
chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
RUNTIME_option_t *options )
#if defined ( CHAMELEON_USE_MPI )
int *proc_involved = malloc( sizeof( int ) * chameleon_min( A->p, A->mt - k) );
int b;
/* 2DBC only */
ws->involved = 0;
for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) {
int rank = chameleon_getrankof_2d( A, b, k );
proc_involved[ b-k ] = rank;
if ( rank == A->myrank ) {
ws->involved = 1;
}
}
ws->proc_involved = proc_involved;
if ( ws->involved == 0 ) {
free( proc_involved );
return;
}

Mathieu Faverge
committed
/* TODO: Should be replaced by a function pointer */
switch( ws->alg ) {
case ChamGetrfNoPivPerColumn:
chameleon_pzgetrf_panel_facto_nopiv_percol( ws, A, ipiv, k, options );
break;
case ChamGetrfPPivPerColumn:
if ( ws->batch_size > 0 ) {
chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options );
}
else {
chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options );
}
break;
case ChamGetrfPPiv:
if ( ws->batch_size > 0 ) {
chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options );
}
else {
chameleon_pzgetrf_panel_facto_blocked( ws, A, ipiv, k, options );
}
break;
case ChamGetrfNoPiv:
chameleon_attr_fallthrough;
default:
chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options );

Mathieu Faverge
committed
}
#if defined ( CHAMELEON_USE_MPI )
free( proc_involved );
}
/**
* Permutation of the panel n at step k
*/
static inline void
chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
int n,
RUNTIME_option_t *options )
switch( ws->alg ) {
case ChamGetrfPPiv:
chameleon_attr_fallthrough;
case ChamGetrfPPivPerColumn:
{
int m;
int tempkm, tempkn, tempnn, minmn;
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
minmn = chameleon_min( tempkm, tempkn );
/* Extract selected rows into U */
INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
A(k, n), U(k, n) );
/*
* perm array is made of size tempkm for the first row especially.
* Otherwise, the final copy back to the tile may copy only a partial tile
*/
INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm,
ipiv, k, A(k, n), U(k, n) );
for(m=k+1; m<A->mt; m++){
/* Extract selected rows into A(k, n) */
INSERT_TASK_zlaswp_get( options, m*A->mb, minmn,
ipiv, k, A(m, n), U(k, n) );
/* Copy rows from A(k,n) into their final position */
INSERT_TASK_zlaswp_set( options, m*A->mb, minmn,
ipiv, k, A(k, n), A(m, n) );
}
INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn,
U(k, n), A(k, n) );
RUNTIME_data_flush( options->sequence, U(k, n) );
}
break;
default:
;
}
}
static inline void
chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws,
CHAM_desc_t *A,
CHAM_ipiv_t *ipiv,
int k,
int n,
RUNTIME_option_t *options )
{
const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0;
const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0;
int m, tempkm, tempmm, tempnn;
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options );
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
INSERT_TASK_ztrsm(
options,
ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
tempkm, tempnn, A->mb,
zone, A(k, k),
A(k, n) );
for (m = k+1; m < A->mt; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
INSERT_TASK_zgemm(
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, A->mb, A->mb,
mzone, A(m, k),
A(k, n),
zone, A(m, n) );
}
RUNTIME_data_flush( options->sequence, A(k, n) );
}
/**
* Parallel tile LU factorization with no pivoting - dynamic scheduling
*/
void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws,

Mathieu Faverge
committed
CHAM_desc_t *A,
CHAM_ipiv_t *IPIV,

Mathieu Faverge
committed
RUNTIME_sequence_t *sequence,
RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
int k, m, n;
int min_mnt = chameleon_min( A->mt, A->nt );
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init( &options, chamctxt, sequence, request );
for (k = 0; k < min_mnt; k++) {
RUNTIME_iteration_push( chamctxt, k );
options.priority = A->nt;
/*
* Do the panel factorization only if the current proc contributes in the
* block column k.
*/
options.forcesub = chameleon_involved_in_panelk_2dbc( A, k );
if ( chameleon_involved_in_panelk_2dbc( A, k ) ) {
chameleon_pzgetrf_panel_facto( ws, A, IPIV, k, &options );
}
for (n = k+1; n < A->nt; n++) {
options.priority = A->nt-n;
chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options );
}
/* Flush panel k */
for (m = k; m < A->mt; m++) {
RUNTIME_data_flush( sequence, A(m, k) );
}
RUNTIME_iteration_pop( chamctxt );
}
/* Backward pivoting */
for (k = 1; k < min_mnt; k++) {
for (n = 0; n < k; n++) {
chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options );
RUNTIME_perm_flushk( sequence, IPIV, k );
/* Initialize IPIV with default values if needed */
if ( (ws->alg == ChamGetrfNoPivPerColumn) ||
(ws->alg == ChamGetrfNoPiv ) )
{

Mathieu Faverge
committed
INSERT_TASK_ipiv_init( &options, IPIV );
RUNTIME_options_finalize( &options, chamctxt );
}