Mentions légales du service

Skip to content
Snippets Groups Projects
Commit e6909434 authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

tpqrt/tpgqrt: Add parallel algorithms using HQR reduction trees

parent 9805d22b
Branches
Tags
1 merge request!125Add Polar decomposition with QDWH algorithm
......@@ -141,7 +141,9 @@ set(ZSRC
pzunmqr_param.c
pzunmqrrh.c
pztpgqrt.c
pztpgqrt_param.c
pztpqrt.c
pztpqrt_param.c
###
zgels.c
zgels_param.c
......
/**
*
* @file pztpgqrt_param.c
*
* @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
* @copyright 2016-2020 KAUST. All rights reserved.
*
***
*
* @brief Chameleon computational routines
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2016-12-15
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#include <stdlib.h>
#define QTop(m,n) QTop, m, n
#define A(m,n) A, m, n
#define Q(m,n) Q, m, n
#define T(m,n) T, m, n
#define D(m,n) D, m, n
/**
* Parallel construction of Q using tile V (application to identity) - dynamic scheduling
*
* @param[in] genD
* Indicate if the copies of the A tiles must be done to speedup
* computations in updates. genD is considered only if D is not NULL.
*
* @param[in] uplo
* - ChamUpper: This corresponds to the TTQRT factorization kernel. Only the upper
* trapezoidal part of Q is referenced.
* - ChamLower, or ChamUpperLower: This corresponds to the TSQRT factorization
* kernel. The full Q is referenced.
*/
void chameleon_pztpgqrt_param( int genD, cham_uplo_t uplo, int K,
const libhqr_tree_t *qrtree,
CHAM_desc_t *A, CHAM_desc_t *QTop, CHAM_desc_t *Q,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
size_t ws_worker = 0;
size_t ws_host = 0;
int k, n, p;
int ib, node, nbtiles, *tiles;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
ib = CHAMELEON_IB;
if ( D == NULL ) {
D = A;
genD = 0;
}
/*
* zunmqr = A->nb * ib
* ztpmqrt = A->nb * ib
*/
ws_worker = A->nb * ib;
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
/*
* ztpmqrt = 3 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
#endif
ws_worker *= sizeof(CHAMELEON_Complex64_t);
ws_host *= sizeof(CHAMELEON_Complex64_t);
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* Initialisation of temporary tiles array */
tiles = (int*)calloc(qrtree->mt, sizeof(int));
for (k = K-1; k >=0; k--) {
RUNTIME_iteration_push(chamctxt, k);
/* Setting the order of tiles */
nbtiles = libhqr_walk_stepk( qrtree, k, tiles );
p = tiles[nbtiles];
/* Combine Bottom and Top matrices by merging last pivot with ATop(k,*) */
{
CHAM_desc_t *T = TT;
int temppm = p == Q->mt-1 ? Q->m - p * Q->mb : Q->mb;
int tempkn = k == A->nt-1 ? A->n - k * A->nb : A->nb;
int tempnn;
int L = temppm;
for (n = k; n < Q->nt; n++) {
tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb;
node = Q->get_rankof( Q, p, n );
RUNTIME_data_migrate( sequence, QTop(k, n), node );
RUNTIME_data_migrate( sequence, Q( p, n), node );
INSERT_TASK_ztpmqrt(
&options,
ChamLeft, ChamNoTrans,
temppm, tempnn, tempkn, L, ib, T->nb,
A( p, k),
T( p, k),
QTop(k, n),
Q( p, n));
}
RUNTIME_data_flush( sequence, A(p, k) );
RUNTIME_data_flush( sequence, T(p, k) );
}
chameleon_pzungqr_param_step( genD, uplo, k, ib,
qrtree, nbtiles, tiles,
A, Q, TS, TT, D,
&options, sequence );
RUNTIME_iteration_pop(chamctxt);
}
free(tiles);
RUNTIME_options_ws_free(&options);
RUNTIME_options_finalize(&options, chamctxt);
}
/**
*
* @file pztpqrt_param.c
*
* @copyright 2012-2020 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
* @copyright 2016-2020 KAUST. All rights reserved.
*
***
*
* @brief Chameleon computational routines
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2016-12-15
* @precisions normal z -> s d c
*
*/
#include "control/common.h"
#include <stdlib.h>
#include <stdio.h>
#include "libhqr.h"
#define ATop(m,n) ATop, (m), (n)
#define A(m,n) A, (m), (n)
#define T(m,n) T, (m), (n)
#define D(m,n) D, (m), (n)
/**
* Parallel tile QR matrix reduction - Equivalent to tpqrt kernel for matrices.
*
* @param[in] genD
* Indicate if copies of the geqrt tiles must be done to speedup
* computations in updates. genD is considered only if D is not NULL.
*
* @param[in] uplo
* - ChamUpper: This corresponds to the former TTQRT kernel. Only the upper
* trapezoidal part of A is factorized.
* - ChamLower, or ChamUpperLower: This corresponds to the former TSQRT
* kernel. The full A is factorized.
*/
void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K,
const libhqr_tree_t *qrtree, CHAM_desc_t *ATop, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
size_t ws_worker = 0;
size_t ws_host = 0;
int k, n, p;
int ib, *tiles;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
ib = CHAMELEON_IB;
if ( (genD == 0) || (D == NULL) ) {
D = A;
genD = 0;
}
/*
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztpqrt = A->nb * (ib+1)
* ztpmqrt = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
/*
* zunmqr = A->nb * ib
* ztpmqrt = 3 * A->nb * ib
*/
ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
#endif
ws_worker *= sizeof(CHAMELEON_Complex64_t);
ws_host *= sizeof(CHAMELEON_Complex64_t);
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
/* Initialisation of temporary tiles array */
tiles = (int*)calloc(qrtree->mt, sizeof(int));
for (k = 0; k < K; k++) {
RUNTIME_iteration_push(chamctxt, k);
p = chameleon_pzgeqrf_param_step( genD, uplo, k, ib, qrtree, tiles,
A, TS, TT, D, &options, sequence );
/* Combine with ATop and A by merging last pivot with A(k,k) */
{
CHAM_desc_t *T;
int temppm = p == ATop->mt-1 ? ATop->m - p * ATop->mb : ATop->mb;
int tempkn = k == ATop->nt-1 ? ATop->n - k * ATop->nb : ATop->nb;
int L, node, tempnn;
T = TT;
L = temppm;
node = A->get_rankof( A, p, k );
RUNTIME_data_migrate( sequence, ATop(k, k), node );
RUNTIME_data_migrate( sequence, A( p, k), node );
INSERT_TASK_ztpqrt(
&options,
temppm, tempkn, chameleon_min(L, tempkn), ib, T->nb,
ATop(k, k),
A(p, k),
T(p, k));
for (n = k+1; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
node = A->get_rankof( A, p, n );
RUNTIME_data_migrate( sequence, ATop(k, n), node );
RUNTIME_data_migrate( sequence, A( p, n), node );
INSERT_TASK_ztpmqrt(
&options,
ChamLeft, ChamConjTrans,
temppm, tempnn, A->nb, L, ib, T->nb,
A(p, k),
T(p, k),
ATop(k, n),
A(p, n));
}
RUNTIME_data_flush( sequence, A(p, k) );
RUNTIME_data_flush( sequence, T(p, k) );
}
/* Restore the original location of the tiles */
for (n = k; n < ATop->nt; n++) {
RUNTIME_data_migrate( sequence, ATop(k, n),
ATop->get_rankof( ATop, k, n ) );
}
RUNTIME_iteration_pop(chamctxt);
}
free(tiles);
RUNTIME_options_ws_free(&options);
RUNTIME_options_finalize(&options, chamctxt);
}
......@@ -126,6 +126,15 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree,
CHAM_desc_t *A, CHAM_desc_t *Q,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
void chameleon_pztpgqrt_param( int genD, cham_uplo_t uplo, int kt, const libhqr_tree_t *qrtree,
CHAM_desc_t *V2, CHAM_desc_t *Q1, CHAM_desc_t *Q2,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *DD,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
void chameleon_pztpqrt_param( int genD, cham_uplo_t uplo, int K,
const libhqr_tree_t *qrtree,
CHAM_desc_t *ATop, CHAM_desc_t *A,
CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
/**
* Gram function prototypes
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment