Mentions légales du service

Skip to content
Snippets Groups Projects
Commit e7e1c903 authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'feature/Astat-gemm' into 'master'

SUMMA GEMM with A stationnary

See merge request !334
parents 01f701a8 d83e4613
No related branches found
No related tags found
1 merge request!334SUMMA GEMM with A stationnary
Showing
with 433 additions and 17 deletions
......@@ -41,6 +41,7 @@ _extra_blas = [
('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ),
('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ),
('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ),
('', 'sgersum', 'dgersum', 'cgersum', 'zgersum' ),
]
_extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ]
......
......@@ -30,6 +30,148 @@
#define WA(m, n) WA, m, n
#define WB(m, n) WB, m, n
/**
 *  Parallel tile matrix-matrix multiplication.
 *  Variant where the A tiles are stationary: a node is flagged to submit the
 *  tasks of C(m, n) when it owns C(m, n) itself, or at least one tile of the
 *  row (ChamNoTrans) or column (Cham[Conj]Trans) of A that contributes to
 *  C(m, n).
 *
 *  For every tile C(m, n), in this order:
 *    1. C(m, n) is scaled by beta (forcesub disabled),
 *    2. one INSERT_TASK_zgemm_Astat() per k accumulates alpha * op(A) * op(B)
 *       into C(m, n) with a unit beta (forcesub set to the per-tile flag),
 *    3. RUNTIME_zgersum_submit_tree() and RUNTIME_data_flush() are called on
 *       C(m, n).
 *
 *  options->forcesub is reset to 0 before returning.
 *
 *  NOTE(review): the per-tile flags live in a variable-length array of
 *  C->mt * C->nt ints on the stack; confirm this is acceptable for the
 *  largest tile counts expected.
 */
static inline void
chameleon_pzgemm_Astat( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
                        RUNTIME_option_t *options )
{
    const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
    RUNTIME_sequence_t *sequence = options->sequence;
    int m, n, k;
    int myrank = RUNTIME_comm_rank( chamctxt );
    int reduceC[ C->mt * C->nt ];

    const int notransA = ( transA == ChamNoTrans );
    const int notransB = ( transB == ChamNoTrans );
    /* Number of tiles along the inner dimension of the product. */
    const int kt = notransA ? A->nt : A->mt;

    /*
     * First pass: decide, tile by tile, whether this node takes part in the
     * computation of C(m, n), and switch those tiles to redux mode.
     */
    for (n = 0; n < C->nt; n++) {
        for (m = 0; m < C->mt; m++) {
            /* The node owns the C tile. */
            int involved = ( C->get_rankof( C(m, n) ) == myrank );

            /*
             * Otherwise, look for an A tile owned by the node among those
             * contributing to C(m, n); stop at the first match.
             */
            for (k = 0; !involved && (k < kt); k++) {
                involved = notransA
                    ? ( A->get_rankof( A(m, k) ) == myrank )
                    : ( A->get_rankof( A(k, m) ) == myrank );
            }

            reduceC[ n * C->mt + m ] = involved;
            if ( involved ) {
                RUNTIME_zgersum_set_methods( C(m, n) );
            }
        }
    }

    /*
     * Second pass: submit the scaling, the GEMM updates and the final
     * reduction for each tile of C.
     */
    for (n = 0; n < C->nt; n++) {
        const int tempnn = ( n == C->nt-1 ) ? C->n - n * C->nb : C->nb;

        for (m = 0; m < C->mt; m++) {
            const int tempmm = ( m == C->mt-1 ) ? C->m - m * C->mb : C->mb;

            /* Scale C: submitted with the regular submission rule. */
            options->forcesub = 0;
            INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb,
                                 beta, C, m, n );

            /* The updates are force-submitted on the nodes flagged above. */
            options->forcesub = reduceC[ n * C->mt + m ];

            for (k = 0; k < kt; k++) {
                /* Size of the k-th inner block, taken from the rows or columns of A. */
                int tempkk;
                if ( notransA ) {
                    tempkk = ( k == A->nt-1 ) ? A->n - k * A->nb : A->nb;
                }
                else {
                    tempkk = ( k == A->mt-1 ) ? A->m - k * A->mb : A->mb;
                }

                /* Tile coordinates of op(A) and op(B) for this step. */
                const int arow = notransA ? m : k;
                const int acol = notransA ? k : m;
                const int brow = notransB ? k : n;
                const int bcol = notransB ? n : k;

                INSERT_TASK_zgemm_Astat(
                    options,
                    transA, transB,
                    tempmm, tempnn, tempkk, A->mb,
                    alpha, A(arow, acol),
                           B(brow, bcol),
                    zone,  C(m, n) );
            }

            RUNTIME_zgersum_submit_tree( options, C(m, n) );
            RUNTIME_data_flush( sequence, C(m, n) );
        }
    }

    options->forcesub = 0;
    (void)chamctxt;
}
/**
* Parallel tile matrix-matrix multiplication
* SUMMA algorithm for 2D block-cyclic data distribution.
......@@ -284,6 +426,7 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws,
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
......@@ -291,13 +434,21 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws,
}
RUNTIME_options_init( &options, chamctxt, sequence, request );
if ( ws && ws->summa )
{
switch( alg ) {
case ChamGemmAlgAuto:
case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */
case ChamGemmAlgGeneric:
chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
break;
case ChamGemmAlgSummaC:
chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C,
&(ws->WA), &(ws->WB), &options );
}
else {
chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
break;
case ChamGemmAlgSummaA:
chameleon_pzgemm_Astat( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
break;
}
RUNTIME_options_finalize( &options, chamctxt );
......
......@@ -175,7 +175,7 @@ chameleon_pzgepdf_qdwh_init( const CHAM_desc_t *U, const CHAM_desc_t *H,
/*
* Allocate the data descriptors for the lookahead if needed
*/
*gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, NULL, NULL, U );
*gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, U, U, U );
return;
}
......
......@@ -103,14 +103,77 @@ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA __attribute__((unused)
}
options = calloc( 1, sizeof(struct chameleon_pzgemm_s) );
options->summa = 0;
options->alg = ChamGemmAlgAuto;
/*
* If only one process, or if generic has been globally enforced, we switch
* to generic immediately.
*/
if ( ((C->p == 1) && (C->q == 1)) ||
(chamctxt->generic_enabled == CHAMELEON_TRUE) )
{
options->alg = ChamGemmAlgGeneric;
}
if ( ((C->p > 1) || (C->q > 1)) &&
(C->get_rankof == chameleon_getrankof_2d) &&
(chamctxt->generic_enabled != CHAMELEON_TRUE) )
/* Look at environment variable is something enforces the variant. */
if ( options->alg == ChamGemmAlgAuto )
{
char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" );
if ( algostr ) {
if ( strcasecmp( algostr, "summa_c" ) == 0 ) {
options->alg = ChamGemmAlgSummaC;
}
else if ( strcasecmp( algostr, "summa_a" ) == 0 ) {
options->alg = ChamGemmAlgSummaA;
}
else if ( strcasecmp( algostr, "summa_b" ) == 0 ) {
options->alg = ChamGemmAlgSummaB;
}
else if ( strcasecmp( algostr, "generic" ) == 0 ) {
options->alg = ChamGemmAlgGeneric;
}
else if ( strcasecmp( algostr, "auto" ) == 0 ) {
options->alg = ChamGemmAlgAuto;
}
else {
fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" );
}
}
chameleon_cleanenv( algostr );
}
/* Perform automatic choice if not already enforced. */
if ( options->alg == ChamGemmAlgAuto )
{
double sizeA, sizeB, sizeC;
double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */
/* Compute the average array per node for each matrix */
sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q);
sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q);
sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio;
if ( (sizeC > sizeA) && (sizeC > sizeB) ) {
options->alg = ChamGemmAlgSummaC;
}
else {
if ( sizeA > sizeB ) {
options->alg = ChamGemmAlgSummaA;
}
else {
options->alg = ChamGemmAlgSummaB;
}
}
}
assert( options->alg != ChamGemmAlgAuto );
/* Now that we have decided which algorithm, let's allocate the required data structures. */
if ( (options->alg == ChamGemmAlgSummaC ) &&
(C->get_rankof == chameleon_getrankof_2d ) )
{
int lookahead = chamctxt->lookahead;
options->summa = 1;
chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
......@@ -150,7 +213,7 @@ void CHAMELEON_zgemm_WS_Free( void *user_ws )
{
struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws;
if ( ws->summa ) {
if ( ws->alg == ChamGemmAlgSummaC ) {
chameleon_desc_destroy( &(ws->WA) );
chameleon_desc_destroy( &(ws->WB) );
}
......
......@@ -30,7 +30,7 @@
* @brief Data structure to handle the GEMM workspaces
*/
struct chameleon_pzgemm_s {
/*
 * Set to 0 when the workspace is allocated and to 1 when the SUMMA
 * workspaces are initialised.
 * NOTE(review): the algorithm selection now goes through `alg`; confirm
 * whether this flag is still needed.
 */
int summa;
/*
 * GEMM variant chosen by CHAMELEON_zgemm_WS_Alloc() (2D-grid check,
 * CHAMELEON_GEMM_ALGO environment variable, or size-based heuristic) and
 * dispatched on by chameleon_pzgemm().
 */
cham_gemm_t alg;
/*
 * Workspace descriptors handed to chameleon_pzgemm_summa() and destroyed by
 * CHAMELEON_zgemm_WS_Free() when alg is ChamGemmAlgSummaC.
 */
CHAM_desc_t WA;
CHAM_desc_t WB;
};
......
......@@ -233,9 +233,9 @@ inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n )
* CHAMELEON_ACCESS_RW(C, Cm, Cn)
* CHAMELEON_END_ACCESS_DECLARATION
*/
#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \
unsigned __chameleon_need_exec = 0; \
unsigned __chameleon_need_submit = 0; \
#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \
unsigned __chameleon_need_exec = 0; \
unsigned __chameleon_need_submit = options->forcesub; \
RUNTIME_BEGIN_ACCESS_DECLARATION
#define CHAMELEON_ACCESS_R(A, Am, An) do { \
......
......@@ -914,6 +914,17 @@
int CHAMELEON_Get (CHAMELEON_enum param, int *value);
#+end_src
* Alternatively, Chameleon can also be configured through environment variables.
* *CHAMELEON_GEMM_ALGO* gives the possibility to switch among
multiple variants of the GEMM algorithm. These variants are
*GENERIC* for the generic variant that should work with any
configuration; *SUMMA_C* that works for a 2D block-cyclic
distribution of the matrices A, B, and C with a C-stationary
version; *SUMMA_A* and *SUMMA_B* are SUMMA variants of the
algorithm that work for any distribution with, respectively,
*A* or *B* stationary. Note that the last two
variants are only available with the StarPU runtime backend.
**** Auxiliary routines
Reports CHAMELEON version number.
......
......@@ -181,6 +181,16 @@ typedef enum chameleon_store_e {
ChamEltwise = 403, /**< Element by element storage */
} cham_store_t;
/**
 * @brief Chameleon GEMM-like algorithms
 *
 * Identifies the variant used to perform a distributed GEMM. The value is
 * stored in the GEMM workspace and used to dispatch to the matching
 * implementation.
 */
typedef enum chameleon_gemm_e {
    ChamGemmAlgAuto    = -1, /**< Not decided yet: the variant is selected automatically */
    ChamGemmAlgGeneric =  0, /**< Generic variant, usable with any configuration         */
    ChamGemmAlgSummaA  =  1, /**< SUMMA variant with A stationary                        */
    ChamGemmAlgSummaB  =  2, /**< SUMMA variant with B stationary                        */
    ChamGemmAlgSummaC  =  3  /**< SUMMA variant with C stationary                        */
} cham_gemm_t;
#define ChameleonTrd 1001
#define ChameleonBrd 1002
......
......@@ -84,6 +84,7 @@ typedef struct runtime_option_s {
int parallel; /**< Enable/Disable the parallel version of submitted tasks */
int priority; /**< Define the submitted task priority */
int workerid; /**< Define the prefered worker id to perform the tasks */
int forcesub; /**< Force task submission if true */
size_t ws_wsize; /**< Define the worker workspace size */
size_t ws_hsize; /**< Define the host workspace size for hybrid CPU/GPU kernel */
void *ws_worker; /**< Pointer to the worker workspace (structure) */
......
......@@ -65,6 +65,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
/*
 * GEMM task insertion used by the A-stationary algorithm
 * (chameleon_pzgemm_Astat). The parameter list is identical to
 * INSERT_TASK_zgemm(): C(Cm, Cn) receives alpha * op(A(Am, An)) * op(B(Bm, Bn))
 * + beta * C(Cm, Cn), with m, n, k the dimensions of the tile product.
 * NOTE(review): backends without a dedicated implementation simply forward to
 * INSERT_TASK_zgemm(); check the runtime-specific codelet for the actual
 * task placement rule.
 */
void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
cham_trans_t transA, cham_trans_t transB,
int m, int n, int k, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options,
int m, int n, int ib, int nb,
const CHAM_desc_t *A, int Am, int An,
......@@ -452,4 +458,8 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options,
const CHAM_desc_t *D, int Dm, int Dn,
CHAM_desc_t *A, int Am, int An);
/*
 * Reduction helpers applied to the tile A(Am, An).
 * chameleon_pzgemm_Astat() calls RUNTIME_zgersum_set_methods() on every C tile
 * the local node contributes to before submitting the GEMM tasks, and
 * RUNTIME_zgersum_submit_tree() on each C tile once all its updates have been
 * submitted.
 * NOTE(review): runtimes without reduction support only print a warning in
 * set_methods and do nothing in submit_tree.
 */
void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An );
void RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
const CHAM_desc_t *A, int Am, int An );
#endif /* _chameleon_tasks_z_h_ */
......@@ -107,6 +107,10 @@ set(CODELETS_ZSRC
codelets/codelet_zgesum.c
codelets/codelet_zcesca.c
codelets/codelet_zgram.c
##################
# Reduction methods
##################
codelets/codelet_zgersum.c
)
set(CODELETS_SRC
......
......@@ -42,3 +42,16 @@ INSERT_TASK_zgemm( const RUNTIME_option_t *options,
(void)options;
(void)nb;
}
/*
 * A-stationary GEMM task insertion.
 * This backend has no dedicated A-stationary task: every argument is
 * forwarded unchanged to INSERT_TASK_zgemm().
 */
void
INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
cham_trans_t transA, cham_trans_t transB,
int m, int n, int k, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
/* Same arguments, same order: behaves exactly like a regular GEMM task. */
INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
alpha, A, Am, An, B, Bm, Bn,
beta, C, Cm, Cn );
}
/**
*
* @file openmp/codelet_zgersum.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zgersum OpenMP codelet
*
* @version 1.2.0
* @author Romain Peressoni
* @author Mathieu Faverge
* @date 2022-02-22
* @precisions normal z -> c d s
*
*/
#include "chameleon_openmp.h"
/**
 * Reduction set-up for the tile A(Am, An).
 *
 * Reductions are not supported by this runtime: the tile is left untouched
 * and a warning is written to stderr on every call.
 */
void
RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
{
    /* The tile designation is not used by this stub. */
    (void)A;
    (void)Am;
    (void)An;

    fputs( "WARNING: Reductions are not available with OpenMP\n", stderr );
}
/*
 * Reduction submission for the tile A(Am, An).
 * No-op in this backend: all parameters are deliberately ignored.
 */
void
RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
const CHAM_desc_t *A, int Am, int An )
{
/* Silence unused-parameter warnings. */
(void)options;
(void)A;
(void)Am;
(void)An;
}
......@@ -30,6 +30,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
options->priority = RUNTIME_PRIORITY_MIN;
options->workerid = -1;
options->forcesub = 0;
options->ws_wsize = 0;
options->ws_hsize = 0;
options->ws_worker = NULL;
......
......@@ -89,3 +89,16 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
(void)nb;
}
/*
 * A-stationary GEMM task insertion.
 * This backend has no dedicated A-stationary task: every argument is
 * forwarded unchanged to INSERT_TASK_zgemm().
 */
void
INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
cham_trans_t transA, cham_trans_t transB,
int m, int n, int k, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
/* Same arguments, same order: behaves exactly like a regular GEMM task. */
INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
alpha, A, Am, An, B, Bm, Bn,
beta, C, Cm, Cn );
}
/**
*
* @file parsec/codelet_zgersum.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zgersum Parsec codelet
*
* @version 1.2.0
* @author Romain Peressoni
* @author Mathieu Faverge
* @date 2022-02-22
* @precisions normal z -> c d s
*
*/
#include "chameleon_parsec.h"
/**
 * Reduction set-up for the tile A(Am, An).
 *
 * Reductions are not supported by this runtime: the tile is left untouched
 * and a warning is written to stderr on every call.
 */
void
RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
{
    /* The tile designation is not used by this stub. */
    (void)A;
    (void)Am;
    (void)An;

    fputs( "WARNING: Reductions are not available with Parsec\n", stderr );
}
/*
 * Reduction submission for the tile A(Am, An).
 * No-op in this backend: all parameters are deliberately ignored.
 */
void
RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
const CHAM_desc_t *A, int Am, int An )
{
/* Silence unused-parameter warnings. */
(void)options;
(void)A;
(void)Am;
(void)An;
}
......@@ -27,7 +27,8 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
options->profiling = CHAMELEON_STATISTICS == CHAMELEON_TRUE;
options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
options->priority = RUNTIME_PRIORITY_MIN;
options->workerid = -1;
options->workerid = -1;
options->forcesub = 0;
options->ws_wsize = 0;
options->ws_hsize = 0;
options->ws_worker = NULL;
......
......@@ -75,3 +75,16 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC,
0);
}
/*
 * A-stationary GEMM task insertion.
 * This backend has no dedicated A-stationary task: every argument is
 * forwarded unchanged to INSERT_TASK_zgemm().
 */
void
INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
cham_trans_t transA, cham_trans_t transB,
int m, int n, int k, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
const CHAM_desc_t *B, int Bm, int Bn,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
{
/* Same arguments, same order: behaves exactly like a regular GEMM task. */
INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
alpha, A, Am, An, B, Bm, Bn,
beta, C, Cm, Cn );
}
/**
*
* @file quark/codelet_zgersum.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon zgersum Quark codelet
*
* @version 1.2.0
* @author Romain Peressoni
* @author Mathieu Faverge
* @date 2022-02-22
* @precisions normal z -> c d s
*
*/
#include "chameleon_quark.h"
/**
 * Reduction set-up for the tile A(Am, An).
 *
 * Reductions are not supported by this runtime: the tile is left untouched
 * and a warning is written to stderr on every call.
 */
void
RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
{
    /* The tile designation is not used by this stub. */
    (void)A;
    (void)Am;
    (void)An;

    fputs( "WARNING: Reductions are not available with Quark\n", stderr );
}
/*
 * Reduction submission for the tile A(Am, An).
 * No-op in this backend: all parameters are deliberately ignored.
 */
void
RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
const CHAM_desc_t *A, int Am, int An )
{
/* Silence unused-parameter warnings. */
(void)options;
(void)A;
(void)Am;
(void)An;
}
......@@ -40,6 +40,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
options->parallel = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
options->priority = RUNTIME_PRIORITY_MIN;
options->workerid = -1;
options->forcesub = 0;
options->ws_wsize = 0;
options->ws_hsize = 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment