Commit 96ec99ec authored by Mathieu Faverge's avatar Mathieu Faverge

Merge branch 'feature/gemm_summa' into 'master'

SUMMA GEMM

See merge request !154
parents e02d3679 ef90cff0
......@@ -26,16 +26,176 @@
#define A(m, n) A, m, n
#define B(m, n) B, m, n
#define C(m, n) C, m, n
#define WA(m, n) &WA, m, n
#define WB(m, n) &WB, m, n
/**
* Parallel tile matrix-matrix multiplication - dynamic scheduling
* Parallel tile matrix-matrix multiplication
* SUMMA algorithm for 2D block-cyclic data distribution.
*/
void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
static inline void
chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_option_t *options )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
RUNTIME_sequence_t *sequence = options->sequence;
int m, n, k, p, q, KT, K, lp, lq;
int ldam, ldak, ldbn, ldbk, ldcm;
int tempmm, tempnn, tempkk;
int lookahead, myp, myq;
CHAMELEON_Complex64_t zbeta;
CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
CHAM_desc_t WA, WB;
lookahead = chamctxt->lookahead;
chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
NULL, NULL, NULL );
chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
NULL, NULL, NULL );
KT = transA == ChamNoTrans ? A->nt : A->mt;
K = transA == ChamNoTrans ? A->n : A->m;
myp = A->myrank / A->q;
myq = A->myrank % A->q;
/*
* A: ChamNoTrans / B: ChamNoTrans
*/
for (k = 0; k < KT; k++ ) {
lp = (k % lookahead) * C->p;
lq = (k % lookahead) * C->q;
tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
zbeta = k == 0 ? beta : zone;
ldak = BLKLDD(A, k);
ldbk = BLKLDD(B, k);
/* Transfert ownership of the k column of A */
for (m = 0; m < C->mt; m ++ ) {
tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
ldam = BLKLDD(A, m);
if ( transA == ChamNoTrans ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempmm, tempkk, C->mb,
A( m, k ), ldam,
WA( m, (k % C->q) + lq ), WA.mb );
RUNTIME_data_flush( sequence, A( m, k ) );
for ( q=1; q < C->q; q++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempmm, tempkk, C->mb,
WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
WA( m, ((k+q) % C->q) + lq ), WA.mb );
}
}
else {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempmm, C->mb,
A( k, m ), ldak,
WA( m, (k % C->q) + lq ), WA.mb );
RUNTIME_data_flush( sequence, A( k, m ) );
for ( q=1; q < C->q; q++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempmm, C->mb,
WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
WA( m, ((k+q) % C->q) + lq ), WA.mb );
}
}
}
/* Transfert ownership of the k row of B */
for (n = 0; n < C->nt; n++) {
tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
ldbn = BLKLDD(B, n);
if ( transB == ChamNoTrans ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempnn, C->mb,
B( k, n ), ldbk,
WB( (k % C->p) + lp, n ), WB.mb );
RUNTIME_data_flush( sequence, B( k, n ) );
for ( p=1; p < C->p; p++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempnn, C->mb,
WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
WB( ((k+p) % C->p) + lp, n ), WB.mb );
}
}
else {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempnn, tempkk, C->mb,
B( n, k ), ldbn,
WB( (k % C->p) + lp, n ), WB.mb );
RUNTIME_data_flush( sequence, B( n, k ) );
for ( p=1; p < C->p; p++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempnn, tempkk, C->mb,
WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
WB( ((k+p) % C->p) + lp, n ), WB.mb );
}
}
}
for (m = myp; m < C->mt; m+=C->p) {
tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
ldcm = BLKLDD(C, m);
for (n = myq; n < C->nt; n+=C->q) {
tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
INSERT_TASK_zgemm(
options,
transA, transB,
tempmm, tempnn, tempkk, A->mb,
alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */
WB( myp + lp, n ), WB.mb, /* ldb * Y */
zbeta, C( m, n ), ldcm ); /* ldc * Y */
}
}
}
RUNTIME_desc_flush( &WA, sequence );
RUNTIME_desc_flush( &WB, sequence );
RUNTIME_desc_flush( C, sequence );
chameleon_sequence_wait( chamctxt, sequence );
chameleon_desc_destroy( &WA );
chameleon_desc_destroy( &WB );
}
/**
* Parallel tile matrix-matrix multiplication.
* Generic algorithm for any data distribution.
*/
static inline void
chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_option_t *options )
{
RUNTIME_sequence_t *sequence = options->sequence;
int m, n, k;
int ldam, ldak, ldbn, ldbk, ldcm;
......@@ -44,12 +204,6 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t zbeta;
CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
for (m = 0; m < C->mt; m++) {
tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
ldcm = BLKLDD(C, m);
......@@ -66,11 +220,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkn, A->mb,
alpha, A(m, k), ldam, /* lda * Z */
B(k, n), ldbk, /* ldb * Y */
B(k, n), ldbk, /* ldb * Y */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -83,11 +237,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkn, A->mb,
alpha, A(m, k), ldam, /* lda * Z */
B(n, k), ldbn, /* ldb * Z */
B(n, k), ldbn, /* ldb * Z */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -103,11 +257,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* lda * X */
B(k, n), ldbk, /* ldb * Y */
B(k, n), ldbk, /* ldb * Y */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -121,11 +275,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldak = BLKLDD(A, k);
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* lda * X */
B(n, k), ldbn, /* ldb * Z */
B(n, k), ldbn, /* ldb * Z */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -142,5 +296,37 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
}
}
}
RUNTIME_options_finalize(&options, chamctxt);
(void)chamctxt;
}
/**
* Parallel tile matrix-matrix multiplication. wrapper.
*/
void
chameleon_pzgemm( cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init( &options, chamctxt, sequence, request );
if ( ((C->p > 1) || (C->q > 1)) &&
(C->get_rankof == chameleon_getrankof_2d) &&
(chamctxt->generic_enabled != CHAMELEON_TRUE) )
{
chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
}
else {
chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
}
RUNTIME_options_finalize( &options, chamctxt );
}
......@@ -23,20 +23,230 @@
*/
#include "control/common.h"
#define A(m,n) A, m, n
#define B(m,n) B, m, n
#define C(m,n) C, m, n
#define A(m, n) A, m, n
#define B(m, n) B, m, n
#define C(m, n) C, m, n
#define WA(m, n) &WA, m, n
#define WB(m, n) &WB, m, n
/**
* Parallel tile Hermitian matrix-matrix multiplication - dynamic scheduling
* Parallel tile hermitian matrix-matrix multiplication.
* SUMMA algorithm for 2D block-cyclic data distribution.
*/
void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
static inline void
chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_option_t *options )
{
CHAM_context_t *chamctxt;
RUNTIME_option_t options;
RUNTIME_sequence_t *sequence = options->sequence;
cham_trans_t transA, transB;
int Am, An, m, n, k, p, q, KT, K, lp, lq;
int ldam, ldbk, ldbm, ldcm;
int tempmm, tempnn, tempkk;
int lookahead, myp, myq;
CHAMELEON_Complex64_t zbeta;
CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
CHAM_desc_t WA, WB;
lookahead = chamctxt->lookahead;
chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
NULL, NULL, NULL );
chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
NULL, NULL, NULL );
KT = side == ChamLeft ? A->nt : A->mt;
K = side == ChamLeft ? A->n : A->m;
myp = C->myrank / C->q;
myq = C->myrank % C->q;
for (k = 0; k < KT; k++ ) {
lp = (k % lookahead) * C->p;
lq = (k % lookahead) * C->q;
tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
zbeta = k == 0 ? beta : zone;
ldbk = BLKLDD(B, k);
/* Transfert ownership of the k column of A or B */
for (m = 0; m < C->mt; m ++ ) {
tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
if ( side == ChamLeft ) {
if ( (( uplo == ChamUpper ) && ( m > k )) ||
(( uplo == ChamLower ) && ( m < k )) ) {
Am = k;
An = m;
}
else {
Am = m;
An = k;
}
ldam = BLKLDD(A, Am);
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempmm, tempkk, C->mb,
A( Am, An ), ldam,
WA( m, (k % C->q) + lq ), WA.mb );
RUNTIME_data_flush( sequence, A( Am, An ) );
for ( q=1; q < C->q; q++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempmm, tempkk, C->mb,
WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
WA( m, ((k+q) % C->q) + lq ), WA.mb );
}
}
else {
ldbm = BLKLDD(B, m);
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempmm, tempkk, C->mb,
B( m, k ), ldbm,
WA( m, (k % C->q) + lq ), WA.mb );
RUNTIME_data_flush( sequence, B( m, k ) );
for ( q=1; q < C->q; q++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempmm, tempkk, C->mb,
WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
WA( m, ((k+q) % C->q) + lq ), WA.mb );
}
}
}
/* Transfert ownership of the k row of B, or A */
for (n = 0; n < C->nt; n++) {
tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
if ( side == ChamRight ) {
if ( (( uplo == ChamUpper ) && ( n < k )) ||
(( uplo == ChamLower ) && ( n > k )) ) {
Am = n;
An = k;
}
else {
Am = k;
An = n;
}
ldam = BLKLDD(A, Am);
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempnn, C->mb,
A( Am, An ), ldam,
WB( (k % C->p) + lp, n ), WB.mb );
RUNTIME_data_flush( sequence, A( Am, An ) );
for ( p=1; p < C->p; p++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempnn, C->mb,
WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
WB( ((k+p) % C->p) + lp, n ), WB.mb );
}
}
else {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempnn, C->mb,
B( k, n ), ldbk,
WB( (k % C->p) + lp, n ), WB.mb );
RUNTIME_data_flush( sequence, B( k, n ) );
for ( p=1; p < C->p; p++ ) {
INSERT_TASK_zlacpy(
options,
ChamUpperLower, tempkk, tempnn, C->mb,
WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
WB( ((k+p) % C->p) + lp, n ), WB.mb );
}
}
}
/*
* ChamLeft / ChamLower
*/
for (m = myp; m < C->mt; m+=C->p) {
tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
ldcm = BLKLDD(C, m);
for (n = myq; n < C->nt; n+=C->q) {
tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
if (side == ChamLeft) {
transB = ChamNoTrans;
if ( (( uplo == ChamUpper ) && ( m > k )) ||
(( uplo == ChamLower ) && ( m < k )) ) {
transA = ChamConjTrans;
}
else {
transA = ChamNoTrans;
}
}
else {
transA = ChamNoTrans;
if ( (( uplo == ChamUpper ) && ( n < k )) ||
(( uplo == ChamLower ) && ( n > k )) ) {
transB = ChamConjTrans;
}
else {
transB = ChamNoTrans;
}
}
if ( k == m ) {
INSERT_TASK_zhemm(
options, side, uplo,
tempmm, tempnn, A->mb,
alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */
WB( myp + lp, n ), WB.mb, /* ldb * Y */
zbeta, C( m, n ), ldcm ); /* ldc * Y */
}
else {
INSERT_TASK_zgemm(
options, transA, transB,
tempmm, tempnn, tempkk, A->mb,
alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */
WB( myp + lp, n ), WB.mb, /* ldb * Y */
zbeta, C( m, n ), ldcm ); /* ldc * Y */
}
}
}
}
RUNTIME_desc_flush( &WA, sequence );
RUNTIME_desc_flush( &WB, sequence );
RUNTIME_desc_flush( C, sequence );
chameleon_sequence_wait( chamctxt, sequence );
chameleon_desc_destroy( &WA );
chameleon_desc_destroy( &WB );
}
/**
* Parallel tile hermitian matrix-matrix multiplication.
* Generic algorithm for any data distribution.
*/
static inline void
chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_option_t *options )
{
int k, m, n;
int ldam, ldan, ldak, ldbk, ldbm, ldcm;
int tempmm, tempnn, tempkn, tempkm;
......@@ -44,12 +254,6 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t zbeta;
CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
for(m = 0; m < C->mt; m++) {
tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
ldcm = BLKLDD(C, m);
......@@ -68,7 +272,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < m) {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(m, k), ldam, /* lda * K */
......@@ -78,7 +282,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == m) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * X */
......@@ -87,7 +291,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamConjTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* ldak * X */
......@@ -108,7 +312,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < m) {
INSERT_TASK_zgemm(
&options,
options,
ChamConjTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* ldak * X */
......@@ -118,7 +322,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == m) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * K */
......@@ -127,7 +331,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(m, k), ldam, /* lda * K */
......@@ -151,7 +355,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < n) {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamConjTrans,
tempmm, tempnn, tempkn, A->mb,
alpha, B(m, k), ldbm, /* ldb * K */
......@@ -161,7 +365,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == n) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * Y */
......@@ -170,7 +374,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkn, A->mb,
alpha, B(m, k), ldbm, /* ldb * K */
......@@ -190,7 +394,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < n) {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,