Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 96ec99ec authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'feature/gemm_summa' into 'master'

SUMMA GEMM

See merge request !154
parents e02d3679 ef90cff0
No related branches found
No related tags found
1 merge request!154SUMMA GEMM
......@@ -26,16 +26,176 @@
#define A(m, n) A, m, n
#define B(m, n) B, m, n
#define C(m, n) C, m, n
#define WA(m, n) &WA, m, n
#define WB(m, n) &WB, m, n
/**
* Parallel tile matrix-matrix multiplication - dynamic scheduling
* Parallel tile matrix-matrix multiplication
* SUMMA algorithm for 2D block-cyclic data distribution.
*/
void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
static inline void
chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                        CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
                        RUNTIME_option_t *options )
{
    /*
     * C = alpha * op(A) * op(B) + beta * C, computed as a sequence of
     * rank-nb updates, one per tile index k of the inner dimension.
     *
     * chamctxt and options are owned by the caller (the chameleon_pzgemm
     * wrapper initializes and finalizes options); they must not be
     * redeclared here.
     */
    RUNTIME_sequence_t *sequence = options->sequence;
    int m, n, k, p, q, KT, K, lp, lq;
    int ldam, ldak, ldbn, ldbk, ldcm;
    int tempmm, tempnn, tempkk;
    int lookahead, myp, myq;

    CHAMELEON_Complex64_t zbeta;
    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
    CHAM_desc_t WA, WB;

    lookahead = chamctxt->lookahead;

    /*
     * Workspaces distributed on the same p-by-q grid as C:
     *  - WA has C->mt tile rows and (C->q * lookahead) tile columns,
     *  - WB has (C->p * lookahead) tile rows and C->nt tile columns.
     */
    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
                         NULL, NULL, NULL );
    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
                         NULL, NULL, NULL );

    /* Number of tiles (KT) and size (K) of the inner dimension */
    KT = transA == ChamNoTrans ? A->nt : A->mt;
    K  = transA == ChamNoTrans ? A->n  : A->m;

    /*
     * Coordinates of the local process in the grid of C: the update loop
     * below strides over the tiles of C with C->p and C->q, so the
     * coordinates have to be computed from C (as done in the hemm variant).
     */
    myp = C->myrank / C->q;
    myq = C->myrank % C->q;

    for (k = 0; k < KT; k++ ) {
        /* Slot of the workspaces used by this iteration (cycles every lookahead steps) */
        lp = (k % lookahead) * C->p;
        lq = (k % lookahead) * C->q;

        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
        /* beta is applied only once, on the first update of C */
        zbeta = k == 0 ? beta : zone;
        ldak = BLKLDD(A, k);
        ldbk = BLKLDD(B, k);

        /* Transfer ownership of the k-th tile column of op(A) */
        for (m = 0; m < C->mt; m ++ ) {
            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
            ldam = BLKLDD(A, m);

            if ( transA == ChamNoTrans ) {
                /* Copy A(m, k) into the workspace column (k % C->q) of the current slot */
                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempmm, tempkk, C->mb,
                    A(  m,  k ),                 ldam,
                    WA( m, (k % C->q) + lq ),    WA.mb );

                RUNTIME_data_flush( sequence, A( m, k ) );

                /* Forward the tile from one workspace column to the next one */
                for ( q=1; q < C->q; q++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempmm, tempkk, C->mb,
                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
                }
            }
            else {
                /* op(A)(m, k) is stored in A(k, m): a tempkk-by-tempmm tile */
                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempkk, tempmm, C->mb,
                    A(  k,  m ),                 ldak,
                    WA( m, (k % C->q) + lq ),    WA.mb );

                RUNTIME_data_flush( sequence, A( k, m ) );

                for ( q=1; q < C->q; q++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempkk, tempmm, C->mb,
                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
                }
            }
        }

        /* Transfer ownership of the k-th tile row of op(B) */
        for (n = 0; n < C->nt; n++) {
            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
            ldbn = BLKLDD(B, n);

            if ( transB == ChamNoTrans ) {
                /* Copy B(k, n) into the workspace row (k % C->p) of the current slot */
                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempkk, tempnn, C->mb,
                    B(   k,              n ), ldbk,
                    WB( (k % C->p) + lp, n ), WB.mb );

                RUNTIME_data_flush( sequence, B( k, n ) );

                /* Forward the tile from one workspace row to the next one */
                for ( p=1; p < C->p; p++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempkk, tempnn, C->mb,
                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
                }
            }
            else {
                /* op(B)(k, n) is stored in B(n, k): a tempnn-by-tempkk tile */
                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempnn, tempkk, C->mb,
                    B(   n,              k ), ldbn,
                    WB( (k % C->p) + lp, n ), WB.mb );

                RUNTIME_data_flush( sequence, B( n, k ) );

                for ( p=1; p < C->p; p++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempnn, tempkk, C->mb,
                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
                }
            }
        }

        /* Update the tiles C(m, n) with m = myp (mod C->p) and n = myq (mod C->q) */
        for (m = myp; m < C->mt; m+=C->p) {
            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
            ldcm = BLKLDD(C, m);

            for (n = myq; n < C->nt; n+=C->q) {
                tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;

                INSERT_TASK_zgemm(
                    options,
                    transA, transB,
                    tempmm, tempnn, tempkk, A->mb,
                    alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
                           WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
                    zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
            }
        }
    }

    RUNTIME_desc_flush( &WA, sequence );
    RUNTIME_desc_flush( &WB, sequence );
    RUNTIME_desc_flush( C,   sequence );

    /* The workspaces are destroyed below, so wait for the sequence first */
    chameleon_sequence_wait( chamctxt, sequence );
    chameleon_desc_destroy( &WA );
    chameleon_desc_destroy( &WB );
}
/**
* Parallel tile matrix-matrix multiplication.
* Generic algorithm for any data distribution.
*/
static inline void
chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_option_t *options )
{
RUNTIME_sequence_t *sequence = options->sequence;
int m, n, k;
int ldam, ldak, ldbn, ldbk, ldcm;
......@@ -44,12 +204,6 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t zbeta;
CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
for (m = 0; m < C->mt; m++) {
tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
ldcm = BLKLDD(C, m);
......@@ -66,11 +220,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkn, A->mb,
alpha, A(m, k), ldam, /* lda * Z */
B(k, n), ldbk, /* ldb * Y */
B(k, n), ldbk, /* ldb * Y */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -83,11 +237,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkn, A->mb,
alpha, A(m, k), ldam, /* lda * Z */
B(n, k), ldbn, /* ldb * Z */
B(n, k), ldbn, /* ldb * Z */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -103,11 +257,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldbk = BLKLDD(B, k);
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* lda * X */
B(k, n), ldbk, /* ldb * Y */
B(k, n), ldbk, /* ldb * Y */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -121,11 +275,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldak = BLKLDD(A, k);
zbeta = k == 0 ? beta : zone;
INSERT_TASK_zgemm(
&options,
options,
transA, transB,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* lda * X */
B(n, k), ldbn, /* ldb * Z */
B(n, k), ldbn, /* ldb * Z */
zbeta, C(m, n), ldcm); /* ldc * Y */
}
}
......@@ -142,5 +296,37 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
}
}
}
RUNTIME_options_finalize(&options, chamctxt);
(void)chamctxt;
}
/**
 * Parallel tile matrix-matrix multiplication: entry point.
 *
 * Initializes the runtime options, then dispatches to the SUMMA variant when
 * C is spread over more than one process (p > 1 or q > 1), its rank function
 * is chameleon_getrankof_2d, and the generic algorithms have not been forced
 * through chamctxt->generic_enabled. Every other case goes to the generic
 * variant. Returns immediately if the sequence is already in error.
 */
void
chameleon_pzgemm( cham_trans_t transA, cham_trans_t transB,
                  CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                  CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
                  RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
    RUNTIME_option_t options;
    CHAM_context_t  *chamctxt = chameleon_context_self();
    int              use_summa;

    if ( sequence->status != CHAMELEON_SUCCESS ) {
        return;
    }

    RUNTIME_options_init( &options, chamctxt, sequence, request );

    /* Same short-circuit order as the three-way test it replaces */
    use_summa = ( (C->p > 1) || (C->q > 1) )
        && ( C->get_rankof == chameleon_getrankof_2d )
        && ( chamctxt->generic_enabled != CHAMELEON_TRUE );

    if ( use_summa ) {
        chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
    }
    else {
        chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
    }

    RUNTIME_options_finalize( &options, chamctxt );
}
......@@ -23,20 +23,230 @@
*/
#include "control/common.h"
#define A(m,n) A, m, n
#define B(m,n) B, m, n
#define C(m,n) C, m, n
#define A(m, n) A, m, n
#define B(m, n) B, m, n
#define C(m, n) C, m, n
#define WA(m, n) &WA, m, n
#define WB(m, n) &WB, m, n
/**
* Parallel tile Hermitian matrix-matrix multiplication - dynamic scheduling
* Parallel tile hermitian matrix-matrix multiplication.
* SUMMA algorithm for 2D block-cyclic data distribution.
*/
void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
static inline void
chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                        CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
                        RUNTIME_option_t *options )
{
    /*
     * side == ChamLeft : C = alpha * A * B + beta * C
     * side == ChamRight: C = alpha * B * A + beta * C
     * with A Hermitian and only its uplo triangle referenced.
     *
     * The left operand of each rank-nb update is gathered in WA and the
     * right operand in WB, whatever the side: WA holds tiles of A on the
     * left side and tiles of B on the right side, WB holds the other one.
     *
     * chamctxt and options are owned by the caller (the chameleon_pzhemm
     * wrapper); they must not be redeclared here.
     */
    RUNTIME_sequence_t *sequence = options->sequence;
    cham_trans_t transA, transB;
    int Am, An, m, n, k, p, q, KT, K, lp, lq;
    int ldam, ldbk, ldbm, ldcm;
    int tempmm, tempnn, tempkk;
    int tempam, tempan; /* Dimensions of the tile A(Am, An) as it is stored */
    int lookahead, myp, myq;

    CHAMELEON_Complex64_t zbeta;
    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
    CHAM_desc_t WA, WB;

    lookahead = chamctxt->lookahead;

    /* Workspaces distributed on the same p-by-q grid as C */
    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
                         NULL, NULL, NULL );
    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
                         NULL, NULL, NULL );

    /* Number of tiles (KT) and size (K) of the inner dimension */
    KT = side == ChamLeft ? A->nt : A->mt;
    K  = side == ChamLeft ? A->n  : A->m;

    /* Coordinates of the local process in the grid of C */
    myp = C->myrank / C->q;
    myq = C->myrank % C->q;

    for (k = 0; k < KT; k++ ) {
        /* Slot of the workspaces used by this iteration (cycles every lookahead steps) */
        lp = (k % lookahead) * C->p;
        lq = (k % lookahead) * C->q;

        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
        /* beta is applied only once, on the first update of C */
        zbeta = k == 0 ? beta : zone;
        ldbk = BLKLDD(B, k);

        /* Transfer ownership of the k-th tile column of A (left) or B (right) */
        for (m = 0; m < C->mt; m ++ ) {
            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;

            if ( side == ChamLeft ) {
                if ( (( uplo == ChamUpper ) && ( m > k )) ||
                     (( uplo == ChamLower ) && ( m < k )) ) {
                    /*
                     * A(m, k) is not in the referenced triangle: take
                     * A(k, m) instead, which is a tempkk-by-tempmm tile.
                     */
                    Am = k;
                    An = m;
                    tempam = tempkk;
                    tempan = tempmm;
                }
                else {
                    Am = m;
                    An = k;
                    tempam = tempmm;
                    tempan = tempkk;
                }
                ldam = BLKLDD(A, Am);

                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempam, tempan, C->mb,
                    A(  Am, An ),                ldam,
                    WA( m, (k % C->q) + lq ),    WA.mb );

                RUNTIME_data_flush( sequence, A( Am, An ) );

                /* Forward the tile from one workspace column to the next one */
                for ( q=1; q < C->q; q++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempam, tempan, C->mb,
                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
                }
            }
            else {
                ldbm = BLKLDD(B, m);

                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempmm, tempkk, C->mb,
                    B(  m,  k ),                 ldbm,
                    WA( m, (k % C->q) + lq ),    WA.mb );

                RUNTIME_data_flush( sequence, B( m, k ) );

                for ( q=1; q < C->q; q++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempmm, tempkk, C->mb,
                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
                }
            }
        }

        /* Transfer ownership of the k-th tile row of B (left) or A (right) */
        for (n = 0; n < C->nt; n++) {
            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;

            if ( side == ChamRight ) {
                if ( (( uplo == ChamUpper ) && ( n < k )) ||
                     (( uplo == ChamLower ) && ( n > k )) ) {
                    /*
                     * A(k, n) is not in the referenced triangle: take
                     * A(n, k) instead, which is a tempnn-by-tempkk tile.
                     */
                    Am = n;
                    An = k;
                    tempam = tempnn;
                    tempan = tempkk;
                }
                else {
                    Am = k;
                    An = n;
                    tempam = tempkk;
                    tempan = tempnn;
                }
                ldam = BLKLDD(A, Am);

                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempam, tempan, C->mb,
                    A(  Am, An ),             ldam,
                    WB( (k % C->p) + lp, n ), WB.mb );

                RUNTIME_data_flush( sequence, A( Am, An ) );

                /* Forward the tile from one workspace row to the next one */
                for ( p=1; p < C->p; p++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempam, tempan, C->mb,
                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
                }
            }
            else {
                INSERT_TASK_zlacpy(
                    options,
                    ChamUpperLower, tempkk, tempnn, C->mb,
                    B(   k,              n ), ldbk,
                    WB( (k % C->p) + lp, n ), WB.mb );

                RUNTIME_data_flush( sequence, B( k, n ) );

                for ( p=1; p < C->p; p++ ) {
                    INSERT_TASK_zlacpy(
                        options,
                        ChamUpperLower, tempkk, tempnn, C->mb,
                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
                }
            }
        }

        /* Update the tiles C(m, n) with m = myp (mod C->p) and n = myq (mod C->q) */
        for (m = myp; m < C->mt; m+=C->p) {
            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
            ldcm = BLKLDD(C, m);

            for (n = myq; n < C->nt; n+=C->q) {
                tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;

                if ( side == ChamLeft ) {
                    if ( k == m ) {
                        /* Diagonal tile of A: WA holds A(k, k), WB holds B(k, n) */
                        INSERT_TASK_zhemm(
                            options, side, uplo,
                            tempmm, tempnn, A->mb,
                            alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
                                   WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
                            zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
                    }
                    else {
                        transB = ChamNoTrans;
                        if ( (( uplo == ChamUpper ) && ( m > k )) ||
                             (( uplo == ChamLower ) && ( m < k )) ) {
                            /* WA holds A(k, m), the conjugate transpose of A(m, k) */
                            transA = ChamConjTrans;
                        }
                        else {
                            transA = ChamNoTrans;
                        }

                        INSERT_TASK_zgemm(
                            options, transA, transB,
                            tempmm, tempnn, tempkk, A->mb,
                            alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
                                   WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
                            zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
                    }
                }
                else {
                    if ( k == n ) {
                        /*
                         * Diagonal tile of A: on the right side the
                         * Hermitian tile A(k, k) is in WB and B(m, k) in WA,
                         * and zhemm takes the Hermitian matrix first.
                         */
                        INSERT_TASK_zhemm(
                            options, side, uplo,
                            tempmm, tempnn, A->mb,
                            alpha, WB( myp + lp, n        ), WB.mb,  /* lda * Y */
                                   WA( m,        myq + lq ), WA.mb,  /* ldb * Z */
                            zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
                    }
                    else {
                        transA = ChamNoTrans;
                        if ( (( uplo == ChamUpper ) && ( n < k )) ||
                             (( uplo == ChamLower ) && ( n > k )) ) {
                            /* WB holds A(n, k), the conjugate transpose of A(k, n) */
                            transB = ChamConjTrans;
                        }
                        else {
                            transB = ChamNoTrans;
                        }

                        INSERT_TASK_zgemm(
                            options, transA, transB,
                            tempmm, tempnn, tempkk, A->mb,
                            alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
                                   WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
                            zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
                    }
                }
            }
        }
    }

    RUNTIME_desc_flush( &WA, sequence );
    RUNTIME_desc_flush( &WB, sequence );
    RUNTIME_desc_flush( C,   sequence );

    /* The workspaces are destroyed below, so wait for the sequence first */
    chameleon_sequence_wait( chamctxt, sequence );
    chameleon_desc_destroy( &WA );
    chameleon_desc_destroy( &WB );
}
/**
* Parallel tile hermitian matrix-matrix multiplication.
* Generic algorithm for any data distribution.
*/
static inline void
chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
RUNTIME_option_t *options )
{
int k, m, n;
int ldam, ldan, ldak, ldbk, ldbm, ldcm;
int tempmm, tempnn, tempkn, tempkm;
......@@ -44,12 +254,6 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t zbeta;
CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
chamctxt = chameleon_context_self();
if (sequence->status != CHAMELEON_SUCCESS) {
return;
}
RUNTIME_options_init(&options, chamctxt, sequence, request);
for(m = 0; m < C->mt; m++) {
tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
ldcm = BLKLDD(C, m);
......@@ -68,7 +272,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < m) {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(m, k), ldam, /* lda * K */
......@@ -78,7 +282,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == m) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * X */
......@@ -87,7 +291,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamConjTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* ldak * X */
......@@ -108,7 +312,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < m) {
INSERT_TASK_zgemm(
&options,
options,
ChamConjTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(k, m), ldak, /* ldak * X */
......@@ -118,7 +322,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == m) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * K */
......@@ -127,7 +331,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkm, A->mb,
alpha, A(m, k), ldam, /* lda * K */
......@@ -151,7 +355,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < n) {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamConjTrans,
tempmm, tempnn, tempkn, A->mb,
alpha, B(m, k), ldbm, /* ldb * K */
......@@ -161,7 +365,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == n) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * Y */
......@@ -170,7 +374,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkn, A->mb,
alpha, B(m, k), ldbm, /* ldb * K */
......@@ -190,7 +394,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta = k == 0 ? beta : zone;
if (k < n) {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamNoTrans,
tempmm, tempnn, tempkn, A->mb,
alpha, B(m, k), ldbm, /* ldb * K */
......@@ -200,7 +404,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else {
if (k == n) {
INSERT_TASK_zhemm(
&options,
options,
side, uplo,
tempmm, tempnn, A->mb,
alpha, A(k, k), ldak, /* ldak * Y */
......@@ -209,7 +413,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else {
INSERT_TASK_zgemm(
&options,
options,
ChamNoTrans, ChamConjTrans,
tempmm, tempnn, tempkn, A->mb,
alpha, B(m, k), ldbm, /* ldb * K */
......@@ -222,5 +426,36 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
}
}
RUNTIME_options_finalize(&options, chamctxt);
(void)chamctxt;
}
/**
 * Parallel tile hermitian matrix-matrix multiplication: entry point.
 *
 * Initializes the runtime options, then dispatches to the SUMMA variant when
 * C is spread over more than one process (p > 1 or q > 1), its rank function
 * is chameleon_getrankof_2d, and the generic algorithms have not been forced
 * through chamctxt->generic_enabled. Every other case goes to the generic
 * variant. Returns immediately if the sequence is already in error.
 */
void
chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo,
                  CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                  CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
                  RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{
    RUNTIME_option_t options;
    CHAM_context_t  *chamctxt = chameleon_context_self();
    int              use_summa;

    if ( sequence->status != CHAMELEON_SUCCESS ) {
        return;
    }

    RUNTIME_options_init( &options, chamctxt, sequence, request );

    /* Same short-circuit order as the three-way test it replaces */
    use_summa = ( (C->p > 1) || (C->q > 1) )
        && ( C->get_rankof == chameleon_getrankof_2d )
        && ( chamctxt->generic_enabled != CHAMELEON_TRUE );

    if ( use_summa ) {
        chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
    }
    else {
        chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
    }

    RUNTIME_options_finalize( &options, chamctxt );
}
This diff is collapsed.
......@@ -62,6 +62,7 @@ CHAM_context_t *chameleon_context_create()
chamctxt->nb = 128;
chamctxt->ib = 32;
chamctxt->rhblock = 4;
chamctxt->lookahead = 3;
chamctxt->nworkers = 1;
chamctxt->ncudas = 0;
......@@ -72,11 +73,11 @@ CHAM_context_t *chameleon_context_create()
chamctxt->parallel_enabled = CHAMELEON_FALSE;
chamctxt->profiling_enabled = CHAMELEON_FALSE;
chamctxt->progress_enabled = CHAMELEON_FALSE;
chamctxt->generic_enabled = CHAMELEON_FALSE;
chamctxt->householder = ChamFlatHouseholder;
chamctxt->translation = ChamOutOfPlace;
/* Initialize scheduler */
RUNTIME_context_create(chamctxt);
......@@ -120,6 +121,7 @@ int chameleon_context_destroy(){
* @arg CHAMELEON_PROFILING_MODE activate profiling of kernels
* @arg CHAMELEON_PROGRESS activate progress indicator
* @arg CHAMELEON_GEMM3M Use z/cgemm3m for complexe matrix-matrix products
* @arg CHAMELEON_GENERIC force the generic algorithms instead of the distribution-specific ones (e.g. SUMMA for gemm/hemm)
*
*******************************************************************************
*
......@@ -160,6 +162,9 @@ int CHAMELEON_Enable(int option)
/* case CHAMELEON_PARALLEL: */
/* chamctxt->parallel_enabled = CHAMELEON_TRUE; */
/* break; */
case CHAMELEON_GENERIC:
chamctxt->generic_enabled = CHAMELEON_TRUE;
break;
default:
chameleon_error("CHAMELEON_Enable", "illegal parameter value");
return CHAMELEON_ERR_ILLEGAL_VALUE;
......@@ -225,6 +230,9 @@ int CHAMELEON_Disable(int option)
case CHAMELEON_PARALLEL_MODE:
chamctxt->parallel_enabled = CHAMELEON_FALSE;
break;
case CHAMELEON_GENERIC:
chamctxt->generic_enabled = CHAMELEON_FALSE;
break;
default:
chameleon_error("CHAMELEON_Disable", "illegal parameter value");
return CHAMELEON_ERR_ILLEGAL_VALUE;
......@@ -248,6 +256,7 @@ int CHAMELEON_Disable(int option)
* Feature to be enabled:
* @arg CHAMELEON_TILE_SIZE: size matrix tile,
* @arg CHAMELEON_INNER_BLOCK_SIZE: size of tile inner block,
* @arg CHAMELEON_LOOKAHEAD: depth of the look ahead in algorithms
*
* @param[in] value
* Value of the parameter.
......@@ -321,6 +330,13 @@ int CHAMELEON_Set( int param, int value )
}
chamctxt->translation = value;
break;
case CHAMELEON_LOOKAHEAD:
if (value < 1) {
chameleon_error("CHAMELEON_Set", "illegal value of CHAMELEON_LOOKAHEAD");
return CHAMELEON_ERR_ILLEGAL_VALUE;
}
chamctxt->lookahead = value;
break;
default:
chameleon_error("CHAMELEON_Set", "unknown parameter");
return CHAMELEON_ERR_ILLEGAL_VALUE;
......@@ -341,6 +357,7 @@ int CHAMELEON_Set( int param, int value )
* Feature to be enabled:
* @arg CHAMELEON_TILE_SIZE: size matrix tile,
* @arg CHAMELEON_INNER_BLOCK_SIZE: size of tile inner block,
* @arg CHAMELEON_LOOKAHEAD: depth of the look ahead in algorithms
*
* @param[out] value
* Value of the parameter.
......@@ -375,6 +392,9 @@ int CHAMELEON_Get(int param, int *value)
case CHAMELEON_TRANSLATION_MODE:
*value = chamctxt->translation;
return CHAMELEON_SUCCESS;
case CHAMELEON_LOOKAHEAD:
*value = chamctxt->lookahead;
return CHAMELEON_SUCCESS;
default:
chameleon_error("CHAMELEON_Get", "unknown parameter");
return CHAMELEON_ERR_ILLEGAL_VALUE;
......
......@@ -68,6 +68,27 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc )
return CHAMELEON_SUCCESS;
}
/**
 * Internal function returning the MPI rank that owns the tile A(m,n),
 * where m and n are block (tile) indices.
 *
 * The rank is (row % p) * q + (col % q), with row and col the tile indices
 * shifted by A->i / A->mb and A->j / A->nb respectively.
 */
int chameleon_getrankof_2d( const CHAM_desc_t *A, int m, int n )
{
    const int row  = m + A->i / A->mb;
    const int col  = n + A->j / A->nb;
    const int prow = row % A->p;
    const int pcol = col % A->q;

    return prow * A->q + pcol;
}
/**
 * Internal function returning the MPI rank that owns the diagonal tile
 * DIAG(m,0), where m and n are block (tile) indices.
 *
 * Only m is used to compute the rank; n must be equal to m, which is
 * checked by an assertion.
 */
int chameleon_getrankof_2d_diag( const CHAM_desc_t *A, int m, int n )
{
    int mm = m + A->i / A->mb;

    assert( m == n );
    /* n is only read by the assertion: keep NDEBUG builds free of warnings */
    (void)n;

    return (mm % A->p) * A->q + (mm % A->q);
}
/**
******************************************************************************
*
......
......@@ -44,8 +44,8 @@ inline static int chameleon_getblkldd_ccrb(const CHAM_desc_t *A, int m);
/**
* Data distributions
*/
inline static int chameleon_getrankof_2d(const CHAM_desc_t *desc, int m, int n);
inline static int chameleon_getrankof_2d_diag(const CHAM_desc_t *desc, int m, int n);
int chameleon_getrankof_2d(const CHAM_desc_t *desc, int m, int n);
int chameleon_getrankof_2d_diag(const CHAM_desc_t *desc, int m, int n);
int chameleon_desc_init ( CHAM_desc_t *desc, void *mat,
cham_flttype_t dtyp, int mb, int nb, int bsiz,
......@@ -175,27 +175,6 @@ inline static int chameleon_getblkldd_cm(const CHAM_desc_t *A, int m) {
return A->llm;
}
/**
 * Internal function returning the MPI rank that owns the tile A(m,n),
 * where m and n are block (tile) indices.
 *
 * The rank is (row % p) * q + (col % q), with row and col the tile indices
 * shifted by A->i / A->mb and A->j / A->nb respectively.
 */
inline static int chameleon_getrankof_2d(const CHAM_desc_t *A, int m, int n)
{
    const int row  = m + A->i / A->mb;
    const int col  = n + A->j / A->nb;
    const int prow = row % A->p;
    const int pcol = col % A->q;

    return prow * A->q + pcol;
}
/**
 * Internal function returning the MPI rank that owns the diagonal tile
 * DIAG(m,0), where m and n are block (tile) indices.
 *
 * Only m is used to compute the rank; n must be equal to m, which is
 * checked by an assertion.
 */
inline static int chameleon_getrankof_2d_diag(const CHAM_desc_t *A, int m, int n)
{
    int mm = m + A->i / A->mb;

    assert( m == n );
    /* n is only read by the assertion: keep NDEBUG builds free of warnings */
    (void)n;

    return (mm % A->p) * A->q + (mm % A->q);
}
/**
* Detect if the tile is local or not
*/
......
......@@ -182,6 +182,7 @@ typedef enum chameleon_store_e {
#define CHAMELEON_BOUND 7
#define CHAMELEON_PROGRESS 8
#define CHAMELEON_GEMM3M 9
#define CHAMELEON_GENERIC 10
/**
* CHAMELEON constants - configuration parameters
......@@ -192,6 +193,7 @@ typedef enum chameleon_store_e {
#define CHAMELEON_HOUSEHOLDER_MODE 5
#define CHAMELEON_HOUSEHOLDER_SIZE 6
#define CHAMELEON_TRANSLATION_MODE 7
#define CHAMELEON_LOOKAHEAD 8
/**
* @brief QR/LQ factorization trees
......
......@@ -116,6 +116,7 @@ typedef struct chameleon_context_s {
cham_bool_t parallel_enabled;
cham_bool_t profiling_enabled;
cham_bool_t progress_enabled;
cham_bool_t generic_enabled;
cham_householder_t householder; // "domino" (flat) or tree-based (reduction) Householder
cham_translation_t translation; // In place or Out of place layout conversion
......@@ -123,6 +124,7 @@ typedef struct chameleon_context_s {
int nb;
int ib;
int rhblock; // block size for tree-based (reduction) Householder
int lookahead; // depth of the look ahead in algorithms
void *schedopt; // structure for runtimes
int mpi_outer_init; // MPI has been initialized outside our functions
} CHAM_context_t;
......
......@@ -85,6 +85,8 @@ void RUNTIME_enable( void *runtime_ctxt, int lever )
default:
return;
}
(void)runtime_ctxt;
return;
}
......@@ -107,5 +109,7 @@ void RUNTIME_disable( void *runtime_ctxt, int lever )
default:
return;
}
(void)runtime_ctxt;
return;
}
......@@ -56,7 +56,7 @@ void RUNTIME_iteration_pop( CHAM_context_t *chamctxt )
void RUNTIME_start_profiling(){
#if defined(HAVE_STARPU_FXT_PROFILING)
starpu_fxt_start_profiling();
starpu_fxt_start_profiling();
#else
fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n");
#endif
......@@ -64,7 +64,7 @@ void RUNTIME_start_profiling(){
void RUNTIME_stop_profiling(){
#if defined(HAVE_STARPU_FXT_PROFILING)
starpu_fxt_stop_profiling();
starpu_fxt_stop_profiling();
#else
fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n");
#endif
......
......@@ -212,7 +212,6 @@ static int compute_gram_sequential(cham_uplo_t uplo,
int LDA)
{
int m, n;
double eps;
double squareij, mean_dij, mhalf;
double *work = (double *)malloc(N * sizeof(double));
......@@ -258,4 +257,4 @@ static int compute_gram_sequential(cham_uplo_t uplo,
free(work);
return 0;
}
\ No newline at end of file
}
......@@ -103,6 +103,8 @@ set(ZSRC_LAP_INT
set(ZSRC_TIL_INT
# BLAS 3
time_zgemm_tile.c
time_zhemm_tile.c
time_zsymm_tile.c
# LAPACK
time_zgels_tile.c
time_zgeqrf_hqr_tile.c
......
/**
*
* @file time_zhemm_tile.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @version 0.9.2
* @author Mathieu Faverge
* @date 2014-11-16
* @precisions normal z -> c
*
*/
#define _TYPE CHAMELEON_Complex64_t
#define _PREC double
#define _LAMCH LAPACKE_dlamch_work
#define _NAME "CHAMELEON_zhemm_Tile"
/* See Lawn 41 page 120 */
#define _FMULS FMULS_HEMM( ChamLeft, M, N )
#define _FADDS FADDS_HEMM( ChamLeft, M, N )
#include "./timing.c"
#include "timing_zauxiliary.h"
/*
 * Timing test for CHAMELEON_zhemm_Tile with side = ChamLeft and
 * uplo = ChamUpper.
 *
 * iparam: integer test parameters (leading dimensions are read from
 *         IPARAM_LDA, IPARAM_LDB and IPARAM_LDC).
 * dparam: receives the residual (IPARAM_RES) and the norms computed by
 *         z_check_hemm when the check is enabled.
 * t_:     not referenced directly in this function; presumably used by the
 *         START_TIMING/STOP_TIMING macros -- TODO confirm in timing.c.
 *
 * Always returns 0.
 */
static int
RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
{
CHAMELEON_Complex64_t alpha, beta;
/* NOTE(review): M, N, LDA, LDB, LDC and check are not declared here; they
 * presumably come from this macro -- confirm in the timing headers. */
PASTE_CODE_IPARAM_LOCALS( iparam );
/* The three leading dimensions are forced to be at least M */
LDA = chameleon_max(M, iparam[IPARAM_LDA]);
LDB = chameleon_max(M, iparam[IPARAM_LDB]);
LDC = chameleon_max(M, iparam[IPARAM_LDC]);
/* Allocate Data */
/* A is M-by-M, B and C are M-by-N */
PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDA, M, M );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDB, M, N );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDC, M, N );
/* Initialize Data */
CHAMELEON_zplghe_Tile( 0, ChamUpper, descA, 5373 );
CHAMELEON_zplrnt_Tile( descB, 7672 );
CHAMELEON_zplrnt_Tile( descC, 6387 );
/* Random alpha and beta, except in simulation mode where they are fixed */
#if !defined(CHAMELEON_SIMULATION)
LAPACKE_zlarnv_work(1, ISEED, 1, &alpha);
LAPACKE_zlarnv_work(1, ISEED, 1, &beta);
#else
alpha = 1.5;
beta = -2.3;
#endif
/* Save C for check */
PASTE_TILE_TO_LAPACK( descC, C2, check, CHAMELEON_Complex64_t, LDC, N );
/* Only the hemm call itself is between the timing macros */
START_TIMING();
CHAMELEON_zhemm_Tile( ChamLeft, ChamUpper, alpha, descA, descB, beta, descC );
STOP_TIMING();
#if !defined(CHAMELEON_SIMULATION)
/* Check the solution */
if (check)
{
PASTE_TILE_TO_LAPACK( descA, A, check, CHAMELEON_Complex64_t, LDA, M );
PASTE_TILE_TO_LAPACK( descB, B, check, CHAMELEON_Complex64_t, LDB, N );
PASTE_TILE_TO_LAPACK( descC, C, check, CHAMELEON_Complex64_t, LDC, N );
/* C is the Chameleon result; C2 is the saved initial C, used as reference */
dparam[IPARAM_RES] = z_check_hemm( ChamLeft, ChamUpper, M, N,
alpha, A, LDA, B, LDB, beta, C, C2, LDC,
&(dparam[IPARAM_ANORM]),
&(dparam[IPARAM_BNORM]),
&(dparam[IPARAM_XNORM]) );
free(A); free(B); free(C); free(C2);
}
#endif
PASTE_CODE_FREE_MATRIX( descA );
PASTE_CODE_FREE_MATRIX( descB );
PASTE_CODE_FREE_MATRIX( descC );
return 0;
}
/**
*
* @file time_zsymm_tile.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @version 0.9.2
* @author Mathieu Faverge
* @date 2014-11-16
* @precisions normal z -> c d s
*
*/
#define _TYPE CHAMELEON_Complex64_t
#define _PREC double
#define _LAMCH LAPACKE_dlamch_work
#define _NAME "CHAMELEON_zsymm_Tile"
/* See Lawn 41 page 120 */
#define _FMULS FMULS_SYMM( ChamLeft, M, N )
#define _FADDS FADDS_SYMM( ChamLeft, M, N )
#include "./timing.c"
#include "timing_zauxiliary.h"
/*
 * Timing test for CHAMELEON_zsymm_Tile with side = ChamLeft and
 * uplo = ChamUpper.
 *
 * iparam: integer test parameters (leading dimensions are read from
 *         IPARAM_LDA, IPARAM_LDB and IPARAM_LDC).
 * dparam: receives the residual (IPARAM_RES) and the norms computed by
 *         z_check_symm when the check is enabled.
 * t_:     not referenced directly in this function; presumably used by the
 *         START_TIMING/STOP_TIMING macros -- TODO confirm in timing.c.
 *
 * Always returns 0.
 */
static int
RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
{
CHAMELEON_Complex64_t alpha, beta;
/* NOTE(review): M, N, LDA, LDB, LDC and check are not declared here; they
 * presumably come from this macro -- confirm in the timing headers. */
PASTE_CODE_IPARAM_LOCALS( iparam );
/* The three leading dimensions are forced to be at least M */
LDA = chameleon_max(M, iparam[IPARAM_LDA]);
LDB = chameleon_max(M, iparam[IPARAM_LDB]);
LDC = chameleon_max(M, iparam[IPARAM_LDC]);
/* Allocate Data */
/* A is M-by-M, B and C are M-by-N */
PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDA, M, M );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDB, M, N );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDC, M, N );
/* Initialize Data */
/* NOTE(review): A is filled with the Hermitian generator (zplghe) although
 * this is the symm test; the symm call and its check both receive ChamUpper.
 * Confirm whether the symmetric generator was intended. */
CHAMELEON_zplghe_Tile( 0, ChamUpper, descA, 5373 );
CHAMELEON_zplrnt_Tile( descB, 7672 );
CHAMELEON_zplrnt_Tile( descC, 6387 );
/* Random alpha and beta, except in simulation mode where they are fixed */
#if !defined(CHAMELEON_SIMULATION)
LAPACKE_zlarnv_work(1, ISEED, 1, &alpha);
LAPACKE_zlarnv_work(1, ISEED, 1, &beta);
#else
alpha = 1.5;
beta = -2.3;
#endif
/* Save C for check */
PASTE_TILE_TO_LAPACK( descC, C2, check, CHAMELEON_Complex64_t, LDC, N );
/* Only the symm call itself is between the timing macros */
START_TIMING();
CHAMELEON_zsymm_Tile( ChamLeft, ChamUpper, alpha, descA, descB, beta, descC );
STOP_TIMING();
#if !defined(CHAMELEON_SIMULATION)
/* Check the solution */
if (check)
{
PASTE_TILE_TO_LAPACK( descA, A, check, CHAMELEON_Complex64_t, LDA, M );
PASTE_TILE_TO_LAPACK( descB, B, check, CHAMELEON_Complex64_t, LDB, N );
PASTE_TILE_TO_LAPACK( descC, C, check, CHAMELEON_Complex64_t, LDC, N );
/* C is the Chameleon result; C2 is the saved initial C, used as reference */
dparam[IPARAM_RES] = z_check_symm( ChamLeft, ChamUpper, M, N,
alpha, A, LDA, B, LDB, beta, C, C2, LDC,
&(dparam[IPARAM_ANORM]),
&(dparam[IPARAM_BNORM]),
&(dparam[IPARAM_XNORM]) );
free(A); free(B); free(C); free(C2);
}
#endif
PASTE_CODE_FREE_MATRIX( descA );
PASTE_CODE_FREE_MATRIX( descB );
PASTE_CODE_FREE_MATRIX( descC );
return 0;
}
......@@ -247,6 +247,70 @@ double z_check_gemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int
return Rnorm;
}
#if defined(PRECISION_z) || defined(PRECISION_c)
/*--------------------------------------------------------------
 * Check the hemm
 *
 * Applies cblas_zhemm to Cref (the saved initial C) and compares the
 * outcome with Ccham, the matrix computed by Chameleon. All norms are
 * LAPACKE_zlange_work 'I' norms of the M-by-N matrices.
 *
 * Cinitnorm   receives the norm of Cref before the reference update,
 * Cchamnorm   receives the norm of Ccham,
 * Clapacknorm receives the norm of Cref after the reference update.
 *
 * Cref is overwritten and holds (reference - Ccham) on exit.
 * Returns the norm of that difference.
 */
double z_check_hemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
                     CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
                     const CHAMELEON_Complex64_t *B, int LDB,
                     CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham,
                     CHAMELEON_Complex64_t *Cref, int LDC,
                     double *Cinitnorm, double *Cchamnorm, double *Clapacknorm )
{
    CHAMELEON_Complex64_t mzone = -1.0;
    double  residual;
    double *work;

    /* Workspace handed to every zlange call below */
    work = (double *)malloc( chameleon_max(M, N)* sizeof(double) );

    /* Norms of the initial C and of the Chameleon result */
    *Cinitnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref,  LDC, work );
    *Cchamnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Ccham, LDC, work );

    /* Reference result, computed in place in Cref */
    cblas_zhemm( CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, M, N,
                 CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), Cref, LDC );
    *Clapacknorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );

    /* Cref <- Cref - Ccham, over the LDC * N stored entries */
    cblas_zaxpy( LDC * N, CBLAS_SADDR(mzone), Ccham, 1, Cref, 1 );
    residual = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );

    free(work);
    return residual;
}
#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
/*--------------------------------------------------------------
 * Check the symm
 *
 * Applies cblas_zsymm to Cref (the saved initial C) and compares the
 * outcome with Ccham, the matrix computed by Chameleon. All norms are
 * LAPACKE_zlange_work 'I' norms of the M-by-N matrices.
 *
 * Cinitnorm   receives the norm of Cref before the reference update,
 * Cchamnorm   receives the norm of Ccham,
 * Clapacknorm receives the norm of Cref after the reference update.
 *
 * Cref is overwritten and holds (reference - Ccham) on exit.
 * Returns the norm of that difference.
 */
double z_check_symm( cham_side_t side, cham_uplo_t uplo, int M, int N,
                     CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
                     const CHAMELEON_Complex64_t *B, int LDB,
                     CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham,
                     CHAMELEON_Complex64_t *Cref, int LDC,
                     double *Cinitnorm, double *Cchamnorm, double *Clapacknorm )
{
    CHAMELEON_Complex64_t mzone = -1.0;
    double  residual;
    double *work;

    /* Workspace handed to every zlange call below */
    work = (double *)malloc( chameleon_max(M, N)* sizeof(double) );

    /* Norms of the initial C and of the Chameleon result */
    *Cinitnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref,  LDC, work );
    *Cchamnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Ccham, LDC, work );

    /* Reference result, computed in place in Cref */
    cblas_zsymm( CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, M, N,
                 CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), Cref, LDC );
    *Clapacknorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );

    /* Cref <- Cref - Ccham, over the LDC * N stored entries */
    cblas_zaxpy( LDC * N, CBLAS_SADDR(mzone), Ccham, 1, Cref, 1 );
    residual = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );

    free(work);
    return residual;
}
/*--------------------------------------------------------------
* Check the trsm
*/
......
......@@ -28,6 +28,21 @@ double z_check_gemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int
CHAMELEON_Complex64_t *Cref, int LDC,
double *Cinitnorm, double *Cchamnorm, double *Clapacknorm );
#if defined(PRECISION_z) || defined(PRECISION_c)
double z_check_hemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
const CHAMELEON_Complex64_t *B, int LDB,
CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham,
CHAMELEON_Complex64_t *Cref, int LDC,
double *Cinitnorm, double *Cchamnorm, double *Clapacknorm );
#endif
double z_check_symm( cham_side_t side, cham_uplo_t uplo, int M, int N,
CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
const CHAMELEON_Complex64_t *B, int LDB,
CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham,
CHAMELEON_Complex64_t *Cref, int LDC,
double *Cinitnorm, double *Cchamnorm, double *Clapacknorm );
double z_check_trsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, cham_diag_t diag,
int M, int NRHS, CHAMELEON_Complex64_t alpha,
CHAMELEON_Complex64_t *A, int LDA,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment