From b323523558ab8f0ba03a35b5c1d15c8acbe37bb4 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Thu, 7 Nov 2019 14:54:30 +0100
Subject: [PATCH] Fix hemm/symm summa versions

---
 compute/pzhemm.c | 393 ++++++++++++++++++++++++++++++-----------------
 compute/pzsymm.c | 393 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 512 insertions(+), 274 deletions(-)

diff --git a/compute/pzhemm.c b/compute/pzhemm.c
index 93efe6fca..b41220165 100644
--- a/compute/pzhemm.c
+++ b/compute/pzhemm.c
@@ -23,47 +23,36 @@
  */
 #include "control/common.h"
 
-#define A(m, n) A,  m,  n
-#define B(m, n) B,  m,  n
-#define C(m, n) C,  m,  n
-#define WA(m, n) &WA,  m,  n
-#define WB(m, n) &WB,  m,  n
+#define A(  _m_, _n_ ) A,  (_m_), (_n_)
+#define B(  _m_, _n_ ) B,  (_m_), (_n_)
+#define C(  _m_, _n_ ) C,  (_m_), (_n_)
+#define WA( _m_, _n_ ) WA, (_m_), (_n_)
+#define WB( _m_, _n_ ) WB, (_m_), (_n_)
 
 /**
  *  Parallel tile hermitian matrix-matrix multiplication.
  *  SUMMA algorithm for 2D block-cyclic data distribution.
  */
 static inline void
-chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
-                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
-                        RUNTIME_option_t *options )
+chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
+                             CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                             CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                             CHAM_desc_t *WA, CHAM_desc_t *WB,
+                             RUNTIME_option_t *options )
 {
     RUNTIME_sequence_t *sequence = options->sequence;
-    cham_trans_t transA, transB;
-    int Am, An, m, n, k, p, q, KT, K, lp, lq;
-    int ldam, ldbk, ldbm, ldcm;
+    cham_trans_t transA;
+    int m, n, k, p, q, KT, K, lp, lq;
+    int ldcm;
     int tempmm, tempnn, tempkk;
     int lookahead, myp, myq;
 
     CHAMELEON_Complex64_t zbeta;
     CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
-    CHAM_desc_t WA, WB;
 
     lookahead = chamctxt->lookahead;
-    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
-                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
-                         NULL, NULL, NULL );
-    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
-                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
-                         NULL, NULL, NULL );
-
-    KT  = side == ChamLeft ? A->nt : A->mt;
-    K   = side == ChamLeft ? A->n  : A->m;
+    KT  = A->nt;
+    K   = A->n;
     myp = C->myrank / C->q;
     myq = C->myrank % C->q;
 
@@ -72,162 +61,291 @@ chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t
         lq = (k % lookahead) * C->q;
         tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
         zbeta = k == 0 ? beta : zone;
-        ldbk = BLKLDD(B, k);
 
         /* Transfert ownership of the k column of A or B */
         for (m = 0; m < C->mt; m ++ ) {
-            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+            int Am, Ak, ldam;
+            int tempam, tempak;
 
-            if ( side == ChamLeft ) {
-                if ( (( uplo == ChamUpper ) && ( m > k )) ||
-                     (( uplo == ChamLower ) && ( m < k )) ) {
-                    Am = k;
-                    An = m;
-                }
-                else {
-                    Am = m;
-                    An = k;
-                }
-                ldam = BLKLDD(A, Am);
-
-                INSERT_TASK_zlacpy(
-                    options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
-                    A(  Am, An ),             ldam,
-                    WA( m, (k % C->q) + lq ), WA.mb );
-
-                RUNTIME_data_flush( sequence, A( Am, An ) );
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
 
-                for ( q=1; q < C->q; q++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempmm, tempkk, C->mb,
-                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
-                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
-                }
+            if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                 (( uplo == ChamLower ) && ( m < k )) )
+            {
+                    /* Let's take A( k, m ) */
+                Am = k;
+                Ak = m;
+                tempam = tempkk;
+                tempak = tempmm;
             }
             else {
-                ldbm = BLKLDD(B, m);
+                /* Let's take A( m, k ) */
+                Am = m;
+                Ak = k;
+                tempam = tempmm;
+                tempak = tempkk;
+            }
+            ldam = BLKLDD( A, Am );
 
-                INSERT_TASK_zlacpy(
-                    options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
-                    B(  m,  k ),              ldbm,
-                    WA( m, (k % C->q) + lq ), WA.mb );
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempam, tempak, C->mb,
+                A( Am, Ak ),              ldam,
+                WA( m, (k % C->q) + lq ), WA->mb );
 
-                RUNTIME_data_flush( sequence, B( m, k ) );
+            RUNTIME_data_flush( sequence, A( Am, Ak ) );
 
-                for ( q=1; q < C->q; q++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempmm, tempkk, C->mb,
-                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
-                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
-                }
+            for ( q=1; q < C->q; q++ ) {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempam, tempak, C->mb,
+                    WA( m, ((k+q-1) % C->q) + lq ), WA->mb,
+                    WA( m, ((k+q)   % C->q) + lq ), WA->mb );
             }
         }
 
         /* Transfert ownership of the k row of B, or A */
         for (n = 0; n < C->nt; n++) {
+            int ldbk;
+
             tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+            ldbk = BLKLDD( B, k );
 
-            if ( side == ChamRight ) {
-                if ( (( uplo == ChamUpper ) && ( n < k )) ||
-                     (( uplo == ChamLower ) && ( n > k )) ) {
-                    Am = n;
-                    An = k;
-                }
-                else {
-                    Am = k;
-                    An = n;
-                }
-                ldam = BLKLDD(A, Am);
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempkk, tempnn, C->mb,
+                B(   k,              n ), ldbk,
+                WB( (k % C->p) + lp, n ), WB->mb );
 
+            RUNTIME_data_flush( sequence, B( k, n ) );
+
+            for ( p=1; p < C->p; p++ ) {
                 INSERT_TASK_zlacpy(
                     options,
                     ChamUpperLower, tempkk, tempnn, C->mb,
-                    A(   Am,            An ), ldam,
-                    WB( (k % C->p) + lp, n ), WB.mb );
+                    WB( ((k+p-1) % C->p) + lp, n ), WB->mb,
+                    WB( ((k+p)   % C->p) + lp, n ), WB->mb );
+            }
+        }
+
+        /* Perform the update of this iteration */
+        for (m = myp; m < C->mt; m+=C->p) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+            ldcm = BLKLDD(C, m);
 
-                RUNTIME_data_flush( sequence, A( Am, An ) );
+            if ( k == m ) {
+                for (n = myq; n < C->nt; n+=C->q) {
+                    tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
 
-                for ( p=1; p < C->p; p++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempkk, tempnn, C->mb,
-                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
-                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                    INSERT_TASK_zhemm(
+                        options, ChamLeft, uplo,
+                        tempmm, tempnn, A->mb,
+                        alpha, WA( m,        myq + lq ), WA->mb,
+                               WB( myp + lp, n        ), WB->mb,
+                        zbeta, C(  m,        n        ), ldcm );
                 }
             }
             else {
+                if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                     (( uplo == ChamLower ) && ( m < k )) )
+                {
+                    transA = ChamConjTrans;
+                }
+                else {
+                    transA = ChamNoTrans;
+                }
+
+                for (n = myq; n < C->nt; n+=C->q) {
+                    tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+                    INSERT_TASK_zgemm(
+                        options, transA, ChamNoTrans,
+                        tempmm, tempnn, tempkk, A->mb,
+                        alpha, WA( m,        myq + lq ), WA->mb,
+                               WB( myp + lp, n        ), WB->mb,
+                        zbeta, C(  m,        n        ), ldcm );
+                }
+            }
+        }
+    }
+}
+
+/**
+ *  Parallel tile hermitian matrix-matrix multiplication.
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
+ */
+static inline void
+chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
+                              CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                              CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                              CHAM_desc_t *WA, CHAM_desc_t *WB,
+                              RUNTIME_option_t *options )
+{
+    RUNTIME_sequence_t *sequence = options->sequence;
+    cham_trans_t transA;
+    int m, n, k, p, q, KT, K, lp, lq;
+    int ldcm;
+    int tempmm, tempnn, tempkk;
+    int lookahead, myp, myq;
+
+    CHAMELEON_Complex64_t zbeta;
+    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+
+    lookahead = chamctxt->lookahead;
+    KT  = A->mt;
+    K   = A->m;
+    myp = C->myrank / C->q;
+    myq = C->myrank % C->q;
+
+    for (k = 0; k < KT; k++ ) {
+        lp = (k % lookahead) * C->p;
+        lq = (k % lookahead) * C->q;
+        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
+        zbeta = k == 0 ? beta : zone;
+
+        /* Transfert ownership of the k column of A or B */
+        for (m = 0; m < C->mt; m++ ) {
+            int ldbm;
+
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+            ldbm = BLKLDD( B, m );
+
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempmm, tempkk, C->mb,
+                B(  m,  k ),              ldbm,
+                WA( m, (k % C->q) + lq ), WA->mb );
+
+            RUNTIME_data_flush( sequence, B( m, k ) );
+
+            for ( q=1; q < C->q; q++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempkk, tempnn, C->mb,
-                    B(   k,              n ), ldbk,
-                    WB( (k % C->p) + lp, n ), WB.mb );
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    WA( m, ((k+q-1) % C->q) + lq ), WA->mb,
+                    WA( m, ((k+q)   % C->q) + lq ), WA->mb );
+            }
+        }
 
-                RUNTIME_data_flush( sequence, B( k, n ) );
+        /* Transfert ownership of the k row of B, or A */
+        for (n = 0; n < C->nt; n++) {
+            int Ak, An, ldak;
+            int tempak, tempan;
 
-                for ( p=1; p < C->p; p++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempkk, tempnn, C->mb,
-                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
-                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
-                }
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+            if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                 (( uplo == ChamLower ) && ( n > k )) )
+            {
+                Ak = n;
+                An = k;
+                tempak = tempnn;
+                tempan = tempkk;
+            }
+            else
+            {
+                Ak = k;
+                An = n;
+                tempak = tempkk;
+                tempan = tempnn;
+            }
+            ldak = BLKLDD( A, Ak );
+
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempak, tempan, C->mb,
+                A(  Ak,              An ), ldak,
+                WB( (k % C->p) + lp, n  ), WB->mb );
+
+            RUNTIME_data_flush( sequence, A( Ak, An ) );
+
+            for ( p=1; p < C->p; p++ ) {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempak, tempan, C->mb,
+                    WB( ((k+p-1) % C->p) + lp, n ), WB->mb,
+                    WB( ((k+p)   % C->p) + lp, n ), WB->mb );
             }
         }
 
-        /*
-         *  ChamLeft / ChamLower
-         */
-        for (m = myp; m < C->mt; m+=C->p) {
-            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
-            ldcm = BLKLDD(C, m);
+        /* Perform the update of this iteration */
+        for (n = myq; n < C->nt; n+=C->q) {
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
 
-            for (n = myq; n < C->nt; n+=C->q) {
-                tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+            if ( k == n ) {
+                for (m = myp; m < C->mt; m+=C->p) {
+                    tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+                    ldcm = BLKLDD(C, m);
 
-                if (side == ChamLeft) {
-                    transB = ChamNoTrans;
-                    if ( (( uplo == ChamUpper ) && ( m > k )) ||
-                         (( uplo == ChamLower ) && ( m < k )) ) {
-                        transA = ChamConjTrans;
-                    }
-                    else {
-                        transA = ChamNoTrans;
-                    }
+                    /* A has been stored in WA or WB for the summa ring */
+                    INSERT_TASK_zhemm(
+                        options, ChamRight, uplo,
+                        tempmm, tempnn, A->mb,
+                        alpha, WB( myp + lp, n        ), WB->mb,
+                               WA( m,        myq + lq ), WA->mb,
+                        zbeta, C(  m,        n        ), ldcm );
+                }
+            }
+            else {
+                if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                     (( uplo == ChamLower ) && ( n > k )) )
+                {
+                    transA = ChamConjTrans;
                 }
                 else {
                     transA = ChamNoTrans;
-                    if ( (( uplo == ChamUpper ) && ( n < k )) ||
-                         (( uplo == ChamLower ) && ( n > k )) ) {
-                        transB = ChamConjTrans;
-                    }
-                    else {
-                        transB = ChamNoTrans;
-                    }
                 }
 
-                if ( k == m ) {
-                    INSERT_TASK_zhemm(
-                        options, side, uplo,
-                        tempmm, tempnn, A->mb,
-                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
-                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
-                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
-                }
-                else {
+                for (m = myp; m < C->mt; m+=C->p) {
+                    tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+                    ldcm = BLKLDD(C, m);
+
                     INSERT_TASK_zgemm(
-                        options, transA, transB,
+                        options, ChamNoTrans, transA,
                         tempmm, tempnn, tempkk, A->mb,
-                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
-                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
-                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+                        alpha, WA( m,        myq + lq ), WA->mb,
+                               WB( myp + lp, n        ), WB->mb,
+                        zbeta, C(  m,        n        ), ldcm );
                 }
             }
         }
     }
+}
+
+/**
+ *  Parallel tile hermitian matrix-matrix multiplication.
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
+ */
+static inline void
+chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
+{
+    RUNTIME_sequence_t *sequence = options->sequence;
+    CHAM_desc_t WA, WB;
+    int lookahead;
+
+    lookahead = chamctxt->lookahead;
+    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                         NULL, NULL, NULL );
+    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                         NULL, NULL, NULL );
+
+    if (side == ChamLeft) {
+        chameleon_pzhemm_summa_left( chamctxt, uplo, alpha, A, B, beta, C,
+                                     &WA, &WB, options );
+    }
+    else {
+        chameleon_pzhemm_summa_right( chamctxt, uplo, alpha, A, B, beta, C,
+                                      &WA, &WB, options );
+    }
 
     RUNTIME_desc_flush( &WA, sequence );
     RUNTIME_desc_flush( &WB, sequence );
@@ -453,7 +571,8 @@ chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo,
     {
         chameleon_pzhemm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options );
     }
-    else {
+    else
+    {
         chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
     }
 
diff --git a/compute/pzsymm.c b/compute/pzsymm.c
index 0413d63b5..f9d724c08 100644
--- a/compute/pzsymm.c
+++ b/compute/pzsymm.c
@@ -23,47 +23,36 @@
  */
 #include "control/common.h"
 
-#define A(m, n) A,  m,  n
-#define B(m, n) B,  m,  n
-#define C(m, n) C,  m,  n
-#define WA(m, n) &WA,  m,  n
-#define WB(m, n) &WB,  m,  n
+#define A(  _m_, _n_ ) A,  (_m_), (_n_)
+#define B(  _m_, _n_ ) B,  (_m_), (_n_)
+#define C(  _m_, _n_ ) C,  (_m_), (_n_)
+#define WA( _m_, _n_ ) WA, (_m_), (_n_)
+#define WB( _m_, _n_ ) WB, (_m_), (_n_)
 
 /**
  *  Parallel tile symmetric matrix-matrix multiplication.
  *  SUMMA algorithm for 2D block-cyclic data distribution.
  */
 static inline void
-chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
-                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
-                        RUNTIME_option_t *options )
+chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
+                             CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                             CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                             CHAM_desc_t *WA, CHAM_desc_t *WB,
+                             RUNTIME_option_t *options )
 {
     RUNTIME_sequence_t *sequence = options->sequence;
-    cham_trans_t transA, transB;
-    int Am, An, m, n, k, p, q, KT, K, lp, lq;
-    int ldam, ldbk, ldbm, ldcm;
+    cham_trans_t transA;
+    int m, n, k, p, q, KT, K, lp, lq;
+    int ldcm;
     int tempmm, tempnn, tempkk;
     int lookahead, myp, myq;
 
     CHAMELEON_Complex64_t zbeta;
     CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
-    CHAM_desc_t WA, WB;
 
     lookahead = chamctxt->lookahead;
-    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
-                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
-                         NULL, NULL, NULL );
-    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
-                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
-                         NULL, NULL, NULL );
-
-    KT  = side == ChamLeft ? A->nt : A->mt;
-    K   = side == ChamLeft ? A->n  : A->m;
+    KT  = A->nt;
+    K   = A->n;
     myp = C->myrank / C->q;
     myq = C->myrank % C->q;
 
@@ -72,162 +61,291 @@ chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t
         lq = (k % lookahead) * C->q;
         tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
         zbeta = k == 0 ? beta : zone;
-        ldbk = BLKLDD(B, k);
 
         /* Transfert ownership of the k column of A or B */
         for (m = 0; m < C->mt; m ++ ) {
-            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+            int Am, Ak, ldam;
+            int tempam, tempak;
 
-            if ( side == ChamLeft ) {
-                if ( (( uplo == ChamUpper ) && ( m > k )) ||
-                     (( uplo == ChamLower ) && ( m < k )) ) {
-                    Am = k;
-                    An = m;
-                }
-                else {
-                    Am = m;
-                    An = k;
-                }
-                ldam = BLKLDD(A, Am);
-
-                INSERT_TASK_zlacpy(
-                    options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
-                    A(  Am, An ),             ldam,
-                    WA( m, (k % C->q) + lq ), WA.mb );
-
-                RUNTIME_data_flush( sequence, A( Am, An ) );
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
 
-                for ( q=1; q < C->q; q++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempmm, tempkk, C->mb,
-                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
-                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
-                }
+            if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                 (( uplo == ChamLower ) && ( m < k )) )
+            {
+                    /* Let's take A( k, m ) */
+                Am = k;
+                Ak = m;
+                tempam = tempkk;
+                tempak = tempmm;
             }
             else {
-                ldbm = BLKLDD(B, m);
+                /* Let's take A( m, k ) */
+                Am = m;
+                Ak = k;
+                tempam = tempmm;
+                tempak = tempkk;
+            }
+            ldam = BLKLDD( A, Am );
 
-                INSERT_TASK_zlacpy(
-                    options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
-                    B(  m,  k ),              ldbm,
-                    WA( m, (k % C->q) + lq ), WA.mb );
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempam, tempak, C->mb,
+                A( Am, Ak ),              ldam,
+                WA( m, (k % C->q) + lq ), WA->mb );
 
-                RUNTIME_data_flush( sequence, B( m, k ) );
+            RUNTIME_data_flush( sequence, A( Am, Ak ) );
 
-                for ( q=1; q < C->q; q++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempmm, tempkk, C->mb,
-                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
-                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
-                }
+            for ( q=1; q < C->q; q++ ) {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempam, tempak, C->mb,
+                    WA( m, ((k+q-1) % C->q) + lq ), WA->mb,
+                    WA( m, ((k+q)   % C->q) + lq ), WA->mb );
             }
         }
 
         /* Transfert ownership of the k row of B, or A */
         for (n = 0; n < C->nt; n++) {
+            int ldbk;
+
             tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+            ldbk = BLKLDD( B, k );
 
-            if ( side == ChamRight ) {
-                if ( (( uplo == ChamUpper ) && ( n < k )) ||
-                     (( uplo == ChamLower ) && ( n > k )) ) {
-                    Am = n;
-                    An = k;
-                }
-                else {
-                    Am = k;
-                    An = n;
-                }
-                ldam = BLKLDD(A, Am);
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempkk, tempnn, C->mb,
+                B(   k,              n ), ldbk,
+                WB( (k % C->p) + lp, n ), WB->mb );
 
+            RUNTIME_data_flush( sequence, B( k, n ) );
+
+            for ( p=1; p < C->p; p++ ) {
                 INSERT_TASK_zlacpy(
                     options,
                     ChamUpperLower, tempkk, tempnn, C->mb,
-                    A(   Am,            An ), ldam,
-                    WB( (k % C->p) + lp, n ), WB.mb );
+                    WB( ((k+p-1) % C->p) + lp, n ), WB->mb,
+                    WB( ((k+p)   % C->p) + lp, n ), WB->mb );
+            }
+        }
+
+        /* Perform the update of this iteration */
+        for (m = myp; m < C->mt; m+=C->p) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+            ldcm = BLKLDD(C, m);
 
-                RUNTIME_data_flush( sequence, A( Am, An ) );
+            if ( k == m ) {
+                for (n = myq; n < C->nt; n+=C->q) {
+                    tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
 
-                for ( p=1; p < C->p; p++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempkk, tempnn, C->mb,
-                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
-                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                    INSERT_TASK_zsymm(
+                        options, ChamLeft, uplo,
+                        tempmm, tempnn, A->mb,
+                        alpha, WA( m,        myq + lq ), WA->mb,
+                               WB( myp + lp, n        ), WB->mb,
+                        zbeta, C(  m,        n        ), ldcm );
                 }
             }
             else {
+                if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                     (( uplo == ChamLower ) && ( m < k )) )
+                {
+                    transA = ChamTrans;
+                }
+                else {
+                    transA = ChamNoTrans;
+                }
+
+                for (n = myq; n < C->nt; n+=C->q) {
+                    tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+                    INSERT_TASK_zgemm(
+                        options, transA, ChamNoTrans,
+                        tempmm, tempnn, tempkk, A->mb,
+                        alpha, WA( m,        myq + lq ), WA->mb,
+                               WB( myp + lp, n        ), WB->mb,
+                        zbeta, C(  m,        n        ), ldcm );
+                }
+            }
+        }
+    }
+}
+
+/**
+ *  Parallel tile symmetric matrix-matrix multiplication.
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
+ */
+static inline void
+chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
+                              CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                              CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                              CHAM_desc_t *WA, CHAM_desc_t *WB,
+                              RUNTIME_option_t *options )
+{
+    RUNTIME_sequence_t *sequence = options->sequence;
+    cham_trans_t transA;
+    int m, n, k, p, q, KT, K, lp, lq;
+    int ldcm;
+    int tempmm, tempnn, tempkk;
+    int lookahead, myp, myq;
+
+    CHAMELEON_Complex64_t zbeta;
+    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+
+    lookahead = chamctxt->lookahead;
+    KT  = A->mt;
+    K   = A->m;
+    myp = C->myrank / C->q;
+    myq = C->myrank % C->q;
+
+    for (k = 0; k < KT; k++ ) {
+        lp = (k % lookahead) * C->p;
+        lq = (k % lookahead) * C->q;
+        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
+        zbeta = k == 0 ? beta : zone;
+
+        /* Transfert ownership of the k column of A or B */
+        for (m = 0; m < C->mt; m++ ) {
+            int ldbm;
+
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+            ldbm = BLKLDD( B, m );
+
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempmm, tempkk, C->mb,
+                B(  m,  k ),              ldbm,
+                WA( m, (k % C->q) + lq ), WA->mb );
+
+            RUNTIME_data_flush( sequence, B( m, k ) );
+
+            for ( q=1; q < C->q; q++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempkk, tempnn, C->mb,
-                    B(   k,              n ), ldbk,
-                    WB( (k % C->p) + lp, n ), WB.mb );
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    WA( m, ((k+q-1) % C->q) + lq ), WA->mb,
+                    WA( m, ((k+q)   % C->q) + lq ), WA->mb );
+            }
+        }
 
-                RUNTIME_data_flush( sequence, B( k, n ) );
+        /* Transfert ownership of the k row of B, or A */
+        for (n = 0; n < C->nt; n++) {
+            int Ak, An, ldak;
+            int tempak, tempan;
 
-                for ( p=1; p < C->p; p++ ) {
-                    INSERT_TASK_zlacpy(
-                        options,
-                        ChamUpperLower, tempkk, tempnn, C->mb,
-                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
-                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
-                }
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+            if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                 (( uplo == ChamLower ) && ( n > k )) )
+            {
+                Ak = n;
+                An = k;
+                tempak = tempnn;
+                tempan = tempkk;
+            }
+            else
+            {
+                Ak = k;
+                An = n;
+                tempak = tempkk;
+                tempan = tempnn;
+            }
+            ldak = BLKLDD( A, Ak );
+
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpperLower, tempak, tempan, C->mb,
+                A(  Ak,              An ), ldak,
+                WB( (k % C->p) + lp, n  ), WB->mb );
+
+            RUNTIME_data_flush( sequence, A( Ak, An ) );
+
+            for ( p=1; p < C->p; p++ ) {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempak, tempan, C->mb,
+                    WB( ((k+p-1) % C->p) + lp, n ), WB->mb,
+                    WB( ((k+p)   % C->p) + lp, n ), WB->mb );
             }
         }
 
-        /*
-         *  ChamLeft / ChamLower
-         */
-        for (m = myp; m < C->mt; m+=C->p) {
-            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
-            ldcm = BLKLDD(C, m);
+        /* Perform the update of this iteration */
+        for (n = myq; n < C->nt; n+=C->q) {
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
 
-            for (n = myq; n < C->nt; n+=C->q) {
-                tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+            if ( k == n ) {
+                for (m = myp; m < C->mt; m+=C->p) {
+                    tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+                    ldcm = BLKLDD(C, m);
 
-                if (side == ChamLeft) {
-                    transB = ChamNoTrans;
-                    if ( (( uplo == ChamUpper ) && ( m > k )) ||
-                         (( uplo == ChamLower ) && ( m < k )) ) {
-                        transA = ChamTrans;
-                    }
-                    else {
-                        transA = ChamNoTrans;
-                    }
+                    /* A has been stored in WA or WB for the summa ring */
+                    INSERT_TASK_zsymm(
+                        options, ChamRight, uplo,
+                        tempmm, tempnn, A->mb,
+                        alpha, WB( myp + lp, n        ), WB->mb,
+                               WA( m,        myq + lq ), WA->mb,
+                        zbeta, C(  m,        n        ), ldcm );
+                }
+            }
+            else {
+                if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                     (( uplo == ChamLower ) && ( n > k )) )
+                {
+                    transA = ChamTrans;
                 }
                 else {
                     transA = ChamNoTrans;
-                    if ( (( uplo == ChamUpper ) && ( n < k )) ||
-                         (( uplo == ChamLower ) && ( n > k )) ) {
-                        transB = ChamTrans;
-                    }
-                    else {
-                        transB = ChamNoTrans;
-                    }
                 }
 
-                if ( k == m ) {
-                    INSERT_TASK_zsymm(
-                        options, side, uplo,
-                        tempmm, tempnn, A->mb,
-                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
-                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
-                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
-                }
-                else {
+                for (m = myp; m < C->mt; m+=C->p) {
+                    tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+                    ldcm = BLKLDD(C, m);
+
                     INSERT_TASK_zgemm(
-                        options, transA, transB,
+                        options, ChamNoTrans, transA,
                         tempmm, tempnn, tempkk, A->mb,
-                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
-                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
-                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+                        alpha, WA( m,        myq + lq ), WA->mb,
+                               WB( myp + lp, n        ), WB->mb,
+                        zbeta, C(  m,        n        ), ldcm );
                 }
             }
         }
     }
+}
+
+/**
+ *  Parallel tile symmetric matrix-matrix multiplication.
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
+ */
+static inline void
+chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
+{
+    RUNTIME_sequence_t *sequence = options->sequence;
+    CHAM_desc_t WA, WB;
+    int lookahead;
+
+    lookahead = chamctxt->lookahead;
+    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                         NULL, NULL, NULL );
+    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                         NULL, NULL, NULL );
+
+    if (side == ChamLeft) {
+        chameleon_pzsymm_summa_left( chamctxt, uplo, alpha, A, B, beta, C,
+                                     &WA, &WB, options );
+    }
+    else {
+        chameleon_pzsymm_summa_right( chamctxt, uplo, alpha, A, B, beta, C,
+                                      &WA, &WB, options );
+    }
 
     RUNTIME_desc_flush( &WA, sequence );
     RUNTIME_desc_flush( &WB, sequence );
@@ -453,7 +571,8 @@ chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo,
     {
         chameleon_pzsymm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options );
     }
-    else {
+    else
+    {
         chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
     }
 
-- 
GitLab