From 39ca51374a3f68388b0deb211996d9ba58398d1d Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 31 Jan 2018 17:51:32 +0100
Subject: [PATCH] Add migration and switch to TP kernels in unglq algorithms

---
 compute/pzunglq.c       | 27 ++++++++++----
 compute/pzunglq_param.c | 82 +++++++++++++++++++++--------------------
 compute/pzunglqrh.c     | 38 +++++++++++++------
 compute/pzunmlqrh.c     |  4 ++
 4 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index 13a034ce3..de9c1409f 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -34,9 +34,9 @@
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define D(k) D, k, 0
+#define D(k)   D,  k,  0
 #else
-#define D(k) A, k, k
+#define D(k)   D,  k,  k
 #endif
 
 /*******************************************************************************
@@ -69,6 +69,10 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc
         minMT = A->mt;
     }
 
+    if (D == NULL) {
+        D = A;
+    }
+
     /*
      * zunmlq = A->nb * ib
      * ztsmlq = A->nb * ib
@@ -103,14 +107,18 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc
             for (m = 0; m < Q->mt; m++) {
                 tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
                 ldqm = BLKLDD(Q, m);
-                MORSE_TASK_ztsmlq(
+
+                RUNTIME_data_migrate( sequence, Q(m, k),
+                                      Q->get_rankof( Q, m, n ) );
+
+                MORSE_TASK_ztpmlqt(
                     &options,
                     MorseRight, MorseNoTrans,
-                    tempmm, Q->nb, tempmm, tempnn, tempAkm, ib, T->nb,
-                    Q(m, k), ldqm,
-                    Q(m, n), ldqm,
+                    tempmm, tempnn, tempAkm, 0, ib, T->nb,
                     A(k, n), ldak,
-                    T(k, n), T->mb);
+                    T(k, n), T->mb,
+                    Q(m, k), ldqm,
+                    Q(m, n), ldqm);
             }
         }
 #if defined(CHAMELEON_COPY_DIAG)
@@ -130,6 +138,10 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc
         for (m = k; m < Q->mt; m++) {
             tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
             ldqm = BLKLDD(Q, m);
+
+            RUNTIME_data_migrate( sequence, Q(m, k),
+                                  Q->get_rankof( Q, m, k ) );
+
             MORSE_TASK_zunmlq(
                 &options,
                 MorseRight, MorseNoTrans,
@@ -144,5 +156,4 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc
 
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
-    (void)D;
 }
diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c
index adc3512ff..8346cbd39 100644
--- a/compute/pzunglq_param.c
+++ b/compute/pzunglq_param.c
@@ -25,15 +25,10 @@
 #include "control/common.h"
 #include <stdlib.h>
 
-#define A(m,n) A,  (m),  (n)
-#define Q(m,n) Q,  (m),  (n)
-#define TS(m,n) TS,  (m),  (n)
-#define TT(m,n) TT,  (m),  (n)
-#if defined(CHAMELEON_COPY_DIAG)
+#define A(m,n) A, (m), (n)
+#define Q(m,n) Q, (m), (n)
+#define T(m,n) T, (m), (n)
 #define D(m,n) D, (m), (n)
-#else
-#define D(m,n) A, (m), (n)
-#endif
 
 /**
  *  Parallel construction of Q using tile V - dynamic scheduling
@@ -44,11 +39,12 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
+    MORSE_desc_t *T;
     size_t ws_worker = 0;
     size_t ws_host = 0;
 
     int k, m, n, i, p;
-    int K;
+    int K, L;
     int ldak, ldqm;
     int tempkm, tempkmin, temppn, tempnn, tempmm;
     int ib;
@@ -61,6 +57,10 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
 
     ib = MORSE_IB;
 
+    if (D == NULL) {
+        D = A;
+    }
+
     /*
      * zunmqr = A->nb * ib
      * ztsmqr = A->nb * ib
@@ -103,37 +103,36 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
 
             tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb;
 
-            /* TT or TS */
-
+            /* TS or TT */
             if(qrtree->gettype(qrtree, k, n) == 0){
-                for (m = k; m < Q->mt; m++) {
-                    tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
-                    ldqm = BLKLDD(Q, m);
-                    MORSE_TASK_ztsmlq(
-                        &options,
-                        MorseRight, MorseNoTrans,
-                        tempmm, Q->nb, tempmm, tempnn, tempkm, ib, TS->nb,
-                        Q( m, p), ldqm,
-                        Q( m, n), ldqm,
-                        A( k, n), ldak,
-                        TS(k, n), TS->mb);
-                }
+                L = 0;
+                T = TS;
             }
             else {
-                for (m = k; m < Q->mt; m++) {
-                    tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
-                    ldqm = BLKLDD(Q, m);
-                    MORSE_TASK_zttmlq(
-                        &options,
-                        MorseRight, MorseNoTrans,
-                        tempmm, Q->nb, tempmm, tempnn, tempkm, ib, TT->nb,
-                        Q( m, p), ldqm,
-                        Q( m, n), ldqm,
-                        A( k, n), ldak,
-                        TT(k, n), TT->mb);
-                }
+                L = tempnn;
+                T = TT;
+            }
+            for (m = k; m < Q->mt; m++) {
+                tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
+                ldqm = BLKLDD(Q, m);
+
+                RUNTIME_data_migrate( sequence, Q(m, p),
+                                      Q->get_rankof( Q, m, n ) );
+                RUNTIME_data_migrate( sequence, Q(m, n),
+                                      Q->get_rankof( Q, m, n ) );
+
+                MORSE_TASK_ztpmlqt(
+                    &options,
+                    MorseRight, MorseNoTrans,
+                    tempmm, tempnn, tempkm, L, ib, T->nb,
+                    A(k, n), ldak,
+                    T(k, n), T->mb,
+                    Q(m, p), ldqm,
+                    Q(m, n), ldqm);
             }
         }
+
+        T = TS;
         for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) {
             p = qrtree->getm(qrtree, k, i);
 
@@ -157,13 +156,17 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
             for (m = k; m < Q->mt; m++) {
                 tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
                 ldqm = BLKLDD(Q, m);
+
+                RUNTIME_data_migrate( sequence, Q(m, p),
+                                      Q->get_rankof( Q, m, p ) );
+
                 MORSE_TASK_zunmlq(
                     &options,
                     MorseRight, MorseNoTrans,
-                    tempmm, temppn, tempkmin, ib, TS->nb,
-                    D( k, p), ldak,
-                    TS(k, p), TS->mb,
-                    Q( m, p), ldqm);
+                    tempmm, temppn, tempkmin, ib, T->nb,
+                    D(k, p), ldak,
+                    T(k, p), T->mb,
+                    Q(m, p), ldqm);
             }
         }
         RUNTIME_iteration_pop(morse);
@@ -172,5 +175,4 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
     free(tiles);
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
-    (void)D;
 }
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index 72836940f..2ae69ce03 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -102,15 +102,20 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                 for (m = k; m < Q->mt; m++) {
                     tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
                     ldqm   = BLKLDD(Q, m   );
-                    MORSE_TASK_zttmlq(
+
+                    RUNTIME_data_migrate( sequence, Q(m, N),
+                                          Q->get_rankof( Q, m, N+RD ) );
+                    RUNTIME_data_migrate( sequence, Q(m, N+RD),
+                                          Q->get_rankof( Q, m, N+RD ) );
+
+                    MORSE_TASK_ztpmlqt(
                         &options,
                         MorseRight, MorseNoTrans,
-                        tempmm, Q->nb, tempmm, tempNRDn,
-                        tempkm, ib, T->nb,
-                        Q (m, N   ), ldqm,
-                        Q (m, N+RD), ldqm,
+                        tempmm, tempNRDn, tempkm, tempNRDn, ib, T->nb,
                         A (k, N+RD), ldak,
-                        T2(k, N+RD), T->mb);
+                        T2(k, N+RD), T->mb,
+                        Q (m, N   ), ldqm,
+                        Q (m, N+RD), ldqm);
                 }
             }
         }
@@ -123,15 +128,20 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                 for (m = k; m < Q->mt; m++) {
                     tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
                     ldqm = BLKLDD(Q, m);
-                    MORSE_TASK_ztsmlq(
+
+                    RUNTIME_data_migrate( sequence, Q(m, N),
+                                          Q->get_rankof( Q, m, n ) );
+                    RUNTIME_data_migrate( sequence, Q(m, n),
+                                          Q->get_rankof( Q, m, n ) );
+
+                    MORSE_TASK_ztpmlqt(
                         &options,
                         MorseRight, MorseNoTrans,
-                        tempmm, Q->nb, tempmm, tempnn,
-                        tempkm, ib, T->nb,
-                        Q(m, N), ldqm,
-                        Q(m, n), ldqm,
+                        tempmm, tempnn, tempkm, 0, ib, T->nb,
                         A(k, n), ldak,
-                        T(k, n), T->mb);
+                        T(k, n), T->mb,
+                        Q(m, N), ldqm,
+                        Q(m, n), ldqm);
                 }
             }
 #if defined(CHAMELEON_COPY_DIAG)
@@ -151,6 +161,10 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
             for (m = k; m < Q->mt; m++) {
                 tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
                 ldqm = BLKLDD(Q, m);
+
+                RUNTIME_data_migrate( sequence, Q(m, N),
+                                      Q->get_rankof( Q, m, N ) );
+
                 MORSE_TASK_zunmlq(
                     &options,
                     MorseRight, MorseNoTrans,
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index 0b4af0cb8..6dfffc232 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -259,6 +259,10 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
 #endif
                     for (n = 0; n < B->nt; n++) {
                         tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
+
+                        RUNTIME_data_migrate( sequence, B(N, n),
+                                              B->get_rankof( B, N, n ) );
+
                         MORSE_TASK_zunmlq(
                             &options,
                             side, trans,
-- 
GitLab