From d9f3abee5eec7b3257d6bd6ba80e71247401562f Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 7 Jan 2020 13:45:05 +0100
Subject: [PATCH] Fix load balancing of the D matrix in QR/LQ algorithms

---
 compute/zgelqf.c        |  4 ++--
 compute/zgelqf_param.c  |  4 ++--
 compute/zgelqs.c        |  4 ++--
 compute/zgelqs_param.c  |  4 ++--
 compute/zgels.c         |  8 ++++----
 compute/zgels_param.c   |  8 ++++----
 compute/zgeqrf.c        |  4 ++--
 compute/zgeqrf_param.c  |  4 ++--
 compute/zgeqrs.c        |  4 ++--
 compute/zgeqrs_param.c  |  4 ++--
 compute/zgesv_incpiv.c  |  4 ++--
 compute/zgetrf_incpiv.c |  4 ++--
 compute/ztpgqrt.c       |  2 +-
 compute/zunglq.c        |  4 ++--
 compute/zunglq_param.c  |  4 ++--
 compute/zungqr.c        |  4 ++--
 compute/zungqr_param.c  |  4 ++--
 compute/zunmlq.c        |  4 ++--
 compute/zunmlq_param.c  |  4 ++--
 compute/zunmqr.c        |  4 ++--
 compute/zunmqr_param.c  |  4 ++--
 control/compute_z.h     | 23 +++++++++++++++++++++++
 22 files changed, 68 insertions(+), 45 deletions(-)

diff --git a/compute/zgelqf.c b/compute/zgelqf.c
index 0b2ee5b69..4d6633b0a 100644
--- a/compute/zgelqf.c
+++ b/compute/zgelqf.c
@@ -278,8 +278,8 @@ int CHAMELEON_zgelqf_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T,
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgelqf_param.c b/compute/zgelqf_param.c
index 6d8c125a5..742617826 100644
--- a/compute/zgelqf_param.c
+++ b/compute/zgelqf_param.c
@@ -282,8 +282,8 @@ int CHAMELEON_zgelqf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgelqs.c b/compute/zgelqs.c
index 703650460..483d3ef27 100644
--- a/compute/zgelqs.c
+++ b/compute/zgelqs.c
@@ -318,8 +318,8 @@ int CHAMELEON_zgelqs_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B,
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgelqs_param.c b/compute/zgelqs_param.c
index eedb2dacd..26839a939 100644
--- a/compute/zgelqs_param.c
+++ b/compute/zgelqs_param.c
@@ -330,8 +330,8 @@ int CHAMELEON_zgelqs_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgels.c b/compute/zgels.c
index b132610a8..c18c4c046 100644
--- a/compute/zgels.c
+++ b/compute/zgels.c
@@ -364,8 +364,8 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A,
     if (A->m >= A->n) {
 #if defined(CHAMELEON_COPY_DIAG)
         {
-            int n = chameleon_min(A->m, A->n);
-            chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+            int n = chameleon_min( A->m, A->n );
+            chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
             Dptr = &D;
         }
 #endif
@@ -432,8 +432,8 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A,
     else {
 #if defined(CHAMELEON_COPY_DIAG)
         {
-            int m = chameleon_min(A->m, A->n);
-            chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+            int m = chameleon_min( A->m, A->n );
+            chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
             Dptr = &D;
         }
 #endif
diff --git a/compute/zgels_param.c b/compute/zgels_param.c
index 01999735f..26d777cc6 100644
--- a/compute/zgels_param.c
+++ b/compute/zgels_param.c
@@ -377,8 +377,8 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t
     if (A->m >= A->n) {
 #if defined(CHAMELEON_COPY_DIAG)
         {
-            int n = chameleon_min(A->m, A->n);
-            chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+            int n = chameleon_min( A->m, A->n );
+            chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
             Dptr = &D;
         }
 #endif
@@ -431,8 +431,8 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t
     else {
 #if defined(CHAMELEON_COPY_DIAG)
         {
-            int m = chameleon_min(A->m, A->n);
-            chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+            int m = chameleon_min( A->m, A->n );
+            chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
             Dptr = &D;
         }
 #endif
diff --git a/compute/zgeqrf.c b/compute/zgeqrf.c
index 98675a467..f9557eb99 100644
--- a/compute/zgeqrf.c
+++ b/compute/zgeqrf.c
@@ -277,8 +277,8 @@ int CHAMELEON_zgeqrf_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T,
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgeqrf_param.c b/compute/zgeqrf_param.c
index c406b9207..75e25f895 100644
--- a/compute/zgeqrf_param.c
+++ b/compute/zgeqrf_param.c
@@ -302,8 +302,8 @@ int CHAMELEON_zgeqrf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgeqrs.c b/compute/zgeqrs.c
index a6b41d0b1..0857f9e08 100644
--- a/compute/zgeqrs.c
+++ b/compute/zgeqrs.c
@@ -307,8 +307,8 @@ int CHAMELEON_zgeqrs_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B,
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgeqrs_param.c b/compute/zgeqrs_param.c
index 84aa1af68..eb914016d 100644
--- a/compute/zgeqrs_param.c
+++ b/compute/zgeqrs_param.c
@@ -311,8 +311,8 @@ int CHAMELEON_zgeqrs_param_Tile_Async( const libhqr_tree_t *qrtree,
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgesv_incpiv.c b/compute/zgesv_incpiv.c
index 89cc15107..3842b95c8 100644
--- a/compute/zgesv_incpiv.c
+++ b/compute/zgesv_incpiv.c
@@ -311,8 +311,8 @@ int CHAMELEON_zgesv_incpiv_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->mt, A->nt) * A->nb;
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zgetrf_incpiv.c b/compute/zgetrf_incpiv.c
index f1f2e795a..6cc7614fd 100644
--- a/compute/zgetrf_incpiv.c
+++ b/compute/zgetrf_incpiv.c
@@ -283,8 +283,8 @@ int CHAMELEON_zgetrf_incpiv_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *L, int *IPI
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->mt, A->nt) * A->nb;
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->mt, A->nt ) * A->nb; /* tile count times tile width */
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/ztpgqrt.c b/compute/ztpgqrt.c
index 98db159af..d826d8111 100644
--- a/compute/ztpgqrt.c
+++ b/compute/ztpgqrt.c
@@ -404,7 +404,7 @@ int CHAMELEON_ztpgqrt_Tile_Async( int L,
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        chameleon_zdesc_alloc(D, V1->mb, V1->nb, V1->m, KT*V1->nb, 0, 0, V1->m, KT*V1->nb, );
+        chameleon_zdesc_copy_and_restrict( V1, &D, V1->m, KT*V1->nb );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zunglq.c b/compute/zunglq.c
index 3d3ce3062..085fcd29b 100644
--- a/compute/zunglq.c
+++ b/compute/zunglq.c
@@ -300,8 +300,8 @@ int CHAMELEON_zunglq_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *Q,
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zunglq_param.c b/compute/zunglq_param.c
index aec224f96..4a7a908cb 100644
--- a/compute/zunglq_param.c
+++ b/compute/zunglq_param.c
@@ -303,8 +303,8 @@ int CHAMELEON_zunglq_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zungqr.c b/compute/zungqr.c
index 69f8be30e..0f4558e7a 100644
--- a/compute/zungqr.c
+++ b/compute/zungqr.c
@@ -297,8 +297,8 @@ int CHAMELEON_zungqr_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *Q,
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zungqr_param.c b/compute/zungqr_param.c
index 36bacc089..8c0eb0a68 100644
--- a/compute/zungqr_param.c
+++ b/compute/zungqr_param.c
@@ -310,8 +310,8 @@ int CHAMELEON_zungqr_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zunmlq.c b/compute/zunmlq.c
index 07c635a67..de73e20a3 100644
--- a/compute/zunmlq.c
+++ b/compute/zunmlq.c
@@ -359,8 +359,8 @@ int CHAMELEON_zunmlq_Tile_Async( cham_side_t side, cham_trans_t trans,
      */
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zunmlq_param.c b/compute/zunmlq_param.c
index df0fba8ba..0c85cc62d 100644
--- a/compute/zunmlq_param.c
+++ b/compute/zunmlq_param.c
@@ -365,8 +365,8 @@ int CHAMELEON_zunmlq_param_Tile_Async( const libhqr_tree_t *qrtree, cham_side_t
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int m = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        int m = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, m, A->n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zunmqr.c b/compute/zunmqr.c
index 09387b79b..979823987 100644
--- a/compute/zunmqr.c
+++ b/compute/zunmqr.c
@@ -362,8 +362,8 @@ int CHAMELEON_zunmqr_Tile_Async( cham_side_t side, cham_trans_t trans,
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/compute/zunmqr_param.c b/compute/zunmqr_param.c
index e7b31e742..810c2ea2f 100644
--- a/compute/zunmqr_param.c
+++ b/compute/zunmqr_param.c
@@ -372,8 +372,8 @@ int CHAMELEON_zunmqr_param_Tile_Async( const libhqr_tree_t *qrtree,
 
 #if defined(CHAMELEON_COPY_DIAG)
     {
-        int n = chameleon_min(A->m, A->n);
-        chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        int n = chameleon_min( A->m, A->n );
+        chameleon_zdesc_copy_and_restrict( A, &D, A->m, n );
         Dptr = &D;
     }
 #endif
diff --git a/control/compute_z.h b/control/compute_z.h
index 196fde204..ad911e8f6 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -154,6 +154,29 @@ chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int
         }                                                               \
     }
 
+/**
+ * @brief Initialize a descriptor that mirrors another one on a smaller extent.
+ *
+ * The tile sizes (mb, nb), the p and q values, and the get_blkaddr,
+ * get_blkldd and get_rankof callbacks are all taken from descIn.
+ * @param[in]  descIn  Descriptor whose settings are reused.
+ * @param[out] descOut Descriptor to initialize, in CHAMELEON_MAT_ALLOC_TILE mode.
+ * @param[in]  m       Number of rows given to the output descriptor.
+ * @param[in]  n       Number of columns given to the output descriptor.
+ * @return The value returned by chameleon_desc_init().
+ */
+static inline int
+chameleon_zdesc_copy_and_restrict( CHAM_desc_t *descIn,
+                                   CHAM_desc_t *descOut,
+                                   int m, int n )
+{
+    /* Same settings as descIn; only the sizes differ: m-by-n, offsets (0, 0). */
+    return chameleon_desc_init( descOut, CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble,
+                                descIn->mb, descIn->nb, descIn->mb * descIn->nb,
+                                m, n, 0, 0, m, n, descIn->p, descIn->q,
+                                descIn->get_blkaddr, descIn->get_blkldd, descIn->get_rankof );
+}
+
 /**
  * @brief Internal function to convert the lapack format to tile format in
  * LAPACK interface calls
-- 
GitLab