From 6e31c9b960c999ed6cc10a89e184e22ef0c73f98 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Fri, 6 May 2022 15:49:45 +0200
Subject: [PATCH] qr/lq: create a step subfunction in all qr/lq factorization
 function

---
 compute/pzgelqf.c       | 169 ++++++++++++++-----------
 compute/pzgelqf_param.c | 267 ++++++++++++++++++++++------------------
 compute/pzgeqrf.c       | 180 ++++++++++++++++-----------
 compute/pzgeqrf_param.c |   1 -
 control/compute_z.h     |  13 +-
 5 files changed, 364 insertions(+), 266 deletions(-)

diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 32346a6e8..9a6a4f3da 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -33,7 +33,99 @@
 #define D(k)   D,  k,  k
 
 /**
- *  Parallel tile LQ factorization - dynamic scheduling
+ *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if copies of the geqrt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ */
+int chameleon_pzgelqf_step( int genD, int k, int ib,
+                            CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
+                            RUNTIME_option_t *options, RUNTIME_sequence_t *sequence )
+{
+    int m, n;
+    int tempkm, tempkn, tempmm, tempnn;
+
+    tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+    tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+    INSERT_TASK_zgelqt(
+        options,
+        tempkm, tempkn, ib, T->nb,
+        A(k, k),
+        T(k, k));
+
+    if ( genD ) {
+        int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+        int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
+        INSERT_TASK_zlacpy(
+            options,
+            ChamUpper, tempDkm, tempDkn,
+            A(k, k),
+            D(k) );
+#if defined(CHAMELEON_USE_CUDA)
+        INSERT_TASK_zlaset(
+            options,
+            ChamLower, tempDkm, tempDkn,
+            0., 1.,
+            D(k) );
+#endif
+    }
+
+    for (m = k+1; m < A->mt; m++) {
+        tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+        INSERT_TASK_zunmlq(
+            options,
+            ChamRight, ChamConjTrans,
+            tempmm, tempkn, tempkn, ib, T->nb,
+            D(k),
+            T(k, k),
+            A(m, k));
+    }
+    RUNTIME_data_flush( sequence, D(k)    );
+    RUNTIME_data_flush( sequence, T(k, k) );
+
+    for (n = k+1; n < A->nt; n++) {
+        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+
+        RUNTIME_data_migrate( sequence, A(k, k),
+                              A->get_rankof( A, k, n ) );
+
+        /* TS kernel */
+        INSERT_TASK_ztplqt(
+            options,
+            tempkm, tempnn, 0, ib, T->nb,
+            A(k, k),
+            A(k, n),
+            T(k, n));
+        for (m = k+1; m < A->mt; m++) {
+            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+
+            RUNTIME_data_migrate( sequence, A(m, k),
+                                  A->get_rankof( A, m, n ) );
+
+            INSERT_TASK_ztpmlqt(
+                options,
+                ChamRight, ChamConjTrans,
+                tempmm, tempnn, A->mb, 0, ib, T->nb,
+                A(k, n),
+                T(k, n),
+                A(m, k),
+                A(m, n));
+        }
+        RUNTIME_data_flush( sequence, A(k, n) );
+        RUNTIME_data_flush( sequence, T(k, n) );
+    }
+
+    return 1;
+}
+
+/**
+ *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if copies of the gelqt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ *
  */
 void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
@@ -43,9 +135,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
     size_t ws_worker = 0;
     size_t ws_host = 0;
 
-    int k, m, n;
-    int tempkm, tempkn, tempmm, tempnn;
-    int ib, minMNT;
+    int k, m, ib, minMNT;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -61,7 +151,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
         minMNT = A->mt;
     }
 
-    if ( D == NULL ) {
+    if ( (genD == 0) || (D == NULL) ) {
         D    = A;
         genD = 0;
     }
@@ -92,73 +182,8 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
     for (k = 0; k < minMNT; k++) {
         RUNTIME_iteration_push(chamctxt, k);
 
-        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
-        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
-        INSERT_TASK_zgelqt(
-            &options,
-            tempkm, tempkn, ib, T->nb,
-            A(k, k),
-            T(k, k));
-        if ( genD ) {
-            int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
-            int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
-            INSERT_TASK_zlacpy(
-                &options,
-                ChamUpper, tempDkm, tempDkn,
-                A(k, k),
-                D(k) );
-#if defined(CHAMELEON_USE_CUDA)
-            INSERT_TASK_zlaset(
-                &options,
-                ChamLower, tempDkm, tempDkn,
-                0., 1.,
-                D(k) );
-#endif
-        }
-        for (m = k+1; m < A->mt; m++) {
-            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
-            INSERT_TASK_zunmlq(
-                &options,
-                ChamRight, ChamConjTrans,
-                tempmm, tempkn, tempkn, ib, T->nb,
-                D(k),
-                T(k, k),
-                A(m, k));
-        }
-        RUNTIME_data_flush( sequence, D(k)    );
-        RUNTIME_data_flush( sequence, T(k, k) );
-
-        for (n = k+1; n < A->nt; n++) {
-            tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
-
-            RUNTIME_data_migrate( sequence, A(k, k),
-                                  A->get_rankof( A, k, n ) );
-
-            /* TS kernel */
-            INSERT_TASK_ztplqt(
-                &options,
-                tempkm, tempnn, 0, ib, T->nb,
-                A(k, k),
-                A(k, n),
-                T(k, n));
-            for (m = k+1; m < A->mt; m++) {
-                tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
-
-                RUNTIME_data_migrate( sequence, A(m, k),
-                                      A->get_rankof( A, m, n ) );
-
-                INSERT_TASK_ztpmlqt(
-                    &options,
-                    ChamRight, ChamConjTrans,
-                    tempmm, tempnn, A->mb, 0, ib, T->nb,
-                    A(k, n),
-                    T(k, n),
-                    A(m, k),
-                    A(m, n));
-            }
-            RUNTIME_data_flush( sequence, A(k, n) );
-            RUNTIME_data_flush( sequence, T(k, n) );
-        }
+        chameleon_pzgelqf_step( genD, k, ib,
+                                A, T, D, &options, sequence );
 
         /* Restore the original location of the tiles */
         for (m = k; m < A->mt; m++) {
diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c
index b69b84b50..7bc06b159 100644
--- a/compute/pzgelqf_param.c
+++ b/compute/pzgelqf_param.c
@@ -28,21 +28,162 @@
 
 /**
  *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if copies of the gelqt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ *
+ * @param[in] uplo
+ *         - ChamUpper: Classic LQ factorization of the matrix A.
+ *         - ChamLower: LQ factorization of the TTLQT kernel.
+ *         - ChamUpperLower: LQ factorization of the TSLQT kernel.
+ */
+int chameleon_pzgelqf_param_step( int genD, cham_uplo_t uplo, int k, int ib,
+                                  const libhqr_tree_t *qrtree, int *tiles,
+                                  CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
+                                  RUNTIME_option_t *options, RUNTIME_sequence_t *sequence )
+{
+    CHAM_desc_t *T;
+    int m, n, i, p;
+    int L, nbgelqt;
+    int tempkmin, tempkm, tempnn, tempmm, temppn;
+    int node, nbtiles;
+
+    tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+
+    /* The number of geqrt to apply */
+    nbgelqt = qrtree->getnbgeqrf( qrtree, k );
+
+    T = TS;
+    for (i = 0; i < nbgelqt; i++) {
+        p = qrtree->getm( qrtree, k, i );
+
+        /* We skip the LQ factorization if this is the last diagonal tile */
+        if ( (uplo == ChamLower) && (p == k) ) {
+            continue;
+        }
+
+        temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb;
+        tempkmin = chameleon_min(tempkm, temppn);
+
+        INSERT_TASK_zgelqt(
+            options,
+            tempkm, temppn, ib, T->nb,
+            A(k, p), T(k, p));
+
+        if ( genD ) {
+            int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+            int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
+
+            INSERT_TASK_zlacpy(
+                options,
+                ChamUpper, tempDkm, tempDpn,
+                A(k, p), D(k, p) );
+#if defined(CHAMELEON_USE_CUDA)
+            INSERT_TASK_zlaset(
+                options,
+                ChamLower, tempDkm, tempDpn,
+                0., 1.,
+                D(k, p) );
+#endif
+        }
+
+        for (m = k+1; m < A->mt; m++) {
+            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+            INSERT_TASK_zunmlq(
+                options,
+                ChamRight, ChamConjTrans,
+                tempmm, temppn, tempkmin, ib, T->nb,
+                D(k, p),
+                T(k, p),
+                A(m, p));
+        }
+
+        if ( genD || ((k+1) < A->mt)) {
+            RUNTIME_data_flush( sequence, D(k, p) );
+        }
+        RUNTIME_data_flush( sequence, T(k, p) );
+    }
+
+    /* Setting the order of the tiles */
+    nbtiles = libhqr_walk_stepk( qrtree, k, tiles );
+
+    for (i = 0; i < nbtiles; i++) {
+        n = tiles[i];
+        p = qrtree->currpiv( qrtree, k, n );
+
+        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+
+        if ( qrtree->gettype( qrtree, k, n ) == LIBHQR_KILLED_BY_TS ) {
+            /* TS kernel */
+            T = TS;
+            L = 0;
+
+            /* Force TT kernel if this is the last diagonal tile */
+            if ( (uplo == ChamLower) && (n == k) ) {
+                L = tempnn;
+            }
+        }
+        else {
+            /* TT kernel */
+            T = TT;
+            L = tempnn;
+        }
+
+        node = A->get_rankof( A, k, n );
+        RUNTIME_data_migrate( sequence, A(k, p), node );
+        RUNTIME_data_migrate( sequence, A(k, n), node );
+
+        INSERT_TASK_ztplqt(
+            options,
+            tempkm, tempnn, chameleon_min(L, tempkm), ib, T->nb,
+            A(k, p),
+            A(k, n),
+            T(k, n));
+
+        for (m = k+1; m < A->mt; m++) {
+            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+
+            node = A->get_rankof( A, m, n );
+            RUNTIME_data_migrate( sequence, A(m, p), node );
+            RUNTIME_data_migrate( sequence, A(m, n), node );
+
+            INSERT_TASK_ztpmlqt(
+                options,
+                ChamRight, ChamConjTrans,
+                tempmm, tempnn, tempkm, L, ib, T->nb,
+                A(k, n),
+                T(k, n),
+                A(m, p),
+                A(m, n));
+        }
+        RUNTIME_data_flush( sequence, A(k, n) );
+        RUNTIME_data_flush( sequence, T(k, n) );
+    }
+
+    return tiles[nbtiles];
+}
+
+/**
+ *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if copies of the gelqt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ *
  */
-void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A,
+void chameleon_pzgelqf_param( int genD, int K,
+                              const libhqr_tree_t *qrtree, CHAM_desc_t *A,
                               CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
                               RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_option_t options;
-    CHAM_desc_t *T;
     size_t ws_worker = 0;
     size_t ws_host = 0;
 
-    int k, m, n, i, p;
-    int K, L, nbgeqrt;
-    int tempkmin, tempkm, tempnn, tempmm, temppn;
-    int ib, node, nbtiles, *tiles;
+    int k, m;
+    int ib, *tiles;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -52,7 +193,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
 
     ib = CHAMELEON_IB;
 
-    if ( D == NULL ) {
+    if ( (genD == 0) || (D == NULL) ) {
         D    = A;
         genD = 0;
     }
@@ -82,119 +223,11 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
     /* Initialisation of temporary tiles array */
     tiles = (int*)calloc(qrtree->mt, sizeof(int));
 
-    K = chameleon_min(A->mt, A->nt);
-
     for (k = 0; k < K; k++) {
         RUNTIME_iteration_push(chamctxt, k);
 
-        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
-
-        /* The number of geqrt to apply */
-        nbgeqrt = qrtree->getnbgeqrf(qrtree, k);
-
-        T = TS;
-        for (i = 0; i < nbgeqrt; i++) {
-            p = qrtree->getm(qrtree, k, i);
-
-            /* /\* We skip the LQ factorization if this is the last diagonal tile *\/ */
-            /* if ( (uplo == ChamLower) && (p == k) ) { */
-            /*     continue; */
-            /* } */
-
-            temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb;
-            tempkmin = chameleon_min(tempkm, temppn);
-
-            INSERT_TASK_zgelqt(
-                &options,
-                tempkm, temppn, ib, T->nb,
-                A(k, p), T(k, p));
-
-            if ( genD ) {
-                int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
-                int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
-
-                INSERT_TASK_zlacpy(
-                    &options,
-                    ChamUpper, tempDkm, tempDpn,
-                    A(k, p), D(k, p) );
-#if defined(CHAMELEON_USE_CUDA)
-                INSERT_TASK_zlaset(
-                    &options,
-                    ChamLower, tempDkm, tempDpn,
-                    0., 1.,
-                    D(k, p) );
-#endif
-            }
-
-            for (m = k+1; m < A->mt; m++) {
-                tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
-                INSERT_TASK_zunmlq(
-                    &options,
-                    ChamRight, ChamConjTrans,
-                    tempmm, temppn, tempkmin, ib, T->nb,
-                    D(k, p),
-                    T(k, p),
-                    A(m, p));
-            }
-            RUNTIME_data_flush( sequence, D(k, p) );
-            RUNTIME_data_flush( sequence, T(k, p) );
-        }
-
-        /* Setting the order of the tiles */
-        nbtiles = libhqr_walk_stepk( qrtree, k, tiles );
-
-        for (i = 0; i < nbtiles; i++) {
-            n = tiles[i];
-            p = qrtree->currpiv( qrtree, k, n );
-
-            tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
-
-            if ( qrtree->gettype( qrtree, k, n ) == LIBHQR_KILLED_BY_TS ) {
-                /* TS kernel */
-                T = TS;
-                L = 0;
-
-                /* /\* Force TT kernel if this is the last diagonal tile *\/ */
-                /* if ( (uplo == ChamLower) && (n == k) ) { */
-                /*     L = tempnn; */
-                /* } */
-            }
-            else {
-                /* TT kernel */
-                T = TT;
-                L = tempnn;
-            }
-
-            node = A->get_rankof( A, k, n );
-            RUNTIME_data_migrate( sequence, A(k, p), node );
-            RUNTIME_data_migrate( sequence, A(k, n), node );
-
-            INSERT_TASK_ztplqt(
-                &options,
-                tempkm, tempnn, chameleon_min(L, tempkm), ib, T->nb,
-                A(k, p),
-                A(k, n),
-                T(k, n));
-
-            for (m = k+1; m < A->mt; m++) {
-                tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
-
-                node = A->get_rankof( A, m, n );
-                RUNTIME_data_migrate( sequence, A(m, p), node );
-                RUNTIME_data_migrate( sequence, A(m, n), node );
-
-                INSERT_TASK_ztpmlqt(
-                    &options,
-                    ChamRight, ChamConjTrans,
-                    tempmm, tempnn, tempkm, L, ib, T->nb,
-                    A(k, n),
-                    T(k, n),
-                    A(m, p),
-                    A(m, n));
-            }
-            RUNTIME_data_flush( sequence, A(k, n) );
-            RUNTIME_data_flush( sequence, T(k, n) );
-        }
+        chameleon_pzgelqf_param_step( genD, ChamUpper, k, ib, qrtree, tiles,
+                                      A, TS, TT, D, &options, sequence );
 
         /* Restore the original location of the tiles */
         for (m = k; m < A->mt; m++) {
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index a7637392d..49c270cf0 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -32,7 +32,101 @@
 #define D(k)   D,  k,  k
 
 /**
- *  Parallel tile QR factorization - dynamic scheduling
+ *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if copies of the geqrt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ */
+int chameleon_pzgeqrf_step( int genD, int k, int ib,
+                            CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
+                            RUNTIME_option_t *options, RUNTIME_sequence_t *sequence )
+{
+    int m, n;
+    int tempkm, tempkn, tempnn, tempmm;
+
+    tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+    tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+
+    INSERT_TASK_zgeqrt(
+        options,
+        tempkm, tempkn, ib, T->nb,
+        A(k, k),
+        T(k, k));
+
+    if ( genD ) {
+        int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+        int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
+        INSERT_TASK_zlacpy(
+            options,
+            ChamLower, tempDkm, tempDkn,
+            A(k, k),
+            D(k) );
+#if defined(CHAMELEON_USE_CUDA)
+        INSERT_TASK_zlaset(
+            options,
+            ChamUpper, tempDkm, tempDkn,
+            0., 1.,
+            D(k) );
+#endif
+    }
+    for (n = k+1; n < A->nt; n++) {
+        tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+        INSERT_TASK_zunmqr(
+            options,
+            ChamLeft, ChamConjTrans,
+            tempkm, tempnn, tempkm, ib, T->nb,
+            D(k),
+            T(k, k),
+            A(k, n));
+    }
+    RUNTIME_data_flush( sequence, D(k)    );
+    RUNTIME_data_flush( sequence, T(k, k) );
+
+    for (m = k+1; m < A->mt; m++) {
+        tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+
+        RUNTIME_data_migrate( sequence, A(k, k),
+                              A->get_rankof( A, m, k ) );
+
+        /* TS kernel */
+        INSERT_TASK_ztpqrt(
+            options,
+            tempmm, tempkn, 0, ib, T->nb,
+            A(k, k),
+            A(m, k),
+            T(m, k));
+
+        for (n = k+1; n < A->nt; n++) {
+            tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+
+            RUNTIME_data_migrate( sequence, A(k, n),
+                                  A->get_rankof( A, m, n ) );
+
+            /* TS kernel */
+            INSERT_TASK_ztpmqrt(
+                options,
+                ChamLeft, ChamConjTrans,
+                tempmm, tempnn, A->nb, 0, ib, T->nb,
+                A(m, k),
+                T(m, k),
+                A(k, n),
+                A(m, n));
+        }
+        RUNTIME_data_flush( sequence, A(m, k) );
+        RUNTIME_data_flush( sequence, T(m, k) );
+    }
+
+    return 1;
+}
+
+/**
+ *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
+ *
+ * @param[in] genD
+ *         Indicate if copies of the geqrt tiles must be done to speedup
+ *         computations in updates. genD is considered only if D is not NULL.
+ *
  */
 void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
@@ -42,10 +136,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
     size_t ws_worker = 0;
     size_t ws_host = 0;
 
-    int k, m, n;
-    int tempkm, tempkn, tempnn, tempmm;
-    int ib;
-    int minMNT = chameleon_min(A->mt, A->nt);
+    int k, n, ib, minMNT;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -55,7 +146,13 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
 
     ib = CHAMELEON_IB;
 
-    if ( D == NULL ) {
+    if (A->m > A->n) {
+        minMNT = A->nt;
+    } else {
+        minMNT = A->mt;
+    }
+
+    if ( (genD == 0) || (D == NULL) ) {
         D    = A;
         genD = 0;
     }
@@ -86,75 +183,8 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
     for (k = 0; k < minMNT; k++) {
         RUNTIME_iteration_push(chamctxt, k);
 
-        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
-        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
-        INSERT_TASK_zgeqrt(
-            &options,
-            tempkm, tempkn, ib, T->nb,
-            A(k, k),
-            T(k, k));
-        if ( genD ) {
-            int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
-            int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
-            INSERT_TASK_zlacpy(
-                &options,
-                ChamLower, tempDkm, tempDkn,
-                A(k, k),
-                D(k) );
-#if defined(CHAMELEON_USE_CUDA)
-            INSERT_TASK_zlaset(
-                &options,
-                ChamUpper, tempDkm, tempDkn,
-                0., 1.,
-                D(k) );
-#endif
-        }
-        for (n = k+1; n < A->nt; n++) {
-            tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
-            INSERT_TASK_zunmqr(
-                &options,
-                ChamLeft, ChamConjTrans,
-                tempkm, tempnn, tempkm, ib, T->nb,
-                D(k),
-                T(k, k),
-                A(k, n));
-        }
-        RUNTIME_data_flush( sequence, D(k)    );
-        RUNTIME_data_flush( sequence, T(k, k) );
-
-        for (m = k+1; m < A->mt; m++) {
-            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
-
-            RUNTIME_data_migrate( sequence, A(k, k),
-                                  A->get_rankof( A, m, k ) );
-
-            /* TS kernel */
-            INSERT_TASK_ztpqrt(
-                &options,
-                tempmm, tempkn, 0, ib, T->nb,
-                A(k, k),
-                A(m, k),
-                T(m, k));
-
-            for (n = k+1; n < A->nt; n++) {
-                tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
-
-                RUNTIME_data_migrate( sequence, A(k, n),
-                                      A->get_rankof( A, m, n ) );
-
-                /* TS kernel */
-                INSERT_TASK_ztpmqrt(
-                    &options,
-                    ChamLeft, ChamConjTrans,
-                    tempmm, tempnn, A->nb, 0, ib, T->nb,
-                    A(m, k),
-                    T(m, k),
-                    A(k, n),
-                    A(m, n));
-            }
-            RUNTIME_data_flush( sequence, A(m, k) );
-            RUNTIME_data_flush( sequence, T(m, k) );
-        }
+        chameleon_pzgeqrf_step( genD, k, ib,
+                                A, T, D, &options, sequence );
 
         /* Restore the original location of the tiles */
         for (n = k; n < A->nt; n++) {
diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c
index a4e08e061..d2bd084d2 100644
--- a/compute/pzgeqrf_param.c
+++ b/compute/pzgeqrf_param.c
@@ -164,7 +164,6 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib,
     return tiles[nbtiles];
 }
 
-
 /**
  *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
  *
diff --git a/control/compute_z.h b/control/compute_z.h
index d4acf6976..295c64264 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -133,6 +133,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, CHAM_des
 void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzbuild( cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 
+int chameleon_pzgelqf_step( int genD, int k, int ib,
+                            CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
+                            RUNTIME_option_t *options, RUNTIME_sequence_t *sequence );
+int chameleon_pzgeqrf_step( int genD, int k, int ib,
+                            CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D,
+                            RUNTIME_option_t *options, RUNTIME_sequence_t *sequence );
+int  chameleon_pzgelqf_param_step( int genD, cham_uplo_t uplo, int k, int ib,
+                                   const libhqr_tree_t *qrtree, int *tiles,
+                                   CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
+                                   RUNTIME_option_t *options, RUNTIME_sequence_t *sequence );
 int  chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib,
                                    const libhqr_tree_t *qrtree, int *tiles,
                                    CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
@@ -142,7 +152,8 @@ void chameleon_pzungqr_param_step( int genD, cham_uplo_t uplo, int k, int ib,
                                    CHAM_desc_t *A, CHAM_desc_t *Q,
                                    CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
                                    RUNTIME_option_t *options, RUNTIME_sequence_t *sequence );
-void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
+void chameleon_pzgelqf_param( int genD, int K, const libhqr_tree_t *qrtree,
+                              CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
                               RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzgeqrf_param( int genD, int K, const libhqr_tree_t *qrtree,
                               CHAM_desc_t *A, CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
-- 
GitLab