pzhetrd_he2hb.c 14.2 KB
Newer Older
1
/**
2 3
 *
 * @file pzhetrd_he2hb.c
4
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
5 6
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
Mathieu Faverge's avatar
Mathieu Faverge committed
7
 * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8
 *                      Univ. Bordeaux. All rights reserved.
9
 *
10
 ***
11
 *
12
 * @brief Chameleon zhetrd_he2hb parallel algorithm
13
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
14
 * @version 1.0.0
15 16
 * @author Hatem Ltaief
 * @author Azzam Haidar
17
 * @date 2018-11-09
18 19
 * @precisions normal z -> s d c
 *
20
 */
21
#include "control/common.h"
Mathieu Faverge's avatar
Mathieu Faverge committed
22
#include <stdlib.h>
23 24 25

#define A(m, n) A,  m,  n
#define T(m, n) T,  m,  n
26
#define D(k) &D, (k)-1, 0
27

28
#define AT(k) &AT, k, 0
29 30 31 32 33 34 35

#if defined(CHAMELEON_COPY_DIAG)
#define E(m, n) E, m, 0
#else
#define E(m, n) A, m, n
#endif

36
/**
37
 *  Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
38
 */
Mathieu Faverge's avatar
Mathieu Faverge committed
39
void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
40 41
                         CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *E,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
42
{
Mathieu Faverge's avatar
Mathieu Faverge committed
43
    CHAM_context_t *chamctxt;
44
    RUNTIME_option_t options;
45 46
    CHAM_desc_t D;
    CHAM_desc_t AT;
47 48 49 50 51 52 53 54
    size_t ws_worker = 0;
    size_t ws_host = 0;

    int k, m, n, i, j;
    int ldak, ldak1, ldam, ldan, ldaj, ldai;
    int tempkm, tempkn, tempmm, tempnn, tempjj;
    int ib;

Mathieu Faverge's avatar
Mathieu Faverge committed
55
    chamctxt = chameleon_context_self();
Mathieu Faverge's avatar
Mathieu Faverge committed
56
    if (sequence->status != CHAMELEON_SUCCESS) {
57
        return;
Mathieu Faverge's avatar
Mathieu Faverge committed
58
    }
59

Mathieu Faverge's avatar
Mathieu Faverge committed
60
    RUNTIME_options_init(&options, chamctxt, sequence, request);
61
    ib = CHAMELEON_IB;
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76

    /*
     * zgeqrt        = A->nb * (ib+1)
     * zunmqr        = A->nb * ib
     * ztsqrt        = A->nb * (ib+1)
     * ztsmqr        = A->nb * ib
     * zherfb        = A->nb * ib
     * ztsmqr_hetra1 = A->nb * ib
     */
    ws_worker = A->nb * (ib+1);

    /* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
    /* Worker space
     *
77
     * zunmqr =     A->nb * ib
78
     * ztsmqr = 2 * A->nb * ib
79
     * zherfb =     A->nb * ib
80
     */
81
    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
82 83
#endif

84 85
    ws_worker *= sizeof(CHAMELEON_Complex64_t);
    ws_host   *= sizeof(CHAMELEON_Complex64_t);
86 87 88 89

    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );

    /* Copy of the diagonal tiles to keep the general version of the tile all along the computation */
90
    chameleon_zdesc_alloc_diag( &D, A->mb, A->m, A->n, A->p, A->q );
91

92 93 94 95
    chameleon_desc_init( &AT, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, A->mb, A->nb, (A->mb*A->nb),
                         chameleon_min(A->mt, A->nt) * A->mb, A->nb, 0, 0,
                         chameleon_min(A->mt, A->nt) * A->mb, A->nb, 1, 1,
                         NULL, NULL, NULL );
96 97 98 99 100 101

    /* Let's extract the diagonal in a temporary copy that contains A and A' */
    for (k = 1; k < A->nt; k++){
        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
        ldak = BLKLDD(A, k);

102
        INSERT_TASK_zhe2ge(&options,
103 104 105 106 107 108
                          uplo,
                          tempkn, tempkn, ldak,
                          A(k, k), ldak,
                          D(k),    ldak);
    }

109
    if (uplo == ChamLower) {
110
       for (k = 0; k < A->nt-1; k++){
Mathieu Faverge's avatar
Mathieu Faverge committed
111
           RUNTIME_iteration_push(chamctxt, k);
112

113 114 115 116
           tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb;
           tempkn = k   == A->nt-1 ? A->n- k   *A->nb : A->nb;
           ldak1 = BLKLDD(A, k+1);

117
           INSERT_TASK_zgeqrt(
118 119 120 121 122 123
               &options,
               tempkm, tempkn, ib, A->nb,
               A(k+1, k), ldak1,
               T(k+1, k), T->mb);

#if defined(CHAMELEON_COPY_DIAG)
124
           INSERT_TASK_zlacpy(
125
               &options,
126
               ChamLower, tempkm, tempkn, A->nb,
127 128 129
               A(k+1, k), ldak1,
               E(k+1, k), ldak1 );
#if defined(CHAMELEON_USE_CUDA)
130
           INSERT_TASK_zlaset(
131
               &options,
132
               ChamUpper, tempkm, tempkn,
133 134 135 136 137 138
               0., 1.,
               E(k+1, k), ldak1 );
#endif
#endif

           /* LEFT and RIGHT on the symmetric diagonal block */
139
           INSERT_TASK_zherfb(
140
               &options,
141
               ChamLower,
142 143 144 145 146 147 148 149 150
               tempkm, tempkm, ib, A->nb,
               E(k+1, k), ldak1,
               T(k+1, k), T->mb,
               D(k+1),    ldak1);

           /* RIGHT on the remaining tiles until the bottom */
           for (m = k+2; m < A->mt ; m++) {
               tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
               ldam = BLKLDD(A, m);
151
               INSERT_TASK_zunmqr(
152
                   &options,
153
                   ChamRight, ChamNoTrans,
154 155 156 157 158 159 160 161 162 163 164
                   tempmm, A->nb, tempkm, ib, A->nb,
                   E(k+1, k),   ldak1,
                   T(k+1, k),   T->mb,
                   A(m,   k+1), ldam);
           }

           for (m = k+2; m < A->mt; m++) {
               tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
               ldam = BLKLDD(A, m);

               options.priority = 1;
165
               INSERT_TASK_ztsqrt(
166 167 168 169 170 171 172 173 174 175
                   &options,
                   tempmm, A->nb, ib, A->nb,
                   A(k+1, k), ldak1,
                   A(m  , k), ldam,
                   T(m  , k), T->mb);
               options.priority = 0;

               /* LEFT */
               for (i = k+2; i < m; i++) {
                   ldai = BLKLDD(A, i);
176
                   INSERT_TASK_ztsmqr_hetra1(
177
                       &options,
178
                       ChamLeft, ChamConjTrans,
179 180 181 182 183 184 185 186 187 188 189
                       A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
                       A(i, k+1), ldai,
                       A(m,   i), ldam,
                       A(m,   k), ldam,
                       T(m,   k), T->mb);
               }

               /* RIGHT */
               for (j = m+1; j < A->mt ; j++) {
                   tempjj = j == A->mt-1 ? A->m-j*A->mb : A->mb;
                   ldaj = BLKLDD(A, j);
190
                   INSERT_TASK_ztsmqr(
191
                       &options,
192
                       ChamRight, ChamNoTrans,
193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
                       tempjj, A->nb, tempjj, tempmm, A->nb, ib, A->nb,
                       A(j, k+1), ldaj,
                       A(j,   m), ldaj,
                       A(m,   k), ldam,
                       T(m,   k), T->mb);
               }

               /* LEFT->RIGHT */
               options.priority = 1;
               /**
                * Compute the update of the following:
                *
                *     A1   A2'
                *     A2   A3
                *
                * with A1 and A3 two diagonal tiles. This is the tsmqr_corner
                * from plasma split in 4 tasks
                */
               /*  Copy the transpose of A2 (m, k+1): AT(k) <- A2' = A2(k+1, m) */
212
               INSERT_TASK_zlatro(
213
                   &options,
214
                   ChamUpperLower, ChamConjTrans,
215 216 217 218 219 220
                   tempmm, A->nb, A->nb,
                   A(m, k+1), ldam,
                   AT(m),  ldak1);

               /*  Left application on |A1| */
               /*                      |A2| */
221
               INSERT_TASK_ztsmqr(
222
                   &options,
223
                   ChamLeft, ChamConjTrans,
224 225 226 227 228 229 230 231
                   A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
                   D(k+1), ldak1,
                   A(m, k+1), ldam,
                   A(m,   k), ldam,
                   T(m,   k), T->mb);

               /*  Left application on | A2'| */
               /*                      | A3 | */
232
               INSERT_TASK_ztsmqr(
233
                   &options,
234
                   ChamLeft, ChamConjTrans,
235 236 237 238 239 240 241
                   A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb,
                   AT(m), ldak1,
                   D(m) , ldam,
                   A(m,  k), ldam,
                   T(m,  k), T->mb);

               /*  Right application on | A1 A2' | */
242
               INSERT_TASK_ztsmqr(
243
                   &options,
244
                   ChamRight, ChamNoTrans,
245 246 247 248 249 250 251
                   A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb,
                   D(k+1), ldak1,
                   AT(m) , ldak1,
                   A(m,   k), ldam,
                   T(m,   k), T->mb);

               /*  Right application on | A2 A3 | */
252
               INSERT_TASK_ztsmqr(
253
                   &options,
254
                   ChamRight, ChamNoTrans,
255 256 257 258 259 260 261
                   tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb,
                   A(m, k+1), ldam,
                   D(m)  , ldam,
                   A(m,   k), ldam,
                   T(m,   k), T->mb);
               options.priority = 0;
           }
262

Mathieu Faverge's avatar
Mathieu Faverge committed
263
           RUNTIME_iteration_pop(chamctxt);
264 265 266 267
       }
    }
    else {
       for (k = 0; k < A->nt-1; k++){
Mathieu Faverge's avatar
Mathieu Faverge committed
268
           RUNTIME_iteration_push(chamctxt, k);
269

270 271 272 273
           tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb;
           tempkm = k   == A->mt-1 ? A->m- k   *A->mb : A->mb;
           ldak  = BLKLDD(A, k);
           ldak1 = BLKLDD(A, k+1);
274
           INSERT_TASK_zgelqt(
275 276 277 278 279 280
               &options,
               tempkm, tempkn, ib, A->nb,
               A(k, k+1), ldak,
               T(k, k+1), T->mb);

#if defined(CHAMELEON_COPY_DIAG)
281
           INSERT_TASK_zlacpy(
282
               &options,
283
               ChamUpper, tempkm, tempkn, A->nb,
284 285 286
               A(k, k+1), ldak,
               E(k, k+1), ldak );
#if defined(CHAMELEON_USE_CUDA)
287
           INSERT_TASK_zlaset(
288
               &options,
289
               ChamLower, tempkm, tempkn,
290 291 292 293 294 295
               0., 1.,
               E(k, k+1), ldak );
#endif
#endif

           /* RIGHT and LEFT on the symmetric diagonal block */
296
           INSERT_TASK_zherfb(
297
               &options,
298
               ChamUpper,
299 300 301 302 303 304 305 306
               tempkn, tempkn, ib, A->nb,
               E(k, k+1), ldak,
               T(k, k+1), T->mb,
               D(k+1),    ldak1);

           /* LEFT on the remaining tiles until the left side */
           for (n = k+2; n < A->nt ; n++) {
               tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
307
               INSERT_TASK_zunmlq(
308
                   &options,
309
                   ChamLeft, ChamNoTrans,
310 311 312 313 314 315 316 317 318 319
                   A->mb, tempnn, tempkn, ib, A->nb,
                   E(k,   k+1), ldak,
                   T(k,   k+1), T->mb,
                   A(k+1, n  ), ldak1);
           }

           for (n = k+2; n < A->nt; n++) {
               tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
               ldan = BLKLDD(A, n);
               options.priority = 1;
320
               INSERT_TASK_ztslqt(
321 322 323 324 325 326 327 328 329 330
                   &options,
                   A->mb, tempnn, ib, A->nb,
                   A(k, k+1), ldak,
                   A(k, n  ), ldak,
                   T(k, n  ), T->mb);
               options.priority = 0;

               /* RIGHT */
               for (i = k+2; i < n; i++) {
                   ldai = BLKLDD(A, i);
331
                   INSERT_TASK_ztsmlq_hetra1(
332
                       &options,
333
                       ChamRight, ChamConjTrans,
334 335 336 337 338 339 340 341 342 343
                       A->mb, A->nb, A->nb, tempnn, A->nb, ib, A->nb,
                       A(k+1, i), ldak1,
                       A(i,   n), ldai,
                       A(k,   n), ldak,
                       T(k,   n), T->mb);
               }

               /* LEFT */
               for (j = n+1; j < A->nt ; j++) {
                   tempjj = j == A->nt-1 ? A->n-j*A->nb : A->nb;
344
                   INSERT_TASK_ztsmlq(
345
                       &options,
346
                       ChamLeft, ChamNoTrans,
347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365
                       A->nb, tempjj, tempnn, tempjj, A->nb, ib, A->nb,
                       A(k+1, j), ldak1,
                       A(n,   j), ldan,
                       A(k,   n), ldak,
                       T(k,   n), T->mb);
               }

               /* RIGHT->LEFT */
               options.priority = 1;
               /**
                * Compute the update of the following:
                *
                *     A1   A2
                *     A2'  A3
                *
                * with A1 and A3 two diagonal tiles. This is the tsmqr_corner
                * from plasma split in 4 tasks
                */
               /*  Copy the transpose of A2: AT(k) <- A2' */
366
               INSERT_TASK_zlatro(
367
                   &options,
368
                   ChamUpperLower, ChamConjTrans,
369 370 371 372 373
                   A->mb, tempnn, A->nb,
                   A(k+1, n), ldak1,
                   AT(n),     A->mb);

               /*  Right application on | A1 A2 | */
374
               INSERT_TASK_ztsmlq(
375
                   &options,
376
                   ChamRight, ChamConjTrans,
377 378 379 380 381 382 383
                   A->mb, A->nb, A->mb, tempnn, A->nb, ib, A->nb,
                   D(k+1),    ldak1,
                   A(k+1, n), ldak1,
                   A(k,   n), ldak,
                   T(k,   n), T->mb);

               /*  Right application on | A2' A3 | */
384
               INSERT_TASK_ztsmlq(
385
                   &options,
386
                   ChamRight, ChamConjTrans,
387 388 389 390 391 392 393 394
                   tempnn, A->nb, tempnn, tempnn, A->nb, ib, A->nb,
                   AT(n),    A->mb,
                   D(n),     ldan,
                   A(k,  n), ldak,
                   T(k,  n), T->mb);

               /*  Left application on |A1 | */
               /*                      |A2'| */
395
               INSERT_TASK_ztsmlq(
396
                   &options,
397
                   ChamLeft, ChamNoTrans,
398 399 400 401 402 403 404 405
                   A->mb, A->nb, tempnn, A->nb, A->nb, ib, A->nb,
                   D(k+1),  ldak1,
                   AT(n),   A->mb,
                   A(k, n), ldak,
                   T(k, n), T->mb);

               /*  Left application on | A2 | */
               /*                      | A3 | */
406
               INSERT_TASK_ztsmlq(
407
                   &options,
408
                   ChamLeft, ChamNoTrans,
409 410 411 412 413 414 415
                   A->mb, tempnn, tempnn, tempnn, A->nb, ib, A->nb,
                   A(k+1, n), ldak1,
                   D(n),      ldan,
                   A(k,   n), ldak,
                   T(k,   n), T->mb);
           }
           options.priority = 0;
416

Mathieu Faverge's avatar
Mathieu Faverge committed
417
           RUNTIME_iteration_pop(chamctxt);
418 419 420 421 422 423 424
       }
    }

    /* Copy-back into A */
    for (k = 1; k < A->nt; k++){
        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
        ldak = BLKLDD(A, k);
425
        INSERT_TASK_zlacpy(&options,
426 427 428 429 430 431
                          uplo,
                          tempkn, tempkn, ldak,
                          D(k), ldak,
                          A(k, k), ldak);
    }

432

433
    RUNTIME_options_ws_free(&options);
Mathieu Faverge's avatar
Mathieu Faverge committed
434
    RUNTIME_options_finalize(&options, chamctxt);
435

436
    CHAMELEON_Sequence_Wait(sequence);
437 438
    chameleon_desc_destroy( &D );
    chameleon_desc_destroy( &AT );
Mathieu Faverge's avatar
Mathieu Faverge committed
439 440

    (void)E;
441
}