pzhetrd_he2hb.c 14.4 KB
Newer Older
1
/**
2 3
 *
 * @file pzhetrd_he2hb.c
4
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
5 6
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
7 8
 * @copyright 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
9
 *
10
 ***
11
 *
12
 * @brief Chameleon zhetrd_he2hb parallel algorithm
13
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
14
 * @version 1.0.0
15 16 17 18 19
 * @author Hatem Ltaief
 * @author Azzam Haidar
 * @date 2010-11-15
 * @precisions normal z -> s d c
 *
20
 */
21
#include "control/common.h"
Mathieu Faverge's avatar
Mathieu Faverge committed
22
#include <stdlib.h>
23 24 25 26 27 28 29 30 31 32 33 34 35

#define A(m, n) A,  m,  n
#define T(m, n) T,  m,  n
#define D(k) D, (k)-1, 0

#define AT(k) AT, k, 0

#if defined(CHAMELEON_COPY_DIAG)
#define E(m, n) E, m, 0
#else
#define E(m, n) A, m, n
#endif

36
/**
37
 *  Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
38
 */
Mathieu Faverge's avatar
Mathieu Faverge committed
39
void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
40 41
                         CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *E,
                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
42
{
Mathieu Faverge's avatar
Mathieu Faverge committed
43
    CHAM_context_t *chamctxt;
44 45 46
    RUNTIME_option_t options;
    CHAM_desc_t *D  = NULL;
    CHAM_desc_t *AT = NULL;
47 48 49 50 51 52 53 54
    size_t ws_worker = 0;
    size_t ws_host = 0;

    int k, m, n, i, j;
    int ldak, ldak1, ldam, ldan, ldaj, ldai;
    int tempkm, tempkn, tempmm, tempnn, tempjj;
    int ib;

Mathieu Faverge's avatar
Mathieu Faverge committed
55
    chamctxt = chameleon_context_self();
56
    if (sequence->status != CHAMELEON_SUCCESS)
57 58
        return;

Mathieu Faverge's avatar
Mathieu Faverge committed
59
    RUNTIME_options_init(&options, chamctxt, sequence, request);
60
    ib = CHAMELEON_IB;
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79

    /*
     * zgeqrt        = A->nb * (ib+1)
     * zunmqr        = A->nb * ib
     * ztsqrt        = A->nb * (ib+1)
     * ztsmqr        = A->nb * ib
     * zherfb        = A->nb * ib
     * ztsmqr_hetra1 = A->nb * ib
     */
    ws_worker = A->nb * (ib+1);

    /* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
    /* Worker space
     *
     * zunmqr = A->nb * ib
     * ztsmqr = 2 * A->nb * ib
     * zherfb = A->nb * ib
     */
80
    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
81 82
#endif

83 84
    ws_worker *= sizeof(CHAMELEON_Complex64_t);
    ws_host   *= sizeof(CHAMELEON_Complex64_t);
85 86 87 88

    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );

    /* Copy of the diagonal tiles to keep the general version of the tile all along the computation */
89
    D = (CHAM_desc_t*)malloc(sizeof(CHAM_desc_t));
Mathieu Faverge's avatar
Mathieu Faverge committed
90
    chameleon_zdesc_alloc_diag(*D, A->mb, A->nb, chameleon_min(A->m, A->n) - A->mb, A->nb, 0, 0, chameleon_min(A->m, A->n) - A->mb, A->nb, A->p, A->q);
91

92
    AT = (CHAM_desc_t*)malloc(sizeof(CHAM_desc_t));
Mathieu Faverge's avatar
Mathieu Faverge committed
93
    *AT = chameleon_desc_init(
94
        ChamComplexDouble, A->mb, A->nb, (A->mb*A->nb),
95
        chameleon_min(A->mt, A->nt) * A->mb, A->nb, 0, 0, chameleon_min(A->mt, A->nt) * A->mb, A->nb, 1, 1);
Mathieu Faverge's avatar
Mathieu Faverge committed
96
    chameleon_desc_mat_alloc( AT );
97
    RUNTIME_desc_create( AT );
98 99 100 101 102 103

    /* Let's extract the diagonal in a temporary copy that contains A and A' */
    for (k = 1; k < A->nt; k++){
        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
        ldak = BLKLDD(A, k);

104
        INSERT_TASK_zhe2ge(&options,
105 106 107 108 109 110
                          uplo,
                          tempkn, tempkn, ldak,
                          A(k, k), ldak,
                          D(k),    ldak);
    }

111
    if (uplo == ChamLower) {
112
       for (k = 0; k < A->nt-1; k++){
Mathieu Faverge's avatar
Mathieu Faverge committed
113
           RUNTIME_iteration_push(chamctxt, k);
114

115 116 117 118
           tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb;
           tempkn = k   == A->nt-1 ? A->n- k   *A->nb : A->nb;
           ldak1 = BLKLDD(A, k+1);

119
           INSERT_TASK_zgeqrt(
120 121 122 123 124 125
               &options,
               tempkm, tempkn, ib, A->nb,
               A(k+1, k), ldak1,
               T(k+1, k), T->mb);

#if defined(CHAMELEON_COPY_DIAG)
126
           INSERT_TASK_zlacpy(
127
               &options,
128
               ChamLower, tempkm, tempkn, A->nb,
129 130 131
               A(k+1, k), ldak1,
               E(k+1, k), ldak1 );
#if defined(CHAMELEON_USE_CUDA)
132
           INSERT_TASK_zlaset(
133
               &options,
134
               ChamUpper, tempkm, tempkn,
135 136 137 138 139 140
               0., 1.,
               E(k+1, k), ldak1 );
#endif
#endif

           /* LEFT and RIGHT on the symmetric diagonal block */
141
           INSERT_TASK_zherfb(
142
               &options,
143
               ChamLower,
144 145 146 147 148 149 150 151 152
               tempkm, tempkm, ib, A->nb,
               E(k+1, k), ldak1,
               T(k+1, k), T->mb,
               D(k+1),    ldak1);

           /* RIGHT on the remaining tiles until the bottom */
           for (m = k+2; m < A->mt ; m++) {
               tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
               ldam = BLKLDD(A, m);
153
               INSERT_TASK_zunmqr(
154
                   &options,
155
                   ChamRight, ChamNoTrans,
156 157 158 159 160 161 162 163 164 165 166
                   tempmm, A->nb, tempkm, ib, A->nb,
                   E(k+1, k),   ldak1,
                   T(k+1, k),   T->mb,
                   A(m,   k+1), ldam);
           }

           for (m = k+2; m < A->mt; m++) {
               tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
               ldam = BLKLDD(A, m);

               options.priority = 1;
167
               INSERT_TASK_ztsqrt(
168 169 170 171 172 173 174 175 176 177
                   &options,
                   tempmm, A->nb, ib, A->nb,
                   A(k+1, k), ldak1,
                   A(m  , k), ldam,
                   T(m  , k), T->mb);
               options.priority = 0;

               /* LEFT */
               for (i = k+2; i < m; i++) {
                   ldai = BLKLDD(A, i);
178
                   INSERT_TASK_ztsmqr_hetra1(
179
                       &options,
180
                       ChamLeft, ChamConjTrans,
181 182 183 184 185 186 187 188 189 190 191
                       A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
                       A(i, k+1), ldai,
                       A(m,   i), ldam,
                       A(m,   k), ldam,
                       T(m,   k), T->mb);
               }

               /* RIGHT */
               for (j = m+1; j < A->mt ; j++) {
                   tempjj = j == A->mt-1 ? A->m-j*A->mb : A->mb;
                   ldaj = BLKLDD(A, j);
192
                   INSERT_TASK_ztsmqr(
193
                       &options,
194
                       ChamRight, ChamNoTrans,
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
                       tempjj, A->nb, tempjj, tempmm, A->nb, ib, A->nb,
                       A(j, k+1), ldaj,
                       A(j,   m), ldaj,
                       A(m,   k), ldam,
                       T(m,   k), T->mb);
               }

               /* LEFT->RIGHT */
               options.priority = 1;
               /**
                * Compute the update of the following:
                *
                *     A1   A2'
                *     A2   A3
                *
                * with A1 and A3 two diagonal tiles. This is the tsmqr_corner
                * from plasma split in 4 tasks
                */
               /*  Copy the transpose of A2 (m, k+1): AT(k) <- A2' = A2(k+1, m) */
214
               INSERT_TASK_zlatro(
215
                   &options,
216
                   ChamUpperLower, ChamConjTrans,
217 218 219 220 221 222
                   tempmm, A->nb, A->nb,
                   A(m, k+1), ldam,
                   AT(m),  ldak1);

               /*  Left application on |A1| */
               /*                      |A2| */
223
               INSERT_TASK_ztsmqr(
224
                   &options,
225
                   ChamLeft, ChamConjTrans,
226 227 228 229 230 231 232 233
                   A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
                   D(k+1), ldak1,
                   A(m, k+1), ldam,
                   A(m,   k), ldam,
                   T(m,   k), T->mb);

               /*  Left application on | A2'| */
               /*                      | A3 | */
234
               INSERT_TASK_ztsmqr(
235
                   &options,
236
                   ChamLeft, ChamConjTrans,
237 238 239 240 241 242 243
                   A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb,
                   AT(m), ldak1,
                   D(m) , ldam,
                   A(m,  k), ldam,
                   T(m,  k), T->mb);

               /*  Right application on | A1 A2' | */
244
               INSERT_TASK_ztsmqr(
245
                   &options,
246
                   ChamRight, ChamNoTrans,
247 248 249 250 251 252 253
                   A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb,
                   D(k+1), ldak1,
                   AT(m) , ldak1,
                   A(m,   k), ldam,
                   T(m,   k), T->mb);

               /*  Right application on | A2 A3 | */
254
               INSERT_TASK_ztsmqr(
255
                   &options,
256
                   ChamRight, ChamNoTrans,
257 258 259 260 261 262 263
                   tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb,
                   A(m, k+1), ldam,
                   D(m)  , ldam,
                   A(m,   k), ldam,
                   T(m,   k), T->mb);
               options.priority = 0;
           }
264

Mathieu Faverge's avatar
Mathieu Faverge committed
265
           RUNTIME_iteration_pop(chamctxt);
266 267 268 269
       }
    }
    else {
       for (k = 0; k < A->nt-1; k++){
Mathieu Faverge's avatar
Mathieu Faverge committed
270
           RUNTIME_iteration_push(chamctxt, k);
271

272 273 274 275
           tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb;
           tempkm = k   == A->mt-1 ? A->m- k   *A->mb : A->mb;
           ldak  = BLKLDD(A, k);
           ldak1 = BLKLDD(A, k+1);
276
           INSERT_TASK_zgelqt(
277 278 279 280 281 282
               &options,
               tempkm, tempkn, ib, A->nb,
               A(k, k+1), ldak,
               T(k, k+1), T->mb);

#if defined(CHAMELEON_COPY_DIAG)
283
           INSERT_TASK_zlacpy(
284
               &options,
285
               ChamUpper, tempkm, tempkn, A->nb,
286 287 288
               A(k, k+1), ldak,
               E(k, k+1), ldak );
#if defined(CHAMELEON_USE_CUDA)
289
           INSERT_TASK_zlaset(
290
               &options,
291
               ChamLower, tempkm, tempkn,
292 293 294 295 296 297
               0., 1.,
               E(k, k+1), ldak );
#endif
#endif

           /* RIGHT and LEFT on the symmetric diagonal block */
298
           INSERT_TASK_zherfb(
299
               &options,
300
               ChamUpper,
301 302 303 304 305 306 307 308
               tempkn, tempkn, ib, A->nb,
               E(k, k+1), ldak,
               T(k, k+1), T->mb,
               D(k+1),    ldak1);

           /* LEFT on the remaining tiles until the left side */
           for (n = k+2; n < A->nt ; n++) {
               tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
309
               INSERT_TASK_zunmlq(
310
                   &options,
311
                   ChamLeft, ChamNoTrans,
312 313 314 315 316 317 318 319 320 321
                   A->mb, tempnn, tempkn, ib, A->nb,
                   E(k,   k+1), ldak,
                   T(k,   k+1), T->mb,
                   A(k+1, n  ), ldak1);
           }

           for (n = k+2; n < A->nt; n++) {
               tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
               ldan = BLKLDD(A, n);
               options.priority = 1;
322
               INSERT_TASK_ztslqt(
323 324 325 326 327 328 329 330 331 332
                   &options,
                   A->mb, tempnn, ib, A->nb,
                   A(k, k+1), ldak,
                   A(k, n  ), ldak,
                   T(k, n  ), T->mb);
               options.priority = 0;

               /* RIGHT */
               for (i = k+2; i < n; i++) {
                   ldai = BLKLDD(A, i);
333
                   INSERT_TASK_ztsmlq_hetra1(
334
                       &options,
335
                       ChamRight, ChamConjTrans,
336 337 338 339 340 341 342 343 344 345
                       A->mb, A->nb, A->nb, tempnn, A->nb, ib, A->nb,
                       A(k+1, i), ldak1,
                       A(i,   n), ldai,
                       A(k,   n), ldak,
                       T(k,   n), T->mb);
               }

               /* LEFT */
               for (j = n+1; j < A->nt ; j++) {
                   tempjj = j == A->nt-1 ? A->n-j*A->nb : A->nb;
346
                   INSERT_TASK_ztsmlq(
347
                       &options,
348
                       ChamLeft, ChamNoTrans,
349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
                       A->nb, tempjj, tempnn, tempjj, A->nb, ib, A->nb,
                       A(k+1, j), ldak1,
                       A(n,   j), ldan,
                       A(k,   n), ldak,
                       T(k,   n), T->mb);
               }

               /* RIGHT->LEFT */
               options.priority = 1;
               /**
                * Compute the update of the following:
                *
                *     A1   A2
                *     A2'  A3
                *
                * with A1 and A3 two diagonal tiles. This is the tsmqr_corner
                * from plasma split in 4 tasks
                */
               /*  Copy the transpose of A2: AT(k) <- A2' */
368
               INSERT_TASK_zlatro(
369
                   &options,
370
                   ChamUpperLower, ChamConjTrans,
371 372 373 374 375
                   A->mb, tempnn, A->nb,
                   A(k+1, n), ldak1,
                   AT(n),     A->mb);

               /*  Right application on | A1 A2 | */
376
               INSERT_TASK_ztsmlq(
377
                   &options,
378
                   ChamRight, ChamConjTrans,
379 380 381 382 383 384 385
                   A->mb, A->nb, A->mb, tempnn, A->nb, ib, A->nb,
                   D(k+1),    ldak1,
                   A(k+1, n), ldak1,
                   A(k,   n), ldak,
                   T(k,   n), T->mb);

               /*  Right application on | A2' A3 | */
386
               INSERT_TASK_ztsmlq(
387
                   &options,
388
                   ChamRight, ChamConjTrans,
389 390 391 392 393 394 395 396
                   tempnn, A->nb, tempnn, tempnn, A->nb, ib, A->nb,
                   AT(n),    A->mb,
                   D(n),     ldan,
                   A(k,  n), ldak,
                   T(k,  n), T->mb);

               /*  Left application on |A1 | */
               /*                      |A2'| */
397
               INSERT_TASK_ztsmlq(
398
                   &options,
399
                   ChamLeft, ChamNoTrans,
400 401 402 403 404 405 406 407
                   A->mb, A->nb, tempnn, A->nb, A->nb, ib, A->nb,
                   D(k+1),  ldak1,
                   AT(n),   A->mb,
                   A(k, n), ldak,
                   T(k, n), T->mb);

               /*  Left application on | A2 | */
               /*                      | A3 | */
408
               INSERT_TASK_ztsmlq(
409
                   &options,
410
                   ChamLeft, ChamNoTrans,
411 412 413 414 415 416 417
                   A->mb, tempnn, tempnn, tempnn, A->nb, ib, A->nb,
                   A(k+1, n), ldak1,
                   D(n),      ldan,
                   A(k,   n), ldak,
                   T(k,   n), T->mb);
           }
           options.priority = 0;
418

Mathieu Faverge's avatar
Mathieu Faverge committed
419
           RUNTIME_iteration_pop(chamctxt);
420 421 422 423 424 425 426
       }
    }

    /* Copy-back into A */
    for (k = 1; k < A->nt; k++){
        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
        ldak = BLKLDD(A, k);
427
        INSERT_TASK_zlacpy(&options,
428 429 430 431 432 433
                          uplo,
                          tempkn, tempkn, ldak,
                          D(k), ldak,
                          A(k, k), ldak);
    }

434

435
    RUNTIME_options_ws_free(&options);
Mathieu Faverge's avatar
Mathieu Faverge committed
436
    RUNTIME_options_finalize(&options, chamctxt);
437

438
    CHAMELEON_Sequence_Wait(sequence);
Mathieu Faverge's avatar
Mathieu Faverge committed
439
    chameleon_desc_mat_free(D);
440 441
    free(D);

Mathieu Faverge's avatar
Mathieu Faverge committed
442
    chameleon_desc_mat_free(AT);
443
    free(AT);
Mathieu Faverge's avatar
Mathieu Faverge committed
444 445

    (void)E;
446
}