/**
 *
 * @file pzunmqr.c
 *
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
 * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 ***
 *
 * @brief Chameleon zunmqr parallel algorithm
 *
 * @version 1.0.0
 * @comment This file has been automatically generated
 *          from Plasma 2.5.0 for CHAMELEON 1.0.0
 * @author Hatem Ltaief
 * @author Jakub Kurzak
 * @author Azzam Haidar
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @date 2018-11-09
 * @precisions normal z -> s d c
 *
 */
#include "control/common.h"
28 29 30 31

#define A(m,n) A,  m,  n
#define B(m,n) B,  m,  n
#define T(m,n) T,  m,  n
32
#define D(k)   D,  k,  k
33

34
/**
35
 *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
36
 */
37 38 39
void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                        CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, CHAM_desc_t *D,
                        RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
40
{
Mathieu Faverge's avatar
Mathieu Faverge committed
41
    CHAM_context_t *chamctxt;
42
    RUNTIME_option_t options;
43 44 45 46
    size_t ws_worker = 0;
    size_t ws_host = 0;

    int k, m, n;
47
    int ldak, ldbk, ldam, ldan, ldbm, lddk;
48 49 50
    int tempkm, tempnn, tempkmin, tempmm, tempkn;
    int ib, minMT, minM;

Mathieu Faverge's avatar
Mathieu Faverge committed
51
    chamctxt = chameleon_context_self();
Mathieu Faverge's avatar
Mathieu Faverge committed
52
    if (sequence->status != CHAMELEON_SUCCESS) {
53
        return;
Mathieu Faverge's avatar
Mathieu Faverge committed
54
    }
Mathieu Faverge's avatar
Mathieu Faverge committed
55
    RUNTIME_options_init(&options, chamctxt, sequence, request);
56

57
    ib = CHAMELEON_IB;
58

59 60 61 62 63 64 65 66
    if (A->m > A->n) {
        minM  = A->n;
        minMT = A->nt;
    } else {
        minM  = A->m;
        minMT = A->mt;
    }

67 68 69
    if ( D == NULL ) {
        D    = A;
        genD = 0;
70 71
    }

72
    /*
73 74
     * zunmqr  = A->nb * ib
     * ztpmqrt = A->nb * ib
75 76 77
     */
    ws_worker = A->nb * ib;

Mathieu Faverge's avatar
Mathieu Faverge committed
78
#if defined(CHAMELEON_USE_CUDA)
79 80
    /* Worker space
     *
81 82
     * zunmqr  =     A->nb * ib
     * ztpmqrt = 2 * A->nb * ib
83
     */
84
    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
85 86
#endif

87 88
    ws_worker *= sizeof(CHAMELEON_Complex64_t);
    ws_host   *= sizeof(CHAMELEON_Complex64_t);
89 90 91

    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );

92 93
    if (side == ChamLeft ) {
        if (trans == ChamConjTrans) {
94
            /*
95
             *  ChamLeft / ChamConjTrans
96
             */
97
            for (k = 0; k < minMT; k++) {
Mathieu Faverge's avatar
Mathieu Faverge committed
98
                RUNTIME_iteration_push(chamctxt, k);
99

100 101 102
                tempkm   = k == B->mt-1 ? B->m-k*B->mb : B->mb;
                tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                ldak = BLKLDD(A, k);
103
                lddk = BLKLDD(D, k);
104
                ldbk = BLKLDD(B, k);
105 106 107 108 109
                if ( genD ) {
                    INSERT_TASK_zlacpy(
                        &options,
                        ChamLower, tempkm, tempkmin, A->nb,
                        A(k, k), ldak,
110
                        D(k),    lddk );
Mathieu Faverge's avatar
Mathieu Faverge committed
111
#if defined(CHAMELEON_USE_CUDA)
112 113 114 115
                    INSERT_TASK_zlaset(
                        &options,
                        ChamUpper, tempkm, tempkmin,
                        0., 1.,
116
                        D(k), lddk );
117
#endif
118
                }
119 120
                for (n = 0; n < B->nt; n++) {
                    tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
121
                    INSERT_TASK_zunmqr(
122 123 124
                        &options,
                        side, trans,
                        tempkm, tempnn, tempkmin, ib, T->nb,
125
                        D(k),    lddk,
126 127 128
                        T(k, k), T->mb,
                        B(k, n), ldbk);
                }
129

Mathieu Faverge's avatar
Mathieu Faverge committed
130 131
                RUNTIME_data_flush( sequence, D(k)    );
                RUNTIME_data_flush( sequence, T(k, k) );
132

133 134 135 136 137 138
                for (m = k+1; m < B->mt; m++) {
                    tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B->nt; n++) {
                        tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
139 140 141 142

                        RUNTIME_data_migrate( sequence, B(k, n),
                                              B->get_rankof( B, m, n ) );

143
                        /* TS kernel */
144
                        INSERT_TASK_ztpmqrt(
145 146
                            &options,
                            side, trans,
147
                            tempmm, tempnn, tempkmin, 0, ib, T->nb,
148
                            A(m, k), ldam,
149 150 151
                            T(m, k), T->mb,
                            B(k, n), ldbk,
                            B(m, n), ldbm);
152
                    }
153

Mathieu Faverge's avatar
Mathieu Faverge committed
154 155
                    RUNTIME_data_flush( sequence, A(m, k) );
                    RUNTIME_data_flush( sequence, T(m, k) );
156
                }
157

158 159 160 161 162 163
                /* Restore the original location of the tiles */
                for (n = 0; n < B->nt; n++) {
                    RUNTIME_data_migrate( sequence, B(k, n),
                                          B->get_rankof( B, k, n ) );
                }

Mathieu Faverge's avatar
Mathieu Faverge committed
164
                RUNTIME_iteration_pop(chamctxt);
165 166 167
            }
        }
        /*
168
         *  ChamLeft / ChamNoTrans
169 170 171
         */
        else {
            for (k = minMT-1; k >= 0; k--) {
Mathieu Faverge's avatar
Mathieu Faverge committed
172
                RUNTIME_iteration_push(chamctxt, k);
173

Mathieu Faverge's avatar
Mathieu Faverge committed
174
                tempkm   = k == B->mt-1 ? B->m-k*B->mb : B->mb;
175 176 177 178 179 180 181 182 183
                tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                ldak = BLKLDD(A, k);
                ldbk = BLKLDD(B, k);
                for (m = B->mt-1; m > k; m--) {
                    tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                    ldam = BLKLDD(A, m);
                    ldbm = BLKLDD(B, m);
                    for (n = 0; n < B->nt; n++) {
                        tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
184 185 186 187

                        RUNTIME_data_migrate( sequence, B(k, n),
                                              B->get_rankof( B, m, n ) );

188
                        /* TS kernel */
189
                        INSERT_TASK_ztpmqrt(
190 191
                            &options,
                            side, trans,
192
                            tempmm, tempnn, tempkmin, 0, ib, T->nb,
193
                            A(m, k), ldam,
194 195 196
                            T(m, k), T->mb,
                            B(k, n), ldbk,
                            B(m, n), ldbm);
197
                    }
Mathieu Faverge's avatar
Mathieu Faverge committed
198 199
                    RUNTIME_data_flush( sequence, A(m, k) );
                    RUNTIME_data_flush( sequence, T(m, k) );
200
                }
201

202 203 204 205 206 207
                if ( genD ) {
                    INSERT_TASK_zlacpy(
                        &options,
                        ChamLower, tempkm, tempkmin, A->nb,
                        A(k, k), ldak,
                        D(k), ldak );
Mathieu Faverge's avatar
Mathieu Faverge committed
208
#if defined(CHAMELEON_USE_CUDA)
209 210 211 212 213
                    INSERT_TASK_zlaset(
                        &options,
                        ChamUpper, tempkm, tempkmin,
                        0., 1.,
                        D(k), ldak );
214
#endif
215
                }
216 217
                for (n = 0; n < B->nt; n++) {
                    tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
218 219 220 221

                    RUNTIME_data_migrate( sequence, B(k, n),
                                          B->get_rankof( B, k, n ) );

222
                    INSERT_TASK_zunmqr(
223 224 225
                        &options,
                        side, trans,
                        tempkm, tempnn, tempkmin, ib, T->nb,
226
                        D(k), ldak,
227 228 229
                        T(k, k), T->mb,
                        B(k, n), ldbk);
                }
Mathieu Faverge's avatar
Mathieu Faverge committed
230 231
                RUNTIME_data_flush( sequence, D(k)    );
                RUNTIME_data_flush( sequence, T(k, k) );
Mathieu Faverge's avatar
Mathieu Faverge committed
232
                RUNTIME_iteration_pop(chamctxt);
233 234 235 236
            }
        }
    }
    /*
237
     *  ChamRight / ChamConjTrans
238 239
     */
    else {
240
        if (trans == ChamConjTrans) {
241
            for (k = minMT-1; k >= 0; k--) {
Mathieu Faverge's avatar
Mathieu Faverge committed
242
                RUNTIME_iteration_push(chamctxt, k);
243

Mathieu Faverge's avatar
Mathieu Faverge committed
244 245
                tempkn   = k == B->nt - 1 ? B->n - k * B->nb : B->nb;
                tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb;
246 247 248 249 250 251 252
                ldak = BLKLDD(A, k);
                for (n = B->nt-1; n > k; n--) {
                    tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
                    ldan = BLKLDD(A, n);
                    for (m = 0; m < B->mt; m++) {
                        tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                        ldbm = BLKLDD(B, m);
253 254 255 256

                        RUNTIME_data_migrate( sequence, B(m, k),
                                              B->get_rankof( B, m, n ) );

257
                        /* TS kernel */
258
                        INSERT_TASK_ztpmqrt(
259 260
                            &options,
                            side, trans,
261
                            tempmm, tempnn, tempkmin, 0, ib, T->nb,
262
                            A(n, k), ldan,
263 264 265
                            T(n, k), T->mb,
                            B(m, k), ldbm,
                            B(m, n), ldbm);
266
                    }
267

Mathieu Faverge's avatar
Mathieu Faverge committed
268 269
                    RUNTIME_data_flush( sequence, A(n, k) );
                    RUNTIME_data_flush( sequence, T(n, k) );
270
                }
271 272 273 274 275 276
                if ( genD ) {
                    INSERT_TASK_zlacpy(
                        &options,
                        ChamLower, tempkn, tempkmin, A->nb,
                        A(k, k), ldak,
                        D(k), ldak );
Mathieu Faverge's avatar
Mathieu Faverge committed
277
#if defined(CHAMELEON_USE_CUDA)
278 279 280 281 282
                    INSERT_TASK_zlaset(
                        &options,
                        ChamUpper, tempkn, tempkmin,
                        0., 1.,
                        D(k), ldak );
283
#endif
284
                }
285 286 287
                for (m = 0; m < B->mt; m++) {
                    tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                    ldbm = BLKLDD(B, m);
288 289 290 291

                    RUNTIME_data_migrate( sequence, B(m, k),
                                          B->get_rankof( B, m, k ) );

292
                    INSERT_TASK_zunmqr(
293 294 295
                        &options,
                        side, trans,
                        tempmm, tempkn, tempkmin, ib, T->nb,
296
                        D(k), ldak,
297 298 299
                        T(k, k), T->mb,
                        B(m, k), ldbm);
                }
300

Mathieu Faverge's avatar
Mathieu Faverge committed
301 302
                RUNTIME_data_flush( sequence, D(k)    );
                RUNTIME_data_flush( sequence, T(k, k) );
303

Mathieu Faverge's avatar
Mathieu Faverge committed
304
                RUNTIME_iteration_pop(chamctxt);
305 306 307
            }
        }
        /*
308
         *  ChamRight / ChamNoTrans
309 310 311
         */
        else {
            for (k = 0; k < minMT; k++) {
Mathieu Faverge's avatar
Mathieu Faverge committed
312
                RUNTIME_iteration_push(chamctxt, k);
313

314 315 316
                tempkn   = k == B->nt-1 ? B->n-k*B->nb : B->nb;
                tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                ldak = BLKLDD(A, k);
317 318 319 320 321 322
                if ( genD ) {
                    INSERT_TASK_zlacpy(
                        &options,
                        ChamLower, tempkn, tempkmin, A->nb,
                        A(k, k), ldak,
                        D(k), ldak );
Mathieu Faverge's avatar
Mathieu Faverge committed
323
#if defined(CHAMELEON_USE_CUDA)
324 325 326 327 328
                    INSERT_TASK_zlaset(
                        &options,
                        ChamUpper, tempkn, tempkmin,
                        0., 1.,
                        D(k), ldak );
329
#endif
330
                }
331 332 333
                for (m = 0; m < B->mt; m++) {
                    tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                    ldbm = BLKLDD(B, m);
334
                    INSERT_TASK_zunmqr(
335 336 337
                        &options,
                        side, trans,
                        tempmm, tempkn, tempkmin, ib, T->nb,
338
                        D(k), ldak,
339 340 341
                        T(k, k), T->mb,
                        B(m, k), ldbm);
                }
342

Mathieu Faverge's avatar
Mathieu Faverge committed
343 344
                RUNTIME_data_flush( sequence, D(k)    );
                RUNTIME_data_flush( sequence, T(k, k) );
345

346 347 348 349 350 351
                for (n = k+1; n < B->nt; n++) {
                    tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
                    ldan = BLKLDD(A, n);
                    for (m = 0; m < B->mt; m++) {
                        tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                        ldbm = BLKLDD(B, m);
352 353 354 355

                        RUNTIME_data_migrate( sequence, B(m, k),
                                              B->get_rankof( B, m, n ) );

356
                        /* TS kernel */
357
                        INSERT_TASK_ztpmqrt(
358 359
                            &options,
                            side, trans,
360
                            tempmm, tempnn, tempkmin, 0, ib, T->nb,
361
                            A(n, k), ldan,
362 363 364
                            T(n, k), T->mb,
                            B(m, k), ldbm,
                            B(m, n), ldbm);
365
                    }
366

Mathieu Faverge's avatar
Mathieu Faverge committed
367 368
                    RUNTIME_data_flush( sequence, A(n, k) );
                    RUNTIME_data_flush( sequence, T(n, k) );
369
                }
370

371 372 373 374 375 376
                /* Restore the original location of the tiles */
                for (m = 0; m < B->mt; m++) {
                    RUNTIME_data_migrate( sequence, B(m, k),
                                          B->get_rankof( B, m, k ) );
                }

Mathieu Faverge's avatar
Mathieu Faverge committed
377
                RUNTIME_iteration_pop(chamctxt);
378 379 380
            }
        }
    }
381

382
    RUNTIME_options_ws_free(&options);
Mathieu Faverge's avatar
Mathieu Faverge committed
383
    RUNTIME_options_finalize(&options, chamctxt);
384
}