pzgeqrfrh.c 6.8 KB
Newer Older
1 2
/**
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
3 4
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
5 6
 * @copyright 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
7
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
8
 ***
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
 *
 * @file pzgeqrfrh.c
 *
 *  MORSE auxiliary routines
 *  MORSE is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.5.0
 * @comment This file has been automatically generated
 *          from Plasma 2.5.0 for MORSE 1.0.0
 * @author Jakub Kurzak
 * @author Hatem Ltaief
 * @author Dulceneia Becker
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @date 2010-11-15
 * @precisions normal z -> s d c
 *
 **/
29
#include "control/common.h"
30

31 32
#define A(m,n)  A,  (m),  (n)
#define T(m,n)  T,  (m),  (n)
33
#define T2(m,n) T,  (m), ((n)+A->nt)
34
#if defined(CHAMELEON_COPY_DIAG)
35
#define D(m,n) D, ((m)/BS), 0
36
#else
37
#define D(m,n) A,  (m),  (n)
38
#endif
39

Mathieu Faverge's avatar
Mathieu Faverge committed
40
/*******************************************************************************
41 42
 *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
 **/
43
void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, int BS,
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
                     MORSE_sequence_t *sequence, MORSE_request_t *request)
{
    MORSE_context_t *morse;
    MORSE_option_t options;
    size_t ws_worker = 0;
    size_t ws_host = 0;

    int k, m, n;
    int K, M, RD;
    int ldaM, ldam, ldaMRD;
    int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
    int ib;

    morse = morse_context_self();
    if (sequence->status != MORSE_SUCCESS)
        return;
    RUNTIME_options_init(&options, morse, sequence, request);

    ib = MORSE_IB;

    /*
     * zgeqrt = A->nb * (ib+1)
     * zunmqr = A->nb * ib
     * ztsqrt = A->nb * (ib+1)
     * zttqrt = A->nb * (ib+1)
     * ztsmqr = A->nb * ib
     * zttmqr = A->nb * ib
     */
    ws_worker = A->nb * (ib+1);

    /* Allocation of temporary (scratch) working space */
Mathieu Faverge's avatar
Mathieu Faverge committed
75 76 77 78 79 80
#if defined(CHAMELEON_USE_CUDA)
    /* Worker space
     *
     * zunmqr = A->nb * ib
     * ztsmqr = 2 * A->nb * ib
     */
81
    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
Mathieu Faverge's avatar
Mathieu Faverge committed
82 83
#endif

84 85 86 87 88
    ws_worker *= sizeof(MORSE_Complex64_t);
    ws_host   *= sizeof(MORSE_Complex64_t);

    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );

89
    K = chameleon_min(A->mt, A->nt);
90
    for (k = 0; k < K; k++) {
91
        RUNTIME_iteration_push(morse, k);
92

93 94 95
        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
        for (M = k; M < A->mt; M += BS) {
            tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb;
96
            tempkmin = chameleon_min(tempMm, tempkn);
97
            ldaM = BLKLDD(A, M);
98

99 100 101 102 103 104
            MORSE_TASK_zgeqrt(
                &options,
                tempMm, tempkn, ib, T->nb,
                A(M, k), ldaM,
                T(M, k), T->mb);
            if ( k < (A->nt-1) ) {
105 106 107 108 109
#if defined(CHAMELEON_COPY_DIAG)
            MORSE_TASK_zlacpy(
                &options,
                MorseLower, tempMm, A->nb, A->nb,
                A(M, k), ldaM,
110
                D(M, k), ldaM );
Mathieu Faverge's avatar
Mathieu Faverge committed
111
#if defined(CHAMELEON_USE_CUDA)
112 113 114 115
                MORSE_TASK_zlaset(
                    &options,
                    MorseUpper, tempMm, A->nb,
                    0., 1.,
116
                    D(M, k), ldaM );
117
#endif
118
#endif
119
            }
120 121 122 123 124 125
            for (n = k+1; n < A->nt; n++) {
                tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
                MORSE_TASK_zunmqr(
                    &options,
                    MorseLeft, MorseConjTrans,
                    tempMm, tempnn, tempkmin, ib, T->nb,
126
                    D(M, k), ldaM,
127 128 129
                    T(M, k), T->mb,
                    A(M, n), ldaM);
            }
130

131
            for (m = M+1; m < chameleon_min(M+BS, A->mt); m++) {
132 133
                tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
                ldam = BLKLDD(A, m);
134 135 136 137

                RUNTIME_data_migrate( sequence, A(M, k),
                                      A->get_rankof( A, m, k ) );

138
                /* TS kernel */
139
                MORSE_TASK_ztpqrt(
140
                    &options,
141
                    tempmm, tempkn, 0, ib, T->nb,
142 143 144 145 146 147
                    A(M, k), ldaM,
                    A(m, k), ldam,
                    T(m, k), T->mb);

                for (n = k+1; n < A->nt; n++) {
                    tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
148 149 150 151 152

                    RUNTIME_data_migrate( sequence, A(M, n),
                                          A->get_rankof( A, m, n ) );

                    MORSE_TASK_ztpmqrt(
153 154
                        &options,
                        MorseLeft, MorseConjTrans,
155
                        tempmm, tempnn, A->nb, 0, ib, T->nb,
156
                        A(m, k), ldam,
157 158 159
                        T(m, k), T->mb,
                        A(M, n), ldaM,
                        A(m, n), ldam);
160 161 162 163 164 165 166 167
                }
            }
        }
        for (RD = BS; RD < A->mt-k; RD *= 2) {
            for (M = k; M+RD < A->mt; M += 2*RD) {
                tempMRDm = M+RD == A->mt-1 ? A->m-(M+RD)*A->mb : A->mb;
                ldaM   = BLKLDD(A, M   );
                ldaMRD = BLKLDD(A, M+RD);
168 169 170 171 172 173

                RUNTIME_data_migrate( sequence, A(M, k),
                                      A->get_rankof( A, M+RD, k ) );
                RUNTIME_data_migrate( sequence, A(M+RD, k),
                                      A->get_rankof( A, M+RD, k ) );

174
                /* TT kernel */
175
                MORSE_TASK_ztpqrt(
176
                    &options,
Mathieu Faverge's avatar
Mathieu Faverge committed
177
                    tempMRDm, tempkn, chameleon_min( tempMRDm, tempkn ), ib, T->nb,
178 179 180 181 182 183
                    A (M   , k), ldaM,
                    A (M+RD, k), ldaMRD,
                    T2(M+RD, k), T->mb);

                for (n = k+1; n < A->nt; n++) {
                    tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
184 185 186 187 188 189 190

                    RUNTIME_data_migrate( sequence, A(M, n),
                                          A->get_rankof( A, M+RD, n ) );
                    RUNTIME_data_migrate( sequence, A(M+RD, n),
                                          A->get_rankof( A, M+RD, n ) );

                    MORSE_TASK_ztpmqrt(
191 192
                        &options,
                        MorseLeft, MorseConjTrans,
193
                        tempMRDm, tempnn, A->nb, tempMRDm, ib, T->nb,
194
                        A (M+RD, k), ldaMRD,
195 196 197
                        T2(M+RD, k), T->mb,
                        A (M,    n), ldaM,
                        A (M+RD, n), ldaMRD);
198 199 200
                }
            }
        }
201 202 203 204 205 206 207

        /* Restore the original location of the tiles */
        for (n = k; n < A->nt; n++) {
            RUNTIME_data_migrate( sequence, A(k, n),
                                  A->get_rankof( A, k, n ) );
        }

208
        RUNTIME_iteration_pop(morse);
209
    }
Mathieu Faverge's avatar
Mathieu Faverge committed
210

211 212
    RUNTIME_options_ws_free(&options);
    RUNTIME_options_finalize(&options, morse);
Mathieu Faverge's avatar
Mathieu Faverge committed
213
    (void)D;
214
}