codelet_zttmqr.c 9.61 KB
Newer Older
1 2
/**
 *
3 4
 * @copyright (c) 2009-2014 The University of Tennessee and The University
 *                          of Tennessee Research Foundation.
5
 *                          All rights reserved.
6
 * @copyright (c) 2012-2016 Inria. All rights reserved.
7
 * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 *
 **/

/**
 *
 * @file codelet_zttmqr.c
 *
 *  MORSE codelets kernel
 *  MORSE is a software package provided by Univ. of Tennessee,
 *  Univ. of California Berkeley and Univ. of Colorado Denver
 *
 * @version 2.5.0
 * @comment This file has been automatically generated
 *          from Plasma 2.5.0 for MORSE 1.0.0
 * @author Hatem Ltaief
 * @author Dulceneia Becker
 * @author Mathieu Faverge
 * @author Emmanuel Agullo
 * @author Cedric Castagnede
 * @date 2010-11-15
 * @precisions normal z -> c d s
 *
 **/
31 32
#include "chameleon_starpu.h"
#include "runtime_codelet_z.h"
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72

/**
 *
 * @ingroup CORE_MORSE_Complex64_t
 *
 *  CORE_zttmqr overwrites the general complex M1-by-N1 tile A1 and
 *  M2-by-N2 tile A2 (N1 == N2) with
 *
 *                        SIDE = 'L'        SIDE = 'R'
 *    TRANS = 'N':         Q * | A1 |       | A1 | * Q
 *                             | A2 |       | A2 |
 *
 *    TRANS = 'C':      Q**H * | A1 |       | A1 | * Q**H
 *                             | A2 |       | A2 |
 *
 *  where Q is a complex unitary matrix defined as the product of k
 *  elementary reflectors
 *
 *    Q = H(1) H(2) . . . H(k)
 *
 *  as returned by CORE_zttqrt.
 *
 *******************************************************************************
 *
 * @param[in] side
 *         @arg MorseLeft  : apply Q or Q**H from the Left;
 *         @arg MorseRight : apply Q or Q**H from the Right.
 *
 * @param[in] trans
 *         @arg MorseNoTrans   :  No transpose, apply Q;
 *         @arg MorseConjTrans :  ConjTranspose, apply Q**H.
 *
 * @param[in] M1
 *         The number of rows of the tile A1. M1 >= 0.
 *
 * @param[in] N1
 *         The number of columns of the tile A1. N1 >= 0.
 *
 * @param[in] M2
 *         The number of rows of the tile A2. M2 >= 0.
73
 *         M2 = M1 if side == MorseRight.
74 75 76
 *
 * @param[in] N2
 *         The number of columns of the tile A2. N2 >= 0.
77
 *         N2 = N1 if side == MorseLeft.
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
 *
 * @param[in] K
 *         The number of elementary reflectors whose product defines
 *         the matrix Q.
 *
 * @param[in] IB
 *         The inner-blocking size.  IB >= 0.
 *
 * @param[in,out] A1
 *         On entry, the M1-by-N1 tile A1.
 *         On exit, A1 is overwritten by the application of Q.
 *
 * @param[in] LDA1
 *         The leading dimension of the array A1. LDA1 >= max(1,M1).
 *
 * @param[in,out] A2
 *         On entry, the M2-by-N2 tile A2.
 *         On exit, A2 is overwritten by the application of Q.
 *
 * @param[in] LDA2
 *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
 *
 * @param[in] V
 *         The i-th row must contain the vector which defines the
 *         elementary reflector H(i), for i = 1,2,...,k, as returned by
103
 *         CORE_ZTTQRT in the first k columns of its array argument V.
104 105 106 107
 *
 * @param[in] LDV
 *         The leading dimension of the array V. LDV >= max(1,K).
 *
108
 * @param[in] T
109 110 111 112 113 114 115 116
 *         The IB-by-N1 triangular factor T of the block reflector.
 *         T is upper triangular by block (economic storage);
 *         The rest of the array is not referenced.
 *
 * @param[in] LDT
 *         The leading dimension of the array T. LDT >= IB.
 *
 * @param[out] WORK
117 118 119
 *         Workspace array of size
 *             LDWORK-by-N1 if side == MorseLeft
 *             LDWORK-by-IB if side == MorseRight
120 121
 *
 * @param[in] LDWORK
122 123 124
 *         The leading dimension of the array WORK.
 *             LDWORK >= max(1,IB) if side == MorseLeft
 *             LDWORK >= max(1,M1) if side == MorseRight
125 126 127 128 129 130 131 132 133
 *
 *******************************************************************************
 *
 * @return
 *          \retval MORSE_SUCCESS successful exit
 *          \retval <0 if -i, the i-th argument had an illegal value
 *
 ******************************************************************************/

134
/*
 * MORSE_TASK_zttmqr - Submits one zttmqr task through starpu_insert_task().
 *
 * The scalar arguments are packed by value. Tiles A1(A1m,A1n) and
 * A2(A2m,A2n) are passed in read-write mode, V(Vm,Vn) and T(Tm,Tn) in
 * read-only mode, and options->ws_worker is attached as scratch workspace.
 *
 * The execution rank is initially the rank returned for tile A2. If A1, V or
 * T (tested in that order) holds more than `threshold` times as many
 * elements as A2, the rank returned for that tile is used instead. The
 * threshold defaults to 10 and may be overridden with the
 * MORSE_COMM_FACTOR_THRESHOLD environment variable; a negative value is
 * ignored and the default is kept. The execution rank is only handed to
 * StarPU when CHAMELEON_USE_MPI is defined.
 *
 * nb is used as the leading dimension of the workspace when
 * side != MorseLeft; ib is used otherwise.
 */
void MORSE_TASK_zttmqr(const MORSE_option_t *options,
                       MORSE_enum side, MORSE_enum trans,
                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
                       const MORSE_desc_t *A1, int A1m, int A1n, int lda1,
                       const MORSE_desc_t *A2, int A2m, int A2n, int lda2,
                       const MORSE_desc_t *V, int Vm, int Vn, int ldv,
                       const MORSE_desc_t *T, int Tm, int Tn, int ldt)
{
    struct starpu_codelet *codelet = &cl_zttmqr;
    void (*callback)(void*) = options->profiling ? cl_zttmqr_callback : NULL;
    int ldwork = side == MorseLeft ? ib : nb;
    /* Element counts are kept in 64 bits so that neither the products below
     * nor the scaling by the user-supplied threshold can overflow an int. */
    long long sizeA1 = (long long)lda1 * n1;
    long long sizeA2 = (long long)lda2 * n2;
    long long sizeV  = (long long)ldv  * k;
    long long sizeT  = (long long)ldt  * n1;
    long long limit;
    int execution_rank = A2->get_rankof( A2, A2m, A2n );
    int rank_changed = 0;
    int threshold = 10;
    const char *env = getenv("MORSE_COMM_FACTOR_THRESHOLD");

    /* execution_rank is otherwise unused when CHAMELEON_USE_MPI is not defined */
    (void)execution_rank;

    /* A negative factor would make every comparison below succeed, so it is
     * rejected and the default threshold is kept. */
    if (env != NULL) {
        int requested = atoi(env);
        if (requested >= 0)
            threshold = requested;
    }
    limit = (long long)threshold * sizeA2;

    /*  force execution on the rank owning the largest data (tile) */
    if ( sizeA1 > limit ){
        execution_rank = A1->get_rankof( A1, A1m, A1n );
        rank_changed = 1;
    }else if( sizeV > limit ){
        execution_rank = V->get_rankof( V, Vm, Vn );
        rank_changed = 1;
    }else if( sizeT > limit ){
        execution_rank = T->get_rankof( T, Tm, Tn );
        rank_changed = 1;
    }

    MORSE_BEGIN_ACCESS_DECLARATION;
    MORSE_ACCESS_RW(A1, A1m, A1n);
    MORSE_ACCESS_RW(A2, A2m, A2n);
    MORSE_ACCESS_R(V, Vm, Vn);
    MORSE_ACCESS_R(T, Tm, Tn);
    if (rank_changed)
        MORSE_RANK_CHANGED(execution_rank);
    MORSE_END_ACCESS_DECLARATION;

    /* The STARPU_VALUE entries are unpacked by the kernels in this order. */
    starpu_insert_task(
        starpu_mpi_codelet(codelet),
        STARPU_VALUE,    &side,              sizeof(MORSE_enum),
        STARPU_VALUE,    &trans,             sizeof(MORSE_enum),
        STARPU_VALUE,    &m1,                sizeof(int),
        STARPU_VALUE,    &n1,                sizeof(int),
        STARPU_VALUE,    &m2,                sizeof(int),
        STARPU_VALUE,    &n2,                sizeof(int),
        STARPU_VALUE,    &k,                 sizeof(int),
        STARPU_VALUE,    &ib,                sizeof(int),
        STARPU_RW,        RTBLKADDR(A1, MORSE_Complex64_t, A1m, A1n),
        STARPU_VALUE,    &lda1,              sizeof(int),
        STARPU_RW,        RTBLKADDR(A2, MORSE_Complex64_t, A2m, A2n),
        STARPU_VALUE,    &lda2,              sizeof(int),
        STARPU_R,         RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn),
        STARPU_VALUE,    &ldv,               sizeof(int),
        STARPU_R,         RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn),
        STARPU_VALUE,    &ldt,               sizeof(int),
        /* max( ib*nb, 2*ib*nb ) */
        STARPU_SCRATCH,   options->ws_worker,
        STARPU_VALUE,    &ldwork,            sizeof(int),
        STARPU_PRIORITY,  options->priority,
        STARPU_CALLBACK,  callback,
#if defined(CHAMELEON_USE_MPI)
        STARPU_EXECUTE_ON_NODE, execution_rank,
#endif
#if defined(CHAMELEON_CODELETS_HAVE_NAME)
        STARPU_NAME, "zttmqr",
#endif
        0);
}


214
#if !defined(CHAMELEON_SIMULATION)
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg)
{
    MORSE_enum side;
    MORSE_enum trans;
    int m1;
    int n1;
    int m2;
    int n2;
    int k;
    int ib;
    MORSE_Complex64_t *A1;
    int lda1;
    MORSE_Complex64_t *A2;
    int lda2;
    MORSE_Complex64_t *V;
    int ldv;
    MORSE_Complex64_t *T;
    int ldt;
    MORSE_Complex64_t *WORK;
    int ldwork;

    A1   = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
    A2   = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
    V    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
    T    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
240
    WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
241 242 243 244 245 246 247

    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
                               &lda1, &lda2, &ldv, &ldt, &ldwork);

    CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
}
Mathieu Faverge's avatar
Mathieu Faverge committed
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283

#if defined(CHAMELEON_USE_CUDA)
static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg)
{
    MORSE_enum side;
    MORSE_enum trans;
    int m1;
    int n1;
    int m2;
    int n2;
    int k;
    int ib;
    cuDoubleComplex *A1;
    int lda1;
    cuDoubleComplex *A2;
    int lda2;
    cuDoubleComplex *V;
    int ldv;
    cuDoubleComplex *T;
    int ldt;
    cuDoubleComplex *W, *WC;
    int ldwork;
    int ldworkc;

    A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
    A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
    V  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
    T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
    W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */

    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
                               &lda1, &lda2, &ldv, &ldt, &ldwork);

    WC = W + ib * (side == MorseLeft ? m1 : n1);
    ldworkc = (side == MorseLeft) ? m2 : ib;

284
    RUNTIME_getStream(stream);
Mathieu Faverge's avatar
Mathieu Faverge committed
285 286 287 288 289 290 291 292 293 294 295

    CUDA_zttmqr(
            side, trans, m1, n1, m2, n2, k, ib,
            A1, lda1, A2, lda2, V, ldv, T, ldt,
            W, ldwork, WC, ldworkc, stream );

#ifndef STARPU_CUDA_ASYNC
    cudaStreamSynchronize( stream );
#endif
}
#endif /* defined(CHAMELEON_USE_CUDA) */
296
#endif /* !defined(CHAMELEON_SIMULATION) */
297 298 299 300

/*
 * Codelet definition
 */
Mathieu Faverge's avatar
Mathieu Faverge committed
301
/* Passes the zttmqr CPU and CUDA kernel functions to the CODELETS macro.
 * NOTE(review): 5 presumably is the number of data buffers (A1, A2, V, T and
 * the workspace, i.e. descr[0..4] as read by both kernels) -- confirm against
 * the CODELETS macro definition. */
CODELETS(zttmqr, 5, cl_zttmqr_cpu_func, cl_zttmqr_cuda_func, STARPU_CUDA_ASYNC)