step5.c 7.36 KB
Newer Older
1
/**
2 3
 *
 * @file step5.c
4
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
5 6
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
Mathieu Faverge's avatar
Mathieu Faverge committed
7
 * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8
 *                      Univ. Bordeaux. All rights reserved.
9
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
10
 ***
11
 *
12
 * @brief Chameleon step5 example
13 14 15 16 17
 *
 * @version 1.0.0
 * @author Florent Pruvost
 * @date 2014-10-29
 *
18
 */
19 20 21 22 23
#include "step5.h"

/*
 * @brief step5 shows how to set some important parameters.
 * @details This program is a copy of step4 but some additional parameters
Mathieu Faverge's avatar
Mathieu Faverge committed
24
 * are given to CHAMELEON by the user.
25 26 27 28 29 30 31 32 33
 * The parameters that can be set are:
 *  - number of Threads
 *  - number of GPUs
 *  - matrix size
 *  - block (tile) size
 *  - inner-blocking size
 *  - number of right-hand sides
 */
int main(int argc, char *argv[]) {
34
    size_t N; // matrix order
35 36
    int NB;   // number of rows and columns in tiles
    int NRHS; // number of RHS vectors
37 38
    int NCPU; // number of cores to use
    int NGPU; // number of gpus (cuda devices) to use
Mathieu Faverge's avatar
Mathieu Faverge committed
39
    int UPLO = ChamUpper; // where is stored L
40

Mathieu Faverge's avatar
Mathieu Faverge committed
41 42
    /* descriptors necessary for calling CHAMELEON tile interface  */
    CHAM_desc_t *descA = NULL, *descAC = NULL, *descB = NULL, *descX = NULL;
43 44 45 46 47 48 49 50

    /* declarations to time the program and evaluate performances */
    double fmuls, fadds, flops, gflops, cpu_time;

    /* variable to check the numerical results */
    double anorm, bnorm, xnorm, eps, res;
    int hres;

Mathieu Faverge's avatar
Mathieu Faverge committed
51
    /* CHAMELEON sequence uniquely identifies a set of asynchronous function calls
52
     * sharing common exception handling */
Mathieu Faverge's avatar
Mathieu Faverge committed
53 54 55
    RUNTIME_sequence_t *sequence = NULL;
    /* CHAMELEON request uniquely identifies each asynchronous function call */
    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86

    /* initialize some parameters with default values */
    int iparam[IPARAM_SIZEOF];
    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
    init_iparam(iparam);

    /* read arguments */
    read_args(argc, argv, iparam);
    N    = iparam[IPARAM_N];
    NB   = iparam[IPARAM_NB];
    NRHS = iparam[IPARAM_NRHS];

    /* compute the algorithm complexity to evaluate performances */
    fadds = (double)( FADDS_POTRF(N) + 2 * FADDS_TRSM(N,NRHS) );
    fmuls = (double)( FMULS_POTRF(N) + 2 * FMULS_TRSM(N,NRHS) );
    flops = 1e-9 * (fmuls + fadds);

    /* initialize the number of thread if not given by the user in argv
     * It makes sense only if this program is linked with pthread and
     * multithreaded BLAS and LAPACK */
    if ( iparam[IPARAM_THRDNBR] == -1 ) {
        get_thread_count( &(iparam[IPARAM_THRDNBR]) );
        /* reserve one thread par cuda device to optimize memory transfers */
        iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
    }
    NCPU = iparam[IPARAM_THRDNBR];
    NGPU = iparam[IPARAM_NCUDAS];

    /* print informations to user */
    print_header( argv[0], iparam);

Mathieu Faverge's avatar
Mathieu Faverge committed
87
     /* Initialize CHAMELEON with main parameters */
Philippe Virouleau's avatar
Philippe Virouleau committed
88 89 90
    int rc = CHAMELEON_Init( NCPU, NGPU );
    if (rc != CHAMELEON_SUCCESS) {
        goto finalize;
91 92
    }

Mathieu Faverge's avatar
Mathieu Faverge committed
93 94 95
    /* set some specific parameters related to CHAMELEON: blocks size and inner-blocking size */
    CHAMELEON_Set(CHAMELEON_TILE_SIZE,        iparam[IPARAM_NB] );
    CHAMELEON_Set(CHAMELEON_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
96

Mathieu Faverge's avatar
Mathieu Faverge committed
97 98
    /* Initialize the structure required for CHAMELEON tile interface */
    CHAMELEON_Desc_Create(&descA,  NULL, ChamRealDouble,
99
                      NB, NB,  NB*NB, N, N, 0, 0, N, N, 1, 1);
Mathieu Faverge's avatar
Mathieu Faverge committed
100
    CHAMELEON_Desc_Create(&descB,  NULL, ChamRealDouble,
101
                      NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
Mathieu Faverge's avatar
Mathieu Faverge committed
102
    CHAMELEON_Desc_Create(&descX,  NULL, ChamRealDouble,
103
                      NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
Mathieu Faverge's avatar
Mathieu Faverge committed
104
    CHAMELEON_Desc_Create(&descAC, NULL, ChamRealDouble,
105 106 107
                      NB, NB,  NB*NB, N, N, 0, 0, N, N, 1, 1);

    /* generate A matrix with random values such that it is spd */
Mathieu Faverge's avatar
Mathieu Faverge committed
108
    CHAMELEON_dplgsy_Tile( (double)N, ChamUpperLower, descA, 51 );
109 110

    /* generate RHS */
Mathieu Faverge's avatar
Mathieu Faverge committed
111
    CHAMELEON_dplrnt_Tile( descB, 5673 );
112 113

    /* copy A before facto. in order to check the result */
Mathieu Faverge's avatar
Mathieu Faverge committed
114
    CHAMELEON_dlacpy_Tile(ChamUpperLower, descA, descAC);
115 116 117

    /* copy B in X before solving
     * same sense as memcpy(X, B, N*NRHS*sizeof(double)) but for descriptors */
Mathieu Faverge's avatar
Mathieu Faverge committed
118
    CHAMELEON_dlacpy_Tile(ChamUpperLower, descB, descX);
119 120 121 122 123

    /************************************************************/
    /* solve the system AX = B using the Cholesky factorization */
    /************************************************************/

Mathieu Faverge's avatar
Mathieu Faverge committed
124
    cpu_time = -CHAMELEON_timer();
125

Mathieu Faverge's avatar
Mathieu Faverge committed
126
    CHAMELEON_Sequence_Create(&sequence);
127 128 129

    /* Cholesky factorization:
     * A is replaced by its factorization L or L^T depending on uplo */
Mathieu Faverge's avatar
Mathieu Faverge committed
130
    CHAMELEON_dpotrf_Tile_Async( UPLO, descA, sequence, &request );
131 132 133 134 135

    /* Solve:
     * B is stored in X on entry, X contains the result on exit.
     * Forward and back substitutions
     */
Mathieu Faverge's avatar
Mathieu Faverge committed
136
    CHAMELEON_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);
137

138 139
    /* Ensure that all data processed on the gpus we are depending on are back
     * in main memory */
Mathieu Faverge's avatar
Mathieu Faverge committed
140 141
    CHAMELEON_Desc_Flush( descA, sequence );
    CHAMELEON_Desc_Flush( descX, sequence );
142

143 144
    /* Synchronization barrier (the runtime ensures that all submitted tasks
     * have been terminated */
Mathieu Faverge's avatar
Mathieu Faverge committed
145
    CHAMELEON_Sequence_Wait(sequence);
146

Philippe Virouleau's avatar
Philippe Virouleau committed
147 148 149 150
    rc = sequence->status;
    if ( rc != CHAMELEON_SUCCESS ) {
        fprintf(stderr, "Error in computation (%d)\n", rc);
        goto finalize;
151
    }
Mathieu Faverge's avatar
Mathieu Faverge committed
152
    CHAMELEON_Sequence_Destroy(sequence);
153

Mathieu Faverge's avatar
Mathieu Faverge committed
154
    cpu_time += CHAMELEON_timer();
155 156 157 158 159 160 161 162 163 164 165

    /* print informations to user */
    gflops = flops / cpu_time;
    printf( "%9.3f %9.2f\n", cpu_time, gflops);
    fflush( stdout );

    /************************************************************/
    /* check if solve is correct i.e. AX-B = 0                  */
    /************************************************************/

    /* compute norms to check the result */
Mathieu Faverge's avatar
Mathieu Faverge committed
166 167 168
    anorm = CHAMELEON_dlange_Tile( ChamInfNorm, descAC);
    bnorm = CHAMELEON_dlange_Tile( ChamInfNorm, descB);
    xnorm = CHAMELEON_dlange_Tile( ChamInfNorm, descX);
169 170

    /* compute A*X-B, store the result in B */
Mathieu Faverge's avatar
Mathieu Faverge committed
171
    CHAMELEON_dgemm_Tile( ChamNoTrans, ChamNoTrans,
172
                      1.0, descAC, descX, -1.0, descB );
Mathieu Faverge's avatar
Mathieu Faverge committed
173
    res = CHAMELEON_dlange_Tile( ChamInfNorm, descB );
174 175 176 177 178 179 180 181 182 183

    /* check residual and print a message */
    eps = LAPACKE_dlamch_work( 'e' );

    /*
     * if hres = 0 then the test succeed
     * else the test failed
     */
    hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 );
    printf( "   ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN\n");
184
    if (hres) {
185
        printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e FAILURE \n",
186 187 188 189
                res, anorm, xnorm, bnorm,
                res / N / eps / (anorm * xnorm + bnorm ));
    }
    else {
190
        printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e SUCCESS \n",
191 192 193
                res, anorm, xnorm, bnorm,
                res / N / eps / (anorm * xnorm + bnorm ));
    }
194 195

    /* deallocate A, B, X, Acpy and associated descriptors descA, ... */
Mathieu Faverge's avatar
Mathieu Faverge committed
196 197 198 199
    CHAMELEON_Desc_Destroy( &descA );
    CHAMELEON_Desc_Destroy( &descB );
    CHAMELEON_Desc_Destroy( &descX );
    CHAMELEON_Desc_Destroy( &descAC );
200

Philippe Virouleau's avatar
Philippe Virouleau committed
201 202 203 204 205 206
finalize:
    /*
     * Required semicolon to have at least one inst
     * before the end of OpenMP block.
     */
    ;
Mathieu Faverge's avatar
Mathieu Faverge committed
207 208
    /* Finalize CHAMELEON */
    CHAMELEON_Finalize();
209 210 211

    return EXIT_SUCCESS;
}