step6.c 8.49 KB
Newer Older
1
/**
2 3
 *
 * @file step6.c
4
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
5 6
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
Mathieu Faverge's avatar
Mathieu Faverge committed
7
 * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8
 *                      Univ. Bordeaux. All rights reserved.
9
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
10
 ***
11
 *
12
 * @brief Chameleon step6 example
13 14 15 16 17
 *
 * @version 1.0.0
 * @author Florent Pruvost
 * @date 2014-10-29
 *
18
 */
19 20 21
#include "step6.h"

/*
Mathieu Faverge's avatar
Mathieu Faverge committed
22
 * @brief step6 introduces how to use CHAMELEON with MPI.
23
 * @details This program is a copy of step5 with some additional parameters to
Mathieu Faverge's avatar
Mathieu Faverge committed
24
 * be set for the data distribution. To use this program properly CHAMELEON must
25 26 27 28 29 30 31
 * use StarPU Runtime system and MPI option must be activated at configure.
 * The data distribution used here is 2D block cyclic, see for example
 * http://www.netlib.org/scalapack/slug/node75.html for explanation.
 * The user can enter the parameters of the distribution grid at execution with
 * --p= and --q=
 */
int main(int argc, char *argv[]) {
32
    size_t N; // matrix order
33 34
    int NB;   // number of rows and columns in tiles
    int NRHS; // number of RHS vectors
35 36 37 38 39
    int NCPU; // number of cores to use
    int NGPU; // number of gpus (cuda devices) to use
    int GRID_P; // parameter of the 2D block cyclic distribution
    int GRID_Q; // parameter of the 2D block cyclic distribution
    int NMPIPROC = 1; // number of MPI processus
Mathieu Faverge's avatar
Mathieu Faverge committed
40
    int UPLO = ChamUpper; // where is stored L
41

Mathieu Faverge's avatar
Mathieu Faverge committed
42 43
    /* descriptors necessary for calling CHAMELEON tile interface  */
    CHAM_desc_t *descA = NULL, *descAC = NULL, *descB = NULL, *descX = NULL;
44 45 46 47 48 49 50 51

    /* declarations to time the program and evaluate performances */
    double fmuls, fadds, flops, gflops, cpu_time;

    /* variable to check the numerical results */
    double anorm, bnorm, xnorm, eps, res;
    int hres;

Mathieu Faverge's avatar
Mathieu Faverge committed
52
    /* CHAMELEON sequence uniquely identifies a set of asynchronous function calls
53
     * sharing common exception handling */
Mathieu Faverge's avatar
Mathieu Faverge committed
54 55 56
    RUNTIME_sequence_t *sequence = NULL;
    /* CHAMELEON request uniquely identifies each asynchronous function call */
    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
57 58 59 60 61 62 63 64

    /* initialize some parameters with default values */
    int iparam[IPARAM_SIZEOF];
    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
    init_iparam(iparam);

    /* read arguments */
    read_args(argc, argv, iparam);
65 66 67
    N    = iparam[IPARAM_N];
    NB   = iparam[IPARAM_NB];
    NRHS = iparam[IPARAM_NRHS];
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84

    /* compute the algorithm complexity to evaluate performances */
    fadds = (double)( FADDS_POTRF(N) + 2 * FADDS_TRSM(N,NRHS) );
    fmuls = (double)( FMULS_POTRF(N) + 2 * FMULS_TRSM(N,NRHS) );
    flops = 1e-9 * (fmuls + fadds);

    /* initialize the number of thread if not given by the user in argv
     * It makes sense only if this program is linked with pthread and
     * multithreaded BLAS and LAPACK */
    if ( iparam[IPARAM_THRDNBR] == -1 ) {
        get_thread_count( &(iparam[IPARAM_THRDNBR]) );
        /* reserve one thread par cuda device to optimize memory transfers */
        iparam[IPARAM_THRDNBR] -= iparam[IPARAM_NCUDAS];
    }
    NCPU = iparam[IPARAM_THRDNBR];
    NGPU = iparam[IPARAM_NCUDAS];

Mathieu Faverge's avatar
Mathieu Faverge committed
85
     /* Initialize CHAMELEON with main parameters */
Philippe Virouleau's avatar
Philippe Virouleau committed
86 87 88
    int rc = CHAMELEON_Init( NCPU, NGPU );
    if (rc != CHAMELEON_SUCCESS) {
        goto finalize;
89 90
    }

Mathieu Faverge's avatar
Mathieu Faverge committed
91 92 93
    /* set some specific parameters related to CHAMELEON: blocks size and inner-blocking size */
    CHAMELEON_Set(CHAMELEON_TILE_SIZE,        iparam[IPARAM_NB] );
    CHAMELEON_Set(CHAMELEON_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
94

95
#if defined(CHAMELEON_USE_MPI)
Mathieu Faverge's avatar
Mathieu Faverge committed
96
    NMPIPROC = CHAMELEON_Comm_size();
97 98 99 100 101 102 103 104 105 106 107 108 109
    /* Check P */
    if ( (iparam[IPARAM_P] > 1) &&
         (NMPIPROC % iparam[IPARAM_P] != 0) ) {
      fprintf(stderr, "ERROR: %d doesn't divide the number of MPI processus %d\n",
              iparam[IPARAM_P], NMPIPROC );
      return EXIT_FAILURE;
    }
#endif
    iparam[IPARAM_Q] = NMPIPROC / iparam[IPARAM_P];
    iparam[IPARAM_NMPI] = NMPIPROC;
    GRID_P = iparam[IPARAM_P];
    GRID_Q = iparam[IPARAM_Q];

Mathieu Faverge's avatar
Mathieu Faverge committed
110
    if ( CHAMELEON_My_Mpi_Rank() == 0 ){
111 112 113 114
        /* print informations to user */
        print_header( argv[0], iparam);
    }

Mathieu Faverge's avatar
Mathieu Faverge committed
115 116
    /* Initialize the structure required for CHAMELEON tile interface */
    CHAMELEON_Desc_Create(&descA, NULL, ChamRealDouble,
117 118
                      NB, NB, NB*NB, N, N, 0, 0, N, N,
                      GRID_P, GRID_Q);
Mathieu Faverge's avatar
Mathieu Faverge committed
119
    CHAMELEON_Desc_Create(&descB, NULL, ChamRealDouble,
120 121
                      NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS,
                      GRID_P, GRID_Q);
Mathieu Faverge's avatar
Mathieu Faverge committed
122
    CHAMELEON_Desc_Create(&descX, NULL, ChamRealDouble,
123 124
                      NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS,
                      GRID_P, GRID_Q);
Mathieu Faverge's avatar
Mathieu Faverge committed
125
    CHAMELEON_Desc_Create(&descAC, NULL, ChamRealDouble,
126 127
                      NB, NB, NB*NB, N, N, 0, 0, N, N,
                      GRID_P, GRID_Q);
128 129

    /* generate A matrix with random values such that it is spd */
Mathieu Faverge's avatar
Mathieu Faverge committed
130
    CHAMELEON_dplgsy_Tile( (double)N, ChamUpperLower, descA, 51 );
131 132

    /* generate RHS */
Mathieu Faverge's avatar
Mathieu Faverge committed
133
    CHAMELEON_dplrnt_Tile( descB, 5673 );
134 135

    /* copy A before facto. in order to check the result */
Mathieu Faverge's avatar
Mathieu Faverge committed
136
    CHAMELEON_dlacpy_Tile(ChamUpperLower, descA, descAC);
137 138 139

    /* copy B in X before solving
     * same sense as memcpy(X, B, N*NRHS*sizeof(double)) but for descriptors */
Mathieu Faverge's avatar
Mathieu Faverge committed
140
    CHAMELEON_dlacpy_Tile(ChamUpperLower, descB, descX);
141 142 143 144 145

    /************************************************************/
    /* solve the system AX = B using the Cholesky factorization */
    /************************************************************/

Mathieu Faverge's avatar
Mathieu Faverge committed
146
    cpu_time = -CHAMELEON_timer();
147

Mathieu Faverge's avatar
Mathieu Faverge committed
148
    CHAMELEON_Sequence_Create(&sequence);
149 150 151

    /* Cholesky factorization:
     * A is replaced by its factorization L or L^T depending on uplo */
Mathieu Faverge's avatar
Mathieu Faverge committed
152
    CHAMELEON_dpotrf_Tile_Async( UPLO, descA, sequence, &request );
153 154 155 156 157

    /* Solve:
     * B is stored in X on entry, X contains the result on exit.
     * Forward and back substitutions
     */
Mathieu Faverge's avatar
Mathieu Faverge committed
158
    CHAMELEON_dpotrs_Tile_Async( UPLO, descA, descX, sequence, &request);
159

160 161
    /* Ensure that all data processed on the gpus we are depending on are back
     * in main memory */
Mathieu Faverge's avatar
Mathieu Faverge committed
162 163
    CHAMELEON_Desc_Flush( descA, sequence );
    CHAMELEON_Desc_Flush( descX, sequence );
164

165 166
    /* Synchronization barrier (the runtime ensures that all submitted tasks
     * have been terminated */
Mathieu Faverge's avatar
Mathieu Faverge committed
167
    CHAMELEON_Sequence_Wait(sequence);
168

Philippe Virouleau's avatar
Philippe Virouleau committed
169 170 171 172
    rc = sequence->status;
    if ( rc != CHAMELEON_SUCCESS ) {
        fprintf(stderr, "Error in computation (%d)\n", rc);
        goto finalize;
173
    }
Mathieu Faverge's avatar
Mathieu Faverge committed
174
    CHAMELEON_Sequence_Destroy(sequence);
175

Mathieu Faverge's avatar
Mathieu Faverge committed
176
    cpu_time += CHAMELEON_timer();
177 178 179

    /* print informations to user */
    gflops = flops / cpu_time;
Mathieu Faverge's avatar
Mathieu Faverge committed
180
    if ( CHAMELEON_My_Mpi_Rank() == 0 ) {
181
        printf( "%9.3f %9.2f\n", cpu_time, gflops);
182
    }
183 184 185 186 187 188 189
    fflush( stdout );

    /************************************************************/
    /* check if solve is correct i.e. AX-B = 0                  */
    /************************************************************/

    /* compute norms to check the result */
Mathieu Faverge's avatar
Mathieu Faverge committed
190 191 192
    anorm = CHAMELEON_dlange_Tile( ChamInfNorm, descAC);
    bnorm = CHAMELEON_dlange_Tile( ChamInfNorm, descB);
    xnorm = CHAMELEON_dlange_Tile( ChamInfNorm, descX);
193 194

    /* compute A*X-B, store the result in B */
Mathieu Faverge's avatar
Mathieu Faverge committed
195
    CHAMELEON_dgemm_Tile( ChamNoTrans, ChamNoTrans,
196
                      1.0, descAC, descX, -1.0, descB );
Mathieu Faverge's avatar
Mathieu Faverge committed
197
    res = CHAMELEON_dlange_Tile( ChamInfNorm, descB );
198 199 200 201 202 203 204 205 206

    /* check residual and print a message */
    eps = LAPACKE_dlamch_work( 'e' );

    /*
     * if hres = 0 then the test succeed
     * else the test failed
     */
    hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 );
Mathieu Faverge's avatar
Mathieu Faverge committed
207
    if ( CHAMELEON_My_Mpi_Rank() == 0 ){
208
        printf( "   ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN\n");
209
        if (hres) {
210
            printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e FAILURE \n",
211 212 213 214
                    res, anorm, xnorm, bnorm,
                    res / N / eps / (anorm * xnorm + bnorm ));
        }
        else {
215
            printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e SUCCESS \n",
216 217 218
                    res, anorm, xnorm, bnorm,
                    res / N / eps / (anorm * xnorm + bnorm ));
        }
219 220 221
    }

    /* deallocate A, B, X, Acpy and associated descriptors descA, ... */
Mathieu Faverge's avatar
Mathieu Faverge committed
222 223 224 225
    CHAMELEON_Desc_Destroy( &descA );
    CHAMELEON_Desc_Destroy( &descB );
    CHAMELEON_Desc_Destroy( &descX );
    CHAMELEON_Desc_Destroy( &descAC );
226

Philippe Virouleau's avatar
Philippe Virouleau committed
227 228 229 230 231 232
finalize:
    /*
     * Required semicolon to have at least one inst
     * before the end of OpenMP block.
     */
    ;
Mathieu Faverge's avatar
Mathieu Faverge committed
233 234
    /* Finalize CHAMELEON */
    CHAMELEON_Finalize();
235 236 237

    return EXIT_SUCCESS;
}