step1.c 6.17 KB
Newer Older
1
/**
2 3
 *
 * @file step1.c
4
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
5 6
 * @copyright 2009-2014 The University of Tennessee and The University of
 *                      Tennessee Research Foundation. All rights reserved.
Mathieu Faverge's avatar
Mathieu Faverge committed
7
 * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
8
 *                      Univ. Bordeaux. All rights reserved.
9
 *
Mathieu Faverge's avatar
Mathieu Faverge committed
10
 ***
11
 *
12
 * @brief Chameleon step1 example
13 14 15 16 17
 *
 * @version 1.0.0
 * @author Florent Pruvost
 * @date 2014-10-29
 *
18
 */
19 20 21
#include "step1.h"

/*
Mathieu Faverge's avatar
Mathieu Faverge committed
22
 * @brief step1 introduces the simple CHAMELEON interface
23 24 25 26
 * @details This program solves a linear system AX=B with matrix A symmetric
 * positive definite.
 * The matrix A is first factorized using the Cholesky factorization, A = LL^T.
 * Then the solution X is calculated thanks to forward and back substitutions.
Mathieu Faverge's avatar
Mathieu Faverge committed
27
 * We use CHAMELEON routines with the LAPACK interface which means the routines
28 29 30 31
 * accepts the same matrix format as LAPACK.
 * Note that we copy the matrix to get it in our own tile structures.
 * This means you can get an overhead coming from copies.
 * The resulting code is very close to step0 so that users of CBLAS/LAPACKE
Mathieu Faverge's avatar
Mathieu Faverge committed
32
 * should not be lost. The only things new are the CHAMELEON_Init/Cham_Finalize
33
 * calls, necessary to initialize/finalize the runtime system and mpi, and
Mathieu Faverge's avatar
Mathieu Faverge committed
34
 * CHAMELEON_Set to give some specific parameters.
35 36
 * This code allows you to expoit parallelism coming from all the cores of your
 * computer and from gpus if you have properly linked with pthread and CUDA
37
 * ( + CUBLAS optionnaly ).
38 39 40 41 42 43 44
 * The precision is: double
 */
int main(int argc, char *argv[]) {
    size_t N;    // matrix order
    size_t NRHS; // number of RHS vectors
    int NCPU; // number of cores to use
    int NGPU; // number of gpus (cuda devices) to use
Mathieu Faverge's avatar
Mathieu Faverge committed
45
    int UPLO = ChamUpper; // where is stored L
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80

    /* declarations to time the program and evaluate performances */
    double fmuls, fadds, flops, gflops, cpu_time;

    /* variable to check the numerical results */
    double anorm, bnorm, xnorm, eps, res;
    int hres;

    /* initialize some parameters with default values */
    int iparam[IPARAM_SIZEOF];
    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
    init_iparam(iparam);

    /* read arguments */
    read_args(argc, argv, iparam);
    N    = iparam[IPARAM_N];
    NRHS = iparam[IPARAM_NRHS];

    /* compute the algorithm complexity to evaluate performances */
    fadds = (double)( FADDS_POTRF(N) + 2 * FADDS_TRSM(N,NRHS) );
    fmuls = (double)( FMULS_POTRF(N) + 2 * FMULS_TRSM(N,NRHS) );
    flops = 1e-9 * (fmuls + fadds);

    /* initialize the number of thread if not given by the user in argv
     * It makes sense only if this program is linked with pthread and
     * multithreaded BLAS and LAPACK */
    if ( iparam[IPARAM_THRDNBR] == -1 ) {
        get_thread_count( &(iparam[IPARAM_THRDNBR]) );
    }
    NCPU = iparam[IPARAM_THRDNBR];
    NGPU = 0;

    /* print informations to user */
    print_header( argv[0], iparam);

Mathieu Faverge's avatar
Mathieu Faverge committed
81
    /* Initialize CHAMELEON with main parameters */
Philippe Virouleau's avatar
Philippe Virouleau committed
82 83 84
    int rc = CHAMELEON_Init( NCPU, NGPU );
    if (rc != CHAMELEON_SUCCESS) {
        goto finalize;
85
    }
86 87

    /*
88
     * Allocate memory for our data using a C macro (see step1.h)
89 90 91 92
     *     - matrix A                   : size N x N
     *     - set of RHS vectors B       : size N x NRHS
     *     - set of solutions vectors X : size N x NRHS
     */
93 94 95 96
    double *A    = malloc( N * N    * sizeof(double) );
    double *Acpy = malloc( N * N    * sizeof(double) );
    double *B    = malloc( N * NRHS * sizeof(double) );
    double *X    = malloc( N * NRHS * sizeof(double) );
97 98

    /* generate A matrix with random values such that it is spd */
Mathieu Faverge's avatar
Mathieu Faverge committed
99
    CHAMELEON_dplgsy( (double)N, ChamUpperLower, N, A, N, 51 );
100 101

    /* generate RHS */
Mathieu Faverge's avatar
Mathieu Faverge committed
102
    CHAMELEON_dplrnt( N, NRHS, B, N, 5673 );
103 104 105 106 107 108 109 110 111 112 113

    /* copy A before facto. in order to check the result */
    memcpy(Acpy, A, N*N*sizeof(double));

    /* copy B in X before solving */
    memcpy(X, B, N*NRHS*sizeof(double));

    /************************************************************/
    /* solve the system AX = B using the Cholesky factorization */
    /************************************************************/

Mathieu Faverge's avatar
Mathieu Faverge committed
114
    cpu_time = -CHAMELEON_timer();
115 116 117

    /* Cholesky factorization:
     * A is replaced by its factorization L or L^T depending on uplo */
Mathieu Faverge's avatar
Mathieu Faverge committed
118
    CHAMELEON_dpotrf( UPLO, N, A, N );
119 120 121 122 123

    /* Solve:
     * B is stored in X on entry, X contains the result on exit.
     * Forward and back substitutions
     */
Mathieu Faverge's avatar
Mathieu Faverge committed
124
    CHAMELEON_dpotrs(UPLO, N, NRHS, A, N, X, N);
125

Mathieu Faverge's avatar
Mathieu Faverge committed
126
    cpu_time += CHAMELEON_timer();
127 128 129 130 131 132 133 134 135 136 137

    /* print informations to user */
    gflops = flops / cpu_time;
    printf( "%9.3f %9.2f\n", cpu_time, gflops);
    fflush( stdout );

    /************************************************************/
    /* check if solve is correct i.e. AX-B = 0                  */
    /************************************************************/

    /* compute norms to check the result */
Mathieu Faverge's avatar
Mathieu Faverge committed
138 139 140
    anorm = CHAMELEON_dlange( ChamInfNorm, N, N, Acpy, N);
    bnorm = CHAMELEON_dlange( ChamInfNorm, N, NRHS, B, N);
    xnorm = CHAMELEON_dlange( ChamInfNorm, N, NRHS, X, N);
141 142

    /* compute A*X-B, store the result in B */
Mathieu Faverge's avatar
Mathieu Faverge committed
143
    CHAMELEON_dgemm(ChamNoTrans, ChamNoTrans,
144
                N, NRHS, N, 1.0, Acpy, N, X, N, -1.0, B, N);
Mathieu Faverge's avatar
Mathieu Faverge committed
145
    res = CHAMELEON_dlange( ChamInfNorm, N, NRHS, B, N);
146 147 148 149 150 151 152 153 154 155

    /* check residual and print a message */
    eps = LAPACKE_dlamch_work( 'e' );

    /*
     * if hres = 0 then the test succeed
     * else the test failed
     */
    hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 );
    printf( "   ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN\n");
156
    if (hres) {
157 158 159
        printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e FAILURE \n",
            res, anorm, xnorm, bnorm,
            res / N / eps / (anorm * xnorm + bnorm ));
160 161
    }
    else {
162 163 164
        printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e SUCCESS \n",
            res, anorm, xnorm, bnorm,
            res / N / eps / (anorm * xnorm + bnorm ));
165
    }
166 167 168 169 170 171 172

    /* deallocate data */
    free(A);
    free(Acpy);
    free(B);
    free(X);

Philippe Virouleau's avatar
Philippe Virouleau committed
173 174 175 176 177 178
finalize:
    /*
     * Required semicolon to have at least one inst
     * before the end of OpenMP block.
     */
    ;
Mathieu Faverge's avatar
Mathieu Faverge committed
179 180
    /* Finalize CHAMELEON */
    CHAMELEON_Finalize();
181

Philippe Virouleau's avatar
Philippe Virouleau committed
182
    return rc;
183
}