diff --git a/compute/pzgemm.c b/compute/pzgemm.c index 7a94be1ec7c3a1e32bc1841b4ac0e79637c73fa3..4c41e64438be82566417209028177a56ac26950a 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -426,6 +426,7 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws, { CHAM_context_t *chamctxt; RUNTIME_option_t options; + cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -433,13 +434,21 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - if ( ws && ws->summa ) - { + switch( alg ) { + case ChamGemmAlgAuto: + case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */ + case ChamGemmAlgGeneric: + chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaC: chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C, &(ws->WA), &(ws->WB), &options ); - } - else { - chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaA: + chameleon_pzgemm_Astat( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + break; } RUNTIME_options_finalize( &options, chamctxt ); diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c index 48617938bb3c59e4c75716e5986d888cf3aedd1d..b50edf517487e469b41f59d3113408d631cea0f6 100644 --- a/compute/pzgepdf_qdwh.c +++ b/compute/pzgepdf_qdwh.c @@ -175,7 +175,7 @@ chameleon_pzgepdf_qdwh_init( const CHAM_desc_t *U, const CHAM_desc_t *H, /* * Allocate the data descriptors for the lookahead if needed */ - *gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, NULL, NULL, U ); + *gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, U, U, U ); return; } diff --git a/compute/zgemm.c b/compute/zgemm.c index e3eed82ec66a299b83c74bbdc274ad867950bc2f..325f27cd90dc3cbf8dee13f26020f613e98a501a 100644 --- a/compute/zgemm.c +++ b/compute/zgemm.c @@ -103,14 +103,77 @@ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA __attribute__((unused) } options = calloc( 1, sizeof(struct chameleon_pzgemm_s) ); - options->summa = 0; + options->alg = ChamGemmAlgAuto; + + /* + * If only one process, or if generic has been globally enforced, we switch + * to generic immediately. + */ + if ( ((C->p == 1) && (C->q == 1)) || + (chamctxt->generic_enabled == CHAMELEON_TRUE) ) + { + options->alg = ChamGemmAlgGeneric; + } - if ( ((C->p > 1) || (C->q > 1)) && - (C->get_rankof == chameleon_getrankof_2d) && - (chamctxt->generic_enabled != CHAMELEON_TRUE) ) + /* Look at environment variable is something enforces the variant. */ + if ( options->alg == ChamGemmAlgAuto ) + { + char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" ); + + if ( algostr ) { + if ( strcasecmp( algostr, "summa_c" ) == 0 ) { + options->alg = ChamGemmAlgSummaC; + } + else if ( strcasecmp( algostr, "summa_a" ) == 0 ) { + options->alg = ChamGemmAlgSummaA; + } + else if ( strcasecmp( algostr, "summa_b" ) == 0 ) { + options->alg = ChamGemmAlgSummaB; + } + else if ( strcasecmp( algostr, "generic" ) == 0 ) { + options->alg = ChamGemmAlgGeneric; + } + else if ( strcasecmp( algostr, "auto" ) == 0 ) { + options->alg = ChamGemmAlgAuto; + } + else { + fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" ); + } + } + chameleon_cleanenv( algostr ); + } + + /* Perform automatic choice if not already enforced. */ + if ( options->alg == ChamGemmAlgAuto ) + { + double sizeA, sizeB, sizeC; + double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */ + + /* Compute the average array per node for each matrix */ + sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q); + sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q); + sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio; + + if ( (sizeC > sizeA) && (sizeC > sizeB) ) { + options->alg = ChamGemmAlgSummaC; + } + else { + if ( sizeA > sizeB ) { + options->alg = ChamGemmAlgSummaA; + } + else { + options->alg = ChamGemmAlgSummaB; + } + } + } + + assert( options->alg != ChamGemmAlgAuto ); + + /* Now that we have decided which algorithm, let's allocate the required data structures. */ + if ( (options->alg == ChamGemmAlgSummaC ) && + (C->get_rankof == chameleon_getrankof_2d ) ) { int lookahead = chamctxt->lookahead; - options->summa = 1; chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), @@ -150,7 +213,7 @@ void CHAMELEON_zgemm_WS_Free( void *user_ws ) { struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws; - if ( ws->summa ) { + if ( ws->alg == ChamGemmAlgSummaC ) { chameleon_desc_destroy( &(ws->WA) ); chameleon_desc_destroy( &(ws->WB) ); } diff --git a/control/compute_z.h b/control/compute_z.h index d7953e9045a3e967781e8985f2ed851f2a529eae..760127d31d5eab6d4339316c2888f36e6a535666 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -30,7 +30,7 @@ * @brief Data structure to handle the GEMM workspaces */ struct chameleon_pzgemm_s { - int summa; + cham_gemm_t alg; CHAM_desc_t WA; CHAM_desc_t WB; };