diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c index 30d52379c6e56f28b10fd617578e73e4e9978317..4d8126f5c5150febf6cd7bc8f5d161f1a92f8010 100644 --- a/compute/pzgepdf_qdwh.c +++ b/compute/pzgepdf_qdwh.c @@ -818,7 +818,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t switch( mtxtype ) { #if defined(PRECISION_z) || defined(PRECISION_c) case ChamHermitian: - chameleon_pzhemm( ChamRight, ChamUpper, + chameleon_pzhemm( gemm_ws, ChamRight, ChamUpper, 1., descU, &descA, 0., descH, sequence, request ); if ( info ) { diff --git a/compute/pzhemm.c b/compute/pzhemm.c index 12b61a707c6c6dd05e5ecd14dc0edfc3378db15f..24f8bb2be7ce21c1db45d8582c0ef2d1572d3930 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -561,39 +561,22 @@ static inline void chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, RUNTIME_option_t *options ) { RUNTIME_sequence_t *sequence = options->sequence; - CHAM_desc_t WA, WB; - int lookahead; - - lookahead = chamctxt->lookahead; - chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, - C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, - NULL, NULL, NULL ); - chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, - C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, - NULL, NULL, NULL ); if (side == ChamLeft) { chameleon_pzhemm_summa_left( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } else { chameleon_pzhemm_summa_right( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } - RUNTIME_desc_flush( &WA, sequence ); - RUNTIME_desc_flush( &WB, sequence ); - RUNTIME_desc_flush( 
C, sequence ); - chameleon_sequence_wait( chamctxt, sequence ); - chameleon_desc_destroy( &WA ); - chameleon_desc_destroy( &WB ); + CHAMELEON_Desc_Flush( WA, sequence ); + CHAMELEON_Desc_Flush( WB, sequence ); } /** @@ -781,13 +764,15 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ * Parallel tile hermitian matrix-matrix multiplication. wrapper. */ void -chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo, +chameleon_pzhemm( struct chameleon_pzgemm_s *ws, + cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; + cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -795,15 +780,26 @@ chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - if ( ((C->p > 1) || (C->q > 1)) && - (C->get_rankof == chameleon_getrankof_2d) && - (chamctxt->generic_enabled != CHAMELEON_TRUE) ) - { - chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); - } - else - { + switch( alg ) { + case ChamGemmAlgAuto: + case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. 
*/ + case ChamGemmAlgGeneric: chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaC: + chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, + &(ws->WA), &(ws->WB), &options ); + break; + + case ChamGemmAlgSummaA: + if ( side == ChamLeft ) { + chameleon_pzhemm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + break; } RUNTIME_options_finalize( &options, chamctxt ); diff --git a/compute/zhemm.c b/compute/zhemm.c index 422bf9eecef6ea09acdff520e99751f5560b7e01..23c7fdf985ab84b2e604ca5dec633556ff35e68c 100644 --- a/compute/zhemm.c +++ b/compute/zhemm.c @@ -24,6 +24,181 @@ */ #include "control/common.h" +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Allocate the required workspaces for asynchronous hemm + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the hermitian matrix A appears on the + * left or right in the operation as follows: + * = ChamLeft: \f[ C = \alpha \times A \times B + \beta \times C \f] + * = ChamRight: \f[ C = \alpha \times B \times A + \beta \times C \f] + * + * @param[in] uplo + * Specifies whether the upper or lower triangular part of + * the hermitian matrix A is to be referenced as follows: + * = ChamLower: Only the lower triangular part of the + * hermitian matrix A is to be referenced. + * = ChamUpper: Only the upper triangular part of the + * hermitian matrix A is to be referenced. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] B + * The descriptor of the matrix B. + * + * @param[in] C + * The descriptor of the matrix C. 
+ *
+ *******************************************************************************
+ *
+ * @retval An allocated opaque pointer to use in CHAMELEON_zhemm_Tile_Async()
+ * and to free with CHAMELEON_zhemm_WS_Free().
+ * @retval NULL if no CHAMELEON context is active or if the allocation of the
+ * workspace structure fails.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhemm_Tile_Async
+ * @sa CHAMELEON_zhemm_WS_Free
+ *
+ */
+void *CHAMELEON_zhemm_WS_Alloc( cham_side_t        side __attribute__((unused)),
+                                cham_uplo_t        uplo __attribute__((unused)),
+                                const CHAM_desc_t *A,
+                                const CHAM_desc_t *B,
+                                const CHAM_desc_t *C )
+{
+    CHAM_context_t            *chamctxt;
+    struct chameleon_pzgemm_s *options;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        return NULL;
+    }
+
+    options = calloc( 1, sizeof(struct chameleon_pzgemm_s) );
+    if ( options == NULL ) {
+        /* Out of memory: report it to the caller instead of dereferencing NULL. */
+        return NULL;
+    }
+    options->alg = ChamGemmAlgAuto;
+
+    /*
+     * If only one process, or if generic has been globally enforced, we switch
+     * to generic immediately.
+     */
+    if ( ((C->p == 1) && (C->q == 1)) ||
+         (chamctxt->generic_enabled == CHAMELEON_TRUE) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
+
+    /* Look at the environment variable to see if something enforces the variant. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" );
+
+        if ( algostr ) {
+            if ( strcasecmp( algostr, "summa_c" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaC;
+            }
+            else if ( strcasecmp( algostr, "summa_a" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else if ( strcasecmp( algostr, "summa_b" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaB;
+            }
+            else if ( strcasecmp( algostr, "generic" ) == 0 ) {
+                options->alg = ChamGemmAlgGeneric;
+            }
+            else if ( strcasecmp( algostr, "auto" ) == 0 ) {
+                options->alg = ChamGemmAlgAuto;
+            }
+            else {
+                /* Unknown value: alg stays ChamGemmAlgAuto and is resolved below. */
+                fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" );
+            }
+        }
+        chameleon_cleanenv( algostr );
+    }
+
+    /* Perform automatic choice if not already enforced. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        double sizeA, sizeB, sizeC;
+        double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */
+
+        /* Average number of entries per process, m*n / (p*q), for each matrix */
+        sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q);
+        sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q);
+        sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio;
+
+        if ( (sizeC > sizeA) && (sizeC > sizeB) ) {
+            options->alg = ChamGemmAlgSummaC;
+        }
+        else {
+            if ( sizeA > sizeB ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else {
+                options->alg = ChamGemmAlgSummaB;
+            }
+        }
+    }
+
+    assert( options->alg != ChamGemmAlgAuto );
+
+    /*
+     * The SUMMA C workspaces are only allocated when C->get_rankof is
+     * chameleon_getrankof_2d. For any other distribution, fall back to the
+     * generic variant so that alg never advertises WA/WB descriptors that were
+     * not initialized: both chameleon_pzhemm() and CHAMELEON_zhemm_WS_Free()
+     * use WA/WB whenever alg is ChamGemmAlgSummaC.
+     */
+    if ( (options->alg == ChamGemmAlgSummaC) &&
+         (C->get_rankof != chameleon_getrankof_2d) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
+
+    /* Now that we have decided which algorithm, let's allocate the required data structures. */
+    if ( options->alg == ChamGemmAlgSummaC )
+    {
+        int lookahead = chamctxt->lookahead;
+
+        chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                             C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                             NULL, NULL, NULL );
+        chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                             C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                             NULL, NULL, NULL );
+    }
+
+    return (void*)options;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Free the allocated workspaces for asynchronous hemm
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] user_ws
+ *          On entry, the opaque pointer allocated by CHAMELEON_zhemm_WS_Alloc()
+ *          On exit, all data are freed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhemm_Tile_Async
+ * @sa CHAMELEON_zhemm_WS_Alloc
+ *
+ */
+void CHAMELEON_zhemm_WS_Free( void *user_ws )
+{
+    struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws;
+
+    /* Like free(), accept NULL (e.g. when CHAMELEON_zhemm_WS_Alloc() returned NULL) as a no-op. */
+    if ( ws == NULL ) {
+        return;
+    }
+
+    /* The WA/WB descriptors are destroyed only for the SUMMA C variant. */
+    if ( ws->alg == ChamGemmAlgSummaC ) {
+        chameleon_desc_destroy( &(ws->WA) );
+        chameleon_desc_destroy( &(ws->WB) );
+    }
+    free( ws );
+}
+
 /** ******************************************************************************** * @@ -102,9 +277,9 @@ * */ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, - CHAMELEON_Complex64_t *B, int LDB, - CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) { int NB; int Am; @@ -115,6 +290,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -158,7 +334,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, ((alpha == (CHAMELEON_Complex64_t)0.0) && beta == (CHAMELEON_Complex64_t)1.0)) return CHAMELEON_SUCCESS; - /* Tune NB depending on M, N & NRHS; Set NBNBSIZE */ + /* Tune NB depending on M, N & NRHS; Set NBNB */ status = chameleon_tune(CHAMELEON_FUNC_ZHEMM, M, N, 0); if (status != CHAMELEON_SUCCESS) { chameleon_error("CHAMELEON_zhemm", "chameleon_tune() failed"); @@ -179,7 +355,8 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, C, NB, NB, LDC, N, M, N, sequence, &request ); /* Call the tile interface */ - CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request ); + ws =
CHAMELEON_zhemm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt ); + CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, @@ -192,6 +369,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, chameleon_sequence_wait( chamctxt, sequence ); /* Cleanup the temporary data */ + CHAMELEON_zhemm_WS_Free( ws ); chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt ); chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt ); @@ -260,13 +438,14 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, * */ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; int status; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -275,13 +454,16 @@ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo, } chameleon_sequence_create( chamctxt, &sequence ); - CHAMELEON_zhemm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request ); + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, A, B, C ); + CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request ); CHAMELEON_Desc_Flush( A, sequence ); CHAMELEON_Desc_Flush( B, sequence ); CHAMELEON_Desc_Flush( C, sequence ); chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zhemm_WS_Free( ws ); + status = sequence->status; chameleon_sequence_destroy( chamctxt, sequence ); return status; @@ -316,11 +498,13 @@ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo, * */ int 
CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + void *user_ws, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -391,16 +575,6 @@ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo, } /* Check submatrix starting point */ - /* if ( (B->i != C->i) || (B->j != C->j) ) { */ - /* chameleon_error("CHAMELEON_zhemm_Tile_Async", "B and C submatrices doesn't match"); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ - /* if ( (A->i != A->j) || */ - /* ( (side == ChamLeft) && (A->i != B->i ) ) || */ - /* ( (side == ChamRight) && (A->i != B->j ) ) ) { */ - /* chameleon_error("CHAMELEON_zhemm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ if( (A->i != 0) || (A->j != 0) || (B->i != 0) || (B->j != 0) || (C->i != 0) || (C->j != 0) ) { @@ -415,7 +589,21 @@ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo, return CHAMELEON_SUCCESS; } - chameleon_pzhemm( side, uplo, alpha, A, B, beta, C, sequence, request ); + if ( user_ws == NULL ) { + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, A, B, C ); + } + else { + ws = user_ws; + } + chameleon_pzhemm( ws, side, uplo, alpha, A, B, beta, C, sequence, request ); + + if ( user_ws == NULL ) { + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( B, sequence ); + CHAMELEON_Desc_Flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zhemm_WS_Free( ws ); + } return 
CHAMELEON_SUCCESS; } diff --git a/control/chameleon_zf77.c b/control/chameleon_zf77.c index 3a30ddc22a2b624ec500193a9f1bfbbc4eddef9d..6132c2c9ec14cf6512a9f379c8a1dc3f2f14a534 100644 --- a/control/chameleon_zf77.c +++ b/control/chameleon_zf77.c @@ -757,7 +757,7 @@ void CHAMELEON_ZGETRS_NOPIV_TILE_ASYNC(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_s #if defined(PRECISION_z) || defined(PRECISION_c) void CHAMELEON_ZHEMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info) -{ *info = CHAMELEON_zhemm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); } +{ *info = CHAMELEON_zhemm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); } #endif #if defined(PRECISION_z) || defined(PRECISION_c) diff --git a/control/compute_z.h b/control/compute_z.h index 32dbcca8aee50db9eacbcce2c10624db16a75ca7..1ee61f62894310b1f41c442d47422c7729745e10 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -81,7 +81,7 @@ void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTI void chameleon_pzgetrf_reclap(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgetrf_rectil(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzhegst(int itype, cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzhemm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, 
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *E, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index a7cbf43fb84a6070832fac2dfd7752cadc20c689..6e576d21b1e231fee7d66277cfc72311c14d093b 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -213,7 +213,7 @@ int CHAMELEON_zgetrf_nopiv_Tile_Async(CHAM_desc_t *A, RUNTIME_sequence_t *sequen //int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_nopiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zherk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C, 
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zher2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); //int CHAMELEON_zheev_Tile_Async(cham_job_t jobz, cham_uplo_t uplo, CHAM_desc_t *A, double *W, CHAM_desc_t *T, CHAMELEON_Complex64_t *Q, int LDQ, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); @@ -312,6 +312,8 @@ int CHAMELEON_zunmqr_param_Tile_Async(const libhqr_tree_t *qrtree, cham_side_t s */ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA, cham_trans_t transB, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); void CHAMELEON_zgemm_WS_Free( void *ws ); +void *CHAMELEON_zhemm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); +void CHAMELEON_zhemm_WS_Free( void *ws ); void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); void CHAMELEON_zsymm_WS_Free( void *ws ); void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A ); diff --git a/testing/testing_zhemm.c b/testing/testing_zhemm.c index 38d91da96d7d96794e665a0551d196bd055e8779..e80471c8fd687bb301f5b703442f17a6ce9e2366 100644 --- a/testing/testing_zhemm.c +++ b/testing/testing_zhemm.c @@ -57,6 +57,7 @@ testing_zhemm_desc( run_arg_list_t *args, int check ) /* Descriptors */ int Am; CHAM_desc_t *descA, *descB, *descC, *descCinit; + void *ws = NULL; bump = run_arg_get_double( args, "bump", bump ); alpha = run_arg_get_complex64( args, "alpha", alpha ); @@ -85,11 +86,15 @@ testing_zhemm_desc( run_arg_list_t *args, int check ) CHAMELEON_zplrnt_Tile( descB, seedB ); CHAMELEON_zplrnt_Tile( descC, seedC ); + if ( async ) { + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, descA, descB, descC ); + } + /* Calculates the product */ testing_start( &test_data ); if ( async ) { hres = 
CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC, - test_data.sequence, &test_data.request ); + ws, test_data.sequence, &test_data.request ); CHAMELEON_Desc_Flush( descA, test_data.sequence ); CHAMELEON_Desc_Flush( descB, test_data.sequence ); CHAMELEON_Desc_Flush( descC, test_data.sequence ); @@ -100,6 +105,10 @@ testing_zhemm_desc( run_arg_list_t *args, int check ) test_data.hres = hres; testing_stop( &test_data, flops_zhemm( side, M, N ) ); + if ( ws != NULL ) { + CHAMELEON_zhemm_WS_Free( ws ); + } + /* Checks the solution */ if ( check ) { CHAMELEON_Desc_Create(