From 7b4b742fbd215af2f86565af059f92f8c8b39f4c Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Wed, 31 Aug 2022 00:31:26 +0200 Subject: [PATCH] zsymm.c: Make the Async call really asynchronous by creating a ws as in GEMM --- compute/pzgepdf_qdwh.c | 2 +- compute/pzsymm.c | 64 ++++----- compute/zsymm.c | 230 +++++++++++++++++++++++++++++--- control/chameleon_zf77.c | 2 +- control/compute_z.h | 2 +- include/chameleon/chameleon_z.h | 4 +- testing/testing_zsymm.c | 11 +- 7 files changed, 254 insertions(+), 61 deletions(-) diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c index b50edf517..30d52379c 100644 --- a/compute/pzgepdf_qdwh.c +++ b/compute/pzgepdf_qdwh.c @@ -827,7 +827,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t break; #endif case ChamSymmetric: - chameleon_pzsymm( ChamRight, ChamUpper, + chameleon_pzsymm( gemm_ws, ChamRight, ChamUpper, 1., descU, &descA, 0., descH, sequence, request ); if ( info ) { diff --git a/compute/pzsymm.c b/compute/pzsymm.c index 583b2b809..e39d9f6e2 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -561,39 +561,22 @@ static inline void chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, RUNTIME_option_t *options ) { RUNTIME_sequence_t *sequence = options->sequence; - CHAM_desc_t WA, WB; - int lookahead; - - lookahead = chamctxt->lookahead; - chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, - C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, - NULL, NULL, NULL ); - chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, - C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, - 
NULL, NULL, NULL ); if (side == ChamLeft) { chameleon_pzsymm_summa_left( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } else { chameleon_pzsymm_summa_right( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } - RUNTIME_desc_flush( &WA, sequence ); - RUNTIME_desc_flush( &WB, sequence ); - RUNTIME_desc_flush( C, sequence ); - chameleon_sequence_wait( chamctxt, sequence ); - chameleon_desc_destroy( &WA ); - chameleon_desc_destroy( &WB ); + CHAMELEON_Desc_Flush( WA, sequence ); + CHAMELEON_Desc_Flush( WB, sequence ); } /** @@ -781,13 +764,15 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ * Parallel tile symmetric matrix-matrix multiplication. wrapper. */ void -chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo, +chameleon_pzsymm( struct chameleon_pzgemm_s *ws, + cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; + cham_gemm_t alg = (ws != NULL) ? 
ws->alg : ChamGemmAlgGeneric; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -795,18 +780,27 @@ chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - /* if ( ((C->p > 1) || (C->q > 1)) && */ - /* (C->get_rankof == chameleon_getrankof_2d) && */ - /* (chamctxt->generic_enabled != CHAMELEON_TRUE) ) */ - /* { */ - /* chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); */ - /* } */ - /* else */ - /* { */ - /* chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); */ - /* } */ - chameleon_pzsymm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); - + switch( alg ) { + case ChamGemmAlgAuto: + case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */ + case ChamGemmAlgGeneric: + chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaC: + chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, + &(ws->WA), &(ws->WB), &options ); + break; + + case ChamGemmAlgSummaA: + if ( side == ChamLeft ) { + chameleon_pzsymm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + break; + } RUNTIME_options_finalize( &options, chamctxt ); } diff --git a/compute/zsymm.c b/compute/zsymm.c index c4ff5fb0e..397c8f65c 100644 --- a/compute/zsymm.c +++ b/compute/zsymm.c @@ -24,6 +24,181 @@ */ #include "control/common.h" +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Allocate the required workspaces for asynchronous symm + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the symmetric matrix A appears on the + * left or right in the operation as 
follows: + * = ChamLeft: \f[ C = \alpha \times A \times B + \beta \times C \f] + * = ChamRight: \f[ C = \alpha \times B \times A + \beta \times C \f] + * + * @param[in] uplo + * Specifies whether the upper or lower triangular part of + * the symmetric matrix A is to be referenced as follows: + * = ChamLower: Only the lower triangular part of the + * symmetric matrix A is to be referenced. + * = ChamUpper: Only the upper triangular part of the + * symmetric matrix A is to be referenced. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] B + * The descriptor of the matrix B. + * + * @param[in] C + * The descriptor of the matrix C. + * + ******************************************************************************* + * + * @retval An allocated opaque pointer to use in CHAMELEON_zsymm_Tile_Async() + * and to free with CHAMELEON_zsymm_WS_Free(). + * + ******************************************************************************* + * + * @sa CHAMELEON_zsymm_Tile_Async + * @sa CHAMELEON_zsymm_WS_Free + * + */ +void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side __attribute__((unused)), + cham_uplo_t uplo __attribute__((unused)), + const CHAM_desc_t *A, + const CHAM_desc_t *B, + const CHAM_desc_t *C ) +{ + CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *options; + + chamctxt = chameleon_context_self(); + if ( chamctxt == NULL ) { + return NULL; + } + + options = calloc( 1, sizeof(struct chameleon_pzgemm_s) ); + options->alg = ChamGemmAlgAuto; + + /* + * If only one process, or if generic has been globally enforced, we switch + * to generic immediately. + */ + if ( ((C->p == 1) && (C->q == 1)) || + (chamctxt->generic_enabled == CHAMELEON_TRUE) ) + { + options->alg = ChamGemmAlgGeneric; + } + + /* Look at the environment variable to see if something enforces the variant. 
*/ + if ( options->alg == ChamGemmAlgAuto ) + { + char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" ); + + if ( algostr ) { + if ( strcasecmp( algostr, "summa_c" ) == 0 ) { + options->alg = ChamGemmAlgSummaC; + } + else if ( strcasecmp( algostr, "summa_a" ) == 0 ) { + options->alg = ChamGemmAlgSummaA; + } + else if ( strcasecmp( algostr, "summa_b" ) == 0 ) { + options->alg = ChamGemmAlgSummaB; + } + else if ( strcasecmp( algostr, "generic" ) == 0 ) { + options->alg = ChamGemmAlgGeneric; + } + else if ( strcasecmp( algostr, "auto" ) == 0 ) { + options->alg = ChamGemmAlgAuto; + } + else { + fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" ); + } + } + chameleon_cleanenv( algostr ); + } + + /* Perform automatic choice if not already enforced. */ + if ( options->alg == ChamGemmAlgAuto ) + { + double sizeA, sizeB, sizeC; + double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */ + + /* Compute the average array per node for each matrix */ + sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q); + sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q); + sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio; + + if ( (sizeC > sizeA) && (sizeC > sizeB) ) { + options->alg = ChamGemmAlgSummaC; + } + else { + if ( sizeA > sizeB ) { + options->alg = ChamGemmAlgSummaA; + } + else { + options->alg = ChamGemmAlgSummaB; + } + } + } + + assert( options->alg != ChamGemmAlgAuto ); + + /* Now that we have decided which algorithm, let's allocate the required data structures. 
*/ + if ( (options->alg == ChamGemmAlgSummaC ) && + (C->get_rankof == chameleon_getrankof_2d ) ) + { + int lookahead = chamctxt->lookahead; + + chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + } + + return (void*)options; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Free the allocated workspaces for asynchronous symm + * + ******************************************************************************* + * + * @param[in,out] user_ws + * On entry, the opaque pointer allocated by CHAMELEON_zsymm_WS_Alloc() + * On exit, all data are freed. 
+ * + ******************************************************************************* + * + * @sa CHAMELEON_zsymm_Tile_Async + * @sa CHAMELEON_zsymm_WS_Alloc + * + */ +void CHAMELEON_zsymm_WS_Free( void *user_ws ) +{ + struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws; + + if ( ws->alg == ChamGemmAlgSummaC ) { + chameleon_desc_destroy( &(ws->WA) ); + chameleon_desc_destroy( &(ws->WB) ); + } + free( ws ); +} + /** ******************************************************************************** * @@ -102,9 +277,9 @@ * */ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, - CHAMELEON_Complex64_t *B, int LDB, - CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) { int NB; int Am; @@ -115,6 +290,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -179,7 +355,8 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, C, NB, NB, LDC, N, M, N, sequence, &request ); /* Call the tile interface */ - CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request ); + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt ); + CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, @@ -192,6 +369,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, chameleon_sequence_wait( chamctxt, sequence ); /* Cleanup the temporary data */ + CHAMELEON_zsymm_WS_Free( ws 
); chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt ); chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt ); @@ -260,13 +438,14 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, * */ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; int status; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -275,13 +454,16 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo, } chameleon_sequence_create( chamctxt, &sequence ); - CHAMELEON_zsymm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request ); + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C ); + CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request ); CHAMELEON_Desc_Flush( A, sequence ); CHAMELEON_Desc_Flush( B, sequence ); CHAMELEON_Desc_Flush( C, sequence ); chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zsymm_WS_Free( ws ); + status = sequence->status; chameleon_sequence_destroy( chamctxt, sequence ); return status; @@ -316,11 +498,13 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo, * */ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + void *user_ws, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *ws; 
chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -391,16 +575,6 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo, } /* Check submatrix starting point */ - /* if ( (B->i != C->i) || (B->j != C->j) ) { */ - /* chameleon_error("CHAMELEON_zsymm_Tile_Async", "B and C submatrices doesn't match"); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ - /* if ( (A->i != A->j) || */ - /* ( (side == ChamLeft) && (A->i != B->i ) ) || */ - /* ( (side == ChamRight) && (A->i != B->j ) ) ) { */ - /* chameleon_error("CHAMELEON_zsymm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ if( (A->i != 0) || (A->j != 0) || (B->i != 0) || (B->j != 0) || (C->i != 0) || (C->j != 0) ) { @@ -415,7 +589,21 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo, return CHAMELEON_SUCCESS; } - chameleon_pzsymm( side, uplo, alpha, A, B, beta, C, sequence, request ); + if ( user_ws == NULL ) { + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C ); + } + else { + ws = user_ws; + } + chameleon_pzsymm( ws, side, uplo, alpha, A, B, beta, C, sequence, request ); + + if ( user_ws == NULL ) { + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( B, sequence ); + CHAMELEON_Desc_Flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zsymm_WS_Free( ws ); + } return CHAMELEON_SUCCESS; } diff --git a/control/chameleon_zf77.c b/control/chameleon_zf77.c index cf1b8d94e..3a30ddc22 100644 --- a/control/chameleon_zf77.c +++ b/control/chameleon_zf77.c @@ -840,7 +840,7 @@ void CHAMELEON_ZSYTRS_TILE_ASYNC(cham_uplo_t *uplo, CHAM_desc_t *A, CHAM_desc_t #endif void CHAMELEON_ZSYMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t 
*sequence, RUNTIME_request_t *request, int *info) -{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); } +{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); } void CHAMELEON_ZSYR2K_TILE_ASYNC(cham_uplo_t *uplo, cham_trans_t *trans, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info) { *info = CHAMELEON_zsyr2k_Tile_Async(*uplo, *trans, *alpha, A, B, *beta, C, sequence, request); } diff --git a/control/compute_z.h b/control/compute_z.h index 760127d31..32dbcca8a 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -106,7 +106,7 @@ void chameleon_pzplrnk(int K, CHAM_desc_t *C, unsigned long long int seedA, unsi void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzshift(int, int, int, CHAMELEON_Complex64_t *, int *, int, int, int, RUNTIME_sequence_t*, RUNTIME_request_t*); -void chameleon_pzsymm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzsymm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, 
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 3a800ec85..a7cbf43fb 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -247,7 +247,7 @@ int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t int CHAMELEON_zpotrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsysv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsytrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsyrk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsyr2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_ztpgqrt_Tile_Async( int L, CHAM_desc_t *V1, CHAM_desc_t *T1, 
CHAM_desc_t *V2, CHAM_desc_t *T2, CHAM_desc_t *Q1, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); @@ -312,6 +312,8 @@ int CHAMELEON_zunmqr_param_Tile_Async(const libhqr_tree_t *qrtree, cham_side_t s */ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA, cham_trans_t transB, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); void CHAMELEON_zgemm_WS_Free( void *ws ); +void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); +void CHAMELEON_zsymm_WS_Free( void *ws ); void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A ); void CHAMELEON_zcesca_WS_Free( void *ws ); void *CHAMELEON_zgram_WS_Alloc( const CHAM_desc_t *A ); diff --git a/testing/testing_zsymm.c b/testing/testing_zsymm.c index b4edb1e84..39f966301 100644 --- a/testing/testing_zsymm.c +++ b/testing/testing_zsymm.c @@ -57,6 +57,7 @@ testing_zsymm_desc( run_arg_list_t *args, int check ) /* Descriptors */ int Am; CHAM_desc_t *descA, *descB, *descC, *descCinit; + void *ws = NULL; bump = run_arg_get_double( args, "bump", bump ); alpha = run_arg_get_complex64( args, "alpha", alpha ); @@ -85,11 +86,15 @@ testing_zsymm_desc( run_arg_list_t *args, int check ) CHAMELEON_zplrnt_Tile( descB, seedB ); CHAMELEON_zplrnt_Tile( descC, seedC ); + if ( async ) { + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, descA, descB, descC ); + } + /* Calculates the product */ testing_start( &test_data ); if ( async ) { hres = CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC, - test_data.sequence, &test_data.request ); + ws, test_data.sequence, &test_data.request ); CHAMELEON_Desc_Flush( descA, test_data.sequence ); CHAMELEON_Desc_Flush( descB, test_data.sequence ); CHAMELEON_Desc_Flush( descC, test_data.sequence ); @@ -100,6 +105,10 @@ testing_zsymm_desc( run_arg_list_t *args, int check ) test_data.hres = hres; testing_stop( &test_data, flops_zsymm( side, M, N ) ); + if ( ws != NULL ) { + 
CHAMELEON_zsymm_WS_Free( ws ); + } + /* Checks the solution */ if ( check ) { CHAMELEON_Desc_Create( -- GitLab