From 7b4b742fbd215af2f86565af059f92f8c8b39f4c Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 31 Aug 2022 00:31:26 +0200
Subject: [PATCH] zsymm.c: Make the Async call really asynchronous by creating
 a ws as in GEMM

---
 compute/pzgepdf_qdwh.c          |   2 +-
 compute/pzsymm.c                |  64 ++++-----
 compute/zsymm.c                 | 230 +++++++++++++++++++++++++++++---
 control/chameleon_zf77.c        |   2 +-
 control/compute_z.h             |   2 +-
 include/chameleon/chameleon_z.h |   4 +-
 testing/testing_zsymm.c         |  11 +-
 7 files changed, 254 insertions(+), 61 deletions(-)

diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c
index b50edf517..30d52379c 100644
--- a/compute/pzgepdf_qdwh.c
+++ b/compute/pzgepdf_qdwh.c
@@ -827,7 +827,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t
         break;
 #endif
     case ChamSymmetric:
-        chameleon_pzsymm( ChamRight, ChamUpper,
+        chameleon_pzsymm( gemm_ws, ChamRight, ChamUpper,
                           1., descU, &descA,
                           0., descH, sequence, request );
         if ( info ) {
diff --git a/compute/pzsymm.c b/compute/pzsymm.c
index 583b2b809..e39d9f6e2 100644
--- a/compute/pzsymm.c
+++ b/compute/pzsymm.c
@@ -561,39 +561,22 @@ static inline void
 chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
                         CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                         CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        CHAM_desc_t *WA, CHAM_desc_t *WB,
                         RUNTIME_option_t *options )
 {
     RUNTIME_sequence_t *sequence = options->sequence;
-    CHAM_desc_t WA, WB;
-    int lookahead;
-
-    lookahead = chamctxt->lookahead;
-    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
-                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
-                         NULL, NULL, NULL );
-    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
-                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
-                         NULL, NULL, NULL );
 
     if (side == ChamLeft) {
         chameleon_pzsymm_summa_left( chamctxt, uplo, alpha, A, B, beta, C,
-                                     &WA, &WB, options );
+                                     WA, WB, options );
     }
     else {
         chameleon_pzsymm_summa_right( chamctxt, uplo, alpha, A, B, beta, C,
-                                      &WA, &WB, options );
+                                      WA, WB, options );
     }
 
-    RUNTIME_desc_flush( &WA, sequence );
-    RUNTIME_desc_flush( &WB, sequence );
-    RUNTIME_desc_flush(  C,  sequence );
-    chameleon_sequence_wait( chamctxt, sequence );
-    chameleon_desc_destroy( &WA );
-    chameleon_desc_destroy( &WB );
+    CHAMELEON_Desc_Flush( WA, sequence );
+    CHAMELEON_Desc_Flush( WB, sequence );
 }
 
 /**
@@ -781,13 +764,15 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_
  *  Parallel tile symmetric matrix-matrix multiplication. wrapper.
  */
 void
-chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo,
+chameleon_pzsymm( struct chameleon_pzgemm_s *ws,
+                  cham_side_t side, cham_uplo_t uplo,
                   CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                   CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
                   RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_option_t options;
+    cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -795,18 +780,27 @@ chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo,
     }
     RUNTIME_options_init( &options, chamctxt, sequence, request );
 
-    /* if ( ((C->p > 1) || (C->q > 1)) && */
-    /*      (C->get_rankof == chameleon_getrankof_2d) && */
-    /*      (chamctxt->generic_enabled != CHAMELEON_TRUE) ) */
-    /* { */
-    /*     chameleon_pzsymm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options ); */
-    /* } */
-    /* else */
-    /* { */
-    /*     chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); */
-    /* } */
-    chameleon_pzsymm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
-
+    switch( alg ) {
+    case ChamGemmAlgAuto:
+    case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */
+    case ChamGemmAlgGeneric:
+        chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        break;
+
+    case ChamGemmAlgSummaC:
+        chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C,
+                                &(ws->WA), &(ws->WB), &options );
+        break;
+
+    case ChamGemmAlgSummaA:
+        if ( side == ChamLeft ) {
+            chameleon_pzsymm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        }
+        else {
+            chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        }
+        break;
+    }
 
     RUNTIME_options_finalize( &options, chamctxt );
 }
diff --git a/compute/zsymm.c b/compute/zsymm.c
index c4ff5fb0e..397c8f65c 100644
--- a/compute/zsymm.c
+++ b/compute/zsymm.c
@@ -24,6 +24,181 @@
  */
 #include "control/common.h"
 
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Select the symm algorithm and allocate its workspaces for asynchronous symm
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *          Specifies whether the symmetric matrix A appears on the
+ *          left or right in the operation as follows:
+ *          = ChamLeft:      \f[ C = \alpha \times A \times B + \beta \times C \f]
+ *          = ChamRight:     \f[ C = \alpha \times B \times A + \beta \times C \f]
+ *
+ * @param[in] uplo
+ *          Specifies whether the upper or lower triangular part of
+ *          the symmetric matrix A is to be referenced as follows:
+ *          = ChamLower: Only the lower triangular part of the
+ *                symmetric matrix A is to be referenced.
+ *          = ChamUpper: Only the upper triangular part of the
+ *                symmetric matrix A is to be referenced.
+ *
+ * @param[in] A
+ *          The descriptor of the matrix A.
+ *
+ * @param[in] B
+ *          The descriptor of the matrix B.
+ *
+ * @param[in] C
+ *          The descriptor of the matrix C.
+ *
+ *******************************************************************************
+ *
+ * @retval An opaque pointer to pass to CHAMELEON_zsymm_Tile_Async() and to free
+ * with CHAMELEON_zsymm_WS_Free(); NULL without context or on allocation failure.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zsymm_Tile_Async
+ * @sa CHAMELEON_zsymm_WS_Free
+ *
+ */
+void *CHAMELEON_zsymm_WS_Alloc( cham_side_t        side __attribute__((unused)),
+                                cham_uplo_t        uplo __attribute__((unused)),
+                                const CHAM_desc_t *A,
+                                const CHAM_desc_t *B,
+                                const CHAM_desc_t *C )
+{
+    CHAM_context_t            *chamctxt = chameleon_context_self();
+    struct chameleon_pzgemm_s *options;
+
+    if ( chamctxt == NULL ) {
+        return NULL;
+    }
+
+    options = calloc( 1, sizeof(struct chameleon_pzgemm_s) );
+    if ( options == NULL ) {
+        return NULL;
+    }
+    options->alg = ChamGemmAlgAuto;
+
+    /* With a single process, or generic globally enforced, switch to generic now. */
+    if ( ((C->p == 1) && (C->q == 1)) ||
+         (chamctxt->generic_enabled == CHAMELEON_TRUE) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
+
+    /* Check whether the CHAMELEON_GEMM_ALGO environment variable enforces a variant. */
+    if ( options->alg == ChamGemmAlgAuto ) {
+        char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" );
+
+        if ( algostr ) {
+            if ( strcasecmp( algostr, "summa_c" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaC;
+            }
+            else if ( strcasecmp( algostr, "summa_a" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else if ( strcasecmp( algostr, "summa_b" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaB;
+            }
+            else if ( strcasecmp( algostr, "generic" ) == 0 ) {
+                options->alg = ChamGemmAlgGeneric;
+            }
+            else if ( strcasecmp( algostr, "auto" ) == 0 ) {
+                options->alg = ChamGemmAlgAuto;
+            }
+            else {
+                fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" );
+            }
+        }
+        chameleon_cleanenv( algostr );
+    }
+
+    /* Perform automatic choice if not already enforced. */
+    if ( options->alg == ChamGemmAlgAuto ) {
+        double sizeA, sizeB, sizeC;
+        double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */
+
+        /* Compute the average number of elements per node for each matrix */
+        sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q);
+        sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q);
+        sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio;
+
+        if ( (sizeC > sizeA) && (sizeC > sizeB) ) {
+            options->alg = ChamGemmAlgSummaC;
+        }
+        else if ( sizeA > sizeB ) {
+            options->alg = ChamGemmAlgSummaA;
+        }
+        else {
+            options->alg = ChamGemmAlgSummaB;
+        }
+    }
+
+    assert( options->alg != ChamGemmAlgAuto );
+
+    /* SUMMA-C would use WA/WB left unallocated if C is not 2D block-cyclic: go generic. */
+    if ( (options->alg == ChamGemmAlgSummaC) &&
+         (C->get_rankof != chameleon_getrankof_2d) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
+
+    /* Now that the algorithm is final, allocate the WA/WB workspaces SUMMA-C uses. */
+    if ( options->alg == ChamGemmAlgSummaC ) {
+        int lookahead = chamctxt->lookahead;
+
+        chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                             C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                             NULL, NULL, NULL );
+        chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                             C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                             NULL, NULL, NULL );
+    }
+
+    return options;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Free the allocated workspaces for asynchronous symm
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] user_ws
+ *          On entry, the opaque pointer allocated by CHAMELEON_zsymm_WS_Alloc(),
+ *          or NULL, in which case nothing is done. On exit, all data are freed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zsymm_Tile_Async
+ * @sa CHAMELEON_zsymm_WS_Alloc
+ *
+ */
+void CHAMELEON_zsymm_WS_Free( void *user_ws )
+{
+    struct chameleon_pzgemm_s *ws = user_ws;
+
+    if ( (ws != NULL) && (ws->alg == ChamGemmAlgSummaC) ) { /* Alloc may return NULL */
+        chameleon_desc_destroy( &(ws->WA) );
+        chameleon_desc_destroy( &(ws->WB) );
+    }
+    free( ws ); /* free(NULL) is a no-op */
+}
+
 /**
  ********************************************************************************
  *
@@ -102,9 +277,9 @@
  *
  */
 int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
-                 CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
-                 CHAMELEON_Complex64_t *B, int LDB,
-                 CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
+                     CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
+                                                  CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
 {
     int NB;
     int Am;
@@ -115,6 +290,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
     RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    void *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -179,7 +355,8 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
                      C, NB, NB, LDC, N, M,  N, sequence, &request );
 
     /* Call the tile interface */
-    CHAMELEON_zsymm_Tile_Async(  side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request );
+    ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt );
+    CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request );
 
     /* Submit the matrix conversion back */
     chameleon_ztile2lap( chamctxt, &descAl, &descAt,
@@ -192,6 +369,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
     chameleon_sequence_wait( chamctxt, sequence );
 
     /* Cleanup the temporary data */
+    CHAMELEON_zsymm_WS_Free( ws );
     chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
     chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt );
     chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt );
@@ -260,13 +438,14 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
  *
  */
 int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo,
-                      CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                      CHAMELEON_Complex64_t beta,  CHAM_desc_t *C )
+                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                          CHAMELEON_Complex64_t beta,  CHAM_desc_t *C )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
     RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
     int status;
+    void *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -275,13 +454,16 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo,
     }
     chameleon_sequence_create( chamctxt, &sequence );
 
-    CHAMELEON_zsymm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request );
+    ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C );
+    CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request );
 
     CHAMELEON_Desc_Flush( A, sequence );
     CHAMELEON_Desc_Flush( B, sequence );
     CHAMELEON_Desc_Flush( C, sequence );
 
     chameleon_sequence_wait( chamctxt, sequence );
+    CHAMELEON_zsymm_WS_Free( ws );
+
     status = sequence->status;
     chameleon_sequence_destroy( chamctxt, sequence );
     return status;
@@ -316,11 +498,13 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo,
  *
  */
 int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
-                            CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                            CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
-                            RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+                                CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                                CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                                void *user_ws,
+                                RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
+    struct chameleon_pzgemm_s *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -391,16 +575,6 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
     }
 
     /* Check submatrix starting point */
-    /* if ( (B->i != C->i) || (B->j != C->j) ) { */
-    /*     chameleon_error("CHAMELEON_zsymm_Tile_Async", "B and C submatrices doesn't match"); */
-    /*     return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */
-    /* } */
-    /* if ( (A->i != A->j) ||  */
-    /*          ( (side == ChamLeft)  && (A->i != B->i ) ) ||  */
-    /*          ( (side == ChamRight) && (A->i != B->j ) ) ) { */
-    /*     chameleon_error("CHAMELEON_zsymm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */
-    /*     return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */
-    /* } */
     if( (A->i != 0) || (A->j != 0) ||
         (B->i != 0) || (B->j != 0) ||
         (C->i != 0) || (C->j != 0) ) {
@@ -415,7 +589,21 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
         return CHAMELEON_SUCCESS;
     }
 
-    chameleon_pzsymm( side, uplo, alpha, A, B, beta, C, sequence, request );
+    if ( user_ws == NULL ) {
+        ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C );
+    }
+    else {
+        ws = user_ws;
+    }
 
+    chameleon_pzsymm( ws, side, uplo, alpha, A, B, beta, C, sequence, request );
+
+    if ( user_ws == NULL ) {
+        CHAMELEON_Desc_Flush( A, sequence );
+        CHAMELEON_Desc_Flush( B, sequence );
+        CHAMELEON_Desc_Flush( C, sequence );
+        chameleon_sequence_wait( chamctxt, sequence );
+        CHAMELEON_zsymm_WS_Free( ws );
+    }
     return CHAMELEON_SUCCESS;
 }
diff --git a/control/chameleon_zf77.c b/control/chameleon_zf77.c
index cf1b8d94e..3a30ddc22 100644
--- a/control/chameleon_zf77.c
+++ b/control/chameleon_zf77.c
@@ -840,7 +840,7 @@ void CHAMELEON_ZSYTRS_TILE_ASYNC(cham_uplo_t *uplo, CHAM_desc_t *A, CHAM_desc_t
 #endif
 
 void CHAMELEON_ZSYMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info)
-{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); }
+{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); }
 
 void CHAMELEON_ZSYR2K_TILE_ASYNC(cham_uplo_t *uplo, cham_trans_t *trans, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info)
 { *info = CHAMELEON_zsyr2k_Tile_Async(*uplo, *trans, *alpha, A, B, *beta, C, sequence, request); }
diff --git a/control/compute_z.h b/control/compute_z.h
index 760127d31..32dbcca8a 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -106,7 +106,7 @@ void chameleon_pzplrnk(int K, CHAM_desc_t *C, unsigned long long int seedA, unsi
 void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzshift(int, int, int, CHAMELEON_Complex64_t *, int *, int, int, int, RUNTIME_sequence_t*, RUNTIME_request_t*);
-void chameleon_pzsymm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+void chameleon_pzsymm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta,  CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h
index 3a800ec85..a7cbf43fb 100644
--- a/include/chameleon/chameleon_z.h
+++ b/include/chameleon/chameleon_z.h
@@ -247,7 +247,7 @@ int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t
 int CHAMELEON_zpotrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsysv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsytrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsyrk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsyr2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_ztpgqrt_Tile_Async( int L, CHAM_desc_t *V1, CHAM_desc_t *T1, CHAM_desc_t *V2, CHAM_desc_t *T2, CHAM_desc_t *Q1, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
@@ -312,6 +312,8 @@ int CHAMELEON_zunmqr_param_Tile_Async(const libhqr_tree_t *qrtree, cham_side_t s
  */
 void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA, cham_trans_t transB, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C );
 void  CHAMELEON_zgemm_WS_Free( void *ws );
+void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C );
+void  CHAMELEON_zsymm_WS_Free( void *ws );
 void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A );
 void  CHAMELEON_zcesca_WS_Free( void *ws );
 void *CHAMELEON_zgram_WS_Alloc( const CHAM_desc_t *A );
diff --git a/testing/testing_zsymm.c b/testing/testing_zsymm.c
index b4edb1e84..39f966301 100644
--- a/testing/testing_zsymm.c
+++ b/testing/testing_zsymm.c
@@ -57,6 +57,7 @@ testing_zsymm_desc( run_arg_list_t *args, int check )
     /* Descriptors */
     int          Am;
     CHAM_desc_t *descA, *descB, *descC, *descCinit;
+    void        *ws = NULL;
 
     bump  = run_arg_get_double( args, "bump", bump );
     alpha = run_arg_get_complex64( args, "alpha", alpha );
@@ -85,11 +86,15 @@ testing_zsymm_desc( run_arg_list_t *args, int check )
     CHAMELEON_zplrnt_Tile( descB, seedB );
     CHAMELEON_zplrnt_Tile( descC, seedC );
 
+    if ( async ) {
+        ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, descA, descB, descC );
+    }
+
     /* Calculates the product */
     testing_start( &test_data );
     if ( async ) {
         hres = CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC,
-                                           test_data.sequence, &test_data.request );
+                                           ws, test_data.sequence, &test_data.request );
         CHAMELEON_Desc_Flush( descA, test_data.sequence );
         CHAMELEON_Desc_Flush( descB, test_data.sequence );
         CHAMELEON_Desc_Flush( descC, test_data.sequence );
@@ -100,6 +105,10 @@ testing_zsymm_desc( run_arg_list_t *args, int check )
     test_data.hres = hres;
     testing_stop( &test_data, flops_zsymm( side, M, N ) );
 
+    if ( ws != NULL ) {
+        CHAMELEON_zsymm_WS_Free( ws );
+    }
+
     /* Checks the solution */
     if ( check ) {
         CHAMELEON_Desc_Create(
-- 
GitLab