diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index c5fdfdf4eb7bbbb1374dac51452782b37858c84b..5bf0e234b4d4cce6339cfa7293a38eb3624585fc 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -24,7 +24,7 @@ * @author Alycia Lisito * @author Romain Peressoni * @author Matthieu Kuhn - * @date 2023-08-31 + * @date 2023-09-11 * @precisions normal z -> c d s * */ @@ -517,4 +517,21 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options, CHAM_desc_t *A, int Am, int An, CHAM_ipiv_t *ws ); +void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options, + int h, int m0, int ib, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ws ); + +void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options, + int h, int m0, int ib, + CHAM_desc_t *A, int Am, int An, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ws ); + +void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, + int m, int n, int h, int ib, + CHAM_desc_t *U, int Um, int Un, + CHAM_ipiv_t *ws ); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index f011e6d9693672653d8301a4743b314b65ed8d2b..239327831a8e9d11378f7a251136bb6451054a73 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -24,7 +24,7 @@ # @author Florent Pruvost # @author Philippe Virouleau # @author Matthieu Kuhn -# @date 2023-08-31 +# @date 2023-09-11 # ### @@ -68,6 +68,7 @@ set(CODELETS_ZSRC codelets/codelet_zgetrf_nopiv.c codelets/codelet_zgetrf_nopiv_percol.c codelets/codelet_zgetrf_percol.c + codelets/codelet_zgetrf_blocked.c codelets/codelet_zhe2ge.c codelets/codelet_zherfb.c codelets/codelet_zhessq.c diff --git a/runtime/openmp/codelets/codelet_zgetrf_blocked.c b/runtime/openmp/codelets/codelet_zgetrf_blocked.c new file mode 100644 index 0000000000000000000000000000000000000000..5e60b94baaae73ca9cb6f3e5f0557879d1db2039 --- /dev/null +++ 
b/runtime/openmp/codelets/codelet_zgetrf_blocked.c
@@ -0,0 +1,78 @@
+/**
+ *
+ * @file openmp/codelet_zgetrf_blocked.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgetrf_blocked OpenMP codelets
+ *
+ * @version 1.3.0
+ * @comment Codelets to perform panel factorization with partial pivoting
+ *
+ * @author Mathieu Faverge
+ * @date 2023-09-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
+                                      int h, int m0, int ib,
+                                      CHAM_desc_t *A, int Am, int An,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* OpenMP stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)h;
+    (void)m0;
+    (void)ib;
+    (void)A;
+    (void)Am;
+    (void)An;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
+
+void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
+                                         int h, int m0, int ib,
+                                         CHAM_desc_t *A, int Am, int An,
+                                         CHAM_desc_t *U, int Um, int Un,
+                                         CHAM_ipiv_t *ipiv )
+{   /* OpenMP stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)h;
+    (void)m0;
+    (void)ib;
+    (void)A;
+    (void)Am;
+    (void)An;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
+
+void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
+                                      int m, int n, int h, int ib,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* OpenMP stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)m;
+    (void)n;
+    (void)h;
+    (void)ib;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
diff --git a/runtime/parsec/codelets/codelet_zgetrf_blocked.c b/runtime/parsec/codelets/codelet_zgetrf_blocked.c
new file mode 100644
index 0000000000000000000000000000000000000000..46fa6f40aca7cd45923546e019d4091c36c54c23
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zgetrf_blocked.c
@@ -0,0 +1,78 @@
+/**
+ *
+ * @file parsec/codelet_zgetrf_blocked.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux.
All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgetrf_blocked Parsec codelets
+ *
+ * @version 1.3.0
+ * @comment Codelets to perform panel factorization with partial pivoting
+ *
+ * @author Mathieu Faverge
+ * @date 2023-09-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
+                                      int h, int m0, int ib,
+                                      CHAM_desc_t *A, int Am, int An,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* PaRSEC stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)h;
+    (void)m0;
+    (void)ib;
+    (void)A;
+    (void)Am;
+    (void)An;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
+
+void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
+                                         int h, int m0, int ib,
+                                         CHAM_desc_t *A, int Am, int An,
+                                         CHAM_desc_t *U, int Um, int Un,
+                                         CHAM_ipiv_t *ipiv )
+{   /* PaRSEC stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)h;
+    (void)m0;
+    (void)ib;
+    (void)A;
+    (void)Am;
+    (void)An;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
+
+void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
+                                      int m, int n, int h, int ib,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* PaRSEC stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)m;
+    (void)n;
+    (void)h;
+    (void)ib;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
diff --git a/runtime/quark/codelets/codelet_zgetrf_blocked.c b/runtime/quark/codelets/codelet_zgetrf_blocked.c
new file mode 100644
index 0000000000000000000000000000000000000000..3fbbc95c35a99fc650bd724b57476b95bb325c71
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zgetrf_blocked.c
@@ -0,0 +1,78 @@
+/**
+ *
+ * @file quark/codelet_zgetrf_blocked.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgetrf_blocked Quark codelets
+ *
+ * @version 1.3.0
+ * @comment Codelets to perform panel factorization with partial pivoting
+ *
+ * @author Mathieu Faverge
+ * @date 2023-09-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
+                                      int h, int m0, int ib,
+                                      CHAM_desc_t *A, int Am, int An,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* Quark stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)h;
+    (void)m0;
+    (void)ib;
+    (void)A;
+    (void)Am;
+    (void)An;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
+
+void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
+                                         int h, int m0, int ib,
+                                         CHAM_desc_t *A, int Am, int An,
+                                         CHAM_desc_t *U, int Um, int Un,
+                                         CHAM_ipiv_t *ipiv )
+{   /* Quark stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)h;
+    (void)m0;
+    (void)ib;
+    (void)A;
+    (void)Am;
+    (void)An;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
+
+void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
+                                      int m, int n, int h, int ib,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* Quark stub: this body submits no task */
+    assert( 0 );   /* always trips when assertions are enabled (NDEBUG not defined) */
+    (void)options; /* void casts: every parameter is deliberately left unused */
+    (void)m;
+    (void)n;
+    (void)h;
+    (void)ib;
+    (void)U;
+    (void)Um;
+    (void)Un;
+    (void)ipiv;
+}
diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
new file mode 100644
index 0000000000000000000000000000000000000000..6e4662060aa66a18945561352d9ae7fd3e03824e
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
@@ -0,0 +1,270 @@
+/**
+ *
+ * @file starpu/codelet_zgetrf_blocked.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgetrf_blocked StarPU codelets
+ *
+ * @version 1.3.0
+ * @comment Codelets to perform panel factorization with partial pivoting
+ *
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-09-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+#include <coreblas/cblas_wrapper.h>
+
+CHAMELEON_CL_CB( zgetrf_blocked_diag, cti_handle_get_m(task->handles[0]), 0, 0, M );
+CHAMELEON_CL_CB( zgetrf_blocked_offdiag, cti_handle_get_m(task->handles[0]), 0, 0, M );
+CHAMELEON_CL_CB( zgetrf_blocked_trsm, cti_handle_get_m(task->handles[0]), 0, 0, M );
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
+{   /* CPU kernel of the diag task: runs CORE_zgetrf_panel_diag on the tile found in descr[0] */
+    int h, m0, ib;
+    RUNTIME_sequence_t *sequence;
+    RUNTIME_request_t *request;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileU;
+    int *ipiv;
+    cppi_interface_t *nextpiv;
+    cppi_interface_t *prevpiv;
+    CHAMELEON_Complex64_t *U = NULL;
+    int ldu = -1;
+
+    starpu_codelet_unpack_args( cl_arg, &h, &m0, &ib,
+                                &sequence, &request );
+
+    tileA = cti_interface_get(descr[0]);
+    ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]);
+    nextpiv = (cppi_interface_t*) descr[2];
+    prevpiv = (cppi_interface_t*) descr[3];
+    if ( descr[4] != NULL ) { /* U stays NULL (with ldu = -1) unless descr[4] is set */
+        tileU = cti_interface_get(descr[4]);
+        U = CHAM_tile_get_ptr( tileU );
+        ldu = tileU->ld;
+    }
+
+    if ( h > 0 ) {
+        cppi_display_dbg( prevpiv, stderr, "Prevpiv before call: " );
+    }
+    if ( h < tileA->n ) {
+        cppi_display_dbg( nextpiv, stderr, "Nextpiv before call: " );
+    }
+
+    /*
+     * Make sure the nextpiv interface stores the right information about the
+     * column and diagonal row for the reduction
+     */
+    nextpiv->h = h;
+    nextpiv->has_diag = 1;
+
+    CORE_zgetrf_panel_diag( tileA->m, tileA->n, h, m0, ib,
+                            CHAM_tile_get_ptr( tileA ), tileA->ld,
+                            U, ldu,
+                            ipiv, &(nextpiv->pivot), &(prevpiv->pivot) );
+
+    if ( h > 0 ) {
+        cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " );
+    }
+    if ( h < tileA->n ) {
+        cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " );
+    }
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zgetrf_blocked_diag, cl_zgetrf_blocked_diag_cpu_func );
+
+void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
+                                      int h, int m0, int ib,
+                                      CHAM_desc_t *A, int Am, int An,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* Submits the diag task: A in RW, then ipiv, next pivot, previous pivot and U with modes chosen from h and ib */
+    struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag;
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL;
+
+    int access_ipiv = ( h == 0 ) ? STARPU_W : STARPU_RW;
+    int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
+    int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R;
+    int accessU = STARPU_RW; /* default; overridden below for h == 0, h%ib == 0 and h%ib == 1 */
+
+    if ( h == 0 ) {
+        accessU = STARPU_NONE;
+        /* U can be set after ppiv because they are both none together, so it won't shift the buffers */
+    }
+    else if ( h%ib == 0 ) {
+        accessU = STARPU_R;
+    }
+    else if ( h%ib == 1 ) {
+        accessU = STARPU_W;
+    }
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE, &h, sizeof(int),
+        STARPU_VALUE, &m0, sizeof(int),
+        STARPU_VALUE, &ib, sizeof(int),
+        STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*),
+        STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*),
+        STARPU_PRIORITY, options->priority,
+        STARPU_CALLBACK, callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zgetrf_blocked_diag",
+#endif
+        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
+        STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ),
+        access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ),
+        access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
+        accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+        0);
+}
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
+{   /* CPU kernel of the offdiag task: runs CORE_zgetrf_panel_offdiag on the tile found in descr[0] */
+    int h, m0, ib;
+    RUNTIME_sequence_t *sequence;
+    RUNTIME_request_t *request;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileU;
+    cppi_interface_t *nextpiv;
+    cppi_interface_t *prevpiv;
+    CHAMELEON_Complex64_t *U = NULL;
+    int ldu = -1;
+
+    starpu_codelet_unpack_args( cl_arg, &h, &m0, &ib, &sequence, &request );
+
+    tileA = cti_interface_get(descr[0]);
+    nextpiv = (cppi_interface_t*) descr[1];
+    prevpiv = (cppi_interface_t*) descr[2];
+    if ( descr[3] != NULL ) { /* U stays NULL (with ldu = -1) unless descr[3] is set */
+        tileU = cti_interface_get(descr[3]);
+        U = CHAM_tile_get_ptr( tileU );
+        ldu = tileU->ld;
+    }
+
+    nextpiv->h = h; /* Initialize in case it uses a copy */
+
+    CORE_zgetrf_panel_offdiag( tileA->m, tileA->n, h, m0, ib,
+                               CHAM_tile_get_ptr(tileA), tileA->ld,
+                               U, ldu,
+                               &(nextpiv->pivot), &(prevpiv->pivot) );
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_blocked_offdiag, cl_zgetrf_blocked_offdiag_cpu_func)
+
+void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
+                                         int h, int m0, int ib,
+                                         CHAM_desc_t *A, int Am, int An,
+                                         CHAM_desc_t *U, int Um, int Un,
+                                         CHAM_ipiv_t *ipiv )
+{   /* Submits the offdiag task: A in RW, then next pivot, previous pivot and U with modes chosen from h and ib */
+    struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag;
+    int access_npiv = ( h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
+    int access_ppiv = ( h == 0 ) ? STARPU_NONE : STARPU_R;
+    int accessU = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL;
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE, &h, sizeof(int),
+        STARPU_VALUE, &m0, sizeof(int),
+        STARPU_VALUE, &ib, sizeof(int),
+        STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t *),
+        STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t *),
+        STARPU_PRIORITY, options->priority,
+        STARPU_CALLBACK, callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zgetrf_blocked_offdiag",
+#endif
+        /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */
+        STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ),
+        access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ),
+        accessU, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+        0);
+}
+
+static const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; /* alpha = 1 for the cblas_ztrsm call below */
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg)
+{   /* CPU kernel of the trsm task: copies the pivot row into U, then applies a unit lower triangular solve */
+    int m, n, h, ib;
+    CHAM_tile_t *tileU;
+    cppi_interface_t *prevpiv;
+    CHAMELEON_Complex64_t *U;
+    int ldu;
+
+    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &ib );
+
+    tileU = cti_interface_get(descr[0]);
+    prevpiv = (cppi_interface_t*) descr[1];
+    U = CHAM_tile_get_ptr( tileU );
+    ldu = tileU->ld;
+
+    /* Copy the final max line of the block and solve */
+    cblas_zcopy( n, prevpiv->pivot.pivrow, 1,
+                 U + m - 1, ldu );
+
+    if ( (n-h) > 0 ) {
+        cblas_ztrsm( CblasColMajor,
+                     CblasLeft, CblasLower,
+                     CblasNoTrans, CblasUnit,
+                     ib, n - h,
+                     CBLAS_SADDR(zone), U + (h-ib) * ldu, ldu,
+                     U + h * ldu, ldu );
+    }
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_blocked_trsm, cl_zgetrf_blocked_trsm_cpu_func)
+
+void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
+                                      int m, int n, int h, int ib,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{   /* Submits the trsm task with U in RW and the previous pivot (index h-1) in R */
+    struct starpu_codelet *codelet = &cl_zgetrf_blocked_trsm;
+
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL;
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE, &m, sizeof(int),
+        STARPU_VALUE, &n, sizeof(int),
+        STARPU_VALUE, &h, sizeof(int),
+        STARPU_VALUE, &ib, sizeof(int),
+        STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+        STARPU_R, RUNTIME_pivot_getaddr( ipiv, Un, h-1 ),
+        STARPU_PRIORITY, options->priority,
+        STARPU_CALLBACK, callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zgetrf_blocked_trsm",
+#endif
+        0);
+}
diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c index e5887b02b7faee4ccd67b3e989b040f87b142f82..3248ee0ae4df619ed6988de838ccc9c80c473e33 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_percol.c +++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c @@ -14,7 +14,7 @@ * * @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-08-22 + * @date 2023-09-11 * @precisions normal z -> c d s * */ @@ -58,8 +58,9 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; nextpiv->has_diag = 1; - CORE_zgetrf_panel_diag( tileA->m, tileA->n, h, m0, + CORE_zgetrf_panel_diag( tileA->m, tileA->n, h, m0, tileA->n, CHAM_tile_get_ptr( tileA ), tileA->ld, + NULL, -1, ipiv, &(nextpiv->pivot), &(prevpiv->pivot) ); if ( h > 0 ) { @@ -94,16 +95,17 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options, STARPU_VALUE, &m0, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), - access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), - access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER,
options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, "zgetrf_percol_diag", #endif + /* STARPU_NONE must be the last argument for older version of StarPU where STARPU_NONE = 0 */ + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + access_ipiv, RUNTIME_ipiv_getaddr( ipiv, An ), + access_npiv, RUNTIME_pivot_getaddr( ipiv, An, h ), + access_ppiv, RUNTIME_pivot_getaddr( ipiv, An, h-1 ), 0); } @@ -125,8 +127,9 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg) nextpiv->h = h; /* Initialize in case it uses a copy */ - CORE_zgetrf_panel_offdiag( tileA->m, tileA->n, h, m0, + CORE_zgetrf_panel_offdiag( tileA->m, tileA->n, h, m0, tileA->n, CHAM_tile_get_ptr(tileA), tileA->ld, + NULL, -1, &(nextpiv->pivot), &(prevpiv->pivot) ); } #endif /* !defined(CHAMELEON_SIMULATION) */