diff --git a/.gitlab-ci-env.sh b/.gitlab-ci-env.sh index 687df90dc59f1c561b2d45e826a197d3b83cfeab..ba9cd4dcfa9f48b59dde7a0b91cb2c2b18bdb957 100755 --- a/.gitlab-ci-env.sh +++ b/.gitlab-ci-env.sh @@ -8,6 +8,10 @@ # too noisy export STARPU_SILENT=1 +# Make sure threads are not bound +export STARPU_MPI_NOBIND=1 +export STARPU_WORKERS_NOBIND=1 + # initialize empty to get just what we need export PKG_CONFIG_PATH="" diff --git a/.gitlab/test_starpu.yml b/.gitlab/test_starpu.yml index d22d47349efe90d27955338c9a2ff3b97f17529d..1aa626ccd4c48d431aaec02e20f42043555945e8 100644 --- a/.gitlab/test_starpu.yml +++ b/.gitlab/test_starpu.yml @@ -28,8 +28,6 @@ test_starpu_shm_s: <<: *test_starpu_master variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: shm PRECISION: s @@ -38,8 +36,6 @@ test_starpu_shm_s: test_starpu_shm_d: <<: *test_starpu_branches variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: shm PRECISION: d @@ -48,8 +44,6 @@ test_starpu_shm_d: test_starpu_shm_c: <<: *test_starpu_master variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: shm PRECISION: c @@ -58,8 +52,6 @@ test_starpu_shm_c: test_starpu_shm_z: <<: *test_starpu_master variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: shm PRECISION: z @@ -68,8 +60,6 @@ test_starpu_shm_z: test_starpu_mpi_s: <<: *test_starpu_branches variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: mpi PRECISION: s @@ -78,8 +68,6 @@ test_starpu_mpi_s: test_starpu_mpi_d: <<: *test_starpu_master variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: mpi PRECISION: d @@ -88,8 +76,6 @@ test_starpu_mpi_d: test_starpu_mpi_c: <<: *test_starpu_master variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: mpi PRECISION: c @@ -98,8 +84,6 @@ test_starpu_mpi_c: test_starpu_mpi_z: <<: *test_starpu_master variables: - STARPU_WORKERS_NOBIND: 1 - STARPU_SILENT: 1 VERSION: starpu CATEGORY: mpi PRECISION: z diff --git a/cmake_modules/morse_cmake b/cmake_modules/morse_cmake index eeb878b31a2c149e9ede3f8bea03db94db020ae7..169c202ba9cfb54f35afd90021a3226478cf3304 160000 --- a/cmake_modules/morse_cmake +++ b/cmake_modules/morse_cmake @@ -1 +1 @@ -Subproject commit eeb878b31a2c149e9ede3f8bea03db94db020ae7 +Subproject commit 169c202ba9cfb54f35afd90021a3226478cf3304 diff --git a/control/descriptor.c b/control/descriptor.c index dd1be56c45f11f4968b4a447ea0fe9c938e7c343..58c01f7356bb17b85f8d36dc27e277baf29c54ea 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -860,7 +860,7 @@ int CHAMELEON_Desc_Destroy(CHAM_desc_t **desc) * @retval CHAMELEON_SUCCESS successful exit * */ -int CHAMELEON_Desc_Acquire (CHAM_desc_t *desc) { +int CHAMELEON_Desc_Acquire( const CHAM_desc_t *desc ) { return RUNTIME_desc_acquire( desc ); } @@ -883,7 +883,7 @@ int CHAMELEON_Desc_Acquire (CHAM_desc_t *desc) { * @retval CHAMELEON_SUCCESS successful exit * */ -int CHAMELEON_Desc_Release (CHAM_desc_t *desc) { +int CHAMELEON_Desc_Release( const CHAM_desc_t *desc ) { return RUNTIME_desc_release( desc ); } diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index f27cebfd2eaa0221e41afeb05ea7c25f6f035a89..a544dfa7840865ec4abe08f14f7588bd6220be3a 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -39,7 +39,7 @@ TCORE_dzasum( cham_store_t storev, double * work ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_dzasum( storev, uplo, M, N, A->mat, A->ld, work ); + CORE_dzasum( storev, uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, work ); } int @@ -52,7 +52,7 @@ TCORE_zaxpy( int M, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zaxpy( M, alpha, A->mat, incA, B->mat, incB ); + return CORE_zaxpy( M, alpha, CHAM_tile_get_ptr( A ), incA, CHAM_tile_get_ptr( B ), incB ); } int @@ -64,9 +64,15 @@ TCORE_zgeadd( cham_trans_t trans, CHAMELEON_Complex64_t beta, CHAM_tile_t * B ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - assert( B->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgeadd( trans, M, N, alpha, A->mat, A->ld, beta, B->mat, B->ld ); + if ( (A->format & CHAMELEON_TILE_DESC) && + (B->format & CHAMELEON_TILE_DESC) ) + { + assert(0); + } + + return CORE_zgeadd( trans, M, N, + alpha, CHAM_tile_get_ptr( A ), A->ld, + beta, CHAM_tile_get_ptr( B ), B->ld ); } int @@ -80,7 +86,7 @@ TCORE_zgelqt( int M, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( T->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgelqt( M, N, IB, A->mat, A->ld, T->mat, T->ld, TAU, WORK ); + return CORE_zgelqt( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, TAU, WORK ); } void @@ -112,7 +118,11 @@ TCORE_zgemm( cham_trans_t transA, assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); CORE_zgemm( - transA, transB, M, N, K, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); + transA, transB, M, N, K, alpha, + CHAM_tile_get_ptr( A ), A->ld, + CHAM_tile_get_ptr( B ), B->ld, + beta, + CHAM_tile_get_ptr( C ), C->ld ); } int @@ -126,7 +136,7 @@ TCORE_zgeqrt( int M, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( T->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgeqrt( M, N, IB, A->mat, A->ld, T->mat, T->ld, TAU, WORK ); + return CORE_zgeqrt( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, TAU, WORK ); } int @@ -134,7 +144,7 @@ TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L { assert( L->format & CHAMELEON_TILE_FULLRANK ); assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgessm( M, N, K, IB, IPIV, L->mat, L->ld, A->mat, A->ld ); + return CORE_zgessm( M, N, K, IB, IPIV, CHAM_tile_get_ptr( L ), L->ld, CHAM_tile_get_ptr( A ), A->ld ); } int @@ -142,28 +152,28 @@ TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgessq( storev, M, N, A->mat, A->ld, sclssq->mat ); + return CORE_zgessq( storev, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) ); } int TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgetrf( M, N, A->mat, A->ld, IPIV, INFO ); + return CORE_zgetrf( M, N, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO ); } int TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgetrf_incpiv( M, N, IB, A->mat, A->ld, IPIV, INFO ); + return CORE_zgetrf_incpiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO ); } int TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgetrf_nopiv( M, N, IB, A->mat, A->ld, INFO ); + return CORE_zgetrf_nopiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, INFO ); } void @@ -171,7 +181,7 @@ TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); - CORE_zhe2ge( uplo, M, N, A->mat, A->ld, B->mat, B->ld ); + CORE_zhe2ge( uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -189,7 +199,7 @@ TCORE_zhemm( cham_side_t side, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zhemm( side, uplo, M, N, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); + CORE_zhemm( side, uplo, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld ); } void @@ -204,7 +214,7 @@ TCORE_zherk( cham_uplo_t uplo, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zherk( uplo, trans, N, K, alpha, A->mat, A->ld, beta, C->mat, C->ld ); + CORE_zherk( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( C ), C->ld ); } void @@ -221,7 +231,7 @@ TCORE_zher2k( cham_uplo_t uplo, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zher2k( uplo, trans, N, K, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); + CORE_zher2k( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld ); } #endif @@ -241,7 +251,7 @@ TCORE_zherfb( cham_uplo_t uplo, assert( T->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); return CORE_zherfb( - uplo, N, K, IB, NB, A->mat, A->ld, T->mat, T->ld, C->mat, C->ld, WORK, ldwork ); + uplo, N, K, IB, NB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, ldwork ); } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -254,16 +264,19 @@ TCORE_zhessq( cham_store_t storev, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zhessq( storev, uplo, N, A->mat, A->ld, sclssq->mat ); + return CORE_zhessq( storev, uplo, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) ); } #endif void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - assert( B->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlacpy( uplo, M, N, A->mat, A->ld, B->mat, B->ld ); + if (( A->format & CHAMELEON_TILE_DESC ) && + ( B->format & CHAMELEON_TILE_DESC ) ) + { + assert(0); /* This should have been handled at the codelet level */ + } + CORE_zlacpy( uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); } void @@ -275,7 +288,7 @@ TCORE_zlange( cham_normtype_t norm, double * normA ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlange( norm, M, N, A->mat, A->ld, work, normA ); + CORE_zlange( norm, M, N, CHAM_tile_get_ptr( A ), A->ld, work, normA ); } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -288,7 +301,7 @@ TCORE_zlanhe( cham_normtype_t norm, double * normA ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlanhe( norm, uplo, N, A->mat, A->ld, work, normA ); + CORE_zlanhe( norm, uplo, N, CHAM_tile_get_ptr( A ), A->ld, work, normA ); } #endif @@ -301,7 +314,7 @@ TCORE_zlansy( cham_normtype_t norm, double * normA ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlansy( norm, uplo, N, A->mat, A->ld, work, normA ); + CORE_zlansy( norm, uplo, N, CHAM_tile_get_ptr( A ), A->ld, work, normA ); } void @@ -315,14 +328,14 @@ TCORE_zlantr( cham_normtype_t norm, double * normA ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlantr( norm, uplo, diag, M, N, A->mat, A->ld, work, normA ); + CORE_zlantr( norm, uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, work, normA ); } int TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zlascal( uplo, m, n, alpha, A->mat, A->ld ); + return CORE_zlascal( uplo, m, n, alpha, CHAM_tile_get_ptr( A ), A->ld ); } void @@ -334,14 +347,14 @@ TCORE_zlaset( cham_uplo_t uplo, CHAM_tile_t * A ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlaset( uplo, n1, n2, alpha, beta, A->mat, A->ld ); + CORE_zlaset( uplo, n1, n2, alpha, beta, CHAM_tile_get_ptr( A ), A->ld ); } void TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlaset2( uplo, n1, n2, alpha, A->mat, A->ld ); + CORE_zlaset2( uplo, n1, n2, alpha, CHAM_tile_get_ptr( A ), A->ld ); } int @@ -354,14 +367,14 @@ TCORE_zlatro( cham_uplo_t uplo, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zlatro( uplo, trans, M, N, A->mat, A->ld, B->mat, B->ld ); + return CORE_zlatro( uplo, trans, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); } void TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zlauum( uplo, N, A->mat, A->ld ); + CORE_zlauum( uplo, N, CHAM_tile_get_ptr( A ), A->ld ); } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -376,7 +389,7 @@ TCORE_zplghe( double bump, unsigned long long int seed ) { assert( tileA->format & CHAMELEON_TILE_FULLRANK ); - CORE_zplghe( bump, m, n, tileA->mat, tileA->ld, bigM, m0, n0, seed ); + CORE_zplghe( bump, m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed ); } #endif @@ -391,7 +404,7 @@ TCORE_zplgsy( CHAMELEON_Complex64_t bump, unsigned long long int seed ) { assert( tileA->format & CHAMELEON_TILE_FULLRANK ); - CORE_zplgsy( bump, m, n, tileA->mat, tileA->ld, bigM, m0, n0, seed ); + CORE_zplgsy( bump, m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed ); } void @@ -404,14 +417,14 @@ TCORE_zplrnt( int m, unsigned long long int seed ) { assert( tileA->format & CHAMELEON_TILE_FULLRANK ); - CORE_zplrnt( m, n, tileA->mat, tileA->ld, bigM, m0, n0, seed ); + CORE_zplrnt( m, n, CHAM_tile_get_ptr( tileA ), tileA->ld, bigM, m0, n0, seed ); } void TCORE_zpotrf( cham_uplo_t uplo, int n, CHAM_tile_t *A, int *INFO ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zpotrf( uplo, n, A->mat, A->ld, INFO ); + CORE_zpotrf( uplo, n, CHAM_tile_get_ptr( A ), A->ld, INFO ); } int @@ -437,13 +450,13 @@ TCORE_zssssm( int M1, N2, K, IB, - A1->mat, + CHAM_tile_get_ptr( A1 ), A1->ld, - A2->mat, + CHAM_tile_get_ptr( A2 ), A2->ld, - L1->mat, + CHAM_tile_get_ptr( L1 ), L1->ld, - L2->mat, + CHAM_tile_get_ptr( L2 ), L2->ld, IPIV ); } @@ -462,7 +475,7 @@ TCORE_zsymm( cham_side_t side, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zsymm( side, uplo, M, N, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); + CORE_zsymm( side, uplo, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld ); } void @@ -477,7 +490,7 @@ TCORE_zsyrk( cham_uplo_t uplo, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zsyrk( uplo, trans, N, K, alpha, A->mat, A->ld, beta, C->mat, C->ld ); + CORE_zsyrk( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( C ), C->ld ); } void @@ -494,7 +507,7 @@ TCORE_zsyr2k( cham_uplo_t uplo, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zsyr2k( uplo, trans, N, K, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); + CORE_zsyr2k( uplo, trans, N, K, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, beta, CHAM_tile_get_ptr( C ), C->ld ); } int @@ -506,7 +519,7 @@ TCORE_zsyssq( cham_store_t storev, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zsyssq( storev, uplo, N, A->mat, A->ld, sclssq->mat ); + return CORE_zsyssq( storev, uplo, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( sclssq ) ); } #if defined( PRECISION_z ) || defined( PRECISION_c ) @@ -514,7 +527,7 @@ int TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zsytf2_nopiv( uplo, n, A->mat, A->ld ); + return CORE_zsytf2_nopiv( uplo, n, CHAM_tile_get_ptr( A ), A->ld ); } #endif @@ -531,7 +544,7 @@ TCORE_ztplqt( int M, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( T->format & CHAMELEON_TILE_FULLRANK ); - return CORE_ztplqt( M, N, L, IB, A->mat, A->ld, B->mat, B->ld, T->mat, T->ld, WORK ); + return CORE_ztplqt( M, N, L, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, CHAM_tile_get_ptr( T ), T->ld, WORK ); } int @@ -559,13 +572,13 @@ TCORE_ztpmlqt( cham_side_t side, K, L, IB, - V->mat, + CHAM_tile_get_ptr( V ), V->ld, - T->mat, + CHAM_tile_get_ptr( T ), T->ld, - A->mat, + CHAM_tile_get_ptr( A ), A->ld, - B->mat, + CHAM_tile_get_ptr( B ), B->ld, WORK ); } @@ -595,13 +608,13 @@ TCORE_ztpmqrt( cham_side_t side, K, L, IB, - V->mat, + CHAM_tile_get_ptr( V ), V->ld, - T->mat, + CHAM_tile_get_ptr( T ), T->ld, - A->mat, + CHAM_tile_get_ptr( A ), A->ld, - B->mat, + CHAM_tile_get_ptr( B ), B->ld, WORK ); } @@ -619,7 +632,7 @@ TCORE_ztpqrt( int M, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); assert( T->format & CHAMELEON_TILE_FULLRANK ); - return CORE_ztpqrt( M, N, L, IB, A->mat, A->ld, B->mat, B->ld, T->mat, T->ld, WORK ); + return CORE_ztpqrt( M, N, L, IB, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld, CHAM_tile_get_ptr( T ), T->ld, WORK ); } int @@ -632,9 +645,12 @@ TCORE_ztradd( cham_uplo_t uplo, CHAMELEON_Complex64_t beta, CHAM_tile_t * B ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - assert( B->format & CHAMELEON_TILE_FULLRANK ); - return CORE_ztradd( uplo, trans, M, N, alpha, A->mat, A->ld, beta, B->mat, B->ld ); + if (( A->format & CHAMELEON_TILE_DESC ) && + ( B->format & CHAMELEON_TILE_DESC ) ) + { + assert(0); /* This should have been handled at the codelet level */ + } + return CORE_ztradd( uplo, trans, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, beta, CHAM_tile_get_ptr( B ), B->ld ); } void @@ -647,7 +663,7 @@ TCORE_ztrasm( cham_store_t storev, double * work ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_ztrasm( storev, uplo, diag, M, N, A->mat, A->ld, work ); + CORE_ztrasm( storev, uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, work ); } void @@ -663,7 +679,7 @@ TCORE_ztrmm( cham_side_t side, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); - CORE_ztrmm( side, uplo, transA, diag, M, N, alpha, A->mat, A->ld, B->mat, B->ld ); + CORE_ztrmm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); } void @@ -679,7 +695,7 @@ TCORE_ztrsm( cham_side_t side, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( B->format & CHAMELEON_TILE_FULLRANK ); - CORE_ztrsm( side, uplo, transA, diag, M, N, alpha, A->mat, A->ld, B->mat, B->ld ); + CORE_ztrsm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); } int @@ -692,15 +708,15 @@ TCORE_ztrssq( cham_uplo_t uplo, { assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); - double *W = sclssq->mat; - return CORE_ztrssq( uplo, diag, M, N, A->mat, A->ld, W, W + 1 ); + double *W = CHAM_tile_get_ptr( sclssq ); + return CORE_ztrssq( uplo, diag, M, N, CHAM_tile_get_ptr( A ), A->ld, W, W + 1 ); } void TCORE_ztrtri( cham_uplo_t uplo, cham_diag_t diag, int N, CHAM_tile_t *A, int *info ) { assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_ztrtri( uplo, diag, N, A->mat, A->ld, info ); + CORE_ztrtri( uplo, diag, N, CHAM_tile_get_ptr( A ), A->ld, info ); } int @@ -731,13 +747,13 @@ TCORE_ztsmlq_hetra1( cham_side_t side, n2, k, ib, - A1->mat, + CHAM_tile_get_ptr( A1 ), A1->ld, - A2->mat, + CHAM_tile_get_ptr( A2 ), A2->ld, - V->mat, + CHAM_tile_get_ptr( V ), V->ld, - T->mat, + CHAM_tile_get_ptr( T ), T->ld, WORK, ldwork ); @@ -771,13 +787,13 @@ TCORE_ztsmqr_hetra1( cham_side_t side, n2, k, ib, - A1->mat, + CHAM_tile_get_ptr( A1 ), A1->ld, - A2->mat, + CHAM_tile_get_ptr( A2 ), A2->ld, - V->mat, + CHAM_tile_get_ptr( V ), V->ld, - T->mat, + CHAM_tile_get_ptr( T ), T->ld, WORK, ldwork ); @@ -800,7 +816,7 @@ TCORE_ztstrf( int M, assert( A->format & CHAMELEON_TILE_FULLRANK ); assert( L->format & CHAMELEON_TILE_FULLRANK ); return CORE_ztstrf( - M, N, IB, NB, U->mat, U->ld, A->mat, A->ld, L->mat, L->ld, IPIV, WORK, LDWORK, INFO ); + M, N, IB, NB, CHAM_tile_get_ptr( U ), U->ld, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( L ), L->ld, IPIV, WORK, LDWORK, INFO ); } int @@ -820,7 +836,7 @@ TCORE_zunmlq( cham_side_t side, assert( T->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); return CORE_zunmlq( - side, trans, M, N, K, IB, V->mat, V->ld, T->mat, T->ld, C->mat, C->ld, WORK, LDWORK ); + side, trans, M, N, K, IB, CHAM_tile_get_ptr( V ), V->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, LDWORK ); } int @@ -840,7 +856,7 @@ TCORE_zunmqr( cham_side_t side, assert( T->format & CHAMELEON_TILE_FULLRANK ); assert( C->format & CHAMELEON_TILE_FULLRANK ); return CORE_zunmqr( - side, trans, M, N, K, IB, V->mat, V->ld, T->mat, T->ld, C->mat, C->ld, WORK, LDWORK ); + side, trans, M, N, K, IB, CHAM_tile_get_ptr( V ), V->ld, CHAM_tile_get_ptr( T ), T->ld, CHAM_tile_get_ptr( C ), C->ld, WORK, LDWORK ); } int @@ -859,5 +875,5 @@ TCORE_zgram( cham_uplo_t uplo, assert( D->format & CHAMELEON_TILE_FULLRANK ); assert( A->format & CHAMELEON_TILE_FULLRANK ); return CORE_zgram( - uplo, M, N, Mt, Nt, Di->mat, Di->ld, Dj->mat, Dj->ld, D->mat, A->mat, A->ld ); + uplo, M, N, Mt, Nt, CHAM_tile_get_ptr( Di ), Di->ld, CHAM_tile_get_ptr( Dj ), Dj->ld, CHAM_tile_get_ptr( D ), CHAM_tile_get_ptr( A ), A->ld ); } diff --git a/include/chameleon.h b/include/chameleon.h index 4ebc7ee846d4b2c382564e755255229a1ae14c7a..713f9493edebd70253dbce1e14096c2734e863fc 100644 --- a/include/chameleon.h +++ b/include/chameleon.h @@ -135,8 +135,8 @@ CHAM_desc_t *CHAMELEON_Desc_CopyOnZero( const CHAM_desc_t *descin, void *mat ); CHAM_desc_t *CHAMELEON_Desc_SubMatrix( CHAM_desc_t *descA, int i, int j, int m, int n ); int CHAMELEON_Desc_Destroy( CHAM_desc_t **desc ); -int CHAMELEON_Desc_Acquire( CHAM_desc_t *desc ); -int CHAMELEON_Desc_Release( CHAM_desc_t *desc ); +int CHAMELEON_Desc_Acquire( const CHAM_desc_t *desc ); +int CHAMELEON_Desc_Release( const CHAM_desc_t *desc ); int CHAMELEON_Desc_Flush ( const CHAM_desc_t *desc, const RUNTIME_sequence_t *sequence ); diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index 1ac2edb55d5bd51d2fa969451d14eb0796e8536c..748daaa738b1fff51914ccb59856415797ab25e8 100644 --- a/include/chameleon/struct.h +++ b/include/chameleon/struct.h @@ -143,6 +143,15 @@ typedef struct chameleon_context_s { int mpi_outer_init; // MPI has been initialized outside our functions } CHAM_context_t; +static inline void * +CHAM_tile_get_ptr( const CHAM_tile_t *tile ) +{ + if ( tile->format & CHAMELEON_TILE_DESC ) { + return ((CHAM_desc_t*)(tile->mat))->mat; + } + return tile->mat; +} + END_C_DECLS #endif /* _chameleon_struct_h_ */ diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 40c4c24ab4cbd1352d516a6ebae7a8447b3e5f67..d94d536a0c8343b0624c6534a3197b5d88ee7175 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -28,7 +28,7 @@ CHAMELEON_CL_CB(dlag2z, cti_handle_get_m(task->handles[1]), cti_handle_ge CHAMELEON_CL_CB(dzasum, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zaxpy, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[1]), 0, M) CHAMELEON_CL_CB(zgeadd, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(ztradd, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 0.5*M*N) +CHAMELEON_CL_CB(ztradd, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N/2.) CHAMELEON_CL_CB(zlascal, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zgelqt, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (4./3.)*M*N*K) CHAMELEON_CL_CB(zgemv, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 2. *M*N ) diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 9a3b9ea3f127a9f7656afa8e27e08ecc9c976aab..0567065f3ceb12f9abf475429111f2df0cbae312 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -25,9 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zgemm_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zgemm_args_s { cham_trans_t transA; cham_trans_t transB; int m; @@ -38,47 +36,54 @@ static void cl_zgemm_cpu_func(void *descr[], void *cl_arg) CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zgemm_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_zgemm_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); - TCORE_zgemm( transA, transB, - m, n, k, - alpha, tileA, tileB, - beta, tileC ); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zgemm( clargs.transA, clargs.transB, + clargs.m, clargs.n, clargs.k, + clargs.alpha, tileA, tileB, + clargs.beta, tileC ); } #ifdef CHAMELEON_USE_CUDA -static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) +static void +cl_zgemm_cuda_func( void *descr[], void *_cl_arg ) { - cham_trans_t transA; - cham_trans_t transB; - int m; - int n; - int k; - cuDoubleComplex alpha; + struct cl_zgemm_args_s clargs; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - cuDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); + starpu_codelet_unpack_args( _cl_arg, &clargs ); RUNTIME_getStream( stream ); CUDA_zgemm( - transA, transB, - m, n, k, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, - stream); + clargs.transA, clargs.transB, + clargs.m, clargs.n, clargs.k, + (cuDoubleComplex*)&(clargs.alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (cuDoubleComplex*)&(clargs.beta), + tileC->mat, tileC->ld, + stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -92,56 +97,72 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) /* * Codelet definition */ -CODELETS(zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC) - -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zgemm(const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC ) + +void INSERT_TASK_zgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn ); } - (void)nb; - struct starpu_codelet *codelet = &cl_zgemm; - void (*callback)(void*) = options->profiling ? cl_zgemm_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; - + struct cl_zgemm_args_s clargs = { + .transA = transA, + .transB = transB, + .m = m, + .n = n, + .k = k, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + .tileB = B->get_blktile( B, Bm, Bn ), + .beta = beta, + .tileC = C->get_blktile( C, Cm, Cn ) + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid, accessC; + char *cl_name = "zgemm"; + + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zgemm_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &transA, sizeof(int), - STARPU_VALUE, &transB, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zgemm, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zgemm_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zgemm", + STARPU_NAME, cl_name, #endif - 0); + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 1b9d39818e5c3e45e0ae02df5dace718f160a0e7..b224473cbfa7de6d234012c033178c827bf1eeff 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -25,9 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zherk_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zherk_args_s { cham_uplo_t uplo; cham_trans_t trans; int n; @@ -36,41 +34,46 @@ static void cl_zherk_cpu_func(void *descr[], void *cl_arg) CHAM_tile_t *tileA; double beta; CHAM_tile_t *tileC; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zherk_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zherk_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - TCORE_zherk(uplo, trans, - n, k, - alpha, tileA, - beta, tileC); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zherk( clargs.uplo, clargs.trans, clargs.n, clargs.k, + clargs.alpha, tileA, clargs.beta, tileC ); } -#ifdef CHAMELEON_USE_CUDA -static void cl_zherk_cuda_func(void *descr[], void *cl_arg) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zherk_cuda_func(void *descr[], void *cl_arg) { - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - double alpha; + struct cl_zherk_args_s clargs; CHAM_tile_t *tileA; - double beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); + starpu_codelet_unpack_args( cl_arg, &clargs ); RUNTIME_getStream(stream); CUDA_zherk( - uplo, trans, n, k, - &alpha, tileA->mat, tileA->ld, - &beta, tileC->mat, tileC->ld, - stream); + clargs.uplo, clargs.trans, clargs.n, clargs.k, + (cuDoubleComplex*)&(clargs.alpha), + tileA->mat, tileA->ld, + (cuDoubleComplex*)&(clargs.beta), + tileC->mat, tileC->ld, + stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -78,52 +81,73 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC ) -void INSERT_TASK_zherk(const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, - double beta, const CHAM_desc_t *C, int Cm, int Cn) +void INSERT_TASK_zherk( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, uplo, n, n, nb, beta, C, Cm, Cn ); } - (void)nb; - struct starpu_codelet *codelet = &cl_zherk; - void (*callback)(void*) = options->profiling ? cl_zherk_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + struct cl_zherk_args_s clargs = { + .uplo = uplo, + .trans = trans, + .n = n, + .k = k, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + .beta = beta, + .tileC = C->get_blktile( C, Cm, Cn ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid, accessC; + char *cl_name = "zherk"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(C, Cm, Cn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zherk_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &alpha, sizeof(double), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &beta, sizeof(double), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zherk, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zherk_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zherk", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index bec2d92288ad496a8e34d746ea7513f63c6c4a0c..3fd038bded4bd94643ae36e354a40453d4511747 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -12,8 +12,6 @@ * @brief Chameleon zlacpy StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -27,70 +25,93 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zlacpy_args_s { cham_uplo_t uplo; - int M; - int N; + int m; + int n; int displA; int displB; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - CHAMELEON_Complex64_t *A; - CHAMELEON_Complex64_t *B; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zlacpy_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zlacpy_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &displA, &displB); - - assert( tileA->format & CHAMELEON_TILE_FULLRANK ); - assert( tileB->format & CHAMELEON_TILE_FULLRANK ); - - A = tileA->mat; - B = tileB->mat; - CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); + starpu_codelet_unpack_args( cl_arg, &clargs ); + assert( clargs.displA == 0 ); + assert( clargs.displB == 0 ); + /* A = tileA->mat; */ + /* B = tileB->mat; */ + /* CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); */ + TCORE_zlacpy( clargs.uplo, clargs.m, clargs.n, tileA, tileB ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zlacpy, cl_zlacpy_cpu_func) +CODELETS_CPU( zlacpy, cl_zlacpy_cpu_func ) void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, int displA, const CHAM_desc_t *A, int Am, int An, int displB, const CHAM_desc_t *B, int Bm, int Bn ) { - (void)nb; - struct starpu_codelet *codelet = &cl_zlacpy; - void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_zlacpy_args_s clargs = { + .uplo = uplo, + .m = m, + .n = n, + .displA = displA, + .displB = displB, + .tileA = A->get_blktile( A, Am, An ), + .tileB = B->get_blktile( B, Bm, Bn ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zlacpy"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_R( A, Am, An ); - CHAMELEON_ACCESS_W( B, Bm, Bn ); + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zlacpy_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(cham_uplo_t), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &displA, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &displB, sizeof(int), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zlacpy, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zlacpy_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlacpy", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index bd6823a08d8c3608141aeb9d145ffc671b6afad2..eb03a8685b19e0df448ccac2b2461de6e3250a54 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -21,33 +21,38 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zlascal_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zlascal_args_s { cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zlascal_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_zlascal_args_s clargs; + CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &alpha); - TCORE_zlascal(uplo, M, N, alpha, tileA); - return; + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zlascal( clargs.uplo, clargs.m, clargs.n, clargs.alpha, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zlascal, cl_zlascal_cpu_func) +CODELETS_CPU( zlascal, cl_zlascal_cpu_func ) void INSERT_TASK_zlascal( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An) + const CHAM_desc_t *A, int Am, int An ) { if ( alpha == 0. ) { return INSERT_TASK_zlaset( options, uplo, m, n, @@ -57,28 +62,46 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, return; } - (void)nb; - struct starpu_codelet *codelet = &cl_zlascal; - void (*callback)(void*) = options->profiling ? cl_zlascal_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_zlascal_args_s clargs = { + .uplo = uplo, + .m = m, + .n = n, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zlascal"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zlascal_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zlascal, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zlascal_args_s), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlascal", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zlaset.c b/runtime/starpu/codelets/codelet_zlaset.c index c7661bdb8278687f6d4cea4496eedd504d0df897..0636686e3be44a17947b741109eb1d9450fdf841 100644 --- a/runtime/starpu/codelets/codelet_zlaset.c +++ b/runtime/starpu/codelets/codelet_zlaset.c @@ -12,8 +12,6 @@ * @brief Chameleon zlaset StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Mathieu Faverge * @author Emmanuel Agullo @@ -26,56 +24,78 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zlaset_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zlaset_args_s { cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileA; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zlaset_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_zlaset_args_s clargs; + CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &alpha, &beta); - TCORE_zlaset(uplo, M, N, alpha, beta, tileA); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zlaset( clargs.uplo, clargs.m, clargs.n, clargs.alpha, clargs.beta, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zlaset, cl_zlaset_cpu_func) +CODELETS_CPU( zlaset, cl_zlaset_cpu_func ) -void INSERT_TASK_zlaset(const RUNTIME_option_t *options, - cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - const CHAM_desc_t *A, int Am, int An) +void INSERT_TASK_zlaset( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, + const CHAM_desc_t *A, int Am, int An ) { + struct cl_zlaset_args_s clargs = { + .uplo = uplo, + .m = m, + .n = n, + .alpha = alpha, + .beta = beta, + .tileA = A->get_blktile( A, Am, An ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zlaset"; - struct starpu_codelet *codelet = &cl_zlaset; - void (*callback)(void*) = options->profiling ? cl_zlaset_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zlaset_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &M, sizeof(int), - STARPU_VALUE, &N, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), + &cl_zlaset, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zlaset_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlaset", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); } diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index 0f3f1911bec1e1f83b684c398b82d67b0ff3fbcb..9ff3fc6e92bfa7fb42cb52ce9f83fa2b8b262d6e 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -12,8 +12,6 @@ * @brief Chameleon zlauum StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -27,55 +25,74 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" +struct cl_zlauum_args_s { + cham_uplo_t uplo; + int n; + CHAM_tile_t *tileA; +}; + #if !defined(CHAMELEON_SIMULATION) -static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) +static void +cl_zlauum_cpu_func(void *descr[], void *cl_arg) { - cham_uplo_t uplo; - int N; + struct cl_zlauum_args_s clargs; CHAM_tile_t *tileA; + int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &N); - TCORE_zlauum(uplo, N, tileA); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zlauum( clargs.uplo, clargs.n, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zlauum, cl_zlauum_cpu_func) +CODELETS_CPU( zlauum, cl_zlauum_cpu_func ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, const CHAM_desc_t *A, int Am, int An ) { - (void)nb; - struct starpu_codelet *codelet = &cl_zlauum; - void (*callback)(void*) = options->profiling ? cl_zlauum_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_zlauum_args_s clargs = { + .uplo = uplo, + .n = n, + .tileA = A->get_blktile( A, Am, An ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zlauum"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zlauum_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zlauum, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zlauum_args_s), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlauum", + STARPU_NAME, cl_name, #endif - 0); + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zplghe.c b/runtime/starpu/codelets/codelet_zplghe.c index 87c071e44a71d4f48e3226b14e7e8c81d0fc13be..4c04611ccbbf97a8bffe43516bcf7979189551d2 100644 --- a/runtime/starpu/codelets/codelet_zplghe.c +++ b/runtime/starpu/codelets/codelet_zplghe.c @@ -12,8 +12,6 @@ * @brief Chameleon zplghe StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -27,11 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -/* cl_zplghe_cpu_func - Generate a tile for random hermitian (positive definite if bump is large enough) matrix. */ - -#if !defined(CHAMELEON_SIMULATION) -static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zplghe_args_s { double bump; int m; int n; @@ -40,47 +34,73 @@ static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) int m0; int n0; unsigned long long int seed; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zplghe_args_s clargs; + CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &bump, &m, &n, &bigM, &m0, &n0, &seed ); - TCORE_zplghe( bump, m, n, tileA, bigM, m0, n0, seed ); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zplghe( clargs.bump, clargs.m, clargs.n, tileA, + clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zplghe, cl_zplghe_cpu_func) +CODELETS_CPU( zplghe, cl_zplghe_cpu_func ) void INSERT_TASK_zplghe( const RUNTIME_option_t *options, double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct starpu_codelet *codelet = &cl_zplghe; - void (*callback)(void*) = options->profiling ? cl_zplghe_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_zplghe_args_s clargs = { + .bump = bump, + .m = m, + .n = n, + .tileA = A->get_blktile( A, Am, An ), + .bigM = bigM, + .m0 = m0, + .n0 = n0, + .seed = seed, + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zplghe"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zplghe_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &bump, sizeof(double), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &bigM, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &n0, sizeof(int), - STARPU_VALUE, &seed, sizeof(unsigned long long int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zplghe, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zplghe_args_s), + STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zplghe", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + } diff --git a/runtime/starpu/codelets/codelet_zplgsy.c b/runtime/starpu/codelets/codelet_zplgsy.c index 748e78d3310277cbcd0da4d30c9d563afb1a66d1..57aec1bf28967835817f9ddbef05800b57b93aa8 100644 --- a/runtime/starpu/codelets/codelet_zplgsy.c +++ b/runtime/starpu/codelets/codelet_zplgsy.c @@ -12,8 +12,6 @@ * @brief Chameleon zplgsy StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -27,11 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -/* cl_zplgsy_cpu_func - Generate a tile for random symmetric (positive definite if 'bump' is large enough) matrix. */ - -#if !defined(CHAMELEON_SIMULATION) -static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zplgsy_args_s { CHAMELEON_Complex64_t bump; int m; int n; @@ -40,48 +34,73 @@ static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) int m0; int n0; unsigned long long int seed; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zplgsy_args_s clargs; + CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &bump, &m, &n, &bigM, &m0, &n0, &seed ); - TCORE_zplgsy( bump, m, n, tileA, bigM, m0, n0, seed ); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zplgsy( clargs.bump, clargs.m, clargs.n, tileA, + clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zplgsy, cl_zplgsy_cpu_func) +CODELETS_CPU( zplgsy, cl_zplgsy_cpu_func ) void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, - int bigM, int m0, int n0, unsigned long long int seed ) + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, + int bigM, int m0, int n0, unsigned long long int seed ) { + struct cl_zplgsy_args_s clargs = { + .bump = bump, + .m = m, + .n = n, + .tileA = A->get_blktile( A, Am, An ), + .bigM = bigM, + .m0 = m0, + .n0 = n0, + .seed = seed, + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zplgsy"; - struct starpu_codelet *codelet = &cl_zplgsy; - void (*callback)(void*) = options->profiling ? cl_zplgsy_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zplgsy_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &bump, sizeof(CHAMELEON_Complex64_t), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &bigM, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &n0, sizeof(int), - STARPU_VALUE, &seed, sizeof(unsigned long long int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zplgsy, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zplgsy_args_s), + STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zplgsy", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + } diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c index 1376e950c222a2c130b9ac9745c059d631face37..a4eef0c8a1df265dc9afd8d9c896811cf4634e52 100644 --- a/runtime/starpu/codelets/codelet_zplrnt.c +++ b/runtime/starpu/codelets/codelet_zplrnt.c @@ -12,8 +12,6 @@ * @brief Chameleon zplrnt StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -27,9 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zplrnt_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zplrnt_args_s { int m; int n; CHAM_tile_t *tileA; @@ -37,47 +33,72 @@ static void cl_zplrnt_cpu_func(void *descr[], void *cl_arg) int m0; int n0; unsigned long long int seed; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zplrnt_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zplrnt_args_s clargs; + CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &m, &n, &bigM, &m0, &n0, &seed ); - TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed ); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zplrnt( clargs.m, clargs.n, tileA, + clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zplrnt, cl_zplrnt_cpu_func) +CODELETS_CPU( zplrnt, cl_zplrnt_cpu_func ) void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { + struct cl_zplrnt_args_s clargs = { + .m = m, + .n = n, + .tileA = A->get_blktile( A, Am, An ), + .bigM = bigM, + .m0 = m0, + .n0 = n0, + .seed = seed, + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zplrnt"; - struct starpu_codelet *codelet = &cl_zplrnt; - void (*callback)(void*) = options->profiling ? cl_zplrnt_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zplrnt_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &bigM, sizeof(int), - STARPU_VALUE, &m0, sizeof(int), - STARPU_VALUE, &n0, sizeof(int), - STARPU_VALUE, &seed, sizeof(unsigned long long int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zplrnt, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zplrnt_args_s), + STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zplrnt", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); } diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 2337b00fde7207c893e935e250deb2d1cafc0cb6..93f4f640d72b22660bd18dcf7067ddff008a7dec 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -12,8 +12,6 @@ * @brief Chameleon zpotrf StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -27,24 +25,30 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zpotrf_args_s { cham_uplo_t uplo; int n; CHAM_tile_t *tileA; int iinfo; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zpotrf_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zpotrf_args_s clargs; + CHAM_tile_t *tileA; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &n, &iinfo, &sequence, &request); - TCORE_zpotrf(uplo, n, tileA, &info); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zpotrf( clargs.uplo, clargs.n, tileA, &info ); - if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); + if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -52,46 +56,54 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) /* * Codelet definition */ -#if defined(CHAMELEON_SIMULATION) && defined(CHAMELEON_SIMULATION_EXTENDED) -CODELETS( zpotrf, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, STARPU_CUDA_ASYNC ) -#else CODELETS_CPU( zpotrf, cl_zpotrf_cpu_func ) -#endif -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - int iinfo) +void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, + cham_uplo_t uplo, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + int iinfo ) { - (void)nb; - struct starpu_codelet *codelet = &cl_zpotrf; - void (*callback)(void*) = options->profiling ? cl_zpotrf_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_zpotrf_args_s clargs = { + .uplo = uplo, + .n = n, + .tileA = A->get_blktile( A, Am, An ), + .iinfo = iinfo, + .sequence = options->sequence, + .request = options->request, + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "zpotrf"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zpotrf_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &iinfo, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), - /* STARPU_SCRATCH, options->ws_worker, */ - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zpotrf, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zpotrf_args_s), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zpotrf", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 9552eee9ca40f3982c39b2140e9d5a9a2fe1b7df..0cba41cab3ab660e5dada013b4614cb179059b34 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -25,9 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zsyrk_args_s { cham_uplo_t uplo; cham_trans_t trans; int n; @@ -36,41 +34,46 @@ static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg) CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zsyrk_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zsyrk_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - TCORE_zsyrk(uplo, trans, - n, k, - alpha, tileA, - beta, tileC); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_zsyrk( clargs.uplo, clargs.trans, clargs.n, clargs.k, + clargs.alpha, tileA, clargs.beta, tileC ); } -#ifdef CHAMELEON_USE_CUDA -static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - cuDoubleComplex alpha; + struct cl_zsyrk_args_s clargs; CHAM_tile_t *tileA; - cuDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); + starpu_codelet_unpack_args( cl_arg, &clargs ); RUNTIME_getStream(stream); CUDA_zsyrk( - uplo, trans, n, k, - &alpha, tileA->mat, tileA->ld, - &beta, tileC->mat, tileC->ld, - stream); + clargs.uplo, clargs.trans, clargs.n, clargs.k, + (cuDoubleComplex*)&(clargs.alpha), + tileA->mat, tileA->ld, + (cuDoubleComplex*)&(clargs.beta), + tileC->mat, tileC->ld, + stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -78,57 +81,74 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, uplo, n, n, nb, beta, C, Cm, Cn ); } - (void)nb; - struct starpu_codelet *codelet = &cl_zsyrk; - void (*callback)(void*) = options->profiling ? cl_zsyrk_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + struct cl_zsyrk_args_s clargs = { + .uplo = uplo, + .trans = trans, + .n = n, + .k = k, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + .beta = beta, + .tileC = C->get_blktile( C, Cm, Cn ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid, accessC; + char *cl_name = "zsyrk"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(C, Cm, Cn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_zsyrk_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zsyrk, + + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_zsyrk_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zsyrk", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index 689cb6254ed2b8956505108665977176761c8278..2cbaa3aeff6fdc406006f8bfbaeccce107267dd8 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -21,30 +21,38 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztradd_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_ztradd_args_s { cham_uplo_t uplo; cham_trans_t trans; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileB; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_ztradd_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_ztradd_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &M, &N, &alpha, &beta); - TCORE_ztradd(uplo, trans, M, N, alpha, tileA, beta, tileB); - return; + + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_ztradd( clargs.uplo, clargs.trans, clargs.m, clargs.n, + clargs.alpha, tileA, clargs.beta, tileB ); } #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(ztradd, cl_ztradd_cpu_func) +CODELETS_CPU( ztradd, cl_ztradd_cpu_func ) void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, @@ -56,34 +64,54 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, beta, B, Bm, Bn ); } - struct starpu_codelet *codelet = &cl_ztradd; - void (*callback)(void*) = options->profiling ? cl_zgeadd_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; - int accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW; + struct cl_ztradd_args_s clargs = { + .uplo = uplo, + .trans = trans, + .m = m, + .n = n, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + .beta = beta, + .tileB = B->get_blktile( B, Bm, Bn ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid, accessB; + char *cl_name = "ztradd"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_ztradd_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Reduce the B access if needed */ + accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessB, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_ztradd, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_ztradd_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + accessB, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztradd", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 354e5f454d84d9268ae4df9eed5791a231d0c942..099e5ca8a987135094c2c675d3756f7328c94df1 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -25,51 +25,53 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_ztrmm_args_s { cham_side_t side; cham_uplo_t uplo; cham_trans_t transA; cham_diag_t diag; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_ztrmm_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_ztrmm_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha); - TCORE_ztrmm(side, uplo, - transA, diag, - M, N, - alpha, tileA, - tileB); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_ztrmm( clargs.side, clargs.uplo, clargs.transA, clargs.diag, + clargs.m, clargs.n, clargs.alpha, tileA, tileB ); } #ifdef CHAMELEON_USE_CUDA -static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) +static void +cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { - cham_side_t side; - cham_uplo_t uplo; - cham_trans_t transA; - cham_diag_t diag; - int M; - int N; - cuDoubleComplex alpha; + struct cl_ztrmm_args_s clargs; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha); + + starpu_codelet_unpack_args( cl_arg, &clargs ); RUNTIME_getStream(stream); CUDA_ztrmm( - side, uplo, transA, diag, M, N, &alpha, + clargs.side, clargs.uplo, clargs.transA, clargs.diag, + clargs.m, clargs.n, + (cuDoubleComplex*)&(clargs.alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, stream ); @@ -80,58 +82,66 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ - /* * Codelet definition */ -CODELETS(ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - if ( alpha == 0. ) { - return INSERT_TASK_zlaset( options, ChamUpperLower, m, n, - alpha, alpha, B, Bm, Bn ); - } - - (void)nb; - struct starpu_codelet *codelet = &cl_ztrmm; - void (*callback)(void*) = options->profiling ? cl_ztrmm_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_ztrmm_args_s clargs = { + .side = side, + .uplo = uplo, + .transA = transA, + .diag = diag, + .m = m, + .n = n, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + .tileB = B->get_blktile( B, Bm, Bn ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "ztrmm"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_ztrmm_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &transA, sizeof(int), - STARPU_VALUE, &diag, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_ztrmm, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_ztrmm_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztrmm", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 694f0a99bbd5b7638a9c81829e740d402d92375a..e7ad01a43e6745bc49ecaee676d74cb3f6180f49 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrsm StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -27,9 +25,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_ztrsm_args_s { cham_side_t side; cham_uplo_t uplo; cham_trans_t transA; @@ -39,38 +35,43 @@ static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_ztrsm_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_ztrsm_args_s clargs; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha); - TCORE_ztrsm(side, uplo, - transA, diag, - m, n, - alpha, tileA, - tileB); + + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_ztrsm( clargs.side, clargs.uplo, clargs.transA, clargs.diag, + clargs.m, clargs.n, clargs.alpha, tileA, tileB ); } #ifdef CHAMELEON_USE_CUDA -static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) +static void +cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { - cham_side_t side; - cham_uplo_t uplo; - cham_trans_t transA; - cham_diag_t diag; - int m; - int n; - cuDoubleComplex alpha; + struct cl_ztrsm_args_s clargs; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha); + + starpu_codelet_unpack_args( cl_arg, &clargs ); RUNTIME_getStream(stream); CUDA_ztrsm( - side, uplo, transA, diag, m, n, &alpha, + clargs.side, clargs.uplo, clargs.transA, clargs.diag, + clargs.m, clargs.n, + (cuDoubleComplex*)&(clargs.alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, stream ); @@ -81,52 +82,66 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - (void)nb; - struct starpu_codelet *codelet = &cl_ztrsm; - void (*callback)(void*) = options->profiling ? cl_ztrsm_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_ztrsm_args_s clargs = { + .side = side, + .uplo = uplo, + .transA = transA, + .diag = diag, + .m = m, + .n = n, + .alpha = alpha, + .tileA = A->get_blktile( A, Am, An ), + .tileB = B->get_blktile( B, Bm, Bn ), + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "ztrsm"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_ztrsm_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &transA, sizeof(int), - STARPU_VALUE, &diag, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_ztrsm, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_ztrsm_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztrsm", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 01b14eef48612375cd0042ddda0f3da191163e91..9b2b0e5e3ded41d29fbfb272d95eada319dade35 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrtri StarPU codelet * * @version 1.0.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -27,24 +25,31 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_ztrtri_args_s { cham_uplo_t uplo; cham_diag_t diag; - int N; + int n; CHAM_tile_t *tileA; int iinfo; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_ztrtri_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_ztrtri_args_s clargs; + CHAM_tile_t *tileA; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &iinfo, &sequence, &request); - TCORE_ztrtri(uplo, diag, N, tileA, &info); - if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); + starpu_codelet_unpack_args( cl_arg, &clargs ); + TCORE_ztrtri( clargs.uplo, clargs.diag, clargs.n, tileA, &info ); + + if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -52,43 +57,55 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) /* * Codelet definition */ -CODELETS_CPU(ztrtri, cl_ztrtri_cpu_func) +CODELETS_CPU( ztrtri, cl_ztrtri_cpu_func ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_diag_t diag, - int n, int nb, + cham_uplo_t uplo, cham_diag_t diag, int n, int nb, const CHAM_desc_t *A, int Am, int An, int iinfo ) { - (void)nb; - struct starpu_codelet *codelet = &cl_ztrtri; - void (*callback)(void*) = options->profiling ? cl_ztrtri_callback : NULL; - starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); - int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + struct cl_ztrtri_args_s clargs = { + .uplo = uplo, + .diag = diag, + .n = n, + .tileA = A->get_blktile( A, Am, An ), + .iinfo = iinfo, + .sequence = options->sequence, + .request = options->request, + }; + void (*callback)(void*); + RUNTIME_request_t *request = options->request; + starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); + int workerid; + char *cl_name = "ztrtri"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + /* Callback fro profiling information */ + callback = options->profiling ? cl_ztrtri_callback : NULL; + + /* Fix the worker id */ + workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &diag, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &iinfo, sizeof(int), - STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), - STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_ztrtri, + /* Task codelet arguments */ + STARPU_VALUE, &clargs, sizeof(struct cl_ztrtri_args_s), + STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "ztrtri", + STARPU_NAME, cl_name, #endif - 0); + + 0 ); + + (void)nb; } diff --git a/tools/bench/plafrim/run.sh b/tools/bench/plafrim/run.sh index b3ea6b313a3a9eb73eb5fbfdd670026672fade3b..3234c2bb8a8ecf0943f2e8c7d11fcb5bdb8cc2e3 100755 --- a/tools/bench/plafrim/run.sh +++ b/tools/bench/plafrim/run.sh @@ -4,6 +4,10 @@ echo "######################### Chameleon benchmarks #########################" set -x +# Unset the binding environment of the CI for this specific case +unset STARPU_MPI_NOBIND +unset STARPU_WORKERS_NOBIND + # to avoid a lock during fetching chameleon branch in parallel export XDG_CACHE_HOME=/tmp/guix-$$