From b904e524a38e72f446fc3a6856008e46b4371f07 Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Wed, 6 Apr 2022 16:12:12 +0200 Subject: [PATCH] core: remove nb from args of INSERT_TASK_zlacpy[x] --- compute/pzgelqf.c | 2 +- compute/pzgelqf_param.c | 2 +- compute/pzgelqfrh.c | 2 +- compute/pzgemm.c | 16 ++--- compute/pzgenm2.c | 14 ++-- compute/pzgeqrf.c | 2 +- compute/pzgeqrf_param.c | 2 +- compute/pzgeqrfrh.c | 2 +- compute/pzgetrf_incpiv.c | 2 +- compute/pzhemm.c | 16 ++--- compute/pzhetrd_he2hb.c | 6 +- compute/pzlacpy.c | 10 +-- compute/pzlange.c | 2 +- compute/pzlansy.c | 2 +- compute/pzsymm.c | 16 ++--- compute/pzunglq.c | 2 +- compute/pzunglq_param.c | 2 +- compute/pzunglqrh.c | 2 +- compute/pzungqr.c | 2 +- compute/pzungqr_param.c | 2 +- compute/pzungqrrh.c | 2 +- compute/pzunmlq.c | 8 +-- compute/pzunmlq_param.c | 8 +-- compute/pzunmlqrh.c | 8 +-- compute/pzunmqr.c | 8 +-- compute/pzunmqr_param.c | 8 +-- compute/pzunmqrrh.c | 8 +-- compute/zplghe.c | 14 ++-- coreblas/compute/core_ztile.c | 2 +- coreblas/compute/core_ztile_empty.c | 2 +- coreblas/include/coreblas/coreblas_ztile.h | 2 +- include/chameleon/tasks_z.h | 8 +-- runtime/openmp/codelets/codelet_zlacpy.c | 39 ++++++----- runtime/parsec/codelets/codelet_zlacpy.c | 81 +++++++++++++++------- runtime/quark/codelets/codelet_zlacpy.c | 74 ++++++++++++-------- runtime/starpu/codelets/codelet_zlacpy.c | 24 +++---- 36 files changed, 224 insertions(+), 178 deletions(-) diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index c72ed710e..32346a6e8 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -104,7 +104,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempDkm, tempDkn, A->nb, + ChamUpper, tempDkm, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index a8c45399f..b69b84b50 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -115,7 +115,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_zlacpy( &options, - ChamUpper, tempDkm, tempDpn, A->nb, + ChamUpper, tempDkm, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index 5d7fae952..d192c978c 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -107,7 +107,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zlacpy( &options, - ChamUpper, tempDkm, tempDNn, A->nb, + ChamUpper, tempDkm, tempDNn, A(k, N), D(k, N) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgemm.c b/compute/pzgemm.c index b2a71d511..66a79bcde 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -72,7 +72,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran if ( transA == ChamNoTrans ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, A( m, k ), WA( m, (k % C->q) + lq ) ); @@ -81,7 +81,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -89,7 +89,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran else { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempmm, C->mb, + ChamUpperLower, tempkk, tempmm, A( k, m ), WA( m, (k % C->q) + lq ) ); @@ -98,7 +98,7 @@ chameleon_pzgemm_summa( CHAM_context_t 
*chamctxt, cham_trans_t transA, cham_tran for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempmm, C->mb, + ChamUpperLower, tempkk, tempmm, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -112,7 +112,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran if ( transB == ChamNoTrans ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, B( k, n ), WB( (k % C->p) + lp, n ) ); @@ -121,7 +121,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } @@ -129,7 +129,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran else { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempnn, tempkk, C->mb, + ChamUpperLower, tempnn, tempkk, B( n, k ), WB( (k % C->p) + lp, n ) ); @@ -138,7 +138,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempnn, tempkk, C->mb, + ChamUpperLower, tempnn, tempkk, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } diff --git a/compute/pzgenm2.c b/compute/pzgenm2.c index ec1c9cfc7..c7b209946 100644 --- a/compute/pzgenm2.c +++ b/compute/pzgenm2.c @@ -144,7 +144,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, NRMX(0, 0), NRMX(m, n) ); } @@ -200,7 +200,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, #else INSERT_TASK_zlacpy( &options, - ChamUpperLower, 1, tempnn, tempnn, + ChamUpperLower, 1, tempnn, DROW( 0, n ), X( 0, n ) ); #endif @@ -210,7 +210,7 @@ chameleon_pzgenm2( double 
tol, const CHAM_desc_t *A, double *result, for (m = 1; m < A->p; m++) { INSERT_TASK_zlacpy( &options, - ChamUpperLower, 1, tempnn, tempnn, + ChamUpperLower, 1, tempnn, X( 0, n ), X( m, n ) ); } @@ -263,7 +263,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for (k = 1; k < A->q; k++) { INSERT_TASK_zlacpy( &options, - ChamUpperLower, tempmm, 1, tempmm, + ChamUpperLower, tempmm, 1, SX( m, 0 ), SX( m, k ) ); } @@ -298,7 +298,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for (k = 1; k < A->p; k++) { INSERT_TASK_zlacpy( &options, - ChamUpperLower, 1, tempnn, tempnn, + ChamUpperLower, 1, tempnn, X( 0, n ), X( k, n ) ); } @@ -340,7 +340,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for(n = 1; n < A->q; n++) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, NRMX( myp, 0 ), NRMX( myp, n ) ); } @@ -381,7 +381,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for(m = 1; m < A->p; m++) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, NRMSX( 0, myq ), NRMSX( m, myq ) ); } diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 9baf7ae00..a7637392d 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -98,7 +98,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempDkn, A->nb, + ChamLower, tempDkm, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index accf33506..a4e08e061 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -77,7 +77,7 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, INSERT_TASK_zlacpy( options, - ChamLower, tempDpm, tempDkn, A->nb, + ChamLower, tempDpm, tempDkn, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 8cd96c65c..21c1b43ed 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -104,7 +104,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempDkn, A->nb, + ChamLower, tempDMm, tempDkn, A(M, k), D(M, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c index 69505a5c4..1b61fa1e6 100644 --- a/compute/pzgetrf_incpiv.c +++ b/compute/pzgetrf_incpiv.c @@ -97,7 +97,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, - ChamUpperLower, tempkm, tempkn, A->nb, + ChamUpperLower, tempkm, tempkn, A(k, k), D(k)); #endif diff --git a/compute/pzhemm.c b/compute/pzhemm.c index 175f2ce20..75f1ab66b 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -88,7 +88,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, A( Am, Ak ), WA( m, (k % C->q) + lq ) ); @@ -97,7 +97,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, WA( 
m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -110,7 +110,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, B( k, n ), WB( (k % C->p) + lp, n ) ); @@ -119,7 +119,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } @@ -205,7 +205,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, B( m, k ), WA( m, (k % C->q) + lq ) ); @@ -214,7 +214,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -245,7 +245,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, A( Ak, An ), WB( (k % C->p) + lp, n ) ); @@ -254,7 +254,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c index 8522f39d6..600e1ec7b 100644 --- a/compute/pzhetrd_he2hb.c +++ b/compute/pzhetrd_he2hb.c @@ -120,7 +120,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, - ChamLower, tempkm, tempkn, A->nb, + ChamLower, tempkm, 
tempkn, A(k+1, k), E(k+1, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -271,7 +271,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, - ChamUpper, tempkm, tempkn, A->nb, + ChamUpper, tempkm, tempkn, A(k, k+1), E(k, k+1) ); #if defined(CHAMELEON_USE_CUDA) @@ -411,7 +411,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, for (k = 1; k < A->nt; k++){ tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; INSERT_TASK_zlacpy( &options, - uplo, tempkn, tempkn, A->mb, + uplo, tempkn, tempkn, D(k), A(k, k)); } diff --git a/compute/pzlacpy.c b/compute/pzlacpy.c index 1a2307992..280ff8473 100644 --- a/compute/pzlacpy.c +++ b/compute/pzlacpy.c @@ -53,7 +53,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpper, - X, Y, A->mb, + X, Y, A(m, m), B(m, m)); } @@ -62,7 +62,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpperLower, - X, Y, A->mb, + X, Y, A(m, n), B(m, n)); } @@ -79,7 +79,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamLower, - X, Y, A->mb, + X, Y, A(m, m), B(m, m)); } @@ -88,7 +88,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpperLower, - X, Y, A->mb, + X, Y, A(m, n), B(m, n)); } @@ -106,7 +106,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpperLower, - X, Y, A->mb, + X, Y, A(m, n), B(m, n)); } diff --git a/compute/pzlange.c b/compute/pzlange.c index 4acf29aa1..833b0f695 100644 --- a/compute/pzlange.c +++ b/compute/pzlange.c @@ -507,7 +507,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, W( &Welt, 0, 0 ), W( &Welt, m, n ) ); } } diff --git 
a/compute/pzlansy.c b/compute/pzlansy.c index 5bb6b6242..1472f0a69 100644 --- a/compute/pzlansy.c +++ b/compute/pzlansy.c @@ -413,7 +413,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, W( &Welt, 0, 0 ), W( &Welt, m, n )); } } diff --git a/compute/pzsymm.c b/compute/pzsymm.c index 27adf0cdf..2f2fa1997 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -88,7 +88,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, A( Am, Ak ), WA( m, (k % C->q) + lq ) ); @@ -97,7 +97,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -110,7 +110,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, B( k, n ), WB( (k % C->p) + lp, n ) ); @@ -119,7 +119,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } @@ -205,7 +205,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, B( m, k ), WA( m, (k % C->q) + lq ) ); @@ -214,7 +214,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + 
ChamUpperLower, tempmm, tempkk, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -245,7 +245,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, A( Ak, An ), WB( (k % C->p) + lp, n ) ); @@ -254,7 +254,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } diff --git a/compute/pzunglq.c b/compute/pzunglq.c index f65f73e60..42f623cb2 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -123,7 +123,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index 1a15e384e..1f1963cee 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -137,7 +137,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int tempDpn = p == D->nt-1 ? 
D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index 397b5f1e8..f904846cd 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -150,7 +150,7 @@ void chameleon_pzunglqrh( int genD, int BS, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDNn, A->nb, + ChamUpper, tempkmin, tempDNn, A(k, N), D(k, N) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzungqr.c b/compute/pzungqr.c index c7ee3f5dd..cb4b8e967 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -123,7 +123,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index 16fbefb8d..2f5c24983 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -114,7 +114,7 @@ void chameleon_pzungqr_param_step( int genD, cham_uplo_t uplo, int k, int ib, int tempDmm = m == D->mt-1 ? D->m - m * D->mb : D->mb; INSERT_TASK_zlacpy( options, - ChamLower, tempDmm, tempkmin, A->nb, + ChamLower, tempDmm, tempkmin, A(m, k), D(m, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index d500fe2db..e48b37ba1 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -150,7 +150,7 @@ void chameleon_pzungqrrh( int genD, int BS, int tempDMm = M == D->mt-1 ? 
D->m-M*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A(M, k), D(M, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index 9edd667d7..43e695fc6 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -107,7 +107,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -202,7 +202,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -271,7 +271,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -318,7 +318,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index b01b7e7a3..3acb7af52 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -106,7 +106,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -237,7 +237,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -331,7 +331,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -383,7 +383,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 8d9ad2ee5..af977675d 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -108,7 +108,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -263,7 +263,7 @@ void chameleon_pzunmlqrh( int genD, int BS, 
cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -367,7 +367,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -416,7 +416,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index d9f5e1850..17839a541 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -108,7 +108,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -203,7 +203,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -273,7 +273,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -320,7 +320,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index ee2d0da7e..e2507120b 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -106,7 +106,7 @@ void 
chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -237,7 +237,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -330,7 +330,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -382,7 +382,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 98a816cd1..6ec5691ce 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -107,7 +107,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -260,7 +260,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -365,7 +365,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -415,7 +415,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, 
tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/zplghe.c b/compute/zplghe.c index efbd06911..2a097f637 100644 --- a/compute/zplghe.c +++ b/compute/zplghe.c @@ -31,7 +31,8 @@ * * @ingroup CHAMELEON_Complex64_t * - * CHAMELEON_zplghe - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles. + * @brief Generate a random hermitian (positive definite if 'bump' is large + * enough) half-matrix by tiles. * ******************************************************************************* * @@ -136,11 +137,12 @@ int CHAMELEON_zplghe( double bump, cham_uplo_t uplo, int N, * * @ingroup CHAMELEON_Complex64_t_Tile * - * CHAMELEON_zplghe_Tile - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles. - * Tile equivalent of CHAMELEON_zplghe(). - * Operates on matrices stored by tiles. - * All matrices are passed through descriptors. - * All dimensions are taken from the descriptors. + * @brief Generate a random hermitian (positive definite if 'bump' is large + * enough) half-matrix by tiles. + * + * Tile equivalent of CHAMELEON_zplghe(). Operates on matrices stored by + * tiles. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. 
* ******************************************************************************* * diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index dcec74350..ef9d191ac 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -368,7 +368,7 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t } void -TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ) +TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB ) { assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); diff --git a/coreblas/compute/core_ztile_empty.c b/coreblas/compute/core_ztile_empty.c index 30347d332..3af4ac447 100644 --- a/coreblas/compute/core_ztile_empty.c +++ b/coreblas/compute/core_ztile_empty.c @@ -264,7 +264,7 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t } void -TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ) +TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB ) { return; } diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index 74443b887..8779531ad 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -43,7 +43,7 @@ int TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_ti int TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); #endif void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); -void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int 
LDA, CHAM_tile_t *B, int LDB ); +void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB ); void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); #if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 85f0e30ee..9a26496f1 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -126,13 +126,13 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int mb, + cham_uplo_t uplo, int m, int n, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int mb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ); + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_zlange( const RUNTIME_option_t *options, cham_normtype_t norm, int M, int N, int NB, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c index 25de93f94..8c5696d6a 100644 --- a/runtime/openmp/codelets/codelet_zlacpy.c +++ b/runtime/openmp/codelets/codelet_zlacpy.c @@ -20,10 +20,10 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, 
const CHAM_desc_t *B, int Bm, int Bn ) +void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); @@ -31,24 +31,29 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, assert( tileA->format & CHAMELEON_TILE_FULLRANK ); assert( tileB->format & CHAMELEON_TILE_FULLRANK ); -#pragma omp task firstprivate( uplo, m, n, displA, tileA, displB, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) +#pragma omp task firstprivate( uplo, m, n, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - CHAMELEON_Complex64_t *A = tileA->mat; - CHAMELEON_Complex64_t *B = tileB->mat; - - CORE_zlacpy( uplo, m, n, A + displA, tileA->ld, B + displB, tileB->ld ); + TCORE_zlacpy( uplo, m, n, tileA, tileB ); } (void)options; - (void)nb; } -void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ) +void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + +#pragma omp task firstprivate( uplo, m, n, displA, tileA, lda, displB, tileB, ldb ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + { + TCORE_zlacpyx( uplo, m, n, displA, tileA, lda, displB, tileB, ldb ); + } + + (void)options; } diff --git a/runtime/parsec/codelets/codelet_zlacpy.c b/runtime/parsec/codelets/codelet_zlacpy.c index 
5c047798d..65b9bc054 100644 --- a/runtime/parsec/codelets/codelet_zlacpy.c +++ b/runtime/parsec/codelets/codelet_zlacpy.c @@ -22,6 +22,48 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" +static inline int +CORE_zlacpy_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int M; + int N; + CHAMELEON_Complex64_t *A; + int LDA; + CHAMELEON_Complex64_t *B; + int LDB; + + parsec_dtd_unpack_args( + this_task, &uplo, &M, &N, &A, &LDA, &B, &LDB ); + + CORE_zlacpy( uplo, M, N, A, LDA, B, LDB ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zlacpy_parsec, options->priority, "lacpy", + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, + sizeof(int), &(tileB->ld), VALUE, + PARSEC_DTD_ARG_END ); +} + static inline int CORE_zlacpyx_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -39,42 +81,29 @@ CORE_zlacpyx_parsec( parsec_execution_stream_t *context, parsec_dtd_unpack_args( this_task, &uplo, &M, &N, &displA, &A, &LDA, &displB, &B, &LDB ); - CORE_zlacpy( uplo, M, N, A + (displA), LDA, B + (displB), LDB ); + CORE_zlacpy( uplo, M, N, A + displA, LDA, B + displB, LDB ); (void)context; return PARSEC_HOOK_RETURN_DONE; } void INSERT_TASK_zlacpyx( const 
RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); - CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlacpyx_parsec, options->priority, "lacpy", - sizeof(cham_uplo_t), &uplo, VALUE, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &displA, VALUE, - PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &(tileA->ld), VALUE, - sizeof(int), &displB, VALUE, - PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, - sizeof(int), &(tileB->ld), VALUE, + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &displA, VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &lda, VALUE, + sizeof(int), &displB, VALUE, + PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, + sizeof(int), &ldb, VALUE, PARSEC_DTD_ARG_END ); - (void)nb; -} - -void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ) -{ - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); } diff --git a/runtime/quark/codelets/codelet_zlacpy.c b/runtime/quark/codelets/codelet_zlacpy.c index 11992d320..90a9ae5fb 100644 --- 
a/runtime/quark/codelets/codelet_zlacpy.c +++ b/runtime/quark/codelets/codelet_zlacpy.c @@ -29,50 +29,66 @@ static inline void CORE_zlacpy_quark(Quark *quark) { cham_uplo_t uplo; - int M; - int N; - int displA; - CHAM_tile_t *tileA; - CHAMELEON_Complex64_t *A; - int displB; - CHAM_tile_t *tileB; - CHAMELEON_Complex64_t *B; + int M, N; + int LDA, LDB; + CHAM_tile_t *tileA, *tileB; - quark_unpack_args_7(quark, uplo, M, N, displA, tileA, displB, tileB); + quark_unpack_args_5(quark, uplo, M, N, tileA, tileB); assert( tileA->format & CHAMELEON_TILE_FULLRANK ); assert( tileB->format & CHAMELEON_TILE_FULLRANK ); - A = tileA->mat; - B = tileB->mat; - CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); + TCORE_zlacpy( uplo, M, N, tileA, tileB ); } -void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) +void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LACPY; QUARK_Insert_Task(opt->quark, CORE_zlacpy_quark, (Quark_Task_Flags*)opt, - sizeof(int), &uplo, VALUE, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &displA, VALUE, + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &displB, VALUE, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, 0); - (void)nb; } -void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ) +static inline void CORE_zlacpyx_quark(Quark *quark) { - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); + 
cham_uplo_t uplo; + int M, N; + int displA, displB; + int LDA, LDB; + CHAM_tile_t *tileA, *tileB; + + quark_unpack_args_9(quark, uplo, M, N, displA, tileA, LDA, displB, tileB, LDB); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + TCORE_zlacpyx( uplo, M, N, displA, tileA, LDA, displB, tileB, LDB ); +} + +void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_LACPY; + QUARK_Insert_Task(opt->quark, CORE_zlacpyx_quark, (Quark_Task_Flags*)opt, + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &displA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(int), &lda, VALUE, + sizeof(int), &displB, VALUE, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, + sizeof(int), &ldb, VALUE, + 0); } diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 7302ec42c..af49ba0c9 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -52,9 +52,7 @@ cl_zlacpy_cpu_func(void *descr[], void *cl_arg) assert( clargs->displA == 0 ); assert( clargs->displB == 0 ); - CHAMELEON_Complex64_t *A = tileA->mat; - CHAMELEON_Complex64_t *B = tileB->mat; - // CORE_zlacpy( clargs->uplo, clargs->m, clargs->n, A + clargs->displA, tileA->ld, B + clargs->displB, tileB->ld ); + TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB ); } @@ -68,8 +66,8 @@ cl_zlacpyx_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA, clargs->displB, - tileA, clargs->lda, tileB, clargs->ldb ); + TCORE_zlacpyx(
clargs->uplo, clargs->m, clargs->n, clargs->displA, + tileA, clargs->lda, clargs->displB, tileB, clargs->ldb ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -80,9 +78,9 @@ CODELETS_CPU( zlacpy, cl_zlacpy_cpu_func ) CODELETS_CPU( zlacpyx, cl_zlacpyx_cpu_func ) void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { struct cl_zlacpy_args_s *clargs = NULL; void (*callback)(void*); @@ -105,8 +103,8 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, clargs->displB = displB; clargs->tileA = A->get_blktile( A, Am, An ); clargs->tileB = B->get_blktile( B, Bm, Bn ); - clargs->lda = clargs->tileA->ld; - clargs->ldb = clargs->tileB->ld; + clargs->lda = lda; + clargs->ldb = ldb; } /* Callback fro profiling information */ @@ -129,12 +127,10 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, #endif 0 ); - - (void)nb; } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, + cham_uplo_t uplo, int m, int n, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { @@ -183,6 +179,4 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, #endif 0 ); - - (void)nb; } -- GitLab