diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index c72ed710e0d5d3f9cc20339c3d1e9d75904db1c8..32346a6e8805d671620a8722ba2a0c7852a97815 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -104,7 +104,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempDkm, tempDkn, A->nb, + ChamUpper, tempDkm, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index a8c45399f82897c073438226d45157cd2c8d0eab..b69b84b50d7c2a0ce4359e83e9bdb475a8bf2cc1 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -115,7 +115,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_zlacpy( &options, - ChamUpper, tempDkm, tempDpn, A->nb, + ChamUpper, tempDkm, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index 5d7fae952cc43166f09384730c17d05d27ae4b49..d192c978c2345cf5434add595606d3734c18a46e 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -107,7 +107,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zlacpy( &options, - ChamUpper, tempDkm, tempDNn, A->nb, + ChamUpper, tempDkm, tempDNn, A(k, N), D(k, N) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgemm.c b/compute/pzgemm.c index b2a71d51163d12ba390896c906a9efc7399dfc1f..66a79bcde0a26f21c9e18f5827695a7b36e4fbb6 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -72,7 +72,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran if ( transA == ChamNoTrans ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, A( m, k ), WA( m, (k % C->q) + lq ) ); @@ -81,7 +81,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, 
cham_tran for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -89,7 +89,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran else { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempmm, C->mb, + ChamUpperLower, tempkk, tempmm, A( k, m ), WA( m, (k % C->q) + lq ) ); @@ -98,7 +98,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempmm, C->mb, + ChamUpperLower, tempkk, tempmm, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -112,7 +112,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran if ( transB == ChamNoTrans ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, B( k, n ), WB( (k % C->p) + lp, n ) ); @@ -121,7 +121,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } @@ -129,7 +129,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran else { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempnn, tempkk, C->mb, + ChamUpperLower, tempnn, tempkk, B( n, k ), WB( (k % C->p) + lp, n ) ); @@ -138,7 +138,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempnn, tempkk, C->mb, + ChamUpperLower, tempnn, tempkk, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } diff --git a/compute/pzgenm2.c b/compute/pzgenm2.c index 
ec1c9cfc7a1033838aad9bdb755ff6eb4992bc12..c7b209946b0e18d713d80fa67bf0249d15e4d169 100644 --- a/compute/pzgenm2.c +++ b/compute/pzgenm2.c @@ -144,7 +144,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, NRMX(0, 0), NRMX(m, n) ); } @@ -200,7 +200,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, #else INSERT_TASK_zlacpy( &options, - ChamUpperLower, 1, tempnn, tempnn, + ChamUpperLower, 1, tempnn, DROW( 0, n ), X( 0, n ) ); #endif @@ -210,7 +210,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for (m = 1; m < A->p; m++) { INSERT_TASK_zlacpy( &options, - ChamUpperLower, 1, tempnn, tempnn, + ChamUpperLower, 1, tempnn, X( 0, n ), X( m, n ) ); } @@ -263,7 +263,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for (k = 1; k < A->q; k++) { INSERT_TASK_zlacpy( &options, - ChamUpperLower, tempmm, 1, tempmm, + ChamUpperLower, tempmm, 1, SX( m, 0 ), SX( m, k ) ); } @@ -298,7 +298,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for (k = 1; k < A->p; k++) { INSERT_TASK_zlacpy( &options, - ChamUpperLower, 1, tempnn, tempnn, + ChamUpperLower, 1, tempnn, X( 0, n ), X( k, n ) ); } @@ -340,7 +340,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for(n = 1; n < A->q; n++) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, NRMX( myp, 0 ), NRMX( myp, n ) ); } @@ -381,7 +381,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, for(m = 1; m < A->p; m++) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, NRMSX( 0, myq ), NRMSX( m, myq ) ); } diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 9baf7ae0028a7fb00bd1dd2a7b995fc3bc4074d2..a7637392de7071440c77ae8e17d30a41ea879605 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -98,7 +98,7 @@ 
void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempDkn, A->nb, + ChamLower, tempDkm, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index accf33506175e76baa0dad7319b4919b00afbcc1..a4e08e061087e47896d9586013b44878c0283c9c 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -77,7 +77,7 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, INSERT_TASK_zlacpy( options, - ChamLower, tempDpm, tempDkn, A->nb, + ChamLower, tempDpm, tempDkn, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 8cd96c65cd87c4b47201a6d48be55561a1f56f47..21c1b43ed4ccaf04c9b7a484b64768bfe2260c44 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -104,7 +104,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempDkn, A->nb, + ChamLower, tempDMm, tempDkn, A(M, k), D(M, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c index 69505a5c4f1528d657d234b5e4f396229029607d..1b61fa1e62042f47ea6264feaeaa2639a967e3d0 100644 --- a/compute/pzgetrf_incpiv.c +++ b/compute/pzgetrf_incpiv.c @@ -97,7 +97,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, - ChamUpperLower, tempkm, tempkn, A->nb, + ChamUpperLower, tempkm, tempkn, A(k, k), D(k)); #endif diff --git a/compute/pzhemm.c b/compute/pzhemm.c index 175f2ce2007c388f6b371b4c8aed5e77ac592e78..75f1ab66be514b1eaf22f0f47efd8321cbcebc31 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -88,7 +88,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, 
INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, A( Am, Ak ), WA( m, (k % C->q) + lq ) ); @@ -97,7 +97,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -110,7 +110,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, B( k, n ), WB( (k % C->p) + lp, n ) ); @@ -119,7 +119,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } @@ -205,7 +205,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, B( m, k ), WA( m, (k % C->q) + lq ) ); @@ -214,7 +214,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -245,7 +245,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, A( Ak, An ), WB( (k % C->p) + lp, n ) ); @@ -254,7 +254,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, WB( ((k+p-1) % C->p) + lp, n ), WB( 
((k+p) % C->p) + lp, n ) ); } diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c index 8522f39d6e085ad0c9c513491978712948157a39..600e1ec7be7d036b474c97ea8f052237de4d5d24 100644 --- a/compute/pzhetrd_he2hb.c +++ b/compute/pzhetrd_he2hb.c @@ -120,7 +120,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, - ChamLower, tempkm, tempkn, A->nb, + ChamLower, tempkm, tempkn, A(k+1, k), E(k+1, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -271,7 +271,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, - ChamUpper, tempkm, tempkn, A->nb, + ChamUpper, tempkm, tempkn, A(k, k+1), E(k, k+1) ); #if defined(CHAMELEON_USE_CUDA) @@ -411,7 +411,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, for (k = 1; k < A->nt; k++){ tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; INSERT_TASK_zlacpy( &options, - uplo, tempkn, tempkn, A->mb, + uplo, tempkn, tempkn, D(k), A(k, k)); } diff --git a/compute/pzlacpy.c b/compute/pzlacpy.c index 1a2307992c54f65841a088c8dd40f41742c61666..280ff84731420ee417fdbf55e926a5f9ad17b8b9 100644 --- a/compute/pzlacpy.c +++ b/compute/pzlacpy.c @@ -53,7 +53,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpper, - X, Y, A->mb, + X, Y, A(m, m), B(m, m)); } @@ -62,7 +62,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpperLower, - X, Y, A->mb, + X, Y, A(m, n), B(m, n)); } @@ -79,7 +79,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamLower, - X, Y, A->mb, + X, Y, A(m, m), B(m, m)); } @@ -88,7 +88,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, INSERT_TASK_zlacpy( &options, ChamUpperLower, - X, Y, A->mb, + X, Y, A(m, n), B(m, n)); } @@ -106,7 +106,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, 
INSERT_TASK_zlacpy( &options, ChamUpperLower, - X, Y, A->mb, + X, Y, A(m, n), B(m, n)); } diff --git a/compute/pzlange.c b/compute/pzlange.c index 4acf29aa1c5e1edcf15b58f29eda0302dcbd4410..833b0f6953a1e2962ca98b83e4f0e06a0e690538 100644 --- a/compute/pzlange.c +++ b/compute/pzlange.c @@ -507,7 +507,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, W( &Welt, 0, 0 ), W( &Welt, m, n ) ); } } diff --git a/compute/pzlansy.c b/compute/pzlansy.c index 5bb6b62425dbafbbb1b47917094bb4f55bc8ff4e..1472f0a69f30c44707c88cb5f730e007d7a97537 100644 --- a/compute/pzlansy.c +++ b/compute/pzlansy.c @@ -413,7 +413,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, - ChamUpperLower, 1, 1, 1, + ChamUpperLower, 1, 1, W( &Welt, 0, 0 ), W( &Welt, m, n )); } } diff --git a/compute/pzsymm.c b/compute/pzsymm.c index 27adf0cdfb541371f367aad4131703c747326bb6..2f2fa1997472bc2fc56002b5bc7cf1df12d48676 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -88,7 +88,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, A( Am, Ak ), WA( m, (k % C->q) + lq ) ); @@ -97,7 +97,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempam, tempak, C->mb, + ChamUpperLower, tempam, tempak, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -110,7 +110,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, B( k, n ), WB( (k % C->p) + lp, n ) ); @@ -119,7 +119,7 @@ chameleon_pzsymm_summa_left( 
CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, + ChamUpperLower, tempkk, tempnn, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } @@ -205,7 +205,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, B( m, k ), WA( m, (k % C->q) + lq ) ); @@ -214,7 +214,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempmm, tempkk, C->mb, + ChamUpperLower, tempmm, tempkk, WA( m, ((k+q-1) % C->q) + lq ), WA( m, ((k+q) % C->q) + lq ) ); } @@ -245,7 +245,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, A( Ak, An ), WB( (k % C->p) + lp, n ) ); @@ -254,7 +254,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempak, tempan, C->mb, + ChamUpperLower, tempak, tempan, WB( ((k+p-1) % C->p) + lp, n ), WB( ((k+p) % C->p) + lp, n ) ); } diff --git a/compute/pzunglq.c b/compute/pzunglq.c index f65f73e60ac098915495872d52f5b65a1c5fe769..42f623cb28a63928221b5e7d7118b28d749e0833 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -123,7 +123,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index 1a15e384ed920a25b6748a37dc67bd85aed2e24f..1f1963cee614ddaade991adcd4f536a16be19b6a 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -137,7 +137,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index 397b5f1e88a7479d1db78fbad77c184f6b029397..f904846cd8bc8aa2888e1dd24e5758d8ad5fd972 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -150,7 +150,7 @@ void chameleon_pzunglqrh( int genD, int BS, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDNn, A->nb, + ChamUpper, tempkmin, tempDNn, A(k, N), D(k, N) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzungqr.c b/compute/pzungqr.c index c7ee3f5ddf49bb71c3aaf7a4f9e0c9e4367fce85..cb4b8e967c0d77c42380699090a234bea0f6afea 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -123,7 +123,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index 16fbefb8dea33c5ca32d10f0e35c9edbc4837db6..2f5c24983acbde9e159f9038849f8521f003bdbd 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -114,7 +114,7 @@ void chameleon_pzungqr_param_step( int genD, cham_uplo_t uplo, int k, int ib, int tempDmm = m == D->mt-1 ? 
D->m - m * D->mb : D->mb; INSERT_TASK_zlacpy( options, - ChamLower, tempDmm, tempkmin, A->nb, + ChamLower, tempDmm, tempkmin, A(m, k), D(m, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index d500fe2db7c750f8ff4ead41da200a1c03dac2f8..e48b37ba13adbcf80a3d8fac09b512cc22adf4c1 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -150,7 +150,7 @@ void chameleon_pzungqrrh( int genD, int BS, int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; INSERT_TASK_zlacpy( &options, - ChamLower, tempDMm, tempkmin, A->nb, + ChamLower, tempDMm, tempkmin, A(M, k), D(M, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index 9edd667d7e74f1d06cd8007a17de9752b68ee7cf..43e695fc6748ae876a55d17006291bf0c34adecc 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -107,7 +107,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -202,7 +202,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -271,7 +271,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -318,7 +318,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDkn, A->nb, + ChamUpper, tempkmin, tempDkn, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index b01b7e7a3438f1a84cbd45f7333ac54b9219640d..3acb7af525aa50937a98e1e4fc42dd11c73ffc7c 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -106,7 +106,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -237,7 +237,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -331,7 +331,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -383,7 +383,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 8d9ad2ee5cf66314372b026d52313c9de29efda8..af977675d0905c4b2f5723a23cffb7e7ad3643aa 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -108,7 +108,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, 
tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -263,7 +263,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -367,7 +367,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) @@ -416,7 +416,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempDpn, A->nb, + ChamUpper, tempkmin, tempDpn, A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index d9f5e1850e53e589d7aedaa2ca2552dc692df24e..17839a5412c1cdf3addca474310bee846359d280 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -108,7 +108,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -203,7 +203,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -273,7 +273,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) @@ -320,7 +320,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, - ChamLower, tempDkm, tempkmin, A->nb, + ChamLower, tempDkm, tempkmin, A(k, k), D(k) ); #if defined(CHAMELEON_USE_CUDA) diff --git 
a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index ee2d0da7eaacefd456c8b764c6cb0601415c555b..e2507120b9f4c3c29821adc00c03e6dce8cf150c 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -106,7 +106,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -237,7 +237,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -330,7 +330,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -382,7 +382,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 98a816cd1866d56303a7ea09224c073a98d120aa..6ec5691cea7efc15a566dcbbf09bc3bf9dfe1867 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -107,7 +107,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -260,7 +260,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -365,7 +365,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans 
INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) @@ -415,7 +415,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, - ChamLower, tempDpm, tempkmin, A->nb, + ChamLower, tempDpm, tempkmin, A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) diff --git a/compute/zplghe.c b/compute/zplghe.c index efbd069113f1f9415a6135cd2442ad71c2b7f116..2a097f637cd1de991873dea7dd51bda98d503301 100644 --- a/compute/zplghe.c +++ b/compute/zplghe.c @@ -31,7 +31,8 @@ * * @ingroup CHAMELEON_Complex64_t * - * CHAMELEON_zplghe - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles. + * @brief Generate a random hermitian (positive definite if 'bump' is large + * enough) half-matrix by tiles. * ******************************************************************************* * @@ -136,11 +137,12 @@ int CHAMELEON_zplghe( double bump, cham_uplo_t uplo, int N, * * @ingroup CHAMELEON_Complex64_t_Tile * - * CHAMELEON_zplghe_Tile - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles. - * Tile equivalent of CHAMELEON_zplghe(). - * Operates on matrices stored by tiles. - * All matrices are passed through descriptors. - * All dimensions are taken from the descriptors. + * @brief Generate a random hermitian (positive definite if 'bump' is large + * enough) half-matrix by tiles. + * + * Tile equivalent of CHAMELEON_zplghe(). Operates on matrices stored by + * tiles. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. 
* ******************************************************************************* * diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index dcec74350500739b6fdbed7c38cd09f9588447e7..ef9d191ac03ec2a4d9b859a138f178b385e05716 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -368,7 +368,7 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t } void -TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ) +TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB ) { assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); diff --git a/coreblas/compute/core_ztile_empty.c b/coreblas/compute/core_ztile_empty.c index 30347d3320ba80c657ffa39795021357a43d9b7f..3af4ac44729e4ad08bbbc6d7f0e1a170a1e296d1 100644 --- a/coreblas/compute/core_ztile_empty.c +++ b/coreblas/compute/core_ztile_empty.c @@ -264,7 +264,7 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t } void -TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ) +TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB ) { return; } diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index 74443b88724db7a787cb93be514bdd7108372cb3..8779531ad3bdb8786d8e5078c152e94420020dfc 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -43,7 +43,7 @@ int TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_ti int TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); #endif void 
TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); -void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ); +void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB ); void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); #if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 85f0e30ee377818e24ac0e20362e9720b5bfda42..9a26496f199568818f63751ef98a3c58d46b725f 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -126,13 +126,13 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int mb, + cham_uplo_t uplo, int m, int n, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int mb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ); + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); void INSERT_TASK_zlange( const RUNTIME_option_t *options, cham_normtype_t norm, int M, int N, int NB, const CHAM_desc_t *A, int Am, int An, diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c index 25de93f94bbba0cfeb4d66f21b27a85ab572161b..8c5696d6ac417d56031cafd369a48bcc6f33c3ea 100644 --- 
a/runtime/openmp/codelets/codelet_zlacpy.c +++ b/runtime/openmp/codelets/codelet_zlacpy.c @@ -20,10 +20,10 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) +void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); @@ -31,24 +31,29 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, assert( tileA->format & CHAMELEON_TILE_FULLRANK ); assert( tileB->format & CHAMELEON_TILE_FULLRANK ); -#pragma omp task firstprivate( uplo, m, n, displA, tileA, displB, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) +#pragma omp task firstprivate( uplo, m, n, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - CHAMELEON_Complex64_t *A = tileA->mat; - CHAMELEON_Complex64_t *B = tileB->mat; - - CORE_zlacpy( uplo, m, n, A + displA, tileA->ld, B + displB, tileB->ld ); + TCORE_zlacpy( uplo, m, n, tileA, tileB ); } (void)options; - (void)nb; } -void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ) +void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK 
); + +#pragma omp task firstprivate( uplo, m, n, displA, tileA, lda, displB, tileB, ldb ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + { + TCORE_zlacpyx( uplo, m, n, displA, tileA, lda, displB, tileB, ldb ); + } + + (void)options; } diff --git a/runtime/parsec/codelets/codelet_zlacpy.c b/runtime/parsec/codelets/codelet_zlacpy.c index 5c047798d8227eb7618d6490d08ab00e2d4abee2..65b9bc0542dae69579db42ef476705cebfa4bca7 100644 --- a/runtime/parsec/codelets/codelet_zlacpy.c +++ b/runtime/parsec/codelets/codelet_zlacpy.c @@ -22,6 +22,48 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" +static inline int +CORE_zlacpy_parsec( parsec_execution_stream_t *context, + parsec_task_t *this_task ) +{ + cham_uplo_t uplo; + int M; + int N; + CHAMELEON_Complex64_t *A; + int LDA; + CHAMELEON_Complex64_t *B; + int LDB; + + parsec_dtd_unpack_args( + this_task, &uplo, &M, &N, &A, &LDA, &B, &LDB ); + + CORE_zlacpy( uplo, M, N, A, LDA, B, LDB ); + + (void)context; + return PARSEC_HOOK_RETURN_DONE; +} + +void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + parsec_dtd_taskpool_insert_task( + PARSEC_dtd_taskpool, CORE_zlacpy_parsec, options->priority, "lacpy", + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &(tileA->ld), VALUE, + PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, + sizeof(int), &(tileB->ld), VALUE, + PARSEC_DTD_ARG_END ); +} + static inline int CORE_zlacpyx_parsec( parsec_execution_stream_t *context, 
parsec_task_t *this_task ) @@ -39,42 +81,29 @@ CORE_zlacpyx_parsec( parsec_execution_stream_t *context, parsec_dtd_unpack_args( this_task, &uplo, &M, &N, &displA, &A, &LDA, &displB, &B, &LDB ); - CORE_zlacpy( uplo, M, N, A + (displA), LDA, B + (displB), LDB ); + CORE_zlacpy( uplo, M, N, A + displA, LDA, B + displB, LDB ); (void)context; return PARSEC_HOOK_RETURN_DONE; } void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); - CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlacpyx_parsec, options->priority, "lacpy", - sizeof(cham_uplo_t), &uplo, VALUE, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &displA, VALUE, - PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &(tileA->ld), VALUE, - sizeof(int), &displB, VALUE, - PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, - sizeof(int), &(tileB->ld), VALUE, + sizeof(cham_uplo_t), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &displA, VALUE, + PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, + sizeof(int), &lda, VALUE, + sizeof(int), &displB, VALUE, + PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, + sizeof(int), &ldb, VALUE, PARSEC_DTD_ARG_END ); - (void)nb; -} - -void 
INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ) -{ - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); } diff --git a/runtime/quark/codelets/codelet_zlacpy.c b/runtime/quark/codelets/codelet_zlacpy.c index 11992d320efc8269d9e2a5ea62f9f409849b7487..90a9ae5fbca18270d2e75f4e95c408c14096ba93 100644 --- a/runtime/quark/codelets/codelet_zlacpy.c +++ b/runtime/quark/codelets/codelet_zlacpy.c @@ -29,50 +29,66 @@ static inline void CORE_zlacpy_quark(Quark *quark) { cham_uplo_t uplo; - int M; - int N; - int displA; - CHAM_tile_t *tileA; - CHAMELEON_Complex64_t *A; - int displB; - CHAM_tile_t *tileB; - CHAMELEON_Complex64_t *B; + int M, N; + int LDA, LDB; + CHAM_tile_t *tileA, *tileB; - quark_unpack_args_7(quark, uplo, M, N, displA, tileA, displB, tileB); + quark_unpack_args_5(quark, uplo, M, N, tileA, tileB); assert( tileA->format & CHAMELEON_TILE_FULLRANK ); assert( tileB->format & CHAMELEON_TILE_FULLRANK ); - A = tileA->mat; - B = tileB->mat; - CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); + TCORE_zlacpy( uplo, M, N, tileA, tileB ); } -void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) +void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LACPY; QUARK_Insert_Task(opt->quark, CORE_zlacpy_quark, (Quark_Task_Flags*)opt, - sizeof(int), &uplo, VALUE, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(int), &displA, VALUE, + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, sizeof(void*), RTBLKADDR(A, 
CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &displB, VALUE, sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, 0); - (void)nb; } -void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn ) +static inline void CORE_zlacpyx_quark(Quark *quark) { - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); + cham_uplo_t uplo; + int M, N; + int displA, displB; + int LDA, LDB; + CHAM_tile_t *tileA, *tileB; + + quark_unpack_args_9(quark, uplo, M, N, displA, tileA, LDA, displB, tileB, LDB); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + TCORE_zlacpyx( uplo, M, N, displA, tileA, LDA, displB, tileB, LDB ); +} + +void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_LACPY; + QUARK_Insert_Task(opt->quark, CORE_zlacpyx_quark, (Quark_Task_Flags*)opt, + sizeof(int), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &displA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(int), &lda, VALUE, + sizeof(int), &displB, VALUE, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, + sizeof(int), &ldb, VALUE, + 0); } diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 7302ec42c94f7aa69adfa5692fa2ded758b797c9..af49ba0c96514250b362b41dc839cb674a9c5262 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -52,9 +52,7 @@ cl_zlacpy_cpu_func(void *descr[], void *cl_arg) assert( clargs->displA == 0 ); assert( clargs->displB == 0 ); - 
CHAMELEON_Complex64_t *A = tileA->mat; - CHAMELEON_Complex64_t *B = tileB->mat; - // CORE_zlacpy( clargs->uplo, clargs->m, clargs->n, A + clargs->displA, tileA->ld, B + clargs->displB, tileB->ld ); + TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB ); } @@ -68,8 +66,8 @@ cl_zlacpyx_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA, clargs->displB, - tileA, clargs->lda, tileB, clargs->ldb ); + TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA, + tileA, clargs->lda, clargs->displB, tileB, clargs->ldb ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -80,9 +78,9 @@ CODELETS_CPU( zlacpy, cl_zlacpy_cpu_func ) CODELETS_CPU( zlacpyx, cl_zlacpyx_cpu_func ) void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, - int displB, const CHAM_desc_t *B, int Bm, int Bn ) + cham_uplo_t uplo, int m, int n, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) { struct cl_zlacpy_args_s *clargs = NULL; void (*callback)(void*); @@ -105,8 +103,8 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, clargs->displB = displB; clargs->tileA = A->get_blktile( A, Am, An ); clargs->tileB = B->get_blktile( B, Bm, Bn ); - clargs->lda = clargs->tileA->ld; - clargs->ldb = clargs->tileB->ld; + clargs->lda = lda; + clargs->ldb = ldb; } /* Callback fro profiling information */ @@ -129,12 +127,10 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, #endif 0 ); - - (void)nb; } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, + cham_uplo_t uplo, int m, int n, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { @@ -183,6 +179,4 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, 
#endif 0 ); - - (void)nb; }