diff --git a/compute/pzbuild.c b/compute/pzbuild.c index c8f4f536fe22fabef4722dbbbd00a85bf9a0a772..cd25892b84dc68dcced8794a779029263a2c6667 100644 --- a/compute/pzbuild.c +++ b/compute/pzbuild.c @@ -60,7 +60,6 @@ void chameleon_pzbuild( cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* RUNTIME_option_t options; int m, n; - int ldam; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) @@ -68,7 +67,6 @@ void chameleon_pzbuild( cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* RUNTIME_options_init(&options, chamctxt, sequence, request); for (m = 0; m < A->mt; m++) { - ldam = BLKLDD(A, m); for (n = 0; n < A->nt; n++) { if ( ( uplo == ChamUpper && m <= n ) || @@ -76,7 +74,7 @@ void chameleon_pzbuild( cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* ( uplo == ChamUpperLower ) ) INSERT_TASK_zbuild( &options, - A(m, n), ldam, + A(m, n), user_data, user_build_callback ); } } diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index a85cad0fd91ad8ca14eac15d155e886426a2d73d..c0e9b10052a7da83be3176a4c8b43812f4c9f849 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -41,7 +41,6 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D size_t ws_host = 0; int k, m, n; - int ldak, ldam, lddk; int tempkm, tempkn, tempmm, tempnn; int ib, minMNT; @@ -92,39 +91,36 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); INSERT_TASK_zgelqt( &options, tempkm, tempkn, ib, T->nb, - A(k, k), ldak, - T(k, k), T->mb); + A(k, k), + T(k, k)); if ( genD ) { int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, ChamUpper, tempDkm, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempDkm, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zunmlq( &options, ChamRight, ChamConjTrans, tempmm, tempkn, tempkn, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - A(m, k), ldam); + D(k), + T(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); @@ -139,12 +135,11 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D INSERT_TASK_ztplqt( &options, tempkm, tempnn, 0, ib, T->nb, - A(k, k), ldak, - A(k, n), ldak, - T(k, n), T->mb); + A(k, k), + A(k, n), + T(k, n)); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); RUNTIME_data_migrate( sequence, A(m, k), A->get_rankof( A, m, n ) ); @@ -153,10 +148,10 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D &options, ChamRight, ChamConjTrans, tempmm, tempnn, A->mb, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - A(m, k), ldam, - A(m, n), ldam); + A(k, n), + T(k, n), + A(m, k), + A(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index 511125da43777535f433b7e6b5ed649a02b50d01..5ba6cbe30d25eeebcb68fc94ab8465a404c7aff6 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -41,7 +41,6 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int k, m, n, i, p; int K, L, nbgeqrt; - int ldak, ldam, lddk; int tempkmin, tempkm, tempnn, tempmm, temppn; int ib, node, nbtiles, *tiles; @@ -89,8 +88,6 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t 
RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); /* The number of geqrt to apply */ nbgeqrt = qrtree->getnbgeqrf(qrtree, k); @@ -104,8 +101,8 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_zgelqt( &options, tempkm, temppn, ib, T->nb, - A( k, p), ldak, - T(k, p), T->mb); + A( k, p), + T(k, p)); if ( genD ) { int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; @@ -114,27 +111,26 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_zlacpy( &options, ChamUpper, tempDkm, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempDkm, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zunmlq( &options, ChamRight, ChamConjTrans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - A(m, p), ldam); + D(k, p), + T(k, p), + A(m, p)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -167,13 +163,12 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_ztplqt( &options, tempkm, tempnn, chameleon_min(L, tempkm), ib, T->nb, - A(k, p), ldak, - A(k, n), ldak, - T(k, n), T->mb); + A(k, p), + A(k, n), + T(k, n)); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); node = A->get_rankof( A, m, n ); RUNTIME_data_migrate( sequence, A(m, p), node ); @@ -183,10 +178,10 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamRight, ChamConjTrans, tempmm, tempnn, tempkm, L, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - A(m, p), ldam, - A(m, n), ldam); + A(k, n), + T(k, n), + A(m, p), + A(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index 8eb69f56aee4cff2ad5c431607bb3683cad77b58..2b531416f21d3057026a75d3aeabc4f6d10e8672 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -44,7 +44,6 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM int k, m, n; int K, N, RD; - int ldak, ldam, lddk; int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn; int ib, node; @@ -91,8 +90,6 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; @@ -100,8 +97,8 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zgelqt( &options, tempkm, tempNn, ib, T->nb, - A(k, N), ldak, - T(k, N), T->mb); + A(k, N), + T(k, N)); if ( genD ) { int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb; @@ -109,26 +106,25 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zlacpy( &options, ChamUpper, tempDkm, tempDNn, A->nb, - A(k, N), ldak, - D(k, N), lddk ); + A(k, N), + D(k, N) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempDkm, tempDNn, 0., 1., - D(k, N), lddk ); + D(k, N) ); #endif } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zunmlq( &options, ChamRight, ChamConjTrans, tempmm, tempNn, tempkmin, ib, T->nb, - D(k, N), lddk, - T(k, N), T->mb, - A(m, N), ldam); + D(k, N), + T(k, N), + A(m, N)); } RUNTIME_data_flush( sequence, D(k, N) ); RUNTIME_data_flush( sequence, T(k, N) ); @@ -143,13 +139,12 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_ztplqt( &options, tempkm, tempnn, 0, ib, T->nb, - A(k, N), ldak, - A(k, n), ldak, - T(k, n), T->mb); + A(k, N), + A(k, n), + T(k, n)); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); RUNTIME_data_migrate( sequence, A(m, N), A->get_rankof( A, m, n ) ); @@ -158,10 +153,10 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamRight, ChamConjTrans, tempmm, tempnn, tempkm, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - A(m, N), ldam, - A(m, n), ldam); + A(k, n), + T(k, n), + A(m, N), + A(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -179,13 +174,12 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_ztplqt( &options, tempkm, tempNRDn, chameleon_min(tempNRDn, tempkm), ib, T->nb, - A (k, N ), ldak, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb); + A (k, N ), + A (k, N+RD), + T2(k, N+RD)); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m ); node = A->get_rankof( A, m, N+RD ); RUNTIME_data_migrate( sequence, A(m, N), node ); @@ -195,10 +189,10 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamRight, ChamConjTrans, tempmm, tempNRDn, tempkm, tempNRDn, ib, T->nb, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb, - A (m, N ), ldam, - A (m, N+RD), ldam); + A (k, N+RD), + T2(k, N+RD), + A (m, N ), + A (m, N+RD)); } RUNTIME_data_flush( sequence, A (k, N+RD) ); RUNTIME_data_flush( sequence, T2(k, N+RD) ); diff --git a/compute/pzgemm.c b/compute/pzgemm.c index d6d9c7eec81e74a2da0bcd0eea344ccf2dacc98b..52b885d4099d3893994069f5c73cb86658a441ea 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -41,7 +41,6 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran { RUNTIME_sequence_t *sequence = options->sequence; int m, n, k, p, q, KT, K, lp, lq; - int ldam, ldak, ldbn, ldbk, ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; @@ -74,20 +73,17 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran lq = (k % lookahead) * C->q; tempkk = k == KT - 1 ? K - k * A->nb : A->nb; zbeta = k == 0 ? beta : zone; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); /* Transfert ownership of the k column of A */ for (m = 0; m < C->mt; m ++ ) { tempmm = m == C->mt-1 ? 
C->m - m * C->mb : C->mb; - ldam = BLKLDD(A, m); if ( transA == ChamNoTrans ) { INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkk, C->mb, - A( m, k ), ldam, - WA( m, (k % C->q) + lq ), WA.mb ); + A( m, k ), + WA( m, (k % C->q) + lq ) ); RUNTIME_data_flush( sequence, A( m, k ) ); @@ -95,16 +91,16 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA.mb, - WA( m, ((k+q) % C->q) + lq ), WA.mb ); + WA( m, ((k+q-1) % C->q) + lq ), + WA( m, ((k+q) % C->q) + lq ) ); } } else { INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempmm, C->mb, - A( k, m ), ldak, - WA( m, (k % C->q) + lq ), WA.mb ); + A( k, m ), + WA( m, (k % C->q) + lq ) ); RUNTIME_data_flush( sequence, A( k, m ) ); @@ -112,8 +108,8 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempmm, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA.mb, - WA( m, ((k+q) % C->q) + lq ), WA.mb ); + WA( m, ((k+q-1) % C->q) + lq ), + WA( m, ((k+q) % C->q) + lq ) ); } } } @@ -121,14 +117,13 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran /* Transfert ownership of the k row of B */ for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - ldbn = BLKLDD(B, n); if ( transB == ChamNoTrans ) { INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - B( k, n ), ldbk, - WB( (k % C->p) + lp, n ), WB.mb ); + B( k, n ), + WB( (k % C->p) + lp, n ) ); RUNTIME_data_flush( sequence, B( k, n ) ); @@ -136,16 +131,16 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB.mb, - WB( ((k+p) % C->p) + lp, n ), WB.mb ); + WB( ((k+p-1) % C->p) + lp, n ), + WB( ((k+p) % C->p) + lp, n ) ); } } else { INSERT_TASK_zlacpy( options, ChamUpperLower, tempnn, tempkk, C->mb, - B( n, k ), ldbn, - WB( (k % C->p) + lp, n ), WB.mb ); + B( n, k ), + WB( (k % C->p) + lp, n ) ); RUNTIME_data_flush( sequence, B( n, k ) ); @@ -153,15 +148,14 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran INSERT_TASK_zlacpy( options, ChamUpperLower, tempnn, tempkk, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB.mb, - WB( ((k+p) % C->p) + lp, n ), WB.mb ); + WB( ((k+p-1) % C->p) + lp, n ), + WB( ((k+p) % C->p) + lp, n ) ); } } } for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (n = myq; n < C->nt; n+=C->q) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -170,9 +164,9 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran options, transA, transB, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ - WB( myp + lp, n ), WB.mb, /* ldb * Y */ - zbeta, C( m, n ), ldcm ); /* ldc * Y */ + alpha, WA( m, myq + lq ), /* lda * Z */ + WB( myp + lp, n ), /* ldb * Y */ + zbeta, C( m, n ) ); /* ldc * Y */ } } } @@ -198,7 +192,6 @@ chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tr RUNTIME_sequence_t *sequence = options->sequence; int m, n, k; - int ldam, ldak, ldbn, ldbk, ldcm; int tempmm, tempnn, tempkn, tempkm; CHAMELEON_Complex64_t zbeta; @@ -206,33 +199,29 @@ chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tr for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; /* * A: ChamNoTrans / B: ChamNoTrans */ if (transA == ChamNoTrans) { - ldam = BLKLDD(A, m); if (transB == ChamNoTrans) { for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( options, transA, transB, tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, /* lda * Z */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(m, k), /* lda * Z */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } /* * A: ChamNoTrans / B: Cham[Conj]Trans */ else { - ldbn = BLKLDD(B, n); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? 
beta : zone; @@ -240,9 +229,9 @@ chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tr options, transA, transB, tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, /* lda * Z */ - B(n, k), ldbn, /* ldb * Z */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(m, k), /* lda * Z */ + B(n, k), /* ldb * Z */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -253,34 +242,30 @@ chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tr if (transB == ChamNoTrans) { for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( options, transA, transB, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* lda * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, m), /* lda * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } /* * A: Cham[Conj]Trans / B: Cham[Conj]Trans */ else { - ldbn = BLKLDD(B, n); for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( options, transA, transB, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* lda * X */ - B(n, k), ldbn, /* ldb * Z */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, m), /* lda * X */ + B(n, k), /* ldb * Z */ + zbeta, C(m, n)); /* ldc * Y */ } } } diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 3f92fa3926b2bd9959cd8b03b827589c0e769154..29b03c6d8209d67dc86d295475ca4d023a7cbcd4 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -41,7 +41,6 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D size_t ws_host = 0; int k, m, n; - int ldak, ldam, lddk; int tempkm, tempkn, tempnn, tempmm; int ib; int minMNT = chameleon_min(A->mt, A->nt); @@ -87,27 +86,25 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D tempkm = k == A->mt-1 ? 
A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); INSERT_TASK_zgeqrt( &options, tempkm, tempkn, ib, T->nb, - A(k, k), ldak, - T(k, k), T->mb); + A(k, k), + T(k, k)); if ( genD ) { int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, ChamLower, tempDkm, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDkm, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (n = k+1; n < A->nt; n++) { @@ -116,16 +113,15 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D &options, ChamLeft, ChamConjTrans, tempkm, tempnn, tempkm, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - A(k, n), ldak); + D(k), + T(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); RUNTIME_data_migrate( sequence, A(k, k), A->get_rankof( A, m, k ) ); @@ -134,9 +130,9 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D INSERT_TASK_ztpqrt( &options, tempmm, tempkn, 0, ib, T->nb, - A(k, k), ldak, - A(m, k), ldam, - T(m, k), T->mb); + A(k, k), + A(m, k), + T(m, k)); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -149,10 +145,10 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D &options, ChamLeft, ChamConjTrans, tempmm, tempnn, A->nb, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - A(k, n), ldak, - A(m, n), ldam); + A(m, k), + T(m, k), + A(k, n), + A(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index ae417f76eb7c1676a549496cca73e4d6fa20e02c..d6742f7c8057ea165d1f47c2840d4be890531e80 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -46,7 +46,6 @@ void chameleon_pzgeqrf_param( int genD, int K, int k, m, n, i, p; int L, nbgeqrt; - int ldap, ldam, lddm; int tempkmin, tempkn, tempnn, tempmm; int ib, node, nbtiles, *tiles; @@ -100,14 +99,12 @@ void chameleon_pzgeqrf_param( int genD, int K, m = qrtree->getm(qrtree, k, i); tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkmin = chameleon_min(tempmm, tempkn); - ldam = BLKLDD(A, m); - lddm = BLKLDD(D, m); INSERT_TASK_zgeqrt( &options, tempmm, tempkn, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb); + A(m, k), + T(m, k)); if ( genD ) { int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; @@ -116,14 +113,14 @@ void chameleon_pzgeqrf_param( int genD, int K, INSERT_TASK_zlacpy( &options, ChamLower, tempDmm, tempDkn, A->nb, - A(m, k), ldam, - D(m, k), lddm ); + A(m, k), + D(m, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDmm, tempDkn, 0., 1., - D(m, k), lddm ); + D(m, k) ); #endif } @@ -133,9 +130,9 @@ void chameleon_pzgeqrf_param( int genD, int K, &options, ChamLeft, ChamConjTrans, tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), lddm, - T(m, k), T->mb, - A(m, n), ldam); + D(m, k), + T(m, k), + A(m, n)); } RUNTIME_data_flush( sequence, D(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -149,8 +146,6 @@ void chameleon_pzgeqrf_param( int genD, int K, p = qrtree->currpiv(qrtree, k, m); tempmm = m == A->mt-1 ? 
A->m-m*A->mb : A->mb; - ldap = BLKLDD(A, p); - ldam = BLKLDD(A, m); if ( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -170,9 +165,9 @@ void chameleon_pzgeqrf_param( int genD, int K, INSERT_TASK_ztpqrt( &options, tempmm, tempkn, chameleon_min(L, tempkn), ib, T->nb, - A(p, k), ldap, - A(m, k), ldam, - T(m, k), T->mb); + A(p, k), + A(m, k), + T(m, k)); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -185,10 +180,10 @@ void chameleon_pzgeqrf_param( int genD, int K, &options, ChamLeft, ChamConjTrans, tempmm, tempnn, A->nb, L, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - A(p, n), ldap, - A(m, n), ldam); + A(m, k), + T(m, k), + A(p, n), + A(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 1182e5bc19ba569cecf38d949d45ab873ce53375..6f51507f819ff4aabaa220748bb3b71163ba0bdf 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -41,10 +41,8 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM RUNTIME_option_t options; size_t ws_worker = 0; size_t ws_host = 0; - int k, m, n; int K, M, RD; - int ldaM, ldam, ldaMRD, lddM; int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm; int ib, node; @@ -92,14 +90,12 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM for (M = k; M < A->mt; M += BS) { tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); - ldaM = BLKLDD(A, M); - lddM = BLKLDD(D, M); INSERT_TASK_zgeqrt( &options, tempMm, tempkn, ib, T->nb, - A(M, k), ldaM, - T(M, k), T->mb); + A(M, k), + T(M, k)); if ( genD ) { int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb; int tempDkn = k == D->nt-1 ? 
D->n-k*D->nb : D->nb; @@ -107,14 +103,14 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_zlacpy( &options, ChamLower, tempDMm, tempDkn, A->nb, - A(M, k), ldaM, - D(M, k), lddM ); + A(M, k), + D(M, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDMm, tempDkn, 0., 1., - D(M, k), lddM ); + D(M, k) ); #endif } for (n = k+1; n < A->nt; n++) { @@ -123,16 +119,15 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamLeft, ChamConjTrans, tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), lddM, - T(M, k), T->mb, - A(M, n), ldaM); + D(M, k), + T(M, k), + A(M, n)); } RUNTIME_data_flush( sequence, D(M, k) ); RUNTIME_data_flush( sequence, T(M, k) ); for (m = M+1; m < chameleon_min(M+BS, A->mt); m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); RUNTIME_data_migrate( sequence, A(M, k), A->get_rankof( A, m, k ) ); @@ -141,9 +136,9 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_ztpqrt( &options, tempmm, tempkn, 0, ib, T->nb, - A(M, k), ldaM, - A(m, k), ldam, - T(m, k), T->mb); + A(M, k), + A(m, k), + T(m, k)); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -155,10 +150,10 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamLeft, ChamConjTrans, tempmm, tempnn, A->nb, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - A(M, n), ldaM, - A(m, n), ldam); + A(m, k), + T(m, k), + A(M, n), + A(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -167,8 +162,6 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM for (RD = BS; RD < A->mt-k; RD *= 2) { for (M = k; M+RD < A->mt; M += 2*RD) { tempMRDm = M+RD == A->mt-1 ? 
A->m-(M+RD)*A->mb : A->mb; - ldaM = BLKLDD(A, M ); - ldaMRD = BLKLDD(A, M+RD); node = A->get_rankof( A, M+RD, k ); RUNTIME_data_migrate( sequence, A(M, k), node ); @@ -178,9 +171,9 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM INSERT_TASK_ztpqrt( &options, tempMRDm, tempkn, chameleon_min( tempMRDm, tempkn ), ib, T->nb, - A (M , k), ldaM, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb); + A (M , k), + A (M+RD, k), + T2(M+RD, k)); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -193,10 +186,10 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamLeft, ChamConjTrans, tempMRDm, tempnn, A->nb, tempMRDm, ib, T->nb, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb, - A (M, n), ldaM, - A (M+RD, n), ldaMRD); + A (M+RD, k), + T2(M+RD, k), + A (M, n), + A (M+RD, n)); } RUNTIME_data_flush( sequence, A (M+RD, k) ); RUNTIME_data_flush( sequence, T2(M+RD, k) ); diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c index 642970b628cede045e6f6e4ea3d9456d898718d1..1e6b5513727f2b2f85fde9a6f8cfb50534c5198a 100644 --- a/compute/pzgetrf_incpiv.c +++ b/compute/pzgetrf_incpiv.c @@ -49,7 +49,6 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i size_t ws_host = 0; int k, m, n; - int ldak, ldam, lddk; int tempkm, tempkn, tempmm, tempnn; int ib; int minMNT = chameleon_min(A->mt, A->nt); @@ -84,13 +83,11 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); INSERT_TASK_zgetrf_incpiv( &options, tempkm, tempkn, ib, L->nb, - A(k, k), ldak, - L(k, k), L->mb, + A(k, k), + L(k, k), IPIV(k, k), k == A->mt-1, A->nb*k); @@ -99,8 +96,8 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempkn, A->nb, - A(k, k), ldak, - D(k), lddk); + A(k, k), + D(k)); #endif } @@ -110,19 +107,18 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i &options, tempkm, tempnn, tempkm, ib, L->nb, IPIV(k, k), - L(k, k), L->mb, - D(k), lddk, - A(k, n), ldak); + L(k, k), + D(k), + A(k, n)); } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_ztstrf( &options, tempmm, tempkn, ib, L->nb, - A(k, k), ldak, - A(m, k), ldam, - L(m, k), L->mb, + A(k, k), + A(m, k), + L(m, k), IPIV(m, k), m == A->mt-1, A->nb*k); @@ -131,10 +127,10 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i INSERT_TASK_zssssm( &options, A->nb, tempnn, tempmm, tempnn, A->nb, ib, L->nb, - A(k, n), ldak, - A(m, n), ldam, - L(m, k), L->mb, - A(m, k), ldam, + A(k, n), + A(m, n), + L(m, k), + A(m, k), IPIV(m, k)); } } diff --git a/compute/pzgetrf_nopiv.c b/compute/pzgetrf_nopiv.c index d67dec3b604860bd39703515806dd26073501a72..eb9d6ea29a77cd27c5f03b1ff9bb5b1d43ae07ad 100644 --- a/compute/pzgetrf_nopiv.c +++ b/compute/pzgetrf_nopiv.c @@ -35,7 +35,6 @@ void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_option_t options; int k, m, n, ib; - int ldak, ldam; int tempkm, tempkn, tempmm, tempnn; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; @@ -54,24 +53,22 @@ void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); options.priority = 2*A->nt - 2*k; INSERT_TASK_zgetrf_nopiv( &options, tempkm, tempkn, ib, A->mb, - A(k, k), ldak, A->mb*k); + A(k, k), A->mb*k); for (m = k+1; m < A->mt; m++) { options.priority = 2*A->nt - 2*k - m; tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_ztrsm( &options, ChamRight, ChamUpper, ChamNoTrans, ChamNonUnit, tempmm, tempkn, A->mb, - zone, A(k, k), ldak, - A(m, k), ldam); + zone, A(k, k), + A(m, k)); } for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -80,20 +77,19 @@ void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, &options, ChamLeft, ChamLower, ChamNoTrans, ChamUnit, tempkm, tempnn, A->mb, - zone, A(k, k), ldak, - A(k, n), ldak); + zone, A(k, k), + A(k, n)); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; options.priority = 2*A->nt - 2*k - n - m; - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(m, k), ldam, - A(k, n), ldak, - zone, A(m, n), ldam); + mzone, A(m, k), + A(k, n), + zone, A(m, n)); } } diff --git a/compute/pzgram.c b/compute/pzgram.c index c8fc0b34e4e38043bfc8468c747fa0f2b1fd5a0f..3218b1a78757b8ef7802837ff5fe224bff6403ff 100644 --- a/compute/pzgram.c +++ b/compute/pzgram.c @@ -46,21 +46,20 @@ chameleon_pzgram_internal( cham_uplo_t uplo, for(m = mmin; m < mmax; m++) { int tempmm = ( m == (MT-1) ) ? 
M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); if ( n == m ) { INSERT_TASK_dsyssq( options, ChamColumnwise, uplo, tempmm, - A(m, n), ldam, W( Wcol, m, n) ); + A(m, n), W( Wcol, m, n) ); } else { INSERT_TASK_dgessq( options, ChamColumnwise, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n) ); + A(m, n), W( Wcol, m, n) ); if ( uplo != ChamUpperLower ) { INSERT_TASK_dgessq( options, ChamRowwise, tempmm, tempnn, - A(m, n), ldam, W( Wcol, n, m) ); + A(m, n), W( Wcol, n, m) ); } } } @@ -121,16 +120,15 @@ chameleon_pzgram_internal( cham_uplo_t uplo, for(m = mmin; m < mmax; m++) { int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); INSERT_TASK_zgram( options, ( m == n ) ? uplo : ChamUpperLower, A->m, A->n, tempmm, tempnn, - W( Wcol, 0, m ), 2, - W( Wcol, 0, n ), 2, + W( Wcol, 0, m ), + W( Wcol, 0, n ), W( Welt, 0, 0 ), - A( m, n ), ldam ); + A( m, n ) ); } } } @@ -144,9 +142,8 @@ void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *seq RUNTIME_option_t options; CHAM_desc_t Wcol; CHAM_desc_t Welt; - int workmt, worknt; - int m, n, tempmm, tempnn, ldw; + int m, n, tempmm, tempnn; chamctxt = chameleon_context_self(); if ( sequence->status != CHAMELEON_SUCCESS ) { @@ -170,27 +167,25 @@ void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *seq /* Initialize Wcol */ for(m = 0; m < Wcol.mt; m++) { tempmm = m == Wcol.mt-1 ? Wcol.m-m*Wcol.mb : Wcol.mb; - ldw = Wcol.get_blkldd(&Wcol, m); for(n = 0; n < Wcol.nt; n++) { tempnn = n == Wcol.nt-1 ? Wcol.n-n*Wcol.nb : Wcol.nb; INSERT_TASK_dlaset( &options, ChamUpperLower, tempmm, tempnn, -1., -1., - W( &Wcol, m, n ), ldw ); + W( &Wcol, m, n ) ); } } /* Initialize Welt */ for(m = 0; m < Welt.mt; m++) { tempmm = m == Welt.mt-1 ? Welt.m-m*Welt.mb : Welt.mb; - ldw = Welt.get_blkldd(&Welt, m); for(n = 0; n < Welt.nt; n++) { tempnn = n == Welt.nt-1 ? 
Welt.n-n*Welt.nb : Welt.nb; INSERT_TASK_dlaset( &options, ChamUpperLower, tempmm, tempnn, -1., -1., - W( &Welt, m, n ), ldw ); + W( &Welt, m, n ) ); } } diff --git a/compute/pzhemm.c b/compute/pzhemm.c index b41220165efa574408a907cdd54e6e55dd705652..4339b77ff1458dc1d565ae1a4719fa73b7f2161a 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -43,7 +43,6 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, RUNTIME_sequence_t *sequence = options->sequence; cham_trans_t transA; int m, n, k, p, q, KT, K, lp, lq; - int ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; @@ -64,7 +63,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, /* Transfert ownership of the k column of A or B */ for (m = 0; m < C->mt; m ++ ) { - int Am, Ak, ldam; + int Am, Ak; int tempam, tempak; tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; @@ -85,13 +84,12 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, tempam = tempmm; tempak = tempkk; } - ldam = BLKLDD( A, Am ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempam, tempak, C->mb, - A( Am, Ak ), ldam, - WA( m, (k % C->q) + lq ), WA->mb ); + A( Am, Ak ), + WA( m, (k % C->q) + lq ) ); RUNTIME_data_flush( sequence, A( Am, Ak ) ); @@ -99,23 +97,21 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempam, tempak, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA->mb, - WA( m, ((k+q) % C->q) + lq ), WA->mb ); + WA( m, ((k+q-1) % C->q) + lq ), + WA( m, ((k+q) % C->q) + lq ) ); } } /* Transfert ownership of the k row of B, or A */ for (n = 0; n < C->nt; n++) { - int ldbk; tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - ldbk = BLKLDD( B, k ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - B( k, n ), ldbk, - WB( (k % C->p) + lp, n ), WB->mb ); + B( k, n ), + WB( (k % C->p) + lp, n ) ); RUNTIME_data_flush( sequence, B( k, n ) ); @@ -123,15 +119,14 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB->mb, - WB( ((k+p) % C->p) + lp, n ), WB->mb ); + WB( ((k+p-1) % C->p) + lp, n ), + WB( ((k+p) % C->p) + lp, n ) ); } } /* Perform the update of this iteration */ for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); if ( k == m ) { for (n = myq; n < C->nt; n+=C->q) { @@ -140,9 +135,9 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zhemm( options, ChamLeft, uplo, tempmm, tempnn, A->mb, - alpha, WA( m, myq + lq ), WA->mb, - WB( myp + lp, n ), WB->mb, - zbeta, C( m, n ), ldcm ); + alpha, WA( m, myq + lq ), + WB( myp + lp, n ), + zbeta, C( m, n ) ); } } else { @@ -161,9 +156,9 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zgemm( options, transA, ChamNoTrans, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA->mb, - WB( myp + lp, n ), WB->mb, - zbeta, C( m, n ), ldcm ); + alpha, WA( m, myq + lq ), + WB( myp + lp, n ), + zbeta, C( m, n ) ); } } } @@ -184,7 +179,6 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, RUNTIME_sequence_t *sequence = options->sequence; cham_trans_t transA; int m, n, k, p, q, KT, K, lp, lq; - int ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; @@ -205,16 +199,14 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, /* Transfert ownership of the k column of A or B */ for (m = 0; m < C->mt; m++ ) { - int ldbm; tempmm = m == C->mt-1 ? 
C->m - m * C->mb : C->mb; - ldbm = BLKLDD( B, m ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkk, C->mb, - B( m, k ), ldbm, - WA( m, (k % C->q) + lq ), WA->mb ); + B( m, k ), + WA( m, (k % C->q) + lq ) ); RUNTIME_data_flush( sequence, B( m, k ) ); @@ -222,14 +214,14 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA->mb, - WA( m, ((k+q) % C->q) + lq ), WA->mb ); + WA( m, ((k+q-1) % C->q) + lq ), + WA( m, ((k+q) % C->q) + lq ) ); } } /* Transfert ownership of the k row of B, or A */ for (n = 0; n < C->nt; n++) { - int Ak, An, ldak; + int Ak, An; int tempak, tempan; tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -249,13 +241,12 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, tempak = tempkk; tempan = tempnn; } - ldak = BLKLDD( A, Ak ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempak, tempan, C->mb, - A( Ak, An ), ldak, - WB( (k % C->p) + lp, n ), WB->mb ); + A( Ak, An ), + WB( (k % C->p) + lp, n ) ); RUNTIME_data_flush( sequence, A( Ak, An ) ); @@ -263,8 +254,8 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempak, tempan, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB->mb, - WB( ((k+p) % C->p) + lp, n ), WB->mb ); + WB( ((k+p-1) % C->p) + lp, n ), + WB( ((k+p) % C->p) + lp, n ) ); } } @@ -275,15 +266,14 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, if ( k == n ) { for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); /* A has been stored in WA or WB for the summa ring */ INSERT_TASK_zhemm( options, ChamRight, uplo, tempmm, tempnn, A->mb, - alpha, WB( myp + lp, n ), WB->mb, - WA( m, myq + lq ), WA->mb, - zbeta, C( m, n ), ldcm ); + alpha, WB( myp + lp, n ), + WA( m, myq + lq ), + zbeta, C( m, n ) ); } } else { @@ -298,14 +288,13 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); INSERT_TASK_zgemm( options, ChamNoTrans, transA, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA->mb, - WB( myp + lp, n ), WB->mb, - zbeta, C( m, n ), ldcm ); + alpha, WA( m, myq + lq ), + WB( myp + lp, n ), + zbeta, C( m, n ) ); } } } @@ -366,7 +355,6 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ RUNTIME_option_t *options ) { int k, m, n; - int ldam, ldan, ldak, ldbk, ldbm, ldcm; int tempmm, tempnn, tempkn, tempkm; CHAMELEON_Complex64_t zbeta; @@ -374,28 +362,24 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ for(m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for(n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; /* * ChamLeft / ChamLower */ if (side == ChamLeft) { - ldam = BLKLDD(A, m); if (uplo == ChamLower) { for (k = 0; k < C->mt; k++) { tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; if (k < m) { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(m, k), ldam, /* lda * K */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == m) { @@ -403,18 +387,18 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* ldak * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -425,17 +409,15 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ else { for (k = 0; k < C->mt; k++) { tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; if (k < m) { INSERT_TASK_zgemm( options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* ldak * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == m) { @@ -443,18 +425,18 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * K */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * K */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(m, k), ldam, /* lda * K */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -464,21 +446,18 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ * ChamRight / ChamLower */ else { - ldan = BLKLDD(A, n); - ldbm = BLKLDD(B, m); if (uplo == ChamLower) { for (k = 0; k < C->nt; k++) { tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; if (k < n) { INSERT_TASK_zgemm( options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(n, k), ldan, /* lda * K */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == n) { @@ -486,18 +465,18 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * Y */ - B(m, k), ldbm, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(k, n), ldak, /* ldak * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -508,16 +487,15 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ else { for (k = 0; k < C->nt; k++) { tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; if (k < n) { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(k, n), ldak, /* ldak * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == n) { @@ -525,18 +503,18 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * Y */ - B(m, k), ldbm, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(n, k), ldan, /* lda * K */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zbeta, C(m, n)); /* ldc * Y */ } } } diff --git a/compute/pzher2k.c b/compute/pzher2k.c index 8f51860bb996a404e4307e4436a2f06e98d59b53..8e2cb085cd394c15d6220c97675f9e77a6f3f3e3 100644 --- a/compute/pzher2k.c +++ b/compute/pzher2k.c @@ -38,8 +38,6 @@ void chameleon_pzher2k( cham_uplo_t uplo, cham_trans_t trans, RUNTIME_option_t options; int m, n, k, mmin, mmax; - int ldak, ldam, ldan, ldcm, ldcn; - int ldbk, ldbm, ldbn; int tempnn, tempmm, tempkn, tempkm; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; @@ -54,9 +52,6 @@ void chameleon_pzher2k( cham_uplo_t uplo, cham_trans_t trans, for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); - ldbn = BLKLDD(B, n); - ldcn = BLKLDD(C, n); if (uplo == ChamLower) { mmin = n+1; @@ -78,15 +73,12 @@ void chameleon_pzher2k( cham_uplo_t uplo, cham_trans_t trans, &options, uplo, trans, tempnn, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - B(n, k), ldbn, - dbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(n, k), /* ldan * K */ + B(n, k), + dbeta, C(n, n)); /* ldc * N */ } for (m = mmin; m < mmax; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; @@ -94,17 +86,17 @@ void chameleon_pzher2k( cham_uplo_t uplo, cham_trans_t trans, &options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, - B(n, k), ldbn, - zbeta, C(m, n), ldcm); + alpha, A(m, k), + B(n, k), + zbeta, C(m, n)); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, - conj(alpha), B(m, k), ldbm, - A(n, k), ldan, - zone, C(m, n), ldcm); + conj(alpha), B(m, k), + A(n, k), + zone, C(m, n)); } } } @@ -114,40 +106,35 @@ void chameleon_pzher2k( cham_uplo_t uplo, cham_trans_t trans, else { for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); dbeta = k == 0 ? beta : 1.0; INSERT_TASK_zher2k( &options, uplo, trans, tempnn, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * N */ - B(k, n), ldbk, - dbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(k, n), /* lda * N */ + B(k, n), + dbeta, C(n, n)); /* ldc * N */ } for (m = mmin; m < mmax; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
(CHAMELEON_Complex64_t)beta : zone; INSERT_TASK_zgemm( &options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(k, m), + B(k, n), + zbeta, C(m, n)); INSERT_TASK_zgemm( &options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - conj(alpha), B(k, m), ldbk, - A(k, n), ldak, - zone, C(m, n), ldcm ); + conj(alpha), B(k, m), + A(k, n), + zone, C(m, n) ); } } } diff --git a/compute/pzherk.c b/compute/pzherk.c index 29635ccdd50a8a21775ed39f62a3efa981ee11fa..bd3fa544100ec378763c66ca6ce852c5edf13f8c 100644 --- a/compute/pzherk.c +++ b/compute/pzherk.c @@ -37,7 +37,6 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, RUNTIME_option_t options; int m, n, k; - int ldak, ldam, ldan, ldcm, ldcn; int tempnn, tempmm, tempkn, tempkm; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; @@ -53,8 +52,6 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); - ldcn = BLKLDD(C, n); /* * ChamNoTrans */ @@ -66,8 +63,8 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, &options, uplo, trans, tempnn, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - dbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(n, k), /* ldan * K */ + dbeta, C(n, n)); /* ldc * N */ } /* * ChamNoTrans / ChamLower @@ -75,8 +72,6 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, if (uplo == ChamLower) { for (m = n+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? 
(CHAMELEON_Complex64_t)beta : zone; @@ -84,9 +79,9 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, &options, trans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, - zalpha, A(m, k), ldam, /* ldam * K */ - A(n, k), ldan, /* ldan * K */ - zbeta, C(m, n), ldcm); /* ldc * N */ + zalpha, A(m, k), /* ldam * K */ + A(n, k), /* ldan * K */ + zbeta, C(m, n)); /* ldc * N */ } } } @@ -96,7 +91,6 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, else { for (m = n+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; @@ -104,9 +98,9 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, &options, trans, ChamConjTrans, tempnn, tempmm, tempkn, A->mb, - zalpha, A(n, k), ldan, /* ldan * K */ - A(m, k), ldam, /* ldam * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ + zalpha, A(n, k), /* ldan * K */ + A(m, k), /* ldam * M */ + zbeta, C(n, m)); /* ldc * M */ } } } @@ -117,14 +111,13 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, else { for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); dbeta = k == 0 ? beta : 1.0; INSERT_TASK_zherk( &options, uplo, trans, tempnn, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * N */ - dbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(k, n), /* lda * N */ + dbeta, C(n, n)); /* ldc * N */ } /* * Cham[Conj]Trans / ChamLower @@ -132,18 +125,16 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, if (uplo == ChamLower) { for (m = n+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? 
(CHAMELEON_Complex64_t)beta : zone; INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - zalpha, A(k, m), ldak, /* lda * M */ - A(k, n), ldak, /* lda * N */ - zbeta, C(m, n), ldcm); /* ldc * N */ + zalpha, A(k, m), /* lda * M */ + A(k, n), /* lda * N */ + zbeta, C(m, n)); /* ldc * N */ } } } @@ -155,15 +146,14 @@ void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempnn, tempmm, tempkm, A->mb, - zalpha, A(k, n), ldak, /* lda * K */ - A(k, m), ldak, /* lda * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ + zalpha, A(k, n), /* lda * K */ + A(k, m), /* lda * M */ + zbeta, C(n, m)); /* ldc * M */ } } } diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c index a5b1aeb3ccac4d2f23520ddc296cded783f61061..8ebd19da5c7efc00082eb7e14f48e21b53e439c3 100644 --- a/compute/pzhetrd_he2hb.c +++ b/compute/pzhetrd_he2hb.c @@ -48,8 +48,6 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, size_t ws_host = 0; int k, m, n, i, j; - int ldak, ldak1, ldam, ldan, ldaj, ldai; - int lddk, lddk1, lddm, lddn, ldek, ldek1; int tempkm, tempkn, tempmm, tempnn, tempjj; int ib; @@ -98,14 +96,10 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, /* Let's extract the diagonal in a temporary copy that contains A and A' */ for (k = 1; k < A->nt; k++){ tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD((&D), k); - - INSERT_TASK_zhe2ge(&options, - uplo, - tempkn, tempkn, ldak, - A(k, k), ldak, - D(k), lddk); + + INSERT_TASK_zhe2ge( &options, + uplo, tempkn, tempkn, A->mb, + A(k, k), D(k) ); } if (uplo == ChamLower) { @@ -114,28 +108,25 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb; tempkn = k == A->nt-1 ? 
A->n- k *A->nb : A->nb; - ldak1 = BLKLDD(A, k+1); - lddk1 = BLKLDD((&D), k+1); - ldek1 = BLKLDD(E, k+1); INSERT_TASK_zgeqrt( &options, tempkm, tempkn, ib, A->nb, - A(k+1, k), ldak1, - T(k+1, k), T->mb); + A(k+1, k), + T(k+1, k)); #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, ChamLower, tempkm, tempkn, A->nb, - A(k+1, k), ldak1, - E(k+1, k), ldek1 ); + A(k+1, k), + E(k+1, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkm, tempkn, 0., 1., - E(k+1, k), ldek1 ); + E(k+1, k) ); #endif #endif @@ -144,62 +135,57 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLower, tempkm, tempkm, ib, A->nb, - E(k+1, k), ldak1, - T(k+1, k), T->mb, - D(k+1), lddk1); + E(k+1, k), + T(k+1, k), + D(k+1)); /* RIGHT on the remaining tiles until the bottom */ for (m = k+2; m < A->mt ; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zunmqr( &options, ChamRight, ChamNoTrans, tempmm, A->nb, tempkm, ib, A->nb, - E(k+1, k), ldek1, - T(k+1, k), T->mb, - A(m, k+1), ldam); + E(k+1, k), + T(k+1, k), + A(m, k+1)); } for (m = k+2; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); - lddm = BLKLDD((&D), m); options.priority = 1; INSERT_TASK_ztsqrt( &options, tempmm, A->nb, ib, A->nb, - A(k+1, k), ldak1, - A(m , k), ldam, - T(m , k), T->mb); + A(k+1, k), + A(m , k), + T(m , k)); options.priority = 0; /* LEFT */ for (i = k+2; i < m; i++) { - ldai = BLKLDD(A, i); INSERT_TASK_ztsmqr_hetra1( &options, ChamLeft, ChamConjTrans, A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb, - A(i, k+1), ldai, - A(m, i), ldam, - A(m, k), ldam, - T(m, k), T->mb); + A(i, k+1), + A(m, i), + A(m, k), + T(m, k)); } /* RIGHT */ for (j = m+1; j < A->mt ; j++) { tempjj = j == A->mt-1 ? 
A->m-j*A->mb : A->mb; - ldaj = BLKLDD(A, j); INSERT_TASK_ztsmqr( &options, ChamRight, ChamNoTrans, tempjj, A->nb, tempjj, tempmm, A->nb, ib, A->nb, - A(j, k+1), ldaj, - A(j, m), ldaj, - A(m, k), ldam, - T(m, k), T->mb); + A(j, k+1), + A(j, m), + A(m, k), + T(m, k)); } /* LEFT->RIGHT */ @@ -218,8 +204,8 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamUpperLower, ChamConjTrans, tempmm, A->nb, A->nb, - A(m, k+1), ldam, - AT(m), ldak1); + A(m, k+1), + AT(m)); /* Left application on |A1| */ /* |A2| */ @@ -227,10 +213,10 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamConjTrans, A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb, - D(k+1), lddk1, - A(m, k+1), ldam, - A(m, k), ldam, - T(m, k), T->mb); + D(k+1), + A(m, k+1), + A(m, k), + T(m, k)); /* Left application on | A2'| */ /* | A3 | */ @@ -238,30 +224,30 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamConjTrans, A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb, - AT(m), ldak1, - D(m), lddm, - A(m, k), ldam, - T(m, k), T->mb); + AT(m), + D(m), + A(m, k), + T(m, k)); /* Right application on | A1 A2' | */ INSERT_TASK_ztsmqr( &options, ChamRight, ChamNoTrans, A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb, - D(k+1), lddk1, - AT(m) , ldak1, - A(m, k), ldam, - T(m, k), T->mb); + D(k+1), + AT(m) , + A(m, k), + T(m, k)); /* Right application on | A2 A3 | */ INSERT_TASK_ztsmqr( &options, ChamRight, ChamNoTrans, tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb, - A(m, k+1), ldam, - D(m), lddm, - A(m, k), ldam, - T(m, k), T->mb); + A(m, k+1), + D(m), + A(m, k), + T(m, k)); options.priority = 0; } @@ -274,28 +260,24 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb; tempkm = k == A->mt-1 ? 
A->m- k *A->mb : A->mb; - ldak = BLKLDD(A, k); - ldek = BLKLDD(E, k); - ldak1 = BLKLDD(A, k+1); - lddk1 = BLKLDD((&D), k+1); INSERT_TASK_zgelqt( &options, tempkm, tempkn, ib, A->nb, - A(k, k+1), ldak, - T(k, k+1), T->mb); + A(k, k+1), + T(k, k+1)); #if defined(CHAMELEON_COPY_DIAG) INSERT_TASK_zlacpy( &options, ChamUpper, tempkm, tempkn, A->nb, - A(k, k+1), ldak, - E(k, k+1), ldek ); + A(k, k+1), + E(k, k+1) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkm, tempkn, 0., 1., - E(k, k+1), ldek ); + E(k, k+1) ); #endif #endif @@ -304,9 +286,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamUpper, tempkn, tempkn, ib, A->nb, - E(k, k+1), ldek, - T(k, k+1), T->mb, - D(k+1), lddk1); + E(k, k+1), + T(k, k+1), + D(k+1)); /* LEFT on the remaining tiles until the left side */ for (n = k+2; n < A->nt ; n++) { @@ -315,35 +297,32 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamNoTrans, A->mb, tempnn, tempkn, ib, A->nb, - E(k, k+1), ldek, - T(k, k+1), T->mb, - A(k+1, n ), ldak1); + E(k, k+1), + T(k, k+1), + A(k+1, n )); } for (n = k+2; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; - ldan = BLKLDD(A, n); - lddn = BLKLDD((&D), n); options.priority = 1; INSERT_TASK_ztslqt( &options, A->mb, tempnn, ib, A->nb, - A(k, k+1), ldak, - A(k, n ), ldak, - T(k, n ), T->mb); + A(k, k+1), + A(k, n ), + T(k, n )); options.priority = 0; /* RIGHT */ for (i = k+2; i < n; i++) { - ldai = BLKLDD(A, i); INSERT_TASK_ztsmlq_hetra1( &options, ChamRight, ChamConjTrans, A->mb, A->nb, A->nb, tempnn, A->nb, ib, A->nb, - A(k+1, i), ldak1, - A(i, n), ldai, - A(k, n), ldak, - T(k, n), T->mb); + A(k+1, i), + A(i, n), + A(k, n), + T(k, n)); } /* LEFT */ @@ -353,10 +332,10 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamNoTrans, A->nb, tempjj, tempnn, tempjj, A->nb, ib, A->nb, - A(k+1, j), ldak1, - A(n, j), ldan, - A(k, n), ldak, - T(k, n), T->mb); + A(k+1, j), + A(n, j), + A(k, n), + T(k, n)); } /* RIGHT->LEFT */ @@ -375,28 +354,28 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamUpperLower, ChamConjTrans, A->mb, tempnn, A->nb, - A(k+1, n), ldak1, - AT(n), A->mb); + A(k+1, n), + AT(n) ); /* Right application on | A1 A2 | */ INSERT_TASK_ztsmlq( &options, ChamRight, ChamConjTrans, A->mb, A->nb, A->mb, tempnn, A->nb, ib, A->nb, - D(k+1), lddk1, - A(k+1, n), ldak1, - A(k, n), ldak, - T(k, n), T->mb); + D(k+1), + A(k+1, n), + A(k, n), + T(k, n)); /* Right application on | A2' A3 | */ INSERT_TASK_ztsmlq( &options, ChamRight, ChamConjTrans, tempnn, A->nb, tempnn, tempnn, A->nb, ib, A->nb, - AT(n), A->mb, - D(n), lddn, - A(k, n), ldak, - T(k, n), T->mb); + AT(n), + D(n), + A(k, n), + T(k, n)); /* Left application on |A1 | */ /* |A2'| */ @@ -404,10 +383,10 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, ChamLeft, ChamNoTrans, A->mb, A->nb, tempnn, A->nb, A->nb, ib, A->nb, - D(k+1), lddk1, - AT(n), A->mb, - A(k, n), ldak, - T(k, n), T->mb); + D(k+1), + AT(n), + A(k, n), + T(k, n)); /* Left application on | A2 | */ /* | A3 | */ @@ -415,10 +394,10 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, &options, 
ChamLeft, ChamNoTrans, A->mb, tempnn, tempnn, tempnn, A->nb, ib, A->nb, - A(k+1, n), ldak1, - D(n), lddn, - A(k, n), ldak, - T(k, n), T->mb); + A(k+1, n), + D(n), + A(k, n), + T(k, n)); } options.priority = 0; @@ -429,13 +408,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, /* Copy-back into A */ for (k = 1; k < A->nt; k++){ tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD((&D), k); - INSERT_TASK_zlacpy(&options, - uplo, - tempkn, tempkn, ldak, - D(k), lddk, - A(k, k), ldak); + INSERT_TASK_zlacpy( &options, + uplo, tempkn, tempkn, A->mb, + D(k), A(k, k)); } diff --git a/compute/pzlacpy.c b/compute/pzlacpy.c index d526cbe4d145be47b5f5f2dc2c0abc3f4c8b0eb4..397b122b5298fc9999945be1fadd15bffd173027 100644 --- a/compute/pzlacpy.c +++ b/compute/pzlacpy.c @@ -25,21 +25,14 @@ #define A(m,n) A, m, n #define B(m,n) B, m, n -/** - * - */ -/** - * - */ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) { CHAM_context_t *chamctxt; RUNTIME_option_t options; int X, Y; int m, n; - int ldam, ldbm; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -54,16 +47,14 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, case ChamUpper: for (m = 0; m < A->mt; m++) { X = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); if (m < A->nt) { Y = m == A->nt-1 ? A->n-m*A->nb : A->nb; INSERT_TASK_zlacpy( &options, ChamUpper, X, Y, A->mb, - A(m, m), ldam, - B(m, m), ldbm); + A(m, m), + B(m, m)); } for (n = m+1; n < A->nt; n++) { Y = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -71,8 +62,8 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, &options, ChamUpperLower, X, Y, A->mb, - A(m, n), ldam, - B(m, n), ldbm); + A(m, n), + B(m, n)); } } break; @@ -82,16 +73,14 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, case ChamLower: for (m = 0; m < A->mt; m++) { X = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); if (m < A->nt) { Y = m == A->nt-1 ? A->n-m*A->nb : A->nb; INSERT_TASK_zlacpy( &options, ChamLower, X, Y, A->mb, - A(m, m), ldam, - B(m, m), ldbm); + A(m, m), + B(m, m)); } for (n = 0; n < chameleon_min(m, A->nt); n++) { Y = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -99,8 +88,8 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, &options, ChamUpperLower, X, Y, A->mb, - A(m, n), ldam, - B(m, n), ldbm); + A(m, n), + B(m, n)); } } break; @@ -111,16 +100,14 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, default: for (m = 0; m < A->mt; m++) { X = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); for (n = 0; n < A->nt; n++) { Y = n == A->nt-1 ? A->n-n*A->nb : A->nb; INSERT_TASK_zlacpy( &options, ChamUpperLower, X, Y, A->mb, - A(m, n), ldam, - B(m, n), ldbm); + A(m, n), + B(m, n)); } } } diff --git a/compute/pzlag2c.c b/compute/pzlag2c.c index 7704fffdd254e5571cf0064f02b84d0b8855b459..b574b2a3370715eadc40c6b4972cfb6e3b7e245b 100644 --- a/compute/pzlag2c.c +++ b/compute/pzlag2c.c @@ -41,7 +41,6 @@ void chameleon_pclag2z(CHAM_desc_t *SA, CHAM_desc_t *B, int X, Y; int m, n; - int ldam, ldbm; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -51,15 +50,13 @@ void chameleon_pclag2z(CHAM_desc_t *SA, CHAM_desc_t *B, for(m = 0; m < SA->mt; m++) { X = m == SA->mt-1 ? SA->m-m*SA->mb : SA->mb; - ldam = BLKLDD(SA, m); - ldbm = BLKLDD(B, m); for(n = 0; n < SA->nt; n++) { Y = n == SA->nt-1 ? 
SA->n-n*SA->nb : SA->nb; INSERT_TASK_clag2z( &options, X, Y, SA->mb, - SA(m, n), ldam, - B(m, n), ldbm); + SA(m, n), + B(m, n)); } } RUNTIME_options_finalize(&options, chamctxt); diff --git a/compute/pzlange.c b/compute/pzlange.c index f909fa25fa34ff17bc6d0c72b811916a187d284e..a0b6a6d90faa23ddb98cf5354300f34573b513cd 100644 --- a/compute/pzlange.c +++ b/compute/pzlange.c @@ -56,19 +56,18 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, for(m = mmin; m < mmax; m++) { int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); if ( (n == m) && (uplo != ChamUpperLower) ) { INSERT_TASK_ztrasm( options, ChamColumnwise, uplo, diag, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n ) ); + A(m, n), W( Wcol, m, n ) ); } else { INSERT_TASK_dzasum( options, ChamColumnwise, ChamUpperLower, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n ) ); + A(m, n), W( Wcol, m, n ) ); } if ( m >= P ) { @@ -93,7 +92,7 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, INSERT_TASK_dlange( options, ChamMaxNorm, 1, tempnn, A->nb, - W( Wcol, 0, n ), 1, + W( Wcol, 0, n ), W( Welt, 0, n ) ); } @@ -144,7 +143,6 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); for(n = nmin; n < nmax; n++) { int tempnn = ( n == (NT-1) ) ? 
N - n * A->nb : A->nb; @@ -153,13 +151,13 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, INSERT_TASK_ztrasm( options, ChamRowwise, uplo, diag, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n) ); + A(m, n), W( Wcol, m, n) ); } else { INSERT_TASK_dzasum( options, ChamRowwise, ChamUpperLower, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n) ); + A(m, n), W( Wcol, m, n) ); } if ( n >= Q ) { @@ -184,7 +182,7 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, INSERT_TASK_dlange( options, ChamMaxNorm, tempmm, 1, A->nb, - W( Wcol, m, 0), 1, W( Welt, m, 0)); + W( Wcol, m, 0), W( Welt, m, 0)); } /** @@ -231,7 +229,6 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); for(n = nmin; n < nmax; n++) { int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; @@ -240,13 +237,13 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ INSERT_TASK_zlantr( options, ChamMaxNorm, uplo, diag, tempmm, tempnn, A->nb, - A(m, n), ldam, W( Welt, m, n)); + A(m, n), W( Welt, m, n)); } else { INSERT_TASK_zlange( options, ChamMaxNorm, tempmm, tempnn, A->nb, - A(m, n), ldam, W( Welt, m, n )); + A(m, n), W( Welt, m, n )); } if ( n >= Q ) { @@ -315,7 +312,6 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); for(n = nmin; n < nmax; n++) { int tempnn = ( n == (NT-1) ) ? 
N - n * A->nb : A->nb; @@ -324,14 +320,14 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ INSERT_TASK_ztrssq( options, uplo, diag, tempmm, tempnn, - A(m, n), ldam, W( Welt, m, n) ); + A(m, n), W( Welt, m, n) ); } else { INSERT_TASK_zgessq( options, ChamEltwise, tempmm, tempnn, - A(m, n), ldam, W( Welt, m, n) ); + A(m, n), W( Welt, m, n) ); } if ( n >= Q ) { @@ -469,7 +465,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia &options, ChamUpperLower, Wcol.mb, Wcol.nb, alpha, beta, - W( &Wcol, m, n ), Wcol.mb ); + W( &Wcol, m, n ) ); } } } @@ -479,7 +475,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia &options, ChamUpperLower, Welt.mb, Welt.nb, alpha, beta, - W( &Welt, m, n ), Welt.mb ); + W( &Welt, m, n ) ); } } @@ -512,7 +508,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia INSERT_TASK_dlacpy( &options, ChamUpperLower, 1, 1, 1, - W( &Welt, 0, 0 ), 1, W( &Welt, m, n ), 1); + W( &Welt, 0, 0 ), W( &Welt, m, n ) ); } } } diff --git a/compute/pzlansy.c b/compute/pzlansy.c index e799f57285430e09eccd348abdca54d0c31d6cdc..7698ed3899f08e7b9337491226caffc90854345f 100644 --- a/compute/pzlansy.c +++ b/compute/pzlansy.c @@ -52,7 +52,6 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A, int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); for(n = nmin; n < nmax; n++) { int tempnn = ( n == (NT-1) ) ? 
N - n * A->nb : A->nb; @@ -61,18 +60,18 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A, INSERT_TASK_dzasum( options, ChamRowwise, uplo, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n) ); + A(m, n), W( Wcol, m, n) ); } else { INSERT_TASK_dzasum( options, ChamRowwise, ChamUpperLower, tempmm, tempnn, - A(m, n), ldam, W( Wcol, m, n) ); + A(m, n), W( Wcol, m, n) ); INSERT_TASK_dzasum( options, ChamColumnwise, ChamUpperLower, tempmm, tempnn, - A(m, n), ldam, W( Wcol, n, m) ); + A(m, n), W( Wcol, n, m) ); } } } @@ -101,7 +100,7 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A, INSERT_TASK_dlange( options, ChamMaxNorm, tempmm, 1, A->nb, - W( Wcol, m, 0), 1, W( Welt, m, 0)); + W( Wcol, m, 0), W( Welt, m, 0)); } /** @@ -146,7 +145,6 @@ chameleon_pzlansy_max( cham_trans_t trans, cham_uplo_t uplo, CHAM_desc_t *A, int nmax = (uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); for(n = nmin; n < nmax; n++) { int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; @@ -156,20 +154,20 @@ chameleon_pzlansy_max( cham_trans_t trans, cham_uplo_t uplo, CHAM_desc_t *A, INSERT_TASK_zlanhe( options, ChamMaxNorm, uplo, tempmm, A->nb, - A(m, n), ldam, W( Welt, m, n)); + A(m, n), W( Welt, m, n)); } else { INSERT_TASK_zlansy( options, ChamMaxNorm, uplo, tempmm, A->nb, - A(m, n), ldam, W( Welt, m, n)); + A(m, n), W( Welt, m, n)); } } else { INSERT_TASK_zlange( options, ChamMaxNorm, tempmm, tempnn, A->nb, - A(m, n), ldam, W( Welt, m, n)); + A(m, n), W( Welt, m, n)); } if ( n >= Q ) { @@ -233,7 +231,6 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, int nmax = (uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; - int ldam = BLKLDD( A, m ); for(n = nmin; n < nmax; n++) { int tempnn = ( n == (NT-1) ) ? 
N - n * A->nb : A->nb; @@ -242,21 +239,21 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, if ( trans == ChamConjTrans) { INSERT_TASK_zhessq( options, ChamEltwise, uplo, tempmm, - A(m, n), ldam, W( Welt, m, n) ); + A(m, n), W( Welt, m, n) ); } else { INSERT_TASK_zsyssq( options, ChamEltwise, uplo, tempmm, - A(m, n), ldam, W( Welt, m, n) ); + A(m, n), W( Welt, m, n) ); } } else { INSERT_TASK_zgessq( options, ChamEltwise, tempmm, tempnn, - A(m, n), ldam, W( Welt, m, n) ); + A(m, n), W( Welt, m, n) ); INSERT_TASK_zgessq( options, ChamEltwise, tempmm, tempnn, - A(m, n), ldam, W( Welt, n, m) ); + A(m, n), W( Welt, n, m) ); } } } @@ -380,7 +377,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra &options, ChamUpperLower, Wcol.mb, Wcol.nb, alpha, beta, - W( &Wcol, m, n ), Wcol.mb ); + W( &Wcol, m, n ) ); } } } @@ -390,7 +387,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra &options, ChamUpperLower, Welt.mb, Welt.nb, alpha, beta, - W( &Welt, m, n ), Welt.mb ); + W( &Welt, m, n ) ); } } @@ -419,7 +416,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra INSERT_TASK_dlacpy( &options, ChamUpperLower, 1, 1, 1, - W( &Welt, 0, 0 ), 1, W( &Welt, m, n ), 1); + W( &Welt, 0, 0 ), W( &Welt, m, n )); } } } diff --git a/compute/pzlascal.c b/compute/pzlascal.c index 93119debc461d8bf2f69062e176f5df409ee2004..15c81af25faaf89ee1ba38c8a99cb123e5b3a776 100644 --- a/compute/pzlascal.c +++ b/compute/pzlascal.c @@ -31,7 +31,6 @@ void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc int tempmm, tempnn, tempmn, tempnm; int m, n; - int ldam, ldan; int minmnt = chameleon_min(A->mt, A->nt); chamctxt = chameleon_context_self(); @@ -46,21 +45,19 @@ void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc for (n = 0; n < minmnt; n++) { tempnm = n == A->mt-1 ? A->m-n*A->mb : A->mb; tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; - ldan = BLKLDD(A, n); INSERT_TASK_zlascal( &options, ChamLower, tempnm, tempnn, A->mb, - alpha, A(n, n), ldan); + alpha, A(n, n)); for (m = n+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-A->mb*m : A->nb; - ldam = BLKLDD(A, m); INSERT_TASK_zlascal( &options, ChamUpperLower, tempmm, tempnn, A->mb, - alpha, A(m, n), ldam); + alpha, A(m, n)); } } break; @@ -69,12 +66,11 @@ void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc for (m = 0; m < minmnt; m++) { tempmm = m == A->mt-1 ? A->m-A->mb*m : A->nb; tempmn = m == A->nt-1 ? A->n-m*A->nb : A->nb; - ldam = BLKLDD(A, m); INSERT_TASK_zlascal( &options, ChamUpper, tempmm, tempmn, A->mb, - alpha, A(m, m), ldam); + alpha, A(m, m)); for (n = m+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -82,7 +78,7 @@ void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc INSERT_TASK_zlascal( &options, ChamUpperLower, tempmm, tempnn, A->mb, - alpha, A(m, n), ldam); + alpha, A(m, n)); } } break; @@ -91,7 +87,6 @@ void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc default: for (m = 0; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-A->mb*m : A->nb; - ldam = BLKLDD(A, m); for (n = 0; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -99,7 +94,7 @@ void chameleon_pzlascal(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc INSERT_TASK_zlascal( &options, ChamUpperLower, tempmm, tempnn, A->mb, - alpha, A(m, n), ldam); + alpha, A(m, n)); } } } diff --git a/compute/pzlaset.c b/compute/pzlaset.c index d874f8dd8798fb72a3bc1536ab55dc1e4e47a698..02fd03af9b2ac03cb6cf9c91efeaa14e1c2cb814 100644 --- a/compute/pzlaset.c +++ b/compute/pzlaset.c @@ -38,7 +38,6 @@ void chameleon_pzlaset( cham_uplo_t uplo, RUNTIME_option_t options; int i, j; - int ldai, ldaj; int tempim; int tempjm, tempjn; int minmn = chameleon_min(A->mt, A->nt); @@ -54,26 +53,23 @@ void chameleon_pzlaset( cham_uplo_t uplo, for (j = 0; j < minmn; j++){ tempjm = j == A->mt-1 ? A->m-j*A->mb : A->mb; tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; - ldaj = BLKLDD(A, j); INSERT_TASK_zlaset( &options, ChamLower, tempjm, tempjn, alpha, beta, - A(j, j), ldaj); + A(j, j)); for (i = j+1; i < A->mt; i++){ tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); INSERT_TASK_zlaset( &options, ChamUpperLower, tempim, tempjn, alpha, alpha, - A(i, j), ldai); + A(i, j)); } } } else if (uplo == ChamUpper) { for (i = 0; i < A->mt; i++) { tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); if ( i < A->nt ) { j = i; @@ -82,7 +78,7 @@ void chameleon_pzlaset( cham_uplo_t uplo, INSERT_TASK_zlaset( &options, uplo, tempim, tempjn, - alpha, beta, A(i, j), ldai); + alpha, beta, A(i, j)); } for (j = i+1; j < A->nt; j++) { tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; @@ -90,21 +86,20 @@ void chameleon_pzlaset( cham_uplo_t uplo, INSERT_TASK_zlaset( &options, ChamUpperLower, tempim, tempjn, - alpha, alpha, A(i, j), ldai); + alpha, alpha, A(i, j)); } } } else { for (i = 0; i < A->mt; i++){ tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); for (j = 0; j < A->nt; j++){ tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; INSERT_TASK_zlaset( &options, ChamUpperLower, tempim, tempjn, alpha, (i == j) ? 
beta : alpha, - A(i, j), ldai); + A(i, j)); } } } diff --git a/compute/pzlaset2.c b/compute/pzlaset2.c index b982ed47e9df7d2638a64ca169567d851e6d8b29..8fb02e425c8426e9814696e48d40027691761280 100644 --- a/compute/pzlaset2.c +++ b/compute/pzlaset2.c @@ -37,7 +37,6 @@ void chameleon_pzlaset2(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, RUNTIME_option_t options; int i, j; - int ldai, ldaj; int tempim; int tempjm, tempjn; int minmn = chameleon_min(A->mt, A->nt); @@ -53,19 +52,17 @@ void chameleon_pzlaset2(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, for (j = 0; j < minmn; j++){ tempjm = j == A->mt-1 ? A->m-j*A->mb : A->mb; tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; - ldaj = BLKLDD(A, j); INSERT_TASK_zlaset2( &options, ChamLower, tempjm, tempjn, alpha, - A(j, j), ldaj); + A(j, j)); for (i = j+1; i < A->mt; i++){ tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); INSERT_TASK_zlaset2( &options, ChamUpperLower, tempim, tempjn, alpha, - A(i, j), ldai); + A(i, j)); } } } @@ -74,33 +71,30 @@ void chameleon_pzlaset2(cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; for (i = 0; i < chameleon_min(j, A->mt); i++){ tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); INSERT_TASK_zlaset2( &options, ChamUpperLower, tempim, tempjn, alpha, - A(i, j), ldai); + A(i, j)); } } for (j = 0; j < minmn; j++){ tempjm = j == A->mt-1 ? A->m-j*A->mb : A->mb; tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; - ldaj = BLKLDD(A, j); INSERT_TASK_zlaset2( &options, ChamUpper, tempjm, tempjn, alpha, - A(j, j), ldaj); + A(j, j)); } } else { for (i = 0; i < A->mt; i++){ tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); for (j = 0; j < A->nt; j++){ tempjn = j == A->nt-1 ? 
A->n-j*A->nb : A->nb; INSERT_TASK_zlaset2( &options, ChamUpperLower, tempim, tempjn, alpha, - A(i, j), ldai); + A(i, j)); } } } diff --git a/compute/pzlauum.c b/compute/pzlauum.c index 660ab4e80e8e86fd79bb3115cae265940909ff53..9ad726dff6828c646cf259271d333f1f35131035 100644 --- a/compute/pzlauum.c +++ b/compute/pzlauum.c @@ -36,7 +36,6 @@ void chameleon_pzlauum(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_option_t options; int k, m, n; - int ldak, ldam, ldan; int tempkm, tempkn; chamctxt = chameleon_context_self(); @@ -50,25 +49,22 @@ void chameleon_pzlauum(cham_uplo_t uplo, CHAM_desc_t *A, if (uplo == ChamLower) { for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); for(n = 0; n < k; n++) { - ldan = BLKLDD(A, n); INSERT_TASK_zherk( &options, uplo, ChamConjTrans, A->mb, tempkm, A->mb, - 1.0, A(k, n), ldak, - 1.0, A(n, n), ldan); + 1.0, A(k, n), + 1.0, A(n, n)); for(m = n+1; m < k; m++) { - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamConjTrans, ChamNoTrans, A->mb, A->nb, tempkm, A->mb, - 1.0, A(k, m), ldak, - A(k, n), ldak, - 1.0, A(m, n), ldam); + 1.0, A(k, m), + A(k, n), + 1.0, A(m, n)); } } for (n = 0; n < k; n++) { @@ -77,14 +73,14 @@ void chameleon_pzlauum(cham_uplo_t uplo, CHAM_desc_t *A, &options, ChamLeft, uplo, ChamConjTrans, ChamNonUnit, tempkm, A->nb, A->mb, - 1.0, A(k, k), ldak, - A(k, n), ldak); + 1.0, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_zlauum( &options, uplo, tempkm, A->mb, - A(k, k), ldak); + A(k, k)); } } /* @@ -93,43 +89,39 @@ void chameleon_pzlauum(cham_uplo_t uplo, CHAM_desc_t *A, else { for (k = 0; k < A->mt; k++) { tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); INSERT_TASK_zherk( &options, uplo, ChamNoTrans, A->mb, tempkn, A->mb, - 1.0, A(m, k), ldam, - 1.0, A(m, m), ldam); + 1.0, A(m, k), + 1.0, A(m, m)); for (n = m+1; n < k; n++){ - ldan = BLKLDD(A, n); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamConjTrans, A->mb, A->nb, tempkn, A->mb, - 1.0, A(m, k), ldam, - A(n, k), ldan, - 1.0, A(m, n), ldam); + 1.0, A(m, k), + A(n, k), + 1.0, A(m, n)); } } for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); RUNTIME_data_flush( sequence, A(m, k) ); INSERT_TASK_ztrmm( &options, ChamRight, uplo, ChamConjTrans, ChamNonUnit, A->mb, tempkn, A->mb, - 1.0, A(k, k), ldak, - A(m, k), ldam); + 1.0, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_zlauum( &options, uplo, tempkn, A->mb, - A(k, k), ldak); + A(k, k)); } } RUNTIME_options_finalize(&options, chamctxt); diff --git a/compute/pzplghe.c b/compute/pzplghe.c index 27ae7d355601139cef866aca8ceca1e3be7d99ee..26fa75c031c2b85b57ae60b972c87d3f934f9a90 100644 --- a/compute/pzplghe.c +++ b/compute/pzplghe.c @@ -35,7 +35,6 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_option_t options; int m, n, minmn; - int ldam; int tempmm, tempnn; chamctxt = chameleon_context_self(); @@ -52,12 +51,11 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, for (m = n; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); options.priority = m + n; INSERT_TASK_zplghe( &options, - bump, tempmm, tempnn, A(m, n), ldam, + bump, tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } @@ -66,7 +64,6 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, case ChamUpper: for (m = 0; m < minmn; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = m; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -74,7 +71,7 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, options.priority = m + n; INSERT_TASK_zplghe( &options, - bump, tempmm, tempnn, A(m, n), ldam, + bump, tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } @@ -83,7 +80,6 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, default: for (m = 0; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = 0; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -91,7 +87,7 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, options.priority = m + n; INSERT_TASK_zplghe( &options, - bump, tempmm, tempnn, A(m, n), ldam, + bump, tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } diff --git a/compute/pzplgsy.c b/compute/pzplgsy.c index 38bc8fcba6f6dd5831f0a5493efb6bf8400df559..cddda67a55e03239b06ab91726bb23ef8546268c 100644 --- a/compute/pzplgsy.c +++ b/compute/pzplgsy.c @@ -35,7 +35,6 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ RUNTIME_option_t options; int m, n, minmn; - int ldam; int tempmm, tempnn; chamctxt = chameleon_context_self(); @@ -52,12 +51,11 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ for (m = n; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); options.priority = m + n; INSERT_TASK_zplgsy( &options, - bump, tempmm, tempnn, A(m, n), ldam, + bump, tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } @@ -66,7 +64,6 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ case ChamUpper: for (m = 0; m < minmn; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = m; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -74,7 +71,7 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ options.priority = m + n; INSERT_TASK_zplgsy( &options, - bump, tempmm, tempnn, A(m, n), ldam, + bump, tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } @@ -83,7 +80,6 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ default: for (m = 0; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = 0; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -91,7 +87,7 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ options.priority = m + n; INSERT_TASK_zplgsy( &options, - bump, tempmm, tempnn, A(m, n), ldam, + bump, tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } diff --git a/compute/pzplrnt.c b/compute/pzplrnt.c index d7b18ae25d80ed549a343d98edb102c259722bf0..cbcc50a674a1c6654df7b5e8f7f76df8f2781563 100644 --- a/compute/pzplrnt.c +++ b/compute/pzplrnt.c @@ -34,7 +34,6 @@ void chameleon_pzplrnt( CHAM_desc_t *A, unsigned long long int seed, RUNTIME_option_t options; int m, n; - int ldam; int tempmm, tempnn; chamctxt = chameleon_context_self(); @@ -45,14 +44,13 @@ void chameleon_pzplrnt( CHAM_desc_t *A, unsigned long long int seed, for (m = 0; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = 0; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; INSERT_TASK_zplrnt( &options, - tempmm, tempnn, A(m, n), ldam, + tempmm, tempnn, A(m, n), A->m, m*A->mb, n*A->nb, seed ); } } diff --git a/compute/pzpotrf.c b/compute/pzpotrf.c index 68b0e926887d73c3ef7847c50d4cf667ae6c8d31..c9c77abbf5ab8476f38a1036735b15507bf315b6 100644 --- a/compute/pzpotrf.c +++ b/compute/pzpotrf.c @@ -37,7 +37,6 @@ void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_option_t options; int k, m, n; - int ldak, ldam, ldan; int tempkm, tempmm, tempnn; size_t ws_host = 0; @@ -60,52 +59,48 @@ void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); options.priority = 2*A->mt - 2*k; INSERT_TASK_zpotrf( &options, ChamLower, tempkm, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); options.priority = 2*A->mt - 2*k - m; INSERT_TASK_ztrsm( &options, ChamRight, ChamLower, ChamConjTrans, ChamNonUnit, tempmm, A->mb, A->mb, - zone, A(k, k), ldak, - A(m, k), ldam); + zone, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - ldan = BLKLDD(A, n); options.priority = 2*A->mt - 2*k - n; INSERT_TASK_zherk( &options, ChamLower, ChamNoTrans, tempnn, A->nb, A->mb, - -1.0, A(n, k), ldan, - 1.0, A(n, n), ldan); + -1.0, A(n, k), + 1.0, A(n, n)); for (m = n+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m - m*A->mb : A->mb; - ldam = BLKLDD(A, m); options.priority = 2*A->mt - 2*k - n - m; INSERT_TASK_zgemm( &options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(m, k), ldam, - A(n, k), ldan, - zone, A(m, n), ldam); + mzone, A(m, k), + A(n, k), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); } @@ -120,14 +115,13 @@ void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); options.priority = 2*A->nt - 2*k; INSERT_TASK_zpotrf( &options, ChamUpper, tempkm, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n - n*A->nb : A->nb; @@ -137,22 +131,21 @@ void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, &options, ChamLeft, ChamUpper, ChamConjTrans, ChamNonUnit, A->mb, tempnn, A->mb, - zone, A(k, k), ldak, - A(k, n), ldak); + zone, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m - m*A->mb : A->mb; - ldam = BLKLDD(A, m); options.priority = 2*A->nt - 2*k - m; INSERT_TASK_zherk( &options, ChamUpper, ChamConjTrans, tempmm, A->mb, A->mb, - -1.0, A(k, m), ldak, - 1.0, A(m, m), ldam); + -1.0, A(k, m), + 1.0, A(m, m)); for (n = m+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -162,9 +155,9 @@ void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, &options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(k, m), ldak, - A(k, n), ldak, - zone, A(m, n), ldam); + mzone, A(k, m), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); } diff --git a/compute/pzpotrimm.c b/compute/pzpotrimm.c index 8982ad9ced319b0ce12105a7c0510d315dd2ee12..7d924b9fe16a40168d55fbd8c739f0ee4c626169 100644 --- a/compute/pzpotrimm.c +++ b/compute/pzpotrimm.c @@ -36,8 +36,6 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_option_t options; int k, m, n; - int ldbm, ldcm; - int ldak, ldam, ldan; int tempkm, tempmm, tempnn, tempkn; CHAMELEON_Complex64_t alpha = (CHAMELEON_Complex64_t) 1.0; @@ -64,45 +62,41 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); INSERT_TASK_zpotrf( &options, ChamLower, tempkm, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_ztrsm( &options, ChamRight, ChamLower, ChamConjTrans, ChamNonUnit, tempmm, A->mb, A->mb, - zone, A(k, k), ldak, - A(m, k), ldam); + zone, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - ldan = BLKLDD(A, n); INSERT_TASK_zherk( &options, ChamLower, ChamNoTrans, tempnn, A->nb, A->mb, - -1.0, A(n, k), ldan, - 1.0, A(n, n), ldan); + -1.0, A(n, k), + 1.0, A(n, n)); for (m = n+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m - m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(m, k), ldam, - A(n, k), ldan, - zone, A(m, n), ldam); + mzone, A(m, k), + A(n, k), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); } @@ -116,28 +110,25 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, A->nt + k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_ztrsm( &options, ChamRight, uplo, ChamNoTrans, ChamNonUnit, tempmm, tempkn, A->mb, - mzone, A(k, k), ldak, - A(m, k), ldam); + mzone, A(k, k), + A(m, k)); } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = 0; n < k; n++) { INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, A->nb, tempkn, A->mb, - zone, A(m, k), ldam, - A(k, n), ldak, - zone, A(m, n), ldam); + zone, A(m, k), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); } @@ -147,15 +138,15 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ &options, ChamLeft, uplo, ChamNoTrans, ChamNonUnit, tempkn, A->nb, A->mb, - zone, A(k, k), ldak, - A(k, n), ldak); + zone, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_ztrtri( &options, uplo, ChamNonUnit, tempkn, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); RUNTIME_iteration_pop(chamctxt); } @@ -166,25 +157,22 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, 2*A->nt + k); tempkm = k == A->mt-1 ? 
A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); for(n = 0; n < k; n++) { - ldan = BLKLDD(A, n); INSERT_TASK_zherk( &options, uplo, ChamConjTrans, A->mb, tempkm, A->mb, - 1.0, A(k, n), ldak, - 1.0, A(n, n), ldan); + 1.0, A(k, n), + 1.0, A(n, n)); for(m = n+1; m < k; m++) { - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamConjTrans, ChamNoTrans, A->mb, A->nb, tempkm, A->mb, - 1.0, A(k, m), ldak, - A(k, n), ldak, - 1.0, A(m, n), ldam); + 1.0, A(k, m), + A(k, n), + 1.0, A(m, n)); } } for (n = 0; n < k; n++) { @@ -193,14 +181,14 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ &options, ChamLeft, uplo, ChamConjTrans, ChamNonUnit, tempkm, A->nb, A->mb, - 1.0, A(k, k), ldak, - A(k, n), ldak); + 1.0, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_zlauum( &options, uplo, tempkm, A->mb, - A(k, k), ldak); + A(k, k)); RUNTIME_iteration_pop(chamctxt); } @@ -211,26 +199,22 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, 3*A->nt + k); tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); if (k < n) { INSERT_TASK_zgemm( &options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldbm * K */ - A(n, k), ldan, /* ldan * K */ - zbeta, C(m, n), ldcm); /* ldcm * Y */ + alpha, B(m, k), /* ldbm * K */ + A(n, k), /* ldan * K */ + zbeta, C(m, n)); /* ldcm * Y */ } else { if (k == n) { @@ -238,18 +222,18 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ &options, ChamRight, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * Y */ - B(m, k), ldbm, /* ldbm * Y */ - zbeta, C(m, n), ldcm); /* ldcm * Y */ + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldbm * Y */ + zbeta, C(m, n)); /* ldcm * Y */ } else { INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldbm * K */ - A(k, n), ldak, /* ldak * Y */ - zbeta, C(m, n), ldcm); /* ldcm * Y */ + alpha, B(m, k), /* ldbm * K */ + A(k, n), /* ldak * Y */ + zbeta, C(m, n)); /* ldcm * Y */ } } } @@ -273,12 +257,11 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); INSERT_TASK_zpotrf( &options, ChamUpper, tempkm, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n - n*A->nb : A->nb; @@ -286,21 +269,20 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ &options, ChamLeft, ChamUpper, ChamConjTrans, ChamNonUnit, A->mb, tempnn, A->mb, - zone, A(k, k), ldak, - A(k, n), ldak); + zone, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m - m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zherk( &options, ChamUpper, ChamConjTrans, tempmm, A->mb, A->mb, - -1.0, A(k, m), ldak, - 1.0, A(m, m), ldam); + -1.0, A(k, m), + 1.0, A(m, m)); for (n = m+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -309,9 +291,9 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ &options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(k, m), ldak, - A(k, n), ldak, - zone, A(m, n), ldam); + mzone, A(k, m), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); } @@ -325,46 +307,43 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, A->nt + k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; INSERT_TASK_ztrsm( &options, ChamLeft, uplo, ChamNoTrans, ChamNonUnit, tempkm, tempnn, A->mb, - mzone, A(k, k), ldak, - A(k, n), ldak); + mzone, A(k, k), + A(k, n)); } for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, A->mb, tempnn, tempkm, A->mb, - zone, A(m, k), ldam, - A(k, n), ldak, - zone, A(m, n), ldam); + zone, A(m, k), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); } for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); RUNTIME_data_flush( sequence, A(m, k) ); INSERT_TASK_ztrsm( &options, ChamRight, uplo, ChamNoTrans, ChamNonUnit, A->mb, tempkm, A->mb, - zone, A(k, k), ldak, - A(m, k), ldam); + zone, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_ztrtri( &options, uplo, ChamNonUnit, tempkm, A->mb, - A(k, k), ldak, A->mb*k); + A(k, k), A->mb*k); RUNTIME_iteration_pop(chamctxt); } @@ -375,43 +354,39 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, 2*A->nt + k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); INSERT_TASK_zherk( &options, uplo, ChamNoTrans, A->mb, tempkn, A->mb, - 1.0, A(m, k), ldam, - 1.0, A(m, m), ldam); + 1.0, A(m, k), + 1.0, A(m, m)); for (n = m+1; n < k; n++){ - ldan = BLKLDD(A, n); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamConjTrans, A->mb, A->nb, tempkn, A->mb, - 1.0, A(m, k), ldam, - A(n, k), ldan, - 1.0, A(m, n), ldam); + 1.0, A(m, k), + A(n, k), + 1.0, A(m, n)); } } for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); RUNTIME_data_flush( sequence, A(m, k) ); INSERT_TASK_ztrmm( &options, ChamRight, uplo, ChamConjTrans, ChamNonUnit, A->mb, tempkn, A->mb, - 1.0, A(k, k), ldak, - A(m, k), ldam); + 1.0, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_zlauum( &options, uplo, tempkn, A->mb, - A(k, k), ldak); + A(k, k)); RUNTIME_iteration_pop(chamctxt); } @@ -422,26 +397,22 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ RUNTIME_iteration_push(chamctxt, 3*A->nt + k); tempkn = k == 
C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); if (k < n) { INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldbm * K */ - A(k, n), ldak, /* ldak * Y */ - zbeta, C(m, n), ldcm); /* ldcm * Y */ + alpha, B(m, k), /* ldbm * K */ + A(k, n), /* ldak * Y */ + zbeta, C(m, n)); /* ldcm * Y */ } else { if (k == n) { @@ -449,18 +420,18 @@ void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_ &options, ChamRight, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * Y */ - B(m, k), ldbm, /* ldbm * Y */ - zbeta, C(m, n), ldcm); /* ldcm * Y */ + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldbm * Y */ + zbeta, C(m, n)); /* ldcm * Y */ } else { INSERT_TASK_zgemm( &options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldbm * K */ - A(n, k), ldan, /* ldan * K */ - zbeta, C(m, n), ldcm); /* ldcm * Y */ + alpha, B(m, k), /* ldbm * K */ + A(n, k), /* ldan * K */ + zbeta, C(m, n)); /* ldcm * Y */ } } } diff --git a/compute/pzsymm.c b/compute/pzsymm.c index f9d724c08ab8835e1dddffc332bb4298facc9ee2..d953dacf503be5cc2e3c513d0b4d43ab8647e033 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -43,7 +43,6 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, RUNTIME_sequence_t *sequence = options->sequence; cham_trans_t transA; int m, n, k, p, q, KT, K, lp, lq; - int ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; @@ -64,7 +63,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, /* Transfert ownership of the k column of A or B */ for (m = 0; m < C->mt; m ++ ) { - int Am, Ak, ldam; + int Am, Ak; int tempam, tempak; tempmm = m == C->mt-1 ? 
C->m - m * C->mb : C->mb; @@ -85,13 +84,12 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, tempam = tempmm; tempak = tempkk; } - ldam = BLKLDD( A, Am ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempam, tempak, C->mb, - A( Am, Ak ), ldam, - WA( m, (k % C->q) + lq ), WA->mb ); + A( Am, Ak ), + WA( m, (k % C->q) + lq ) ); RUNTIME_data_flush( sequence, A( Am, Ak ) ); @@ -99,23 +97,21 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempam, tempak, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA->mb, - WA( m, ((k+q) % C->q) + lq ), WA->mb ); + WA( m, ((k+q-1) % C->q) + lq ), + WA( m, ((k+q) % C->q) + lq ) ); } } /* Transfert ownership of the k row of B, or A */ for (n = 0; n < C->nt; n++) { - int ldbk; tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldbk = BLKLDD( B, k ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - B( k, n ), ldbk, - WB( (k % C->p) + lp, n ), WB->mb ); + B( k, n ), + WB( (k % C->p) + lp, n ) ); RUNTIME_data_flush( sequence, B( k, n ) ); @@ -123,15 +119,14 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB->mb, - WB( ((k+p) % C->p) + lp, n ), WB->mb ); + WB( ((k+p-1) % C->p) + lp, n ), + WB( ((k+p) % C->p) + lp, n ) ); } } /* Perform the update of this iteration */ for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); if ( k == m ) { for (n = myq; n < C->nt; n+=C->q) { @@ -140,9 +135,9 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zsymm( options, ChamLeft, uplo, tempmm, tempnn, A->mb, - alpha, WA( m, myq + lq ), WA->mb, - WB( myp + lp, n ), WB->mb, - zbeta, C( m, n ), ldcm ); + alpha, WA( m, myq + lq ), + WB( myp + lp, n ), + zbeta, C( m, n ) ); } } else { @@ -161,9 +156,9 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zgemm( options, transA, ChamNoTrans, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA->mb, - WB( myp + lp, n ), WB->mb, - zbeta, C( m, n ), ldcm ); + alpha, WA( m, myq + lq ), + WB( myp + lp, n ), + zbeta, C( m, n ) ); } } } @@ -184,7 +179,6 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, RUNTIME_sequence_t *sequence = options->sequence; cham_trans_t transA; int m, n, k, p, q, KT, K, lp, lq; - int ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; @@ -205,16 +199,14 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, /* Transfert ownership of the k column of A or B */ for (m = 0; m < C->mt; m++ ) { - int ldbm; tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; - ldbm = BLKLDD( B, m ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkk, C->mb, - B( m, k ), ldbm, - WA( m, (k % C->q) + lq ), WA->mb ); + B( m, k ), + WA( m, (k % C->q) + lq ) ); RUNTIME_data_flush( sequence, B( m, k ) ); @@ -222,14 +214,14 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA->mb, - WA( m, ((k+q) % C->q) + lq ), WA->mb ); + WA( m, ((k+q-1) % C->q) + lq ), + WA( m, ((k+q) % C->q) + lq ) ); } } /* Transfert ownership of the k row of B, or A */ for (n = 0; n < C->nt; n++) { - int Ak, An, ldak; + int Ak, An; int tempak, tempan; tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -249,13 +241,12 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, tempak = tempkk; tempan = tempnn; } - ldak = BLKLDD( A, Ak ); INSERT_TASK_zlacpy( options, ChamUpperLower, tempak, tempan, C->mb, - A( Ak, An ), ldak, - WB( (k % C->p) + lp, n ), WB->mb ); + A( Ak, An ), + WB( (k % C->p) + lp, n ) ); RUNTIME_data_flush( sequence, A( Ak, An ) ); @@ -263,8 +254,8 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, INSERT_TASK_zlacpy( options, ChamUpperLower, tempak, tempan, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB->mb, - WB( ((k+p) % C->p) + lp, n ), WB->mb ); + WB( ((k+p-1) % C->p) + lp, n ), + WB( ((k+p) % C->p) + lp, n ) ); } } @@ -275,15 +266,14 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, if ( k == n ) { for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); /* A has been stored in WA or WB for the summa ring */ INSERT_TASK_zsymm( options, ChamRight, uplo, tempmm, tempnn, A->mb, - alpha, WB( myp + lp, n ), WB->mb, - WA( m, myq + lq ), WA->mb, - zbeta, C( m, n ), ldcm ); + alpha, WB( myp + lp, n ), + WA( m, myq + lq ), + zbeta, C( m, n ) ); } } else { @@ -298,14 +288,13 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, for (m = myp; m < C->mt; m+=C->p) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); INSERT_TASK_zgemm( options, ChamNoTrans, transA, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA->mb, - WB( myp + lp, n ), WB->mb, - zbeta, C( m, n ), ldcm ); + alpha, WA( m, myq + lq ), + WB( myp + lp, n ), + zbeta, C( m, n ) ); } } } @@ -366,7 +355,6 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ RUNTIME_option_t *options ) { int k, m, n; - int ldam, ldan, ldak, ldbk, ldbm, ldcm; int tempmm, tempnn, tempkn, tempkm; CHAMELEON_Complex64_t zbeta; @@ -374,28 +362,24 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ for(m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for(n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; /* * ChamLeft / ChamLower */ if (side == ChamLeft) { - ldam = BLKLDD(A, m); if (uplo == ChamLower) { for (k = 0; k < C->mt; k++) { tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; if (k < m) { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(m, k), ldam, /* lda * K */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == m) { @@ -403,18 +387,18 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* ldak * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -425,17 +409,15 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ else { for (k = 0; k < C->mt; k++) { tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; if (k < m) { INSERT_TASK_zgemm( options, ChamTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* ldak * X */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == m) { @@ -443,18 +425,18 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * K */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * K */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(m, k), ldam, /* lda * K */ - B(k, n), ldbk, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -464,21 +446,18 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ * ChamRight / ChamLower */ else { - ldan = BLKLDD(A, n); - ldbm = BLKLDD(B, m); if (uplo == ChamLower) { for (k = 0; k < C->nt; k++) { tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; if (k < n) { INSERT_TASK_zgemm( options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(n, k), ldan, /* lda * K */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == n) { @@ -486,18 +465,18 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * Y */ - B(m, k), ldbm, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(k, n), ldak, /* ldak * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zbeta, C(m, n)); /* ldc * Y */ } } } @@ -508,16 +487,15 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ else { for (k = 0; k < C->nt; k++) { tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; if (k < n) { INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(k, n), ldak, /* ldak * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { if (k == n) { @@ -525,18 +503,18 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, /* ldak * Y */ - B(m, k), ldbm, /* ldb * Y */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zbeta, C(m, n)); /* ldc * Y */ } else { INSERT_TASK_zgemm( options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldb * K */ - A(n, k), ldan, /* lda * K */ - zbeta, C(m, n), ldcm); /* ldc * Y */ + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zbeta, C(m, n)); /* ldc * Y */ } } } diff --git a/compute/pzsyr2k.c b/compute/pzsyr2k.c index 0b5f7195f398bd8319a38868cc6ac8ac7de92d55..c0c34cfd62605e0d85cd80c2b4115b46bbf89538 100644 --- a/compute/pzsyr2k.c +++ b/compute/pzsyr2k.c @@ -38,8 +38,6 @@ void chameleon_pzsyr2k( cham_uplo_t uplo, cham_trans_t trans, RUNTIME_option_t options; int m, n, k, mmin, mmax; - int ldak, ldam, ldan, ldcm, ldcn; - int ldbk, ldbm, ldbn; int tempnn, tempmm, tempkn, tempkm; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; @@ -53,9 +51,6 @@ void chameleon_pzsyr2k( cham_uplo_t uplo, cham_trans_t trans, for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); - ldbn = BLKLDD(B, n); - ldcn = BLKLDD(C, n); if (uplo == ChamLower) { mmin = n+1; @@ -77,15 +72,12 @@ void chameleon_pzsyr2k( cham_uplo_t uplo, cham_trans_t trans, &options, uplo, trans, tempnn, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - B(n, k), ldbn, - zbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(n, k), /* ldan * K */ + B(n, k), + zbeta, C(n, n)); /* ldc * N */ } for (m = mmin; m < mmax; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? beta : zone; @@ -93,17 +85,17 @@ void chameleon_pzsyr2k( cham_uplo_t uplo, cham_trans_t trans, &options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, - B(n, k), ldbn, - zbeta, C(m, n), ldcm); + alpha, A(m, k), + B(n, k), + zbeta, C(m, n)); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(n, k), ldan, - zone, C(m, n), ldcm); + alpha, B(m, k), + A(n, k), + zone, C(m, n)); } } } @@ -113,40 +105,35 @@ void chameleon_pzsyr2k( cham_uplo_t uplo, cham_trans_t trans, else { for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zsyr2k( &options, uplo, trans, tempnn, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * N */ - B(k, n), ldbk, - zbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(k, n), /* lda * N */ + B(k, n), + zbeta, C(n, n)); /* ldc * N */ } for (m = mmin; m < mmax; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); zbeta = k == 0 ? 
beta : zone; INSERT_TASK_zgemm( &options, ChamTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(k, m), + B(k, n), + zbeta, C(m, n)); INSERT_TASK_zgemm( &options, ChamTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, B(k, m), ldbk, - A(k, n), ldak, - zone, C(m, n), ldcm ); + alpha, B(k, m), + A(k, n), + zone, C(m, n) ); } } } diff --git a/compute/pzsyrk.c b/compute/pzsyrk.c index 8f6b3013ac6f6bd8632e225c502d5296ab1b7567..74ea7112e6ef2074568d133532e5f42256e49d92 100644 --- a/compute/pzsyrk.c +++ b/compute/pzsyrk.c @@ -38,7 +38,6 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, RUNTIME_option_t options; int m, n, k; - int ldak, ldam, ldan, ldcm, ldcn; int tempnn, tempmm, tempkn, tempkm; CHAMELEON_Complex64_t zbeta; @@ -52,8 +51,6 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); - ldcn = BLKLDD(C, n); /* * ChamNoTrans */ @@ -65,8 +62,8 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, &options, uplo, trans, tempnn, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - zbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(n, k), /* ldan * K */ + zbeta, C(n, n)); /* ldc * N */ } /* * ChamNoTrans / ChamLower @@ -74,8 +71,6 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, if (uplo == ChamLower) { for (m = n+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? 
beta : zone; @@ -83,9 +78,9 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, &options, trans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, /* ldam * K */ - A(n, k), ldan, /* ldan * K */ - zbeta, C(m, n), ldcm); /* ldc * N */ + alpha, A(m, k), /* ldam * K */ + A(n, k), /* ldan * K */ + zbeta, C(m, n)); /* ldc * N */ } } } @@ -95,7 +90,6 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, else { for (m = n+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); for (k = 0; k < A->nt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? beta : zone; @@ -103,9 +97,9 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, &options, trans, ChamTrans, tempnn, tempmm, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - A(m, k), ldam, /* ldam * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ + alpha, A(n, k), /* ldan * K */ + A(m, k), /* ldam * M */ + zbeta, C(n, m)); /* ldc * M */ } } } @@ -116,14 +110,13 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, else { for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zsyrk( &options, uplo, trans, tempnn, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * N */ - zbeta, C(n, n), ldcn); /* ldc * N */ + alpha, A(k, n), /* lda * N */ + zbeta, C(n, n)); /* ldc * N */ } /* * ChamTrans / ChamLower @@ -131,18 +124,16 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, if (uplo == ChamLower) { for (m = n+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? 
beta : zone; INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* lda * M */ - A(k, n), ldak, /* lda * N */ - zbeta, C(m, n), ldcm); /* ldc * N */ + alpha, A(k, m), /* lda * M */ + A(k, n), /* lda * N */ + zbeta, C(m, n)); /* ldc * N */ } } } @@ -154,15 +145,14 @@ void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; for (k = 0; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempnn, tempmm, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * K */ - A(k, m), ldak, /* lda * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ + alpha, A(k, n), /* lda * K */ + A(k, m), /* lda * M */ + zbeta, C(n, m)); /* ldc * M */ } } } diff --git a/compute/pzsytrf.c b/compute/pzsytrf.c index 381f7e4a1f7533919f5877a3362a8427893db552..42bced9d3db381927f2b27cf7fa75a39e60822ba 100644 --- a/compute/pzsytrf.c +++ b/compute/pzsytrf.c @@ -36,7 +36,6 @@ void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_option_t options; int k, m, n; - int ldak, ldam, ldan; int tempkm, tempmm, tempnn; size_t ws_host = 0; @@ -59,45 +58,41 @@ void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); INSERT_TASK_zsytrf_nopiv( &options, ChamLower, tempkm, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_ztrsm( &options, ChamRight, ChamLower, ChamTrans, ChamNonUnit, tempmm, A->mb, A->mb, - zone, A(k, k), ldak, - A(m, k), ldam); + zone, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; - ldan = BLKLDD(A, n); INSERT_TASK_zsyrk( &options, ChamLower, ChamNoTrans, tempnn, A->nb, A->mb, - -1.0, A(n, k), ldan, - 1.0, A(n, n), ldan); + -1.0, A(n, k), + 1.0, A(n, n)); for (m = n+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m - m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(m, k), ldam, - A(n, k), ldan, - zone, A(m, n), ldam); + mzone, A(m, k), + A(n, k), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); } @@ -113,12 +108,11 @@ void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); INSERT_TASK_zsytrf_nopiv( &options, ChamUpper, tempkm, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n - n*A->nb : A->nb; @@ -126,21 +120,20 @@ void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, &options, ChamLeft, ChamUpper, ChamTrans, ChamNonUnit, A->mb, tempnn, A->mb, - zone, A(k, k), ldak, - A(k, n), ldak); + zone, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m - m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_zsyrk( &options, ChamUpper, ChamTrans, tempmm, A->mb, A->mb, - -1.0, A(k, m), ldak, - 1.0, A(m, m), ldam); + -1.0, A(k, m), + 1.0, A(m, m)); for (n = m+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -149,9 +142,9 @@ void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, &options, ChamTrans, ChamNoTrans, tempmm, tempnn, A->mb, A->mb, - mzone, A(k, m), ldak, - A(k, n), ldak, - zone, A(m, n), ldam); + mzone, A(k, m), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); } diff --git a/compute/pztile2band.c b/compute/pztile2band.c index 45766f7a028926ae61909d1168fd5e10124d8acb..ef9d54becc0d1dc245af56591152ceab371835bf 100644 --- a/compute/pztile2band.c +++ b/compute/pztile2band.c @@ -34,7 +34,6 @@ void chameleon_pztile2band(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_option_t options; int j; - int ldaj, ldx; int tempjm, tempjn; int minmnt = chameleon_min(A->mt, A->nt); @@ -44,7 +43,8 @@ void chameleon_pztile2band(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, } RUNTIME_options_init(&options, chamctxt, sequence, request); - ldx = B->mb-1; + /* The code is actually incorrect due to the removal of the ld (Need new insert_task dedicated) */ + assert( 0 ); /* * ChamLower => Lower Band @@ -57,28 +57,26 @@ void chameleon_pztile2band(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, tempjm = j == A->mt-1 ? A->m - j * A->mb : A->mb; tempjn = j == B->nt-1 ? B->n - j * B->nb : B->nb; - ldaj = BLKLDD(A, j); INSERT_TASK_zlaset( &options, ChamUpperLower, B->mb, tempjn, 0., 0., - B(0, j), B->mb ); + B(0, j) ); INSERT_TASK_zlacpy( &options, ChamLower, tempjm, tempjn, A->nb, - A(j, j), ldaj, - B(0, j), ldx ); + A(j, j), + B(0, j) ); if( j<minmnt-1 ){ tempjm = (j+1) == A->mt-1 ? A->m-(j+1)*A->mb : A->mb; - ldaj = BLKLDD(A, j+1); INSERT_TASK_zlacpyx( &options, ChamUpper, tempjm, tempjn, A->nb, - 0, A(j+1, j), ldaj, - A->nb, B(0, j), ldx); + 0, A(j+1, j), + A->nb, B(0, j)); } } } @@ -88,28 +86,27 @@ void chameleon_pztile2band(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, assert( A->n == B->n ); assert( A->m >= B->n ); tempjn = j == A->nt-1 ? 
A->n - j * A->nb : A->nb; - ldaj = BLKLDD(A, j); INSERT_TASK_zlaset( &options, ChamUpperLower, B->mb, tempjn, 0., 0., - B(0, j), B->mb ); + B(0, j) ); if(j > 0){ INSERT_TASK_zlacpy( &options, ChamLower, A->mb, tempjn, A->nb, - A(j-1, j), BLKLDD(A, j-1), - B(0, j), ldx); + A(j-1, j), + B(0, j)); } tempjm = j == B->nt-1 ? B->n - j * B->nb : B->nb; INSERT_TASK_zlacpyx( &options, ChamUpper, tempjm, tempjn, A->nb, - 0, A(j, j), ldaj, - A->nb, B(0, j), ldx); + 0, A(j, j), + A->nb, B(0, j)); } } RUNTIME_options_finalize(&options, chamctxt); diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c index c49cb0452008910b14135bf2f9b34ccb9a9e1b0a..73b43d461991515c34dad9b06896f4c66960e09b 100644 --- a/compute/pztpgqrt.c +++ b/compute/pztpgqrt.c @@ -40,7 +40,6 @@ void chameleon_pztpgqrt( int KT, int L, size_t ws_host = 0; int k, m, n; - int ldvm, ldqk, ldqm; int tempkn, tempnn, tempmm, templm; int ib; @@ -80,15 +79,12 @@ void chameleon_pztpgqrt( int KT, int L, RUNTIME_iteration_push(chamctxt, k); tempkn = k == Q1->nt-1 ? Q1->n-k*Q1->nb : Q1->nb; - ldqk = BLKLDD(Q1, k); /* Equivalent to the tsmqr step on Q1,Q2 */ maxmtk = chameleon_min( Q2->mt, maxmt+k ) - 1; for (m = maxmtk; m > -1; m--) { tempmm = m == Q2->mt-1 ? Q2->m-m*Q2->mb : Q2->mb; templm = ((L > 0) && (m == maxmtk)) ? tempmm : 0; - ldvm = BLKLDD(V2, m); - ldqm = BLKLDD(Q2, m); for (n = k; n < Q2->nt; n++) { tempnn = n == Q2->nt-1 ? 
Q2->n-n*Q2->nb : Q2->nb; @@ -97,10 +93,10 @@ void chameleon_pztpgqrt( int KT, int L, &options, ChamLeft, ChamNoTrans, tempmm, tempnn, tempkn, templm, ib, T2->nb, - V2(m, k), ldvm, - T2(m, k), T2->mb, - Q1(k, n), ldqk, - Q2(m, n), ldqm ); + V2(m, k), + T2(m, k), + Q1(k, n), + Q2(m, n) ); } } diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c index 88800dd26eb6efd3606327516afa24d968a2c75d..28effe7ca4ab72e66bd4a7ea2c770bf84b095b8e 100644 --- a/compute/pztpqrt.c +++ b/compute/pztpqrt.c @@ -36,7 +36,6 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, size_t ws_host = 0; int k, m, n; - int ldak, ldbm; int tempkm, tempkn, tempnn, tempmm, templm; int ib; @@ -76,19 +75,17 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); for (m = 0; m < maxmt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; templm = ((L > 0) && (m == maxmt-1)) ? tempmm : 0; - ldbm = BLKLDD(B, m); /* TT kernel */ INSERT_TASK_ztpqrt( &options, tempmm, tempkn, templm, ib, T->nb, - A(k, k), ldak, - B(m, k), ldbm, - T(m, k), T->mb ); + A(k, k), + B(m, k), + T(m, k) ); for (n = k+1; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; @@ -96,10 +93,10 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T, &options, ChamLeft, ChamConjTrans, tempmm, tempnn, tempkm, templm, ib, T->nb, - B(m, k), ldbm, - T(m, k), T->mb, - A(k, n), ldak, - B(m, n), ldbm ); + B(m, k), + T(m, k), + A(k, n), + B(m, n) ); } } diff --git a/compute/pztradd.c b/compute/pztradd.c index 3a29fa4f1e8a9e5ad311febf4d039c43edd98e92..4918f70f39b218636e44b9b2fbb6370613005776 100644 --- a/compute/pztradd.c +++ b/compute/pztradd.c @@ -38,7 +38,6 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, int tempmm, tempnn, tempmn, tempnm; int m, n; - int ldam, ldan, ldbm, ldbn; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -52,25 +51,21 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, for (n = 0; n < chameleon_min(B->mt,B->nt); n++) { tempnm = n == B->mt-1 ? B->m-n*B->mb : B->mb; tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); - ldbn = BLKLDD(B, n); INSERT_TASK_ztradd( &options, uplo, trans, tempnm, tempnn, B->mb, - alpha, A(n, n), ldan, - beta, B(n, n), ldbn); + alpha, A(n, n), + beta, B(n, n)); for (m = n+1; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); INSERT_TASK_zgeadd( &options, trans, tempmm, tempnn, B->mb, - alpha, A(m, n), ldam, - beta, B(m, n), ldbm); + alpha, A(m, n), + beta, B(m, n)); } } } @@ -78,24 +73,21 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, for (n = 0; n < chameleon_min(B->mt,B->nt); n++) { tempnm = n == B->mt-1 ? B->m-n*B->mb : B->mb; tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); - ldbn = BLKLDD(B, n); INSERT_TASK_ztradd( &options, uplo, trans, tempnm, tempnn, B->mb, - alpha, A(n, n), ldan, - beta, B(n, n), ldbn); + alpha, A(n, n), + beta, B(n, n)); for (m = n+1; m < B->mt; m++) { tempmm = m == B->mt-1 ? 
B->m-B->mb*m : B->nb; - ldbm = BLKLDD(B, m); INSERT_TASK_zgeadd( &options, trans, tempmm, tempnn, B->mb, - alpha, A(n, m), ldan, - beta, B(m, n), ldbm); + alpha, A(n, m), + beta, B(m, n)); } } } @@ -105,14 +97,12 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, for (m = 0; m < chameleon_min(B->mt,B->nt); m++) { tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb; tempmn = m == B->nt-1 ? B->n-m*B->nb : B->nb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); INSERT_TASK_ztradd( &options, uplo, trans, tempmm, tempmn, B->mb, - alpha, A(m, m), ldam, - beta, B(m, m), ldbm); + alpha, A(m, m), + beta, B(m, m)); for (n = m+1; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; @@ -120,8 +110,8 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, INSERT_TASK_zgeadd( &options, trans, tempmm, tempnn, B->mb, - alpha, A(m, n), ldam, - beta, B(m, n), ldbm); + alpha, A(m, n), + beta, B(m, n)); } } } @@ -129,24 +119,21 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, for (m = 0; m < chameleon_min(B->mt,B->nt); m++) { tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb; tempmn = m == B->nt-1 ? B->n-m*B->nb : B->nb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); INSERT_TASK_ztradd( &options, uplo, trans, tempmm, tempmn, B->mb, - alpha, A(m, m), ldam, - beta, B(m, m), ldbm); + alpha, A(m, m), + beta, B(m, m)); for (n = m+1; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); INSERT_TASK_zgeadd( &options, trans, tempmm, tempnn, B->mb, - alpha, A(n, m), ldan, - beta, B(m, n), ldbm); + alpha, A(n, m), + beta, B(m, n)); } } } @@ -156,8 +143,6 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, if (trans == ChamNoTrans) { for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; @@ -165,25 +150,23 @@ void chameleon_pztradd(cham_uplo_t uplo, cham_trans_t trans, INSERT_TASK_zgeadd( &options, trans, tempmm, tempnn, B->mb, - alpha, A(m, n), ldam, - beta, B(m, n), ldbm); + alpha, A(m, n), + beta, B(m, n)); } } } else { for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-B->mb*m : B->nb; - ldbm = BLKLDD(B, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); INSERT_TASK_zgeadd( &options, trans, tempmm, tempnn, B->mb, - alpha, A(n, m), ldan, - beta, B(m, n), ldbm); + alpha, A(n, m), + beta, B(m, n)); } } } diff --git a/compute/pztrmm.c b/compute/pztrmm.c index 3f6376274307a8f140a88f1482b82d0aaa3bb015..d0bddb77bd0830b786ffaf29808340cad8f28fc4 100644 --- a/compute/pztrmm.c +++ b/compute/pztrmm.c @@ -40,7 +40,6 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, RUNTIME_option_t options; int k, m, n; - int ldak, ldam, ldan, ldbk, ldbm; int tempkm, tempkn, tempmm, tempnn; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; @@ -58,27 +57,24 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, if (trans == ChamNoTrans) { for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - ldam = BLKLDD(A, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(m, m), ldam, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(m, m), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = m+1; k < A->mt; k++) { tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; - ldbk = BLKLDD(B, k); INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, - B(k, n), ldbk, - zone, B(m, n), ldbm); + alpha, A(m, k), + B(k, n), + zone, B(m, n)); } } } @@ -89,27 +85,23 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, else { for (m = B->mt-1; m > -1; m--) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - ldam = BLKLDD(A, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(m, m), ldam, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(m, m), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = 0; k < m; k++) { - ldbk = BLKLDD(B, k); - ldak = BLKLDD(A, k); INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, B->mb, A->mb, - alpha, A(k, m), ldak, - B(k, n), ldbk, - zone, B(m, n), ldbm); + alpha, A(k, m), + B(k, n), + zone, B(m, n)); } } } @@ -122,26 +114,23 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, if (trans == ChamNoTrans) { for (m = B->mt-1; m > -1; m--) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - ldam = BLKLDD(A, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(m, m), ldam, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(m, m), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = 0; k < m; k++) { - ldbk = BLKLDD(B, k); INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, B->mb, A->mb, - alpha, A(m, k), ldam, - B(k, n), ldbk, - zone, B(m, n), ldbm); + alpha, A(m, k), + B(k, n), + zone, B(m, n)); } } } @@ -152,28 +141,24 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, else { for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); - ldam = BLKLDD(A, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(m, m), ldam, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(m, m), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = m+1; k < A->mt; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, - B(k, n), ldbk, - zone, B(m, n), ldbm); + alpha, A(k, m), + B(k, n), + zone, B(m, n)); } } } @@ -188,26 +173,23 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, if (trans == ChamNoTrans) { for (n = B->nt-1; n > -1; n--) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(n, n), ldan, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(n, n), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = 0; k < n; k++) { - ldak = BLKLDD(A, k); INSERT_TASK_zgemm( &options, ChamNoTrans, trans, tempmm, tempnn, B->mb, A->mb, - alpha, B(m, k), ldbm, - A(k, n), ldak, - zone, B(m, n), ldbm); + alpha, B(m, k), + A(k, n), + zone, B(m, n)); } } } @@ -218,16 +200,14 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, else { for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(n, n), ldan, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(n, n), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = n+1; k < A->mt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -235,9 +215,9 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, &options, ChamNoTrans, trans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(n, k), ldan, - zone, B(m, n), ldbm); + alpha, B(m, k), + A(n, k), + zone, B(m, n)); } } } @@ -250,27 +230,24 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, if (trans == ChamNoTrans) { for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(n, n), ldan, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(n, n), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = n+1; k < A->mt; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); INSERT_TASK_zgemm( &options, ChamNoTrans, trans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(k, n), ldak, - zone, B(m, n), ldbm); + alpha, B(m, k), + A(k, n), + zone, B(m, n)); } } } @@ -281,25 +258,23 @@ void chameleon_pztrmm(cham_side_t side, cham_uplo_t uplo, else { for (n = B->nt-1; n > -1; n--) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrmm( &options, side, uplo, trans, diag, tempmm, tempnn, A->mb, - alpha, A(n, n), ldan, /* lda * tempkm */ - B(m, n), ldbm); /* ldb * tempnn */ + alpha, A(n, n), /* lda * tempkm */ + B(m, n)); /* ldb * tempnn */ for (k = 0; k < n; k++) { INSERT_TASK_zgemm( &options, ChamNoTrans, trans, tempmm, tempnn, B->mb, A->mb, - alpha, B(m, k), ldbm, - A(n, k), ldan, - zone, B(m, n), ldbm); + alpha, B(m, k), + A(n, k), + zone, B(m, n)); } } } diff --git a/compute/pztrsm.c b/compute/pztrsm.c index 38b3cedcab0f041bc1081264bf93b1f47aaad41e..c6e7eac7b8433207064892f50cd3d74020776211 100644 --- a/compute/pztrsm.c +++ b/compute/pztrsm.c @@ -38,7 +38,6 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch RUNTIME_option_t options; int k, m, n; - int ldak, ldam, ldan, ldbk, ldbm; int tempkm, tempkn, tempmm, tempnn; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; @@ -59,8 +58,6 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch if (trans == ChamNoTrans) { for (k = 0; k < B->mt; k++) { tempkm = k == 0 ? B->m-(B->mt-1)*B->mb : B->mb; - ldak = BLKLDD(A, B->mt-1-k); - ldbk = BLKLDD(B, B->mt-1-k); lalpha = k == 0 ? alpha : zone; for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; @@ -68,22 +65,20 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch &options, side, uplo, trans, diag, tempkm, tempnn, A->mb, - lalpha, A(B->mt-1-k, B->mt-1-k), ldak, /* lda * tempkm */ - B(B->mt-1-k, n), ldbk); /* ldb * tempnn */ + lalpha, A(B->mt-1-k, B->mt-1-k), /* lda * tempkm */ + B(B->mt-1-k, n)); /* ldb * tempnn */ } RUNTIME_data_flush( sequence, A(B->mt-1-k, B->mt-1-k) ); for (m = k+1; m < B->mt; m++) { - ldam = BLKLDD(A, B->mt-1-m); - ldbm = BLKLDD(B, B->mt-1-m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, B->mb, tempnn, tempkm, A->mb, - mzone, A(B->mt-1-m, B->mt-1-k), ldam, - B(B->mt-1-k, n ), ldbk, - lalpha, B(B->mt-1-m, n ), ldbm); + mzone, A(B->mt-1-m, B->mt-1-k), + B(B->mt-1-k, n ), + lalpha, B(B->mt-1-m, n )); } RUNTIME_data_flush( sequence, A(B->mt-1-m, B->mt-1-k) ); } @@ -98,8 +93,6 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch else { for (k = 0; k < B->mt; k++) { tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); lalpha = k == 0 ? alpha : zone; for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; @@ -107,22 +100,21 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch &options, side, uplo, trans, diag, tempkm, tempnn, A->mb, - lalpha, A(k, k), ldak, - B(k, n), ldbk); + lalpha, A(k, k), + B(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); for (m = k+1; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_zgemm( &options, trans, ChamNoTrans, tempmm, tempnn, B->mb, A->mb, - mzone, A(k, m), ldak, - B(k, n), ldbk, - lalpha, B(m, n), ldbm); + mzone, A(k, m), + B(k, n), + lalpha, B(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); } @@ -140,8 +132,6 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch if (trans == ChamNoTrans) { for (k = 0; k < B->mt; k++) { tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); lalpha = k == 0 ? alpha : zone; for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; @@ -149,23 +139,21 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch &options, side, uplo, trans, diag, tempkm, tempnn, A->mb, - lalpha, A(k, k), ldak, - B(k, n), ldbk); + lalpha, A(k, k), + B(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); for (m = k+1; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, B->mb, A->mb, - mzone, A(m, k), ldam, - B(k, n), ldbk, - lalpha, B(m, n), ldbm); + mzone, A(m, k), + B(k, n), + lalpha, B(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); } @@ -180,8 +168,6 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch else { for (k = 0; k < B->mt; k++) { tempkm = k == 0 ? B->m-(B->mt-1)*B->mb : B->mb; - ldak = BLKLDD(A, B->mt-1-k); - ldbk = BLKLDD(B, B->mt-1-k); lalpha = k == 0 ? alpha : zone; for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; @@ -189,21 +175,20 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch &options, side, uplo, trans, diag, tempkm, tempnn, A->mb, - lalpha, A(B->mt-1-k, B->mt-1-k), ldak, - B(B->mt-1-k, n), ldbk); + lalpha, A(B->mt-1-k, B->mt-1-k), + B(B->mt-1-k, n)); } RUNTIME_data_flush( sequence, A(B->mt-1-k, B->mt-1-k) ); for (m = k+1; m < B->mt; m++) { - ldbm = BLKLDD(B, B->mt-1-m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; INSERT_TASK_zgemm( &options, trans, ChamNoTrans, B->mb, tempnn, tempkm, A->mb, - mzone, A(B->mt-1-k, B->mt-1-m), ldak, - B(B->mt-1-k, n ), ldbk, - lalpha, B(B->mt-1-m, n ), ldbm); + mzone, A(B->mt-1-k, B->mt-1-m), + B(B->mt-1-k, n ), + lalpha, B(B->mt-1-m, n )); } RUNTIME_data_flush( sequence, A(B->mt-1-k, B->mt-1-m) ); } @@ -222,31 +207,28 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch if (trans == ChamNoTrans) { for (k = 0; k < B->nt; k++) { tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; - ldak = BLKLDD(A, k); lalpha = k == 0 ? alpha : zone; for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrsm( &options, side, uplo, trans, diag, tempmm, tempkn, A->mb, - lalpha, A(k, k), ldak, /* lda * tempkn */ - B(m, k), ldbm); /* ldb * tempkn */ + lalpha, A(k, k), /* lda * tempkn */ + B(m, k)); /* ldb * tempkn */ } RUNTIME_data_flush( sequence, A(k, k) ); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); for (n = k+1; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, B->mb, A->mb, - mzone, B(m, k), ldbm, /* ldb * B->mb */ - A(k, n), ldak, /* lda * tempnn */ - lalpha, B(m, n), ldbm); /* ldb * tempnn */ + mzone, B(m, k), /* ldb * B->mb */ + A(k, n), /* lda * tempnn */ + lalpha, B(m, n)); /* ldb * tempnn */ } RUNTIME_data_flush( sequence, B(m, k) ); } @@ -261,27 +243,24 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch else { for (k = 0; k < B->nt; k++) { tempkn = k == 0 ? B->n-(B->nt-1)*B->nb : B->nb; - ldak = BLKLDD(A, B->nt-1-k); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrsm( &options, side, uplo, trans, diag, tempmm, tempkn, A->mb, - alpha, A(B->nt-1-k, B->nt-1-k), ldak, /* lda * tempkn */ - B( m, B->nt-1-k), ldbm); /* ldb * tempkn */ + alpha, A(B->nt-1-k, B->nt-1-k), /* lda * tempkn */ + B( m, B->nt-1-k)); /* ldb * tempkn */ RUNTIME_data_flush( sequence, A(B->nt-1-k, B->nt-1-k) ); for (n = k+1; n < B->nt; n++) { - ldan = BLKLDD(A, B->nt-1-n); INSERT_TASK_zgemm( &options, ChamNoTrans, trans, tempmm, B->nb, tempkn, A->mb, - minvalpha, B(m, B->nt-1-k), ldbm, /* ldb * tempkn */ - A(B->nt-1-n, B->nt-1-k), ldan, /* A->mb * tempkn (Never last row) */ - zone, B(m, B->nt-1-n), ldbm); /* ldb * B->nb */ + minvalpha, B(m, B->nt-1-k), /* ldb * tempkn */ + A(B->nt-1-n, B->nt-1-k), /* A->mb * tempkn (Never last row) */ + zone, B(m, B->nt-1-n)); /* ldb * B->nb */ } RUNTIME_data_flush( sequence, B(m, B->nt-1-k) ); } @@ -298,17 +277,15 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch if (trans == ChamNoTrans) { for (k = 0; k < B->nt; k++) { tempkn = k == 0 ? B->n-(B->nt-1)*B->nb : B->nb; - ldak = BLKLDD(A, B->nt-1-k); lalpha = k == 0 ? alpha : zone; for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? 
B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrsm( &options, side, uplo, trans, diag, tempmm, tempkn, A->mb, - lalpha, A(B->nt-1-k, B->nt-1-k), ldak, /* lda * tempkn */ - B( m, B->nt-1-k), ldbm); /* ldb * tempkn */ + lalpha, A(B->nt-1-k, B->nt-1-k), /* lda * tempkn */ + B( m, B->nt-1-k)); /* ldb * tempkn */ RUNTIME_data_flush( sequence, A(B->nt-1-k, B->nt-1-k) ); for (n = k+1; n < B->nt; n++) { @@ -316,9 +293,9 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch &options, ChamNoTrans, ChamNoTrans, tempmm, B->nb, tempkn, A->mb, - mzone, B(m, B->nt-1-k), ldbm, /* ldb * tempkn */ - A(B->nt-1-k, B->nt-1-n), ldak, /* lda * B->nb */ - lalpha, B(m, B->nt-1-n), ldbm); /* ldb * B->nb */ + mzone, B(m, B->nt-1-k), /* ldb * tempkn */ + A(B->nt-1-k, B->nt-1-n), /* lda * B->nb */ + lalpha, B(m, B->nt-1-n)); /* ldb * B->nb */ } RUNTIME_data_flush( sequence, B(m, B->nt-1-k) ); } @@ -333,28 +310,25 @@ void chameleon_pztrsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, ch else { for (k = 0; k < B->nt; k++) { tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; - ldak = BLKLDD(A, k); for (m = 0; m < B->mt; m++) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; - ldbm = BLKLDD(B, m); INSERT_TASK_ztrsm( &options, side, uplo, trans, diag, tempmm, tempkn, A->mb, - alpha, A(k, k), ldak, /* lda * tempkn */ - B(m, k), ldbm); /* ldb * tempkn */ + alpha, A(k, k), /* lda * tempkn */ + B(m, k)); /* ldb * tempkn */ RUNTIME_data_flush( sequence, A(k, k) ); for (n = k+1; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; - ldan = BLKLDD(A, n); INSERT_TASK_zgemm( &options, ChamNoTrans, trans, tempmm, tempnn, B->mb, A->mb, - minvalpha, B(m, k), ldbm, /* ldb * tempkn */ - A(n, k), ldan, /* ldan * tempkn */ - zone, B(m, n), ldbm); /* ldb * tempnn */ + minvalpha, B(m, k), /* ldb * tempkn */ + A(n, k), /* ldan * tempkn */ + zone, B(m, n)); /* ldb * tempnn */ } RUNTIME_data_flush( sequence, B(m, k) ); } diff --git a/compute/pztrsmpl.c b/compute/pztrsmpl.c index aa861daf180cc5e08a29a94af96e8ea31e2bc50c..8a760806b734e4401b01f3192dbff4e87e62dcf6 100644 --- a/compute/pztrsmpl.c +++ b/compute/pztrsmpl.c @@ -39,7 +39,6 @@ void chameleon_pztrsmpl( CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *L, int *IP RUNTIME_option_t options; int k, m, n; - int ldak, ldam, ldbk, ldbm; int tempkm, tempnn, tempkmin, tempmm, tempkn; int ib; @@ -54,31 +53,27 @@ void chameleon_pztrsmpl( CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *L, int *IP tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkmin = k == chameleon_min(A->mt, A->nt)-1 ? chameleon_min(A->m, A->n)-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; INSERT_TASK_zgessm( &options, tempkm, tempnn, tempkmin, ib, L->nb, IPIV(k, k), - L(k, k), L->mb, - A(k, k), ldak, - B(k, n), ldbk); + L(k, k), + A(k, k), + B(k, n)); } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); for (n = 0; n < B->nt; n++) { tempnn = n == B->nt-1 ? 
B->n-n*B->nb : B->nb; INSERT_TASK_zssssm( &options, A->nb, tempnn, tempmm, tempnn, tempkn, ib, L->nb, - B(k, n), ldbk, - B(m, n), ldbm, - L(m, k), L->mb, - A(m, k), ldam, + B(k, n), + B(m, n), + L(m, k), + A(m, k), IPIV(m, k)); } } diff --git a/compute/pztrtri.c b/compute/pztrtri.c index 925ec975bf6ce7506a913b93869feac78004f5fd..89c1a60740f6e4ae4907fe5c00dc113fdee0e7f1 100644 --- a/compute/pztrtri.c +++ b/compute/pztrtri.c @@ -36,7 +36,6 @@ void chameleon_pztrtri(cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, RUNTIME_option_t options; int k, m, n; - int ldam, ldak; int tempkn, tempkm, tempmm, tempnn; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; @@ -55,28 +54,25 @@ void chameleon_pztrtri(cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, RUNTIME_iteration_push(chamctxt, k); tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - ldak = BLKLDD(A, k); for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); INSERT_TASK_ztrsm( &options, ChamRight, uplo, ChamNoTrans, diag, tempmm, tempkn, A->mb, - mzone, A(k, k), ldak, - A(m, k), ldam); + mzone, A(k, k), + A(m, k)); } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? 
A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); for (n = 0; n < k; n++) { INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, tempmm, A->nb, tempkn, A->mb, - zone, A(m, k), ldam, - A(k, n), ldak, - zone, A(m, n), ldam); + zone, A(m, k), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); } @@ -86,15 +82,15 @@ void chameleon_pztrtri(cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, &options, ChamLeft, uplo, ChamNoTrans, diag, tempkn, A->nb, A->mb, - zone, A(k, k), ldak, - A(k, n), ldak); + zone, A(k, k), + A(k, n)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_ztrtri( &options, uplo, diag, tempkn, A->mb, - A(k, k), ldak, A->nb*k); + A(k, k), A->nb*k); RUNTIME_iteration_pop(chamctxt); } @@ -107,46 +103,43 @@ void chameleon_pztrtri(cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; INSERT_TASK_ztrsm( &options, ChamLeft, uplo, ChamNoTrans, diag, tempkm, tempnn, A->mb, - mzone, A(k, k), ldak, - A(k, n), ldak); + mzone, A(k, k), + A(k, n)); } for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); INSERT_TASK_zgemm( &options, ChamNoTrans, ChamNoTrans, A->mb, tempnn, tempkm, A->mb, - zone, A(m, k), ldam, - A(k, n), ldak, - zone, A(m, n), ldam); + zone, A(m, k), + A(k, n), + zone, A(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); } for (m = 0; m < k; m++) { - ldam = BLKLDD(A, m); RUNTIME_data_flush( sequence, A(m, k) ); INSERT_TASK_ztrsm( &options, ChamRight, uplo, ChamNoTrans, diag, A->mb, tempkm, A->mb, - zone, A(k, k), ldak, - A(m, k), ldam); + zone, A(k, k), + A(m, k)); } RUNTIME_data_flush( sequence, A(k, k) ); INSERT_TASK_ztrtri( &options, uplo, diag, tempkm, A->mb, - A(k, k), ldak, A->mb*k); + A(k, k), A->mb*k); RUNTIME_iteration_pop(chamctxt); } diff --git a/compute/pzunglq.c b/compute/pzunglq.c index 63c3697f1fb33beb78795c28edb74ad081cb68ae..205678cb781eac775c6399d696e002a3caec0c43 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -42,7 +42,6 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T size_t ws_host = 0; int k, m, n; - int ldak, ldqm, lddk; int tempnn, tempmm, tempkmin, tempkn; int tempAkm, tempAkn; int ib, minMT; @@ -94,14 +93,11 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T tempAkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkmin = chameleon_min( tempAkn, tempAkm ); tempkn = k == Q->nt-1 ? Q->n-k*Q->nb : Q->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); for (n = Q->nt-1; n > k; n--) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? 
Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m); RUNTIME_data_migrate( sequence, Q(m, k), Q->get_rankof( Q, m, n ) ); @@ -111,10 +107,10 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T &options, ChamRight, ChamNoTrans, tempmm, tempnn, tempAkm, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - Q(m, k), ldqm, - Q(m, n), ldqm); + A(k, n), + T(k, n), + Q(m, k), + Q(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -125,19 +121,18 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m); /* Restore the original location of the tiles */ RUNTIME_data_migrate( sequence, Q(m, k), @@ -147,9 +142,9 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T &options, ChamRight, ChamNoTrans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - Q(m, k), ldqm); + D(k), + T(k, k), + Q(m, k)); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index e90345ec7cd63c3a6518f81e89c97056837184dd..af4bfff5eb47af8ef593a9d93d65af5c647f3601 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -41,7 +41,6 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int k, m, n, i, p; int K, L; - int ldak, ldqm, lddk; int tempkm, tempkmin, temppn, tempnn, tempmm; int ib, node, nbtiles, *tiles; @@ -87,8 +86,6 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? 
A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); @@ -111,7 +108,6 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t } for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m); node = Q->get_rankof( Q, m, n ); RUNTIME_data_migrate( sequence, Q(m, p), node ); @@ -121,10 +117,10 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamRight, ChamNoTrans, tempmm, tempnn, tempkm, L, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - Q(m, p), ldqm, - Q(m, n), ldqm); + A(k, n), + T(k, n), + Q(m, p), + Q(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -142,19 +138,18 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? 
Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m); RUNTIME_data_migrate( sequence, Q(m, p), Q->get_rankof( Q, m, p ) ); @@ -163,9 +158,9 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamRight, ChamNoTrans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - Q(m, p), ldqm); + D(k, p), + T(k, p), + Q(m, p)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index 6bdfcabeaa7220525fda52945f29d87e2b552d4c..0ab0756b67e5f40c2ff215f100e1031710a57c04 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -46,7 +46,6 @@ void chameleon_pzunglqrh( int genD, int BS, int k, m, n; int K, N, RD, lastRD; - int ldak, lddk, ldqm; int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn; int ib, node; @@ -88,8 +87,6 @@ void chameleon_pzunglqrh( int genD, int BS, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); lastRD = 0; for (RD = BS; RD < A->nt-k; RD *= 2) lastRD = RD; @@ -98,7 +95,6 @@ void chameleon_pzunglqrh( int genD, int BS, tempNRDn = N+RD == A->nt-1 ? A->n-(N+RD)*A->nb : A->nb; for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m ); node = Q->get_rankof( Q, m, N+RD ); RUNTIME_data_migrate( sequence, Q(m, N), node ); @@ -109,10 +105,10 @@ void chameleon_pzunglqrh( int genD, int BS, &options, ChamRight, ChamNoTrans, tempmm, tempNRDn, tempkm, tempNRDn, ib, T->nb, - A (k, N+RD), ldak, - T2(k, N+RD), T->mb, - Q (m, N ), ldqm, - Q (m, N+RD), ldqm); + A (k, N+RD), + T2(k, N+RD), + Q (m, N ), + Q (m, N+RD)); } RUNTIME_data_flush( sequence, A (k, N+RD) ); @@ -127,7 +123,6 @@ void chameleon_pzunglqrh( int genD, int BS, for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? 
Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m); node = Q->get_rankof( Q, m, n ); RUNTIME_data_migrate( sequence, Q(m, N), node ); @@ -138,10 +133,10 @@ void chameleon_pzunglqrh( int genD, int BS, &options, ChamRight, ChamNoTrans, tempmm, tempnn, tempkm, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - Q(m, N), ldqm, - Q(m, n), ldqm); + A(k, n), + T(k, n), + Q(m, N), + Q(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); @@ -154,19 +149,18 @@ void chameleon_pzunglqrh( int genD, int BS, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDNn, A->nb, - A(k, N), ldak, - D(k, N), lddk ); + A(k, N), + D(k, N) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDNn, 0., 1., - D(k, N), lddk ); + D(k, N) ); #endif } for (m = k; m < Q->mt; m++) { tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; - ldqm = BLKLDD(Q, m); RUNTIME_data_migrate( sequence, Q(m, N), Q->get_rankof( Q, m, N ) ); @@ -176,9 +170,9 @@ void chameleon_pzunglqrh( int genD, int BS, ChamRight, ChamNoTrans, tempmm, tempNn, tempkmin, ib, T->nb, - D(k, N), lddk, - T(k, N), T->mb, - Q(m, N), ldqm); + D(k, N), + T(k, N), + Q(m, N)); } RUNTIME_data_flush( sequence, D(k, N) ); RUNTIME_data_flush( sequence, T(k, N) ); diff --git a/compute/pzungqr.c b/compute/pzungqr.c index dda7b25a1c1bb50004b749ab546d46bbb924b872..fdccd74fe30437185fb753b6af009c59f068cbbe 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -43,7 +43,6 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, size_t ws_host = 0; int k, m, n; - int ldak, ldqk, ldam, ldqm, lddk; int tempmm, tempnn, tempkmin, tempkm; int tempAkm, tempAkn; int ib, minMT; @@ -95,13 +94,8 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, tempAkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkmin = chameleon_min( tempAkn, tempAkm ); tempkm = k == Q->mt-1 ? Q->m-k*Q->mb : Q->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); - ldqk = BLKLDD(Q, k); for (m = Q->mt - 1; m > k; m--) { tempmm = m == Q->mt-1 ? 
Q->m-m*Q->mb : Q->mb; - ldam = BLKLDD(A, m); - ldqm = BLKLDD(Q, m); for (n = k; n < Q->nt; n++) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; @@ -113,10 +107,10 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, &options, ChamLeft, ChamNoTrans, tempmm, tempnn, tempAkn, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - Q(k, n), ldqk, - Q(m, n), ldqm); + A(m, k), + T(m, k), + Q(k, n), + Q(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -128,14 +122,14 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, INSERT_TASK_zlacpy( &options, ChamLower, tempDkm, tempkmin, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDkm, tempkmin, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (n = k; n < Q->nt; n++) { @@ -149,9 +143,9 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, &options, ChamLeft, ChamNoTrans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - Q(k, n), ldqk); + D(k), + T(k, k), + Q(k, n)); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index 346848b6fc98efe1a0ca00c987dab9e3bfe3d5bb..8458b97def80e8d72016a2ffabc498f43bcf95bc 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -42,7 +42,6 @@ void chameleon_pzungqr_param( int genD, int K, size_t ws_host = 0; int k, m, n, i, p, L; - int ldam, ldqm, ldqp, lddm; int tempmm, tempnn, tempkmin, tempkn; int ib, nbgeqrt, node, nbtiles, *tiles; @@ -94,9 +93,6 @@ void chameleon_pzungqr_param( int genD, int K, p = qrtree->currpiv(qrtree, k, m); tempmm = m == Q->mt-1 ? 
Q->m-m*Q->mb : Q->mb; - ldqp = BLKLDD(Q, p); - ldam = BLKLDD(A, m); - ldqm = BLKLDD(Q, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -120,10 +116,10 @@ void chameleon_pzungqr_param( int genD, int K, &options, ChamLeft, ChamNoTrans, tempmm, tempnn, tempkn, L, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - Q(p, n), ldqp, - Q(m, n), ldqm); + A(m, k), + T(m, k), + Q(p, n), + Q(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -138,23 +134,20 @@ void chameleon_pzungqr_param( int genD, int K, tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkmin = chameleon_min(tempmm, tempkn); - ldam = BLKLDD(A, m); - lddm = BLKLDD(D, m); - ldqm = BLKLDD(Q, m); if ( genD ) { int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; INSERT_TASK_zlacpy( &options, ChamLower, tempDmm, tempkmin, A->nb, - A(m, k), ldam, - D(m, k), lddm ); + A(m, k), + D(m, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDmm, tempkmin, 0., 1., - D(m, k), lddm ); + D(m, k) ); #endif } @@ -169,9 +162,9 @@ void chameleon_pzungqr_param( int genD, int K, &options, ChamLeft, ChamNoTrans, tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), lddm, - T(m, k), T->mb, - Q(m, n), ldqm); + D(m, k), + T(m, k), + Q(m, n)); } RUNTIME_data_flush( sequence, D(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index 8310ae1aee8d6cc85d8c6d8eb8940c5f3932e966..3e964790791a3d04b83bc93f3d90f4da5c6981b8 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -48,8 +48,6 @@ void chameleon_pzungqrrh( int genD, int BS, int k, m, n; int K, M, RD, lastRD; - int ldaM, ldam, ldaMRD, lddM; - int ldqM, ldqm, ldqMRD; int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin; int ib, node; @@ -97,9 +95,6 @@ void chameleon_pzungqrrh( int genD, int BS, for (RD = lastRD; RD >= BS; RD /= 2) { for (M = k; M+RD < A->mt; M += 2*RD) { tempMRDm = M+RD == A->mt-1 ? 
A->m-(M+RD)*A->mb : A->mb; - ldqM = BLKLDD(Q, M ); - ldqMRD = BLKLDD(Q, M+RD); - ldaMRD = BLKLDD(A, M+RD); for (n = k; n < Q->nt; n++) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; @@ -112,10 +107,10 @@ void chameleon_pzungqrrh( int genD, int BS, &options, ChamLeft, ChamNoTrans, tempMRDm, tempnn, tempkn, tempMRDm, ib, T->nb, - A (M+RD, k), ldaMRD, - T2(M+RD, k), T->mb, - Q (M, n), ldqM, - Q (M+RD, n), ldqMRD); + A (M+RD, k), + T2(M+RD, k), + Q (M, n), + Q (M+RD, n)); } RUNTIME_data_flush( sequence, A (M+RD, k) ); @@ -125,13 +120,8 @@ void chameleon_pzungqrrh( int genD, int BS, for (M = k; M < A->mt; M += BS) { tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); - ldaM = BLKLDD(A, M); - lddM = BLKLDD(D, M); - ldqM = BLKLDD(Q, M); for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldqm = BLKLDD(Q, m); - ldam = BLKLDD(A, m); for (n = k; n < Q->nt; n++) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; @@ -145,10 +135,10 @@ void chameleon_pzungqrrh( int genD, int BS, &options, ChamLeft, ChamNoTrans, tempmm, tempnn, tempkn, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - Q(M, n), ldqM, - Q(m, n), ldqm); + A(m, k), + T(m, k), + Q(M, n), + Q(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -159,14 +149,14 @@ void chameleon_pzungqrrh( int genD, int BS, INSERT_TASK_zlacpy( &options, ChamLower, tempDMm, tempkmin, A->nb, - A(M, k), ldaM, - D(M, k), lddM ); + A(M, k), + D(M, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDMm, tempkmin, 0., 1., - D(M, k), lddM ); + D(M, k) ); #endif } for (n = k; n < Q->nt; n++) { @@ -181,9 +171,9 @@ void chameleon_pzungqrrh( int genD, int BS, ChamLeft, ChamNoTrans, tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), lddM, - T(M, k), T->mb, - Q(M, n), ldqM); + D(M, k), + T(M, k), + Q(M, n)); } RUNTIME_data_flush( sequence, D(M, k) ); RUNTIME_data_flush( sequence, T(M, k) ); diff 
--git a/compute/pzunmlq.c b/compute/pzunmlq.c index ac2cb0aa76116c1cf70f6cd3efaa0f9621ff5b9b..0b3f109fc611659817339c8a3723fdf34f132b0b 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -44,7 +44,6 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, size_t ws_host = 0; int k, m, n; - int ldak, ldck, ldcm, lddk; int tempkm, tempkn, tempkmin, tempmm, tempnn; int ib, KT, K; @@ -100,23 +99,20 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - ldck = BLKLDD(C, k); - lddk = BLKLDD(D, k); if ( genD ) { int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (n = 0; n < C->nt; n++) { @@ -125,9 +121,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(k, n), ldck); + D(k), + T(k, k), + C(k, n)); } RUNTIME_data_flush( sequence, D(k) ); @@ -135,7 +131,6 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, for (m = k+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -147,10 +142,10 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(k, m), ldak, - T(k, m), T->mb, - C(k, n), ldck, - C(m, n), ldcm); + A(k, m), + T(k, m), + C(k, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); @@ -176,13 +171,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkm = k == C->mt - 1 ? 
C->m - k * C->mb : C->mb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - ldck = BLKLDD(C, k); - lddk = BLKLDD(D, k); for (m = C->mt-1; m > k; m--) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -194,10 +185,10 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(k, m), ldak, - T(k, m), T->mb, - C(k, n), ldck, - C(m, n), ldcm); + A(k, m), + T(k, m), + C(k, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); @@ -209,14 +200,14 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (n = 0; n < C->nt; n++) { @@ -229,9 +220,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(k, n), ldck); + D(k), + T(k, k), + C(k, n)); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); @@ -249,14 +240,11 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); for (n = C->nt-1; n > k; n--) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, k), C->get_rankof( C, m, n ) ); @@ -266,10 +254,10 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - C(m, k), ldcm, - C(m, n), ldcm); + A(k, n), + T(k, n), + C(m, k), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); @@ -281,19 +269,18 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, k), C->get_rankof( C, m, k ) ); @@ -302,9 +289,9 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(m, k), ldcm); + D(k), + T(k, k), + C(m, k)); } RUNTIME_data_flush( sequence, D(k) ); @@ -322,8 +309,6 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); if ( genD ) { int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; @@ -331,26 +316,25 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDkn, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDkn, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); INSERT_TASK_zunmlq( &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(m, k), ldcm); + D(k), + T(k, k), + C(m, k)); } RUNTIME_data_flush( sequence, D(k) ); @@ -360,7 +344,6 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, k), C->get_rankof( C, m, n ) ); @@ -370,10 +353,10 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - C(m, k), ldcm, - C(m, n), ldcm); + A(k, n), + T(k, n), + C(m, k), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index 6e27ca3f610cbb139e0f37edc127e69097fcedab..16c1c588c7981a93e1e39948e4e8199fdcbb17e9 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -42,7 +42,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p; - int ldak, lddk, ldcp, ldcm; int temppm, temppn, tempmm, tempnn, tempkm,tempkmin; int ib, KT, L; int node, nbtiles, *tiles; @@ -93,8 +92,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt - 1 ? A->m - k * A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { @@ -103,7 +100,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; tempkmin = chameleon_min( temppm, tempkm ); - ldcp = BLKLDD(C, p); if ( genD ) { int tempDpn = p == D->nt-1 ? 
D->n-p*D->nb : D->nb; @@ -111,14 +107,14 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (n = 0; n < C->nt; n++) { @@ -126,9 +122,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zunmlq( &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(p, n), ldcp); + D(k, p), + T(k, p), + C(p, n)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -142,8 +138,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, p = qrtree->currpiv(qrtree, k, m); tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); - ldcp = BLKLDD(C, p); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -165,10 +159,10 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, chameleon_min( L, tempnn ), ib, T->nb, - A(k, m), ldak, - T(k, m), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(k, m), + T(k, m), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); @@ -191,8 +185,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); @@ -202,8 +194,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, p = qrtree->currpiv(qrtree, k, m); tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcp = BLKLDD(C, p); - ldcm = BLKLDD(C, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -225,10 +215,10 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, chameleon_min(L, tempnn), ib, T->nb, - A(k, m), ldak, - T(k, m), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(k, m), + T(k, m), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); @@ -241,7 +231,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, temppm = p == C->mt-1 ? C->m-p*C->mb : C->mb; tempkmin = chameleon_min( temppm, tempkm ); - ldcp = BLKLDD(C, p); if ( genD ) { int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; @@ -249,14 +238,14 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } @@ -269,9 +258,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zunmlq( &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(p, n), ldcp); + D(k, p), + T(k, p), + C(p, n)); } RUNTIME_data_flush( sequence, D(k, p) ); @@ -290,8 +279,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ nbtiles = libhqr_walk_stepk( qrtree, k, tiles ); @@ -315,7 +302,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -324,10 +310,10 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, L, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(k, n), + T(k, n), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -346,20 +332,19 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, p), C->get_rankof( C, m, p ) ); @@ -367,9 +352,9 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zunmlq( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(m, p), ldcm); + D(k, p), + T(k, p), + C(m, p)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -385,8 +370,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? 
A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { @@ -401,26 +384,25 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (m = 0; m < C->mt; m++) { - ldcm = BLKLDD(C, m); tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; INSERT_TASK_zunmlq( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), TS->mb, - C(m, p), ldcm); + D(k, p), + T(k, p), + C(m, p)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -448,7 +430,6 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -457,10 +438,10 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, L, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(k, n), + T(k, n), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 8f2931ccdba43f7f7d89f8f5f1b962a28567c4bf..ee64598623793ccda1776ddd2961e254f3063bd0 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -47,8 +47,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans int k, m, n, p; int KT, RD, lastRD; - int ldak, lddk; - int ldcp, ldcm; int temppm, temppn, tempkm, tempnn, tempmm, tempkmin; int ib, node; @@ -96,15 +94,12 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t 
side, cham_trans_t trans tempkm = k == A->mt - 1 ? A->m - k * A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); for (p = k; p < C->mt; p += BS) { temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; tempkmin = chameleon_min( temppm, tempkm ); - ldcp = BLKLDD(C, p); if ( genD ) { int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb; @@ -112,14 +107,14 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (n = 0; n < C->nt; n++) { @@ -128,16 +123,15 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(p, n), ldcp); + D(k, p), + T(k, p), + C(p, n)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); for (m = p+1; m < chameleon_min(p+BS, C->mt); m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -150,10 +144,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, - A(k, m), ldak, - T(k, m), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(k, m), + T(k, m), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); @@ -164,8 +158,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans m = p+RD; tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); - ldcp = BLKLDD(C, p); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -179,10 +171,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, tempmm, tempnn, tempkm, tempnn, ib, T->nb, - A (k, m), ldak, - T2(k, m), T->mb, - C (p, n), ldcp, - C (m, n), ldcm); + A (k, m), + T2(k, m), + C (p, n), + C (m, n)); } RUNTIME_data_flush( sequence, A (k, m) ); RUNTIME_data_flush( sequence, T2(k, m) ); @@ -206,8 +198,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); lastRD = 0; for (RD = BS; RD < C->mt-k; RD *= 2) @@ -217,8 +207,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans m = p+RD; tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); - ldcp = BLKLDD(C, p); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -231,21 +219,19 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, tempnn, ib, T->nb, - A (k, m), ldak, - T2(k, m), T->mb, - C (p, n), ldcp, - C (m, n), ldcm); + A (k, m), + T2(k, m), + C (p, n), + C (m, n)); } RUNTIME_data_flush( sequence, A (k, m) ); RUNTIME_data_flush( sequence, T2(k, m) ); } } for (p = k; p < C->mt; p += BS) { - ldcp = BLKLDD(C, p); for (m = chameleon_min(p+BS, C->mt)-1; m > p; m--) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -258,10 +244,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, - A(k, m), ldak, - T(k, m), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(k, m), + T(k, m), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, m) ); RUNTIME_data_flush( sequence, T(k, m) ); @@ -276,14 +262,14 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } @@ -296,9 +282,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zunmlq( &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(p, n), ldcp); + D(k, p), + T(k, p), + C(p, n)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -316,8 +302,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); lastRD = 0; for (RD = BS; RD < C->nt-k; RD *= 2) lastRD = RD; @@ -329,7 +313,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -339,10 +322,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, tempnn, ib, T->nb, - A (k, n), ldak, - T2(k, n), T->mb, - C (m, p), ldcm, - C (m, n), ldcm); + A (k, n), + T2(k, n), + C (m, p), + C (m, n)); } RUNTIME_data_flush( sequence, A (k, n) ); RUNTIME_data_flush( sequence, T2(k, n) ); @@ -356,7 +339,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -366,10 +348,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(k, n), + T(k, n), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -384,20 +366,19 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, p), C->get_rankof( C, m, p ) ); @@ -405,9 +386,9 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zunmlq( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(m, p), ldcm); + D(k, p), + T(k, p), + C(m, p)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -423,8 +404,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans RUNTIME_iteration_push(chamctxt, k); tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); for (p = k; p < C->nt; p += BS) { temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; @@ -436,26 +415,25 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempDpn, A->nb, - A(k, p), ldak, - D(k, p), lddk ); + A(k, p), + D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempDpn, 0., 1., - D(k, p), lddk ); + D(k, p) ); #endif } for (m = 0; m < C->mt; m++) { - ldcm = BLKLDD(C, m); tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; INSERT_TASK_zunmlq( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), lddk, - T(k, p), T->mb, - C(m, p), ldcm); + D(k, p), + T(k, p), + C(m, p)); } RUNTIME_data_flush( sequence, D(k, p) ); RUNTIME_data_flush( sequence, T(k, p) ); @@ -464,7 +442,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -474,10 +451,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmlqt( &options, side, trans, tempmm, tempnn, tempkm, 0, ib, T->nb, - A(k, n), ldak, - T(k, n), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(k, n), + T(k, n), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(k, n) ); RUNTIME_data_flush( sequence, T(k, n) ); @@ -490,7 +467,6 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -501,10 +477,10 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, tempmm, tempnn, tempkm, tempnn, ib, T->nb, - A (k, n), ldak, - T2(k, n), T->mb, - C (m, p), ldcm, - C (m, n), ldcm); + A (k, n), + T2(k, n), + C (m, p), + C (m, n)); } RUNTIME_data_flush( sequence, A (k, n) ); RUNTIME_data_flush( sequence, T2(k, n) ); diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index 3a9e93bbee000aab330e562ee82bb6befd44d5db..333b81b17c3feeae562c8e2c474294a9957c7e84 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -44,7 +44,6 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, size_t ws_host = 0; int k, m, n; - int ldak, ldck, ldam, ldan, ldcm, lddk; int tempkm, tempkn, tempkmin, tempmm, tempnn; int ib, KT, K; @@ -100,9 +99,6 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - ldck = BLKLDD(C, k); - lddk = BLKLDD(D, k); if ( genD ) { int tempDkm = k == D->mt-1 ? 
D->m-k*D->mb : D->mb; @@ -110,14 +106,14 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamLower, tempDkm, tempkmin, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDkm, tempkmin, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (n = 0; n < C->nt; n++) { @@ -126,9 +122,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(k, n), ldck); + D(k), + T(k, k), + C(k, n)); } RUNTIME_data_flush( sequence, D(k) ); @@ -136,8 +132,6 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, for (m = k+1; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -149,10 +143,10 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - C(k, n), ldck, - C(m, n), ldcm); + A(m, k), + T(m, k), + C(k, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); @@ -178,14 +172,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkm = k == C->mt - 1 ? C->m - k * C->mb : C->mb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - ldck = BLKLDD(C, k); - lddk = BLKLDD(D, k); for (m = C->mt-1; m > k; m--) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -197,10 +186,10 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - C(k, n), ldck, - C(m, n), ldcm); + A(m, k), + T(m, k), + C(k, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -212,14 +201,14 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamLower, tempDkm, tempkmin, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDkm, tempkmin, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (n = 0; n < C->nt; n++) { @@ -232,9 +221,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(k, n), ldck); + D(k), + T(k, k), + C(k, n)); } RUNTIME_data_flush( sequence, D(k) ); RUNTIME_data_flush( sequence, T(k, k) ); @@ -252,15 +241,11 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); for (n = C->nt-1; n > k; n--) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, k), C->get_rankof( C, m, n ) ); @@ -270,10 +255,10 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(n, k), ldan, - T(n, k), T->mb, - C(m, k), ldcm, - C(m, n), ldcm); + A(n, k), + T(n, k), + C(m, k), + C(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); @@ -286,19 +271,18 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamLower, tempDkm, tempkmin, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDkm, tempkmin, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, k), C->get_rankof( C, m, k ) ); @@ -307,9 +291,9 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(m, k), ldcm); + D(k), + T(k, k), + C(m, k)); } RUNTIME_data_flush( sequence, D(k) ); @@ -327,8 +311,6 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == C->nt - 1 ? C->n - k * C->nb : C->nb; tempkmin = k == KT - 1 ? K - k * A->nb : A->nb; - ldak = BLKLDD(A, k); - lddk = BLKLDD(D, k); if ( genD ) { int tempDkm = k == D->mt - 1 ? D->m - k * D->mb : D->mb; @@ -336,26 +318,25 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, INSERT_TASK_zlacpy( &options, ChamLower, tempDkm, tempkmin, A->nb, - A(k, k), ldak, - D(k), lddk ); + A(k, k), + D(k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDkm, tempkmin, 0., 1., - D(k), lddk ); + D(k) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); INSERT_TASK_zunmqr( &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), lddk, - T(k, k), T->mb, - C(m, k), ldcm); + D(k), + T(k, k), + C(m, k)); } RUNTIME_data_flush( sequence, D(k) ); @@ -363,10 +344,8 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, for (n = k+1; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, k), C->get_rankof( C, m, n ) ); @@ -376,10 +355,10 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempnn, tempkmin, 0, ib, T->nb, - A(n, k), ldan, - T(n, k), T->mb, - C(m, k), ldcm, - C(m, n), ldcm); + A(n, k), + T(n, k), + C(m, k), + C(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index a3905647c0767cbaad4f4a8be98e1397b6996924..8f95b49a6f4dcd053274965e764e91712aa4d27d 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -42,7 +42,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p; - int ldap, ldam, ldan, lddp, ldcp, ldcm; int temppm, temppn, tempmm, tempnn, tempkn,tempkmin; int ib, KT, L; int node, nbtiles, *tiles; @@ -101,9 +100,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; tempkmin = chameleon_min( temppm, tempkn ); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); - ldcp = BLKLDD(C, p); if ( genD ) { int tempDpm = p == D->mt-1 ? 
D->m-p*D->mb : D->mb; @@ -111,14 +107,14 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } for (n = 0; n < C->nt; n++) { @@ -126,9 +122,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zunmqr( &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(p, n), ldcp); + D(p, k), + T(p, k), + C(p, n)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); @@ -142,9 +138,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, p = qrtree->currpiv(qrtree, k, m); tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcp = BLKLDD(C, p); - ldcm = BLKLDD(C, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -166,10 +159,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, L, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(m, k), + T(m, k), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -201,9 +194,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, p = qrtree->currpiv(qrtree, k, m); tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcp = BLKLDD(C, p); - ldcm = BLKLDD(C, m); if( qrtree->gettype(qrtree, k, m) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -225,10 +215,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, L, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(m, k), + T(m, k), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -241,9 +231,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, temppm = p == C->mt-1 ? C->m-p*C->mb : C->mb; tempkmin = chameleon_min( temppm, tempkn ); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); - ldcp = BLKLDD(C, p); if ( genD ) { int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; @@ -251,14 +238,14 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } @@ -271,9 +258,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zunmqr( &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(p, n), ldcp); + D(p, k), + T(p, k), + C(p, n)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); @@ -300,7 +287,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, p = qrtree->currpiv(qrtree, k, n); tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -315,7 +301,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -324,10 +309,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, chameleon_min( L, tempmm ), ib, T->nb, - A(n, k), ldan, - T(n, k), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(n, k), + T(n, k), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); @@ -339,8 +324,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, temppn = p == C->nt-1 ? C->n - p * C->nb : C->nb; tempkmin = chameleon_min(temppn, tempkn); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); if ( genD ) { int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; @@ -348,20 +331,19 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, p), C->get_rankof( C, m, p ) ); @@ -369,9 +351,9 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zunmqr( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(m, p), ldcm); + D(p, k), + T(p, k), + C(m, p)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); @@ -394,8 +376,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; tempkmin = chameleon_min( temppn, tempkn ); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); if ( genD ) { int tempDpm = p == D->mt-1 ? 
D->m-p*D->mb : D->mb; @@ -403,26 +383,25 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } for (m = 0; m < C->mt; m++) { - ldcm = BLKLDD(C, m); tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; INSERT_TASK_zunmqr( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(m, p), ldcm); + D(p, k), + T(p, k), + C(m, p)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); @@ -436,7 +415,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, p = qrtree->currpiv(qrtree, k, n); tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); if( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ @@ -451,7 +429,6 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -460,10 +437,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, chameleon_min( L, tempmm ), ib, T->nb, - A(n, k), ldan, - T(n, k), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(n, k), + T(n, k), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 1d4500f849c3072d7bbf9e54e0f25c72363a5fbd..9b34176c82c0684c136a757591cc6cb9b04ff0c4 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -47,9 +47,6 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans int k, m, n, p; int KT, RD, lastRD; - int ldap, ldam, ldan; - int ldcp, ldcm; - int lddp; int temppm, temppn, tempkn, tempnn, tempmm, tempkmin; int ib, node; @@ -102,9 +99,6 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans temppm = p == C->mt-1 ? C->m - p * C->mb : C->mb; tempkmin = chameleon_min( temppm, tempkn ); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); - ldcp = BLKLDD(C, p); if ( genD ) { int tempDpm = p == D->mt-1 ? 
D->m-p*D->mb : D->mb; @@ -112,14 +106,14 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } for (n = 0; n < C->nt; n++) { @@ -128,17 +122,15 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(p, n), ldcp); + D(p, k), + T(p, k), + C(p, n)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); for (m = p+1; m < chameleon_min(p+BS, C->mt); m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -151,10 +143,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(m, k), + T(m, k), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -165,9 +157,6 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans m = p+RD; tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); - ldcp = BLKLDD(C, p); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -180,10 +169,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, tempmm, ib, T->nb, - A (m, k), ldam, - T2(m, k), T->mb, - C (p, n), ldcp, - C (m, n), ldcm); + A (m, k), + T2(m, k), + C (p, n), + C (m, n)); } RUNTIME_data_flush( sequence, A (m, k) ); RUNTIME_data_flush( sequence, T2(m, k) ); @@ -215,9 +204,6 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans m = p+RD; tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); - ldcp = BLKLDD(C, p); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; @@ -230,24 +216,19 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, tempmm, ib, T->nb, - A (m, k), ldam, - T2(m, k), T->mb, - C (p, n), ldcp, - C (m, n), ldcm); + A (m, k), + T2(m, k), + C (p, n), + C (m, n)); } RUNTIME_data_flush( sequence, A (m, k) ); RUNTIME_data_flush( sequence, T2(m, k) ); } } for (p = k; p < C->mt; p += BS) { - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); - ldcp = BLKLDD(C, p); for (m = chameleon_min(p+BS, C->mt)-1; m > p; m--) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); for (n = 0; n < C->nt; n++) { tempnn = n == C->nt-1 ? 
C->n-n*C->nb : C->nb; @@ -260,10 +241,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, 0, ib, T->nb, - A(m, k), ldam, - T(m, k), T->mb, - C(p, n), ldcp, - C(m, n), ldcm); + A(m, k), + T(m, k), + C(p, n), + C(m, n)); } RUNTIME_data_flush( sequence, A(m, k) ); RUNTIME_data_flush( sequence, T(m, k) ); @@ -278,14 +259,14 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } @@ -298,9 +279,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zunmqr( &options, side, trans, temppm, tempnn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(p, n), ldcp); + D(p, k), + T(p, k), + C(p, n)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); @@ -327,11 +308,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans n = p+RD; tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -341,10 +320,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, tempmm, ib, T->nb, - A (n, k), ldan, - T2(n, k), T->mb, - C (m, p), ldcm, - C (m, n), ldcm); + A (n, k), + T2(n, k), + C (m, p), + C (m, n)); } RUNTIME_data_flush( sequence, A (n, k) ); RUNTIME_data_flush( sequence, T2(n, k) ); @@ -355,11 +334,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (n = chameleon_min(p+BS, C->nt)-1; n > p; n--) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -369,10 +346,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, 0, ib, T->nb, - A(n, k), ldan, - T(n, k), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(n, k), + T(n, k), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); @@ -380,8 +357,6 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans temppn = p == C->nt-1 ? C->n - p * C->nb : C->nb; tempkmin = chameleon_min( temppn, tempkn ); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); if ( genD ) { int tempDpm = p == D->mt-1 ? 
D->m-p*D->mb : D->mb; @@ -389,20 +364,19 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); RUNTIME_data_migrate( sequence, C(m, p), C->get_rankof( C, m, p ) ); @@ -410,9 +384,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zunmqr( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(m, p), ldcm); + D(p, k), + T(p, k), + C(m, p)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); @@ -433,8 +407,6 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans temppn = p == C->nt - 1 ? C->n - p * C->nb : C->nb; tempkmin = chameleon_min( temppn, tempkn ); - ldap = BLKLDD(A, p); - lddp = BLKLDD(D, p); if ( genD ) { int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; @@ -442,36 +414,33 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zlacpy( &options, ChamLower, tempDpm, tempkmin, A->nb, - A(p, k), ldap, - D(p, k), lddp ); + A(p, k), + D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempDpm, tempkmin, 0., 1., - D(p, k), lddp ); + D(p, k) ); #endif } for (m = 0; m < C->mt; m++) { - ldcm = BLKLDD(C, m); tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; INSERT_TASK_zunmqr( &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(p, k), lddp, - T(p, k), T->mb, - C(m, p), ldcm); + D(p, k), + T(p, k), + C(m, p)); } RUNTIME_data_flush( sequence, D(p, k) ); RUNTIME_data_flush( sequence, T(p, k) ); for (n = p+1; n < chameleon_min(p+BS, C->nt); n++) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -481,10 +450,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, 0, ib, T->nb, - A(n, k), ldan, - T(n, k), T->mb, - C(m, p), ldcm, - C(m, n), ldcm); + A(n, k), + T(n, k), + C(m, p), + C(m, n)); } RUNTIME_data_flush( sequence, A(n, k) ); RUNTIME_data_flush( sequence, T(n, k) ); @@ -494,11 +463,9 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans for (p = k; p+RD < C->nt; p += 2*RD) { n = p + RD; tempnn = n == C->mt-1 ? C->m-n*C->mb : C->mb; - ldan = BLKLDD(A, n); for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? 
C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); node = C->get_rankof( C, m, n ); RUNTIME_data_migrate( sequence, C(m, p), node ); @@ -508,10 +475,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_ztpmqrt( &options, side, trans, tempmm, tempnn, tempkn, tempmm, ib, T->nb, - A (n, k), ldan, - T2(n, k), T->mb, - C (m, p), ldcm, - C (m, n), ldcm); + A (n, k), + T2(n, k), + C (m, p), + C (m, n)); } RUNTIME_data_flush( sequence, A (n, k) ); RUNTIME_data_flush( sequence, T2(n, k) ); diff --git a/control/descriptor.c b/control/descriptor.c index 2f739427740ad0abdd80887f86677dfe155b533d..52584a44646b9fd1305876708626cc7423b8d9f4 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -65,9 +65,32 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc ) desc->mat = NULL; } + if ( desc->tiles ) { + free( desc->tiles ); + } return CHAMELEON_SUCCESS; } +void chameleon_desc_init_tiles( CHAM_desc_t *desc ) +{ + CHAM_tile_t *tile; + int ii, jj; + + desc->tiles = malloc( desc->lmt * desc->lnt * sizeof(CHAM_tile_t) ); + + tile = desc->tiles; + for( jj=0; jj<desc->lnt; jj++ ) { + for( ii=0; ii<desc->lmt; ii++, tile++ ) { + int rank = desc->get_rankof( desc, ii, jj ); + tile->format = CHAMELEON_TILE_FULLRANK; + tile->m = ii == desc->lmt-1 ? desc->lm - ii * desc->mb : desc->mb; + tile->n = jj == desc->lnt-1 ? desc->ln - jj * desc->nb : desc->nb; + tile->mat = (rank == desc->myrank) ? desc->get_blkaddr( desc, ii, jj ) : NULL; + tile->ld = desc->get_blkldd( desc, ii ); + } + } +} + /** * Internal function to return MPI rank of element A(m,n) with m,n = block indices */ @@ -174,6 +197,7 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat, } // If one of the function get_* is NULL, we switch back to the default, like in chameleon_desc_init() + desc->get_blktile = chameleon_desc_gettile; desc->get_blkaddr = get_blkaddr ? get_blkaddr : chameleon_getaddr_ccrb; desc->get_blkldd = get_blkldd ? 
get_blkldd : chameleon_getblkldd_ccrb; desc->get_rankof = get_rankof ? get_rankof : chameleon_getrankof_2d; @@ -282,6 +306,8 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat, desc->A12 = (size_t)( desc->llm%mb)*(size_t)(desc->lln - desc->lln%nb) + desc->A21; desc->A22 = (size_t)(desc->llm - desc->llm%mb)*(size_t)( desc->lln%nb) + desc->A12; + chameleon_desc_init_tiles( desc ); + /* Create runtime specific structure like registering data */ RUNTIME_desc_create( desc ); diff --git a/control/descriptor.h b/control/descriptor.h index 6eff677110b0e4269a5366cc7ffcf24269050547..b4d8832d0e376a34c3384d71ab868b97f1b27bfc 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -131,6 +131,21 @@ inline static void *chameleon_getaddr_null(const CHAM_desc_t *A, int m, int n) return NULL; } +/** + * Internal function to return address of block (m,n) with m,n = block indices + */ +inline static CHAM_tile_t *chameleon_desc_gettile(const CHAM_desc_t *A, int m, int n) +{ + size_t mm = m + A->i / A->mb; + size_t nn = n + A->j / A->nb; + size_t offset = 0; + + assert( A->tiles != NULL ); + + offset = A->lmt * nn + mm; + return A->tiles + offset; +} + /** * Internal function to return address of element A(m,n) with m,n = matrix indices */ diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 283659eed45ee323aed7c10ff5479c257c442dc9..e85b36fa5ed57637225dedfb7f47c0c3518dff19 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -98,6 +98,7 @@ set(ZSRC core_zttqrt.c core_zunmlq.c core_zunmqr.c + core_ztile.c ) precisions_rules_py(COREBLAS_SRCS_GENERATED "${ZSRC}" diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c new file mode 100644 index 0000000000000000000000000000000000000000..def292605dddc3090255e8f1dd559f044ddd3bfc --- /dev/null +++ b/coreblas/compute/core_ztile.c @@ -0,0 +1,838 @@ +/** + * + * @file core_ztile.c + * + * @copyright 2009-2014 The University of Tennessee and The 
University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + * @brief Chameleon CPU kernel interface from CHAM_tile_t layout to the real one. + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2015-11-03 + * @precisions normal z -> c d s + * + */ +#include "coreblas.h" +#include "coreblas/coreblas_ztile.h" + +void +TCORE_dzasum( cham_store_t storev, + cham_uplo_t uplo, + int M, + int N, + const CHAM_tile_t *A, + double * work ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_dzasum( storev, uplo, M, N, A->mat, A->ld, work ); +} + +int +TCORE_zaxpy( int M, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + int incA, + CHAM_tile_t * B, + int incB ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zaxpy( M, alpha, A->mat, incA, B->mat, incB ); +} + +int +TCORE_zgeadd( cham_trans_t trans, + int M, + int N, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgeadd( trans, M, N, alpha, A->mat, A->ld, beta, B->mat, B->ld ); +} + +int +TCORE_zgelqt( int M, + int N, + int IB, + CHAM_tile_t * A, + CHAM_tile_t * T, + CHAMELEON_Complex64_t *TAU, + CHAMELEON_Complex64_t *WORK ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgelqt( M, N, IB, A->mat, A->ld, T->mat, T->ld, TAU, WORK ); +} + +void +TCORE_zgemm( cham_trans_t transA, + cham_trans_t transB, + int M, + int N, + int K, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + const CHAM_tile_t * B, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & 
CHAMELEON_TILE_FULLRANK ); + CORE_zgemm( + transA, transB, M, N, K, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); +} + +int +TCORE_zgeqrt( int M, + int N, + int IB, + CHAM_tile_t * A, + CHAM_tile_t * T, + CHAMELEON_Complex64_t *TAU, + CHAMELEON_Complex64_t *WORK ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgeqrt( M, N, IB, A->mat, A->ld, T->mat, T->ld, TAU, WORK ); +} + +int +TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L, CHAM_tile_t *A ) +{ + assert( L->format & CHAMELEON_TILE_FULLRANK ); + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgessm( M, N, K, IB, IPIV, L->mat, L->ld, A->mat, A->ld ); +} + +int +TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgessq( storev, M, N, A->mat, A->ld, sclssq->mat ); +} + +int +TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgetrf( M, N, A->mat, A->ld, IPIV, INFO ); +} + +int +TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgetrf_incpiv( M, N, IB, A->mat, A->ld, IPIV, INFO ); +} + +int +TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgetrf_nopiv( M, N, IB, A->mat, A->ld, INFO ); +} + +void +TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + CORE_zhe2ge( uplo, M, N, A->mat, A->ld, B->mat, B->ld ); +} + +#if defined( PRECISION_z ) || defined( PRECISION_c ) +void +TCORE_zhemm( cham_side_t side, + cham_uplo_t uplo, + int M, + int 
N, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + const CHAM_tile_t * B, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + CORE_zhemm( side, uplo, M, N, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); +} + +void +TCORE_zherk( cham_uplo_t uplo, + cham_trans_t trans, + int N, + int K, + double alpha, + const CHAM_tile_t *A, + double beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + CORE_zherk( uplo, trans, N, K, alpha, A->mat, A->ld, beta, C->mat, C->ld ); +} + +void +TCORE_zher2k( cham_uplo_t uplo, + cham_trans_t trans, + int N, + int K, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + const CHAM_tile_t * B, + double beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + CORE_zher2k( uplo, trans, N, K, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); +} +#endif + +int +TCORE_zherfb( cham_uplo_t uplo, + int N, + int K, + int IB, + int NB, + const CHAM_tile_t * A, + const CHAM_tile_t * T, + CHAM_tile_t * C, + CHAMELEON_Complex64_t *WORK, + int ldwork ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zherfb( + uplo, N, K, IB, NB, A->mat, A->ld, T->mat, T->ld, C->mat, C->ld, WORK, ldwork ); +} + +#if defined( PRECISION_z ) || defined( PRECISION_c ) +int +TCORE_zhessq( cham_store_t storev, + cham_uplo_t uplo, + int N, + const CHAM_tile_t *A, + CHAM_tile_t * sclssq ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zhessq( storev, uplo, N, A->mat, A->ld, sclssq->mat ); +} +#endif + +void 
+TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlacpy( uplo, M, N, A->mat, A->ld, B->mat, B->ld ); +} + +void +TCORE_zlange( cham_normtype_t norm, + int M, + int N, + const CHAM_tile_t *A, + double * work, + double * normA ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlange( norm, M, N, A->mat, A->ld, work, normA ); +} + +#if defined( PRECISION_z ) || defined( PRECISION_c ) +void +TCORE_zlanhe( cham_normtype_t norm, + cham_uplo_t uplo, + int N, + const CHAM_tile_t *A, + double * work, + double * normA ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlanhe( norm, uplo, N, A->mat, A->ld, work, normA ); +} +#endif + +void +TCORE_zlansy( cham_normtype_t norm, + cham_uplo_t uplo, + int N, + const CHAM_tile_t *A, + double * work, + double * normA ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlansy( norm, uplo, N, A->mat, A->ld, work, normA ); +} + +void +TCORE_zlantr( cham_normtype_t norm, + cham_uplo_t uplo, + cham_diag_t diag, + int M, + int N, + const CHAM_tile_t *A, + double * work, + double * normA ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlantr( norm, uplo, diag, M, N, A->mat, A->ld, work, normA ); +} + +int +TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zlascal( uplo, m, n, alpha, A->mat, A->ld ); +} + +void +TCORE_zlaset( cham_uplo_t uplo, + int n1, + int n2, + CHAMELEON_Complex64_t alpha, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * A ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlaset( uplo, n1, n2, alpha, beta, A->mat, A->ld ); +} + +void +TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlaset2( uplo, n1, n2, alpha, 
A->mat, A->ld ); +} + +int +TCORE_zlatro( cham_uplo_t uplo, + cham_trans_t trans, + int M, + int N, + const CHAM_tile_t *A, + CHAM_tile_t * B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zlatro( uplo, trans, M, N, A->mat, A->ld, B->mat, B->ld ); +} + +void +TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zlauum( uplo, N, A->mat, A->ld ); +} + +#if defined( PRECISION_z ) || defined( PRECISION_c ) +void +TCORE_zplghe( double bump, + int m, + int n, + CHAM_tile_t * tileA, + int bigM, + int m0, + int n0, + unsigned long long int seed ) +{ + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + CORE_zplghe( bump, m, n, tileA->mat, tileA->ld, bigM, m0, n0, seed ); +} +#endif + +void +TCORE_zplgsy( CHAMELEON_Complex64_t bump, + int m, + int n, + CHAM_tile_t * tileA, + int bigM, + int m0, + int n0, + unsigned long long int seed ) +{ + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + CORE_zplgsy( bump, m, n, tileA->mat, tileA->ld, bigM, m0, n0, seed ); +} + +void +TCORE_zplrnt( int m, + int n, + CHAM_tile_t * tileA, + int bigM, + int m0, + int n0, + unsigned long long int seed ) +{ + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + CORE_zplrnt( m, n, tileA->mat, tileA->ld, bigM, m0, n0, seed ); +} + +void +TCORE_zpotrf( cham_uplo_t uplo, int n, CHAM_tile_t *A, int *INFO ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_zpotrf( uplo, n, A->mat, A->ld, INFO ); +} + +int +TCORE_zssssm( int M1, + int N1, + int M2, + int N2, + int K, + int IB, + CHAM_tile_t * A1, + CHAM_tile_t * A2, + const CHAM_tile_t *L1, + const CHAM_tile_t *L2, + const int * IPIV ) +{ + assert( A1->format & CHAMELEON_TILE_FULLRANK ); + assert( A2->format & CHAMELEON_TILE_FULLRANK ); + assert( L1->format & CHAMELEON_TILE_FULLRANK ); + assert( L2->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zssssm( M1, + N1, + M2, + N2, + K, + IB, + A1->mat, + 
A1->ld, + A2->mat, + A2->ld, + L1->mat, + L1->ld, + L2->mat, + L2->ld, + IPIV ); +} + +void +TCORE_zsymm( cham_side_t side, + cham_uplo_t uplo, + int M, + int N, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + const CHAM_tile_t * B, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + CORE_zsymm( side, uplo, M, N, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); +} + +void +TCORE_zsyrk( cham_uplo_t uplo, + cham_trans_t trans, + int N, + int K, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + CORE_zsyrk( uplo, trans, N, K, alpha, A->mat, A->ld, beta, C->mat, C->ld ); +} + +void +TCORE_zsyr2k( cham_uplo_t uplo, + cham_trans_t trans, + int N, + int K, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + const CHAM_tile_t * B, + CHAMELEON_Complex64_t beta, + CHAM_tile_t * C ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + CORE_zsyr2k( uplo, trans, N, K, alpha, A->mat, A->ld, B->mat, B->ld, beta, C->mat, C->ld ); +} + +int +TCORE_zsyssq( cham_store_t storev, + cham_uplo_t uplo, + int N, + const CHAM_tile_t *A, + CHAM_tile_t * sclssq ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zsyssq( storev, uplo, N, A->mat, A->ld, sclssq->mat ); +} + +#if defined( PRECISION_z ) || defined( PRECISION_c ) +int +TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zsytf2_nopiv( uplo, n, A->mat, A->ld ); +} +#endif + +int +TCORE_ztplqt( int M, + int N, + int L, + int IB, + CHAM_tile_t * A, + CHAM_tile_t * B, 
+ CHAM_tile_t * T, + CHAMELEON_Complex64_t *WORK ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztplqt( M, N, L, IB, A->mat, A->ld, B->mat, B->ld, T->mat, T->ld, WORK ); +} + +int +TCORE_ztpmlqt( cham_side_t side, + cham_trans_t trans, + int M, + int N, + int K, + int L, + int IB, + const CHAM_tile_t * V, + const CHAM_tile_t * T, + CHAM_tile_t * A, + CHAM_tile_t * B, + CHAMELEON_Complex64_t *WORK ) +{ + assert( V->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztpmlqt( side, + trans, + M, + N, + K, + L, + IB, + V->mat, + V->ld, + T->mat, + T->ld, + A->mat, + A->ld, + B->mat, + B->ld, + WORK ); +} + +int +TCORE_ztpmqrt( cham_side_t side, + cham_trans_t trans, + int M, + int N, + int K, + int L, + int IB, + const CHAM_tile_t * V, + const CHAM_tile_t * T, + CHAM_tile_t * A, + CHAM_tile_t * B, + CHAMELEON_Complex64_t *WORK ) +{ + assert( V->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztpmqrt( side, + trans, + M, + N, + K, + L, + IB, + V->mat, + V->ld, + T->mat, + T->ld, + A->mat, + A->ld, + B->mat, + B->ld, + WORK ); +} + +int +TCORE_ztpqrt( int M, + int N, + int L, + int IB, + CHAM_tile_t * A, + CHAM_tile_t * B, + CHAM_tile_t * T, + CHAMELEON_Complex64_t *WORK ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztpqrt( M, N, L, IB, A->mat, A->ld, B->mat, B->ld, T->mat, T->ld, WORK ); +} + +int +TCORE_ztradd( cham_uplo_t uplo, + cham_trans_t trans, + int M, + int N, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + 
CHAMELEON_Complex64_t beta, + CHAM_tile_t * B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztradd( uplo, trans, M, N, alpha, A->mat, A->ld, beta, B->mat, B->ld ); +} + +void +TCORE_ztrasm( cham_store_t storev, + cham_uplo_t uplo, + cham_diag_t diag, + int M, + int N, + const CHAM_tile_t *A, + double * work ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_ztrasm( storev, uplo, diag, M, N, A->mat, A->ld, work ); +} + +void +TCORE_ztrmm( cham_side_t side, + cham_uplo_t uplo, + cham_trans_t transA, + cham_diag_t diag, + int M, + int N, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + CHAM_tile_t * B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + CORE_ztrmm( side, uplo, transA, diag, M, N, alpha, A->mat, A->ld, B->mat, B->ld ); +} + +void +TCORE_ztrsm( cham_side_t side, + cham_uplo_t uplo, + cham_trans_t transA, + cham_diag_t diag, + int M, + int N, + CHAMELEON_Complex64_t alpha, + const CHAM_tile_t * A, + CHAM_tile_t * B ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( B->format & CHAMELEON_TILE_FULLRANK ); + CORE_ztrsm( side, uplo, transA, diag, M, N, alpha, A->mat, A->ld, B->mat, B->ld ); +} + +int +TCORE_ztrssq( cham_uplo_t uplo, + cham_diag_t diag, + int M, + int N, + const CHAM_tile_t *A, + CHAM_tile_t * sclssq ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( sclssq->format & CHAMELEON_TILE_FULLRANK ); + double *W = sclssq->mat; + return CORE_ztrssq( uplo, diag, M, N, A->mat, A->ld, W, W + 1 ); +} + +void +TCORE_ztrtri( cham_uplo_t uplo, cham_diag_t diag, int N, CHAM_tile_t *A, int *info ) +{ + assert( A->format & CHAMELEON_TILE_FULLRANK ); + CORE_ztrtri( uplo, diag, N, A->mat, A->ld, info ); +} + +int +TCORE_ztsmlq_hetra1( cham_side_t side, + cham_trans_t trans, + int m1, + int n1, + int m2, + int n2, + int k, + int ib, + CHAM_tile_t * A1, + CHAM_tile_t * A2, + const CHAM_tile_t * V, + 
const CHAM_tile_t * T, + CHAMELEON_Complex64_t *WORK, + int ldwork ) +{ + assert( A1->format & CHAMELEON_TILE_FULLRANK ); + assert( A2->format & CHAMELEON_TILE_FULLRANK ); + assert( V->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztsmlq_hetra1( side, + trans, + m1, + n1, + m2, + n2, + k, + ib, + A1->mat, + A1->ld, + A2->mat, + A2->ld, + V->mat, + V->ld, + T->mat, + T->ld, + WORK, + ldwork ); +} + +int +TCORE_ztsmqr_hetra1( cham_side_t side, + cham_trans_t trans, + int m1, + int n1, + int m2, + int n2, + int k, + int ib, + CHAM_tile_t * A1, + CHAM_tile_t * A2, + const CHAM_tile_t * V, + const CHAM_tile_t * T, + CHAMELEON_Complex64_t *WORK, + int ldwork ) +{ + assert( A1->format & CHAMELEON_TILE_FULLRANK ); + assert( A2->format & CHAMELEON_TILE_FULLRANK ); + assert( V->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztsmqr_hetra1( side, + trans, + m1, + n1, + m2, + n2, + k, + ib, + A1->mat, + A1->ld, + A2->mat, + A2->ld, + V->mat, + V->ld, + T->mat, + T->ld, + WORK, + ldwork ); +} + +int +TCORE_ztstrf( int M, + int N, + int IB, + int NB, + CHAM_tile_t * U, + CHAM_tile_t * A, + CHAM_tile_t * L, + int * IPIV, + CHAMELEON_Complex64_t *WORK, + int LDWORK, + int * INFO ) +{ + assert( U->format & CHAMELEON_TILE_FULLRANK ); + assert( A->format & CHAMELEON_TILE_FULLRANK ); + assert( L->format & CHAMELEON_TILE_FULLRANK ); + return CORE_ztstrf( + M, N, IB, NB, U->mat, U->ld, A->mat, A->ld, L->mat, L->ld, IPIV, WORK, LDWORK, INFO ); +} + +int +TCORE_zunmlq( cham_side_t side, + cham_trans_t trans, + int M, + int N, + int IB, + int K, + const CHAM_tile_t * V, + const CHAM_tile_t * T, + CHAM_tile_t * C, + CHAMELEON_Complex64_t *WORK, + int LDWORK ) +{ + assert( V->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zunmlq( + side, trans, M, N, IB, K, V->mat, V->ld, T->mat, T->ld, 
C->mat, C->ld, WORK, LDWORK ); +} + +int +TCORE_zunmqr( cham_side_t side, + cham_trans_t trans, + int M, + int N, + int K, + int IB, + const CHAM_tile_t * V, + const CHAM_tile_t * T, + CHAM_tile_t * C, + CHAMELEON_Complex64_t *WORK, + int LDWORK ) +{ + assert( V->format & CHAMELEON_TILE_FULLRANK ); + assert( T->format & CHAMELEON_TILE_FULLRANK ); + assert( C->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zunmqr( + side, trans, M, N, K, IB, V->mat, V->ld, T->mat, T->ld, C->mat, C->ld, WORK, LDWORK ); +} + +int +TCORE_zgram( cham_uplo_t uplo, + int M, + int N, + int Mt, + int Nt, + const CHAM_tile_t *Di, + const CHAM_tile_t *Dj, + const CHAM_tile_t *D, + CHAM_tile_t * A ) +{ + assert( Di->format & CHAMELEON_TILE_FULLRANK ); + assert( Dj->format & CHAMELEON_TILE_FULLRANK ); + assert( D->format & CHAMELEON_TILE_FULLRANK ); + assert( A->format & CHAMELEON_TILE_FULLRANK ); + return CORE_zgram( + uplo, M, N, Mt, Nt, Di->mat, Di->ld, Dj->mat, Dj->ld, D->mat, A->mat, A->ld ); +} diff --git a/coreblas/include/CMakeLists.txt b/coreblas/include/CMakeLists.txt index 9403541a4874e5e84ff365c73a150cc843f2f6da..3d77e2b03817ccb0bd7833d70757fc4bf70692e7 100644 --- a/coreblas/include/CMakeLists.txt +++ b/coreblas/include/CMakeLists.txt @@ -31,6 +31,7 @@ set(COREBLAS_HDRS_GENERATED "") set(ZHDR coreblas/coreblas_z.h coreblas/coreblas_zc.h + coreblas/coreblas_ztile.h ) precisions_rules_py( COREBLAS_HDRS_GENERATED "${ZHDR}" diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h new file mode 100644 index 0000000000000000000000000000000000000000..f26e3cbeee1251739b94f5cca6f756e8dd78e8be --- /dev/null +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -0,0 +1,81 @@ +/** + * + * @file coreblas_ztile.h + * + * @copyright 2019-2019 Bordeaux INP, CNRS (LaBRI UMR 5800 ), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + * @brief Chameleon CPU kernel CHAM_tile_t interface + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2019-08-01 + * @precisions normal z -> c d s + * + */ +#ifndef _coreblas_ztile_h_ +#define _coreblas_ztile_h_ + +void TCORE_dzasum( cham_store_t storev, cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, double *work ); +int TCORE_zaxpy( int M, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, int incA, CHAM_tile_t *B, int incB ); +int TCORE_zgeadd( cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAMELEON_Complex64_t beta, CHAM_tile_t *B ); +int TCORE_zgelqt( int M, int N, int IB, CHAM_tile_t *A, CHAM_tile_t *T, CHAMELEON_Complex64_t *TAU, CHAMELEON_Complex64_t *WORK ); +void TCORE_zgemm( cham_trans_t transA, cham_trans_t transB, int M, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); +int TCORE_zgeqrt( int M, int N, int IB, CHAM_tile_t *A, CHAM_tile_t *T, CHAMELEON_Complex64_t *TAU, CHAMELEON_Complex64_t *WORK ); +int TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const CHAM_tile_t *L, CHAM_tile_t *A ); +int TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); +int TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO ); +int TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO ); +int TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO ); +void TCORE_zhe2ge( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); +#if defined(PRECISION_z ) || defined(PRECISION_c) +void TCORE_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); +void TCORE_zherk( cham_uplo_t uplo, cham_trans_t trans, int N, int K, double alpha, const CHAM_tile_t *A, double beta, CHAM_tile_t *C ); +void 
TCORE_zher2k( cham_uplo_t uplo, cham_trans_t trans, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, double beta, CHAM_tile_t *C ); +#endif +int TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_tile_t *A, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int ldwork ); +#if defined(PRECISION_z ) || defined(PRECISION_c) +int TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); +#endif +void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); +void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); +#if defined(PRECISION_z ) || defined(PRECISION_c) +void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); +#endif +void TCORE_zlansy( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); +void TCORE_zlantr( cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); +int TCORE_zlascal( cham_uplo_t uplo, int m, int n, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ); +void TCORE_zlaset( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAM_tile_t *A ); +void TCORE_zlaset2( cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, CHAM_tile_t *A ); +int TCORE_zlatro( cham_uplo_t uplo, cham_trans_t trans, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); +void TCORE_zlauum( cham_uplo_t uplo, int N, CHAM_tile_t *A ); +#if defined(PRECISION_z ) || defined(PRECISION_c) +void TCORE_zplghe( double bump, int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +#endif +void TCORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void 
TCORE_zplrnt( int m, int n, CHAM_tile_t *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_zpotrf( cham_uplo_t uplo, int n, CHAM_tile_t *A, int *INFO ); +int TCORE_zssssm( int M1, int N1, int M2, int N2, int K, int IB, CHAM_tile_t *A1, CHAM_tile_t *A2, const CHAM_tile_t *L1, const CHAM_tile_t *L2, const int *IPIV ); +void TCORE_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); +void TCORE_zsyrk( cham_uplo_t uplo, cham_trans_t trans, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); +void TCORE_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int N, int K, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, const CHAM_tile_t *B, CHAMELEON_Complex64_t beta, CHAM_tile_t *C ); +int TCORE_zsyssq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); +int TCORE_zsytf2_nopiv( cham_uplo_t uplo, int n, CHAM_tile_t *A ); +int TCORE_ztplqt( int M, int N, int L, int IB, CHAM_tile_t *A, CHAM_tile_t *B, CHAM_tile_t *T, CHAMELEON_Complex64_t *WORK ); +int TCORE_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAM_tile_t *A, CHAM_tile_t *B, CHAMELEON_Complex64_t *WORK ); +int TCORE_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAM_tile_t *A, CHAM_tile_t *B, CHAMELEON_Complex64_t *WORK ); +int TCORE_ztpqrt( int M, int N, int L, int IB, CHAM_tile_t *A, CHAM_tile_t *B, CHAM_tile_t *T, CHAMELEON_Complex64_t *WORK ); +int TCORE_ztradd( cham_uplo_t uplo, cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAMELEON_Complex64_t beta, CHAM_tile_t *B ); +void TCORE_ztrasm( cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, const 
CHAM_tile_t *A, double *work ); +void TCORE_ztrmm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAM_tile_t *B ); +void TCORE_ztrsm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int M, int N, CHAMELEON_Complex64_t alpha, const CHAM_tile_t *A, CHAM_tile_t *B ); +int TCORE_ztrssq( cham_uplo_t uplo, cham_diag_t diag, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); +void TCORE_ztrtri( cham_uplo_t uplo, cham_diag_t diag, int N, CHAM_tile_t *A, int *info ); +int TCORE_ztsmlq_hetra1( cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, CHAM_tile_t *A1, CHAM_tile_t *A2, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAMELEON_Complex64_t *WORK, int ldwork ); +int TCORE_ztsmqr_hetra1( cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, CHAM_tile_t *A1, CHAM_tile_t *A2, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAMELEON_Complex64_t *WORK, int ldwork ); +int TCORE_ztstrf( int M, int N, int IB, int NB, CHAM_tile_t *U, CHAM_tile_t *A, CHAM_tile_t *L, int *IPIV, CHAMELEON_Complex64_t *WORK, int LDWORK, int *INFO ); +int TCORE_zunmlq( cham_side_t side, cham_trans_t trans, int M, int N, int IB, int K, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int LDWORK ); +int TCORE_zunmqr( cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const CHAM_tile_t *V, const CHAM_tile_t *T, CHAM_tile_t *C, CHAMELEON_Complex64_t *WORK, int LDWORK ); +int TCORE_zgram( cham_uplo_t uplo, int M, int N, int Mt, int Nt, const CHAM_tile_t *Di, const CHAM_tile_t *Dj, const CHAM_tile_t *D, CHAM_tile_t *A ); + +#endif /* _coreblas_ztile_h_ */ diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index 74758056ea7d90e62f71ad73a0fd6cb872ff88f5..7451dfac6e75293e772cf7e90593ad765831f67d 100644 --- a/include/chameleon/struct.h +++ 
b/include/chameleon/struct.h @@ -28,6 +28,16 @@ BEGIN_C_DECLS +#define CHAMELEON_TILE_FULLRANK (1 << 0) +#define CHAMELEON_TILE_DESC (1 << 1) +#define CHAMELEON_TILE_HMAT (1 << 2) + +typedef struct chameleon_tile_s { + int8_t format; /* tile format: bit flags taken from CHAMELEON_TILE_* */ + int m, n, ld; /* m, n: tile dimensions (presumably rows, columns -- TODO confirm); ld: leading dimension of mat */ + void *mat; /* tile data; a plain array when format has CHAMELEON_TILE_FULLRANK (other formats: TODO confirm) */ +} CHAM_tile_t; + /** * Tile matrix descriptor * @@ -49,17 +59,21 @@ BEGIN_C_DECLS struct chameleon_desc_s; typedef struct chameleon_desc_s CHAM_desc_t; -typedef void* (*blkaddr_fct_t) ( const CHAM_desc_t*, int, int ); -typedef int (*blkldd_fct_t) ( const CHAM_desc_t*, int ); -typedef int (*blkrankof_fct_t)( const CHAM_desc_t*, int, int ); +typedef void* (*blkaddr_fct_t) ( const CHAM_desc_t*, int, int ); +typedef int (*blkldd_fct_t) ( const CHAM_desc_t*, int ); +typedef int (*blkrankof_fct_t)( const CHAM_desc_t*, int, int ); +typedef CHAM_tile_t* (*blktile_fct_t) ( const CHAM_desc_t*, int, int ); struct chameleon_desc_s { + // function to get chameleon tiles descriptor + blktile_fct_t get_blktile; // function to get chameleon tiles address blkaddr_fct_t get_blkaddr; // function to get chameleon tiles leading dimension blkldd_fct_t get_blkldd; // function to get chameleon tiles MPI rank blkrankof_fct_t get_rankof; + CHAM_tile_t *tiles; // pointer to the array of tiles descriptors void *mat; // pointer to the beginning of the matrix size_t A21; // pointer to the beginning of the matrix A21 size_t A12; // pointer to the beginning of the matrix A12 diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index e0b1a56a89595ea45063aa600715f31afaaad29d..70b2d7b1330bbb2a20a9825b81089c8b01766a7f 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -93,7 +93,7 @@ typedef enum chameleon_tasktype_e { typedef int (*cham_unary_operator_t)( const CHAM_desc_t *desc, cham_uplo_t uplo, int m, int n, - void *data, void *op_args ); + CHAM_tile_t *data, void *op_args ); void INSERT_TASK_map( const RUNTIME_option_t *options, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, diff --git
a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index d61e99b81996ac14f34607e6f9f96278080433ec..a5dbef97567bee951f8a6324966fc1289d292e7e 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -32,144 +32,144 @@ */ void INSERT_TASK_dzasum( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zaxpy( const RUNTIME_option_t *options, int M, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int incA, const CHAM_desc_t *B, int Bm, int Bn, int incB ); void INSERT_TASK_zbuild( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, void *user_data, void* user_build_callback ); void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zgelqt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_zgemm( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + 
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_zgessm( const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - const CHAM_desc_t *D, int Dm, int Dn, int ldd, - const CHAM_desc_t *A, int Am, int An, int lda ); + const CHAM_desc_t *L, int Lm, int Ln, + const CHAM_desc_t *D, int Dm, int Dn, + const CHAM_desc_t *A, int Am, int An ); void INSERT_TASK_zgessq( const RUNTIME_option_t *options, cham_store_t storev, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int *IPIV, cham_bool_t check_info, int iinfo ); void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo ); void INSERT_TASK_zgetrf_nopiv( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, int iinfo ); + const CHAM_desc_t *A, int Am, int An, int iinfo ); void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void 
INSERT_TASK_zhemm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int LDB, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zherfb( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zherk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, int lda, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zhessq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zlacpy( const 
RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - int displA, const CHAM_desc_t *A, int Am, int An, int lda, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + int displA, const CHAM_desc_t *A, int Am, int An, + int displB, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlange( const RUNTIME_option_t *options, cham_normtype_t norm, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlange_max( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlanhe( const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlansy( const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlantr( const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlascal( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An, int lda ); + const CHAM_desc_t *A, int Am, int An ); void INSERT_TASK_zlaset( const RUNTIME_option_t *options, cham_uplo_t uplo, 
int n1, int n2, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea ); + const CHAM_desc_t *tileA, int tileAm, int tileAn ); void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea ); + const CHAM_desc_t *tileA, int tileAm, int tileAn ); void INSERT_TASK_zlatro( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_zlauum( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda ); + const CHAM_desc_t *A, int Am, int An ); void INSERT_TASK_zplghe( const RUNTIME_option_t *options, - double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zplssq( const RUNTIME_option_t *options, cham_store_t storev, int M, int N, @@ -179,125 +179,125 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ); void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, 
cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo ); void INSERT_TASK_zssssm( const RUNTIME_option_t *options, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *L1, int L1m, int L1n, int ldl1, - const CHAM_desc_t *L2, int L2m, int L2n, int ldl2, + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *L1, int L1m, int L1n, + const CHAM_desc_t *L2, int L2m, int L2n, const int *IPIV ); void INSERT_TASK_zsymm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int LDB, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + 
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo ); void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, int m, int n, int l, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int l, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, int m, int n, int l, int ib, int nb, - const 
CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztrasm( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ); + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ); void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void 
INSERT_TASK_ztrtri( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo ); void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ); + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ); void INSERT_TASK_ztstrf( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *U, int Um, int Un, int ldu, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *U, int Um, int Un, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo ); void INSERT_TASK_zunmlq( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int ib, int nb, int k, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + const CHAM_desc_t *A, int Am, int An, + 
const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc ); + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ); /** * Keep these insert_task for retro-compatibility @@ -305,117 +305,117 @@ void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, static inline void INSERT_TASK_ztslqt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *T, int Tm, int Tn ) { INSERT_TASK_ztplqt( options, m, n, 0, ib, nb, - A1, A1m, A1n, lda1, - A2, A2m, A2n, lda2, - T, Tm, Tn, ldt ); + A1, A1m, A1n, + A2, A2m, A2n, + T, Tm, Tn ); } static inline void INSERT_TASK_ztsqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *T, int Tm, int Tn ) { INSERT_TASK_ztpqrt( options, m, n, 0, ib, nb, - A1, A1m, A1n, lda1, - A2, A2m, A2n, lda2, - T, Tm, Tn, ldt ); + A1, A1m, A1n, + A2, A2m, A2n, + T, Tm, Tn ); } static inline void INSERT_TASK_zttlqt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int 
A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *T, int Tm, int Tn ) { INSERT_TASK_ztplqt( options, m, n, n, ib, nb, - A1, A1m, A1n, lda1, - A2, A2m, A2n, lda2, - T, Tm, Tn, ldt ); + A1, A1m, A1n, + A2, A2m, A2n, + T, Tm, Tn ); } static inline void INSERT_TASK_zttqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *T, int Tm, int Tn ) { INSERT_TASK_ztpqrt( options, m, n, m, ib, nb, - A1, A1m, A1n, lda1, - A2, A2m, A2n, lda2, - T, Tm, Tn, ldt ); + A1, A1m, A1n, + A2, A2m, A2n, + T, Tm, Tn ); } static inline void INSERT_TASK_ztsmlq( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { (void)m1; (void)n1; INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb, - V, Vm, Vn, ldv, T, Tm, Tn, ldt, - A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); + V, Vm, Vn, T, Tm, Tn, + A1, A1m, A1n, A2, A2m, A2n ); } static inline void INSERT_TASK_ztsmqr( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t 
*A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { (void)m1; (void)n1; INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, nb, - V, Vm, Vn, ldv, T, Tm, Tn, ldt, - A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); + V, Vm, Vn, T, Tm, Tn, + A1, A1m, A1n, A2, A2m, A2n ); } static inline void INSERT_TASK_zttmlq( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { (void)m1; (void)n1; INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb, - V, Vm, Vn, ldv, T, Tm, Tn, ldt, - A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); + V, Vm, Vn, T, Tm, Tn, + A1, A1m, A1n, A2, A2m, A2n ); } static inline void INSERT_TASK_zttmqr( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { (void)m1; (void)n1; INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb, - V, Vm, Vn, ldv, T, Tm, Tn, ldt, - A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); + V, Vm, Vn, T, Tm, Tn, + A1, A1m, A1n, A2, A2m, A2n ); } /** @@ -424,9 +424,9 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options, void INSERT_TASK_zgram( const RUNTIME_option_t *options, cham_uplo_t 
uplo, int m, int n, int mt, int nt, - const CHAM_desc_t *Di, int Dim, int Din, int lddi, - const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, const CHAM_desc_t *D, int Dm, int Dn, - CHAM_desc_t *A, int Am, int An, int lda); + CHAM_desc_t *A, int Am, int An); #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/openmp/codelets/codelet_dzasum.c b/runtime/openmp/codelets/codelet_dzasum.c index 1ce65879b2c4077e42a93d0785e69367bb69ffec..e5d37a1b9dbc7ddbe7efe562898dd8d18bbd8459 100644 --- a/runtime/openmp/codelets/codelet_dzasum.c +++ b/runtime/openmp/codelets/codelet_dzasum.c @@ -2,8 +2,6 @@ * * @file openmp/codelet_dzasum.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * @@ -12,27 +10,26 @@ * @brief Chameleon dzasum OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Florent Pruvost * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" -#include "coreblas/coreblas_z.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_dzasum(const RUNTIME_option_t *options, +void INSERT_TASK_dzasum( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ); - double *ptrB = RTBLKADDR( B, double, Bm, Bn ); -#pragma omp task firstprivate(storev, uplo, M, N, lda, ptrA, ptrB) depend(in:ptrA[0]) 
depend(inout:ptrB[0]) - CORE_dzasum(storev, uplo, M, N, ptrA, lda, ptrB); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + +#pragma omp task firstprivate( storev, uplo, M, N, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_dzasum( storev, uplo, M, N, tileA, tileB->mat ); } diff --git a/runtime/openmp/codelets/codelet_map.c b/runtime/openmp/codelets/codelet_map.c index 4feac8e523e34ee038df7834b1fadb4f3d0b1088..9bceac18976cfa0d5a841343bbbf79f08029e963 100644 --- a/runtime/openmp/codelets/codelet_map.c +++ b/runtime/openmp/codelets/codelet_map.c @@ -10,8 +10,9 @@ * @brief Chameleon map OpenMP codelet * * @version 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-11-21 + * @date 2019-11-19 * */ #include "chameleon_openmp.h" @@ -20,11 +21,11 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, cham_unary_operator_t op_fct, void *op_args ) { - char *ptrA = RTBLKADDR( A, char, Am, An ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); -#pragma omp task depend(inout: ptrA[0]) +#pragma omp task depend( inout: tileA[0] ) { - op_fct( A, uplo, Am, An, ptrA, op_args ); + op_fct( A, uplo, Am, An, tileA, op_args ); } } diff --git a/runtime/openmp/codelets/codelet_zaxpy.c b/runtime/openmp/codelets/codelet_zaxpy.c index 2d23d3ed893d9ff80c201560a712ebe466e735de..1c0006d8217b28e22d02938760ce8914a59eba45 100644 --- a/runtime/openmp/codelets/codelet_zaxpy.c +++ b/runtime/openmp/codelets/codelet_zaxpy.c @@ -2,34 +2,32 @@ * * @file openmp/codelet_zaxpy.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zaxpy StarPU codelet + * @brief Chameleon zaxpy OpenMP codelet * * @version 0.9.2 - * @author Florent Pruvost * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zaxpy(const RUNTIME_option_t *options, +void INSERT_TASK_zaxpy( const RUNTIME_option_t *options, int M, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int incA, - const CHAM_desc_t *B, int Bm, int Bn, int incB) + const CHAM_desc_t *B, int Bm, int Bn, int incB ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(M, alpha, incA, incB, ptrA, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_zaxpy(M, alpha, ptrA, incA, ptrB, incB); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( M, alpha, incA, incB, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_zaxpy( M, alpha, tileA, incA, tileB, incB ); } diff --git a/runtime/openmp/codelets/codelet_zbuild.c b/runtime/openmp/codelets/codelet_zbuild.c index 094d9c2750ca04b3f5e5be566c841ca18fbba3f5..98170904d04e9b92320afa0b3ff72206bc7ac638 100644 --- a/runtime/openmp/codelets/codelet_zbuild.c +++ b/runtime/openmp/codelets/codelet_zbuild.c @@ -2,35 +2,26 @@ * * @file openmp/codelet_zbuild.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zbuild StarPU codelet + * @brief Chameleon zbuild OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Piotr Luszczek - * @author Pierre Lemarinier - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @author Guillaume Sylvand * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zbuild( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, void *user_data, void* user_build_callback ) { int row_min, row_max, col_min, col_max; @@ -38,10 +29,10 @@ void INSERT_TASK_zbuild( const RUNTIME_option_t *options, row_max = Am == A->mt-1 ? A->m-1 : row_min+A->mb-1 ; col_min = An*A->nb ; col_max = An == A->nt-1 ? 
A->n-1 : col_min+A->nb-1 ; - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - void (*callback)(int row_min, int row_max, int col_min, int col_max, void *buffer, int ld, void *user_data) ; + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + void ( *callback )( int row_min, int row_max, int col_min, int col_max, void *buffer, int ld, void *user_data ) ; callback = user_build_callback; -#pragma omp task firstprivate(row_min, row_max, col_min, col_max, ptrA, lda, user_data) depend(inout:ptrA[0]) - callback(row_min, row_max, col_min, col_max, ptrA, lda, user_data); +#pragma omp task firstprivate( row_min, row_max, col_min, col_max, tileA, user_data ) depend( inout:tileA[0] ) + callback( row_min, row_max, col_min, col_max, tileA->mat, tileA->ld, user_data ); } diff --git a/runtime/openmp/codelets/codelet_zgeadd.c b/runtime/openmp/codelets/codelet_zgeadd.c index e1bb12f26d3db3b005c7e1f96e022aa60532b4f5..60e8af6fb3adff9d698ac6eac0dbbf2c30038e42 100644 --- a/runtime/openmp/codelets/codelet_zgeadd.c +++ b/runtime/openmp/codelets/codelet_zgeadd.c @@ -2,90 +2,31 @@ * * @file openmp/codelet_zgeadd.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zgeadd StarPU codelet + * @brief Chameleon zgeadd OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - ****************************************************************************** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * @brief Adds two general matrices together as in PBLAS pzgeadd. - * - * B <- alpha * op(A) + beta * B, - * - * where op(X) = X, X', or conj(X') - * - ******************************************************************************* - * - * @param[in] trans - * Specifies whether the matrix A is non-transposed, transposed, or - * conjugate transposed - * = ChamNoTrans: op(A) = A - * = ChamTrans: op(A) = A' - * = ChamConjTrans: op(A) = conj(A') - * - * @param[in] M - * Number of rows of the matrices op(A) and B. - * - * @param[in] N - * Number of columns of the matrices op(A) and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size LDA-by-N, if trans = ChamNoTrans, LDA-by-M - * otherwise. - * - * @param[in] LDA - * Leading dimension of the array A. LDA >= max(1,k), with k=M, if - * trans = ChamNoTrans, and k=N otherwise. - * - * @param[in] beta - * Scalar factor of B. - * - * @param[in,out] B - * Matrix of size LDB-by-N. - * On exit, B = alpha * op(A) + beta * B - * - * @param[in] LDB - * Leading dimension of the array B. 
LDB >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(trans, m, n, alpha, beta, lda, ldb, ptrA, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_zgeadd(trans, m, n, alpha, ptrA, lda, beta, ptrB, ldb); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( trans, m, n, alpha, beta, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_zgeadd( trans, m, n, alpha, tileA, beta, tileB ); } diff --git a/runtime/openmp/codelets/codelet_zgelqt.c b/runtime/openmp/codelets/codelet_zgelqt.c index d6baed7cff39804d4bc2f9a73ae38a5782e3edc4..b6004abfa9d69c7cbf3d6347190b64a728cb086b 100644 --- a/runtime/openmp/codelets/codelet_zgelqt.c +++ b/runtime/openmp/codelets/codelet_zgelqt.c @@ -2,108 +2,39 @@ * * @file openmp/codelet_zgelqt.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zgelqt StarPU codelet + * @brief Chameleon zgelqt OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zgelqt - computes a LQ factorization of a complex M-by-N tile A: A = L * Q. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A(i,i+1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-min(M,N) lower trapezoidal tile L (L is - * lower triangular if M <= N); the elements above the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. 
- * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, +void INSERT_TASK_zgelqt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0]) +#pragma omp task firstprivate( ws_size, m, n, ib, tileA, tileT ) depend( inout:tileA[0] ) depend( out:tileT[0] ) { CHAMELEON_Complex64_t TAU[ws_size]; CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n ); - CORE_zlaset( ChamUpperLower, ib, m, 0., 0., ptrT, ldt ); - CORE_zgelqt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work ); + TCORE_zlaset( ChamUpperLower, ib, m, 0., 0., tileT ); + TCORE_zgelqt( m, n, ib, tileA, tileT, TAU, work ); } } diff --git a/runtime/openmp/codelets/codelet_zgemm.c b/runtime/openmp/codelets/codelet_zgemm.c index cc15f3d0d7f20c6b6bfce30c9ab71bb0cf72b247..a65f303de24611773203f89384b95c313969036e 100644 --- a/runtime/openmp/codelets/codelet_zgemm.c +++ b/runtime/openmp/codelets/codelet_zgemm.c @@ -2,51 +2,40 @@ * * @file openmp/codelet_zgemm.c * - * @copyright 2009-2014 The University 
of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zgemm StarPU codelet + * @brief Chameleon zgemm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zgemm(const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) +void +INSERT_TASK_zgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(transA, transB, m, n, k, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc) depend(in:ptrA[0], ptrB[0]) depend(inout:ptrC[0]) - CORE_zgemm(transA, transB, - m, n, k, - alpha, ptrA, lda, - ptrB, ldb, - beta, ptrC, ldc); + 
CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); + +#pragma omp task firstprivate( transA, transB, m, n, k, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) + TCORE_zgemm( transA, transB, + m, n, k, + alpha, tileA, + tileB, + beta, tileC ); } diff --git a/runtime/openmp/codelets/codelet_zgeqrt.c b/runtime/openmp/codelets/codelet_zgeqrt.c index 2f2ed8c495643fb244a6d9a7066e22e687632d66..1fdaa8683ae09e90e4495fe5f76de9e4c34b8934 100644 --- a/runtime/openmp/codelets/codelet_zgeqrt.c +++ b/runtime/openmp/codelets/codelet_zgeqrt.c @@ -2,109 +2,39 @@ * * @file openmp/codelet_zgeqrt.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zgeqrt StarPU codelet + * @brief Chameleon zgeqrt OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zgeqrt computes a QR factorization of a complex M-by-N tile A: - * A = Q * R. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(1) H(2) . . . H(k), where k = min(M,N). 
- * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), - * and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and above the diagonal of the array - * contain the min(M,N)-by-N upper trapezoidal tile R (R is - * upper triangular if M >= N); the elements below the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, +void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0]) +#pragma omp task firstprivate( ws_size, m, n, ib, tileA, tileT ) depend( inout:tileA[0] ) depend( out:tileT[0] ) { CHAMELEON_Complex64_t TAU[ws_size]; - CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n); + CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n ); - CORE_zlaset( ChamUpperLower, ib, n, 0., 0., ptrT, ldt ); - CORE_zgeqrt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work ); + TCORE_zlaset( ChamUpperLower, ib, n, 0., 0., tileT ); + TCORE_zgeqrt( m, n, ib, tileA, tileT, TAU, work ); } } diff --git a/runtime/openmp/codelets/codelet_zgessm.c b/runtime/openmp/codelets/codelet_zgessm.c index 29969b08e0fc9449ca84304b6a511bae98221d09..c9bfcd2f33c7cc40ca07793c61b6dcf7a1db2bc5 100644 --- a/runtime/openmp/codelets/codelet_zgessm.c +++ b/runtime/openmp/codelets/codelet_zgessm.c @@ -2,86 +2,33 @@ * * @file openmp/codelet_zgessm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. 
* @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zgessm StarPU codelet + * @brief Chameleon zgessm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zgessm applies the factors L computed by CORE_zgetrf_incpiv to - * a complex M-by-N tile A. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] K - * The number of columns of the tile L. K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] IPIV - * The pivot indices array of size K as returned by - * CORE_zgetrf_incpiv. - * - * @param[in] L - * The M-by-K lower triangular tile. - * - * @param[in] LDL - * The leading dimension of the array L. LDL >= max(1,M). - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, updated by the application of L. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zgessm(const RUNTIME_option_t *options, +void INSERT_TASK_zgessm( const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - const CHAM_desc_t *D, int Dm, int Dn, int ldd, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *L, int Lm, int Ln, + const CHAM_desc_t *D, int Dm, int Dn, + const CHAM_desc_t *A, int Am, int An ) { - CHAMELEON_Complex64_t *ptrD = RTBLKADDR(D, CHAMELEON_Complex64_t, Dm, Dn); - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(m, n, k, ib, IPIV, ptrD, ldd, ptrA, lda) depend(in:ptrD[0]) depend(inout:ptrA[0]) - CORE_zgessm(m, n, k, ib, IPIV, ptrD, ldd, ptrA, lda); + CHAM_tile_t *tileD = D->get_blktile( D, Dm, Dn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( m, n, k, ib, IPIV, tileD, tileA ) depend( in:tileD[0] ) depend( inout:tileA[0] ) + TCORE_zgessm( m, n, k, ib, IPIV, tileD, tileA ); } diff --git a/runtime/openmp/codelets/codelet_zgessq.c b/runtime/openmp/codelets/codelet_zgessq.c index 42453eaacfde2ddc798a70059df8d417c7bd1edb..ccf2ee3435b318a0dd1f48e011366bf3717b71a2 100644 --- a/runtime/openmp/codelets/codelet_zgessq.c +++ b/runtime/openmp/codelets/codelet_zgessq.c @@ -2,35 +2,31 @@ * * @file openmp/codelet_zgessq.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zgessq StarPU codelet + * @brief Chameleon zgessq OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zgessq( const RUNTIME_option_t *options, cham_store_t storev, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrScaleSum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); -#pragma omp task firstprivate(storev, m, n, ptrA, lda, ptrScaleSum) depend(in:ptrA[0]) depend(inout:ptrScaleSum[0]) - CORE_zgessq( storev, m, n, ptrA, lda, ptrScaleSum ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileScaleSum = SCALESUMSQ->get_blktile( SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); +#pragma omp task firstprivate( storev, m, n, tileA, tileScaleSum ) depend( in:tileA[0] ) depend( inout:tileScaleSum[0] ) + TCORE_zgessq( storev, m, n, tileA, tileScaleSum ); } diff --git a/runtime/openmp/codelets/codelet_zgetrf.c b/runtime/openmp/codelets/codelet_zgetrf.c index 8a197d18b2bd58c3507aa0c7fffc3578c242092f..4ed49a30cca24c06036fd58707cd37c1a3b7b1f9 100644 --- a/runtime/openmp/codelets/codelet_zgetrf.c +++ b/runtime/openmp/codelets/codelet_zgetrf.c @@ -2,38 +2,32 @@ * * @file openmp/codelet_zgetrf.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zgetrf StarPU codelet + * @brief Chameleon zgetrf OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zgetrf(const RUNTIME_option_t *options, +void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int *IPIV, - cham_bool_t check_info, int iinfo) + cham_bool_t check_info, int iinfo ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int info = 0; -#pragma omp task firstprivate(m, n, ptrA, lda, IPIV, info) depend(out:IPIV[0]) depend(inout:ptrA[0]) - CORE_zgetrf( m, n, ptrA, lda, IPIV, &info ); +#pragma omp task firstprivate( m, n, tileA, IPIV, info ) depend( out:IPIV[0] ) depend( inout:tileA[0] ) + TCORE_zgetrf( m, n, tileA, IPIV, &info ); } diff --git a/runtime/openmp/codelets/codelet_zgetrf_incpiv.c b/runtime/openmp/codelets/codelet_zgetrf_incpiv.c index 00edb5847a067f7cf8c2366450cdd740d398bdbf..8dcf085390cafcff2f644a75c30f17268b662223 100644 --- a/runtime/openmp/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/openmp/codelets/codelet_zgetrf_incpiv.c @@ -2,95 +2,33 @@ * * @file openmp/codelet_zgetrf_incpiv.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zgetrf_incpiv StarPU codelet + * @brief Chameleon zgetrf_incpiv OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zgetrf_incpiv computes an LU factorization of a general M-by-N tile A - * using partial pivoting with row interchanges. - * - * The factorization has the form - * - * A = P * L * U - * - * where P is a permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile was interchanged with row IPIV(i). - * - * @param[out] INFO - * See returned value. 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, +void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, - cham_bool_t check_info, int iinfo) + cham_bool_t check_info, int iinfo ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int info = 0; -#pragma omp task firstprivate(m, n, ib, ptrA, lda, IPIV, info) depend(out:IPIV[0]) depend(inout:ptrA[0]) - CORE_zgetrf_incpiv(m, n, ib, ptrA, lda, IPIV, &info); +#pragma omp task firstprivate( m, n, ib, tileA, IPIV, info ) depend( out:IPIV[0] ) depend( inout:tileA[0] ) + TCORE_zgetrf_incpiv( m, n, ib, tileA, IPIV, &info ); } diff --git a/runtime/openmp/codelets/codelet_zgetrf_nopiv.c b/runtime/openmp/codelets/codelet_zgetrf_nopiv.c index 86a520800969781dfad57bfc892575b8cc8de36c..53446efac813836c5c954046ceea02e9ab2356f0 100644 --- a/runtime/openmp/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/openmp/codelets/codelet_zgetrf_nopiv.c @@ -2,83 +2,31 @@ * * @file openmp/codelet_zgetrf_nopiv.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zgetrf_nopiv StarPU codelet + * @brief Chameleon zgetrf_nopiv OpenMP codelet * * @version 0.9.2 - * @author Omar Zenati - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zgetrf_nopiv computes an LU factorization of a general diagonal - * dominant M-by-N matrix A witout pivoting. - * - * The factorization has the form - * A = L * U - * where L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 3 BLAS version of the algorithm. - * WARNING: Your matrix need to be diagonal dominant if you want to call this - * routine safely. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] IB - * The block size to switch between blocked and unblocked code. - * - * @param[in,out] A - * On entry, the M-by-N matrix to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. 
The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, +void INSERT_TASK_zgetrf_nopiv( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo) + const CHAM_desc_t *A, int Am, int An, + int iinfo ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int info = 0; -#pragma omp task firstprivate(m, n, ib, ptrA, lda, info) depend(inout:ptrA[0]) - CORE_zgetrf_nopiv(m, n, ib, ptrA, lda, &info); +#pragma omp task firstprivate( m, n, ib, tileA, info ) depend( inout:tileA[0] ) + TCORE_zgetrf_nopiv( m, n, ib, tileA, &info ); } diff --git a/runtime/openmp/codelets/codelet_zgram.c b/runtime/openmp/codelets/codelet_zgram.c index 1250573332538d53bf201c4b82fb7e57ec2aa1c3..af0363df7a1818789390d1e70f9bc477656e892e 100644 --- a/runtime/openmp/codelets/codelet_zgram.c +++ b/runtime/openmp/codelets/codelet_zgram.c @@ -10,34 +10,30 @@ * @brief Chameleon zgram OpenMP codelet * * @version 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Florent Pruvost - * @date 2019-04-10 + * @date 2019-11-19 * @precisions normal z -> s d c z * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zgram( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mt, int nt, - const CHAM_desc_t *Di, int Dim, int Din, int lddi, - const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, - const CHAM_desc_t *D, int Dm, int Dn, - CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, + const CHAM_desc_t *D, int Dm, int Dn, + CHAM_desc_t *A, int Am, int An ) { - double *ptrDi = 
RTBLKADDR(Di, double, Dim, Din); - double *ptrDj = RTBLKADDR(Dj, double, Djm, Djn); - double *ptrD = RTBLKADDR(D, double, Dm, Dn); - double *ptrA = RTBLKADDR(A, double, Am, An); + CHAM_tile_t *tileDi = Di->get_blktile( Di, Dim, Din ); + CHAM_tile_t *tileDj = Dj->get_blktile( Dj, Djm, Djn ); + CHAM_tile_t *tileD = D->get_blktile( D, Dm, Dn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); -#pragma omp task firstprivate(uplo, m, n, mt, nt, ptrDi, lddi, ptrDj, lddj, ptrD, ptrA, lda) depend(in:ptrDi[0], ptrDj[0], ptrD[0]) depend(inout:ptrA[0]) - CORE_zgram( uplo, - m, n, mt, nt, - ptrDi, lddi, - ptrDj, lddj, - ptrD, - ptrA, lda); +#pragma omp task firstprivate( uplo, m, n, mt, nt, tileDi, tileDj, tileD, tileA ) depend( in:tileDi[0], tileDj[0], tileD[0] ) depend( inout:tileA[0] ) + TCORE_zgram( uplo, m, n, mt, nt, + tileDi, tileDj, tileD, tileA ); } diff --git a/runtime/openmp/codelets/codelet_zhe2ge.c b/runtime/openmp/codelets/codelet_zhe2ge.c index f0730231c80955dc20f77ca5335e1b44fc17c484..df85d582d252ce765e80271e94e22c1609b32e74 100644 --- a/runtime/openmp/codelets/codelet_zhe2ge.c +++ b/runtime/openmp/codelets/codelet_zhe2ge.c @@ -2,38 +2,32 @@ * * @file openmp/codelet_zhe2ge.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zhe2ge StarPU codelet + * @brief Chameleon zhe2ge OpenMP codelet * * @version 0.9.2 * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, +void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ); -#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in: ptrA[0]) depend(inout:ptrB[0]) - CORE_zhe2ge(uplo, m, n, ptrA, lda, ptrB, ldb); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( uplo, m, n, tileA, tileB ) depend( in: tileA[0] ) depend( inout:tileB[0] ) + TCORE_zhe2ge( uplo, m, n, tileA, tileB ); } diff --git a/runtime/openmp/codelets/codelet_zhemm.c b/runtime/openmp/codelets/codelet_zhemm.c index 91cafbb923023f0dd460356a9df79cf905e22e91..f472ceac36ef7eae036940e4813a6a09de184660 100644 --- a/runtime/openmp/codelets/codelet_zhemm.c +++ b/runtime/openmp/codelets/codelet_zhemm.c @@ -2,51 +2,38 @@ * * @file openmp/codelet_zhemm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zhemm StarPU codelet + * @brief Chameleon zhemm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, +void INSERT_TASK_zhemm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(side, uplo, m, n, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc) depend(in:ptrA[0], ptrB[0]) depend(inout:ptrC[0]) - CORE_zhemm(side, uplo, + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); +#pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) + TCORE_zhemm( side, uplo, m, n, - alpha, ptrA, lda, - ptrB, ldb, - beta, 
ptrC, ldc); + alpha, tileA, + tileB, + beta, tileC ); } diff --git a/runtime/openmp/codelets/codelet_zher2k.c b/runtime/openmp/codelets/codelet_zher2k.c index 0ad4992a4108bd78ebe5faf0b213ef1daf37ccf0..1c67a996ce1a34ca9610ff465792cf4a963c847e 100644 --- a/runtime/openmp/codelets/codelet_zher2k.c +++ b/runtime/openmp/codelets/codelet_zher2k.c @@ -2,48 +2,35 @@ * * @file openmp/codelet_zher2k.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zher2k StarPU codelet + * @brief Chameleon zher2k OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zher2k(const RUNTIME_option_t *options, +void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - 
CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(uplo, trans, n, k, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc) depend(in:ptrA[0], ptrB[0]) depend(inout:ptrC[0]) - CORE_zher2k(uplo, trans, - n, k, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); +#pragma omp task firstprivate( uplo, trans, n, k, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) + TCORE_zher2k( uplo, trans, + n, k, alpha, tileA, tileB, beta, tileC ); } diff --git a/runtime/openmp/codelets/codelet_zherfb.c b/runtime/openmp/codelets/codelet_zherfb.c index bb6f5a4dca133be3d0eabe668f8230b36f202030..6dfa2fe3a15b38dca26c1a3b19e81a26c900b5ec 100644 --- a/runtime/openmp/codelets/codelet_zherfb.c +++ b/runtime/openmp/codelets/codelet_zherfb.c @@ -2,43 +2,38 @@ * * @file openmp/codelet_zherfb.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zherfb StarPU codelet + * @brief Chameleon zherfb OpenMP codelet * * @version 0.9.2 * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zherfb(const RUNTIME_option_t *options, +void INSERT_TASK_zherfb( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) +#pragma omp task firstprivate( ws_size, uplo, n, k, ib, nb, tileA, tileT ) depend( in:tileA[0], tileT[0] ) depend( inout:tileC[0] ) { CHAMELEON_Complex64_t work[ws_size]; - CORE_zherfb(uplo, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); + TCORE_zherfb( uplo, n, k, ib, nb, tileA, tileT, tileC, work, nb ); } } diff --git a/runtime/openmp/codelets/codelet_zherk.c b/runtime/openmp/codelets/codelet_zherk.c index d9742c2677e114e8b619ea73b9fb951c30264669..d05ef433174d13f65951e9ab971108a2e5f2d833 100644 --- a/runtime/openmp/codelets/codelet_zherk.c +++ b/runtime/openmp/codelets/codelet_zherk.c 
@@ -2,48 +2,35 @@ * * @file openmp/codelet_zherk.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zherk StarPU codelet + * @brief Chameleon zherk OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zherk(const RUNTIME_option_t *options, +void INSERT_TASK_zherk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, int lda, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(uplo, trans, n, k, alpha, ptrA, lda, beta, ptrC, ldc) depend(in:ptrA[0]) depend(inout:ptrC[0]) - CORE_zherk(uplo, trans, + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); +#pragma omp task firstprivate( uplo, trans, n, k, alpha, tileA, beta, tileC ) depend( in:tileA[0] ) depend( inout:tileC[0] ) + TCORE_zherk( uplo, trans, n, k, - alpha, ptrA, lda, - beta, ptrC, ldc); + alpha, tileA, + 
beta, tileC ); } diff --git a/runtime/openmp/codelets/codelet_zhessq.c b/runtime/openmp/codelets/codelet_zhessq.c index 85232d001a1b29286796d1a2e17e77eb095f310c..7f1c8ae11bb226749794a67176955accafe899de 100644 --- a/runtime/openmp/codelets/codelet_zhessq.c +++ b/runtime/openmp/codelets/codelet_zhessq.c @@ -2,21 +2,17 @@ * * @file openmp/codelet_zhessq.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zhessq StarPU codelet + * @brief Chameleon zhessq OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c * */ @@ -25,10 +21,10 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { INSERT_TASK_zsyssq( options, storev, uplo, n, - A, Am, An, lda, + A, Am, An, SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c index e483d86d10ac5c2671f55addb415499065a754f8..a2bdc0947ceb984e3e12b7987a23c442931ec568 100644 --- a/runtime/openmp/codelets/codelet_zlacpy.c +++ b/runtime/openmp/codelets/codelet_zlacpy.c @@ -2,54 +2,50 @@ * * @file openmp/codelet_zlacpy.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zlacpy StarPU codelet + * @brief Chameleon zlacpy OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, int lda, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb) + int displA, const CHAM_desc_t *A, int Am, int An, + int displB, const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A + displA, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B + displB, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(uplo, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_zlacpy(uplo, m, n, ptrA, lda, ptrB, ldb); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + +#pragma omp task firstprivate( uplo, m, n, displA, tileA, displB, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + { + CHAMELEON_Complex64_t *A = tileA->mat; + CHAMELEON_Complex64_t *B = tileB->mat; + + CORE_zlacpy( uplo, m, n, A + displA, tileA->ld, B + displB, tileB->ld ); + } } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const 
CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, lda, - 0, B, Bm, Bn, ldb ); + 0, A, Am, An, + 0, B, Bm, Bn ); } diff --git a/runtime/openmp/codelets/codelet_zlag2c.c b/runtime/openmp/codelets/codelet_zlag2c.c index cc3b003c0b795a3a0d77792a8d022c1a1c462b84..1edde747d708f3102289bae61c77b46bc84c6408 100644 --- a/runtime/openmp/codelets/codelet_zlag2c.c +++ b/runtime/openmp/codelets/codelet_zlag2c.c @@ -2,53 +2,42 @@ * * @file openmp/codelet_zlag2c.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zlag2c StarPU codelet + * @brief Chameleon zlag2c OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions mixed zc -> ds * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ void INSERT_TASK_zlag2c( const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex32_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex32_t, Bm, Bn); -#pragma omp task firstprivate(m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_zlag2c( m, n, ptrA, lda, 
ptrB, ldb); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); /* tile handle, not a raw element buffer */ +#pragma omp task firstprivate( m, n, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_zlag2c( m, n, tileA, tileB ); } void INSERT_TASK_clag2z( const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex32_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_clag2z( m, n, ptrA, lda, ptrB, ldb); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); /* tile handle, not a raw element buffer */ + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( m, n, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_clag2z( m, n, tileA, tileB ); } diff --git a/runtime/openmp/codelets/codelet_zlange.c b/runtime/openmp/codelets/codelet_zlange.c index 5ed3ec11cf89f80cf51a44de8ccbb8856406d77a..b9ec27f1052e8e37d56762de1491e1b4d8fe4ae0 100644 --- a/runtime/openmp/codelets/codelet_zlange.c +++ b/runtime/openmp/codelets/codelet_zlange.c @@ -2,55 +2,55 @@ * * @file openmp/codelet_zlange.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved.
* *** * - * @brief Chameleon zlange StarPU codelet + * @brief Chameleon zlange OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zlange(const RUNTIME_option_t *options, - cham_normtype_t norm, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_zlange( const RUNTIME_option_t *options, + cham_normtype_t norm, int M, int N, int NB, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrB = RTBLKADDR(B, double, Bm, Bn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, M, N, ptrA, LDA, ptrB, options) depend(in:ptrA[0]) depend(inout:ptrB[0]) +#pragma omp task firstprivate( ws_size, M, N, tileA, tileB, options ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - double work[ws_size]; - CORE_zlange( norm, M, N, ptrA, LDA, work, ptrB); + double work[ws_size]; + TCORE_zlange( norm, M, N, tileA, work, tileB->mat ); } } -void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, +void INSERT_TASK_zlange_max( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) + const CHAM_desc_t *B, int Bm, int Bn ) { - double *ptrA = RTBLKADDR(A, double, Am, An); - double *ptrB = RTBLKADDR(B, double, Bm, Bn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + 
CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); -#pragma omp task firstprivate(ptrA, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) +#pragma omp task firstprivate( tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - if ( *ptrA > *ptrB ) - *ptrB = *ptrA; + double *A, *B; + + A = tileA->mat; + B = tileB->mat; + + if ( A[0] > B[0] ) { + B[0] = A[0]; + } } } diff --git a/runtime/openmp/codelets/codelet_zlanhe.c b/runtime/openmp/codelets/codelet_zlanhe.c index 5f9e0a77179414645cecf4de662bebb9a3eec789..9b13a208ffb89d38ce148902acd7e41f9916952b 100644 --- a/runtime/openmp/codelets/codelet_zlanhe.c +++ b/runtime/openmp/codelets/codelet_zlanhe.c @@ -2,41 +2,36 @@ * * @file openmp/codelet_zlanhe.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zlanhe StarPU codelet + * @brief Chameleon zlanhe OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, - cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_zlanhe( const RUNTIME_option_t *options, + cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - 
double *normA = RTBLKADDR(B, double, Bm, Bn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, norm, uplo, N, ptrA, LDA, normA) depend(in:ptrA[0]) depend(inout:normA[0]) + +#pragma omp task firstprivate( ws_size, norm, uplo, N, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - double work[ws_size]; - CORE_zlanhe( norm, uplo, N, ptrA, LDA, work, normA); + double work[ws_size]; + TCORE_zlanhe( norm, uplo, N, tileA, work, tileB->mat ); } } diff --git a/runtime/openmp/codelets/codelet_zlansy.c b/runtime/openmp/codelets/codelet_zlansy.c index 147b59da4c3634f5ebf66657975ba988c4e98d3f..9046ca44c76753e1cb8320db039697f805ca2574 100644 --- a/runtime/openmp/codelets/codelet_zlansy.c +++ b/runtime/openmp/codelets/codelet_zlansy.c @@ -2,41 +2,36 @@ * * @file openmp/codelet_zlansy.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zlansy StarPU codelet + * @brief Chameleon zlansy OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zlansy(const RUNTIME_option_t *options, - cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_zlansy( const RUNTIME_option_t *options, + cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *normA = RTBLKADDR(B, double, Bm, Bn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, norm, uplo, N, ptrA, LDA, normA) depend(in:ptrA[0]) depend(inout:normA[0]) + +#pragma omp task firstprivate( ws_size, norm, uplo, N, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - double work[ws_size]; - CORE_zlansy( norm, uplo, N, ptrA, LDA, work, normA); + double work[ws_size]; + TCORE_zlansy( norm, uplo, N, tileA, work, tileB->mat ); } } diff --git a/runtime/openmp/codelets/codelet_zlantr.c b/runtime/openmp/codelets/codelet_zlantr.c index 994d2fb26f966b92621a1f91b7a041c09a211c8f..0006b23035387bdace48a1446400e1c8b65e8f8c 100644 --- a/runtime/openmp/codelets/codelet_zlantr.c +++ b/runtime/openmp/codelets/codelet_zlantr.c @@ -2,40 +2,36 @@ * * @file openmp/codelet_zlantr.c * - * @copyright 2009-2014 
The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zlantr StarPU codelet + * @brief Chameleon zlantr OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zlantr(const RUNTIME_option_t *options, - cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, - int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_zlantr( const RUNTIME_option_t *options, + cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, + int M, int N, int NB, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrB = RTBLKADDR(B, double, Bm, Bn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); int ws_wsize = options->ws_wsize; -#pragma omp task firstprivate(ws_wsize, norm, uplo, diag, M, N, ptrA, LDA, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) +#pragma omp task firstprivate( ws_wsize, norm, uplo, diag, M, N, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) { - double work[ws_wsize]; - CORE_zlantr(norm, uplo, diag, M, N, ptrA, LDA, work, ptrB); + double work[ws_wsize]; + TCORE_zlantr( norm, uplo, diag, M, N, tileA, work, tileB->mat ); } } diff --git a/runtime/openmp/codelets/codelet_zlascal.c b/runtime/openmp/codelets/codelet_zlascal.c index 
84944c9093f5d21768788da53b9149e3970ea497..71bb938c73a38df3063793c7df70ae7b5ee96cf7 100644 --- a/runtime/openmp/codelets/codelet_zlascal.c +++ b/runtime/openmp/codelets/codelet_zlascal.c @@ -2,67 +2,31 @@ * * @file openmp/codelet_zlascal.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zlascal StarPU codelet + * @brief Chameleon zlascal OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Dalal Sukkari * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zlascal adds to matrices together. - * - * A <- alpha * A - * - ******************************************************************************* - * - * @param[in] M - * Number of rows of the matrices A and B. - * - * @param[in] N - * Number of columns of the matrices A and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size LDA-by-N. - * - * @param[in] LDA - * Leading dimension of the array A. 
LDA >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ - -void INSERT_TASK_zlascal(const RUNTIME_option_t *options, - cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An, int lda) +void INSERT_TASK_zlascal( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, + const CHAM_desc_t *A, int Am, int An ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, m, n, alpha, ptrA, lda) depend(inout:ptrA[0]) - CORE_zlascal(uplo, m, n, alpha, ptrA, lda); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, m, n, alpha, tileA ) depend( inout:tileA[0] ) + TCORE_zlascal( uplo, m, n, alpha, tileA ); } diff --git a/runtime/openmp/codelets/codelet_zlaset.c b/runtime/openmp/codelets/codelet_zlaset.c index 8884f4cbf8a37e30afbbd78097ac188267cea79e..5e083ffb12d5db6a9fc9a2f602a4f58a86027ad2 100644 --- a/runtime/openmp/codelets/codelet_zlaset.c +++ b/runtime/openmp/codelets/codelet_zlaset.c @@ -2,73 +2,30 @@ * * @file openmp/codelet_zlaset.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zlaset StarPU codelet + * @brief Chameleon zlaset OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zlaset - Sets the elements of the matrix A on the diagonal - * to beta and on the off-diagonals to alpha - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: Upper part of A is set; - * = ChamLower: Lower part of A is set; - * = ChamUpperLower: ALL elements of A are set. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the off-diagonal elements are to be set. - * - * @param[in] beta - * The constant to which the diagonal elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set accordingly. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). 
- * - */ -void INSERT_TASK_zlaset(const RUNTIME_option_t *options, - cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - const CHAM_desc_t *A, int Am, int An, int LDA) +void INSERT_TASK_zlaset( const RUNTIME_option_t *options, + cham_uplo_t uplo, int M, int N, + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, + const CHAM_desc_t *A, int Am, int An ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, M, N, alpha, beta, ptrA, LDA) depend(inout:ptrA[0]) - CORE_zlaset(uplo, M, N, alpha, beta, ptrA, LDA); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, M, N, alpha, beta, tileA ) depend( inout:tileA[0] ) + TCORE_zlaset( uplo, M, N, alpha, beta, tileA ); } diff --git a/runtime/openmp/codelets/codelet_zlaset2.c b/runtime/openmp/codelets/codelet_zlaset2.c index 87fb57d9e275372ae9cd86ae62617f466c2ebf65..0203e4b79b73d2e2a0e53a7cc8f5c43bb33bb15c 100644 --- a/runtime/openmp/codelets/codelet_zlaset2.c +++ b/runtime/openmp/codelets/codelet_zlaset2.c @@ -2,71 +2,30 @@ * * @file openmp/codelet_zlaset2.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zlaset2 StarPU codelet + * @brief Chameleon zlaset2 OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" - -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zlaset2 - Sets the elements of the matrix A to alpha. - * Not LAPACK compliant! Read below. - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: STRICT Upper part of A is set to alpha; - * = ChamLower: STRICT Lower part of A is set to alpha; - * = ChamUpperLower: ALL elements of A are set to alpha. - * Not LAPACK Compliant. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set to alpha accordingly. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). 
- * - */ -void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, - cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int LDA) +void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, + cham_uplo_t uplo, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, M, N, alpha, ptrA, LDA) depend(inout:ptrA[0]) - CORE_zlaset2(uplo, M, N, alpha, ptrA, LDA); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, M, N, alpha, tileA ) depend( inout:tileA[0] ) + TCORE_zlaset2( uplo, M, N, alpha, tileA ); } diff --git a/runtime/openmp/codelets/codelet_zlatro.c b/runtime/openmp/codelets/codelet_zlatro.c index 86c0c4b884773083383c5fb9ce7a5615c7540dda..408a5a7fdec6072d0a747d707fdc56f3d97e77be 100644 --- a/runtime/openmp/codelets/codelet_zlatro.c +++ b/runtime/openmp/codelets/codelet_zlatro.c @@ -2,45 +2,32 @@ * * @file openmp/codelet_zlatro.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zlatro StarPU codelet + * @brief Chameleon zlatro OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ void INSERT_TASK_zlatro( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(uplo, trans, m, n, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_zlatro(uplo, trans, m, n, ptrA, lda, ptrB, ldb); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( uplo, trans, m, n, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_zlatro( uplo, trans, m, n, tileA, tileB ); } diff --git a/runtime/openmp/codelets/codelet_zlauum.c b/runtime/openmp/codelets/codelet_zlauum.c index 3d729f974fadd7aca74a3be1a6790469b0a5bb1c..48f1704d1024f4ae1b7b3114931639c7cbd8eb47 100644 --- a/runtime/openmp/codelets/codelet_zlauum.c +++ b/runtime/openmp/codelets/codelet_zlauum.c @@ -2,42 +2,29 @@ * * @file openmp/codelet_zlauum.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * 
Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zlauum StarPU codelet + * @brief Chameleon zlauum OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zlauum(const RUNTIME_option_t *options, +void INSERT_TASK_zlauum( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *A, int Am, int An ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, n, ptrA, lda) depend(inout:ptrA[0]) - CORE_zlauum(uplo, n, ptrA, lda); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, n, tileA ) depend( inout:tileA[0] ) + TCORE_zlauum( uplo, n, tileA ); } diff --git a/runtime/openmp/codelets/codelet_zplghe.c b/runtime/openmp/codelets/codelet_zplghe.c index f7721e74ca45d8c706b2984c55df581e80c81896..148360e139f4e4a41fb34e01324ebf6aa23445e4 100644 --- a/runtime/openmp/codelets/codelet_zplghe.c +++ b/runtime/openmp/codelets/codelet_zplghe.c @@ -2,37 +2,29 @@ * * @file openmp/codelet_zplghe.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. 
All rights reserved. * *** * - * @brief Chameleon zplghe StarPU codelet + * @brief Chameleon zplghe OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Piotr Luszczek - * @author Pierre Lemarinier - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zplghe( const RUNTIME_option_t *options, - double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(bump, m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0]) - CORE_zplghe( bump, m, n, ptrA, lda, bigM, m0, n0, seed ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( bump, m, n, tileA, bigM, m0, n0, seed ) depend( inout:tileA[0] ) + TCORE_zplghe( bump, m, n, tileA, bigM, m0, n0, seed ); } diff --git a/runtime/openmp/codelets/codelet_zplgsy.c b/runtime/openmp/codelets/codelet_zplgsy.c index d41878c377a5b2b4e9a8326195a2e3e27412cf22..f33f432205e1d2a8122eb62c9302163859049379 100644 --- a/runtime/openmp/codelets/codelet_zplgsy.c +++ b/runtime/openmp/codelets/codelet_zplgsy.c @@ -2,37 +2,29 @@ * * @file openmp/codelet_zplgsy.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zplgsy StarPU codelet + * @brief Chameleon zplgsy OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Piotr Luszczek - * @author Pierre Lemarinier - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(bump, m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0]) - CORE_zplgsy( bump, m, n, ptrA, lda, bigM, m0, n0, seed ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( bump, m, n, tileA, bigM, m0, n0, seed ) depend( inout:tileA[0] ) + TCORE_zplgsy( bump, m, n, tileA, bigM, m0, n0, seed ); } diff --git a/runtime/openmp/codelets/codelet_zplrnt.c b/runtime/openmp/codelets/codelet_zplrnt.c index 9827cc74c1cd01889c466e0a3a5aa7aa94c4fde3..4251214c6143fed6b62ac4f9cf3c3bc49d5268e2 100644 --- a/runtime/openmp/codelets/codelet_zplrnt.c +++ b/runtime/openmp/codelets/codelet_zplrnt.c @@ -2,37 +2,29 @@ * * @file openmp/codelet_zplrnt.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zplrnt StarPU codelet + * @brief Chameleon zplrnt OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Piotr Luszczek - * @author Pierre Lemarinier - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(m, n, ptrA, lda, bigM, m0, n0, seed) depend(inout:ptrA[0]) - CORE_zplrnt( m, n, ptrA, lda, bigM, m0, n0, seed ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( m, n, tileA, bigM, m0, n0, seed ) depend( inout:tileA[0] ) + TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed ); } diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c index ad59a3eb6955f386297753e19bc79a9a12f32bc0..260a1f29bde9bb3a68d5b8976b2d9a4fe8c6210f 100644 --- a/runtime/openmp/codelets/codelet_zplssq.c +++ b/runtime/openmp/codelets/codelet_zplssq.c @@ -2,45 +2,47 @@ * * @file openmp/codelet_zplssq.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zplssq StarPU codelet + * @brief Chameleon zplssq OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 - * @author Mathieu Faverge * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include <math.h> #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zplssq( const RUNTIME_option_t *options, cham_store_t storev, int M, int N, const CHAM_desc_t *IN, int INm, int INn, const CHAM_desc_t *OUT, int OUTm, int OUTn ) { - double *sclssq_in = RTBLKADDR(IN, double, INm, INn ); - double *sclssq_out = RTBLKADDR(OUT, double, OUTm, OUTn); -#pragma omp task firstprivate(storev, M, N) depend(in: sclssq_in[0]) depend(inout: sclssq_out[0]) - CORE_zplssq(storev, M, N, sclssq_in, sclssq_out); + CHAM_tile_t *tileIN = IN->get_blktile( IN, INm, INn ); + CHAM_tile_t *tileOUT = OUT->get_blktile( OUT, OUTm, OUTn ); + + assert( tileIN->format & CHAMELEON_TILE_FULLRANK ); + assert( tileOUT->format & CHAMELEON_TILE_FULLRANK ); + +#pragma omp task firstprivate( storev, M, N ) depend( in: tileIN[0] ) depend( inout: tileOUT[0] ) + CORE_zplssq( storev, M, N, tileIN->mat, tileOUT->mat ); } void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { - double *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn); + CHAM_tile_t *tileRESULT = RESULT->get_blktile( RESULT, RESULTm, RESULTn ); + + assert( tileRESULT->format & CHAMELEON_TILE_FULLRANK ); -#pragma omp task firstprivate(N) depend(inout: res[0]) - CORE_zplssq2(N, res); +#pragma omp task firstprivate( N ) depend( inout: tileRESULT[0] ) + CORE_zplssq2( N, tileRESULT->mat ); } diff --git a/runtime/openmp/codelets/codelet_zpotrf.c b/runtime/openmp/codelets/codelet_zpotrf.c index 
72ac47a75b6c56bc04ff96b3f51438e5e6e52179..aea59eab66db7d93f2719682936a730bb5a11505 100644 --- a/runtime/openmp/codelets/codelet_zpotrf.c +++ b/runtime/openmp/codelets/codelet_zpotrf.c @@ -2,46 +2,33 @@ * * @file openmp/codelet_zpotrf.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zpotrf StarPU codelet + * @brief Chameleon zpotrf OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" #include "coreblas.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, +void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo) + const CHAM_desc_t *A, int Am, int An, + int iinfo ) { - (void)nb; + ( void )nb; int info = 0; - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, n, lda, info, ptrA) depend(inout:ptrA[0]) - CORE_zpotrf(uplo, n, ptrA, lda, &info); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, n, info, tileA ) depend( inout:tileA[0] ) + TCORE_zpotrf( uplo, n, tileA, &info ); } diff --git a/runtime/openmp/codelets/codelet_zssssm.c b/runtime/openmp/codelets/codelet_zssssm.c index 
c1fd4b896400c43abebc5da2bfc150b0db376a88..a0e23614ed8dd39373d40e8c1540d22ab60d69ee 100644 --- a/runtime/openmp/codelets/codelet_zssssm.c +++ b/runtime/openmp/codelets/codelet_zssssm.c @@ -2,114 +2,38 @@ * * @file openmp/codelet_zssssm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zssssm StarPU codelet + * @brief Chameleon zssssm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zssssm applies the LU factorization update from a complex - * matrix formed by a lower triangular IB-by-K tile L1 on top of a - * M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 - * tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2 and of the tile L2. - * M2 >= 0. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * - * @param[in] K - * The number of columns of the tiles L1 and L2. K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. 
- * On exit, A1 is updated by the application of L (L1 L2). - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is updated by the application of L (L1 L2). - * - * @param[in] LDA2 - * The leading dimension of the array A2. LDA2 >= max(1,M2). - * - * @param[in] L1 - * The IB-by-K lower triangular tile as returned by - * CORE_ztstrf. - * - * @param[in] LDL1 - * The leading dimension of the array L1. LDL1 >= max(1,IB). - * - * @param[in] L2 - * The M2-by-K tile as returned by CORE_ztstrf. - * - * @param[in] LDL2 - * The leading dimension of the array L2. LDL2 >= max(1,M2). - * - * @param[in] IPIV - * The pivot indices array of size K as returned by - * CORE_ztstrf. - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zssssm(const RUNTIME_option_t *options, +void INSERT_TASK_zssssm( const RUNTIME_option_t *options, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *L1, int L1m, int L1n, int ldl1, - const CHAM_desc_t *L2, int L2m, int L2n, int ldl2, - const int *IPIV) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *L1, int L1m, int L1n, + const CHAM_desc_t *L2, int L2m, int L2n, + const int *IPIV ) { - CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); - CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); - CHAMELEON_Complex64_t *ptrL1 = RTBLKADDR(L1, CHAMELEON_Complex64_t, L1m, L1n); - CHAMELEON_Complex64_t *ptrL2 = RTBLKADDR(L2, CHAMELEON_Complex64_t, L2m, L2n); + CHAM_tile_t *tileA1 = A1->get_blktile( A1, 
A1m, A1n ); + CHAM_tile_t *tileA2 = A2->get_blktile( A2, A2m, A2n ); + CHAM_tile_t *tileL1 = L1->get_blktile( L1, L1m, L1n ); + CHAM_tile_t *tileL2 = L2->get_blktile( L2, L2m, L2n ); -#pragma omp task firstprivate(m1, n1, m2, n2, k, ib, ptrA1, ptrA2, ptrL1, ptrL2, lda1, lda2, ldl1, ldl2, IPIV) \ - depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrL1[0], ptrL2[0]) - CORE_zssssm(m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrL1, ldl1, ptrL2, ldl2, IPIV); +#pragma omp task firstprivate( m1, n1, m2, n2, k, ib, tileA1, tileA2, tileL1, tileL2, IPIV ) \ + depend( inout:tileA1[0], tileA2[0] ) depend( in:tileL1[0], tileL2[0] ) + TCORE_zssssm( m1, n1, m2, n2, k, ib, tileA1, tileA2, tileL1, tileL2, IPIV ); } diff --git a/runtime/openmp/codelets/codelet_zsymm.c b/runtime/openmp/codelets/codelet_zsymm.c index 85bb6c0fe491cbc9e14b655cdc64e4a396295d4e..60bbbfbd1914f5e1212c8765778f0c9bd7498c50 100644 --- a/runtime/openmp/codelets/codelet_zsymm.c +++ b/runtime/openmp/codelets/codelet_zsymm.c @@ -2,49 +2,38 @@ * * @file openmp/codelet_zsymm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zsymm StarPU codelet + * @brief Chameleon zsymm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zsymm(const RUNTIME_option_t *options, +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_zsymm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(side, uplo, m, n, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc) depend(in:ptrA[0], ptrB[0]) depend(inout:ptrC[0]) - CORE_zsymm(side, uplo, + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); +#pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) + TCORE_zsymm( side, uplo, m, n, - alpha, ptrA, lda, - ptrB, ldb, - beta, ptrC, ldc); + alpha, tileA, + tileB, + beta, tileC ); } 
diff --git a/runtime/openmp/codelets/codelet_zsyr2k.c b/runtime/openmp/codelets/codelet_zsyr2k.c index bedb0ef46e2d4bf7203ba46df83e1da70ab95e02..73d09b143ba46b8bd15ab7d1ebf2e145821bc628 100644 --- a/runtime/openmp/codelets/codelet_zsyr2k.c +++ b/runtime/openmp/codelets/codelet_zsyr2k.c @@ -2,49 +2,36 @@ * * @file openmp/codelet_zsyr2k.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zsyr2k StarPU codelet + * @brief Chameleon zsyr2k OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, +void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { - (void)nb; - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - 
CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(uplo, trans, n, k, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc) depend(in:ptrA[0], ptrB[0]) depend(inout:ptrC[0]) - CORE_zsyr2k(uplo, trans, - n, k, alpha, ptrA, lda, ptrB, ldb, beta, ptrC, ldc); + ( void )nb; + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); +#pragma omp task firstprivate( uplo, trans, n, k, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) + TCORE_zsyr2k( uplo, trans, + n, k, alpha, tileA, tileB, beta, tileC ); } diff --git a/runtime/openmp/codelets/codelet_zsyrk.c b/runtime/openmp/codelets/codelet_zsyrk.c index 797aa4b089b05621c210d9928b23b9500f7371f0..89d674a7fea68c4ad994fedc9348947e75e86d62 100644 --- a/runtime/openmp/codelets/codelet_zsyrk.c +++ b/runtime/openmp/codelets/codelet_zsyrk.c @@ -2,49 +2,36 @@ * * @file openmp/codelet_zsyrk.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon zsyrk StarPU codelet + * @brief Chameleon zsyrk OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, +void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { - (void)nb; - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); -#pragma omp task firstprivate(uplo, trans, n, k, alpha, ptrA, lda, beta, ptrC, ldc) depend(in:ptrA[0]) depend(inout:ptrC[0]) - CORE_zsyrk(uplo, trans, + ( void )nb; + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); +#pragma omp task firstprivate( uplo, trans, n, k, alpha, tileA, beta, tileC ) depend( in:tileA[0] ) depend( inout:tileC[0] ) + TCORE_zsyrk( uplo, trans, n, k, - alpha, ptrA, lda, - beta, ptrC, ldc); + alpha, tileA, + beta, tileC ); } diff --git a/runtime/openmp/codelets/codelet_zsyssq.c b/runtime/openmp/codelets/codelet_zsyssq.c index 32a7dc9d52775570d9575755b8d5c05d53945b70..e767eba613cd5394594b997ac295598e698b2831 
100644 --- a/runtime/openmp/codelets/codelet_zsyssq.c +++ b/runtime/openmp/codelets/codelet_zsyssq.c @@ -2,33 +2,31 @@ * * @file openmp/codelet_zsyssq.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zsyssq StarPU codelet + * @brief Chameleon zsyssq OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrSCALESUMSQ = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); -#pragma omp task firstprivate(storev, uplo, n, ptrA, lda, ptrSCALESUMSQ) depend(in:ptrA[0]) depend(inout:ptrSCALESUMSQ[0]) - CORE_zsyssq( storev, uplo, n, ptrA, lda, ptrSCALESUMSQ ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileSCALESUMSQ = SCALESUMSQ->get_blktile( SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); +#pragma omp task firstprivate( storev, uplo, n, tileA, tileSCALESUMSQ ) depend( in:tileA[0] ) depend( inout:tileSCALESUMSQ[0] ) + TCORE_zsyssq( storev, uplo, n, tileA, tileSCALESUMSQ ); } diff --git a/runtime/openmp/codelets/codelet_zsytrf_nopiv.c b/runtime/openmp/codelets/codelet_zsytrf_nopiv.c index 3fb6bb2c4f2fe51b72c4b4ea729a9649fc867fec..9f6debd57eb47570118cc317678aabbcad913f10 100644 --- 
a/runtime/openmp/codelets/codelet_zsytrf_nopiv.c +++ b/runtime/openmp/codelets/codelet_zsytrf_nopiv.c @@ -2,36 +2,30 @@ * * @file openmp/codelet_zsytrf_nopiv.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zsytrf_nopiv StarPU codelet + * @brief Chameleon zsytrf_nopiv OpenMP codelet * * @version 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @author Florent Pruvost - * @author Marc Sergent - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -void INSERT_TASK_zsytrf_nopiv(const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo) +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, + cham_uplo_t uplo, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + int iinfo ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, n, ptrA, lda) depend(inout:ptrA[0]) - CORE_zsytf2_nopiv(uplo, n, ptrA, lda); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, n, tileA ) depend( inout:tileA[0] ) + TCORE_zsytf2_nopiv( uplo, n, tileA ); } diff --git a/runtime/openmp/codelets/codelet_ztplqt.c b/runtime/openmp/codelets/codelet_ztplqt.c index d35487b004ef23d4aa2ecc47884dece306387762..db708143e4c1d195074b1d753634179460790222 100644 --- a/runtime/openmp/codelets/codelet_ztplqt.c +++ b/runtime/openmp/codelets/codelet_ztplqt.c @@ -2,42 +2,41 @@ * * @file openmp/codelet_ztplqt.c * - * @copyright 2009-2016 The University of Tennessee and The 
University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon ztplqt StarPU codelet + * @brief Chameleon ztplqt OpenMP codelet * * @version 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> s d c * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0]) +#pragma omp task firstprivate( ws_size, M, N, L, ib, tileA, tileB, tileT ) depend( inout:tileA[0], tileB[0] ) depend( out:tileT[0] ) { - CHAMELEON_Complex64_t work[ws_size]; + CHAMELEON_Complex64_t work[ws_size]; - CORE_zlaset( ChamUpperLower, ib, M, 0., 0., ptrT, ldt ); - CORE_ztplqt( M, N, L, ib, - ptrA, lda, ptrB, ldb, ptrT, ldt, work ); + TCORE_zlaset( ChamUpperLower, ib, M, 0., 0., tileT ); + TCORE_ztplqt( M, N, L, ib, + tileA, tileB, tileT, work ); } } diff --git a/runtime/openmp/codelets/codelet_ztpmlqt.c 
b/runtime/openmp/codelets/codelet_ztpmlqt.c index 5a131e823f26abbc4e008a8278c39307fdbd96ff..06fbb30a4a3e5a5dcf7355b9b56d3fdfd52f4473 100644 --- a/runtime/openmp/codelets/codelet_ztpmlqt.c +++ b/runtime/openmp/codelets/codelet_ztpmlqt.c @@ -2,40 +2,40 @@ * * @file openmp/codelet_ztpmlqt.c * - * @copyright 2009-2016 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * - * @brief Chameleon ztpmlqt StarPU codelet + * @brief Chameleon ztpmlqt OpenMP codelet * * @version 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> s d c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, side, trans, M, N, K, L, ib, ptrV, 
ldv, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) +#pragma omp task firstprivate( ws_size, side, trans, M, N, K, L, ib, tileV, tileT, tileA, tileB ) depend( in:tileV[0], tileT[0] ) depend( inout:tileA[0], tileB[0] ) { CHAMELEON_Complex64_t work[ws_size]; - CORE_ztpmlqt( side, trans, M, N, K, L, ib, - ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, work ); + TCORE_ztpmlqt( side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, work ); } } diff --git a/runtime/openmp/codelets/codelet_ztpmqrt.c b/runtime/openmp/codelets/codelet_ztpmqrt.c index a281b35cca6d3c16611f33754f47683349e66ec3..33a746216cc69c397110cbeac30b2c9181706473 100644 --- a/runtime/openmp/codelets/codelet_ztpmqrt.c +++ b/runtime/openmp/codelets/codelet_ztpmqrt.c @@ -2,40 +2,40 @@ * * @file openmp/codelet_ztpmqrt.c * - * @copyright 2009-2016 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* - * @brief Chameleon ztpmqrt StarPU codelet + * @brief Chameleon ztpmqrt OpenMP codelet * * @version 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> s d c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, side, trans, M, N, K, L, ib, nb, ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrV[0], ptrT[0]) depend(inout:ptrA[0], ptrB[0]) +#pragma omp task firstprivate( ws_size, side, trans, M, N, K, L, ib, nb, tileV, tileT, tileA, tileB ) depend( in:tileV[0], tileT[0] ) depend( inout:tileA[0], tileB[0] ) { CHAMELEON_Complex64_t tmp[ws_size]; - CORE_ztpmqrt( side, trans, M, N, K, L, ib, - ptrV, ldv, ptrT, ldt, ptrA, lda, ptrB, ldb, tmp ); + TCORE_ztpmqrt( side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, tmp ); } } diff --git 
a/runtime/openmp/codelets/codelet_ztpqrt.c b/runtime/openmp/codelets/codelet_ztpqrt.c index 13ce6a8f5ddf06db09483faf1be2f71f41ada1f2..bba9bfea39ba2d2c973521cec1c1edb21f164dde 100644 --- a/runtime/openmp/codelets/codelet_ztpqrt.c +++ b/runtime/openmp/codelets/codelet_ztpqrt.c @@ -2,41 +2,41 @@ * * @file openmp/codelet_ztpqrt.c * - * @copyright 2009-2016 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon ztpqrt StarPU codelet + * @brief Chameleon ztpqrt OpenMP codelet * * @version 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> s d c * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0]) +#pragma omp task firstprivate( ws_size, M, N, L, ib, tileT, tileA, tileB ) depend( inout:tileA[0], tileB[0] ) depend( out:tileT[0] ) { CHAMELEON_Complex64_t 
tmp[ws_size]; - CORE_zlaset( ChamUpperLower, ib, N, 0., 0., ptrT, ldt ); - CORE_ztpqrt( M, N, L, ib, - ptrA, lda, ptrB, ldb, ptrT, ldt, tmp ); + TCORE_zlaset( ChamUpperLower, ib, N, 0., 0., tileT ); + TCORE_ztpqrt( M, N, L, ib, + tileA, tileB, tileT, tmp ); } } diff --git a/runtime/openmp/codelets/codelet_ztradd.c b/runtime/openmp/codelets/codelet_ztradd.c index dbf8ba72f581e67a969c75c1ca434c23d21c5f5d..18dc9cc2ade2e47b9d25ffdac95f7d12bfad2e75 100644 --- a/runtime/openmp/codelets/codelet_ztradd.c +++ b/runtime/openmp/codelets/codelet_ztradd.c @@ -2,94 +2,32 @@ * * @file openmp/codelet_ztradd.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon ztradd StarPU codelet + * @brief Chameleon ztradd OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" -/** - ****************************************************************************** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * @brief Adds two trapezoidal matrices together as in PBLAS pzgeadd. - * - * B <- alpha * op(A) + beta * B, - * - * where op(X) = X, X', or conj(X') - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies the shape of A and B matrices: - * = ChamUpperLower: A and B are general matrices. - * = ChamUpper: op(A) and B are upper trapezoidal matrices. - * = ChamLower: op(A) and B are lower trapezoidal matrices. 
- * - * @param[in] trans - * Specifies whether the matrix A is non-transposed, transposed, or - * conjugate transposed - * = ChamNoTrans: op(A) = A - * = ChamTrans: op(A) = A' - * = ChamConjTrans: op(A) = conj(A') - * - * @param[in] M - * Number of rows of the matrices op(A) and B. - * - * @param[in] N - * Number of columns of the matrices op(A) and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size LDA-by-N, if trans = ChamNoTrans, LDA-by-M - * otherwise. - * - * @param[in] LDA - * Leading dimension of the array A. LDA >= max(1,k), with k=M, if - * trans = ChamNoTrans, and k=N otherwise. - * - * @param[in] beta - * Scalar factor of B. - * - * @param[in,out] B - * Matrix of size LDB-by-N. - * On exit, B = alpha * op(A) + beta * B - * - * @param[in] LDB - * Leading dimension of the array B. LDB >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); -#pragma omp task firstprivate(uplo, trans, m, n, alpha, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_ztradd(uplo, trans, m, n, alpha, ptrA, lda, beta, ptrB, ldb); +#pragma omp task firstprivate( uplo, trans, m, n, alpha, tileA, tileB ) 
depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_ztradd( uplo, trans, m, n, alpha, tileA, beta, tileB ); } diff --git a/runtime/openmp/codelets/codelet_ztrasm.c b/runtime/openmp/codelets/codelet_ztrasm.c index 715dc89a1d91da6444062094fe1fa75120168833..d3392d90e716835424c32b2d46ae9e3deca6e6b3 100644 --- a/runtime/openmp/codelets/codelet_ztrasm.c +++ b/runtime/openmp/codelets/codelet_ztrasm.c @@ -2,33 +2,31 @@ * * @file openmp/codelet_ztrasm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon ztrasm StarPU codelet + * @brief Chameleon ztrasm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -void INSERT_TASK_ztrasm(const RUNTIME_option_t *options, +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_ztrasm( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrB = RTBLKADDR(B, double, Bm, Bn); -#pragma omp task firstprivate(storev, uplo, diag, M, N, ptrA, lda, ptrB) depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_ztrasm(storev, uplo, diag, M, N, ptrA, lda, ptrB); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( storev, uplo, diag, M, N, tileA, tileB ) depend( in:tileA[0] ) 
depend( inout:tileB[0] ) + TCORE_ztrasm( storev, uplo, diag, M, N, tileA, tileB->mat ); } diff --git a/runtime/openmp/codelets/codelet_ztrmm.c b/runtime/openmp/codelets/codelet_ztrmm.c index 248865be1a32d8963fc56b531e9a77598a6ab245..c7a329359ff607417812294a2420305f09948c6c 100644 --- a/runtime/openmp/codelets/codelet_ztrmm.c +++ b/runtime/openmp/codelets/codelet_ztrmm.c @@ -2,47 +2,36 @@ * * @file openmp/codelet_ztrmm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon ztrmm StarPU codelet + * @brief Chameleon ztrmm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(side, uplo, transA, diag, m, n, alpha, ptrA, lda, ptrB, ldb) 
depend(in:ptrA[0]) depend(inout:ptrB[0]) - CORE_ztrmm(side, uplo, + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( side, uplo, transA, diag, m, n, alpha, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] ) + TCORE_ztrmm( side, uplo, transA, diag, m, n, - alpha, ptrA, lda, - ptrB, ldb); + alpha, tileA, + tileB ); } diff --git a/runtime/openmp/codelets/codelet_ztrsm.c b/runtime/openmp/codelets/codelet_ztrsm.c index 062bfc51c012d35036a3511680ee84d3429f2f5a..bc02e562a42124bc515ab7f005e6bf76f6d1c7dd 100644 --- a/runtime/openmp/codelets/codelet_ztrsm.c +++ b/runtime/openmp/codelets/codelet_ztrsm.c @@ -2,49 +2,36 @@ * * @file openmp/codelet_ztrsm.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon ztrsm StarPU codelet + * @brief Chameleon ztrsm OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, +void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + 
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); -#pragma omp task firstprivate(side, uplo, transA, diag, m, n, alpha, ptrA, lda, ptrB, ldb) depend(in:ptrA[0]) depend(inout: ptrB[0]) - CORE_ztrsm(side, uplo, + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); +#pragma omp task firstprivate( side, uplo, transA, diag, m, n, alpha, tileA, tileB ) depend( in:tileA[0] ) depend( inout: tileB[0] ) + TCORE_ztrsm( side, uplo, transA, diag, m, n, - alpha, ptrA, lda, - ptrB, ldb); + alpha, tileA, + tileB ); } diff --git a/runtime/openmp/codelets/codelet_ztrssq.c b/runtime/openmp/codelets/codelet_ztrssq.c index 38a69fca2afba6ad56389152e6f88f30e556a7d3..de53dfe7cc55cfce797c217b628ad2b8d6109c7f 100644 --- a/runtime/openmp/codelets/codelet_ztrssq.c +++ b/runtime/openmp/codelets/codelet_ztrssq.c @@ -2,34 +2,32 @@ * * @file openmp/codelet_ztrssq.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon ztrssq StarPU codelet + * @brief Chameleon ztrssq OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 + * @author Philippe Virouleau * @author Mathieu Faverge - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" +#include "coreblas/coreblas_ztile.h" void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *W, int Wm, int Wn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrSCALESUMSQ = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); -#pragma omp task firstprivate(uplo, diag, m, n, ptrA, lda, SCALESUMSQ) depend(in:ptrA[0]) depend(inout:ptrSCALESUMSQ[0]) - CORE_ztrssq( uplo, diag, m, n, ptrA, lda, &ptrSCALESUMSQ[0], &ptrSCALESUMSQ[1]); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileW = W->get_blktile( W, Wm, Wn ); +#pragma omp task firstprivate( uplo, diag, m, n, tileA, tileW ) depend( in:tileA[0] ) depend( inout:tileW[0] ) + TCORE_ztrssq( uplo, diag, m, n, tileA, tileW ); } diff --git a/runtime/openmp/codelets/codelet_ztrtri.c b/runtime/openmp/codelets/codelet_ztrtri.c index ea113cdaeeea8b359a3f32e758c0eeb56cbceee5..8c0d9b8b60e38f1c1a78055b8ab4457a66d9ceb8 100644 --- a/runtime/openmp/codelets/codelet_ztrtri.c +++ b/runtime/openmp/codelets/codelet_ztrtri.c @@ -2,42 +2,31 @@ * * @file openmp/codelet_ztrtri.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon ztrtri StarPU codelet + * @brief Chameleon ztrtri OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Julien Langou - * @author Henricus Bouwmeester + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_ztrtri(const RUNTIME_option_t *options, +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - int iinfo) + const CHAM_desc_t *A, int Am, int An, + int iinfo ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); -#pragma omp task firstprivate(uplo, diag, n, ptrA, lda, iinfo) depend(inout:ptrA[0]) - CORE_ztrtri(uplo, diag, n, ptrA, lda, &iinfo); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); +#pragma omp task firstprivate( uplo, diag, n, tileA, iinfo ) depend( inout:tileA[0] ) + TCORE_ztrtri( uplo, diag, n, tileA, &iinfo ); } diff --git a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c index 4bd4eec9306c27472776e856597e850fe5d4ac14..4c5ed64a58ca44edf3c6594c6a654b40dcde8ade 100644 --- a/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/openmp/codelets/codelet_ztsmlq_hetra1.c @@ -2,50 +2,43 @@ * * @file openmp/codelet_ztsmlq_hetra1.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon ztsmlq_hetra1 StarPU codelet + * @brief Chameleon ztsmlq_hetra1 OpenMP codelet * * @version 0.9.2 - * @author Hatem Ltaief - * @author Mathieu Faverge - * @author Azzam Haidar * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options, +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { - CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); - CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); + CHAM_tile_t *tileA1 = A1->get_blktile( A1, A1m, A1n ); + CHAM_tile_t *tileA2 = A2->get_blktile( A2, A2m, A2n ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); int ldwork = side == ChamLeft ? 
ib : nb; int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) +#pragma omp task firstprivate( ws_size, side, trans, m1, n1, m2, n2, k, ib, tileA1, tileA2, tileV, tileT, ldwork ) depend( inout:tileA1[0], tileA2[0] ) depend( in:tileT[0], tileV[0] ) { CHAMELEON_Complex64_t work[ws_size]; - CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, - ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + TCORE_ztsmlq_hetra1( side, trans, m1, n1, m2, n2, k, + ib, tileA1, tileA2, tileV, tileT, work, ldwork ); } } diff --git a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c index 1a655281b25777187c8eaa7d0997cc8dcd9b0bb4..97f84c5ad13ccc1ef9f2f724c5420eb225e58838 100644 --- a/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c +++ b/runtime/openmp/codelets/codelet_ztsmqr_hetra1.c @@ -2,50 +2,43 @@ * * @file openmp/codelet_ztsmqr_hetra1.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon ztsmqr_hetra1 StarPU codelet + * @brief Chameleon ztsmqr_hetra1 OpenMP codelet * * @version 0.9.2 - * @author Hatem Ltaief - * @author Mathieu Faverge - * @author Azzam Haidar * @author Philippe Virouleau - * @date 2018-06-15 + * @author Mathieu Faverge + * @date 2019-11-19 * @precisions normal z -> c d s * */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - */ -void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options, +#include "coreblas/coreblas_ztile.h" + +void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { - CHAMELEON_Complex64_t *ptrA1 = RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n); - CHAMELEON_Complex64_t *ptrA2 = RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrV = RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn); + CHAM_tile_t *tileA1 = A1->get_blktile( A1, A1m, A1n ); + CHAM_tile_t *tileA2 = A2->get_blktile( A2, A2m, A2n ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); int ldwork = side == ChamLeft ? 
ib : nb; int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, side, trans, m1, n1, m2, n2, k, ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, ldwork) depend(inout:ptrA1[0], ptrA2[0]) depend(in:ptrT[0], ptrV[0]) +#pragma omp task firstprivate( ws_size, side, trans, m1, n1, m2, n2, k, ib, tileA1, tileA2, tileV, tileT, ldwork ) depend( inout:tileA1[0], tileA2[0] ) depend( in:tileT[0], tileV[0] ) { CHAMELEON_Complex64_t work[ws_size]; - CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, - ib, ptrA1, lda1, ptrA2, lda2, ptrV, ldv, ptrT, ldt, work, ldwork); + TCORE_ztsmqr_hetra1( side, trans, m1, n1, m2, n2, k, + ib, tileA1, tileA2, tileV, tileT, work, ldwork ); } } diff --git a/runtime/openmp/codelets/codelet_ztstrf.c b/runtime/openmp/codelets/codelet_ztstrf.c index 2db3931ec0967bb9eca6067f665c7950afa29ccc..da5e42a9c7f57b8454b5230e40be2fb0e26d26c2 100644 --- a/runtime/openmp/codelets/codelet_ztstrf.c +++ b/runtime/openmp/codelets/codelet_ztstrf.c @@ -2,111 +2,39 @@ * * @file openmp/codelet_ztstrf.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. 
* *** * - * @brief Chameleon ztstrf StarPU codelet + * @brief Chameleon ztstrf OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_ztstrf computes an LU factorization of a complex matrix formed - * by an upper triangular NB-by-N tile U on top of a M-by-N tile A - * using partial pivoting with row interchanges. - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] NB - * - * @param[in,out] U - * On entry, the NB-by-N upper triangular tile. - * On exit, the new factor U from the factorization - * - * @param[in] LDU - * The leading dimension of the array U. LDU >= max(1,NB). - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factor L from the factorization - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[in,out] L - * On entry, the IB-by-N lower triangular tile. - * On exit, the interchanged rows form the tile A in case of pivoting. - * - * @param[in] LDL - * The leading dimension of the array L. LDL >= max(1,IB). - * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile U was interchanged with row IPIV(i) of the tile A. 
- * - * @param[in,out] WORK - * - * @param[in] LDWORK - * The dimension of the array WORK. - * - * @param[out] INFO - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_ztstrf(const RUNTIME_option_t *options, +void INSERT_TASK_ztstrf( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *U, int Um, int Un, int ldu, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *U, int Um, int Un, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, - cham_bool_t check_info, int iinfo) + cham_bool_t check_info, int iinfo ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrU = RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un); - CHAMELEON_Complex64_t *ptrL = RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileU = U->get_blktile( U, Um, Un ); + CHAM_tile_t *tileL = L->get_blktile( L, Lm, Ln ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, iinfo) depend(inout:ptrA[0], ptrU[0], ptrL[0]) +#pragma omp task firstprivate( ws_size, m, n, ib, nb, tileU, tileA, tileL, IPIV, iinfo ) depend( inout:tileA[0], tileU[0], tileL[0] ) { CHAMELEON_Complex64_t work[ws_size]; - CORE_ztstrf(m, n, ib, nb, ptrU, ldu, ptrA, lda, ptrL, ldl, IPIV, work, nb, &iinfo); + TCORE_ztstrf( m, n, ib, nb, tileU, tileA, tileL, IPIV, work, nb, &iinfo ); } } diff --git 
a/runtime/openmp/codelets/codelet_zunmlq.c b/runtime/openmp/codelets/codelet_zunmlq.c index 92d6e71f8f34c171100880d747fb4f605e50baa3..9b62b78f49e8c8386e391b8c8853db683c05b043 100644 --- a/runtime/openmp/codelets/codelet_zunmlq.c +++ b/runtime/openmp/codelets/codelet_zunmlq.c @@ -2,129 +2,39 @@ * * @file openmp/codelet_zunmlq.c * - * @copyright 2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zunmlq StarPU codelet + * @brief Chameleon zunmlq OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak - * @author Dulceneia Becker + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zunmlq overwrites the general complex M-by-N tile C with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * C C * Q - * TRANS = 'C': Q^H * C C * Q^H - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(k) . . . H(2) H(1) - * - * as returned by CORE_zgelqt. Q is of order M if SIDE = 'L' and of order N - * if SIDE = 'R'. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q^H from the Left; - * @arg ChamRight : apply Q or Q^H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : Transpose, apply Q^H. - * - * @param[in] M - * The number of rows of the tile C. M >= 0. 
- * - * @param[in] N - * The number of columns of the tile C. N >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * If SIDE = ChamLeft, M >= K >= 0; - * if SIDE = ChamRight, N >= K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] A - * Dimension: (LDA,M) if SIDE = ChamLeft, - * (LDA,N) if SIDE = ChamRight, - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_zgelqt in the first k rows of its array argument A. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,K). - * - * @param[in] T - * The IB-by-K triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[in,out] C - * On entry, the M-by-N tile C. - * On exit, C is overwritten by Q*C or Q^T*C or C*Q^T or C*Q. - * - * @param[in] LDC - * The leading dimension of the array C. LDC >= max(1,M). - * - * @param[in,out] WORK - * On exit, if INFO = 0, WORK(1) returns the optimal LDWORK. - * - * @param[in] LDWORK - * The dimension of the array WORK. - * If SIDE = ChamLeft, LDWORK >= max(1,N); - * if SIDE = ChamRight, LDWORK >= max(1,M). 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zunmlq(const RUNTIME_option_t *options, +void INSERT_TASK_zunmlq( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) +#pragma omp task firstprivate( ws_size, side, trans, m, n, k, ib, nb, tileA, tileT, tileC ) depend( in:tileA[0], tileT[0] ) depend( inout:tileC[0] ) { CHAMELEON_Complex64_t work[ws_size]; - CORE_zunmlq(side, trans, m, n, k, ib, - ptrA, lda, ptrT, ldt, ptrC, ldc, work, nb); + TCORE_zunmlq( side, trans, m, n, k, ib, + tileA, tileT, tileC, work, nb ); } } diff --git a/runtime/openmp/codelets/codelet_zunmqr.c b/runtime/openmp/codelets/codelet_zunmqr.c index 66aa62b5dbc6e8ce2d9a74b91ce1fd5ae1d088f8..e8af9ea663eab1febd3a250080fb9305cdda1afb 100644 --- a/runtime/openmp/codelets/codelet_zunmqr.c +++ b/runtime/openmp/codelets/codelet_zunmqr.c @@ -2,129 +2,39 @@ * * @file openmp/codelet_zunmqr.c * - * @copyright 
2009-2014 The University of Tennessee and The University of - * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. * *** * - * @brief Chameleon zunmqr StarPU codelet + * @brief Chameleon zunmqr OpenMP codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Hatem Ltaief - * @author Jakub Kurzak + * @author Philippe Virouleau * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede - * @date 2018-06-15 + * @date 2019-11-19 * @precisions normal z -> c d s * */ - #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * CORE_zunmqr overwrites the general complex M-by-N tile C with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * C C * Q - * TRANS = 'C': Q^H * C C * Q^H - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zgeqrt. Q is of order M if SIDE = 'L' and of order N - * if SIDE = 'R'. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q^H from the Left; - * @arg ChamRight : apply Q or Q^H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : Transpose, apply Q^H. - * - * @param[in] M - * The number of rows of the tile C. M >= 0. - * - * @param[in] N - * The number of columns of the tile C. N >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * If SIDE = ChamLeft, M >= K >= 0; - * if SIDE = ChamRight, N >= K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. 
- * - * @param[in] A - * Dimension: (LDA,K) - * The i-th column must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_zgeqrt in the first k columns of its array argument A. - * - * @param[in] LDA - * The leading dimension of the array A. - * If SIDE = ChamLeft, LDA >= max(1,M); - * if SIDE = ChamRight, LDA >= max(1,N). - * - * @param[in] T - * The IB-by-K triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[in,out] C - * On entry, the M-by-N tile C. - * On exit, C is overwritten by Q*C or Q^T*C or C*Q^T or C*Q. - * - * @param[in] LDC - * The leading dimension of the array C. LDC >= max(1,M). - * - * @param[in,out] WORK - * On exit, if INFO = 0, WORK(1) returns the optimal LDWORK. - * - * @param[in] LDWORK - * The dimension of the array WORK. - * If SIDE = ChamLeft, LDWORK >= max(1,N); - * if SIDE = ChamRight, LDWORK >= max(1,M). 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ +#include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zunmqr(const RUNTIME_option_t *options, +void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); - CHAMELEON_Complex64_t *ptrC = RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); int ws_size = options->ws_wsize; -#pragma omp task firstprivate(ws_size, side, trans, m, n, k, ib, nb, ptrA, lda, ptrT, ldt, ptrC, ldc) depend(in:ptrA[0], ptrT[0]) depend(inout:ptrC[0]) +#pragma omp task firstprivate( ws_size, side, trans, m, n, k, ib, nb, tileA, tileT, tileC ) depend( in:tileA[0], tileT[0] ) depend( inout:tileC[0] ) { CHAMELEON_Complex64_t tmp[ws_size]; - CORE_zunmqr(side, trans, m, n, k, ib, - ptrA, lda, ptrT, ldt, ptrC, ldc, tmp, nb); + TCORE_zunmqr( side, trans, m, n, k, ib, + tileA, tileT, tileC, tmp, nb ); } } diff --git a/runtime/openmp/control/runtime_descriptor.c b/runtime/openmp/control/runtime_descriptor.c index 5fefecb6575d422ce91b03fbbddafe2682e8f3f9..222128c7e4816b3c10e61174fbf096ac41e195ee 100644 --- a/runtime/openmp/control/runtime_descriptor.c +++ b/runtime/openmp/control/runtime_descriptor.c @@ -102,5 +102,5 @@ void RUNTIME_data_migrate( const 
RUNTIME_sequence_t *sequence, void *RUNTIME_data_getaddr( const CHAM_desc_t *desc, int m, int n ) { - return desc->get_blkaddr( desc, m, n ); + return desc->get_blktile( desc, m, n ); } diff --git a/runtime/openmp/include/chameleon_openmp.h b/runtime/openmp/include/chameleon_openmp.h index 2e93fe8f01fbf44167c694d428b9347e74a72add..c8bbbe4efd69088aeaf411853e22791b32afd30a 100644 --- a/runtime/openmp/include/chameleon_openmp.h +++ b/runtime/openmp/include/chameleon_openmp.h @@ -20,14 +20,7 @@ #define _chameleon_openmp_h_ #include "coreblas.h" - #include "control/common.h" #include <omp.h> -/* - * Access to block pointer and leading dimension - */ -#define RTBLKADDR( desc, type, m, n ) ( (type*)RUNTIME_data_getaddr( desc, m, n ) ) - - #endif /* _chameleon_openmp_h_ */ diff --git a/runtime/parsec/codelets/codelet_dzasum.c b/runtime/parsec/codelets/codelet_dzasum.c index e0faa8dd277a80e58d01c6f1e133f1664113ae44..7f256d1117ea4906a478f1fe45d60ccbc91e7000 100644 --- a/runtime/parsec/codelets/codelet_dzasum.c +++ b/runtime/parsec/codelets/codelet_dzasum.c @@ -44,10 +44,11 @@ CORE_dzasum_parsec( parsec_execution_stream_t *context, void INSERT_TASK_dzasum(const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_dzasum_parsec, options->priority, "dzasum", @@ -56,7 +57,7 @@ void INSERT_TASK_dzasum(const RUNTIME_option_t *options, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, double, Bm, Bn ), INOUT | AFFINITY, 
PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zbuild.c b/runtime/parsec/codelets/codelet_zbuild.c index bd85b99b06b15c24513c6dfab51b4d9990023051..6118017b2abb0ae9e9c9db4cd3e8f45d462fb31c 100644 --- a/runtime/parsec/codelets/codelet_zbuild.c +++ b/runtime/parsec/codelets/codelet_zbuild.c @@ -43,10 +43,11 @@ CORE_zbuild_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zbuild( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, void *user_data, void* user_build_callback ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int row_min, row_max, col_min, col_max; row_min = Am*A->mb ; row_max = Am == A->mt-1 ? A->m-1 : row_min+A->mb-1 ; @@ -60,7 +61,7 @@ void INSERT_TASK_zbuild( const RUNTIME_option_t *options, sizeof(int), &col_min, VALUE, sizeof(int), &col_max, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | OUTPUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(void*), &user_data, VALUE, sizeof(void*), &user_build_callback, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zgeadd.c b/runtime/parsec/codelets/codelet_zgeadd.c index da7eb2de21b946bd5e57a414d77202082b433a83..e1246e8af8d8bbbbb0e3a37699fba181217e09aa 100644 --- a/runtime/parsec/codelets/codelet_zgeadd.c +++ b/runtime/parsec/codelets/codelet_zgeadd.c @@ -12,8 +12,6 @@ * @brief Chameleon zgeadd PaRSEC codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Reazul Hoque * @date 2015-11-04 @@ -102,10 +100,12 @@ CORE_zgeadd_parsec( parsec_execution_stream_t *context, */ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t 
alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgeadd_parsec, options->priority, "geadd", @@ -114,10 +114,10 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zgelqt.c b/runtime/parsec/codelets/codelet_zgelqt.c index 38cc714d602728fe10033cb05744151ddb924800..966d919fe4f59f6ecefd93a199263d01f28dc8f7 100644 --- a/runtime/parsec/codelets/codelet_zgelqt.c +++ b/runtime/parsec/codelets/codelet_zgelqt.c @@ -21,66 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgelqt - computes a LQ factorization of a complex M-by-N tile A: A = L * Q. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). 
- * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A(i,i+1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-min(M,N) lower trapezoidal tile L (L is - * lower triangular if M <= N); the elements above the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ static inline int CORE_zgelqt_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -107,10 +47,12 @@ CORE_zgelqt_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgelqt_parsec, options->priority, "gelqt", @@ -118,9 +60,9 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zgemm.c b/runtime/parsec/codelets/codelet_zgemm.c index c3db187a65fdcde80fa408b8efac051f0d4c0c6a..267033bc73e5bb8c4c795fd6954343db3abcac86 100644 --- a/runtime/parsec/codelets/codelet_zgemm.c +++ b/runtime/parsec/codelets/codelet_zgemm.c @@ -59,11 +59,14 @@ CORE_zgemm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgemm( const RUNTIME_option_t *options, 
cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgemm_parsec, options->priority, "Gemm", @@ -74,12 +77,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INPUT, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zgeqrt.c b/runtime/parsec/codelets/codelet_zgeqrt.c index ca6f169c943654121c7de5fd8b3e75bb5d5507fe..9a514361ba8f26f5b126bef4410af55310fdac88 100644 --- a/runtime/parsec/codelets/codelet_zgeqrt.c +++ b/runtime/parsec/codelets/codelet_zgeqrt.c @@ -108,10 +108,12 @@ CORE_zgeqrt_parsec ( parsec_execution_stream_t *context, void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, int m, int n, int 
ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgeqrt_parsec, options->priority, "geqrt", @@ -119,9 +121,9 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zgessm.c b/runtime/parsec/codelets/codelet_zgessm.c index 0515ccec7c52469bb67e72eb67c6f5defff39c0c..3893d377021d5ddf4614620355b45cccf7bc9a83 100644 --- a/runtime/parsec/codelets/codelet_zgessm.c +++ b/runtime/parsec/codelets/codelet_zgessm.c @@ -92,11 +92,14 @@ CORE_zgessm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgessm(const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - const CHAM_desc_t *D, int Dm, int Dn, int ldd, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *L, int Lm, int Ln, + const CHAM_desc_t *D, int Dm, int Dn, + const CHAM_desc_t *A, int Am, int An) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileL = L->get_blktile( L, Lm, Ln ); + CHAM_tile_t 
*tileD = D->get_blktile( D, Dm, Dn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgessm_parsec, options->priority, "gessm", @@ -106,11 +109,11 @@ void INSERT_TASK_zgessm(const RUNTIME_option_t *options, sizeof(int), &ib, VALUE, sizeof(int*), &IPIV, VALUE, PASSED_BY_REF, RTBLKADDR( L, CHAMELEON_Complex64_t, Lm, Ln ), chameleon_parsec_get_arena_index( L ) | INPUT, - sizeof(int), &ldl, VALUE, + sizeof(int), &(tileL->ld), VALUE, PASSED_BY_REF, RTBLKADDR( D, CHAMELEON_Complex64_t, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, - sizeof(int), &ldd, VALUE, + sizeof(int), &(tileD->ld), VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zgessq.c b/runtime/parsec/codelets/codelet_zgessq.c index 4cd62d842671401caf7a0422df7b1ac1d3801947..4a74f16a27825b0f1aaae29408f4c97de9c5bb3e 100644 --- a/runtime/parsec/codelets/codelet_zgessq.c +++ b/runtime/parsec/codelets/codelet_zgessq.c @@ -43,10 +43,11 @@ CORE_zgessq_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgessq( const RUNTIME_option_t *options, cham_store_t storev, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgessq_parsec, options->priority, "gessq", @@ -54,7 +55,7 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, 
VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zgetrf.c b/runtime/parsec/codelets/codelet_zgetrf.c index 8154ca2cf70a8e06c7449fe0d809b354e9af308d..3f02a3d4549cbe94a50a62dd401a326191dcaa60 100644 --- a/runtime/parsec/codelets/codelet_zgetrf.c +++ b/runtime/parsec/codelets/codelet_zgetrf.c @@ -51,18 +51,19 @@ CORE_zgetrf_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgetrf(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int *IPIV, cham_bool_t check_info, int iinfo) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgetrf_parsec, options->priority, "getrf", sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int)*nb, IPIV, SCRATCH, sizeof(cham_bool_t), &check_info, VALUE, sizeof(int), &iinfo, VALUE, diff --git a/runtime/parsec/codelets/codelet_zgetrf_incpiv.c b/runtime/parsec/codelets/codelet_zgetrf_incpiv.c index d3128c04947402852262d453efd3f818d9a68eb3..d328f2e774e07f1ed2fb7366585098ce08f9ab7a 100644 --- a/runtime/parsec/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/parsec/codelets/codelet_zgetrf_incpiv.c @@ -21,59 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgetrf_incpiv computes an LU factorization of a general M-by-N tile A - * using partial pivoting with row int erchanges. 
- * - * The factorization has the form - * - * A = P * L * U - * - * where P is a permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile was int erchanged with row IPIV(i). - * - * @param[out] INFO - * See returned value. - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. 
- * - */ static inline int CORE_zgetrf_incpiv_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -105,12 +52,13 @@ CORE_zgetrf_incpiv_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgetrf_incpiv_parsec, options->priority, "getrf_inc", @@ -118,7 +66,7 @@ void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int*), &IPIV, VALUE, sizeof(int), &check_info, VALUE, sizeof(int), &iinfo, VALUE, @@ -129,6 +77,5 @@ void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options, (void)L; (void)Lm; (void)Ln; - (void)ldl; (void)nb; } diff --git a/runtime/parsec/codelets/codelet_zgetrf_nopiv.c b/runtime/parsec/codelets/codelet_zgetrf_nopiv.c index 2ff4e9a20db1acdc7bd10d4d1a94d5d045c53f05..867a24b9d1e866c3e43e8cb3197ba69d268b8eb4 100644 --- a/runtime/parsec/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/parsec/codelets/codelet_zgetrf_nopiv.c @@ -21,52 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgetrf_nopiv computes an LU factorization of a general diagonal - * dominant M-by-N matrix A witout pivoting. 
- * - * The factorization has the form - * A = L * U - * where L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 3 BLAS version of the algorithm. - * WARNING: Your matrix need to be diagonal dominant if you want to call this - * routine safely. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] IB - * The block size to switch between blocked and unblocked code. - * - * @param[in,out] A - * On entry, the M-by-N matrix to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. 
- * - */ static inline int CORE_zgetrf_nopiv_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -96,10 +50,11 @@ CORE_zgetrf_nopiv_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgetrf_nopiv_parsec, options->priority, "getrf_nopiv", @@ -107,7 +62,7 @@ void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &iinfo, VALUE, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, diff --git a/runtime/parsec/codelets/codelet_zgram.c b/runtime/parsec/codelets/codelet_zgram.c index 8d0217cc2552e329b51be45577c71d9399170098..ddba740357c9d521ad2a0d3a9ae28bdf153a14ce 100644 --- a/runtime/parsec/codelets/codelet_zgram.c +++ b/runtime/parsec/codelets/codelet_zgram.c @@ -20,11 +20,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zgegram_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -82,12 +77,15 @@ CORE_zsygram_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zgram( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mt, int nt, - const CHAM_desc_t *Di, int Dim, int Din, int lddi, - const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, 
int Djn, const CHAM_desc_t *D, int Dm, int Dn, - CHAM_desc_t *A, int Am, int An, int lda) + CHAM_desc_t *A, int Am, int An) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileDi = Di->get_blktile( Di, Dim, Din ); + CHAM_tile_t *tileDj = Dj->get_blktile( Dj, Djm, Djn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); double *ptrDi, *ptrDj; /* @@ -105,10 +103,10 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options, sizeof(int), &mt, VALUE, sizeof(int), &nt, VALUE, PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, - sizeof(int), &lddi, VALUE, + sizeof(int), &(tileDi->ld), VALUE, PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); } else { parsec_dtd_taskpool_insert_task( @@ -119,12 +117,12 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options, sizeof(int), &mt, VALUE, sizeof(int), &nt, VALUE, PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT, - sizeof(int), &lddi, VALUE, + sizeof(int), &(tileDi->ld), VALUE, PASSED_BY_REF, RTBLKADDR( Dj, double, Djm, Djn ), chameleon_parsec_get_arena_index( Dj ) | INPUT, - sizeof(int), &lddj, VALUE, + sizeof(int), &(tileDj->ld), VALUE, PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT, PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zhe2ge.c b/runtime/parsec/codelets/codelet_zhe2ge.c index a54fb14aa91bf3b9536b4c5a1c71e60e0e92d9c5..2b8b6ad242309002d51be8031eab5ca78bdf5ddc 100644 --- 
a/runtime/parsec/codelets/codelet_zhe2ge.c +++ b/runtime/parsec/codelets/codelet_zhe2ge.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zhe2ge_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -51,10 +46,12 @@ CORE_zhe2ge_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zhe2ge_parsec, options->priority, "he2ge", @@ -62,9 +59,9 @@ void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT , - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)mb; diff --git a/runtime/parsec/codelets/codelet_zhemm.c b/runtime/parsec/codelets/codelet_zhemm.c index a4ce4f6135f7c28f7c4977b6bdfddd7758bd7091..e3fe539cb74b460000f35ef5580a6d5383256299 100644 --- a/runtime/parsec/codelets/codelet_zhemm.c +++ b/runtime/parsec/codelets/codelet_zhemm.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zhemm_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -58,11 +53,14 @@ CORE_zhemm_parsec( parsec_execution_stream_t 
*context, void INSERT_TASK_zhemm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zhemm_parsec, options->priority, "hemm", @@ -72,12 +70,12 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INPUT, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zher2k.c b/runtime/parsec/codelets/codelet_zher2k.c index 1d4aec6ff21b017366afe3d49caf2d92bd2f5437..10599009c9ca1b76c54f630815f30bd138bc0af8 100644 --- a/runtime/parsec/codelets/codelet_zher2k.c +++ b/runtime/parsec/codelets/codelet_zher2k.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * 
@ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zher2k_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -58,11 +53,14 @@ CORE_zher2k_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zher2k(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zher2k_parsec, options->priority, "her2k", @@ -72,12 +70,12 @@ void INSERT_TASK_zher2k(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INPUT, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(double), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zherfb.c b/runtime/parsec/codelets/codelet_zherfb.c index 489e7d01695346131e187a9020fcab9f97d8bcac..0b6d3d649c1b769e6aa9f00930dfddf967bfe774 100644 --- 
a/runtime/parsec/codelets/codelet_zherfb.c +++ b/runtime/parsec/codelets/codelet_zherfb.c @@ -53,11 +53,14 @@ CORE_zherfb_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zherfb(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zherfb_parsec, options->priority, "herfb", @@ -67,11 +70,11 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options, sizeof(int), &ib, VALUE, sizeof(int), &nb, VALUE, PASSED_BY_REF, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, PASSED_BY_REF, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*2*nb*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zherk.c b/runtime/parsec/codelets/codelet_zherk.c index 4cd3f591b034f7d80934be5392841f0bb1e65578..45822a1e2080b40ec6ac787db28b38ab6fa1f66d 100644 --- a/runtime/parsec/codelets/codelet_zherk.c +++ b/runtime/parsec/codelets/codelet_zherk.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zherk_parsec( 
parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -55,10 +50,12 @@ CORE_zherk_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zherk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, int lda, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zherk_parsec, options->priority, "herk", @@ -68,10 +65,10 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(double), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(double), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zhessq.c b/runtime/parsec/codelets/codelet_zhessq.c index 2229c62eb91c03e52cbc848f482b2702a40a4c0c..5cd16be91efceb988fa1a8d2faf8caad5df5963a 100644 --- a/runtime/parsec/codelets/codelet_zhessq.c +++ b/runtime/parsec/codelets/codelet_zhessq.c @@ -22,10 +22,10 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { INSERT_TASK_zsyssq( options, storev, uplo, n, - A, Am, An, lda, + A, Am, An, SCALESUMSQ, 
SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/parsec/codelets/codelet_zlacpy.c b/runtime/parsec/codelets/codelet_zlacpy.c index 4b5faaf1dee4533c747220e92df54392694e6b10..62b5ffa3682dd7969b1884f2799f3c3935fa13a8 100644 --- a/runtime/parsec/codelets/codelet_zlacpy.c +++ b/runtime/parsec/codelets/codelet_zlacpy.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zlacpyx_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -51,10 +46,12 @@ CORE_zlacpyx_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, int lda, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + int displA, const CHAM_desc_t *A, int Am, int An, + int displB, const CHAM_desc_t *B, int Bm, int Bn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlacpyx_parsec, options->priority, "lacpy", @@ -63,20 +60,20 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &displA, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &displB, VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int 
Bn, int ldb ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, lda, - 0, B, Bm, Bn, ldb ); + 0, A, Am, An, + 0, B, Bm, Bn ); } diff --git a/runtime/parsec/codelets/codelet_zlag2c.c b/runtime/parsec/codelets/codelet_zlag2c.c index 6c65435356d66cc24aa4977c50e12a58d745ae01..535849fc88b9659cb8e99dd45e18fbde6542acc9 100644 --- a/runtime/parsec/codelets/codelet_zlag2c.c +++ b/runtime/parsec/codelets/codelet_zlag2c.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zlag2c_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -48,18 +43,20 @@ CORE_zlag2c_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlag2c(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task(PARSEC_dtd_taskpool, CORE_zlag2c_parsec, "lag2c", sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex32_t, Bm, Bn ), OUTPUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); } @@ -89,18 +86,20 @@ CORE_clag2z_parsec(parsec_execution_stream_t *context, parsec_task_t *this_task) void INSERT_TASK_clag2z(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const 
CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_clag2z_parsec, options->priority, "lag2z", sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex32_t, Am, An ), INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zlange.c b/runtime/parsec/codelets/codelet_zlange.c index b66dfd7bccaf0f0782f9c9495e167e29b392ae64..776ea85e557c99bbf77ebcd6c5f042cf30840f1b 100644 --- a/runtime/parsec/codelets/codelet_zlange.c +++ b/runtime/parsec/codelets/codelet_zlange.c @@ -44,10 +44,11 @@ CORE_zlange_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlange(const RUNTIME_option_t *options, cham_normtype_t norm, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int szeW = chameleon_max( M, N ); @@ -57,7 +58,7 @@ void INSERT_TASK_zlange(const RUNTIME_option_t *options, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &LDA, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(double)*szeW, NULL, SCRATCH, PASSED_BY_REF, RTBLKADDR( B, double, Bm, Bn ), OUTPUT | AFFINITY, 
PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zlanhe.c b/runtime/parsec/codelets/codelet_zlanhe.c index 5621b08cc0205a059f9401c3a208476c44643954..02a9d50bc9b215a1f5ee61ddeae5ef7569373d67 100644 --- a/runtime/parsec/codelets/codelet_zlanhe.c +++ b/runtime/parsec/codelets/codelet_zlanhe.c @@ -44,11 +44,11 @@ CORE_zlanhe_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int szeW = chameleon_max( 1, N ); parsec_dtd_taskpool_insert_task( @@ -57,7 +57,7 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, sizeof(int), &uplo, VALUE, sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &LDA, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(double)*szeW, NULL, SCRATCH, PASSED_BY_REF, RTBLKADDR( B, double, Bm, Bn ), OUTPUT | AFFINITY, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zlansy.c b/runtime/parsec/codelets/codelet_zlansy.c index 85e23886b9e8a479f1c89746f35eed44a6091abc..425bcc99230d40d0f250d5cfe6b23377ff5ea03f 100644 --- a/runtime/parsec/codelets/codelet_zlansy.c +++ b/runtime/parsec/codelets/codelet_zlansy.c @@ -44,11 +44,11 @@ CORE_zlansy_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlansy(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - + CHAM_tile_t *tileA = 
A->get_blktile( A, Am, An ); int szeW = chameleon_max( 1, N ); parsec_dtd_taskpool_insert_task( @@ -57,7 +57,7 @@ void INSERT_TASK_zlansy(const RUNTIME_option_t *options, sizeof(int), &uplo, VALUE, sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &LDA, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(double)*szeW, NULL, SCRATCH, PASSED_BY_REF, RTBLKADDR( B, double, Bm, Bn ), OUTPUT | AFFINITY, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zlantr.c b/runtime/parsec/codelets/codelet_zlantr.c index d4aa9b42ad2500a2e0ee1310863ee184321bd774..650f5c71b6263d50ffd219123951c62289114fd7 100644 --- a/runtime/parsec/codelets/codelet_zlantr.c +++ b/runtime/parsec/codelets/codelet_zlantr.c @@ -45,22 +45,22 @@ CORE_zlantr_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlantr(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); int szeW = chameleon_max( 1, N ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlantr_parsec, options->priority, "lantr", - sizeof(int), &norm, VALUE, - sizeof(int), &uplo, VALUE, - sizeof(int), &diag, VALUE, - sizeof(int), &M, VALUE, - sizeof(int), &N, VALUE, + sizeof(int), &norm, VALUE, + sizeof(int), &uplo, VALUE, + sizeof(int), &diag, VALUE, + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &LDA, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(double)*szeW, NULL, SCRATCH, PASSED_BY_REF, RTBLKADDR( B, double, Bm, Bn ), OUTPUT | AFFINITY, 
PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zlascal.c b/runtime/parsec/codelets/codelet_zlascal.c index 09f40b18f3a5562c44d09d81ebfd7bc98466eb4b..32456e8c798dda40d2e01682cffb51a643af4184 100644 --- a/runtime/parsec/codelets/codelet_zlascal.c +++ b/runtime/parsec/codelets/codelet_zlascal.c @@ -12,8 +12,6 @@ * @brief Chameleon zlascal PaRSEC codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -27,11 +25,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zlascal_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -56,9 +49,10 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *A, int Am, int An) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlascal_parsec, options->priority, "lascal", @@ -67,7 +61,7 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zlaset.c b/runtime/parsec/codelets/codelet_zlaset.c index f9e0b9566da0cb0852228f0d3603e314987e4f11..8190891aadac71cf7bc2f226f27a435238b8e84e 100644 --- a/runtime/parsec/codelets/codelet_zlaset.c +++ b/runtime/parsec/codelets/codelet_zlaset.c @@ -21,41 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * 
@ingroup INSERT_TASK_Complex64_t - * - * CORE_zlaset - Sets the elements of the matrix A on the diagonal - * to beta and on the off-diagonals to alpha - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: Upper part of A is set; - * = ChamLower: Lower part of A is set; - * = ChamUpperLower: ALL elements of A are set. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the off-diagonal elements are to be set. - * - * @param[in] beta - * The constant to which the diagonal elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set accordingly. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - */ static inline int CORE_zlaset_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -80,9 +45,10 @@ CORE_zlaset_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlaset(const RUNTIME_option_t *options, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - const CHAM_desc_t *A, int Am, int An, int LDA) + const CHAM_desc_t *A, int Am, int An) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlaset_parsec, options->priority, "laset", @@ -92,6 +58,6 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | OUTPUT | AFFINITY, - sizeof(int), &LDA, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); } diff 
--git a/runtime/parsec/codelets/codelet_zlaset2.c b/runtime/parsec/codelets/codelet_zlaset2.c index 731e6d4717a1d9fbb693d3c0667ef8725879748d..46067a65fa1267c0bcd23b0f3d685d16d5af0008 100644 --- a/runtime/parsec/codelets/codelet_zlaset2.c +++ b/runtime/parsec/codelets/codelet_zlaset2.c @@ -21,39 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zlaset2 - Sets the elements of the matrix A to alpha. - * Not LAPACK compliant! Read below. - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: STRICT Upper part of A is set to alpha; - * = ChamLower: STRICT Lower part of A is set to alpha; - * = ChamUpperLower: ALL elements of A are set to alpha. - * Not LAPACK Compliant. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set to alpha accordingly. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). 
- * - */ static inline int CORE_zlaset2_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -76,9 +43,10 @@ CORE_zlaset2_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int LDA) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlaset2_parsec, options->priority, "laset2", @@ -87,6 +55,6 @@ void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, sizeof(int), &N, VALUE, sizeof(int), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | OUTPUT | AFFINITY, - sizeof(int), &LDA, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zlatro.c b/runtime/parsec/codelets/codelet_zlatro.c index af9c8cf3dffd8e857d641e7e567cdb27ff265080..9451ea48a67b2e87d4fef19993e64785b3ad0322 100644 --- a/runtime/parsec/codelets/codelet_zlatro.c +++ b/runtime/parsec/codelets/codelet_zlatro.c @@ -44,16 +44,15 @@ CORE_zlatro_parsec( parsec_execution_stream_t *context, return PARSEC_HOOK_RETURN_DONE; } -/** - * - */ void INSERT_TASK_zlatro(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlatro_parsec, 
options->priority, "latro", @@ -62,9 +61,9 @@ void INSERT_TASK_zlatro(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)mb; diff --git a/runtime/parsec/codelets/codelet_zlauum.c b/runtime/parsec/codelets/codelet_zlauum.c index 497862ea15f5309ef64dec552bcc5b8e85aafca3..45098a1ff024cea672b48a7063361842a63c3652 100644 --- a/runtime/parsec/codelets/codelet_zlauum.c +++ b/runtime/parsec/codelets/codelet_zlauum.c @@ -21,11 +21,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zlauum_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -46,16 +41,17 @@ CORE_zlauum_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zlauum(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *A, int Am, int An) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zlauum_parsec, options->priority, "lauum", sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zplghe.c b/runtime/parsec/codelets/codelet_zplghe.c index 324270e2a6c06a32b46b5a0fc3b135ee356a4b63..40d3aa614ff6a042b25f5f380d4ed2d4b9a2e543 100644 --- a/runtime/parsec/codelets/codelet_zplghe.c +++ 
b/runtime/parsec/codelets/codelet_zplghe.c @@ -45,10 +45,11 @@ CORE_zplghe_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zplghe( const RUNTIME_option_t *options, - double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplghe_parsec, options->priority, "zplghe", @@ -56,7 +57,7 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | OUTPUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &bigM, VALUE, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, diff --git a/runtime/parsec/codelets/codelet_zplgsy.c b/runtime/parsec/codelets/codelet_zplgsy.c index 9ab5a4faeb73e4067e91cdd8e72217d03c5c3196..53e6f2b310f76ff9c4dc58740d7b00691094a906 100644 --- a/runtime/parsec/codelets/codelet_zplgsy.c +++ b/runtime/parsec/codelets/codelet_zplgsy.c @@ -45,10 +45,11 @@ CORE_zplgsy_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplgsy_parsec, options->priority, "zplgsy", @@ -56,7 +57,7 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, 
sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | OUTPUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &bigM, VALUE, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, diff --git a/runtime/parsec/codelets/codelet_zplrnt.c b/runtime/parsec/codelets/codelet_zplrnt.c index 2bbeef7419167515c1bd4bffc0d6f516a3196653..b1e97221d0cb5dfce8aece73a236867ef41f7916 100644 --- a/runtime/parsec/codelets/codelet_zplrnt.c +++ b/runtime/parsec/codelets/codelet_zplrnt.c @@ -44,17 +44,18 @@ CORE_zplrnt_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplrnt_parsec, options->priority, "zplrnt", sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | OUTPUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &bigM, VALUE, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, diff --git a/runtime/parsec/codelets/codelet_zpotrf.c b/runtime/parsec/codelets/codelet_zpotrf.c index 0b6b6b9b5d31d503ffcc1c8bfdf495dfe4522e3f..93fdbf4a3326f1785f57262d26a1a18c397e2a96 100644 --- a/runtime/parsec/codelets/codelet_zpotrf.c +++ b/runtime/parsec/codelets/codelet_zpotrf.c @@ -53,17 +53,18 @@ CORE_zpotrf_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, 
int An, int iinfo) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zpotrf_parsec, options->priority, "potrf", sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &iinfo, VALUE, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, diff --git a/runtime/parsec/codelets/codelet_zssssm.c b/runtime/parsec/codelets/codelet_zssssm.c index 25d00097a9c312800b39c213fec13a37db294c35..e11a2131bee6f81748d1840d7e3bc966643c0e1c 100644 --- a/runtime/parsec/codelets/codelet_zssssm.c +++ b/runtime/parsec/codelets/codelet_zssssm.c @@ -52,13 +52,17 @@ CORE_zssssm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zssssm(const RUNTIME_option_t *options, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *L1, int L1m, int L1n, int ldl1, - const CHAM_desc_t *L2, int L2m, int L2n, int ldl2, + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *L1, int L1m, int L1n, + const CHAM_desc_t *L2, int L2m, int L2n, const int *IPIV) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA1 = A1->get_blktile( A1, A1m, A1n ); + CHAM_tile_t *tileA2 = A2->get_blktile( A2, A2m, A2n ); + CHAM_tile_t *tileL1 = L1->get_blktile( L1, L1m, L1n ); + CHAM_tile_t *tileL2 = L2->get_blktile( L2, L2m, L2n ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zssssm_parsec, options->priority, "ssssm", @@ -69,13 +73,13 @@ void INSERT_TASK_zssssm(const 
RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT, - sizeof(int), &lda1, VALUE, + sizeof(int), &(tileA1->ld), VALUE, PASSED_BY_REF, RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, + sizeof(int), &(tileA2->ld), VALUE, PASSED_BY_REF, RTBLKADDR( L1, CHAMELEON_Complex64_t, L1m, L1n ), chameleon_parsec_get_arena_index( L1 ) | INPUT, - sizeof(int), &ldl1, VALUE, + sizeof(int), &(tileL1->ld), VALUE, PASSED_BY_REF, RTBLKADDR( L2, CHAMELEON_Complex64_t, L2m, L2n ), chameleon_parsec_get_arena_index( L2 ) | INPUT, - sizeof(int), &ldl2, VALUE, + sizeof(int), &(tileL2->ld), VALUE, sizeof(int*), &IPIV, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zsymm.c b/runtime/parsec/codelets/codelet_zsymm.c index 703b1b5d15d7bab0f0db10f0cb6a47c9c1934dc9..75e879049762f8109256020f6bc083fc70a4d834 100644 --- a/runtime/parsec/codelets/codelet_zsymm.c +++ b/runtime/parsec/codelets/codelet_zsymm.c @@ -53,11 +53,14 @@ CORE_zsymm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zsymm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( 
PARSEC_dtd_taskpool, CORE_zsymm_parsec, options->priority, "symm", @@ -67,12 +70,12 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INPUT, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zsyr2k.c b/runtime/parsec/codelets/codelet_zsyr2k.c index 3a44fa279e9d029d68dcc2336115855394aabe1b..dfe1bc26a24ff2e8ccf9e1d876fb5da20cb414a2 100644 --- a/runtime/parsec/codelets/codelet_zsyr2k.c +++ b/runtime/parsec/codelets/codelet_zsyr2k.c @@ -53,11 +53,14 @@ CORE_zsyr2k_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, 
CORE_zsyr2k_parsec, options->priority, "syr2k", @@ -67,12 +70,12 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INPUT, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zsyrk.c b/runtime/parsec/codelets/codelet_zsyrk.c index 6c1109e894db543c7b260eddb60a78ce05a19b6b..26d4352e28e020ee6529c37d94d70bcfab321b3f 100644 --- a/runtime/parsec/codelets/codelet_zsyrk.c +++ b/runtime/parsec/codelets/codelet_zsyrk.c @@ -50,10 +50,12 @@ CORE_zsyrk_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zsyrk_parsec, options->priority, "syrk", @@ -63,10 +65,10 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), 
&alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_zsyssq.c b/runtime/parsec/codelets/codelet_zsyssq.c index d3797ca26c9966bbe8319e1cfd173a430688eaeb..c4d570e4c55249f46b247b585fccc07303df2721 100644 --- a/runtime/parsec/codelets/codelet_zsyssq.c +++ b/runtime/parsec/codelets/codelet_zsyssq.c @@ -43,10 +43,11 @@ CORE_zsyssq_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zsyssq_parsec, options->priority, "syssq", @@ -54,7 +55,7 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zsytrf_nopiv.c b/runtime/parsec/codelets/codelet_zsytrf_nopiv.c index ae4cdc64077f11eeb79a5618ebfadb6993726702..773fbcbc154b31351c613f6a11100bf7b1306153 100644 --- 
a/runtime/parsec/codelets/codelet_zsytrf_nopiv.c +++ b/runtime/parsec/codelets/codelet_zsytrf_nopiv.c @@ -42,17 +42,18 @@ CORE_zsytrf_nopiv_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zsytrf_nopiv(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zsytrf_nopiv_parsec, options->priority, "sytrf_nopiv", sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &iinfo, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_ztplqt.c b/runtime/parsec/codelets/codelet_ztplqt.c index b99d4961fdae1d1750284ccbb46b51d44cbd9705..1d97ccd4a6ce49de3488232331c83aec752a0329 100644 --- a/runtime/parsec/codelets/codelet_ztplqt.c +++ b/runtime/parsec/codelets/codelet_ztplqt.c @@ -50,11 +50,14 @@ CORE_ztplqt_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztplqt_parsec, options->priority, 
"tplqt", @@ -63,11 +66,11 @@ void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*(ib+1)*nb, NULL, SCRATCH, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_ztpmlqt.c b/runtime/parsec/codelets/codelet_ztpmlqt.c index b31b3514be8603bcd2c551f7a01b82aae9304c9c..bf4b83472d97b544ffc8fef5971bf6c78502a8b8 100644 --- a/runtime/parsec/codelets/codelet_ztpmlqt.c +++ b/runtime/parsec/codelets/codelet_ztpmlqt.c @@ -55,12 +55,16 @@ CORE_ztpmlqt_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztpmlqt_parsec, 
options->priority, "tpmlqt", @@ -72,13 +76,13 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT, - sizeof(int), &ldv, VALUE, + sizeof(int), &(tileV->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_ztpmqrt.c b/runtime/parsec/codelets/codelet_ztpmqrt.c index e8caa5d1f9b8b71f7c6c0bf9c8f2b557ef00a877..bacc8fbd2a22163a9b0bd51c8885c4b660d3d0c3 100644 --- a/runtime/parsec/codelets/codelet_ztpmqrt.c +++ b/runtime/parsec/codelets/codelet_ztpmqrt.c @@ -55,12 +55,16 @@ CORE_ztpmqrt_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn 
); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztpmqrt_parsec, options->priority, "tpmqrt", @@ -72,13 +76,13 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT, - sizeof(int), &ldv, VALUE, + sizeof(int), &(tileV->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_ztpqrt.c b/runtime/parsec/codelets/codelet_ztpqrt.c index 022d7014ef90ae5df0feba16e2c02e57c5773064..e552b6da98e7aadcebfb46d55c76be615a3066c9 100644 --- a/runtime/parsec/codelets/codelet_ztpqrt.c +++ b/runtime/parsec/codelets/codelet_ztpqrt.c @@ -50,11 +50,14 @@ CORE_ztpqrt_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + 
CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztpqrt_parsec, options->priority, "tpqrt", @@ -63,11 +66,11 @@ void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*(ib+1)*nb, NULL, SCRATCH, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_ztradd.c b/runtime/parsec/codelets/codelet_ztradd.c index fdfb35729e593c483e4a43bfca2b2bc8e0f900bb..825b71a01f9595a9acd70a8abfa37d0948c6e180 100644 --- a/runtime/parsec/codelets/codelet_ztradd.c +++ b/runtime/parsec/codelets/codelet_ztradd.c @@ -12,8 +12,6 @@ * @brief Chameleon ztradd PaRSEC codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2015-11-04 * @precisions normal z -> c d s @@ -108,10 +106,12 @@ CORE_ztradd_parsec( parsec_execution_stream_t *context, */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { parsec_taskpool_t* 
PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztradd_parsec, options->priority, "tradd", @@ -121,10 +121,10 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_ztrasm.c b/runtime/parsec/codelets/codelet_ztrasm.c index dc222f01df905923496830579dd67386ef221548..71ceb01ebc8e0b966af20b003015e6e6ac3bf8d9 100644 --- a/runtime/parsec/codelets/codelet_ztrasm.c +++ b/runtime/parsec/codelets/codelet_ztrasm.c @@ -45,10 +45,11 @@ CORE_ztrasm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztrasm(const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztrasm_parsec, options->priority, "trasm", @@ -58,7 +59,7 @@ void INSERT_TASK_ztrasm(const RUNTIME_option_t *options, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + 
sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, double, Bm, Bn ), INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_ztrmm.c b/runtime/parsec/codelets/codelet_ztrmm.c index a7b16e6aa6268235bedcdc73ad0e52ca866af515..449647bfd6b98845af8fa0ae85475429bfcd9eb0 100644 --- a/runtime/parsec/codelets/codelet_ztrmm.c +++ b/runtime/parsec/codelets/codelet_ztrmm.c @@ -53,10 +53,12 @@ CORE_ztrmm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztrmm_parsec, options->priority, "trmm", @@ -68,9 +70,9 @@ void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_ztrsm.c b/runtime/parsec/codelets/codelet_ztrsm.c index 0d068b3c63c39ebce2294c498c7808589fea8599..64bff731bf45c46f8d936a47fa50229eaa8e6806 100644 --- a/runtime/parsec/codelets/codelet_ztrsm.c +++ b/runtime/parsec/codelets/codelet_ztrsm.c @@ -45,10 
+45,12 @@ CORE_ztrsm_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztrsm_parsec, options->priority, "Trsm", @@ -60,9 +62,9 @@ void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | INOUT | AFFINITY, - sizeof(int), &ldb, VALUE, + sizeof(int), &(tileB->ld), VALUE, PARSEC_DTD_ARG_END ); (void)nb; diff --git a/runtime/parsec/codelets/codelet_ztrssq.c b/runtime/parsec/codelets/codelet_ztrssq.c index f56c8b7ef10fa9a48c35856dedfc593f9589e34e..b3e1f25885f004a249f6d2aaa559a31dcf390cac 100644 --- a/runtime/parsec/codelets/codelet_ztrssq.c +++ b/runtime/parsec/codelets/codelet_ztrssq.c @@ -45,10 +45,11 @@ CORE_ztrssq_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t 
*)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztrssq_parsec, options->priority, "trssq", @@ -57,7 +58,7 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_ztrtri.c b/runtime/parsec/codelets/codelet_ztrtri.c index c8f80aa308b634a328e139680590ac1b4901a606..e91aa9acec64fdcac7783a9e2d6e418a570ff16c 100644 --- a/runtime/parsec/codelets/codelet_ztrtri.c +++ b/runtime/parsec/codelets/codelet_ztrtri.c @@ -51,10 +51,11 @@ CORE_ztrtri_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_ztrtri_parsec, options->priority, "trtri", @@ -62,7 +63,7 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, sizeof(int), &diag, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, sizeof(int), &iinfo, VALUE, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, diff --git a/runtime/parsec/codelets/codelet_ztsmlq_hetra1.c b/runtime/parsec/codelets/codelet_ztsmlq_hetra1.c index 
5df710c3a3c1bcc196bd83c5f1ba6b1cfbcaa335..b37325c2f67b23a1711d2e87dde81eb796f80e19 100644 --- a/runtime/parsec/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/parsec/codelets/codelet_ztsmlq_hetra1.c @@ -62,12 +62,16 @@ CORE_ztsmlq_hetra1_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA1 = A1->get_blktile( A1, A1m, A1n ); + CHAM_tile_t *tileA2 = A2->get_blktile( A2, A2m, A2n ); + CHAM_tile_t *tileV = V->get_blktile( V, Vm, Vn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); int ldwork = side == ChamLeft ? 
ib : nb; parsec_dtd_taskpool_insert_task( @@ -81,13 +85,13 @@ void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, + sizeof(int), &(tileA1->ld), VALUE, PASSED_BY_REF, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, + sizeof(int), &(tileA2->ld), VALUE, PASSED_BY_REF, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, - sizeof(int), &ldv, VALUE, + sizeof(int), &(tileV->ld), VALUE, PASSED_BY_REF, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &ldwork, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_ztsmqr_hetra1.c b/runtime/parsec/codelets/codelet_ztsmqr_hetra1.c index 3aeddf268ce41863394771a47e7bf0051a0c36d9..4e04d8b4b5b49ebc1d27a2ba3fdc9ac36ebd275b 100644 --- a/runtime/parsec/codelets/codelet_ztsmqr_hetra1.c +++ b/runtime/parsec/codelets/codelet_ztsmqr_hetra1.c @@ -62,12 +62,16 @@ CORE_ztsmqr_hetra1_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA1 = A1->get_blktile( A1, A1m, A1n ); + CHAM_tile_t *tileA2 = A2->get_blktile( A2, A2m, A2n ); + CHAM_tile_t *tileV = V->get_blktile( V, 
Vm, Vn ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); int ldwork = side == ChamLeft ? ib : nb; parsec_dtd_taskpool_insert_task( @@ -81,13 +85,13 @@ void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, + sizeof(int), &(tileA1->ld), VALUE, PASSED_BY_REF, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | AFFINITY, - sizeof(int), &lda2, VALUE, + sizeof(int), &(tileA2->ld), VALUE, PASSED_BY_REF, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, - sizeof(int), &ldv, VALUE, + sizeof(int), &(tileV->ld), VALUE, PASSED_BY_REF, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &ldwork, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_ztstrf.c b/runtime/parsec/codelets/codelet_ztstrf.c index 5cf60cee1a32379ee7d52c90bbacae3097d2ae65..b489bab07aaef59b9b58a11e92f74d8cff8f2bc2 100644 --- a/runtime/parsec/codelets/codelet_ztstrf.c +++ b/runtime/parsec/codelets/codelet_ztstrf.c @@ -59,13 +59,16 @@ CORE_ztstrf_parsec( parsec_execution_stream_t *context, void INSERT_TASK_ztstrf(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *U, int Um, int Un, int ldu, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *U, int Um, int Un, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileU = U->get_blktile( U, Um, Un ); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileL = L->get_blktile( L, Lm, Ln ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, 
CORE_ztstrf_parsec, options->priority, "tstrf", @@ -74,11 +77,11 @@ void INSERT_TASK_ztstrf(const RUNTIME_option_t *options, sizeof(int), &ib, VALUE, sizeof(int), &nb, VALUE, PASSED_BY_REF, RTBLKADDR( U, CHAMELEON_Complex64_t, Um, Un ), chameleon_parsec_get_arena_index( U ) | INOUT, - sizeof(int), &ldu, VALUE, + sizeof(int), &(tileU->ld), VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( L, CHAMELEON_Complex64_t, Lm, Ln ), chameleon_parsec_get_arena_index( L ) | OUTPUT, - sizeof(int), &ldl, VALUE, + sizeof(int), &(tileL->ld), VALUE, sizeof(int*), &IPIV, VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, diff --git a/runtime/parsec/codelets/codelet_zunmlq.c b/runtime/parsec/codelets/codelet_zunmlq.c index f1c725cc3ea144e531539d39152d9ce6052b8ea9..13866d1921f114f30365fd282813db2077021985 100644 --- a/runtime/parsec/codelets/codelet_zunmlq.c +++ b/runtime/parsec/codelets/codelet_zunmlq.c @@ -53,11 +53,14 @@ CORE_zunmlq_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zunmlq(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zunmlq_parsec, options->priority, "unmlq", @@ -68,11 +71,11 @@ void INSERT_TASK_zunmlq(const RUNTIME_option_t 
*options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/codelets/codelet_zunmqr.c b/runtime/parsec/codelets/codelet_zunmqr.c index 66eb30cdbac747fc4299d154c7c945fb73c9eead..2e5ff664fbe8f6fc33565481bd964e911bdc447e 100644 --- a/runtime/parsec/codelets/codelet_zunmqr.c +++ b/runtime/parsec/codelets/codelet_zunmqr.c @@ -53,11 +53,14 @@ CORE_zunmqr_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zunmqr(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileT = T->get_blktile( T, Tm, Tn ); + CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zunmqr_parsec, options->priority, "unmqr", @@ -68,11 +71,11 @@ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), 
chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &(tileA->ld), VALUE, PASSED_BY_REF, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT, - sizeof(int), &ldt, VALUE, + sizeof(int), &(tileT->ld), VALUE, PASSED_BY_REF, RTBLKADDR( C, CHAMELEON_Complex64_t, Cm, Cn ), chameleon_parsec_get_arena_index( C ) | INOUT | AFFINITY, - sizeof(int), &ldc, VALUE, + sizeof(int), &(tileC->ld), VALUE, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, PARSEC_DTD_ARG_END ); diff --git a/runtime/parsec/include/chameleon_parsec.h b/runtime/parsec/include/chameleon_parsec.h index 94a850f753596a092c12d16edcf0859c3aa1abdd..1d2a035c695e3a13198e1cd718a036369e1a2a33 100644 --- a/runtime/parsec/include/chameleon_parsec.h +++ b/runtime/parsec/include/chameleon_parsec.h @@ -32,7 +32,7 @@ struct chameleon_parsec_desc_s { parsec_data_collection_t super; int arena_index; - CHAM_desc_t *desc; + CHAM_desc_t *desc; parsec_data_t **data_map; }; diff --git a/runtime/quark/codelets/codelet_dzasum.c b/runtime/quark/codelets/codelet_dzasum.c index 8c49a6f76160564cdc2100ba9cee6a419260ff0e..86b1a6e2b8583ecec039d015da45128c5c7c3d96 100644 --- a/runtime/quark/codelets/codelet_dzasum.c +++ b/runtime/quark/codelets/codelet_dzasum.c @@ -12,8 +12,6 @@ * @brief Chameleon dzasum Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,25 +19,25 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" -void CORE_dzasum_quark(Quark *quark) +static inline void +CORE_dzasum_quark(Quark *quark) { cham_store_t storev; cham_uplo_t uplo; int M; int N; - CHAMELEON_Complex64_t *A; - int lda; - double *work; + CHAM_tile_t *A; + CHAM_tile_t *work; - quark_unpack_args_7(quark, storev, 
uplo, M, N, A, lda, work); - CORE_dzasum(storev, uplo, M, N, A, lda, work); + quark_unpack_args_6(quark, storev, uplo, M, N, A, work); + TCORE_dzasum( storev, uplo, M, N, A, work->mat ); } void INSERT_TASK_dzasum(const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -49,8 +47,7 @@ void INSERT_TASK_dzasum(const RUNTIME_option_t *options, sizeof(int), &uplo, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*N, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(double), RTBLKADDR(B, double, Bm, Bn), INOUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_map.c b/runtime/quark/codelets/codelet_map.c index 5e51cd9c552f3d998dc764b5b008194f5fffc85c..5b6ed1ac8dfa5fa80e49a85637ef52c1a978eeb5 100644 --- a/runtime/quark/codelets/codelet_map.c +++ b/runtime/quark/codelets/codelet_map.c @@ -23,12 +23,12 @@ void CORE_map_quark(Quark *quark) cham_uplo_t uplo; int m; int n; - void *data; + CHAM_tile_t *tile; cham_unary_operator_t op_fct; void *op_args; - quark_unpack_args_7( quark, desc, uplo, m, n, data, op_fct, op_args ); - op_fct( desc, uplo, m, n, data, op_args ); + quark_unpack_args_7( quark, desc, uplo, m, n, tile, op_fct, op_args ); + op_fct( desc, uplo, m, n, tile, op_args ); } void INSERT_TASK_map( const RUNTIME_option_t *options, @@ -43,7 +43,7 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, sizeof(cham_uplo_t), &uplo, VALUE, sizeof(int), &Am, VALUE, sizeof(int), &An, VALUE, - sizeof(char), RTBLKADDR(A, void, Am, An), INOUT, + sizeof(void*), RTBLKADDR(A, void, Am, An), INOUT, sizeof(cham_unary_operator_t), &op_fct, VALUE, sizeof(void*), 
&op_args, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_zaxpy.c b/runtime/quark/codelets/codelet_zaxpy.c index be7fd595991dbc1bdf671fe813db39d3870e211a..6e77e3ca1a2c6f9874e00efd63271114f03845a2 100644 --- a/runtime/quark/codelets/codelet_zaxpy.c +++ b/runtime/quark/codelets/codelet_zaxpy.c @@ -12,8 +12,6 @@ * @brief Chameleon zaxpy Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,19 +19,19 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zaxpy_quark(Quark *quark) { int M; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; + CHAM_tile_t *tileA; int incA; - CHAMELEON_Complex64_t *B; + CHAM_tile_t *tileB; int incB; - quark_unpack_args_6(quark, M, alpha, A, incA, B, incB); - CORE_zaxpy(M, alpha, A, incA, B, incB); + quark_unpack_args_6( quark, M, alpha, tileA, incA, tileB, incB ); + TCORE_zaxpy(M, alpha, tileA, incA, tileB, incB); } void INSERT_TASK_zaxpy(const RUNTIME_option_t *options, @@ -46,9 +44,9 @@ void INSERT_TASK_zaxpy(const RUNTIME_option_t *options, QUARK_Insert_Task(opt->quark, CORE_zaxpy_quark, (Quark_Task_Flags*)opt, sizeof(int), &M, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*M, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &incA, VALUE, - sizeof(CHAMELEON_Complex64_t)*M, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, sizeof(int), &incB, VALUE, 0); } diff --git a/runtime/quark/codelets/codelet_zbuild.c b/runtime/quark/codelets/codelet_zbuild.c index d7269a6af4bf992dd10175407d62387c4b037fa2..ff2de7675e62a5630c2549c989f59dadc353b0fa 100644 --- 
a/runtime/quark/codelets/codelet_zbuild.c +++ b/runtime/quark/codelets/codelet_zbuild.c @@ -12,8 +12,6 @@ * @brief Chameleon zbuild Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -26,23 +24,22 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zbuild_quark(Quark *quark) { - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; void *user_data; void (*user_build_callback)(int row_min, int row_max, int col_min, int col_max, void *buffer, int ld, void *user_data) ; int row_min, row_max, col_min, col_max; - quark_unpack_args_8( quark, row_min, row_max, col_min, col_max, A, lda, user_data, user_build_callback); + quark_unpack_args_7( quark, row_min, row_max, col_min, col_max, tileA, user_data, user_build_callback); - user_build_callback(row_min, row_max, col_min, col_max, A, lda, user_data); + user_build_callback(row_min, row_max, col_min, col_max, tileA->mat, tileA->ld, user_data); } void INSERT_TASK_zbuild( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, void *user_data, void* user_build_callback ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -58,8 +55,7 @@ void INSERT_TASK_zbuild( const RUNTIME_option_t *options, sizeof(int), &row_max, VALUE, sizeof(int), &col_min, VALUE, sizeof(int), &col_max, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*A->nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, sizeof(void*), &user_data, VALUE, sizeof(void*), &user_build_callback, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_zgeadd.c b/runtime/quark/codelets/codelet_zgeadd.c index 
dd8205a6ed43d16cba0691f0bb815ea479bd8ff4..4e0aef869dac6a9557d577a9519d546a69d953d3 100644 --- a/runtime/quark/codelets/codelet_zgeadd.c +++ b/runtime/quark/codelets/codelet_zgeadd.c @@ -12,8 +12,6 @@ * @brief Chameleon zgeadd Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede @@ -23,7 +21,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgeadd_quark(Quark *quark) { @@ -31,14 +29,12 @@ void CORE_zgeadd_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileB; - quark_unpack_args_9(quark, trans, M, N, alpha, A, LDA, beta, B, LDB); - CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB); + quark_unpack_args_7(quark, trans, M, N, alpha, tileA, beta, tileB); + TCORE_zgeadd(trans, M, N, alpha, tileA, beta, tileB); return; } @@ -97,8 +93,8 @@ void CORE_zgeadd_quark(Quark *quark) */ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GEADD; @@ -107,11 +103,9 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, 
CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*ldb*n, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, 0); (void)nb; diff --git a/runtime/quark/codelets/codelet_zgelqt.c b/runtime/quark/codelets/codelet_zgelqt.c index 6704bd55a2956181927d622aadf9a4d1267d53f4..d294df89ba89e09db2c45e7aa8081e5670699d0b 100644 --- a/runtime/quark/codelets/codelet_zgelqt.c +++ b/runtime/quark/codelets/codelet_zgelqt.c @@ -12,8 +12,6 @@ * @brief Chameleon zgelqt Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,89 +23,27 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgelqt_quark(Quark *quark) { int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *T; - int ldt; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; CHAMELEON_Complex64_t *TAU; CHAMELEON_Complex64_t *WORK; - quark_unpack_args_9(quark, m, n, ib, A, lda, T, ldt, TAU, WORK); - CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt ); - CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK); + quark_unpack_args_7(quark, m, n, ib, tileA, tileT, TAU, WORK); + TCORE_zlaset( ChamUpperLower, ib, m, 0., 0., tileT ); + TCORE_zgelqt(m, n, ib, tileA, tileT, TAU, WORK); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgelqt - computes a LQ factorization of a complex M-by-N tile A: A = L * Q. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). 
- * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A(i,i+1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-min(M,N) lower trapezoidal tile L (L is - * lower triangular if M <= N); the elements above the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GELQT; @@ -115,10 +51,8 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, 0); diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c index b736b9b880ff369d3bfc00b92a020900e79e4d6c..7bdffae459449ba81628f2f9a6b3f975d2e8fb3a 100644 --- a/runtime/quark/codelets/codelet_zgemm.c +++ b/runtime/quark/codelets/codelet_zgemm.c @@ -12,8 +12,6 @@ * @brief Chameleon zgemm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgemm_quark(Quark *quark) { @@ -35,28 +33,25 @@ void CORE_zgemm_quark(Quark *quark) int n; int k; 
CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileC; - quark_unpack_args_13(quark, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - CORE_zgemm(transA, transB, - m, n, k, - alpha, A, lda, - B, ldb, - beta, C, ldc); + quark_unpack_args_10(quark, transA, transB, m, n, k, alpha, tileA, tileB, beta, tileC); + TCORE_zgemm( transA, transB, + m, n, k, + alpha, tileA, + tileB, + beta, tileC ); } void INSERT_TASK_zgemm(const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GEMM; @@ -67,12 +62,9 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git 
a/runtime/quark/codelets/codelet_zgeqrt.c b/runtime/quark/codelets/codelet_zgeqrt.c index 4aa554ea90b8ef64e8d550eb4244f1403ab8873b..2cf6d7a328e568f72dff86d0e82ba93820c97c03 100644 --- a/runtime/quark/codelets/codelet_zgeqrt.c +++ b/runtime/quark/codelets/codelet_zgeqrt.c @@ -12,8 +12,6 @@ * @brief Chameleon zgeqrt Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,90 +23,27 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgeqrt_quark(Quark *quark) { int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *T; - int ldt; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; CHAMELEON_Complex64_t *TAU; CHAMELEON_Complex64_t *WORK; - quark_unpack_args_9(quark, m, n, ib, A, lda, T, ldt, TAU, WORK); - CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt ); - CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK); + quark_unpack_args_7(quark, m, n, ib, tileA, tileT, TAU, WORK); + TCORE_zlaset( ChamUpperLower, ib, n, 0., 0., tileT ); + TCORE_zgeqrt( m, n, ib, tileA, tileT, TAU, WORK ); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgeqrt computes a QR factorization of a complex M-by-N tile A: - * A = Q * R. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(1) H(2) . . . H(k), where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), - * and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. 
- * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and above the diagonal of the array - * contain the min(M,N)-by-N upper trapezoidal tile R (R is - * upper triangular if M >= N); the elements below the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). - * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GEQRT; @@ -116,10 +51,8 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), OUTPUT, 
sizeof(CHAMELEON_Complex64_t)*nb, NULL, SCRATCH, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, 0); diff --git a/runtime/quark/codelets/codelet_zgessm.c b/runtime/quark/codelets/codelet_zgessm.c index 53a6b3eb9f4746fe2f3a0d288f6710ce9d17a7bd..5a893bd8549f3d4e77dbde391f25eab915330d70 100644 --- a/runtime/quark/codelets/codelet_zgessm.c +++ b/runtime/quark/codelets/codelet_zgessm.c @@ -12,8 +12,6 @@ * @brief Chameleon zgessm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -26,7 +24,7 @@ #include "coreblas/cblas.h" #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgessm_quark(Quark *quark) { @@ -35,67 +33,20 @@ void CORE_zgessm_quark(Quark *quark) int k; int ib; int *IPIV; - CHAMELEON_Complex64_t *L; - int ldl; - CHAMELEON_Complex64_t *D; - int ldd; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileL; + CHAM_tile_t *tileD; + CHAM_tile_t *tileA; - quark_unpack_args_11(quark, m, n, k, ib, IPIV, L, ldl, D, ldd, A, lda); - CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda); + quark_unpack_args_8(quark, m, n, k, ib, IPIV, tileL, tileD, tileA); + TCORE_zgessm(m, n, k, ib, IPIV, tileD, tileA); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgessm applies the factors L computed by CORE_zgetrf_incpiv to - * a complex M-by-N tile A. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] K - * The number of columns of the tile L. K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] IPIV - * The pivot indices array of size K as returned by - * CORE_zgetrf_incpiv. 
- * - * @param[in] L - * The M-by-K lower triangular tile. - * - * @param[in] LDL - * The leading dimension of the array L. LDL >= max(1,M). - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, updated by the application of L. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * - */ void INSERT_TASK_zgessm(const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, - const CHAM_desc_t *D, int Dm, int Dn, int ldd, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *L, int Lm, int Ln, + const CHAM_desc_t *D, int Dm, int Dn, + const CHAM_desc_t *A, int Am, int An) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GESSM; @@ -105,11 +56,8 @@ void INSERT_TASK_zgessm(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, sizeof(int)*nb, IPIV, INPUT, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), INPUT | QUARK_REGION_L, - sizeof(int), &ldl, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(D, CHAMELEON_Complex64_t, Dm, Dn), INPUT | QUARK_REGION_L, - sizeof(int), &ldd, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), INPUT | QUARK_REGION_L, + sizeof(void*), RTBLKADDR(D, CHAMELEON_Complex64_t, Dm, Dn), INPUT | QUARK_REGION_L, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zgessq.c b/runtime/quark/codelets/codelet_zgessq.c index a0f343091a76782f5853b87b73a84da415db1b46..5ea99fc1f206508b7b641c7d935eb4f0d8870522 100644 --- 
a/runtime/quark/codelets/codelet_zgessq.c +++ b/runtime/quark/codelets/codelet_zgessq.c @@ -12,8 +12,6 @@ * @brief Chameleon zgessq Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,44 +19,32 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgessq_quark(Quark *quark) { cham_store_t storev; int m; int n; - CHAMELEON_Complex64_t *A; - int lda; - double *SCALESUMSQ; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - quark_unpack_args_6( quark, storev, m, n, A, lda, SCALESUMSQ ); - CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ ); + quark_unpack_args_5( quark, storev, m, n, tileA, tileW ); + TCORE_zgessq( storev, m, n, tileA, tileW ); } void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - cham_store_t storev, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) + cham_store_t storev, int m, int n, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - int sizessq; - - if ( storev == ChamColumnwise ) { - sizessq = 2*n; - } else if ( storev == ChamRowwise ) { - sizessq = 2*m; - } else { - sizessq = 2; - } - quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GESSQ; QUARK_Insert_Task(opt->quark, CORE_zgessq_quark, (Quark_Task_Flags*)opt, sizeof(cham_store_t), &storev, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*m*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(double)*sizessq, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, 
SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zgetrf.c b/runtime/quark/codelets/codelet_zgetrf.c index 86d1c324405b79290b6f7ccfedd028551c4b1bd5..8b235b15d4b0a5cc14a61c7ed90e9acaf1868d4c 100644 --- a/runtime/quark/codelets/codelet_zgetrf.c +++ b/runtime/quark/codelets/codelet_zgetrf.c @@ -12,8 +12,6 @@ * @brief Chameleon zgetrf Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede @@ -23,14 +21,13 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgetrf_quark(Quark *quark) { int m; int n; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; int *IPIV; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; @@ -38,8 +35,8 @@ void CORE_zgetrf_quark(Quark *quark) int iinfo; int info; - quark_unpack_args_9(quark, m, n, A, lda, IPIV, sequence, request, check_info, iinfo); - CORE_zgetrf( m, n, A, lda, IPIV, &info ); + quark_unpack_args_8(quark, m, n, tileA, IPIV, sequence, request, check_info, iinfo); + TCORE_zgetrf( m, n, tileA, IPIV, &info ); if ( (info != CHAMELEON_SUCCESS) && check_info ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); } @@ -47,7 +44,7 @@ void CORE_zgetrf_quark(Quark *quark) void INSERT_TASK_zgetrf(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int *IPIV, cham_bool_t check_info, int iinfo) { @@ -56,11 +53,10 @@ void INSERT_TASK_zgetrf(const RUNTIME_option_t *options, QUARK_Insert_Task(opt->quark, CORE_zgetrf_quark, (Quark_Task_Flags*)opt, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT | LOCALITY, - sizeof(int), &lda, VALUE, + sizeof(void*), 
RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT | LOCALITY, sizeof(int)*nb, IPIV, OUTPUT, - sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, - sizeof(RUNTIME_request_t*), &(options->request), VALUE, + sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, + sizeof(RUNTIME_request_t*), &(options->request), VALUE, sizeof(cham_bool_t), &check_info, VALUE, sizeof(int), &iinfo, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_zgetrf_incpiv.c b/runtime/quark/codelets/codelet_zgetrf_incpiv.c index 7a4da42d7a63e9cda85b89cb55ea6e33062bdb09..2bac0ee19a3634e9afb4ca9dc936075e921bf92c 100644 --- a/runtime/quark/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/quark/codelets/codelet_zgetrf_incpiv.c @@ -12,8 +12,6 @@ * @brief Chameleon zgetrf_incpiv Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,15 +23,14 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgetrf_incpiv_quark(Quark *quark) { int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; int *IPIV; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; @@ -42,70 +39,17 @@ void CORE_zgetrf_incpiv_quark(Quark *quark) int info; - quark_unpack_args_10(quark, m, n, ib, A, lda, IPIV, sequence, request, check_info, iinfo); - CORE_zgetrf_incpiv(m, n, ib, A, lda, IPIV, &info); + quark_unpack_args_9(quark, m, n, ib, tileA, IPIV, sequence, request, check_info, iinfo); + TCORE_zgetrf_incpiv(m, n, ib, tileA, IPIV, &info); if ( (info != CHAMELEON_SUCCESS) && check_info ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); } } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgetrf_incpiv computes an LU factorization of a general M-by-N tile A - * using partial pivoting with row interchanges. 
- * - * The factorization has the form - * - * A = P * L * U - * - * where P is a permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile was interchanged with row IPIV(i). - * - * @param[out] INFO - * See returned value. - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. 
- * - */ void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo) { @@ -115,8 +59,7 @@ void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, sizeof(int)*nb, IPIV, OUTPUT, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, @@ -124,5 +67,4 @@ void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, sizeof(int), &iinfo, VALUE, 0); - (void)L; (void)Lm; (void)Ln; (void)ldl; } diff --git a/runtime/quark/codelets/codelet_zgetrf_nopiv.c b/runtime/quark/codelets/codelet_zgetrf_nopiv.c index 8dd8e65fd5cdea6892b9f21880ce693f6f0ec88d..130cf7ef41a69e2a4fd012ab68858b2a2115b5da 100644 --- a/runtime/quark/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/quark/codelets/codelet_zgetrf_nopiv.c @@ -22,76 +22,29 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgetrf_nopiv_quark(Quark *quark) { int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int iinfo; int info; - quark_unpack_args_8(quark, m, n, ib, A, lda, sequence, request, iinfo); - CORE_zgetrf_nopiv(m, n, ib, A, lda, &info); + quark_unpack_args_7(quark, m, n, ib, tileA, sequence, request, iinfo); + TCORE_zgetrf_nopiv(m, n, ib, tileA, &info); if ( info != CHAMELEON_SUCCESS ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); 
} } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgetrf_nopiv computes an LU factorization of a general diagonal - * dominant M-by-N matrix A witout pivoting. - * - * The factorization has the form - * A = L * U - * where L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 3 BLAS version of the algorithm. - * WARNING: Your matrix need to be diagonal dominant if you want to call this - * routine safely. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] IB - * The block size to switch between blocked and unblocked code. - * - * @param[in,out] A - * On entry, the M-by-N matrix to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. 
- * - */ void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -101,8 +54,7 @@ void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, sizeof(int), &iinfo, VALUE, diff --git a/runtime/quark/codelets/codelet_zgram.c b/runtime/quark/codelets/codelet_zgram.c index f2c4228ec1fd7dfbe0535d9fac109e903c12ce0f..2acba7cdd644f3e68d66d1aaaa1ccfa3f5cfbea1 100644 --- a/runtime/quark/codelets/codelet_zgram.c +++ b/runtime/quark/codelets/codelet_zgram.c @@ -18,36 +18,28 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zgram_quark(Quark *quark) { cham_uplo_t uplo; int m, n, mt, nt; - double *Di; - int lddi; - double *Dj; - int lddj; - double *D; - double *A; - int lda; + CHAM_tile_t *Di; + CHAM_tile_t *Dj; + CHAM_tile_t *D; + CHAM_tile_t *A; - quark_unpack_args_12(quark, uplo, m, n, mt, nt, Di, lddi, Dj, lddj, D, A, lda); - CORE_zgram( uplo, - m, n, mt, nt, - Di, lddi, - Dj, lddj, - D, - A, lda); + quark_unpack_args_9(quark, uplo, m, n, mt, nt, Di, Dj, D, A ); + TCORE_zgram( uplo, m, n, mt, nt, Di, Dj, D, A ); } void INSERT_TASK_zgram( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mt, int nt, - const CHAM_desc_t *Di, int Dim, int Din, int lddi, - const CHAM_desc_t *Dj, int Djm, int Djn, int lddj, + const CHAM_desc_t *Di, int Dim, int Din, + const CHAM_desc_t *Dj, int Djm, int Djn, const 
CHAM_desc_t *D, int Dm, int Dn, - CHAM_desc_t *A, int Am, int An, int lda) + CHAM_desc_t *A, int Am, int An ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_GRAM; @@ -57,12 +49,9 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &mt, VALUE, sizeof(int), &nt, VALUE, - sizeof(double)*lddi*mt, RTBLKADDR(Di, double, Dim, Din), INPUT, - sizeof(int), &lddi, VALUE, - sizeof(double)*lddj*nt, RTBLKADDR(Dj, double, Djm, Djn), INPUT, - sizeof(int), &lddj, VALUE, - sizeof(double)*2, RTBLKADDR(D, double, Dm, Dn), INPUT, - sizeof(double)*mt*nt, RTBLKADDR(A, double, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(Di, double, Dim, Din), INPUT, + sizeof(void*), RTBLKADDR(Dj, double, Djm, Djn), INPUT, + sizeof(void*), RTBLKADDR(D, double, Dm, Dn ), INPUT, + sizeof(void*), RTBLKADDR(A, double, Am, An ), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zhe2ge.c b/runtime/quark/codelets/codelet_zhe2ge.c index fc141c5b7a0846529c59559bdc995c5f9a231b91..ae9ea7e2fac1bc308e7b07c25578c968b735310c 100644 --- a/runtime/quark/codelets/codelet_zhe2ge.c +++ b/runtime/quark/codelets/codelet_zhe2ge.c @@ -19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" /** * @@ -31,21 +31,19 @@ static inline void CORE_zhe2ge_quark(Quark *quark) cham_uplo_t uplo; int M; int N; - CHAMELEON_Complex64_t *A; - int LDA; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - quark_unpack_args_7(quark, uplo, M, N, A, LDA, B, LDB); - CORE_zhe2ge(uplo, M, N, A, LDA, B, LDB); + quark_unpack_args_5(quark, uplo, M, N, tileA, tileB); + TCORE_zhe2ge(uplo, M, N, tileA, tileB); } void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, 
int An, + const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LACPY; @@ -53,9 +51,7 @@ void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, sizeof(int), &uplo, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*mb*mb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*mb*mb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c index af368adb26d41d2d07f477374bba0fa1a836cc77..0e576a93eb2ae13fb0c94cab15f25672d3cd7463 100644 --- a/runtime/quark/codelets/codelet_zhemm.c +++ b/runtime/quark/codelets/codelet_zhemm.c @@ -12,8 +12,6 @@ * @brief Chameleon zhemm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zhemm_quark(Quark *quark) { @@ -34,28 +32,25 @@ void CORE_zhemm_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int LDC; + CHAM_tile_t *tileC; - quark_unpack_args_12(quark, side, uplo, M, N, alpha, A, LDA, B, LDB, beta, C, LDC); - CORE_zhemm(side, uplo, + quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC); + TCORE_zhemm(side, uplo, M, N, - alpha, A, LDA, - B, LDB, - beta, C, LDC); + alpha, tileA, + tileB, + beta, tileC); } 
void INSERT_TASK_zhemm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_HEMM; @@ -65,13 +60,10 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zher2k.c b/runtime/quark/codelets/codelet_zher2k.c index c39093ddc0c00466d7122f985d918a25dc1663f9..5e0ba7efb3b4406733eeab6a1377c5aa9319f450 100644 --- a/runtime/quark/codelets/codelet_zher2k.c +++ b/runtime/quark/codelets/codelet_zher2k.c @@ -12,8 +12,6 @@ * @brief Chameleon zher2k Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include 
"coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zher2k_quark(Quark *quark) { @@ -34,25 +32,22 @@ void CORE_zher2k_quark(Quark *quark) int n; int k; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; double beta; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileC; - quark_unpack_args_12(quark, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - CORE_zher2k(uplo, trans, - n, k, alpha, A, lda, B, ldb, beta, C, ldc); + quark_unpack_args_9(quark, uplo, trans, n, k, alpha, tileA, tileB, beta, tileC); + TCORE_zher2k(uplo, trans, + n, k, alpha, tileA, tileB, beta, tileC); } void INSERT_TASK_zher2k(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_HER2K; @@ -62,12 +57,9 @@ void INSERT_TASK_zher2k(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(double), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, 
CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zherfb.c b/runtime/quark/codelets/codelet_zherfb.c index 3e4565b316c367ad6f36d6e51561a886a4b56be6..29b525f30278a407f9710c873b59a31c607df3c8 100644 --- a/runtime/quark/codelets/codelet_zherfb.c +++ b/runtime/quark/codelets/codelet_zherfb.c @@ -19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zherfb_quark(Quark *quark) { @@ -28,25 +28,22 @@ void CORE_zherfb_quark(Quark *quark) int k; int ib; int nb; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; CHAMELEON_Complex64_t *WORK; int ldwork; - quark_unpack_args_13(quark, uplo, n, k, ib, nb, A, lda, T, ldt, C, ldc, WORK, ldwork); - CORE_zherfb(uplo, n, k, ib, nb, A, lda, T, ldt, C, ldc, WORK, ldwork); + quark_unpack_args_10(quark, uplo, n, k, ib, nb, tileA, tileT, tileC, WORK, ldwork); + TCORE_zherfb(uplo, n, k, ib, nb, tileA, tileT, tileC, WORK, ldwork); } void INSERT_TASK_zherfb(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -56,12 +53,9 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, sizeof(int), &nb, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), (uplo == ChamUpper) ? 
INOUT|QUARK_REGION_U : INOUT|QUARK_REGION_L, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), (uplo == ChamUpper) ? INOUT|QUARK_REGION_D|QUARK_REGION_U : INOUT|QUARK_REGION_D|QUARK_REGION_L, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), (uplo == ChamUpper) ? INOUT|QUARK_REGION_U : INOUT|QUARK_REGION_L, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), (uplo == ChamUpper) ? INOUT|QUARK_REGION_D|QUARK_REGION_U : INOUT|QUARK_REGION_D|QUARK_REGION_L, sizeof(CHAMELEON_Complex64_t)*2*nb*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_zherk.c b/runtime/quark/codelets/codelet_zherk.c index 70254a131236f3f610d81f6bc4b61b8da2dcc11e..8651de4d9c3a9cab50023db08b41384748af2ef8 100644 --- a/runtime/quark/codelets/codelet_zherk.c +++ b/runtime/quark/codelets/codelet_zherk.c @@ -12,8 +12,6 @@ * @brief Chameleon zherk Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zherk_quark(Quark *quark) { @@ -34,24 +32,22 @@ void CORE_zherk_quark(Quark *quark) int n; int k; double alpha; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; double beta; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileC; - quark_unpack_args_10(quark, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); - CORE_zherk(uplo, trans, - n, k, - alpha, A, lda, - beta, C, ldc); + quark_unpack_args_8(quark, uplo, trans, n, k, alpha, tileA, beta, tileC); 
+ TCORE_zherk(uplo, trans, + n, k, + alpha, tileA, + beta, tileC); } void INSERT_TASK_zherk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, int lda, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_HERK; @@ -61,10 +57,8 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(double), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(double), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zhessq.c b/runtime/quark/codelets/codelet_zhessq.c index b66dfd60a9dcda3747993cacec4b4d33b22e6370..f58dc4194da704f2537a7ae7046aab799c4f2979 100644 --- a/runtime/quark/codelets/codelet_zhessq.c +++ b/runtime/quark/codelets/codelet_zhessq.c @@ -12,8 +12,6 @@ * @brief Chameleon zhessq Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c @@ -24,10 +22,10 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { INSERT_TASK_zsyssq( options, storev, uplo, n, - A, Am, An, lda, + A, Am, An, SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff 
--git a/runtime/quark/codelets/codelet_zlacpy.c b/runtime/quark/codelets/codelet_zlacpy.c index 817b684529f72052efaea4b0c055ae07abab71b8..976bc83a79b328514311bf13f08f3af2a5d02409 100644 --- a/runtime/quark/codelets/codelet_zlacpy.c +++ b/runtime/quark/codelets/codelet_zlacpy.c @@ -12,8 +12,6 @@ * @brief Chameleon zlacpy Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" static inline void CORE_zlacpy_quark(Quark *quark) { @@ -33,20 +31,26 @@ static inline void CORE_zlacpy_quark(Quark *quark) int M; int N; int displA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t *A; - int LDA; int displB; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t *B; - int LDB; - quark_unpack_args_9(quark, uplo, M, N, displA, A, LDA, displB, B, LDB); - CORE_zlacpy(uplo, M, N, A + displA, LDA, B + displB, LDB); + quark_unpack_args_7(quark, uplo, M, N, displA, tileA, displB, tileB); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + A = tileA->mat; + B = tileB->mat; + CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); } void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, int lda, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + int displA, const CHAM_desc_t *A, int Am, int An, + int displB, const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LACPY; @@ -55,20 +59,18 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(int), &displA, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, 
RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &displB, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, 0); } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, lda, - 0, B, Bm, Bn, ldb ); + 0, A, Am, An, + 0, B, Bm, Bn ); } diff --git a/runtime/quark/codelets/codelet_zlag2c.c b/runtime/quark/codelets/codelet_zlag2c.c index 23bf31b92f2230fa995e030f10b1e82a8783f3a8..28ac7853cbb6de6c595a547d587b06d8d0c2cd2e 100644 --- a/runtime/quark/codelets/codelet_zlag2c.c +++ b/runtime/quark/codelets/codelet_zlag2c.c @@ -12,8 +12,6 @@ * @brief Chameleon zlag2c Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede @@ -23,37 +21,37 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlag2c_quark(Quark *quark) { int m; int n; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex32_t *B; - int ldb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; + int info; - quark_unpack_args_8(quark, m, n, A, lda, B, ldb, sequence, request); - CORE_zlag2c( m, n, A, lda, B, ldb); + quark_unpack_args_6(quark, m, n, tileA, tileB, sequence, request); + TCORE_zlag2c( m, n, tileA, tileB, &info ); + if ( (sequence->status == CHAMELEON_SUCCESS) && (info !=
0) ) { + RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, info ); + } } void INSERT_TASK_zlag2c(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LAG2C; QUARK_Insert_Task(opt->quark, CORE_zlag2c_quark, (Quark_Task_Flags*)opt, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex32_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex32_t, Bm, Bn), OUTPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex32_t, Bm, Bn), OUTPUT, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, 0); @@ -63,26 +61,22 @@ void CORE_clag2z_quark(Quark *quark) { int m; int n; - CHAMELEON_Complex32_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - quark_unpack_args_6(quark, m, n, A, lda, B, ldb); - CORE_clag2z( m, n, A, lda, B, ldb); + quark_unpack_args_4(quark, m, n, tileA, tileB); + TCORE_clag2z( m, n, tileA, tileB); } void INSERT_TASK_clag2z(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { QUARK_Insert_Task(opt->quark, CORE_clag2z_quark, (Quark_Task_Flags*)opt, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex32_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
INOUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlange.c b/runtime/quark/codelets/codelet_zlange.c index c1d32c4324a2b016b7d03d478c58201d60b50e5a..608100393ca5d1a12207023508acceb4a98971aa 100644 --- a/runtime/quark/codelets/codelet_zlange.c +++ b/runtime/quark/codelets/codelet_zlange.c @@ -12,8 +12,6 @@ * @brief Chameleon zlange Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -23,25 +21,24 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlange_quark(Quark *quark) { - double *normA; + CHAM_tile_t *tileNorm; cham_normtype_t norm; int M; int N; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; double *work; - quark_unpack_args_7(quark, norm, M, N, A, LDA, work, normA); - CORE_zlange( norm, M, N, A, LDA, work, normA); + quark_unpack_args_6( quark, norm, M, N, tileA, work, tileNorm ); + TCORE_zlange( norm, M, N, tileA, work, tileNorm->mat ); } void INSERT_TASK_zlange(const RUNTIME_option_t *options, cham_normtype_t norm, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -52,21 +49,25 @@ void INSERT_TASK_zlange(const RUNTIME_option_t *options, sizeof(int), &norm, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - sizeof(CHAMELEON_Complex64_t)*NB*NB, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &LDA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(double)*szeW, NULL, SCRATCH, - sizeof(double), 
RTBLKADDR(B, double, Bm, Bn), OUTPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), OUTPUT, 0); } void CORE_zlange_max_quark(Quark *quark) { - double *A; - double *normA; + CHAM_tile_t *tileA; + CHAM_tile_t *tileNorm; + double *A, *norm; - quark_unpack_args_2(quark, A, normA); - if ( A[0] > *normA ) - *normA = A[0]; + quark_unpack_args_2(quark, tileA, tileNorm); + A = tileA->mat; + norm = tileNorm->mat; + + if ( A[0] > *norm ) { + *norm = A[0]; + } } void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, @@ -77,8 +78,8 @@ void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, DAG_CORE_LANGE_MAX; QUARK_Insert_Task( opt->quark, CORE_zlange_max_quark, (Quark_Task_Flags*)opt, - sizeof(double), RTBLKADDR(A, double, Am, An), INPUT, - sizeof(double), RTBLKADDR(B, double, Bm, Bn), OUTPUT, + sizeof(void*), RTBLKADDR(A, double, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlanhe.c b/runtime/quark/codelets/codelet_zlanhe.c index 8ff0c70d052ca78d8aaec61ff60ea4053850c5d6..a7c05e812c8ecfaefd1336800bf4d1b5776fe4da 100644 --- a/runtime/quark/codelets/codelet_zlanhe.c +++ b/runtime/quark/codelets/codelet_zlanhe.c @@ -12,8 +12,6 @@ * @brief Chameleon zlanhe Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -23,25 +21,24 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlanhe_quark(Quark *quark) { - double *normA; + CHAM_tile_t *tileNorm; cham_normtype_t norm; cham_uplo_t uplo; int N; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; double *work; - quark_unpack_args_7(quark, norm, uplo, N, A, LDA, work, normA); - CORE_zlanhe( norm, uplo, N, A, LDA, work, normA); + quark_unpack_args_6(quark, norm, uplo, N, tileA, work, 
tileNorm ); + TCORE_zlanhe( norm, uplo, N, tileA, work, tileNorm->mat ); } void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -52,9 +49,8 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, sizeof(int), &norm, VALUE, sizeof(int), &uplo, VALUE, sizeof(int), &N, VALUE, - sizeof(CHAMELEON_Complex64_t)*NB*NB, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &LDA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(double)*szeW, NULL, SCRATCH, - sizeof(double), RTBLKADDR(B, double, Bm, Bn), OUTPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlansy.c b/runtime/quark/codelets/codelet_zlansy.c index 1acef127cea59154897ff203b49ece22db649ff0..17aecc2320fdad11fac4e891bc1b6f1f0b8933d2 100644 --- a/runtime/quark/codelets/codelet_zlansy.c +++ b/runtime/quark/codelets/codelet_zlansy.c @@ -12,8 +12,6 @@ * @brief Chameleon zlansy Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -23,25 +21,24 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlansy_quark(Quark *quark) { - double *normA; + CHAM_tile_t *tileNorm; cham_normtype_t norm; cham_uplo_t uplo; int N; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; double *work; - quark_unpack_args_7(quark, norm, uplo, N, A, LDA, work, normA); - CORE_zlansy( norm, uplo, N, A, LDA, work, normA); + quark_unpack_args_6(quark, norm, uplo, N, tileA, work, tileNorm ); + TCORE_zlansy( norm, uplo, N, tileA, work, 
tileNorm->mat ); } void INSERT_TASK_zlansy(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -52,9 +49,8 @@ void INSERT_TASK_zlansy(const RUNTIME_option_t *options, sizeof(int), &norm, VALUE, sizeof(int), &uplo, VALUE, sizeof(int), &N, VALUE, - sizeof(CHAMELEON_Complex64_t)*NB*NB, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &LDA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(double)*szeW, NULL, SCRATCH, - sizeof(double), RTBLKADDR(B, double, Bm, Bn), OUTPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlantr.c b/runtime/quark/codelets/codelet_zlantr.c index 708f783292550191381cbb88f00581282e5c1b21..5a6bdb83333c47fe7fecf9f90da606f14f6d1443 100644 --- a/runtime/quark/codelets/codelet_zlantr.c +++ b/runtime/quark/codelets/codelet_zlantr.c @@ -12,8 +12,6 @@ * @brief Chameleon zlantr Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,26 +19,25 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlantr_quark(Quark *quark) { - double *normA; + CHAM_tile_t *tileNorm; cham_normtype_t norm, uplo, diag; int M; int N; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; double *work; - quark_unpack_args_9(quark, norm, uplo, diag, M, N, A, LDA, work, normA); - CORE_zlantr( norm, uplo, diag, M, N, A, LDA, work, normA); + quark_unpack_args_8(quark, norm, uplo, diag, M, N, tileA, work, tileNorm ); + TCORE_zlantr( norm, uplo, diag, M, N, tileA, work, tileNorm->mat ); } void 
INSERT_TASK_zlantr(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int LDA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -53,9 +50,8 @@ void INSERT_TASK_zlantr(const RUNTIME_option_t *options, sizeof(int), &diag, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - sizeof(CHAMELEON_Complex64_t)*NB*NB, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &LDA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(double)*szeW, NULL, SCRATCH, - sizeof(double), RTBLKADDR(B, double, Bm, Bn), OUTPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlascal.c b/runtime/quark/codelets/codelet_zlascal.c index 5f73570b9640091d0a08fc643e95dbf1f5de02af..4aebaf8d663b8d6589a91308a9adf60da7a0722b 100644 --- a/runtime/quark/codelets/codelet_zlascal.c +++ b/runtime/quark/codelets/codelet_zlascal.c @@ -12,8 +12,6 @@ * @brief Chameleon zlascal Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" static inline void CORE_zlascal_quark(Quark *quark) { @@ -33,18 +31,17 @@ static inline void CORE_zlascal_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; - quark_unpack_args_6(quark, uplo, M, N, alpha, A, LDA); - CORE_zlascal(uplo, M, N, alpha, A, LDA); + quark_unpack_args_5(quark, uplo, M, N, alpha, tileA); + TCORE_zlascal(uplo, M, N, alpha, tileA); } void INSERT_TASK_zlascal(const RUNTIME_option_t *options, 
cham_uplo_t uplo, int m, int n, int nb, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *A, int Am, int An) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LASCAL; @@ -53,8 +50,7 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlaset.c b/runtime/quark/codelets/codelet_zlaset.c index 5b35de50f76c80b488108b629b5946a8fec49d13..7c449a02fc8ed34753392c574f3c9bb0c4cc66b5 100644 --- a/runtime/quark/codelets/codelet_zlaset.c +++ b/runtime/quark/codelets/codelet_zlaset.c @@ -12,8 +12,6 @@ * @brief Chameleon zlaset Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Mathieu Faverge * @author Emmanuel Agullo @@ -24,7 +22,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlaset_quark(Quark *quark) { @@ -33,52 +31,16 @@ void CORE_zlaset_quark(Quark *quark) int N; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; - quark_unpack_args_7(quark, uplo, M, N, alpha, beta, A, LDA); - CORE_zlaset(uplo, M, N, alpha, beta, A, LDA); + quark_unpack_args_6(quark, uplo, M, N, alpha, beta, tileA); + TCORE_zlaset(uplo, M, N, alpha, beta, tileA); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zlaset - Sets the elements of the matrix A on the diagonal - * to beta and on the off-diagonals to alpha - * - ******************************************************************************* - * - * @param[in] 
uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: Upper part of A is set; - * = ChamLower: Lower part of A is set; - * = ChamUpperLower: ALL elements of A are set. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the off-diagonal elements are to be set. - * - * @param[in] beta - * The constant to which the diagonal elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set accordingly. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - */ void INSERT_TASK_zlaset(const RUNTIME_option_t *options, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - const CHAM_desc_t *A, int Am, int An, int LDA) + const CHAM_desc_t *A, int Am, int An) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LASET; @@ -88,7 +50,6 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options, sizeof(int), &N, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*LDA*N, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, - sizeof(int), &LDA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlaset2.c b/runtime/quark/codelets/codelet_zlaset2.c index aa4998a40f99bfe01ad9e16763d61fe3607762a8..b8acfd2680a8d958c8384411af9562d725282e30 100644 --- a/runtime/quark/codelets/codelet_zlaset2.c +++ b/runtime/quark/codelets/codelet_zlaset2.c @@ -12,8 +12,6 @@ * @brief Chameleon zlaset2 Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Mathieu Faverge * @author Emmanuel Agullo @@ -24,7 +22,7 @@ */ #include "chameleon_quark.h" #include 
"chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlaset2_quark(Quark *quark) { @@ -32,49 +30,15 @@ void CORE_zlaset2_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; - quark_unpack_args_6(quark, uplo, M, N, alpha, A, LDA); - CORE_zlaset2(uplo, M, N, alpha, A, LDA); + quark_unpack_args_5(quark, uplo, M, N, alpha, tileA); + TCORE_zlaset2(uplo, M, N, alpha, tileA); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zlaset2 - Sets the elements of the matrix A to alpha. - * Not LAPACK compliant! Read below. - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: STRICT Upper part of A is set to alpha; - * = ChamLower: STRICT Lower part of A is set to alpha; - * = ChamUpperLower: ALL elements of A are set to alpha. - * Not LAPACK Compliant. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set to alpha accordingly. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). 
- * - */ void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int LDA) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LASET; @@ -83,7 +47,6 @@ void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*M*N, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, - sizeof(int), &LDA, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlatro.c b/runtime/quark/codelets/codelet_zlatro.c index 220a0c6f882bfb4a0060bf405cd5d42047b9117f..055bee94c90a740c8b200445949371cf6104834f 100644 --- a/runtime/quark/codelets/codelet_zlatro.c +++ b/runtime/quark/codelets/codelet_zlatro.c @@ -19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlatro_quark(Quark *quark) { @@ -27,20 +27,18 @@ void CORE_zlatro_quark(Quark *quark) cham_trans_t trans; int M; int N; - const CHAMELEON_Complex64_t *A; - int LDA; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - quark_unpack_args_8(quark, uplo, trans, M, N, A, LDA, B, LDB); - CORE_zlatro(uplo, trans, M, N, A, LDA, B, LDB); + quark_unpack_args_6(quark, uplo, trans, M, N, tileA, tileB); + TCORE_zlatro(uplo, trans, M, N, tileA, tileB); } void INSERT_TASK_zlatro(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -49,9 +47,7 @@ void 
INSERT_TASK_zlatro(const RUNTIME_option_t *options, sizeof(int), &trans, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*mb*mb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*mb*mb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), OUTPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zlauum.c b/runtime/quark/codelets/codelet_zlauum.c index 17022ca6ad39b7e51ecf0e2546bc22bd6773e47b..356fb8adcb24876f9aa2e527aafb3167c0fa52f4 100644 --- a/runtime/quark/codelets/codelet_zlauum.c +++ b/runtime/quark/codelets/codelet_zlauum.c @@ -12,8 +12,6 @@ * @brief Chameleon zlauum Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -25,29 +23,27 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zlauum_quark(Quark *quark) { cham_uplo_t uplo; int N; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; - quark_unpack_args_4(quark, uplo, N, A, LDA); - CORE_zlauum(uplo, N, A, LDA); + quark_unpack_args_3(quark, uplo, N, tileA); + TCORE_zlauum(uplo, N, tileA); } void INSERT_TASK_zlauum(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda) + const CHAM_desc_t *A, int Am, int An) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LAUUM; QUARK_Insert_Task(opt->quark, CORE_zlauum_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + 
sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zplghe.c b/runtime/quark/codelets/codelet_zplghe.c index b57d82c8a40bad4fad8e9db3153ac8832affd21e..eab1f625590a1782b236e7be4e78ebddf1e47a45 100644 --- a/runtime/quark/codelets/codelet_zplghe.c +++ b/runtime/quark/codelets/codelet_zplghe.c @@ -12,8 +12,6 @@ * @brief Chameleon zplghe Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -25,26 +23,25 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zplghe_quark(Quark *quark) { double bump; int m; int n; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; int bigM; int m0; int n0; unsigned long long int seed; - quark_unpack_args_9( quark, bump, m, n, A, lda, bigM, m0, n0, seed ); - CORE_zplghe( bump, m, n, A, lda, bigM, m0, n0, seed ); + quark_unpack_args_8( quark, bump, m, n, tileA, bigM, m0, n0, seed ); + TCORE_zplghe( bump, m, n, tileA, bigM, m0, n0, seed ); } void INSERT_TASK_zplghe( const RUNTIME_option_t *options, - double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -53,8 +50,7 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, sizeof(double), &bump, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, sizeof(int), &bigM, VALUE, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, diff --git a/runtime/quark/codelets/codelet_zplgsy.c 
b/runtime/quark/codelets/codelet_zplgsy.c index 60eee5385aa74edecb2773eeabc8aa7e1038aa45..8ed9175fc575ee5564b55c59751d0677ad0cc118 100644 --- a/runtime/quark/codelets/codelet_zplgsy.c +++ b/runtime/quark/codelets/codelet_zplgsy.c @@ -12,8 +12,6 @@ * @brief Chameleon zplgsy Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -25,26 +23,25 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zplgsy_quark(Quark *quark) { CHAMELEON_Complex64_t bump; int m; int n; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; int bigM; int m0; int n0; unsigned long long int seed; - quark_unpack_args_9( quark, bump, m, n, A, lda, bigM, m0, n0, seed ); - CORE_zplgsy( bump, m, n, A, lda, bigM, m0, n0, seed ); + quark_unpack_args_8( quark, bump, m, n, tileA, bigM, m0, n0, seed ); + TCORE_zplgsy( bump, m, n, tileA, bigM, m0, n0, seed ); } void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -53,8 +50,7 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, sizeof(CHAMELEON_Complex64_t), &bump, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, sizeof(int), &bigM, VALUE, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, diff --git a/runtime/quark/codelets/codelet_zplrnt.c b/runtime/quark/codelets/codelet_zplrnt.c index 
455c0a8d01288ab104ca9b8f5aeea129f39c37b1..9f84ba3a47b156cd29d3e0ac076daaa88e3bb0cb 100644 --- a/runtime/quark/codelets/codelet_zplrnt.c +++ b/runtime/quark/codelets/codelet_zplrnt.c @@ -12,8 +12,6 @@ * @brief Chameleon zplrnt Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek * @author Pierre Lemarinier * @author Mathieu Faverge @@ -25,25 +23,24 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zplrnt_quark(Quark *quark) { int m; int n; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; int bigM; int m0; int n0; unsigned long long int seed; - quark_unpack_args_8( quark, m, n, A, lda, bigM, m0, n0, seed ); - CORE_zplrnt( m, n, A, lda, bigM, m0, n0, seed ); + quark_unpack_args_7( quark, m, n, tileA, bigM, m0, n0, seed ); + TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed ); } void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, + int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -51,8 +48,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, QUARK_Insert_Task(opt->quark, CORE_zplrnt_quark, (Quark_Task_Flags*)opt, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), OUTPUT, sizeof(int), &bigM, VALUE, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c index b340873af51c0e8fae9659def50170eecdb16f83..bc985a5a3e79eb400273d63185c96946d631d85c 100644 --- a/runtime/quark/codelets/codelet_zplssq.c +++ 
b/runtime/quark/codelets/codelet_zplssq.c @@ -12,8 +12,6 @@ * @brief Chameleon zplssq Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -22,76 +20,36 @@ #include <math.h> #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zplssq_quark(Quark *quark) { cham_store_t storev; int M; int N; - double *SCLSSQ_IN; - double *SCLSSQ_OUT; + CHAM_tile_t *tileIN; + CHAM_tile_t *tileOUT; - quark_unpack_args_5( quark, storev, M, N, SCLSSQ_IN, SCLSSQ_OUT ); + quark_unpack_args_5( quark, storev, M, N, tileIN, tileOUT ); - CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT); + assert( tileIN->format & CHAMELEON_TILE_FULLRANK ); + assert( tileOUT->format & CHAMELEON_TILE_FULLRANK ); + CORE_zplssq( storev, M, N, tileIN->mat, tileOUT->mat ); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq) - * - * with scl and ssq such that - * - * ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) ) - * i - * - * The values of A(2*i+1) are assumed to be at least unity. - * The values of A(2*i) are assumed to be non-negative and scl is - * - * scl = max( A( 2*i ) ), - * i - * - * The routine makes only one pass through the matrix A. - * - ******************************************************************************* - * - * @param[in] M - * The number of couple (scale, sumsq) in the matrix A. - * - * @param[in] A - * The 2-by-M matrix. 
- * - * @param[out] result - * On exit, result contains scl * sqrt( ssq ) - * - */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, cham_store_t storev, int M, int N, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, - const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) + const CHAM_desc_t *IN, int INm, int INn, + const CHAM_desc_t *OUT, int OUTm, int OUTn ) { - int sizein = 2*M*N; - int sizeout; - - if ( storev == ChamColumnwise ) { - sizeout = 2*N; - } else if ( storev == ChamRowwise ) { - sizeout = 2*M; - } else { - sizeout = 2; - } - quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_PLSSQ; QUARK_Insert_Task(opt->quark, CORE_zplssq_quark, (Quark_Task_Flags*)opt, sizeof(int), &storev, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - sizeof(double)*sizein, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INPUT, - sizeof(double)*sizeout, RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn), INOUT, + sizeof(void*), RTBLKADDR(IN, double, INm, INn), INPUT, + sizeof(void*), RTBLKADDR(OUT, double, OUTm, OUTn), INOUT, 0); } @@ -99,11 +57,12 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, void CORE_zplssq2_quark(Quark *quark) { int N; - double *RESULT; + CHAM_tile_t *RESULT; - quark_unpack_args_2( quark, N, RESULT ); + quark_unpack_args_2( quark, N, RESULT ); - CORE_zplssq2(N, RESULT); + assert( RESULT->format & CHAMELEON_TILE_FULLRANK ); + CORE_zplssq2(N, RESULT->mat); } void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, @@ -113,6 +72,6 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, DAG_CORE_PLSSQ2; QUARK_Insert_Task(opt->quark, CORE_zplssq2_quark, (Quark_Task_Flags*)opt, sizeof(int), &N, VALUE, - sizeof(double)*2*N, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT, + sizeof(void*), RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zpotrf.c b/runtime/quark/codelets/codelet_zpotrf.c index 
b3bb0256c30730c23c091e891d7d4de88594158e..7b8df563fd922f233ed3d370150659caca144b50 100644 --- a/runtime/quark/codelets/codelet_zpotrf.c +++ b/runtime/quark/codelets/codelet_zpotrf.c @@ -12,8 +12,6 @@ * @brief Chameleon zpotrf Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,22 +23,21 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zpotrf_quark(Quark *quark) { cham_uplo_t uplo; int n; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int iinfo; int info; - quark_unpack_args_7(quark, uplo, n, A, lda, sequence, request, iinfo); - CORE_zpotrf(uplo, n, A, lda, &info); + quark_unpack_args_6(quark, uplo, n, tileA, sequence, request, iinfo); + TCORE_zpotrf(uplo, n, tileA, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); } @@ -48,7 +45,7 @@ void CORE_zpotrf_quark(Quark *quark) void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -56,8 +53,7 @@ void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, QUARK_Insert_Task(opt->quark, CORE_zpotrf_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, sizeof(int), &iinfo, VALUE, diff 
--git a/runtime/quark/codelets/codelet_zssssm.c b/runtime/quark/codelets/codelet_zssssm.c index bc5b3ae5caee9fbc235c415b92d9ea2dd8185be2..83ec806967f7fe58bb03a0ca5f2126591498416b 100644 --- a/runtime/quark/codelets/codelet_zssssm.c +++ b/runtime/quark/codelets/codelet_zssssm.c @@ -12,8 +12,6 @@ * @brief Chameleon zssssm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -26,7 +24,7 @@ #include "coreblas/cblas.h" #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zssssm_quark(Quark *quark) { @@ -36,95 +34,22 @@ void CORE_zssssm_quark(Quark *quark) int n2; int k; int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *L1; - int ldl1; - CHAMELEON_Complex64_t *L2; - int ldl2; + CHAM_tile_t *tileA1; + CHAM_tile_t *tileA2; + CHAM_tile_t *tileL1; + CHAM_tile_t *tileL2; int *IPIV; - quark_unpack_args_15(quark, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV); - CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV); + quark_unpack_args_11(quark, m1, n1, m2, n2, k, ib, tileA1, tileA2, tileL1, tileL2, IPIV); + TCORE_zssssm(m1, n1, m2, n2, k, ib, tileA1, tileA2, tileL1, tileL2, IPIV); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zssssm applies the LU factorization update from a complex - * matrix formed by a lower triangular IB-by-K tile L1 on top of a - * M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 - * tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. 
- * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2 and of the tile L2. - * M2 >= 0. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * - * @param[in] K - * The number of columns of the tiles L1 and L2. K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is updated by the application of L (L1 L2). - * - * @param[in] LDA1 - * The leading dimension of the array A1. LDA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is updated by the application of L (L1 L2). - * - * @param[in] LDA2 - * The leading dimension of the array A2. LDA2 >= max(1,M2). - * - * @param[in] L1 - * The IB-by-K lower triangular tile as returned by - * CORE_ztstrf. - * - * @param[in] LDL1 - * The leading dimension of the array L1. LDL1 >= max(1,IB). - * - * @param[in] L2 - * The M2-by-K tile as returned by CORE_ztstrf. - * - * @param[in] LDL2 - * The leading dimension of the array L2. LDL2 >= max(1,M2). - * - * @param[in] IPIV - * The pivot indices array of size K as returned by - * CORE_ztstrf. 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * - */ void INSERT_TASK_zssssm(const RUNTIME_option_t *options, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *L1, int L1m, int L1n, int ldl1, - const CHAM_desc_t *L2, int L2m, int L2n, int ldl2, + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *L1, int L1m, int L1n, + const CHAM_desc_t *L2, int L2m, int L2n, const int *IPIV) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -136,14 +61,10 @@ void INSERT_TASK_zssssm(const RUNTIME_option_t *options, sizeof(int), &n2, VALUE, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(L1, CHAMELEON_Complex64_t, L1m, L1n), INPUT, - sizeof(int), &ldl1, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(L2, CHAMELEON_Complex64_t, L2m, L2n), INPUT, - sizeof(int), &ldl2, VALUE, + sizeof(void*), RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT, + sizeof(void*), RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY, + sizeof(void*), RTBLKADDR(L1, CHAMELEON_Complex64_t, L1m, L1n), INPUT, + sizeof(void*), RTBLKADDR(L2, CHAMELEON_Complex64_t, L2m, L2n), INPUT, sizeof(int)*nb, IPIV, INPUT, 0); } diff --git a/runtime/quark/codelets/codelet_zsymm.c b/runtime/quark/codelets/codelet_zsymm.c index acb3cd4491120b222e45709ba40158e55d966138..b1965360c3b1afb0fc1c313a20a7c63ae5e95990 100644 --- 
a/runtime/quark/codelets/codelet_zsymm.c +++ b/runtime/quark/codelets/codelet_zsymm.c @@ -12,8 +12,6 @@ * @brief Chameleon zsymm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zsymm_quark(Quark *quark) { @@ -34,28 +32,25 @@ void CORE_zsymm_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int LDC; + CHAM_tile_t *tileC; - quark_unpack_args_12(quark, side, uplo, M, N, alpha, A, LDA, B, LDB, beta, C, LDC); - CORE_zsymm(side, uplo, + quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC); + TCORE_zsymm(side, uplo, M, N, - alpha, A, LDA, - B, LDB, - beta, C, LDC); + alpha, tileA, + tileB, + beta, tileC); } void INSERT_TASK_zsymm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_SYMM; @@ -65,12 +60,9 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, 
VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zsyr2k.c b/runtime/quark/codelets/codelet_zsyr2k.c index 66da8d9c6853843f4d89ad5efbaa8676af61133f..d0306a94284854a10ca5f2eaed20a0de8f8f52d3 100644 --- a/runtime/quark/codelets/codelet_zsyr2k.c +++ b/runtime/quark/codelets/codelet_zsyr2k.c @@ -12,8 +12,6 @@ * @brief Chameleon zsyr2k Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zsyr2k_quark(Quark *quark) { @@ -34,25 +32,22 @@ void CORE_zsyr2k_quark(Quark *quark) int n; int k; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileC; - quark_unpack_args_12(quark, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - CORE_zsyr2k(uplo, trans, - n, k, alpha, A, lda, B, ldb, beta, C, ldc); + quark_unpack_args_9(quark, uplo, trans, n, k, alpha, tileA, tileB, beta, tileC); + TCORE_zsyr2k(uplo, trans, + n, k, alpha, tileA, tileB, beta, tileC); } void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, 
const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_SYR2K; @@ -62,12 +57,9 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zsyrk.c b/runtime/quark/codelets/codelet_zsyrk.c index d32372f7af30f5f9dfee03c4f025891078017e1b..5add74bf7f386a53a2d986ca9ea07dec434bcbbc 100644 --- a/runtime/quark/codelets/codelet_zsyrk.c +++ b/runtime/quark/codelets/codelet_zsyrk.c @@ -12,8 +12,6 @@ * @brief Chameleon zsyrk Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zsyrk_quark(Quark *quark) { @@ -34,24 +32,22 @@ void CORE_zsyrk_quark(Quark *quark) int n; int k; 
CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileC; - quark_unpack_args_10(quark, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); - CORE_zsyrk(uplo, trans, + quark_unpack_args_8(quark, uplo, trans, n, k, alpha, tileA, beta, tileC); + TCORE_zsyrk(uplo, trans, n, k, - alpha, A, lda, - beta, C, ldc); + alpha, tileA, + beta, tileC); } void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_SYRK; @@ -61,10 +57,8 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zsyssq.c b/runtime/quark/codelets/codelet_zsyssq.c index 913424820fd8e01995ef2d889ebd97ac791758cc..8636e3a69f56698be7feecc081b7ba891824be8d 100644 --- a/runtime/quark/codelets/codelet_zsyssq.c +++ b/runtime/quark/codelets/codelet_zsyssq.c @@ -12,8 +12,6 @@ * @brief Chameleon zsyssq Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 
* @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,42 +19,32 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zsyssq_quark(Quark *quark) { cham_store_t storev; cham_uplo_t uplo; int n; - CHAMELEON_Complex64_t *A; - int lda; - double *SCALESUMSQ; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - quark_unpack_args_6( quark, storev, uplo, n, A, lda, SCALESUMSQ ); - CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ ); + quark_unpack_args_5( quark, storev, uplo, n, tileA, tileW ); + TCORE_zsyssq( storev, uplo, n, tileA, tileW ); } void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - int sizessq; - - if ( storev == ChamEltwise ) { - sizessq = 2; - } else { - sizessq = 2*n; - } - quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_SYSSQ; QUARK_Insert_Task(opt->quark, CORE_zsyssq_quark, (Quark_Task_Flags*)opt, sizeof(cham_store_t), &storev, VALUE, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*n*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(double)*sizessq, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zsytrf_nopiv.c b/runtime/quark/codelets/codelet_zsytrf_nopiv.c index cf41a06c0130af5d15f88df5a1796df115aa2e5a..5e3922ec1469fa643d6615ddad251eebc0225854 100644 --- a/runtime/quark/codelets/codelet_zsytrf_nopiv.c +++ b/runtime/quark/codelets/codelet_zsytrf_nopiv.c @@ -25,21 +25,20 @@ */ #include "chameleon_quark.h" #include 
"chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zsytrf_nopiv_quark(Quark *quark) { cham_uplo_t uplo; int n; - CHAMELEON_Complex64_t *A; - int lda; + CHAM_tile_t *tileA; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int iinfo; int info = 0; - quark_unpack_args_7(quark, uplo, n, A, lda, sequence, request, iinfo); - info = CORE_zsytf2_nopiv(uplo, n, A, lda); + quark_unpack_args_6(quark, uplo, n, tileA, sequence, request, iinfo); + info = TCORE_zsytf2_nopiv(uplo, n, tileA); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); } @@ -47,7 +46,7 @@ void CORE_zsytrf_nopiv_quark(Quark *quark) void INSERT_TASK_zsytrf_nopiv(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -55,8 +54,7 @@ void INSERT_TASK_zsytrf_nopiv(const RUNTIME_option_t *options, QUARK_Insert_Task(opt->quark, CORE_zsytrf_nopiv_quark, (Quark_Task_Flags*)opt, sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, sizeof(int), &iinfo, VALUE, diff --git a/runtime/quark/codelets/codelet_ztplqt.c b/runtime/quark/codelets/codelet_ztplqt.c index e151f0a9c69effb189d71aca391e9ce392a665e5..1b31544c6b12655143083cd2581daa24a70904d5 100644 --- a/runtime/quark/codelets/codelet_ztplqt.c +++ b/runtime/quark/codelets/codelet_ztplqt.c @@ -19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" static void 
CORE_ztplqt_quark( Quark *quark ) @@ -28,27 +28,24 @@ CORE_ztplqt_quark( Quark *quark ) int N; int L; int ib; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; - CHAMELEON_Complex64_t *T; - int ldt; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileT; CHAMELEON_Complex64_t *WORK; - quark_unpack_args_11( quark, M, N, L, ib, - A, lda, B, ldb, T, ldt, WORK ); + quark_unpack_args_8( quark, M, N, L, ib, + tileA, tileB, tileT, WORK ); - CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt ); - CORE_ztplqt( M, N, L, ib, - A, lda, B, ldb, T, ldt, WORK ); + TCORE_zlaset( ChamUpperLower, ib, N, 0., 0., tileT ); + TCORE_ztplqt( M, N, L, ib, + tileA, tileB, tileT, WORK ); } void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TPLQT; @@ -61,12 +58,9 @@ void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, sizeof(int), &N, VALUE, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT | QUARK_REGION_L | QUARK_REGION_D, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | shapeB | LOCALITY, - sizeof(int), &ldb, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*ib, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(void*), RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT | QUARK_REGION_L | QUARK_REGION_D, + sizeof(void*), RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | shapeB | LOCALITY, + sizeof(void*), RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), OUTPUT, 
sizeof(CHAMELEON_Complex64_t)*(ib+1)*nb, NULL, SCRATCH, 0); } diff --git a/runtime/quark/codelets/codelet_ztpmlqt.c b/runtime/quark/codelets/codelet_ztpmlqt.c index d2ccd5fa18c8cb6da6598193278f0b2cbde3cd02..4fe5df16194e2ceae29f2ccfb19707a613dc4f33 100644 --- a/runtime/quark/codelets/codelet_ztpmlqt.c +++ b/runtime/quark/codelets/codelet_ztpmlqt.c @@ -19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" static void CORE_ztpmlqt_quark( Quark *quark ) @@ -31,30 +31,26 @@ CORE_ztpmlqt_quark( Quark *quark ) int K; int L; int ib; - const CHAMELEON_Complex64_t *V; - int ldv; - const CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t *WORK; - quark_unpack_args_16( quark, side, trans, M, N, K, L, ib, - V, ldv, T, ldt, A, lda, B, ldb, WORK ); + quark_unpack_args_12( quark, side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, WORK ); - CORE_ztpmlqt( side, trans, M, N, K, L, ib, - V, ldv, T, ldt, A, lda, B, ldb, WORK ); + TCORE_ztpmlqt( side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, WORK ); } void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TPMLQT; @@ -70,14 +66,10 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, sizeof(int), &K, VALUE, sizeof(int), &L, VALUE, 
sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), INPUT | shapeV, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | LOCALITY, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), INPUT | shapeV, + sizeof(void*), RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), INPUT, + sizeof(void*), RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT, + sizeof(void*), RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | LOCALITY, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, 0); } diff --git a/runtime/quark/codelets/codelet_ztpmqrt.c b/runtime/quark/codelets/codelet_ztpmqrt.c index ac396d88ad5247bbcac415351be9faa4daa6741d..a7670ec15cb603dcbfc120844b662a6641fc8813 100644 --- a/runtime/quark/codelets/codelet_ztpmqrt.c +++ b/runtime/quark/codelets/codelet_ztpmqrt.c @@ -19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" static void CORE_ztpmqrt_quark( Quark *quark ) @@ -31,30 +31,26 @@ CORE_ztpmqrt_quark( Quark *quark ) int K; int L; int ib; - const CHAMELEON_Complex64_t *V; - int ldv; - const CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t *WORK; - quark_unpack_args_16( quark, side, trans, M, N, K, L, ib, - V, ldv, T, ldt, A, lda, B, ldb, WORK ); + quark_unpack_args_12( quark, side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, WORK ); - CORE_ztpmqrt( side, trans, M, N, K, L, ib, - V, ldv, T, 
ldt, A, lda, B, ldb, WORK ); + TCORE_ztpmqrt( side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, WORK ); } void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TPMQRT; @@ -70,14 +66,10 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, sizeof(int), &K, VALUE, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), INPUT | shapeV, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | LOCALITY, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), INPUT | shapeV, + sizeof(void*), RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), INPUT, + sizeof(void*), RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT, + sizeof(void*), RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | LOCALITY, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, 0); } diff --git a/runtime/quark/codelets/codelet_ztpqrt.c b/runtime/quark/codelets/codelet_ztpqrt.c index 814300de138185d20c137a34843010900d473e17..efecbd2be7a42bd4c859322c2002758ebe706ad4 100644 --- a/runtime/quark/codelets/codelet_ztpqrt.c +++ b/runtime/quark/codelets/codelet_ztpqrt.c @@ 
-19,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" static void CORE_ztpqrt_quark( Quark *quark ) @@ -28,27 +28,24 @@ CORE_ztpqrt_quark( Quark *quark ) int N; int L; int ib; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; - CHAMELEON_Complex64_t *T; - int ldt; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileT; CHAMELEON_Complex64_t *WORK; - quark_unpack_args_11( quark, M, N, L, ib, - A, lda, B, ldb, T, ldt, WORK ); + quark_unpack_args_8( quark, M, N, L, ib, + tileA, tileB, tileT, WORK ); - CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt ); - CORE_ztpqrt( M, N, L, ib, - A, lda, B, ldb, T, ldt, WORK ); + TCORE_zlaset( ChamUpperLower, ib, N, 0., 0., tileT ); + TCORE_ztpqrt( M, N, L, ib, + tileA, tileB, tileT, WORK ); } void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb, - const CHAM_desc_t *T, int Tm, int Tn, int ldt ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TPQRT; @@ -61,12 +58,9 @@ void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, sizeof(int), &N, VALUE, sizeof(int), &L, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT | QUARK_REGION_U | QUARK_REGION_D, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | shapeB | LOCALITY, - sizeof(int), &ldb, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*ib, RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), OUTPUT, - sizeof(int), &ldt, VALUE, + sizeof(void*), RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), INOUT | QUARK_REGION_U | QUARK_REGION_D, + 
sizeof(void*), RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), INOUT | shapeB | LOCALITY, + sizeof(void*), RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), OUTPUT, sizeof(CHAMELEON_Complex64_t)*(ib+1)*nb, NULL, SCRATCH, 0); } diff --git a/runtime/quark/codelets/codelet_ztradd.c b/runtime/quark/codelets/codelet_ztradd.c index d804d95990d42441021b903291179a59261f2c90..c2f99242d9ba3e7fdfed0ecedf3c9b516452a27e 100644 --- a/runtime/quark/codelets/codelet_ztradd.c +++ b/runtime/quark/codelets/codelet_ztradd.c @@ -12,8 +12,6 @@ * @brief Chameleon ztradd Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2015-11-03 * @precisions normal z -> c d s @@ -21,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztradd_quark(Quark *quark) { @@ -30,14 +28,12 @@ void CORE_ztradd_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileB; - quark_unpack_args_10(quark, uplo, trans, M, N, alpha, A, LDA, beta, B, LDB); - CORE_ztradd(uplo, trans, M, N, alpha, A, LDA, beta, B, LDB); + quark_unpack_args_8(quark, uplo, trans, M, N, alpha, tileA, beta, tileB); + TCORE_ztradd(uplo, trans, M, N, alpha, tileA, beta, tileB); return; } @@ -102,8 +98,8 @@ void CORE_ztradd_quark(Quark *quark) */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldb ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { quark_option_t *opt = 
(quark_option_t*)(options->schedopt); DAG_CORE_GEADD; @@ -113,11 +109,9 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(CHAMELEON_Complex64_t), &beta, VALUE, - sizeof(CHAMELEON_Complex64_t)*ldb*n, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, 0); (void)nb; diff --git a/runtime/quark/codelets/codelet_ztrasm.c b/runtime/quark/codelets/codelet_ztrasm.c index bed2262809381256aa3c9c731df359b11803120f..5ce77d8b1c1df571a7818bbf2ae9dfd9d9172e28 100644 --- a/runtime/quark/codelets/codelet_ztrasm.c +++ b/runtime/quark/codelets/codelet_ztrasm.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrasm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztrasm_quark(Quark *quark) { @@ -30,29 +28,27 @@ void CORE_ztrasm_quark(Quark *quark) cham_diag_t diag; int M; int N; - CHAMELEON_Complex64_t *A; - int lda; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - quark_unpack_args_8(quark, storev, uplo, diag, M, N, A, lda, work); - CORE_ztrasm(storev, uplo, diag, M, N, A, lda, work); + quark_unpack_args_7( quark, storev, uplo, diag, M, N, tileA, tileW ); + TCORE_ztrasm( storev, uplo, diag, M, N, tileA, tileW->mat ); } void INSERT_TASK_ztrasm(const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, - const CHAM_desc_t *A, int Am, 
int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); - int szeW = storev == ChamRowwise ? M : N ; + QUARK_Insert_Task(opt->quark, CORE_ztrasm_quark, (Quark_Task_Flags*)opt, sizeof(int), &storev, VALUE, sizeof(int), &uplo, VALUE, sizeof(int), &diag, VALUE, sizeof(int), &M, VALUE, sizeof(int), &N, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*N, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(double)*szeW, RTBLKADDR(B, double, Bm, Bn), INOUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, double, Bm, Bn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_ztrmm.c b/runtime/quark/codelets/codelet_ztrmm.c index 5697128bf89e59fc63a4f70cd364367b45e75e92..ce869b781c6da7238d922e4464a5d197e9a99db1 100644 --- a/runtime/quark/codelets/codelet_ztrmm.c +++ b/runtime/quark/codelets/codelet_ztrmm.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrmm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztrmm_quark(Quark *quark) { @@ -36,24 +34,22 @@ void CORE_ztrmm_quark(Quark *quark) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int LDA; - CHAMELEON_Complex64_t *B; - int LDB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - quark_unpack_args_11(quark, side, uplo, transA, diag, M, N, alpha, A, LDA, B, LDB); - CORE_ztrmm(side, uplo, + quark_unpack_args_9(quark, side, uplo, transA, diag, M, N, alpha, tileA, tileB); + TCORE_ztrmm(side, uplo, transA, diag, M, N, - alpha, A, LDA, - B, LDB); + alpha, tileA, + tileB); } void INSERT_TASK_ztrmm(const 
RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TRMM; @@ -65,9 +61,7 @@ void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_ztrsm.c b/runtime/quark/codelets/codelet_ztrsm.c index ce15a4ca1c890f446e5c88866e1b72859380777c..5c9b068502a4f8a65bc2cd60c43f6c727b9ab802 100644 --- a/runtime/quark/codelets/codelet_ztrsm.c +++ b/runtime/quark/codelets/codelet_ztrsm.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrsm Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztrsm_quark(Quark *quark) { @@ -36,24 +34,22 @@ void CORE_ztrsm_quark(Quark *quark) int m; int n; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *B; - int ldb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - quark_unpack_args_11(quark, side, uplo, transA, diag, m, n, alpha, A, 
lda, B, ldb); - CORE_ztrsm(side, uplo, + quark_unpack_args_9(quark, side, uplo, transA, diag, m, n, alpha, tileA, tileB); + TCORE_ztrsm(side, uplo, transA, diag, m, n, - alpha, A, lda, - B, ldb); + alpha, tileA, + tileB); } void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_TRSM; @@ -65,9 +61,7 @@ void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t), &alpha, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, - sizeof(int), &ldb, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_ztrssq.c b/runtime/quark/codelets/codelet_ztrssq.c index 80bb9e78c3e41fff60f96fea3d9e9b768a25d65d..fc9b34497067b430a9a527f312ed0e746ebbf9e9 100644 --- a/runtime/quark/codelets/codelet_ztrssq.c +++ b/runtime/quark/codelets/codelet_ztrssq.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrssq Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @date 2014-11-16 * @precisions normal z -> c d s @@ -21,7 +19,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztrssq_quark(Quark *quark) { @@ -29,18 +27,17 @@ void 
CORE_ztrssq_quark(Quark *quark) cham_diag_t diag; int m; int n; - CHAMELEON_Complex64_t *A; - int lda; - double *SCALESUMSQ; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - quark_unpack_args_7( quark, uplo, diag, m, n, A, lda, SCALESUMSQ ); - CORE_ztrssq( uplo, diag, m, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]); + quark_unpack_args_6( quark, uplo, diag, m, n, tileA, tileW ); + TCORE_ztrssq( uplo, diag, m, n, tileA, tileW ); } void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int m, int n, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -50,8 +47,7 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, sizeof(int), &diag, VALUE, sizeof(int), &m, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, + sizeof(void*), RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_ztrtri.c b/runtime/quark/codelets/codelet_ztrtri.c index edc02bc7f2ab8cbfb3d66238834b55cc0f7b54a6..650dc8d19849ae5c8b4ceb47a16035cee6af42b3 100644 --- a/runtime/quark/codelets/codelet_ztrtri.c +++ b/runtime/quark/codelets/codelet_ztrtri.c @@ -12,8 +12,6 @@ * @brief Chameleon ztrtri Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou * @author Henricus Bouwmeester * @author Mathieu Faverge @@ -25,23 +23,22 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztrtri_quark(Quark *quark) { 
cham_uplo_t uplo; cham_diag_t diag; int N; - CHAMELEON_Complex64_t *A; - int LDA; + CHAM_tile_t *tileA; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int iinfo; int info; - quark_unpack_args_8(quark, uplo, diag, N, A, LDA, sequence, request, iinfo); - CORE_ztrtri(uplo, diag, N, A, LDA, &info); + quark_unpack_args_7(quark, uplo, diag, N, tileA, sequence, request, iinfo); + TCORE_ztrtri(uplo, diag, N, tileA, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info > 0) ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); } @@ -50,7 +47,7 @@ void CORE_ztrtri_quark(Quark *quark) void INSERT_TASK_ztrtri(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *A, int Am, int An, int iinfo) { quark_option_t *opt = (quark_option_t*)(options->schedopt); @@ -59,8 +56,7 @@ void INSERT_TASK_ztrtri(const RUNTIME_option_t *options, sizeof(int), &uplo, VALUE, sizeof(int), &diag, VALUE, sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, - sizeof(int), &lda, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT, sizeof(RUNTIME_sequence_t*), &(options->sequence), VALUE, sizeof(RUNTIME_request_t*), &(options->request), VALUE, sizeof(int), &iinfo, VALUE, diff --git a/runtime/quark/codelets/codelet_ztsmlq_hetra1.c b/runtime/quark/codelets/codelet_ztsmlq_hetra1.c index 6b68cfed4e9c7134b8d0bf99730f6c5a561e9c83..4349c864f29056ae4e7d01e386069f9d5479001a 100644 --- a/runtime/quark/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/quark/codelets/codelet_ztsmlq_hetra1.c @@ -22,7 +22,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztsmlq_hetra1_quark(Quark *quark) { @@ -34,28 +34,25 @@ void CORE_ztsmlq_hetra1_quark(Quark *quark) int n2; int k; int ib; - 
CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; + CHAM_tile_t *tileA1; + CHAM_tile_t *tileA2; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; CHAMELEON_Complex64_t *WORK; int ldwork; - quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + quark_unpack_args_14(quark, side, trans, m1, n1, m2, n2, k, ib, tileA1, tileA2, tileV, tileT, WORK, ldwork); + TCORE_ztsmlq_hetra1( side, trans, m1, n1, m2, n2, k, ib, + tileA1, tileA2, tileV, tileT, WORK, ldwork ); } void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); int ldwork = side == ChamLeft ? 
ib : nb; @@ -69,14 +66,10 @@ void INSERT_TASK_ztsmlq_hetra1(const RUNTIME_option_t *options, sizeof(int), &n2, VALUE, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT|QUARK_REGION_U|QUARK_REGION_D, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, + sizeof(void*), RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT|QUARK_REGION_U|QUARK_REGION_D, + sizeof(void*), RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT, + sizeof(void*), RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &ldwork, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_ztsmqr_hetra1.c b/runtime/quark/codelets/codelet_ztsmqr_hetra1.c index 879302364314166cf8f86d2fe98b820cdd8b0aea..934338eea8e50313b75b15e3a053ce9a0e24d3e0 100644 --- a/runtime/quark/codelets/codelet_ztsmqr_hetra1.c +++ b/runtime/quark/codelets/codelet_ztsmqr_hetra1.c @@ -22,7 +22,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_ztsmqr_hetra1_quark(Quark *quark) { @@ -34,28 +34,24 @@ void CORE_ztsmqr_hetra1_quark(Quark *quark) int n2; int k; int ib; - CHAMELEON_Complex64_t *A1; - int lda1; - CHAMELEON_Complex64_t *A2; - int lda2; - CHAMELEON_Complex64_t *V; - int ldv; - CHAMELEON_Complex64_t *T; - int ldt; + CHAM_tile_t *tileA1; + CHAM_tile_t *tileA2; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; CHAMELEON_Complex64_t *WORK; int ldwork; - quark_unpack_args_18(quark, 
side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); - CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + quark_unpack_args_14(quark, side, trans, m1, n1, m2, n2, k, ib, tileA1, tileA2, tileV, tileT, WORK, ldwork); + TCORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, ib, tileA1, tileA2, tileV, tileT, WORK, ldwork); } void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int lda1, - const CHAM_desc_t *A2, int A2m, int A2n, int lda2, - const CHAM_desc_t *V, int Vm, int Vn, int ldv, - const CHAM_desc_t *T, int Tm, int Tn, int ldt) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); int ldwork = side == ChamLeft ? 
ib : nb; @@ -69,14 +65,10 @@ void INSERT_TASK_ztsmqr_hetra1(const RUNTIME_option_t *options, sizeof(int), &n2, VALUE, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT|QUARK_REGION_L|QUARK_REGION_D, - sizeof(int), &lda1, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT, - sizeof(int), &lda2, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, - sizeof(int), &ldv, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, + sizeof(void*), RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT|QUARK_REGION_L|QUARK_REGION_D, + sizeof(void*), RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT, + sizeof(void*), RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), INPUT, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &ldwork, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_ztstrf.c b/runtime/quark/codelets/codelet_ztstrf.c index e9671f4daa0a4652c9a4cb32cc044f7999c9d82c..1bc4a8d357ef4313ea48b9cdb7f5e0bd189d8301 100644 --- a/runtime/quark/codelets/codelet_ztstrf.c +++ b/runtime/quark/codelets/codelet_ztstrf.c @@ -12,8 +12,6 @@ * @brief Chameleon ztstrf Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" #include "coreblas/cblas.h" #include <math.h> @@ -35,12 +33,9 @@ void CORE_ztstrf_quark(Quark *quark) int n; int ib; int nb; - CHAMELEON_Complex64_t *U; - int ldu; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *L; - int 
ldl; + CHAM_tile_t *tileU; + CHAM_tile_t *tileA; + CHAM_tile_t *tileL; int *IPIV; CHAMELEON_Complex64_t *WORK; int ldwork; @@ -51,83 +46,18 @@ void CORE_ztstrf_quark(Quark *quark) int info; - quark_unpack_args_17(quark, m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, sequence, request, check_info, iinfo); - CORE_ztstrf(m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, &info); + quark_unpack_args_14(quark, m, n, ib, nb, tileU, tileA, tileL, IPIV, WORK, ldwork, sequence, request, check_info, iinfo); + TCORE_ztstrf(m, n, ib, nb, tileU, tileA, tileL, IPIV, WORK, ldwork, &info); if ( (info != CHAMELEON_SUCCESS) && check_info ) { RUNTIME_sequence_flush( (CHAM_context_t*)quark, sequence, request, iinfo+info ); } } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztstrf computes an LU factorization of a complex matrix formed - * by an upper triangular NB-by-N tile U on top of a M-by-N tile A - * using partial pivoting with row interchanges. - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] NB - * - * @param[in,out] U - * On entry, the NB-by-N upper triangular tile. - * On exit, the new factor U from the factorization - * - * @param[in] LDU - * The leading dimension of the array U. LDU >= max(1,NB). - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factor L from the factorization - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,M). - * - * @param[in,out] L - * On entry, the IB-by-N lower triangular tile. - * On exit, the interchanged rows form the tile A in case of pivoting. - * - * @param[in] LDL - * The leading dimension of the array L. LDL >= max(1,IB). 
- * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile U was interchanged with row IPIV(i) of the tile A. - * - * @param[in,out] WORK - * - * @param[in] LDWORK - * The dimension of the array WORK. - * - * @param[out] INFO - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. - * - */ void INSERT_TASK_ztstrf(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *U, int Um, int Un, int ldu, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *L, int Lm, int Ln, int ldl, + const CHAM_desc_t *U, int Um, int Un, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo) { @@ -138,12 +68,9 @@ void INSERT_TASK_ztstrf(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &ib, VALUE, sizeof(int), &nb, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), INOUT | QUARK_REGION_D | QUARK_REGION_U, - sizeof(int), &ldu, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT | LOCALITY, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), OUTPUT, - sizeof(int), &ldl, VALUE, + sizeof(void*), RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), INOUT | QUARK_REGION_D | QUARK_REGION_U, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INOUT | LOCALITY, + sizeof(void*), RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), OUTPUT, sizeof(int)*nb, IPIV, OUTPUT, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), 
&nb, VALUE, diff --git a/runtime/quark/codelets/codelet_zunmlq.c b/runtime/quark/codelets/codelet_zunmlq.c index 5b8687571af808182585b223ed9636099fccc296..eb884f674db48022692570a85b35cf61ce57dc4d 100644 --- a/runtime/quark/codelets/codelet_zunmlq.c +++ b/runtime/quark/codelets/codelet_zunmlq.c @@ -12,8 +12,6 @@ * @brief Chameleon zunmlq Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * @author Jakub Kurzak * @author Dulceneia Becker @@ -26,7 +24,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zunmlq_quark(Quark *quark) { @@ -36,109 +34,24 @@ void CORE_zunmlq_quark(Quark *quark) int n; int k; int ib; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; CHAMELEON_Complex64_t *WORK; int ldwork; - quark_unpack_args_14(quark, side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork); - CORE_zunmlq(side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork); + quark_unpack_args_11(quark, side, trans, m, n, k, ib, + tileA, tileT, tileC, WORK, ldwork); + TCORE_zunmlq(side, trans, m, n, k, ib, + tileA, tileT, tileC, WORK, ldwork); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zunmlq overwrites the general complex M-by-N tile C with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * C C * Q - * TRANS = 'C': Q^H * C C * Q^H - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(k) . . . H(2) H(1) - * - * as returned by CORE_zgelqt. Q is of order M if SIDE = 'L' and of order N - * if SIDE = 'R'. 
- * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q^H from the Left; - * @arg ChamRight : apply Q or Q^H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : Transpose, apply Q^H. - * - * @param[in] M - * The number of rows of the tile C. M >= 0. - * - * @param[in] N - * The number of columns of the tile C. N >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * If SIDE = ChamLeft, M >= K >= 0; - * if SIDE = ChamRight, N >= K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] A - * Dimension: (LDA,M) if SIDE = ChamLeft, - * (LDA,N) if SIDE = ChamRight, - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_zgelqt in the first k rows of its array argument A. - * - * @param[in] LDA - * The leading dimension of the array A. LDA >= max(1,K). - * - * @param[in] T - * The IB-by-K triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[in,out] C - * On entry, the M-by-N tile C. - * On exit, C is overwritten by Q*C or Q^T*C or C*Q^T or C*Q. - * - * @param[in] LDC - * The leading dimension of the array C. LDC >= max(1,M). - * - * @param[in,out] WORK - * On exit, if INFO = 0, WORK(1) returns the optimal LDWORK. - * - * @param[in] LDWORK - * The dimension of the array WORK. - * If SIDE = ChamLeft, LDWORK >= max(1,N); - * if SIDE = ChamRight, LDWORK >= max(1,M). 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zunmlq(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_UNMLQ; @@ -149,12 +62,9 @@ void INSERT_TASK_zunmlq(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT | QUARK_REGION_U, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT | QUARK_REGION_U, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, 0); diff --git a/runtime/quark/codelets/codelet_zunmqr.c b/runtime/quark/codelets/codelet_zunmqr.c index f03746016c3012467e4a1370353084f8cb21f282..2cabe00f57c1ac31ab8b4d519e61d0ca7afb10c9 100644 --- a/runtime/quark/codelets/codelet_zunmqr.c +++ b/runtime/quark/codelets/codelet_zunmqr.c @@ -12,8 +12,6 @@ * @brief Chameleon zunmqr Quark codelet * * @version 0.9.2 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief * 
@author Jakub Kurzak * @author Mathieu Faverge @@ -25,7 +23,7 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" void CORE_zunmqr_quark(Quark *quark) { @@ -35,110 +33,24 @@ void CORE_zunmqr_quark(Quark *quark) int n; int k; int ib; - CHAMELEON_Complex64_t *A; - int lda; - CHAMELEON_Complex64_t *T; - int ldt; - CHAMELEON_Complex64_t *C; - int ldc; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; CHAMELEON_Complex64_t *WORK; int ldwork; - quark_unpack_args_14(quark, side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork); - CORE_zunmqr(side, trans, m, n, k, ib, - A, lda, T, ldt, C, ldc, WORK, ldwork); + quark_unpack_args_11(quark, side, trans, m, n, k, ib, + tileA, tileT, tileC, WORK, ldwork); + TCORE_zunmqr(side, trans, m, n, k, ib, + tileA, tileT, tileC, WORK, ldwork); } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zunmqr overwrites the general complex M-by-N tile C with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * C C * Q - * TRANS = 'C': Q^H * C C * Q^H - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zgeqrt. Q is of order M if SIDE = 'L' and of order N - * if SIDE = 'R'. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q^H from the Left; - * @arg ChamRight : apply Q or Q^H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : Transpose, apply Q^H. - * - * @param[in] M - * The number of rows of the tile C. M >= 0. - * - * @param[in] N - * The number of columns of the tile C. N >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * If SIDE = ChamLeft, M >= K >= 0; - * if SIDE = ChamRight, N >= K >= 0. 
- * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] A - * Dimension: (LDA,K) - * The i-th column must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_zgeqrt in the first k columns of its array argument A. - * - * @param[in] LDA - * The leading dimension of the array A. - * If SIDE = ChamLeft, LDA >= max(1,M); - * if SIDE = ChamRight, LDA >= max(1,N). - * - * @param[in] T - * The IB-by-K triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] LDT - * The leading dimension of the array T. LDT >= IB. - * - * @param[in,out] C - * On entry, the M-by-N tile C. - * On exit, C is overwritten by Q*C or Q^T*C or C*Q^T or C*Q. - * - * @param[in] LDC - * The leading dimension of the array C. LDC >= max(1,M). - * - * @param[in,out] WORK - * On exit, if INFO = 0, WORK(1) returns the optimal LDWORK. - * - * @param[in] LDWORK - * The dimension of the array WORK. - * If SIDE = ChamLeft, LDWORK >= max(1,N); - * if SIDE = ChamRight, LDWORK >= max(1,M). 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *T, int Tm, int Tn, int ldt, - const CHAM_desc_t *C, int Cm, int Cn, int ldc) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_UNMQR; @@ -149,12 +61,9 @@ void INSERT_TASK_zunmqr(const RUNTIME_option_t *options, sizeof(int), &n, VALUE, sizeof(int), &k, VALUE, sizeof(int), &ib, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT | QUARK_REGION_L, - sizeof(int), &lda, VALUE, - sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, - sizeof(int), &ldt, VALUE, - sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, - sizeof(int), &ldc, VALUE, + sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT | QUARK_REGION_L, + sizeof(void*), RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), INPUT, + sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), INOUT, sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL, SCRATCH, sizeof(int), &nb, VALUE, 0); diff --git a/runtime/quark/control/runtime_descriptor.c b/runtime/quark/control/runtime_descriptor.c index cb4d198ba479155c69e8aa7778166bfba0b51927..b09e44165b3496ccaf4b8e9d15713aeb01d04e0e 100644 --- a/runtime/quark/control/runtime_descriptor.c +++ b/runtime/quark/control/runtime_descriptor.c @@ -101,5 +101,5 @@ void RUNTIME_data_migrate( const RUNTIME_sequence_t *sequence, void *RUNTIME_data_getaddr( const CHAM_desc_t *desc, int m, int n ) { - return desc->get_blkaddr( desc, m, n 
); + return desc->get_blktile( desc, m, n ); } diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index ff990638938b710275c6952057b45832159b71b1..924d480e81ed08fc4cc29ddd4920c64243e5757c 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -94,6 +94,7 @@ set(RUNTIME_COMMON control/runtime_options.c control/runtime_profiling.c control/runtime_workspace.c + interface/cham_tile_interface.c ${RUNTIME_COMMON_GENERATED} ) diff --git a/runtime/starpu/codelets/codelet_dzasum.c b/runtime/starpu/codelets/codelet_dzasum.c index 869927a85d13fa9eca1a309c19625fcafa4db182..5b2c9827e1292b99fd097246288b795b0c3580ae 100644 --- a/runtime/starpu/codelets/codelet_dzasum.c +++ b/runtime/starpu/codelets/codelet_dzasum.c @@ -30,16 +30,14 @@ static void cl_dzasum_cpu_func(void *descr[], void *cl_arg) cham_uplo_t uplo; int M; int N; - CHAMELEON_Complex64_t *A; - int ldA; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tilework; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); + tilework = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &M, &N); - CORE_dzasum(storev, uplo, M, N, A, ldA, work); + TCORE_dzasum(storev, uplo, M, N, tileA, tilework->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -50,7 +48,7 @@ CODELETS_CPU(dzasum, 2, cl_dzasum_cpu_func) void INSERT_TASK_dzasum( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int M, int N, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_dzasum; @@ -63,17 +61,16 @@ void INSERT_TASK_dzasum( const RUNTIME_option_t *options, starpu_insert_task( starpu_mpi_codelet(codelet), - STARPU_VALUE, &storev, sizeof(int), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &M, 
sizeof(int), - STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_VALUE, &storev, sizeof(cham_store_t), + STARPU_VALUE, &uplo, sizeof(cham_uplo_t), + STARPU_VALUE, &M, sizeof(int), + STARPU_VALUE, &N, sizeof(int), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, double, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, "dzasum", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_map.c b/runtime/starpu/codelets/codelet_map.c index 2388af472872c0c3b260737be7c880ed3afd5054..bab6a097cf046ef7aeed306bb6b14830449b3504 100644 --- a/runtime/starpu/codelets/codelet_map.c +++ b/runtime/starpu/codelets/codelet_map.c @@ -17,7 +17,7 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -CHAMELEON_CL_CB(map, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N); +CHAMELEON_CL_CB(map, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) #if !defined(CHAMELEON_SIMULATION) static void cl_map_cpu_func(void *descr[], void *cl_arg) @@ -26,13 +26,13 @@ static void cl_map_cpu_func(void *descr[], void *cl_arg) cham_uplo_t uplo; int m; int n; - void *data; + CHAM_tile_t *tile; cham_unary_operator_t op_fct; void *op_args; - data = (void *)STARPU_MATRIX_GET_PTR(descr[0]); + tile = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &desc, &uplo, &m, &n, &op_fct, &op_args ); - op_fct( desc, uplo, m, n, data, op_args ); + op_fct( desc, uplo, m, n, tile, op_args ); } #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zaxpy.c b/runtime/starpu/codelets/codelet_zaxpy.c index 38368ffff4a358a11a762ad46b597c9c790f5b5d..3458a9968acc2de81ed12cbf5cd25d2f5be2dd94 100644 --- a/runtime/starpu/codelets/codelet_zaxpy.c +++ 
b/runtime/starpu/codelets/codelet_zaxpy.c @@ -25,15 +25,15 @@ static void cl_zaxpy_cpu_func(void *descr[], void *cl_arg) { int M; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; + CHAM_tile_t *tileA; int incA; - CHAMELEON_Complex64_t *B; + CHAM_tile_t *tileB; int incB; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &M, &alpha, &incA, &incB); - CORE_zaxpy(M, alpha, A, incA, B, incB); + TCORE_zaxpy(M, alpha, tileA, incA, tileB, incB); } #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zbuild.c b/runtime/starpu/codelets/codelet_zbuild.c index e329dc377cb5599ef06b53981fc418fb7205b506..f0879fddb53aa9d929ff533371aee48fddda4343 100644 --- a/runtime/starpu/codelets/codelet_zbuild.c +++ b/runtime/starpu/codelets/codelet_zbuild.c @@ -31,22 +31,20 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zbuild_cpu_func(void *descr[], void *cl_arg) { - CHAMELEON_Complex64_t *A; - int ldA; - void *user_data; - void (*user_build_callback)(int row_min, int row_max, int col_min, int col_max, void *buffer, int ldA, void *user_data) ; - int row_min, row_max, col_min, col_max; + CHAM_tile_t *tileA; + void *user_data; + void (*user_build_callback)(int row_min, int row_max, int col_min, int col_max, void *buffer, int ld, void *user_data) ; + int row_min, row_max, col_min, col_max; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args(cl_arg, &row_min, &row_max, &col_min, &col_max, &user_data, &user_build_callback ); + starpu_codelet_unpack_args(cl_arg, &row_min, &row_max, &col_min, &col_max, &user_data, &user_build_callback ); - /* The callback 'user_build_callback' is expected to build the block of matrix [row_min, row_max] x 
[col_min, col_max] - * (with both min and max values included in the intervals, index start at 0 like in C, NOT 1 like in Fortran) - * and store it at the address 'buffer' with leading dimension 'ld' - */ - user_build_callback(row_min, row_max, col_min, col_max, A, ldA, user_data); + /* The callback 'user_build_callback' is expected to build the block of matrix [row_min, row_max] x [col_min, col_max] + * (with both min and max values included in the intervals, index start at 0 like in C, NOT 1 like in Fortran) + * and store it at the address 'buffer' with leading dimension 'ld' + */ + user_build_callback(row_min, row_max, col_min, col_max, tileA->mat, tileA->ld, user_data); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,24 +54,24 @@ static void cl_zbuild_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zbuild, 1, cl_zbuild_cpu_func) -void INSERT_TASK_zbuild( const RUNTIME_option_t *options, - const CHAM_desc_t *A, int Am, int An, int ldA, - void *user_data, void* user_build_callback ) + void INSERT_TASK_zbuild( const RUNTIME_option_t *options, + const CHAM_desc_t *A, int Am, int An, + void *user_data, void* user_build_callback ) { - struct starpu_codelet *codelet = &cl_zbuild; - void (*callback)(void*) = options->profiling ? cl_zbuild_callback : NULL; - int row_min, row_max, col_min, col_max; + struct starpu_codelet *codelet = &cl_zbuild; + void (*callback)(void*) = options->profiling ? cl_zbuild_callback : NULL; + int row_min, row_max, col_min, col_max; - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_W(A, Am, An); - CHAMELEON_END_ACCESS_DECLARATION; + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; - row_min = Am*A->mb ; - row_max = Am == A->mt-1 ? A->m-1 : row_min+A->mb-1 ; - col_min = An*A->nb ; - col_max = An == A->nt-1 ? A->n-1 : col_min+A->nb-1 ; - starpu_insert_task( + row_min = Am*A->mb ; + row_max = Am == A->mt-1 ? 
A->m-1 : row_min+A->mb-1 ; + col_min = An*A->nb ; + col_max = An == A->nt-1 ? A->n-1 : col_min+A->nb-1 ; + starpu_insert_task( starpu_mpi_codelet(codelet), STARPU_VALUE, &row_min, sizeof(int), STARPU_VALUE, &row_max, sizeof(int), @@ -88,5 +86,4 @@ void INSERT_TASK_zbuild( const RUNTIME_option_t *options, STARPU_NAME, "zbuild", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 44d431b7dafc684188f869aa560e0dabdb1f311a..1aee0b3fe5ff978d9c3b3b5de042af8a332d0d5c 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -22,56 +22,56 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -CHAMELEON_CL_CB(dzasum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zaxpy, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[1]), 0, M) -CHAMELEON_CL_CB(zgeadd, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zlascal, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zgelqt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, (4./3.)*M*N*K) -CHAMELEON_CL_CB(zgemm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), starpu_matrix_get_ny(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */ -CHAMELEON_CL_CB(zgeqrt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, (4./3.)*M*M*N) -CHAMELEON_CL_CB(zgessm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_nx(task->handles[2]), 2. 
*M*N*K) -CHAMELEON_CL_CB(zgessq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 0, 4.*M*N) -CHAMELEON_CL_CB(zgetrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) -CHAMELEON_CL_CB(zgetrf_incpiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) -CHAMELEON_CL_CB(zgetrf_nopiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) -CHAMELEON_CL_CB(zgram, starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), 0, M*N) -CHAMELEON_CL_CB(zhe2ge, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, (1./2.0)*M*N) -CHAMELEON_CL_CB(zherfb, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 2. *M* M*M) +CHAMELEON_CL_CB(dzasum, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zaxpy, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[1]), 0, M) +CHAMELEON_CL_CB(zgeadd, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlascal, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zgelqt, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (4./3.)*M*N*K) +CHAMELEON_CL_CB(zgemm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */ +CHAMELEON_CL_CB(zgeqrt, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (4./3.)*M*M*N) +CHAMELEON_CL_CB(zgessm, cti_handle_get_m(task->handles[2]), cti_handle_get_m(task->handles[2]), cti_handle_get_m(task->handles[2]), 2. 
*M*N*K) +CHAMELEON_CL_CB(zgessq, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 0, 4.*M*N) +CHAMELEON_CL_CB(zgetrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgetrf_incpiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgetrf_nopiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgram, cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), 0, M*N) +CHAMELEON_CL_CB(zhe2ge, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, (1./2.0)*M*N) +CHAMELEON_CL_CB(zherfb, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 2. *M* M*M) #if defined(PRECISION_z) || defined(PRECISION_c) -CHAMELEON_CL_CB(zhemm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0, 2.*M*M *N) -CHAMELEON_CL_CB(zher2k, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+2.*M*N)*M) -CHAMELEON_CL_CB(zherk, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+ M)*M*N) +CHAMELEON_CL_CB(zhemm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), 0, 2.*M*M *N) +CHAMELEON_CL_CB(zher2k, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, ( 1.+2.*M*N)*M) +CHAMELEON_CL_CB(zherk, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, ( 1.+ M)*M*N) #endif -CHAMELEON_CL_CB(zlacpy, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zlange, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zlaset, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) 
-CHAMELEON_CL_CB(zlaset2, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zlatro, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zlauum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M* M*M) +CHAMELEON_CL_CB(zlacpy, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlange, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlaset, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlaset2, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlatro, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlauum, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (1./3.)*M* M*M) #if defined(PRECISION_z) || defined(PRECISION_c) -CHAMELEON_CL_CB(zplghe, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zsytrf_nopiv, starpu_matrix_get_nx(task->handles[0]), 0, 0, (1./3.)*M* M*M) +CHAMELEON_CL_CB(zplghe, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zsytrf_nopiv, cti_handle_get_m(task->handles[0]), 0, 0, (1./3.)*M* M*M) #endif -CHAMELEON_CL_CB(zplgsy, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zplrnt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zbuild, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zplssq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zplssq2, 
starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 2*N) -CHAMELEON_CL_CB(zpotrf, starpu_matrix_get_nx(task->handles[0]), 0, 0, (1./3.)*M* M*M) -CHAMELEON_CL_CB(zssssm, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), M*M*(2.*M+starpu_matrix_get_nx(task->handles[2]))) -CHAMELEON_CL_CB(zsymm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0, 2.*M*M *N) -CHAMELEON_CL_CB(zsyr2k, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+2.*M*N)*M) -CHAMELEON_CL_CB(zsyrk, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+ M)*M*N) -CHAMELEON_CL_CB(ztplqt, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), starpu_matrix_get_nx(task->handles[0]), 2.*M*N*K) -CHAMELEON_CL_CB(ztpqrt, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), starpu_matrix_get_nx(task->handles[0]), 2.*M*N*K) -CHAMELEON_CL_CB(ztpmlqt, starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), starpu_matrix_get_nx(task->handles[2]), 4.*M*N*K) -CHAMELEON_CL_CB(ztpmqrt, starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), starpu_matrix_get_nx(task->handles[2]), 4.*M*N*K) -CHAMELEON_CL_CB(ztrasm, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 0.5*M*(M+1)) -CHAMELEON_CL_CB(ztrmm, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0, M*M*N) -CHAMELEON_CL_CB(ztrsm, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0, M*M*N) -CHAMELEON_CL_CB(ztrtri, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M *M*M) -CHAMELEON_CL_CB(ztsmlq_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 
starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) -CHAMELEON_CL_CB(ztsmqr_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M) -CHAMELEON_CL_CB(ztstrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), M* M*M) -CHAMELEON_CL_CB(zunmlq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M) -CHAMELEON_CL_CB(zunmqr, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M) +CHAMELEON_CL_CB(zplgsy, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zplrnt, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zbuild, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zplssq, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zplssq2, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 2*N) +CHAMELEON_CL_CB(zpotrf, cti_handle_get_m(task->handles[0]), 0, 0, (1./3.)*M* M*M) +CHAMELEON_CL_CB(zssssm, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), M*M*(2.*M+cti_handle_get_m(task->handles[2]))) +CHAMELEON_CL_CB(zsymm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), 0, 2.*M*M *N) +CHAMELEON_CL_CB(zsyr2k, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, ( 1.+2.*M*N)*M) +CHAMELEON_CL_CB(zsyrk, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, ( 1.+ M)*M*N) +CHAMELEON_CL_CB(ztplqt, cti_handle_get_m(task->handles[1]), cti_handle_get_n(task->handles[1]), 
cti_handle_get_m(task->handles[0]), 2.*M*N*K) +CHAMELEON_CL_CB(ztpqrt, cti_handle_get_m(task->handles[1]), cti_handle_get_n(task->handles[1]), cti_handle_get_m(task->handles[0]), 2.*M*N*K) +CHAMELEON_CL_CB(ztpmlqt, cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), cti_handle_get_m(task->handles[2]), 4.*M*N*K) +CHAMELEON_CL_CB(ztpmqrt, cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), cti_handle_get_m(task->handles[2]), 4.*M*N*K) +CHAMELEON_CL_CB(ztrasm, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, 0.5*M*(M+1)) +CHAMELEON_CL_CB(ztrmm, cti_handle_get_m(task->handles[1]), cti_handle_get_n(task->handles[1]), 0, M*M*N) +CHAMELEON_CL_CB(ztrsm, cti_handle_get_m(task->handles[1]), cti_handle_get_n(task->handles[1]), 0, M*M*N) +CHAMELEON_CL_CB(ztrtri, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (1./3.)*M *M*M) +CHAMELEON_CL_CB(ztsmlq_hetra1, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (4.0*M+cti_handle_get_m(task->handles[3]))*M*M) +CHAMELEON_CL_CB(ztsmqr_hetra1, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (4.0*M+cti_handle_get_m(task->handles[3]))*M*M) +CHAMELEON_CL_CB(ztstrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), M* M*M) +CHAMELEON_CL_CB(zunmlq, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) +CHAMELEON_CL_CB(zunmqr, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. 
*M* M*M) diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index 9e8ec52efa05a8eec1999a16353eb735fbfec64b..e6a73fd887431993f95929f904c0fa74fb74a620 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -32,19 +32,15 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - const CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileB; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta); - CORE_zgeadd(trans, M, N, alpha, A, ldA, beta, B, ldB); + TCORE_zgeadd(trans, M, N, alpha, tileA, beta, tileB); return; } @@ -55,25 +51,21 @@ static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) int M; int N; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; + CHAM_tile_t *tileA; cuDoubleComplex beta; - cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileB; - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta ); RUNTIME_getStream( stream ); CUDA_zgeadd( trans, M, N, - &alpha, A, ldA, - &beta, B, ldB, + &alpha, tileA->mat, tileA->ld, + &beta, tileB->mat, tileB->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -149,8 +141,8 @@ CODELETS_CPU(zgeadd, 2, cl_zgeadd_cpu_func) */ 
void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_zgeadd; void (*callback)(void*) = options->profiling ? cl_zgeadd_callback : NULL; @@ -175,7 +167,6 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, STARPU_NAME, "zgeadd", #endif 0); - (void)ldA; (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c index 21e38b440c18bbcf5e1472476503cfcef7e2b621..d40413f58562788572b2c497c08e9fc10e226233 100644 --- a/runtime/starpu/codelets/codelet_zgelqt.c +++ b/runtime/starpu/codelets/codelet_zgelqt.c @@ -34,23 +34,22 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg) int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *TAU, *WORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileW; + CHAMELEON_Complex64_t *TAU; + CHAMELEON_Complex64_t *WORK; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + ib*n */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileW = cti_interface_get(descr[2]); /* max(m,n) + ib * n */ starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &h_work); + TAU = tileW->mat; WORK = TAU + chameleon_max( m, n ); - CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldT ); - CORE_zgelqt(m, n, ib, A, ldA, T, ldT, TAU, WORK); + TCORE_zlaset( ChamUpperLower, ib, m, 0., 0., tileT 
); + TCORE_zgelqt(m, n, ib, tileA, tileT, TAU, WORK); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -59,70 +58,10 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgelqt - computes a LQ factorization of a complex M-by-N tile A: A = L * Q. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(k)' . . . H(2)' H(1)', where k = min(M,N). - * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in - * A(i,i+1:n), and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and below the diagonal of the array - * contain the M-by-min(M,N) lower trapezoidal tile L (L is - * lower triangular if M <= N); the elements above the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] ldT - * The leading dimension of the array T. ldT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *T, int Tm, int Tn, int ldT) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn) { (void)nb; struct starpu_codelet *codelet = &cl_zgelqt; @@ -151,6 +90,4 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, STARPU_NAME, "zgelqt", #endif 0); - (void)ldT; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 3979260158bbc277014ed2d6da688000911faac6..55c480f7c1a1ceff8e1fd225c9af39e9b63090b0 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -36,27 +36,20 @@ static void cl_zgemm_cpu_func(void *descr[], void *cl_arg) int n; int k; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); - CORE_zgemm(transA, transB, - m, n, k, - alpha, A, ldA, - B, ldB, - beta, C, ldC); + TCORE_zgemm( transA, transB, + m, n, k, + alpha, tileA, tileB, + beta, tileC ); } #ifdef 
CHAMELEON_USE_CUDA @@ -68,20 +61,14 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) int n; int k; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - const cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; cuDoubleComplex beta; - cuDoubleComplex *C; - int ldC; + CHAM_tile_t *tileC; - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); @@ -90,9 +77,9 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) CUDA_zgemm( transA, transB, m, n, k, - &alpha, A, ldA, - B, ldB, - &beta, C, ldC, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -117,9 +104,9 @@ CODELETS(zgemm, 3, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zgemm(const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { (void)nb; struct starpu_codelet *codelet = &cl_zgemm; @@ -150,7 +137,4 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options, #endif 0); - (void)ldA; - (void)ldB; - (void)ldC; } diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c 
b/runtime/starpu/codelets/codelet_zgeqrt.c index ddc681630a08bc9e7cc8a7949e83847346cb12aa..ccbdb322e129333b6fe3371ce5d8971e3be86c90 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -34,24 +34,23 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg) int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *TAU, *WORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileW; + CHAMELEON_Complex64_t *TAU; + CHAMELEON_Complex64_t *WORK; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + n * ib */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileW = cti_interface_get(descr[2]); /* max(m,n) + ib * n */ starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &h_work); + TAU = tileW->mat; WORK = TAU + chameleon_max( m, n ); - CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldT ); - CORE_zgeqrt(m, n, ib, A, ldA, T, ldT, TAU, WORK); + TCORE_zlaset( ChamUpperLower, ib, n, 0., 0., tileT ); + TCORE_zgeqrt(m, n, ib, tileA, tileT, TAU, WORK ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -60,71 +59,10 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgeqrt computes a QR factorization of a complex M-by-N tile A: - * A = Q * R. - * - * The tile Q is represented as a product of elementary reflectors - * - * Q = H(1) H(2) . . . H(k), where k = min(M,N). 
- * - * Each H(i) has the form - * - * H(i) = I - tau * v * v' - * - * where tau is a complex scalar, and v is a complex vector with - * v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), - * and tau in TAU(i). - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, the elements on and above the diagonal of the array - * contain the min(M,N)-by-N upper trapezoidal tile R (R is - * upper triangular if M >= N); the elements below the diagonal, - * with the array TAU, represent the unitary tile Q as a - * product of elementary reflectors (see Further Details). - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - * @param[out] T - * The IB-by-N triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] ldT - * The leading dimension of the array T. ldT >= IB. - * - * @param[out] TAU - * The scalar factors of the elementary reflectors (see Further - * Details). 
- * - * @param[out] WORK - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *T, int Tm, int Tn, int ldT) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn) { (void)nb; struct starpu_codelet *codelet = &cl_zgeqrt; @@ -153,6 +91,4 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, STARPU_NAME, "zgeqrt", #endif 0); - (void)ldT; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index 6144862d0d1bd6b44589ac03c5a44deaab26046e..6d94b0ffb9684cbb171597112ee1a64a9169299a 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -35,19 +35,15 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg) int k; int ib; int *IPIV; - CHAMELEON_Complex64_t *D; - int ldD; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileD; + CHAM_tile_t *tileA; - D = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + tileD = cti_interface_get(descr[1]); + tileA = cti_interface_get(descr[2]); - ldD = STARPU_MATRIX_GET_LD( descr[1] ); - ldA = STARPU_MATRIX_GET_LD( descr[2] ); starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV); - CORE_zgessm(m, n, k, ib, IPIV, D, ldD, A, ldA); + TCORE_zgessm(m, n, k, ib, IPIV, tileD, tileA); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,57 +52,12 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgessm, 3, cl_zgessm_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgessm applies the factors L computed by CORE_zgetrf_incpiv to - * a complex M-by-N tile A. 
- * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] K - * The number of columns of the tile L. K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] IPIV - * The pivot indices array of size K as returned by - * CORE_zgetrf_incpiv. - * - * @param[in] L - * The M-by-K lower triangular tile. - * - * @param[in] ldL - * The leading dimension of the array L. ldL >= max(1,M). - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, updated by the application of L. - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * - */ - void INSERT_TASK_zgessm( const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, - const CHAM_desc_t *L, int Lm, int Ln, int ldL, - const CHAM_desc_t *D, int Dm, int Dn, int ldD, - const CHAM_desc_t *A, int Am, int An, int ldA ) + const CHAM_desc_t *L, int Lm, int Ln, + const CHAM_desc_t *D, int Dm, int Dn, + const CHAM_desc_t *A, int Am, int An ) { (void)nb; struct starpu_codelet *codelet = &cl_zgessm; @@ -134,6 +85,4 @@ void INSERT_TASK_zgessm( const RUNTIME_option_t *options, STARPU_NAME, "zgessm", #endif 0); - (void)ldD; - (void)ldL; } diff --git a/runtime/starpu/codelets/codelet_zgessq.c b/runtime/starpu/codelets/codelet_zgessq.c index 16517741181e63b4cb499ac2b646552d68e7816c..1a63e1a22e5266d408ae1fc568db75f6c2db242e 100644 --- a/runtime/starpu/codelets/codelet_zgessq.c +++ b/runtime/starpu/codelets/codelet_zgessq.c @@ -29,16 +29,14 @@ static void cl_zgessq_cpu_func(void *descr[], void *cl_arg) cham_store_t storev; int m; int n; - CHAMELEON_Complex64_t *A; - 
int ldA; - double *SCALESUMSQ; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); + tileW = cti_interface_get(descr[1]); - SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - starpu_codelet_unpack_args(cl_arg, &storev, &m, &n); - CORE_zgessq( storev, m, n, A, ldA, SCALESUMSQ ); + starpu_codelet_unpack_args( cl_arg, &storev, &m, &n ); + TCORE_zgessq( storev, m, n, tileA, tileW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -49,7 +47,7 @@ CODELETS_CPU(zgessq, 2, cl_zgessq_cpu_func) void INSERT_TASK_zgessq( const RUNTIME_option_t *options, cham_store_t storev, int m, int n, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { struct starpu_codelet *codelet = &cl_zgessq; @@ -73,5 +71,4 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, STARPU_NAME, "zgessq", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zgetrf.c b/runtime/starpu/codelets/codelet_zgetrf.c index 3ce6227d994e7ad497ce6f98bab1a9843932c6ef..b744a43a9de6f0485cc64639e64f3b005eab3284 100644 --- a/runtime/starpu/codelets/codelet_zgetrf.c +++ b/runtime/starpu/codelets/codelet_zgetrf.c @@ -30,8 +30,7 @@ static void cl_zgetrf_cpu_func(void *descr[], void *cl_arg) { int m; int n; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int *IPIV; cham_bool_t check_info; int iinfo; @@ -39,11 +38,10 @@ static void cl_zgetrf_cpu_func(void *descr[], void *cl_arg) RUNTIME_request_t *request; int info = 0; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &m, &n, &IPIV, &check_info, &iinfo, &sequence, &request); - CORE_zgetrf( m, n, A, ldA, IPIV, &info ); + TCORE_zgetrf( m, n, tileA, IPIV, &info ); if ( 
(sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); @@ -58,7 +56,7 @@ CODELETS_CPU(zgetrf, 1, cl_zgetrf_cpu_func) void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, int *IPIV, cham_bool_t check_info, int iinfo ) { @@ -86,5 +84,4 @@ void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, STARPU_NAME, "zgetrf", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c index 3a7c599d4250cc39cae91228362e92e3038a4dc5..61eef5b399d4ecb661fe95fb8c894a586c98235e 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c @@ -34,8 +34,7 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg) int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int *IPIV; cham_bool_t check_info; int iinfo; @@ -43,11 +42,10 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg) RUNTIME_request_t *request; int info = 0; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &IPIV, &check_info, &iinfo, &h_work, &sequence, &request); - CORE_zgetrf_incpiv(m, n, ib, A, ldA, IPIV, &info); + TCORE_zgetrf_incpiv(m, n, ib, tileA, IPIV, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); @@ -60,64 +58,10 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgetrf_incpiv computes an LU factorization of a general M-by-N tile A - * using partial pivoting with row interchanges. 
- * - * The factorization has the form - * - * A = P * L * U - * - * where P is a permutation matrix, L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile was interchanged with row IPIV(i). - * - * @param[out] INFO - * See returned value. - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. 
- * - */ - void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *L, int Lm, int Ln, int ldL, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo) { @@ -152,6 +96,4 @@ void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, STARPU_NAME, "zgetrf_incpiv", #endif 0); - (void)ldL; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c index 34daf949517b070614b8d67cf05e895d6049768d..d6fd0239e8c7ae38a5683a2b2b08dd47e4712861 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c @@ -33,18 +33,16 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg) int m; int n; int ib; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int iinfo; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int info = 0; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &iinfo, &sequence, &request); - CORE_zgetrf_nopiv(m, n, ib, A, ldA, &info); + TCORE_zgetrf_nopiv(m, n, ib, tileA, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); @@ -57,56 +55,9 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zgetrf_nopiv computes an LU factorization of a general diagonal - * dominant M-by-N matrix A witout pivoting. 
- * - * The factorization has the form - * A = L * U - * where L is lower triangular with unit - * diagonal elements (lower trapezoidal if m > n), and U is upper - * triangular (upper trapezoidal if m < n). - * - * This is the right-looking Level 3 BLAS version of the algorithm. - * WARNING: Your matrix need to be diagonal dominant if you want to call this - * routine safely. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] IB - * The block size to switch between blocked and unblocked code. - * - * @param[in,out] A - * On entry, the M-by-N matrix to be factored. - * On exit, the factors L and U from the factorization - * A = P*L*U; the unit diagonal elements of L are not stored. - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. 
- * - */ - void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, int iinfo) { (void)nb; @@ -132,5 +83,4 @@ void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, STARPU_NAME, "zgetrf_nopiv", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zgram.c b/runtime/starpu/codelets/codelet_zgram.c index f8997ad1c1817d2842945ccfc1d4193e944377e1..1aa1833b41c5cbe817a86382047459902222f98b 100644 --- a/runtime/starpu/codelets/codelet_zgram.c +++ b/runtime/starpu/codelets/codelet_zgram.c @@ -25,30 +25,18 @@ static void cl_zgram_cpu_func(void *descr[], void *cl_arg) { cham_uplo_t uplo; int m, n, mt, nt; - double *Di; - int ldDI; - double *Dj; - int ldDJ; - double *D; - double *A; - int ldA; + CHAM_tile_t *Di; + CHAM_tile_t *Dj; + CHAM_tile_t *D; + CHAM_tile_t *A; - Di = (double *)STARPU_MATRIX_GET_PTR(descr[0]); - Dj = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - D = (double *)STARPU_MATRIX_GET_PTR(descr[2]); - A = (double *)STARPU_MATRIX_GET_PTR(descr[3]); + Di = cti_interface_get(descr[0]); + Dj = cti_interface_get(descr[1]); + D = cti_interface_get(descr[2]); + A = cti_interface_get(descr[3]); - ldDI = STARPU_MATRIX_GET_LD( descr[0] ); - ldDJ = STARPU_MATRIX_GET_LD( descr[1] ); - ldA = STARPU_MATRIX_GET_LD( descr[3] ); - - starpu_codelet_unpack_args(cl_arg, &uplo, &m, &n, &mt, &nt); - CORE_zgram( uplo, - m, n, mt, nt, - Di, ldDI, - Dj, ldDJ, - D, - A, ldA); + starpu_codelet_unpack_args( cl_arg, &uplo, &m, &n, &mt, &nt ); + TCORE_zgram( uplo, m, n, mt, nt, Di, Dj, D, A ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -60,10 +48,10 @@ CODELETS_CPU(zgram, 4, cl_zgram_cpu_func) void INSERT_TASK_zgram( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mt, int nt, - const CHAM_desc_t *Di, int Dim, int Din, int ldDI, - const CHAM_desc_t *Dj, int Djm, int Djn, int ldDJ, + const CHAM_desc_t *Di, int Dim, int 
Din, + const CHAM_desc_t *Dj, int Djm, int Djn, const CHAM_desc_t *D, int Dm, int Dn, - CHAM_desc_t *A, int Am, int An, int ldA) + CHAM_desc_t *A, int Am, int An) { struct starpu_codelet *codelet = &cl_zgram; void (*callback)(void*) = options->profiling ? cl_zgram_callback : NULL; @@ -92,7 +80,4 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options, STARPU_NAME, "zgram", #endif 0); - (void)ldA; - (void)ldDJ; - (void)ldDI; } diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c index e740f97dd19d374ccf8da9f75a40aa86a032a6d8..cb8f66cd1f3820f7499858f6bb20dd0f140c3dc2 100644 --- a/runtime/starpu/codelets/codelet_zhe2ge.c +++ b/runtime/starpu/codelets/codelet_zhe2ge.c @@ -27,19 +27,15 @@ static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg) cham_uplo_t uplo; int M; int N; - const CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N); - CORE_zhe2ge(uplo, M, N, A, ldA, B, ldB); + TCORE_zhe2ge(uplo, M, N, tileA, tileB); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,8 +52,8 @@ CODELETS_CPU(zhe2ge, 2, cl_zhe2ge_cpu_func) void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { (void)mb; struct starpu_codelet *codelet = &cl_zhe2ge; @@ -81,6 +77,4 @@ void INSERT_TASK_zhe2ge(const RUNTIME_option_t *options, STARPU_NAME, "zhe2ge", #endif 0); - (void)ldB; - (void)ldA; } diff --git 
a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index df1b09627e0271dc7b028542da9c987a7f9a981d..db4d7c7f85babe20d7f7bc0eeadafdda0dabcfdd 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -35,28 +35,21 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - CORE_zhemm(side, uplo, + TCORE_zhemm(side, uplo, M, N, - alpha, A, ldA, - B, ldB, - beta, C, ldC); + alpha, tileA, + tileB, + beta, tileC); } #ifdef CHAMELEON_USE_CUDA @@ -67,21 +60,14 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) int M; int N; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - const cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; cuDoubleComplex beta; - cuDoubleComplex *C; - int ldC; - - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); + CHAM_tile_t *tileC; - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + 
tileC = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); @@ -90,9 +76,9 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) CUDA_zhemm( side, uplo, M, N, - &alpha, A, ldA, - B, ldB, - &beta, C, ldC, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -117,9 +103,9 @@ CODELETS(zhemm, 3, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zhemm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { (void)nb; struct starpu_codelet *codelet = &cl_zhemm; @@ -148,7 +134,4 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, STARPU_NAME, "zhemm", #endif 0); - (void)ldC; - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index bdd303ea513c6ac3812abfca145f55e7050f0886..1e3c2f5b1c9f430ea1872646a028e3f9697b3f0c 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -35,25 +35,18 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg) int n; int k; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; double beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - - ldA = 
STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - CORE_zher2k(uplo, trans, - n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + TCORE_zher2k(uplo, trans, + n, k, alpha, tileA, tileB, beta, tileC); } #ifdef CHAMELEON_USE_CUDA @@ -64,28 +57,24 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) int n; int k; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - const cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; double beta; - cuDoubleComplex *C; - int ldC; - - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); + CHAM_tile_t *tileC; - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); RUNTIME_getStream(stream); CUDA_zher2k( uplo, trans, - n, k, &alpha, A, ldA, B, ldB, &beta, C, ldC, + n, k, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -107,12 +96,13 @@ CODELETS(zher2k, 3, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC) * @ingroup INSERT_TASK_Complex64_t * */ -void INSERT_TASK_zher2k(const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) +void 
+INSERT_TASK_zher2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) { (void)nb; struct starpu_codelet *codelet = &cl_zher2k; @@ -141,7 +131,4 @@ void INSERT_TASK_zher2k(const RUNTIME_option_t *options, STARPU_NAME, "zher2k", #endif 0); - (void)ldC; - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c index f0bbcb02e6c1a9e2451a4fb83067e11c71a04d6d..a160d04907897cce2ba9767d009d92c1dc5fc366 100644 --- a/runtime/starpu/codelets/codelet_zherfb.c +++ b/runtime/starpu/codelets/codelet_zherfb.c @@ -25,64 +25,48 @@ static void cl_zherfb_cpu_func(void *descr[], void *cl_arg) { cham_uplo_t uplo; - int n; - int k; - int ib; - int nb; - const CHAMELEON_Complex64_t *A; - int ldA; - const CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *C; - int ldC; - CHAMELEON_Complex64_t *WORK; - int ldWORK; + int n, k, ib, nb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; + CHAM_tile_t *tileW; + int ldW; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + starpu_codelet_unpack_args( cl_arg, &uplo, &n, &k, &ib, &nb, &ldW ); - starpu_codelet_unpack_args(cl_arg, &uplo, &n, &k, &ib, &nb, &ldWORK); - - CORE_zherfb(uplo, n, k, ib, nb, A, ldA, T, ldT, C, ldC, WORK, 
ldWORK); + TCORE_zherfb( uplo, n, k, ib, nb, tileA, tileT, tileC, tileW->mat, ldW ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) { cham_uplo_t uplo; - int n; - int k; - int ib; - int nb; - const cuDoubleComplex *A; - int ldA; - const cuDoubleComplex *T; - int ldT; - cuDoubleComplex *C; - int ldC; - cuDoubleComplex *WORK; - int ldWORK; + int n, k, ib, nb; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; + CHAM_tile_t *tileW; + int ldW; RUNTIME_getStream(stream); - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); /* ib * nb */ - starpu_codelet_unpack_args(cl_arg, &uplo, &n, &k, &ib, &nb, &ldWORK); + starpu_codelet_unpack_args( cl_arg, &uplo, &n, &k, &ib, &nb, &ldW ); - CUDA_zherfb( uplo, n, k, ib, nb, A, ldA, T, ldT, C, ldC, WORK, ldWORK, stream ); + CUDA_zherfb( uplo, n, k, ib, nb, + tileA->mat, tileA->ld, + tileT->mat, tileT->ld, + tileC->mat, tileC->ld, + tileW->mat, ldW, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -104,9 +88,9 @@ CODELETS(zherfb, 4, cl_zherfb_cpu_func, cl_zherfb_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zherfb(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *T, int Tm, int Tn, int ldT, - const CHAM_desc_t *C, int Cm, int Cn, int ldC) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn) { struct 
starpu_codelet *codelet = &cl_zherfb; void (*callback)(void*) = options->profiling ? cl_zherfb_callback : NULL; @@ -124,18 +108,15 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options, STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &nb, sizeof(int), + STARPU_VALUE, &nb, sizeof(int), /* ldw */ STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), STARPU_RW, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &nb, sizeof(int), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, "zherfb", #endif 0); - (void)ldC; - (void)ldT; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 89ac73aa3f4afb6e5334d68621c73d2c5770efc7..bd6131b6b4103ebb1150acb604e3a094a572eef3 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -35,23 +35,18 @@ static void cl_zherk_cpu_func(void *descr[], void *cl_arg) int n; int k; double alpha; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; double beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldC = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileC = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - CORE_zherk(uplo, trans, + TCORE_zherk(uplo, trans, n, k, - alpha, A, ldA, - beta, C, ldC); + alpha, tileA, + beta, tileC); } #ifdef CHAMELEON_USE_CUDA @@ -62,27 +57,21 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) int n; int k; double alpha; - const cuDoubleComplex *A; - int ldA; + CHAM_tile_t *tileA; double beta; - cuDoubleComplex *C; - int 
ldC; - - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + CHAM_tile_t *tileC; - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldC = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileC = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); RUNTIME_getStream(stream); CUDA_zherk( - uplo, trans, - n, k, - &alpha, A, ldA, - &beta, C, ldC, + uplo, trans, n, k, + &alpha, tileA->mat, tileA->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -107,8 +96,8 @@ CODELETS(zherk, 2, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zherk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - double alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn) { (void)nb; struct starpu_codelet *codelet = &cl_zherk; @@ -135,6 +124,4 @@ void INSERT_TASK_zherk(const RUNTIME_option_t *options, STARPU_NAME, "zherk", #endif 0); - (void)ldC; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zhessq.c b/runtime/starpu/codelets/codelet_zhessq.c index 884f02aa1e5ff7999ece8d94c25568dae98d4c38..cb24c4e6925ac784b5ed8561ccd17556c956c181 100644 --- a/runtime/starpu/codelets/codelet_zhessq.c +++ b/runtime/starpu/codelets/codelet_zhessq.c @@ -24,10 +24,10 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { INSERT_TASK_zsyssq( options, storev, uplo, n, - A, Am, An, ldA, + A, Am, An, SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/starpu/codelets/codelet_zlacpy.c 
b/runtime/starpu/codelets/codelet_zlacpy.c index 6c07e6586fc65d1091a2ac31e6fb980bf78ec45a..d13e6ec088d0a5808354770ef77e894301c04308 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -35,19 +35,22 @@ static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) int N; int displA; int displB; - const CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAMELEON_Complex64_t *A; CHAMELEON_Complex64_t *B; - int ldB; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &displA, &displB); - CORE_zlacpy(uplo, M, N, A + displA, ldA, B + displB, ldB); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + A = tileA->mat; + B = tileB->mat; + CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,15 +59,10 @@ static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zlacpy, 2, cl_zlacpy_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, int ldA, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + int displA, const CHAM_desc_t *A, int Am, int An, + int displB, const CHAM_desc_t *B, int Bm, int Bn ) { (void)nb; struct starpu_codelet *codelet = &cl_zlacpy; @@ -90,16 +88,14 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, STARPU_NAME, "zlacpy", #endif 0); - (void)ldA; - (void)ldA; } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, - 
const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, ldA, - 0, B, Bm, Bn, ldB ); + 0, A, Am, An, + 0, B, Bm, Bn ); } diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c index 21823b861f9e51a5377fc3bc0a8cfc2daeda346e..89b36aa0c00d2f2e4fffb548927a7053d8c92eeb 100644 --- a/runtime/starpu/codelets/codelet_zlag2c.c +++ b/runtime/starpu/codelets/codelet_zlag2c.c @@ -30,19 +30,15 @@ static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) { int m; int n; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex32_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex32_t *)STARPU_MATRIX_GET_PTR(descr[1]); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); starpu_codelet_unpack_args(cl_arg, &m, &n); - CORE_zlag2c( m, n, A, ldA, B, ldB); + TCORE_zlag2c( m, n, tileA, tileB); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -58,8 +54,8 @@ CODELETS_CPU(zlag2c, 1, cl_zlag2c_cpu_func) */ void INSERT_TASK_zlag2c(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { (void)nb; struct starpu_codelet *codelet = &cl_zlag2c; @@ -82,10 +78,6 @@ void INSERT_TASK_zlag2c(const RUNTIME_option_t *options, STARPU_NAME, "zlag2c", #endif 0); - (void)ldB; - (void)ldA; - (void)ldB; - (void)ldA; } #if !defined(CHAMELEON_SIMULATION) @@ -93,19 +85,15 @@ static void cl_clag2z_cpu_func(void *descr[], void *cl_arg) { int m; int n; - CHAMELEON_Complex32_t *A; - int ldA; - 
CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (CHAMELEON_Complex32_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); starpu_codelet_unpack_args(cl_arg, &m, &n); - CORE_clag2z( m, n, A, ldA, B, ldB); + TCORE_clag2z( m, n, tileA, tileB); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -116,8 +104,8 @@ CODELETS_CPU(clag2z, 2, cl_clag2z_cpu_func) void INSERT_TASK_clag2z(const RUNTIME_option_t *options, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { (void)nb; struct starpu_codelet *codelet = &cl_clag2z; @@ -140,8 +128,4 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options, STARPU_NAME, "clag2z", #endif 0); - (void)ldB; - (void)ldA; - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zlange.c b/runtime/starpu/codelets/codelet_zlange.c index 7ed0446384828fda3f00b639f96a1a5ce4d1f0d1..477f364dff7caf95572c7c13ca79a763f82ebc0d 100644 --- a/runtime/starpu/codelets/codelet_zlange.c +++ b/runtime/starpu/codelets/codelet_zlange.c @@ -28,22 +28,19 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zlange_cpu_func(void *descr[], void *cl_arg) { - double *normA; cham_normtype_t norm; int M; int N; - CHAMELEON_Complex64_t *A; - int ldA; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tilework; + CHAM_tile_t *tilenormA; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]); + tileA = cti_interface_get(descr[0]); + tilework = cti_interface_get(descr[1]); + tilenormA = cti_interface_get(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); 
- - starpu_codelet_unpack_args(cl_arg, &norm, &M, &N); - CORE_zlange( norm, M, N, A, ldA, work, normA ); + starpu_codelet_unpack_args( cl_arg, &norm, &M, &N ); + TCORE_zlange( norm, M, N, tileA, tilework->mat, tilenormA->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -54,7 +51,7 @@ CODELETS_CPU(zlange, 3, cl_zlange_cpu_func) void INSERT_TASK_zlange( const RUNTIME_option_t *options, cham_normtype_t norm, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { (void)NB; @@ -68,7 +65,7 @@ void INSERT_TASK_zlange( const RUNTIME_option_t *options, starpu_insert_task( starpu_mpi_codelet(codelet), - STARPU_VALUE, &norm, sizeof(int), + STARPU_VALUE, &norm, sizeof(cham_normtype_t), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), @@ -80,20 +77,23 @@ void INSERT_TASK_zlange( const RUNTIME_option_t *options, STARPU_NAME, "zlange", #endif 0); - (void)ldA; } #if !defined(CHAMELEON_SIMULATION) static void cl_zlange_max_cpu_func(void *descr[], void *cl_arg) { - double *A; - double *B; + CHAM_tile_t *tileA; + CHAM_tile_t *tileNorm; + double *A, *norm; + + tileA = cti_interface_get(descr[0]); + tileNorm = cti_interface_get(descr[1]); - A = (double *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + A = tileA->mat; + norm = tileNorm->mat; - if ( *A > *B ) { - *B = *A; + if ( A[0] > *norm ) { + *norm = A[0]; } (void)cl_arg; } diff --git a/runtime/starpu/codelets/codelet_zlanhe.c b/runtime/starpu/codelets/codelet_zlanhe.c index 3428c08faf9dc2202e1ea73914a82b8525b1409b..8b7854127db7349a2336b2a8b8e214aafe4391c3 100644 --- a/runtime/starpu/codelets/codelet_zlanhe.c +++ b/runtime/starpu/codelets/codelet_zlanhe.c @@ -28,22 +28,19 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zlanhe_cpu_func(void *descr[], void *cl_arg) { - double *normA; + CHAM_tile_t *tilenormA; cham_normtype_t 
norm; cham_uplo_t uplo; int N; - CHAMELEON_Complex64_t *A; - int ldA; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tilework; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]); - - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); + tilework = cti_interface_get(descr[1]); + tilenormA = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &norm, &uplo, &N); - CORE_zlanhe( norm, uplo, N, A, ldA, work, normA); + TCORE_zlanhe( norm, uplo, N, tileA, tilework->mat, tilenormA->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -54,7 +51,7 @@ CODELETS_CPU(zlanhe, 3, cl_zlanhe_cpu_func) void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn) { struct starpu_codelet *codelet = &cl_zlanhe; @@ -79,7 +76,6 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, STARPU_NAME, "zlanhe", #endif 0); - (void)ldA; (void)NB; } diff --git a/runtime/starpu/codelets/codelet_zlansy.c b/runtime/starpu/codelets/codelet_zlansy.c index 42a4a4ee47bc53713f8a1aa2cf45214b28f86d05..2345339cebd9290d2b77fa31b13567659f7de5f6 100644 --- a/runtime/starpu/codelets/codelet_zlansy.c +++ b/runtime/starpu/codelets/codelet_zlansy.c @@ -28,21 +28,19 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zlansy_cpu_func(void *descr[], void *cl_arg) { - double *normA; + CHAM_tile_t *tilenormA; cham_normtype_t norm; cham_uplo_t uplo; int N; - CHAMELEON_Complex64_t *A; - int ldA; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tilework; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = 
cti_interface_get(descr[0]); + tilework = cti_interface_get(descr[1]); + tilenormA = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &norm, &uplo, &N); - CORE_zlansy( norm, uplo, N, A, ldA, work, normA); + TCORE_zlansy( norm, uplo, N, tileA, tilework->mat, tilenormA->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -53,7 +51,7 @@ CODELETS_CPU(zlansy, 3, cl_zlansy_cpu_func) void INSERT_TASK_zlansy( const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { (void)NB; @@ -79,5 +77,4 @@ void INSERT_TASK_zlansy( const RUNTIME_option_t *options, STARPU_NAME, "zlansy", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zlantr.c b/runtime/starpu/codelets/codelet_zlantr.c index 1154bfd47d89217d5aaf3cf39d2ffba8b57dace1..4d4020d67e6d41f5b7f77613f94b6ed84bfd03c1 100644 --- a/runtime/starpu/codelets/codelet_zlantr.c +++ b/runtime/starpu/codelets/codelet_zlantr.c @@ -25,20 +25,18 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zlantr_cpu_func(void *descr[], void *cl_arg) { - double *normA; + CHAM_tile_t *tilenormA; cham_normtype_t norm, uplo, diag; int M, N; - CHAMELEON_Complex64_t *A; - int ldA; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tilework; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); + tilework = cti_interface_get(descr[1]); + tilenormA = cti_interface_get(descr[2]); starpu_codelet_unpack_args(cl_arg, &norm, &uplo, &diag, &M, &N); - CORE_zlantr( norm, uplo, diag, M, N, A, ldA, work, normA); + TCORE_zlantr( norm, uplo, diag, M, N, tileA, tilework->mat, tilenormA->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -50,7 +48,7 @@ CODELETS_CPU(zlantr, 3, 
cl_zlantr_cpu_func) void INSERT_TASK_zlantr( const RUNTIME_option_t *options, cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, int M, int N, int NB, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_zlantr; @@ -77,7 +75,6 @@ void INSERT_TASK_zlantr( const RUNTIME_option_t *options, STARPU_NAME, "zlantr", #endif 0); - (void)ldA; (void)NB; } diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index c454234a5b8eafd5556536f1bdfe04cd94bcc49e..58b4d3ad2871f57aef7631818ba269d85ebbc16c 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -30,14 +30,12 @@ static void cl_zlascal_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &alpha); - CORE_zlascal(uplo, M, N, alpha, A, ldA); + TCORE_zlascal(uplo, M, N, alpha, tileA); return; } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -47,42 +45,11 @@ static void cl_zlascal_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zlascal, 1, cl_zlascal_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zlascal adds to matrices together. - * - * A <- alpha * A - * - ******************************************************************************* - * - * @param[in] M - * Number of rows of the matrices A and B. - * - * @param[in] N - * Number of columns of the matrices A and B. - * - * @param[in] alpha - * Scalar factor of A. - * - * @param[in] A - * Matrix of size ldA-by-N. - * - * @param[in] ldA - * Leading dimension of the array A. 
ldA >= max(1,M) - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, CHAMELEON_Complex64_t alpha, - const CHAM_desc_t *A, int Am, int An, int ldA) + const CHAM_desc_t *A, int Am, int An) { (void)nb; struct starpu_codelet *codelet = &cl_zlascal; @@ -105,5 +72,4 @@ void INSERT_TASK_zlascal(const RUNTIME_option_t *options, STARPU_NAME, "zlascal", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zlaset.c b/runtime/starpu/codelets/codelet_zlaset.c index ba27e1925dd038b016ec1b9f76a5d076d6063545..5aecc3c3a1fade54184a35093e7b99c9c5eaaf7b 100644 --- a/runtime/starpu/codelets/codelet_zlaset.c +++ b/runtime/starpu/codelets/codelet_zlaset.c @@ -34,14 +34,12 @@ static void cl_zlaset_cpu_func(void *descr[], void *cl_arg) int N; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &alpha, &beta); - CORE_zlaset(uplo, M, N, alpha, beta, A, ldA); + TCORE_zlaset(uplo, M, N, alpha, beta, tileA); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -50,45 +48,10 @@ static void cl_zlaset_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zlaset, 1, cl_zlaset_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zlaset - Sets the elements of the matrix A on the diagonal - * to beta and on the off-diagonals to alpha - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: Upper part of A is set; - * = ChamLower: Lower part of A 
is set; - * = ChamUpperLower: ALL elements of A are set. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the off-diagonal elements are to be set. - * - * @param[in] beta - * The constant to which the diagonal elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set accordingly. - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - */ void INSERT_TASK_zlaset(const RUNTIME_option_t *options, cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - const CHAM_desc_t *A, int Am, int An, int ldA) + const CHAM_desc_t *A, int Am, int An) { struct starpu_codelet *codelet = &cl_zlaset; @@ -112,5 +75,4 @@ void INSERT_TASK_zlaset(const RUNTIME_option_t *options, STARPU_NAME, "zlaset", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zlaset2.c b/runtime/starpu/codelets/codelet_zlaset2.c index c7efc2a9c7b5b951f0ad11786df39fc27ea516b4..f9344fc291f26c95a1c1c49069e2d929ea9f3b70 100644 --- a/runtime/starpu/codelets/codelet_zlaset2.c +++ b/runtime/starpu/codelets/codelet_zlaset2.c @@ -33,13 +33,11 @@ static void cl_zlaset2_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &alpha); - CORE_zlaset2(uplo, M, N, alpha, A, ldA); + TCORE_zlaset2(uplo, M, N, alpha, tileA); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -48,42 +46,9 @@ static void cl_zlaset2_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zlaset2, 1, cl_zlaset2_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zlaset2 - Sets the 
elements of the matrix A to alpha. - * Not LAPACK compliant! Read below. - * - ******************************************************************************* - * - * @param[in] uplo - * Specifies which elements of the matrix are to be set - * = ChamUpper: STRICT Upper part of A is set to alpha; - * = ChamLower: STRICT Lower part of A is set to alpha; - * = ChamUpperLower: ALL elements of A are set to alpha. - * Not LAPACK Compliant. - * - * @param[in] M - * The number of rows of the matrix A. M >= 0. - * - * @param[in] N - * The number of columns of the matrix A. N >= 0. - * - * @param[in] alpha - * The constant to which the elements are to be set. - * - * @param[in,out] A - * On entry, the M-by-N tile A. - * On exit, A has been set to alpha accordingly. - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - */ void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An) { struct starpu_codelet *codelet = &cl_zlaset2; @@ -106,5 +71,4 @@ void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, STARPU_NAME, "zlaset2", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zlatro.c b/runtime/starpu/codelets/codelet_zlatro.c index f2a1b07797af1a2eed1ae385d99dca7c57e8823e..d3d6a18dc3c614a440c3334c63c76048fd2715af 100644 --- a/runtime/starpu/codelets/codelet_zlatro.c +++ b/runtime/starpu/codelets/codelet_zlatro.c @@ -34,18 +34,14 @@ static void cl_zlatro_cpu_func(void *descr[], void *cl_arg) cham_trans_t trans; int M; int N; - const CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = 
STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &M, &N); - CORE_zlatro(uplo, trans, M, N, A, ldA, B, ldB); + TCORE_zlatro(uplo, trans, M, N, tileA, tileB); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -62,8 +58,8 @@ CODELETS_CPU(zlatro, 2, cl_zlatro_cpu_func) void INSERT_TASK_zlatro( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_zlatro; void (*callback)(void*) = NULL; @@ -87,7 +83,5 @@ void INSERT_TASK_zlatro( const RUNTIME_option_t *options, STARPU_NAME, "zlatro", #endif 0); - (void)ldA; - (void)ldB; (void)mb; } diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index 31f742e60e2bc188e50b276fab7b5b88c67c21c9..db67785550c6c6c0ca93d37a1d101e7461774a86 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -32,14 +32,12 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) { cham_uplo_t uplo; int N; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &N); - CORE_zlauum(uplo, N, A, ldA); + TCORE_zlauum(uplo, N, tileA); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -55,7 +53,7 @@ CODELETS_CPU(zlauum, 1, cl_zlauum_cpu_func) */ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA ) + const CHAM_desc_t *A, int Am, int An ) { (void)nb; struct starpu_codelet *codelet = &cl_zlauum; @@ -77,5 +75,4 @@ void 
INSERT_TASK_zlauum( const RUNTIME_option_t *options, #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zplghe.c b/runtime/starpu/codelets/codelet_zplghe.c index fc3a062f7bb5d9daacae243e80cb23a0286853cd..20da9d0ab75e78e6e0d6379354b3597493302fb7 100644 --- a/runtime/starpu/codelets/codelet_zplghe.c +++ b/runtime/starpu/codelets/codelet_zplghe.c @@ -35,18 +35,16 @@ static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) double bump; int m; int n; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int bigM; int m0; int n0; unsigned long long int seed; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &bump, &m, &n, &bigM, &m0, &n0, &seed ); - CORE_zplghe( bump, m, n, A, ldA, bigM, m0, n0, seed ); + TCORE_zplghe( bump, m, n, tileA, bigM, m0, n0, seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,7 +54,7 @@ static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zplghe, 1, cl_zplghe_cpu_func) void INSERT_TASK_zplghe( const RUNTIME_option_t *options, - double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int ldA, + double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { struct starpu_codelet *codelet = &cl_zplghe; @@ -82,5 +80,4 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, STARPU_NAME, "zplghe", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zplgsy.c b/runtime/starpu/codelets/codelet_zplgsy.c index e98a40ea03fe04a7ef94b116bab302308f9f9b03..4fbc00d7d6f5a4b8968721bd7a08a3e1408b47c5 100644 --- a/runtime/starpu/codelets/codelet_zplgsy.c +++ b/runtime/starpu/codelets/codelet_zplgsy.c @@ -35,18 +35,16 @@ static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) CHAMELEON_Complex64_t bump; int m; int n; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int 
bigM; int m0; int n0; unsigned long long int seed; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &bump, &m, &n, &bigM, &m0, &n0, &seed ); - CORE_zplgsy( bump, m, n, A, ldA, bigM, m0, n0, seed ); + TCORE_zplgsy( bump, m, n, tileA, bigM, m0, n0, seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,7 +54,7 @@ static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zplgsy, 1, cl_zplgsy_cpu_func) void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, - CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int ldA, + CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { @@ -83,5 +81,4 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, STARPU_NAME, "zplgsy", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c index 8ec8b960985c5af8d22bd69354907caff847cb7e..28b82334c073a28e9d4cfedbeae16529f979e822 100644 --- a/runtime/starpu/codelets/codelet_zplrnt.c +++ b/runtime/starpu/codelets/codelet_zplrnt.c @@ -32,18 +32,16 @@ static void cl_zplrnt_cpu_func(void *descr[], void *cl_arg) { int m; int n; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int bigM; int m0; int n0; unsigned long long int seed; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &m, &n, &bigM, &m0, &n0, &seed ); - CORE_zplrnt( m, n, A, ldA, bigM, m0, n0, seed ); + TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -53,7 +51,7 @@ static void cl_zplrnt_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zplrnt, 1, cl_zplrnt_cpu_func) void INSERT_TASK_zplrnt( const RUNTIME_option_t 
*options, - int m, int n, const CHAM_desc_t *A, int Am, int An, int ldA, + int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { @@ -79,5 +77,4 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, STARPU_NAME, "zplrnt", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zplssq.c b/runtime/starpu/codelets/codelet_zplssq.c index 96a41eb75bf9e30d9ad939acd8c29f89c71cc7c0..7da4758d7366c3e32a93e550d2a17ddc8793ce1d 100644 --- a/runtime/starpu/codelets/codelet_zplssq.c +++ b/runtime/starpu/codelets/codelet_zplssq.c @@ -29,14 +29,17 @@ static void cl_zplssq_cpu_func(void *descr[], void *cl_arg) cham_store_t storev; int M; int N; - double *SCLSSQ_IN; - double *SCLSSQ_OUT; + CHAM_tile_t *tileIN; + CHAM_tile_t *tileOUT; - starpu_codelet_unpack_args(cl_arg, &storev, &M, &N); - SCLSSQ_IN = (double *)STARPU_MATRIX_GET_PTR(descr[0]); - SCLSSQ_OUT = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + starpu_codelet_unpack_args( cl_arg, &storev, &M, &N ); + tileIN = cti_interface_get(descr[0]); + tileOUT = cti_interface_get(descr[1]); - CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT); + assert( tileIN->format & CHAMELEON_TILE_FULLRANK ); + assert( tileOUT->format & CHAMELEON_TILE_FULLRANK ); + + CORE_zplssq( storev, M, N, tileIN->mat, tileOUT->mat ); (void)cl_arg; } @@ -49,15 +52,15 @@ CODELETS_CPU(zplssq, 2, cl_zplssq_cpu_func) void INSERT_TASK_zplssq( const RUNTIME_option_t *options, cham_store_t storev, int M, int N, - const CHAM_desc_t *SCLSSQ_IN, int SCLSSQ_INm, int SCLSSQ_INn, - const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn ) + const CHAM_desc_t *IN, int INm, int INn, + const CHAM_desc_t *OUT, int OUTm, int OUTn ) { struct starpu_codelet *codelet = &cl_zplssq; void (*callback)(void*) = options->profiling ? 
cl_zplssq_callback : NULL; CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_R( SCLSSQ_IN, SCLSSQ_INm, SCLSSQ_INn ); - CHAMELEON_ACCESS_RW( SCLSSQ_OUT, SCLSSQ_OUTm, SCLSSQ_OUTn ); + CHAMELEON_ACCESS_R( IN, INm, INn ); + CHAMELEON_ACCESS_RW( OUT, OUTm, OUTn ); CHAMELEON_END_ACCESS_DECLARATION; starpu_insert_task( @@ -65,8 +68,8 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, STARPU_VALUE, &storev, sizeof(int), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR( SCLSSQ_IN, double, SCLSSQ_INm, SCLSSQ_INn ), - STARPU_RW, RTBLKADDR( SCLSSQ_OUT, double, SCLSSQ_OUTm, SCLSSQ_OUTn ), + STARPU_R, RTBLKADDR( IN, double, INm, INn ), + STARPU_RW, RTBLKADDR( OUT, double, OUTm, OUTn ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) @@ -79,12 +82,14 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, static void cl_zplssq2_cpu_func(void *descr[], void *cl_arg) { int N; - double *RESULT; + CHAM_tile_t *tileRESULT; starpu_codelet_unpack_args(cl_arg, &N); - RESULT = (double *)STARPU_MATRIX_GET_PTR(descr[0]); + tileRESULT = cti_interface_get(descr[0]); + + assert( tileRESULT->format & CHAMELEON_TILE_FULLRANK ); - CORE_zplssq2(N, RESULT); + CORE_zplssq2( N, tileRESULT->mat ); (void)cl_arg; } diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index c0bead183bc7223ae5acfb3e3f50156886f0798e..01ef85d594e64be2db165fac4e9a3f697fe07088 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -32,18 +32,16 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) { cham_uplo_t uplo; int n; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int iinfo; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int info = 0; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); 
starpu_codelet_unpack_args(cl_arg, &uplo, &n, &iinfo, &sequence, &request); - CORE_zpotrf(uplo, n, A, ldA, &info); + TCORE_zpotrf(uplo, n, tileA, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); @@ -63,7 +61,7 @@ CODELETS_CPU(zpotrf, 1, cl_zpotrf_cpu_func) */ void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, int iinfo) { (void)nb; @@ -89,5 +87,4 @@ void INSERT_TASK_zpotrf(const RUNTIME_option_t *options, STARPU_NAME, "zpotrf", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c index a46520512775908bddd6453e3595f7f64f43d1a6..78275624e8a67c0d6cc6d50ee50622159aa40037 100644 --- a/runtime/starpu/codelets/codelet_zssssm.c +++ b/runtime/starpu/codelets/codelet_zssssm.c @@ -36,26 +36,18 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg) int n2; int k; int ib; - CHAMELEON_Complex64_t *A1; - int ldA1; - CHAMELEON_Complex64_t *A2; - int ldA2; - CHAMELEON_Complex64_t *L1; - int ldL1; - CHAMELEON_Complex64_t *L2; - int ldL2; + CHAM_tile_t *tileA1; + CHAM_tile_t *tileA2; + CHAM_tile_t *tileL1; + CHAM_tile_t *tileL2; int *IPIV; - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - L1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - L2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - ldA1 = STARPU_MATRIX_GET_LD( descr[0] ); - ldA2 = STARPU_MATRIX_GET_LD( descr[1] ); - ldL1 = STARPU_MATRIX_GET_LD( descr[2] ); - ldL2 = STARPU_MATRIX_GET_LD( descr[3] ); + tileA1 = cti_interface_get(descr[0]); + tileA2 = cti_interface_get(descr[1]); + tileL1 = cti_interface_get(descr[2]); + tileL2 = cti_interface_get(descr[3]); starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &IPIV); - 
CORE_zssssm(m1, n1, m2, n2, k, ib, A1, ldA1, A2, ldA2, L1, ldL1, L2, ldL2, IPIV); + TCORE_zssssm(m1, n1, m2, n2, k, ib, tileA1, tileA2, tileL1, tileL2, IPIV); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -64,81 +56,12 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zssssm, 4, cl_zssssm_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zssssm applies the LU factorization update from a complex - * matrix formed by a lower triangular IB-by-K tile L1 on top of a - * M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 - * tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M1 - * The number of rows of the tile A1. M1 >= 0. - * - * @param[in] N1 - * The number of columns of the tile A1. N1 >= 0. - * - * @param[in] M2 - * The number of rows of the tile A2 and of the tile L2. - * M2 >= 0. - * - * @param[in] N2 - * The number of columns of the tile A2. N2 >= 0. - * - * @param[in] K - * The number of columns of the tiles L1 and L2. K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in,out] A1 - * On entry, the M1-by-N1 tile A1. - * On exit, A1 is updated by the application of L (L1 L2). - * - * @param[in] ldA1 - * The leading dimension of the array A1. ldA1 >= max(1,M1). - * - * @param[in,out] A2 - * On entry, the M2-by-N2 tile A2. - * On exit, A2 is updated by the application of L (L1 L2). - * - * @param[in] ldA2 - * The leading dimension of the array A2. ldA2 >= max(1,M2). - * - * @param[in] L1 - * The IB-by-K lower triangular tile as returned by - * CORE_ztstrf. - * - * @param[in] ldL1 - * The leading dimension of the array L1. ldL1 >= max(1,IB). - * - * @param[in] L2 - * The M2-by-K tile as returned by CORE_ztstrf. - * - * @param[in] ldL2 - * The leading dimension of the array L2. ldL2 >= max(1,M2). 
- * - * @param[in] IPIV - * The pivot indices array of size K as returned by - * CORE_ztstrf. - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * - */ void INSERT_TASK_zssssm( const RUNTIME_option_t *options, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int ldA1, - const CHAM_desc_t *A2, int A2m, int A2n, int ldA2, - const CHAM_desc_t *L1, int L1m, int L1n, int ldL1, - const CHAM_desc_t *L2, int L2m, int L2n, int ldL2, + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *L1, int L1m, int L1n, + const CHAM_desc_t *L2, int L2m, int L2n, const int *IPIV ) { (void)nb; diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index fc22d08f905b71bbce52a68e00637eb5f5105660..844347ac3b4f75f8c2338c0dbc7c41a239667340 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -35,26 +35,21 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - CORE_zsymm(side, uplo, + 
TCORE_zsymm(side, uplo, M, N, - alpha, A, ldA, - B, ldB, - beta, C, ldC); + alpha, tileA, + tileB, + beta, tileC); } #ifdef CHAMELEON_USE_CUDA @@ -65,20 +60,15 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) int M; int N; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - const cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; cuDoubleComplex beta; - cuDoubleComplex *C; - int ldC; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); RUNTIME_getStream(stream); @@ -86,9 +76,9 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) CUDA_zsymm( side, uplo, M, N, - &alpha, A, ldA, - B, ldB, - &beta, C, ldC, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -113,9 +103,9 @@ CODELETS(zsymm, 3, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zsymm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { (void)nb; struct starpu_codelet *codelet = &cl_zsymm; @@ -144,7 +134,4 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, 
STARPU_NAME, "zsymm", #endif 0); - (void)ldC; - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 45b5377a7999ff0f93816d255205e795efb20cb4..95f5f28a91ed292a79e1e7519580cf325c7d906f 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -35,23 +35,18 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg) int n; int k; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - CORE_zsyr2k(uplo, trans, - n, k, alpha, A, ldA, B, ldB, beta, C, ldC); + TCORE_zsyr2k(uplo, trans, + n, k, alpha, tileA, tileB, beta, tileC); } #ifdef CHAMELEON_USE_CUDA @@ -62,26 +57,24 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) int n; int k; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - const cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; cuDoubleComplex beta; - cuDoubleComplex *C; - int ldC; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); 
- ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); RUNTIME_getStream(stream); CUDA_zsyr2k( uplo, trans, - n, k, &alpha, A, ldA, B, ldB, &beta, C, ldC, + n, k, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -106,9 +99,9 @@ CODELETS(zsyr2k, 3, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { (void)nb; struct starpu_codelet *codelet = &cl_zsyr2k; @@ -137,7 +130,4 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, STARPU_NAME, "zsyr2k", #endif 0); - (void)ldC; - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 76faf111d8419347a345255c18d5fffc89c72553..a9dd529de0e0879e42b794feee8ea02c87df26db 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -35,22 +35,18 @@ static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg) int n; int k; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *C; - int ldC; + CHAM_tile_t *tileC; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + tileA = cti_interface_get(descr[0]); + tileC = cti_interface_get(descr[1]); 
- ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldC = STARPU_MATRIX_GET_LD( descr[1] ); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - CORE_zsyrk(uplo, trans, + TCORE_zsyrk(uplo, trans, n, k, - alpha, A, ldA, - beta, C, ldC); + alpha, tileA, + beta, tileC); } #ifdef CHAMELEON_USE_CUDA @@ -61,25 +57,21 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) int n; int k; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; + CHAM_tile_t *tileA; cuDoubleComplex beta; - cuDoubleComplex *C; - int ldC; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileC = cti_interface_get(descr[1]); - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldC = STARPU_MATRIX_GET_LD( descr[1] ); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); RUNTIME_getStream(stream); CUDA_zsyrk( - uplo, trans, - n, k, - &alpha, A, ldA, - &beta, C, ldC, + uplo, trans, n, k, + &alpha, tileA->mat, tileA->ld, + &beta, tileC->mat, tileC->ld, stream); #ifndef STARPU_CUDA_ASYNC @@ -104,8 +96,8 @@ CODELETS(zsyrk, 2, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldC) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) { (void)nb; struct starpu_codelet *codelet = &cl_zsyrk; @@ -132,6 +124,4 @@ void INSERT_TASK_zsyrk(const RUNTIME_option_t *options, STARPU_NAME, "zsyrk", #endif 0); - (void)ldC; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zsyssq.c b/runtime/starpu/codelets/codelet_zsyssq.c index 
678fb8d024522f5d669181fb3fb866a0329a2202..c2d6c8edb81de46be3b9e7ee9efe2583ffce0b79 100644 --- a/runtime/starpu/codelets/codelet_zsyssq.c +++ b/runtime/starpu/codelets/codelet_zsyssq.c @@ -28,15 +28,13 @@ static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg) cham_store_t storev; cham_uplo_t uplo; int n; - CHAMELEON_Complex64_t *A; - int ldA; - double *SCALESUMSQ; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &n); - CORE_zsyssq( storev, uplo, n, A, ldA, SCALESUMSQ ); + tileA = cti_interface_get(descr[0]); + tileW = cti_interface_get(descr[1]); + starpu_codelet_unpack_args( cl_arg, &storev, &uplo, &n ); + TCORE_zsyssq( storev, uplo, n, tileA, tileW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -47,7 +45,7 @@ CODELETS_CPU(zsyssq, 2, cl_zsyssq_cpu_func) void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { struct starpu_codelet *codelet = &cl_zsyssq; @@ -71,5 +69,4 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, STARPU_NAME, "zsyssq", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zsytrf_nopiv.c b/runtime/starpu/codelets/codelet_zsytrf_nopiv.c index 3f4d7f1070ad37f2e0043292188be3c934edac12..92e0b60b08704c5c2f1f0b0534dc84ca19d15bca 100644 --- a/runtime/starpu/codelets/codelet_zsytrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zsytrf_nopiv.c @@ -32,14 +32,12 @@ static void cl_zsytrf_nopiv_cpu_func(void *descr[], void *cl_arg) { cham_uplo_t uplo; int n; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int iinfo; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = 
STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); starpu_codelet_unpack_args(cl_arg, &uplo, &n, &iinfo); - CORE_zsytf2_nopiv(uplo, n, A, ldA); + TCORE_zsytf2_nopiv(uplo, n, tileA); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -50,7 +48,7 @@ CODELETS_CPU(zsytrf_nopiv, 1, cl_zsytrf_nopiv_cpu_func) void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, int iinfo ) { (void)nb; @@ -74,5 +72,4 @@ void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, STARPU_NAME, "zsytrf_nopiv", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_ztplqt.c b/runtime/starpu/codelets/codelet_ztplqt.c index 3809706d50485c4bc3ddb917a4186f995bdefc81..04aca83f3f54d29e3ae5c21e4323d2132bff0749 100644 --- a/runtime/starpu/codelets/codelet_ztplqt.c +++ b/runtime/starpu/codelets/codelet_ztplqt.c @@ -28,26 +28,20 @@ static void cl_ztplqt_cpu_func(void *descr[], void *cl_arg) int N; int L; int ib; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; - CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *WORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileT; + CHAM_tile_t *tileWORK; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldT = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileT = cti_interface_get(descr[2]); + tileWORK = cti_interface_get(descr[3]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib ); - CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldT ); - CORE_ztplqt( M, 
N, L, ib, - A, ldA, B, ldB, T, ldT, WORK ); + TCORE_zlaset( ChamUpperLower, ib, M, 0., 0., tileT ); + TCORE_ztplqt( M, N, L, ib, + tileA, tileB, tileT, tileWORK->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -58,9 +52,9 @@ CODELETS_CPU(ztplqt, 4, cl_ztplqt_cpu_func) void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - const CHAM_desc_t *T, int Tm, int Tn, int ldT ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { struct starpu_codelet *codelet = &cl_ztplqt; void (*callback)(void*) = options->profiling ? cl_ztplqt_callback : NULL; @@ -91,8 +85,6 @@ void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, STARPU_NAME, (L == 0) ? "ztplqs" : "ztplqt", #endif 0); - (void)ldB; - (void)ldA; (void)ib; (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c index 15f9be5a2ae9c8d2333a8bd8f547fdbcd7f7578f..32cefc983b2db4f5d256657bfb68bbc14c0ec633 100644 --- a/runtime/starpu/codelets/codelet_ztpmlqt.c +++ b/runtime/starpu/codelets/codelet_ztpmlqt.c @@ -29,32 +29,22 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) int K; int L; int ib; - const CHAMELEON_Complex64_t *V; - int ldV; - const CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; - CHAMELEON_Complex64_t *WORK; size_t lwork; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileW; - V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib 
* nb */ - ldV = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldA = STARPU_MATRIX_GET_LD( descr[2] ); - ldB = STARPU_MATRIX_GET_LD( descr[3] ); + tileV = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileA = cti_interface_get(descr[2]); + tileB = cti_interface_get(descr[3]); + tileW = cti_interface_get(descr[4]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); - CORE_ztpmlqt( side, trans, M, N, K, L, ib, - V, ldV, T, ldT, A, ldA, B, ldB, WORK ); - - (void)lwork; + TCORE_ztpmlqt( side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, tileW->mat ); } #if defined(CHAMELEON_USE_CUDA) @@ -67,26 +57,18 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) int K; int L; int ib; - const cuDoubleComplex *V; - int ldV; - const cuDoubleComplex *T; - int ldT; - cuDoubleComplex *A; - int ldA; - cuDoubleComplex *B; - int ldB; - cuDoubleComplex *W; size_t lwork; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileW; - V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */ - ldV = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldA = STARPU_MATRIX_GET_LD( descr[2] ); - ldB = STARPU_MATRIX_GET_LD( descr[3] ); + tileV = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileA = cti_interface_get(descr[2]); + tileB = cti_interface_get(descr[3]); + tileW = cti_interface_get(descr[4]); /* 3*ib*nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); @@ -94,8 +76,11 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) CUDA_ztpmlqt( side, trans, M, N, K, L, ib, - V, ldV, 
T, ldT, A, ldA, B, ldB, - W, lwork, stream ); + tileV->mat, tileV->ld, + tileT->mat, tileT->ld, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + tileW->mat, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -112,10 +97,10 @@ CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYN void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldV, - const CHAM_desc_t *T, int Tm, int Tn, int ldT, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_ztpmlqt; void (*callback)(void*) = options->profiling ? cl_ztpmlqt_callback : NULL; @@ -136,11 +121,11 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, STARPU_VALUE, &K, sizeof(int), STARPU_VALUE, &L, sizeof(int), STARPU_VALUE, &ib, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, @@ -152,9 +137,6 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, STARPU_NAME, (( L == 0 ) ? 
"ztsmlq" : "ztpmlqt"), #endif 0); - (void)ldA; - (void)ldT; - (void)ldV; (void)ib; (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index ff225663d9c1f298bf8b16ad5eb7c342e8640165..2f921000d506c73baafa5f48ab5526c880a42986 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -29,35 +29,24 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) int K; int L; int ib; - const CHAMELEON_Complex64_t *V; - int ldV; - const CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; - CHAMELEON_Complex64_t *WORK; size_t lwork; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileW; - V = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - ldV = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldA = STARPU_MATRIX_GET_LD( descr[2] ); - ldB = STARPU_MATRIX_GET_LD( descr[3] ); + tileV = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileA = cti_interface_get(descr[2]); + tileB = cti_interface_get(descr[3]); + tileW = cti_interface_get(descr[4]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); - CORE_ztpmqrt( side, trans, M, N, K, L, ib, - V, ldV, T, ldT, A, ldA, B, ldB, WORK ); - - (void)lwork; + TCORE_ztpmqrt( side, trans, M, N, K, L, ib, + tileV, tileT, tileA, tileB, tileW->mat ); } - #if defined(CHAMELEON_USE_CUDA) static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) { @@ -68,34 +57,30 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) int 
K; int L; int ib; - const cuDoubleComplex *V; - int ldV; - const cuDoubleComplex *T; - int ldT; - cuDoubleComplex *A; - int ldA; - cuDoubleComplex *B; - int ldB; - cuDoubleComplex *W; size_t lwork; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileW; + + tileV = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileA = cti_interface_get(descr[2]); + tileB = cti_interface_get(descr[3]); + tileW = cti_interface_get(descr[4]); /* 3*ib*nb */ - V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 3*ib*nb */ - ldV = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldA = STARPU_MATRIX_GET_LD( descr[2] ); - ldB = STARPU_MATRIX_GET_LD( descr[3] ); starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); RUNTIME_getStream(stream); CUDA_ztpmqrt( side, trans, M, N, K, L, ib, - V, ldV, T, ldT, A, ldA, B, ldB, - W, lwork, stream ); + tileV->mat, tileV->ld, + tileT->mat, tileT->ld, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + tileW->mat, lwork, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -112,10 +97,10 @@ CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYN void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int ib, int nb, - const CHAM_desc_t *V, int Vm, int Vn, int ldV, - const CHAM_desc_t *T, int Tm, int Tn, int ldT, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn 
) { struct starpu_codelet *codelet = &cl_ztpmqrt; void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL; @@ -136,11 +121,11 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, STARPU_VALUE, &K, sizeof(int), STARPU_VALUE, &L, sizeof(int), STARPU_VALUE, &ib, sizeof(int), + STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, @@ -152,9 +137,6 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, STARPU_NAME, (( L == 0 ) ? "ztsmqr" : "ztpmqrt"), #endif 0); - (void)ldA; - (void)ldT; - (void)ldV; (void)ib; (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztpqrt.c b/runtime/starpu/codelets/codelet_ztpqrt.c index 26962b5cb0189ea53a8200be56a5b1127cb54d5c..c806232a2ad3b09f0588b672cf1c59289e58c733 100644 --- a/runtime/starpu/codelets/codelet_ztpqrt.c +++ b/runtime/starpu/codelets/codelet_ztpqrt.c @@ -27,26 +27,20 @@ static void cl_ztpqrt_cpu_func(void *descr[], void *cl_arg) int N; int L; int ib; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; - CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *WORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileT; + CHAM_tile_t *tileWORK; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); - ldT = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileB = 
cti_interface_get(descr[1]); + tileT = cti_interface_get(descr[2]); + tileWORK = cti_interface_get(descr[3]); /* ib * nb */ starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib ); - CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldT ); - CORE_ztpqrt( M, N, L, ib, - A, ldA, B, ldB, T, ldT, WORK ); + TCORE_zlaset( ChamUpperLower, ib, N, 0., 0., tileT ); + TCORE_ztpqrt( M, N, L, ib, + tileA, tileB, tileT, tileWORK->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -57,9 +51,9 @@ CODELETS_CPU(ztpqrt, 4, cl_ztpqrt_cpu_func) void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, int M, int N, int L, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB, - const CHAM_desc_t *T, int Tm, int Tn, int ldT ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *T, int Tm, int Tn ) { struct starpu_codelet *codelet = &cl_ztpqrt; void (*callback)(void*) = options->profiling ? cl_ztpqrt_callback : NULL; @@ -90,8 +84,6 @@ void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, STARPU_NAME, "ztpqrt", #endif 0); - (void)ldB; - (void)ldA; (void)ib; (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index ada8343d759bd8c8972d6da1c93def0dd2793dff..d7799dc408dddac73b544e95cfcb2e4ef6abb158 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -31,18 +31,14 @@ static void cl_ztradd_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileB; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = 
cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &M, &N, &alpha, &beta); - CORE_ztradd(uplo, trans, M, N, alpha, A, ldA, beta, B, ldB); + TCORE_ztradd(uplo, trans, M, N, alpha, tileA, beta, tileB); return; } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -113,8 +109,8 @@ CODELETS_CPU(ztradd, 2, cl_ztradd_cpu_func) */ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn, int ldB ) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_ztradd; void (*callback)(void*) = options->profiling ? cl_zgeadd_callback : NULL; @@ -140,7 +136,6 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, STARPU_NAME, "ztradd", #endif 0); - (void)ldA; (void)nb; } diff --git a/runtime/starpu/codelets/codelet_ztrasm.c b/runtime/starpu/codelets/codelet_ztrasm.c index a616b14c8efd0e47803a0d5115bfbe4863ab2938..1062237b4e9055a189b3bfd23e04719887c506e2 100644 --- a/runtime/starpu/codelets/codelet_ztrasm.c +++ b/runtime/starpu/codelets/codelet_ztrasm.c @@ -31,15 +31,13 @@ static void cl_ztrasm_cpu_func(void *descr[], void *cl_arg) cham_diag_t diag; int M; int N; - CHAMELEON_Complex64_t *A; - int ldA; - double *work; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); + tileW = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &diag, &M, &N); - CORE_ztrasm(storev, uplo, diag, M, N, A, ldA, work); + TCORE_ztrasm(storev, uplo, diag, M, N, tileA, tileW->mat ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -50,7 +48,7 
@@ CODELETS_CPU(ztrasm, 2, cl_ztrasm_cpu_func) void INSERT_TASK_ztrasm( const RUNTIME_option_t *options, cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_ztrasm; @@ -76,5 +74,4 @@ void INSERT_TASK_ztrasm( const RUNTIME_option_t *options, STARPU_NAME, "ztrasm", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 322c2326c554dd6ecebb36e2e3b30568d9164a20..9ae9cdf796356e5b5438746d825dda5bd7a765a3 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -37,22 +37,18 @@ static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha); - CORE_ztrmm(side, uplo, + TCORE_ztrmm(side, uplo, transA, diag, M, N, - alpha, A, ldA, - B, ldB); + alpha, tileA, + tileB); } #ifdef CHAMELEON_USE_CUDA @@ -65,26 +61,20 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) int M; int N; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = 
cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha); RUNTIME_getStream(stream); CUDA_ztrmm( - side, uplo, - transA, diag, - M, N, - &alpha, A, ldA, - B, ldB, - stream); + side, uplo, transA, diag, M, N, &alpha, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -109,8 +99,8 @@ CODELETS(ztrmm, 2, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { (void)nb; struct starpu_codelet *codelet = &cl_ztrmm; @@ -138,6 +128,4 @@ void INSERT_TASK_ztrmm(const RUNTIME_option_t *options, STARPU_NAME, "ztrmm", #endif 0); - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 1d3281bdb93e65cbc6948a7486643bccb950aad9..6155433fac2fd0cae8e3cc324dfc00f4d6a6e7ae 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -37,21 +37,17 @@ static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg) int m; int n; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, 
&alpha); - CORE_ztrsm(side, uplo, + TCORE_ztrsm(side, uplo, transA, diag, m, n, - alpha, A, ldA, - B, ldB); + alpha, tileA, + tileB); } #ifdef CHAMELEON_USE_CUDA @@ -64,25 +60,20 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) int m; int n; cuDoubleComplex alpha; - const cuDoubleComplex *A; - int ldA; - cuDoubleComplex *B; - int ldB; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldB = STARPU_MATRIX_GET_LD( descr[1] ); + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha); RUNTIME_getStream(stream); CUDA_ztrsm( - side, uplo, transA, diag, - m, n, - &alpha, A, ldA, - B, ldB, - stream); + side, uplo, transA, diag, m, n, &alpha, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -106,8 +97,8 @@ CODELETS(ztrsm, 2, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC) void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *B, int Bm, int Bn, int ldB) + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn) { (void)nb; struct starpu_codelet *codelet = &cl_ztrsm; @@ -135,6 +126,4 @@ void INSERT_TASK_ztrsm(const RUNTIME_option_t *options, STARPU_NAME, "ztrsm", #endif 0); - (void)ldB; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_ztrssq.c b/runtime/starpu/codelets/codelet_ztrssq.c index 4374a51f487e67885dcb2a27a2e1573e44b56cbe..2ce632d23adb39bdabca34e50b3bd0151141a9b1 100644 --- a/runtime/starpu/codelets/codelet_ztrssq.c +++ 
b/runtime/starpu/codelets/codelet_ztrssq.c @@ -29,15 +29,13 @@ static void cl_ztrssq_cpu_func(void *descr[], void *cl_arg) cham_diag_t diag; int m; int n; - CHAMELEON_Complex64_t *A; - int ldA; - double *SCALESUMSQ; + CHAM_tile_t *tileA; + CHAM_tile_t *tileW; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &m, &n); - CORE_ztrssq( uplo, diag, m, n, A, ldA, &SCALESUMSQ[0], &SCALESUMSQ[1]); + tileA = cti_interface_get(descr[0]); + tileW = cti_interface_get(descr[1]); + starpu_codelet_unpack_args( cl_arg, &uplo, &diag, &m, &n ); + TCORE_ztrssq( uplo, diag, m, n, tileA, tileW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -49,7 +47,7 @@ CODELETS_CPU(ztrssq, 2, cl_ztrssq_cpu_func) void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int m, int n, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { struct starpu_codelet *codelet = &cl_ztrssq; @@ -74,5 +72,4 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, STARPU_NAME, "ztrssq", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 50b4921064ed6c3dec73ee45204a50076ea6dd39..aac4c7a04230b603c5a53d2feb7705f56a4a9a32 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -33,17 +33,15 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) cham_uplo_t uplo; cham_diag_t diag; int N; - CHAMELEON_Complex64_t *A; - int ldA; + CHAM_tile_t *tileA; int iinfo; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int info = 0; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - ldA = STARPU_MATRIX_GET_LD( descr[0] ); + tileA = cti_interface_get(descr[0]); 
starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &iinfo, &sequence, &request); - CORE_ztrtri(uplo, diag, N, A, ldA, &info); + TCORE_ztrtri(uplo, diag, N, tileA, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); @@ -64,7 +62,7 @@ CODELETS_CPU(ztrtri, 1, cl_ztrtri_cpu_func) void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_diag_t diag, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, + const CHAM_desc_t *A, int Am, int An, int iinfo ) { (void)nb; @@ -90,5 +88,4 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, STARPU_NAME, "ztrtri", #endif 0); - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c index 9056a098b17d1b38c83b4005de3387f4d0721c26..bca185baada20eb253a5029a7b61ab53b0e46e4c 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c @@ -34,31 +34,22 @@ static void cl_ztsmlq_hetra1_cpu_func(void *descr[], void *cl_arg) int n2; int k; int ib; - int nb; - CHAMELEON_Complex64_t *A1; - int ldA1; - CHAMELEON_Complex64_t *A2; - int ldA2; - CHAMELEON_Complex64_t *V; - int ldV; - CHAMELEON_Complex64_t *T; - int ldT; + CHAM_tile_t *tileA1; + CHAM_tile_t *tileA2; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileW; + int ldW; - CHAMELEON_Complex64_t *WORK; - int ldWORK; + tileA1 = cti_interface_get(descr[0]); + tileA2 = cti_interface_get(descr[1]); + tileV = cti_interface_get(descr[2]); + tileT = cti_interface_get(descr[3]); + tileW = cti_interface_get(descr[4]); /* ib * nb */ - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t 
*)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ - ldA1 = STARPU_MATRIX_GET_LD( descr[0] ); - ldA2 = STARPU_MATRIX_GET_LD( descr[1] ); - ldV = STARPU_MATRIX_GET_LD( descr[2] ); - ldT = STARPU_MATRIX_GET_LD( descr[3] ); - starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &nb, &ldWORK); - CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, - ib, A1, ldA1, A2, ldA2, V, ldV, T, ldT, WORK, ldWORK); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &ldW ); + TCORE_ztsmlq_hetra1( side, trans, m1, n1, m2, n2, k, ib, + tileA1, tileA2, tileV, tileT, tileW->mat, ldW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -75,10 +66,10 @@ CODELETS_CPU(ztsmlq_hetra1, 5, cl_ztsmlq_hetra1_cpu_func) void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int ldA1, - const CHAM_desc_t *A2, int A2m, int A2n, int ldA2, - const CHAM_desc_t *V, int Vm, int Vn, int ldV, - const CHAM_desc_t *T, int Tm, int Tn, int ldT ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { struct starpu_codelet *codelet = &cl_ztsmlq_hetra1; void (*callback)(void*) = options->profiling ? 
cl_ztsmlq_hetra1_callback : NULL; @@ -102,13 +93,12 @@ void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options, STARPU_VALUE, &n2, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_VALUE, &nb, sizeof(int), + STARPU_VALUE, &ldWORK, sizeof(int), STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldWORK, sizeof(int), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) diff --git a/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c b/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c index 9f3c2d3fbab220830105b149475f97787d6f34bc..e6e2ff53afcfb9e97fdf6843830d24d758534128 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c +++ b/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c @@ -34,32 +34,22 @@ static void cl_ztsmqr_hetra1_cpu_func(void *descr[], void *cl_arg) int n2; int k; int ib; - CHAMELEON_Complex64_t *A1; - int ldA1; - CHAMELEON_Complex64_t *A2; - int ldA2; - CHAMELEON_Complex64_t *V; - int ldV; - CHAMELEON_Complex64_t *T; - int ldT; + CHAM_tile_t *tileA1; + CHAM_tile_t *tileA2; + CHAM_tile_t *tileV; + CHAM_tile_t *tileT; + CHAM_tile_t *tileW; + int ldW; - /* TODO: manage workspace */ - CHAMELEON_Complex64_t *WORK; - int ldWORK; + tileA1 = cti_interface_get(descr[0]); + tileA2 = cti_interface_get(descr[1]); + tileV = cti_interface_get(descr[2]); + tileT = cti_interface_get(descr[3]); + tileW = cti_interface_get(descr[4]); - A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - V = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); 
- ldA1 = STARPU_MATRIX_GET_LD( descr[0] ); - ldA2 = STARPU_MATRIX_GET_LD( descr[1] ); - ldV = STARPU_MATRIX_GET_LD( descr[2] ); - ldT = STARPU_MATRIX_GET_LD( descr[3] ); - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, - &ib, &ldWORK); - CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, - ib, A1, ldA1, A2, ldA2, V, ldV, T, ldT, WORK, ldWORK); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &ldW ); + TCORE_ztsmqr_hetra1( side, trans, m1, n1, m2, n2, k, ib, + tileA1, tileA2, tileV, tileT, tileW->mat, ldW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -76,10 +66,10 @@ CODELETS_CPU(ztsmqr_hetra1, 5, cl_ztsmqr_hetra1_cpu_func) void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, - const CHAM_desc_t *A1, int A1m, int A1n, int ldA1, - const CHAM_desc_t *A2, int A2m, int A2n, int ldA2, - const CHAM_desc_t *V, int Vm, int Vn, int ldV, - const CHAM_desc_t *T, int Tm, int Tn, int ldT ) + const CHAM_desc_t *A1, int A1m, int A1n, + const CHAM_desc_t *A2, int A2m, int A2n, + const CHAM_desc_t *V, int Vm, int Vn, + const CHAM_desc_t *T, int Tm, int Tn ) { struct starpu_codelet *codelet = &cl_ztsmqr_hetra1; void (*callback)(void*) = options->profiling ? 
cl_ztsmqr_hetra1_callback : NULL; @@ -103,12 +93,12 @@ void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options, STARPU_VALUE, &n2, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), + STARPU_VALUE, &ldWORK, sizeof(int), STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), STARPU_SCRATCH, options->ws_worker, - STARPU_VALUE, &ldWORK, sizeof(int), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c index a711810e9aa29c211e60f24d42ccc5a3c3033690..e8115be4f146e757539dadd79cbb34f1089e813d 100644 --- a/runtime/starpu/codelets/codelet_ztstrf.c +++ b/runtime/starpu/codelets/codelet_ztstrf.c @@ -35,33 +35,28 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg) int n; int ib; int nb; - CHAMELEON_Complex64_t *U; - int ldU; - CHAMELEON_Complex64_t *A; - int ldA; - CHAMELEON_Complex64_t *L; - int ldL; + CHAM_tile_t *tileU; + CHAM_tile_t *tileA; + CHAM_tile_t *tileL; int *IPIV; - CHAMELEON_Complex64_t *WORK; - int ldWORK; + CHAM_tile_t *tileW; + int ldW; cham_bool_t check_info; int iinfo; RUNTIME_sequence_t *sequence; RUNTIME_request_t *request; int info = 0; - U = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - L = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); - ldU = STARPU_MATRIX_GET_LD( descr[0] ); - ldA = STARPU_MATRIX_GET_LD( descr[1] ); - ldL = STARPU_MATRIX_GET_LD( descr[2] ); - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, - &IPIV, &d_work, &ldWORK, &check_info, &iinfo, - &sequence, &request); + tileU = cti_interface_get(descr[0]); + tileA = 
cti_interface_get(descr[1]); + tileL = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); - CORE_ztstrf(m, n, ib, nb, U, ldU, A, ldA, L, ldL, IPIV, WORK, ldWORK, &info); + starpu_codelet_unpack_args( cl_arg, &m, &n, &ib, &nb, + &IPIV, &d_work, &ldW, &check_info, &iinfo, + &sequence, &request ); + + TCORE_ztstrf(m, n, ib, nb, tileU, tileA, tileL, IPIV, tileW->mat, ldW, &info); if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info ); @@ -74,76 +69,11 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(ztstrf, 4, cl_ztstrf_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_ztstrf computes an LU factorization of a complex matrix formed - * by an upper triangular NB-by-N tile U on top of a M-by-N tile A - * using partial pivoting with row interchanges. - * - * This is the right-looking Level 2.5 BLAS version of the algorithm. - * - ******************************************************************************* - * - * @param[in] M - * The number of rows of the tile A. M >= 0. - * - * @param[in] N - * The number of columns of the tile A. N >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] NB - * - * @param[in,out] U - * On entry, the NB-by-N upper triangular tile. - * On exit, the new factor U from the factorization - * - * @param[in] ldU - * The leading dimension of the array U. ldU >= max(1,NB). - * - * @param[in,out] A - * On entry, the M-by-N tile to be factored. - * On exit, the factor L from the factorization - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,M). - * - * @param[in,out] L - * On entry, the IB-by-N lower triangular tile. - * On exit, the interchanged rows form the tile A in case of pivoting. - * - * @param[in] ldL - * The leading dimension of the array L. ldL >= max(1,IB). 
- * - * @param[out] IPIV - * The pivot indices; for 1 <= i <= min(M,N), row i of the - * tile U was interchanged with row IPIV(i) of the tile A. - * - * @param[in,out] WORK - * - * @param[in] ldWORK - * The dimension of the array WORK. - * - * @param[out] INFO - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if INFO = -k, the k-th argument had an illegal value - * @retval >0 if INFO = k, U(k,k) is exactly zero. The factorization - * has been completed, but the factor U is exactly - * singular, and division by zero will occur if it is used - * to solve a system of equations. - * - */ void INSERT_TASK_ztstrf( const RUNTIME_option_t *options, int m, int n, int ib, int nb, - const CHAM_desc_t *U, int Um, int Un, int ldU, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *L, int Lm, int Ln, int ldL, + const CHAM_desc_t *U, int Um, int Un, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *L, int Lm, int Ln, int *IPIV, cham_bool_t check_info, int iinfo ) { @@ -181,7 +111,4 @@ void INSERT_TASK_ztstrf( const RUNTIME_option_t *options, STARPU_NAME, "ztstrf", #endif 0); - (void)ldL; - (void)ldA; - (void)ldU; } diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index dd9c8d7dd58731fb3bfbf58f06e2541dab21d5b0..4769f790b7c06f944da115a9d218c1783861aa76 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -37,27 +37,21 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) int n; int k; int ib; - const CHAMELEON_Complex64_t *A; - int ldA; - const CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *C; - int ldC; - CHAMELEON_Complex64_t *WORK; - int ldWORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; + CHAM_tile_t *tileW; + int ldW; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const 
CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); /* ib * nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &ldWORK); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); - CORE_zunmlq(side, trans, m, n, k, ib, - A, ldA, T, ldT, C, ldC, WORK, ldWORK); + TCORE_zunmlq( side, trans, m, n, k, ib, + tileA, tileT, tileC, tileW->mat, ldW ); } #if defined(CHAMELEON_USE_CUDA) @@ -69,25 +63,27 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) int n; int k; int ib; - const cuDoubleComplex *A, *T; - cuDoubleComplex *C, *WORK; - int ldA, ldT, ldC, ldWORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; + CHAM_tile_t *tileW; + int ldW; - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &ldWORK); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); /* ib * nb */ - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); RUNTIME_getStream(stream); CUDA_zunmlqt( side, trans, m, n, k, ib, - A, ldA, T, ldT, C, ldC, WORK, ldWORK, stream ); + tileA->mat, 
tileA->ld, + tileT->mat, tileT->ld, + tileC->mat, tileC->ld, + tileW->mat, ldW, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -101,94 +97,12 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) */ CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, STARPU_CUDA_ASYNC) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zunmlq overwrites the general complex M-by-N tile C with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * C C * Q - * TRANS = 'C': Q^H * C C * Q^H - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(k) . . . H(2) H(1) - * - * as returned by CORE_zgelqt. Q is of order M if SIDE = 'L' and of order N - * if SIDE = 'R'. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q^H from the Left; - * @arg ChamRight : apply Q or Q^H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : Transpose, apply Q^H. - * - * @param[in] M - * The number of rows of the tile C. M >= 0. - * - * @param[in] N - * The number of columns of the tile C. N >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * If SIDE = ChamLeft, M >= K >= 0; - * if SIDE = ChamRight, N >= K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] A - * Dimension: (ldA,M) if SIDE = ChamLeft, - * (ldA,N) if SIDE = ChamRight, - * The i-th row must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_zgelqt in the first k rows of its array argument A. - * - * @param[in] ldA - * The leading dimension of the array A. ldA >= max(1,K). - * - * @param[in] T - * The IB-by-K triangular factor T of the block reflector. 
- * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] ldT - * The leading dimension of the array T. ldT >= IB. - * - * @param[in,out] C - * On entry, the M-by-N tile C. - * On exit, C is overwritten by Q*C or Q^T*C or C*Q^T or C*Q. - * - * @param[in] ldC - * The leading dimension of the array C. ldC >= max(1,M). - * - * @param[in,out] WORK - * On exit, if INFO = 0, WORK(1) returns the optimal ldWORK. - * - * @param[in] ldWORK - * The dimension of the array WORK. - * If SIDE = ChamLeft, ldWORK >= max(1,N); - * if SIDE = ChamRight, ldWORK >= max(1,M). - * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zunmlq( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *T, int Tm, int Tn, int ldT, - const CHAM_desc_t *C, int Cm, int Cn, int ldC ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ) { struct starpu_codelet *codelet = &cl_zunmlq; void (*callback)(void*) = options->profiling ? 
cl_zunmlq_callback : NULL; @@ -219,7 +133,4 @@ void INSERT_TASK_zunmlq( const RUNTIME_option_t *options, STARPU_NAME, "zunmlq", #endif 0); - - (void)ldT; - (void)ldA; } diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index a8ef47db4d87612dd100c269cfd0159a507b099f..dafd7fbe29095308f77c78675bff57e2402a4086 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -36,27 +36,21 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) int n; int k; int ib; - const CHAMELEON_Complex64_t *A; - int ldA; - const CHAMELEON_Complex64_t *T; - int ldT; - CHAMELEON_Complex64_t *C; - int ldC; - CHAMELEON_Complex64_t *WORK; -int ldWORK; + CHAM_tile_t *tileA; + CHAM_tile_t *tileT; + CHAM_tile_t *tileC; + CHAM_tile_t *tileW; + int ldW; - A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); /* ib * nb */ - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &ldWORK); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); - CORE_zunmqr(side, trans, m, n, k, ib, - A, ldA, T, ldT, C, ldC, WORK, ldWORK); + TCORE_zunmqr( side, trans, m, n, k, ib, + tileA, tileT, tileC, tileW->mat, ldW ); } #if defined(CHAMELEON_USE_CUDA) @@ -68,25 +62,27 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) int n; int k; int ib; - const cuDoubleComplex *A, *T; - cuDoubleComplex *C, *WORK; - int ldA, ldT, ldC, ldWORK; + CHAM_tile_t *tileA; + CHAM_tile_t 
*tileT; + CHAM_tile_t *tileC; + CHAM_tile_t *tileW; + int ldW; - starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &ldWORK); + tileA = cti_interface_get(descr[0]); + tileT = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + tileW = cti_interface_get(descr[3]); /* ib * nb */ - A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - ldA = STARPU_MATRIX_GET_LD( descr[0] ); - ldT = STARPU_MATRIX_GET_LD( descr[1] ); - ldC = STARPU_MATRIX_GET_LD( descr[2] ); + starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); RUNTIME_getStream(stream); CUDA_zunmqrt( side, trans, m, n, k, ib, - A, ldA, T, ldT, C, ldC, WORK, ldWORK, stream ); + tileA->mat, tileA->ld, + tileT->mat, tileT->ld, + tileC->mat, tileC->ld, + tileW->mat, ldW, stream ); #ifndef STARPU_CUDA_ASYNC cudaStreamSynchronize( stream ); @@ -100,95 +96,12 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) */ CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, STARPU_CUDA_ASYNC) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * CORE_zunmqr overwrites the general complex M-by-N tile C with - * - * SIDE = 'L' SIDE = 'R' - * TRANS = 'N': Q * C C * Q - * TRANS = 'C': Q^H * C C * Q^H - * - * where Q is a complex unitary matrix defined as the product of k - * elementary reflectors - * - * Q = H(1) H(2) . . . H(k) - * - * as returned by CORE_zgeqrt. Q is of order M if SIDE = 'L' and of order N - * if SIDE = 'R'. - * - ******************************************************************************* - * - * @param[in] side - * @arg ChamLeft : apply Q or Q^H from the Left; - * @arg ChamRight : apply Q or Q^H from the Right. - * - * @param[in] trans - * @arg ChamNoTrans : No transpose, apply Q; - * @arg ChamConjTrans : Transpose, apply Q^H. 
- * - * @param[in] M - * The number of rows of the tile C. M >= 0. - * - * @param[in] N - * The number of columns of the tile C. N >= 0. - * - * @param[in] K - * The number of elementary reflectors whose product defines - * the matrix Q. - * If SIDE = ChamLeft, M >= K >= 0; - * if SIDE = ChamRight, N >= K >= 0. - * - * @param[in] IB - * The inner-blocking size. IB >= 0. - * - * @param[in] A - * Dimension: (ldA,K) - * The i-th column must contain the vector which defines the - * elementary reflector H(i), for i = 1,2,...,k, as returned by - * CORE_zgeqrt in the first k columns of its array argument A. - * - * @param[in] ldA - * The leading dimension of the array A. - * If SIDE = ChamLeft, ldA >= max(1,M); - * if SIDE = ChamRight, ldA >= max(1,N). - * - * @param[in] T - * The IB-by-K triangular factor T of the block reflector. - * T is upper triangular by block (economic storage); - * The rest of the array is not referenced. - * - * @param[in] ldT - * The leading dimension of the array T. ldT >= IB. - * - * @param[in,out] C - * On entry, the M-by-N tile C. - * On exit, C is overwritten by Q*C or Q^T*C or C*Q^T or C*Q. - * - * @param[in] ldC - * The leading dimension of the array C. ldC >= max(1,M). - * - * @param[in,out] WORK - * On exit, if INFO = 0, WORK(1) returns the optimal ldWORK. - * - * @param[in] ldWORK - * The dimension of the array WORK. - * If SIDE = ChamLeft, ldWORK >= max(1,N); - * if SIDE = ChamRight, ldWORK >= max(1,M). 
- * - ******************************************************************************* - * - * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * - */ void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, cham_side_t side, cham_trans_t trans, int m, int n, int k, int ib, int nb, - const CHAM_desc_t *A, int Am, int An, int ldA, - const CHAM_desc_t *T, int Tm, int Tn, int ldT, - const CHAM_desc_t *C, int Cm, int Cn, int ldC ) + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *T, int Tm, int Tn, + const CHAM_desc_t *C, int Cm, int Cn ) { struct starpu_codelet *codelet = &cl_zunmqr; void (*callback)(void*) = options->profiling ? cl_zunmqr_callback : NULL; @@ -219,7 +132,4 @@ void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, STARPU_NAME, "zunmqr", #endif 0); - - (void)ldT; - (void)ldA; } diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c index 52c790e727c7d9d3497ce7163c3a872f3199943e..1279986bfd8f58b172c77dacb59f9fff738ccfd7 100644 --- a/runtime/starpu/control/runtime_control.c +++ b/runtime/starpu/control/runtime_control.c @@ -21,6 +21,9 @@ #include <stdio.h> #include <stdlib.h> #include "chameleon_starpu.h" +#if defined(HAVE_STARPU_FXT_PROFILING) +#include <starpu_fxt.h> +#endif /** * @@ -37,6 +40,10 @@ static int chameleon_starpu_init( starpu_conf_t *conf ) MPI_Initialized( &flag ); # endif +#if defined(HAVE_STARPU_FXT_PROFILING) + starpu_fxt_autostart_profiling(0); +#endif + # ifdef HAVE_STARPU_MPI_INIT_CONF hres = starpu_mpi_init_conf(NULL, NULL, !flag, MPI_COMM_WORLD, conf); # else diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 24ae67e66aed8aeb96a8de68771f551ac1595ffb..77ca06002ded80f3ec016aa14cc52e58161d868d 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -456,23 +456,18 @@ void *RUNTIME_data_getaddr( const CHAM_desc_t 
*A, int m, int n ) if (*ptrtile == NULL) { int home_node = -1; - void *user_ptr = NULL; int myrank = A->myrank; int owner = A->get_rankof( A, m, n ); - int64_t eltsze = CHAMELEON_Element_Size(A->dtyp); - int tempmm = (mm == A->lmt-1) ? (A->lm - mm * A->mb) : A->mb; - int tempnn = (nn == A->lnt-1) ? (A->ln - nn * A->nb) : A->nb; + CHAM_tile_t *tile = A->get_blktile( A, m, n ); if ( myrank == owner ) { - user_ptr = A->get_blkaddr(A, m, n); - if ( user_ptr != NULL ) { + if ( tile->mat != NULL ) + { home_node = STARPU_MAIN_RAM; } } - starpu_matrix_data_register( ptrtile, home_node, (uintptr_t) user_ptr, - BLKLDD(A, m), - tempmm, tempnn, eltsze ); + starpu_cham_tile_register( ptrtile, home_node, tile, A->dtyp ); #if defined(HAVE_STARPU_DATA_SET_OOC_FLAG) if ( A->ooc == 0 ) { diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c index 221c07e1ddd7bc592d3ac30d09e82663a390d61b..32dce7c5b9c3e717a61fd702b836ad3e4721b07b 100644 --- a/runtime/starpu/control/runtime_options.c +++ b/runtime/starpu/control/runtime_options.c @@ -48,10 +48,16 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz { int ret = 0; if ( worker_size > 0 ) { + CHAM_tile_t tile = { + .format = CHAMELEON_TILE_FULLRANK, + .m = worker_size, + .n = 1, + .ld = worker_size, + .mat = NULL, + }; options->ws_wsize = worker_size; - starpu_matrix_data_register( (starpu_data_handle_t*)(&(options->ws_worker)), - -1, (uintptr_t)NULL, - worker_size, worker_size, 1, sizeof(char)); + starpu_cham_tile_register( (starpu_data_handle_t*)(&(options->ws_worker)), + -1, &tile, sizeof(char) ); } if ( host_size > 0 ) { options->ws_hsize = host_size; diff --git a/runtime/starpu/include/cham_tile_interface.h b/runtime/starpu/include/cham_tile_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..2d449941d833940fb08d592eca447a5258c52bb0 --- /dev/null +++ b/runtime/starpu/include/cham_tile_interface.h @@ -0,0 +1,56 @@ +/** + * + * @file 
starpu/cham_tile_interface.h + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Header to describe the Chameleon tile interface in StarPU + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Gwenole Lucas + * @date 2019-07-23 + * + */ +#ifndef _cham_tile_interface_h_ +#define _cham_tile_interface_h_ + +extern struct starpu_data_interface_ops starpu_interface_cham_tile_ops; +#define STARPU_CHAM_TILE_INTERFACE_ID starpu_interface_cham_tile_ops.interfaceid + +struct starpu_cham_tile_interface_s; +typedef struct starpu_cham_tile_interface_s starpu_cham_tile_interface_t; + +/** + * Chameleon tile interface + */ +struct starpu_cham_tile_interface_s +{ + enum starpu_data_interface_id id; /**< Identifier of the interface */ + uintptr_t dev_handle; /**< device handle of the matrix */ + cham_flttype_t flttype; /**< Type of the elements of the matrix */ + size_t allocsize; /**< size actually currently allocated */ + size_t tilesize; /**< size of the elements of the matrix */ + CHAM_tile_t tile; /**< Internal tile structure used to store + information on non memory home_node */ +}; + +void starpu_cham_tile_register( starpu_data_handle_t *handleptr, + int home_node, + CHAM_tile_t *tile, + cham_flttype_t flttype ); + +int cti_handle_get_m ( starpu_data_handle_t handle ); +int cti_handle_get_n ( starpu_data_handle_t handle ); +size_t cti_handle_get_allocsize( starpu_data_handle_t handle ); + +static inline CHAM_tile_t * +cti_interface_get( starpu_cham_tile_interface_t *interface ) +{ + return &(interface->tile); +} + +#endif /* _cham_tile_interface_h_ */ diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 1a07cc8021fe0b7bfebfc38f665eaf23b4ce4ecf..a36fdc613c780ca867760d2bd43deb2fcbdc5f85 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -72,6 +72,7 @@ #include 
"runtime_profiling.h" #include "runtime_codelet_profile.h" #include "runtime_workspace.h" +#include "cham_tile_interface.h" typedef struct starpu_conf starpu_conf_t; @@ -80,16 +81,18 @@ typedef struct starpu_conf starpu_conf_t; /* * MPI Redefinitions */ +#if defined(CHAMELEON_STARPU_SYNC) +#define TASK_SYNCHRONOUS , STARPU_TASK_SYNCHRONOUS, 1 +#else +#define TASK_SYNCHRONOUS +#endif + #if defined(CHAMELEON_USE_MPI) #undef STARPU_REDUX -//#define starpu_insert_task(...) starpu_mpi_insert_task(MPI_COMM_WORLD, __VA_ARGS__) #define starpu_insert_task starpu_mpi_insert_task -#define starpu_mpi_codelet(_codelet_) MPI_COMM_WORLD, _codelet_ - +#define starpu_mpi_codelet(_codelet_) MPI_COMM_WORLD, _codelet_ TASK_SYNCHRONOUS #else - -#define starpu_mpi_codelet(_codelet_) _codelet_ - +#define starpu_mpi_codelet(_codelet_) _codelet_ TASK_SYNCHRONOUS #endif /* diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index f416b632032ece52beadc25f2a64c786701ffaf0..3e957e69de2c13cf65bf39bd6478d8fad74d88c5 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -28,6 +28,7 @@ #include "chameleon/tasks_z.h" #if !defined(CHAMELEON_SIMULATION) #include "coreblas/coreblas_z.h" +#include "coreblas/coreblas_ztile.h" #if defined(CHAMELEON_USE_CUDA) #include "cudablas.h" #endif diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h index a9a1b9b775d7218c25b8432c62b9fb27b9f83997..026e754c4f72b0e2cf221dcd9611773041cd3331 100644 --- a/runtime/starpu/include/runtime_codelets.h +++ b/runtime/starpu/include/runtime_codelets.h @@ -31,61 +31,60 @@ #define CODELET_CUDA_FLAGS(flags) #endif -#define CODELETS_ALL(cl_name, _nbuffers, cpu_func_name, cuda_func_name, _original_location_, cuda_flags) \ - struct starpu_perfmodel cl_##cl_name##_fake = { \ - .type = STARPU_HISTORY_BASED, \ - .symbol = "fake_"#cl_name \ - }; \ - \ - struct starpu_perfmodel 
cl_##cl_name##_model = { \ - .type = STARPU_HISTORY_BASED, \ - .symbol = ""#cl_name \ - }; \ - \ - struct starpu_codelet cl_##cl_name = { \ - .where = (_original_location_), \ - .cpu_func = ((cpu_func_name)), \ - CODELET_CUDA_FLAGS(cuda_flags) \ - .cuda_func = ((cuda_func_name)), \ - .nbuffers = ((_nbuffers)), \ - .model = &cl_##cl_name##_model, \ - .name = #cl_name \ - }; \ - \ - void cl_##cl_name##_restrict_where(uint32_t where) \ - { \ - if ( cl_##cl_name.where & where ) \ - cl_##cl_name.where = (cl_##cl_name.where & where); \ - } \ - \ - void cl_##cl_name##_restore_where(void) \ - { \ - cl_##cl_name.where = (_original_location_); \ - } \ - \ - void cl_##cl_name##_restore_model(void) \ - { \ - cl_##cl_name.model = &cl_##cl_name##_model; \ +#define CODELETS_ALL(cl_name, _nbuffers, cpu_func_name, cuda_func_name, _original_location_, cuda_flags) \ + struct starpu_perfmodel cl_##cl_name##_fake = { \ + .type = STARPU_HISTORY_BASED, \ + .symbol = "fake_"#cl_name \ + }; \ + \ + struct starpu_perfmodel cl_##cl_name##_model = { \ + .type = STARPU_HISTORY_BASED, \ + .symbol = ""#cl_name \ + }; \ + \ + struct starpu_codelet cl_##cl_name = { \ + .where = (_original_location_), \ + .cpu_func = ((cpu_func_name)), \ + CODELET_CUDA_FLAGS(cuda_flags) \ + .cuda_func = ((cuda_func_name)), \ + .nbuffers = ((_nbuffers)), \ + .model = &cl_##cl_name##_model, \ + .name = #cl_name \ + }; \ + \ + void cl_##cl_name##_restrict_where(uint32_t where) \ + { \ + if ( cl_##cl_name.where & where ) \ + cl_##cl_name.where = (cl_##cl_name.where & where); \ + } \ + \ + void cl_##cl_name##_restore_where(void) \ + { \ + cl_##cl_name.where = (_original_location_); \ + } \ + \ + void cl_##cl_name##_restore_model(void) \ + { \ + cl_##cl_name.model = &cl_##cl_name##_model; \ } #if defined(CHAMELEON_SIMULATION) -#define CODELETS_CPU(name, _nbuffers, cpu_func_name) \ - CODELETS_ALL( name, _nbuffers, (starpu_cpu_func_t) 1, NULL, STARPU_CPU, 0 ) +#define CODELETS_CPU(name, _nbuffers, cpu_func_name) \ + 
CODELETS_ALL( name, _nbuffers, (starpu_cpu_func_t) 1, NULL, STARPU_CPU, 0 ) #else -#define CODELETS_CPU(name, _nbuffers, cpu_func_name) \ - CODELETS_ALL( name, _nbuffers, cpu_func_name, NULL, STARPU_CPU, 0 ) +#define CODELETS_CPU(name, _nbuffers, cpu_func_name) \ + CODELETS_ALL( name, _nbuffers, cpu_func_name, NULL, STARPU_CPU, 0 ) #endif #define CODELETS_GPU(name, _nbuffers, cpu_func_name, cuda_func_name, cuda_flags) \ - CODELETS_ALL( name, _nbuffers, cpu_func_name, cuda_func_name, STARPU_CPU | STARPU_CUDA, cuda_flags ) + CODELETS_ALL( name, _nbuffers, cpu_func_name, cuda_func_name, STARPU_CPU | STARPU_CUDA, cuda_flags ) - -#define CODELETS_ALL_HEADER(name) \ - CHAMELEON_CL_CB_HEADER(name); \ - void cl_##name##_load_fake_model(void); \ - void cl_##name##_restore_model(void); \ - extern struct starpu_codelet cl_##name; \ - void cl_##name##_restrict_where(uint32_t where); \ +#define CODELETS_ALL_HEADER(name) \ + CHAMELEON_CL_CB_HEADER(name); \ + void cl_##name##_load_fake_model(void); \ + void cl_##name##_restore_model(void); \ + extern struct starpu_codelet cl_##name; \ + void cl_##name##_restrict_where(uint32_t where); \ void cl_##name##_restore_where(void) #if defined(CHAMELEON_SIMULATION) diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c new file mode 100644 index 0000000000000000000000000000000000000000..ca78c6b5a8449551314494f66334a06d9869e56d --- /dev/null +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -0,0 +1,431 @@ +/** + * + * @file starpu/cham_tile_interface.c + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon tile interface for StarPU + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Gwenole Lucas + * @date 2019-07-23 + * + */ +#include "chameleon_starpu.h" + +static inline CHAM_tile_t * +cti_handle_get( starpu_data_handle_t handle ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); + +#ifdef STARPU_DEBUG + STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, + "Error. The given data is not a cham_tile." ); +#endif + + return &(cham_tile_interface->tile); +} + +int +cti_handle_get_m( starpu_data_handle_t handle ) +{ + CHAM_tile_t *tile = cti_handle_get( handle ); + return tile->m; +} + +int +cti_handle_get_n( starpu_data_handle_t handle ) +{ + CHAM_tile_t *tile = cti_handle_get( handle ); + return tile->n; +} + +static void +cti_init( void *data_interface ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = data_interface; + cham_tile_interface->id = STARPU_CHAM_TILE_INTERFACE_ID; + cham_tile_interface->allocsize = -1; +} + +static void +cti_register_data_handle( starpu_data_handle_t handle, + unsigned home_node, + void *data_interface ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) data_interface; + unsigned node; + + for (node = 0; node < STARPU_MAXNODES; node++) + { + starpu_cham_tile_interface_t *local_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node(handle, node); + + memcpy( local_interface, cham_tile_interface, + sizeof( starpu_cham_tile_interface_t ) ); + + if ( node != home_node ) + { + local_interface->dev_handle = 0; + local_interface->tile.mat = NULL; + local_interface->tile.ld = -1; + } + } +} + +static starpu_ssize_t +cti_allocate_data_on_node( void *data_interface, unsigned node ) +{ + uintptr_t addr = 0, handle; + starpu_cham_tile_interface_t *cham_tile_interface = + (starpu_cham_tile_interface_t *) 
data_interface; + + uint32_t ld = cham_tile_interface->tile.m; + starpu_ssize_t allocated_memory; + + allocated_memory = cham_tile_interface->allocsize; + if ( allocated_memory <= 0 ) { + return 0; + } + + handle = starpu_malloc_on_node( node, allocated_memory ); + + if ( !handle ) { + return -ENOMEM; + } + + if ( starpu_node_get_kind(node) != STARPU_OPENCL_RAM ) { + addr = handle; + } + + /* update the data properly */ + cham_tile_interface->tile.mat = (void*)addr; + cham_tile_interface->tile.ld = ld; + cham_tile_interface->dev_handle = handle; + + return allocated_memory; +} + +static void +cti_free_data_on_node( void *data_interface, unsigned node ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = + (starpu_cham_tile_interface_t *) data_interface; + + starpu_free_on_node( node, cham_tile_interface->dev_handle, cham_tile_interface->allocsize ); + cham_tile_interface->tile.mat = NULL; + cham_tile_interface->dev_handle = 0; +} + +static void * +cti_to_pointer( void *data_interface, unsigned node ) +{ + (void) node; + starpu_cham_tile_interface_t *cham_tile_interface = data_interface; + + return (void*)(cham_tile_interface->tile.mat); +} + +static int +cti_pointer_is_inside( void *data_interface, unsigned node, void *ptr ) +{ + (void) node; + starpu_cham_tile_interface_t *cham_tile_interface = data_interface; + char *begin = cham_tile_interface->tile.mat; + char *end = begin + cham_tile_interface->allocsize; + + STARPU_ASSERT_MSG( cham_tile_interface->tile.format & CHAMELEON_TILE_FULLRANK, + "Only full-rank matrices are supported." ); + + return ( (char*) ptr >= begin ) + && ( (char*) ptr < end ); +} + +static size_t +cti_get_size(starpu_data_handle_t handle) +{ + starpu_cham_tile_interface_t *cham_tile_interface = + starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); + +#ifdef STARPU_DEBUG + STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, + "Error. The given data is not a cham_tile." 
); +#endif + + return cham_tile_interface->allocsize; +} + +static size_t +cti_get_alloc_size(starpu_data_handle_t handle) +{ + starpu_cham_tile_interface_t *cham_tile_interface = + starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); + +#ifdef STARPU_DEBUG + STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, + "Error. The given data is not a cham_tile." ); +#endif + + STARPU_ASSERT_MSG( cham_tile_interface->allocsize != (size_t)-1, + "The cham_tile allocation size needs to be defined" ); + + return cham_tile_interface->allocsize; +} + +static uint32_t +cti_footprint( starpu_data_handle_t handle ) +{ + CHAM_tile_t *tile = cti_handle_get( handle ); + return starpu_hash_crc32c_be( tile->m, tile->n ); +} + +static uint32_t +cti_alloc_footprint( starpu_data_handle_t handle ) +{ + return starpu_hash_crc32c_be( cti_handle_get_allocsize(handle), 0 ); +} + +static int +cti_compare( void *data_interface_a, void *data_interface_b ) +{ + starpu_cham_tile_interface_t *cham_tile_a = (starpu_cham_tile_interface_t *) data_interface_a; + starpu_cham_tile_interface_t *cham_tile_b = (starpu_cham_tile_interface_t *) data_interface_b; + + /* Two matrices are considered compatible if they have the same size */ + return ( cham_tile_a->tile.m == cham_tile_b->tile.m ) + && ( cham_tile_a->tile.n == cham_tile_b->tile.n ) + && ( cham_tile_a->flttype == cham_tile_b->flttype ); +} + +static int +cti_alloc_compare(void *data_interface_a, void *data_interface_b) +{ + starpu_cham_tile_interface_t *cham_tile_a = (starpu_cham_tile_interface_t *) data_interface_a; + starpu_cham_tile_interface_t *cham_tile_b = (starpu_cham_tile_interface_t *) data_interface_b; + + /* Two matrices are considered compatible if they have the same allocated size */ + return ( cham_tile_a->allocsize == cham_tile_b->allocsize ); +} + +static void +cti_display( starpu_data_handle_t handle, FILE *f ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + 
starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM); + + fprintf( f, "%u\t%u\t", + cham_tile_interface->tile.m, + cham_tile_interface->tile.n ); +} + +static int +cti_pack_data_fullrank( starpu_cham_tile_interface_t *cham_tile_interface, + void *ptr ) +{ + char *matrix = (void *)cham_tile_interface->tile.mat; + + if ( cham_tile_interface->tile.m == cham_tile_interface->tile.ld ) { + memcpy( ptr, matrix, cham_tile_interface->allocsize ); + } + else { + int n; + char *tmpptr = ptr; + + for(n=0; n<cham_tile_interface->tile.n; n++) + { + size_t elemsize = CHAMELEON_Element_Size( cham_tile_interface->flttype ); + size_t size = cham_tile_interface->tile.m * elemsize; + memcpy( tmpptr, matrix, size ); + tmpptr += size; + matrix += cham_tile_interface->tile.ld * elemsize; + } + } + return 0; +} + +static int +cti_pack_data( starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count ) +{ + STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node)); + + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node(handle, node); + + *count = (starpu_ssize_t)(cham_tile_interface->allocsize); + *count += sizeof(size_t) + sizeof(CHAM_tile_t); + + if ( ptr != NULL ) + { + char *tmp; + *ptr = (void *)starpu_malloc_on_node_flags( node, *count, 0 ); + tmp = (char*)(*ptr); + + /* Start by the size to allocate on reception */ + memcpy( tmp, &(cham_tile_interface->allocsize), sizeof(size_t) ); + tmp += sizeof(size_t); + + /* Copy the tile metadata */ + memcpy( tmp, &(cham_tile_interface->tile), sizeof(CHAM_tile_t) ); + tmp += sizeof(CHAM_tile_t); + + /* Pack the real data */ + if ( cham_tile_interface->tile.format & CHAMELEON_TILE_FULLRANK ) { + cti_pack_data_fullrank( cham_tile_interface, tmp ); + } + else { + STARPU_ASSERT_MSG( 1, "Unsupported format for pack." 
); + } + } + + return 0; +} + +static int +cti_unpack_data_fullrank( starpu_cham_tile_interface_t *cham_tile_interface, + void *ptr ) +{ + char *matrix = (void *)cham_tile_interface->tile.mat; + + if ( cham_tile_interface->tile.m == cham_tile_interface->tile.ld ) { + memcpy( matrix, ptr, cham_tile_interface->allocsize ); + } + else { + int n; + char *tmpptr = ptr; + + for(n=0 ; n<cham_tile_interface->tile.n; n++) + { + size_t elemsize = CHAMELEON_Element_Size( cham_tile_interface->flttype ); + size_t size = cham_tile_interface->tile.m * elemsize; + memcpy( matrix, tmpptr, size ); + tmpptr += size; + matrix += cham_tile_interface->tile.ld * elemsize; + } + } + return 0; +} + +static int +cti_unpack_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t count ) +{ + STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node)); + + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node(handle, node); + + CHAM_tile_t dsttile; + char *tmp = ptr; + + /* Extract the size of the information to unpack */ + memcpy( &(cham_tile_interface->allocsize), tmp, sizeof(size_t) ); + tmp += sizeof(size_t); + + /* Extract the tile metadata of the remote tile */ + memcpy( &dsttile, tmp, sizeof(CHAM_tile_t) ); + tmp += sizeof(CHAM_tile_t); + + cham_tile_interface->tile.format = dsttile.format; + cham_tile_interface->tile.ld = cham_tile_interface->tile.m; + STARPU_ASSERT( cham_tile_interface->tile.m == dsttile.m ); + STARPU_ASSERT( cham_tile_interface->tile.n == dsttile.n ); + STARPU_ASSERT( count == cham_tile_interface->allocsize + sizeof(size_t) + sizeof(CHAM_tile_t) ); + + + /* Unpack the real data */ + if ( cham_tile_interface->tile.format & CHAMELEON_TILE_FULLRANK ) { + cti_unpack_data_fullrank( cham_tile_interface, tmp ); + } + else { + STARPU_ASSERT_MSG( 1, "Unsupported format for pack."
); + } + + /* Free the received information */ + starpu_free_on_node_flags( node, (uintptr_t)ptr, count, 0 ); + + return 0; +} + +static starpu_ssize_t +cti_describe( void *data_interface, char *buf, size_t size ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) data_interface; + return snprintf( buf, size, "M%ux%ux%u", + (unsigned) cham_tile_interface->tile.m, + (unsigned) cham_tile_interface->tile.n, + (unsigned) cham_tile_interface->flttype ); +} + +struct starpu_data_interface_ops starpu_interface_cham_tile_ops = +{ + .init = cti_init, + .register_data_handle = cti_register_data_handle, + .allocate_data_on_node = cti_allocate_data_on_node, + .free_data_on_node = cti_free_data_on_node, + .to_pointer = cti_to_pointer, + .pointer_is_inside = cti_pointer_is_inside, + .get_size = cti_get_size, + .get_alloc_size = cti_get_alloc_size, + .footprint = cti_footprint, + .alloc_footprint = cti_alloc_footprint, + .compare = cti_compare, + .alloc_compare = cti_alloc_compare, + .display = cti_display, + .pack_data = cti_pack_data, + .unpack_data = cti_unpack_data, + .describe = cti_describe, + //.copy_methods =&cti_copy_methods, + .interfaceid = STARPU_UNKNOWN_INTERFACE_ID, + .interface_size = sizeof(starpu_cham_tile_interface_t), + .name = "STARPU_CHAM_TILE_INTERFACE" +}; + +void +starpu_cham_tile_register( starpu_data_handle_t *handleptr, + int home_node, + CHAM_tile_t *tile, + cham_flttype_t flttype ) +{ + size_t elemsize = CHAMELEON_Element_Size( flttype ); + starpu_cham_tile_interface_t cham_tile_interface = + { + .id = STARPU_CHAM_TILE_INTERFACE_ID, + .flttype = flttype, + .dev_handle = (intptr_t)(tile->mat), + .allocsize = tile->m * tile->n * elemsize, /* We compute with m even if it's allocated with ld */ + .tilesize = tile->m * tile->n * elemsize, + }; + memcpy( &(cham_tile_interface.tile), tile, sizeof( CHAM_tile_t ) ); + + starpu_data_register( handleptr, home_node, &cham_tile_interface, &starpu_interface_cham_tile_ops ); +} 
+ +size_t +cti_handle_get_allocsize( starpu_data_handle_t handle ) +{ + starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) + starpu_data_get_interface_on_node( handle, STARPU_MAIN_RAM ); + +#ifdef STARPU_DEBUG + STARPU_ASSERT_MSG( cham_tile_interface->id == STARPU_CHAM_TILE_INTERFACE_ID, + "Error. The given data is not a cham_tile." ); +#endif + + return cham_tile_interface->allocsize; +} + +void +starpu_cham_tile_interface_init() __attribute__((constructor)); + +void +starpu_cham_tile_interface_init() +{ + if ( starpu_interface_cham_tile_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID ) + { + starpu_interface_cham_tile_ops.interfaceid = starpu_data_interface_get_next_id(); + } +}