diff --git a/compute/pzlange.c b/compute/pzlange.c index ce1665743831a340d1147dee78060531903215a9..fc2ecfe6702f280fc49f3e22c988822533f66e0f 100644 --- a/compute/pzlange.c +++ b/compute/pzlange.c @@ -32,7 +32,7 @@ static inline void chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_desc_t *Wcol, CHAM_desc_t *Welt, - RUNTIME_option_t *options) + RUNTIME_option_t *options ) { int m, n; int minMNT = chameleon_min( A->mt, A->nt ); @@ -58,7 +58,7 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; int ldam = BLKLDD( A, m ); - if ( (n == m) && (uplo != ChamUpperLower) ) { + if ( (n == m) && (uplo != ChamUpperLower) ) { INSERT_TASK_ztrasm( options, ChamColumnwise, uplo, diag, tempmm, tempnn, @@ -95,7 +95,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, INSERT_TASK_dlange( options, ChamMaxNorm, 1, tempnn, A->nb, - W( Wcol, 0, n), 1, W( Welt, 0, n)); + W( Wcol, 0, n ), 1, + W( Welt, 0, n ) ); } /** @@ -105,7 +106,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, for(n = Q; n < NT; n++) { INSERT_TASK_dlange_max( options, - W( Welt, 0, n), W( Welt, 0, n%Q) ); + W( Welt, 0, n ), + W( Welt, 0, n%Q ) ); } /** @@ -115,7 +117,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, for(n = 1; n < Q; n++) { INSERT_TASK_dlange_max( options, - W( Welt, 0, n), W( Welt, 0, 0) ); + W( Welt, 0, n ), + W( Welt, 0, 0 ) ); } } @@ -247,13 +250,14 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ INSERT_TASK_zlange( options, ChamMaxNorm, tempmm, tempnn, A->nb, - A(m, n), ldam, W( Welt, m, n)); + A(m, n), ldam, W( Welt, m, n )); } if ( n >= Q ) { INSERT_TASK_dlange_max( options, - W( Welt, m, n), W( Welt, m, n%Q) ); + W( Welt, m, n ), + W( Welt, m, n%Q ) ); } } @@ -264,7 +268,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ for(n = 1; n < Q; n++) { INSERT_TASK_dlange_max( options, - W( Welt, m, n), W( Welt, m, 0) ); + W( Welt, m, n ), + W( Welt, m, 0 ) ); } } @@ -275,7 +280,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ for(m = P; m < MT; m++) { INSERT_TASK_dlange_max( options, - W( Welt, m, 0), W( Welt, m%P, 0) ); + W( Welt, m, 0 ), + W( Welt, m%P, 0 ) ); } /** @@ -285,7 +291,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ for(m = 1; m < P; m++) { INSERT_TASK_dlange_max( options, - W( Welt, m, 0), W( Welt, 0, 0) ); + W( Welt, m, 0 ), + W( Welt, 0, 0 ) ); } } @@ -382,7 +389,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia double alpha = 0.0; double beta = 0.0; - int workn, workmt, worknt; + int workmt, worknt; int m, n, wcol_init = 0; chamctxt = chameleon_context_self(); @@ -395,7 +402,6 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia workmt = chameleon_max( A->mt, A->p ); worknt = chameleon_max( A->nt, A->q ); - workn = chameleon_max( A->n, A->q ); switch ( norm ) { case ChamOneNorm: @@ -502,7 +508,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia */ for(m = 0; m < A->p; m++) { for(n = 0; n < A->q; n++) { - if ( (m != 0) && (n != 0) ) { + if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, ChamUpperLower, 1, 1, 1, @@ -514,7 +520,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia CHAMELEON_Desc_Flush( &Welt, sequence ); RUNTIME_sequence_wait(chamctxt, sequence); - *result = *(double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ); + *result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q )); if ( wcol_init ) { chameleon_desc_destroy( &Wcol ); diff --git a/compute/pzlansy.c b/compute/pzlansy.c index c6d0ba73eb3c03f4e96874e6798dd38ebf5a0a38..6a0e2bb0086e4892d7eab3deadb7644093c1fb81 100644 --- a/compute/pzlansy.c +++ b/compute/pzlansy.c @@ -315,7 +315,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra double alpha = 0.0; double beta = 0.0; - int workn, workmt, worknt; + int workmt, worknt; int m, n, wcol_init = 0; chamctxt = chameleon_context_self(); @@ -328,7 +328,6 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra workmt = chameleon_max( A->mt, A->p ); worknt = chameleon_max( A->nt, A->q ); - workn = chameleon_max( A->n, A->q ); switch ( norm ) { case ChamOneNorm: @@ -415,7 +414,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra */ for(m = 0; m < A->p; m++) { for(n = 0; n < A->q; n++) { - if ( (m != 0) && (n != 0) ) { + if ( (m != 0) || (n != 0) ) { INSERT_TASK_dlacpy( &options, ChamUpperLower, 1, 1, 1, diff --git a/control/common.h b/control/common.h index 3c5dabf15d0529a852acb682838c0c8195a2e6ec..a9731f9607c70c0444adef343645a16d946831ed 100644 --- a/control/common.h +++ b/control/common.h @@ -77,11 +77,8 @@ * Global shortcuts */ #define CHAMELEON_RANK chameleon_rank(chamctxt) -#define CHAMELEON_SIZE chamctxt->world_size -#define CHAMELEON_GRPSIZE chamctxt->group_size #define CHAMELEON_NB chamctxt->nb #define CHAMELEON_IB chamctxt->ib -#define CHAMELEON_SCHEDULING chamctxt->scheduling #define CHAMELEON_RHBLK chamctxt->rhblock #define CHAMELEON_TRANSLATION chamctxt->translation #define CHAMELEON_PARALLEL chamctxt->parallel_enabled diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index bc1fb9a05b2b16faa4ce4cb79712e056d01e5d2a..ce3beb83e943531404a3a33a614a14b7f2a26932 100644 --- a/include/chameleon/struct.h +++ b/include/chameleon/struct.h @@ -109,8 +109,6 @@ typedef struct chameleon_context_s { int my_mpi_rank; int mpi_comm_size; #endif - int world_size; - int group_size; /* Boolean flags */ cham_bool_t warnings_enabled; diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c index bf72752ec10eadea297c04e3135ed4eefff4a400..7ee45f66de44b77073ee4714ea1e7ebf5d9504cc 100644 --- a/runtime/openmp/codelets/codelet_zplssq.c +++ b/runtime/openmp/codelets/codelet_zplssq.c @@ -57,18 +57,18 @@ * */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, - const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) + const CHAM_desc_t *IN, int INm, int INn, + const CHAM_desc_t *OUT, int OUTm, int OUTn ) { - double *scalesum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); - double *scl = RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn); -#pragma omp task depend(in: scalesum[0]) depend(inout: scl[0]) + double *sclssq_in = RTBLKADDR(IN, double, INm, INn ); + double *sclssq_out = RTBLKADDR(OUT, double, OUTm, OUTn); +#pragma omp task depend(in: sclssq_in[0]) depend(inout: sclssq_out[0]) { - if( scl[0] < scalesum[0] ) { - scl[1] = scalesum[1] + (scl[1] * (( scl[0] / scalesum[0] ) * ( scl[0] / scalesum[0] ))); - scl[0] = scalesum[0]; + if( sclssq_out[0] < sclssq_in[0] ) { + sclssq_out[1] = sclssq_in[1] + (sclssq_out[1] * (( sclssq_out[0] / sclssq_in[0] ) * ( sclssq_out[0] / sclssq_in[0] ))); + sclssq_out[0] = sclssq_in[0]; } else { - scl[1] = scl[1] + (scalesum[1] * (( scalesum[0] / scl[0] ) * ( scalesum[0] / scl[0] ))); + sclssq_out[1] = sclssq_out[1] + (sclssq_in[1] * (( sclssq_in[0] / sclssq_out[0] ) * ( sclssq_in[0] / sclssq_out[0] ))); } } } diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c index 6b5b493d156ca08ed762571b92e701821f8c038b..ed1fde76fcd4d9fa51f1cd6a2e717f52b92f5792 100644 --- a/runtime/parsec/codelets/codelet_zplssq.c +++ b/runtime/parsec/codelets/codelet_zplssq.c @@ -56,17 +56,20 @@ static inline int CORE_zplssq_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) { - double *SCALESUMSQ; - double *SCLSSQ; + double *SCLSSQ_IN; + double *SCLSSQ_OUT; parsec_dtd_unpack_args( - this_task, &SCALESUMSQ, &SCLSSQ ); + this_task, &SCLSSQ_IN, &SCLSSQ_OUT ); - if( SCLSSQ[0] < SCALESUMSQ[0] ) { - SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1] * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] ))); - SCLSSQ[0] = SCALESUMSQ[0]; + assert( SCLSSQ_OUT[0] >= 0. ); + if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) { + SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ))); + SCLSSQ_OUT[0] = SCLSSQ_IN[0]; } else { - SCLSSQ[1] = SCLSSQ[1] + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] ))); + if ( SCLSSQ_OUT[0] > 0 ) { + SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ))); + } } (void)context; diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c index 20b2625643eec30008864c5e911ddb5514dc4bc5..79067050a5b8094a95e7695f1cde016cd2f61709 100644 --- a/runtime/quark/codelets/codelet_zplssq.c +++ b/runtime/quark/codelets/codelet_zplssq.c @@ -26,16 +26,19 @@ void CORE_zplssq_quark(Quark *quark) { - double *SCALESUMSQ; - double *SCLSSQ; + double *SCLSSQ_IN; + double *SCLSSQ_OUT; - quark_unpack_args_2( quark, SCALESUMSQ, SCLSSQ ); + quark_unpack_args_2( quark, SCLSSQ_IN, SCLSSQ_OUT ); - if( SCLSSQ[0] < SCALESUMSQ[0] ) { - SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1] * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] ))); - SCLSSQ[0] = SCALESUMSQ[0]; + assert( SCLSSQ_OUT[0] >= 0. ); + if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) { + SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ))); + SCLSSQ_OUT[0] = SCLSSQ_IN[0]; } else { - SCLSSQ[1] = SCLSSQ[1] + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] ))); + if ( SCLSSQ_OUT[0] > 0 ) { + SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ))); + } } } diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index c5541b1d758ac94a35636945602d742a1a1db468..7c296d750bc4fa32d425a877f1b8731389e4e41e 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -120,13 +120,13 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) int M; int N; CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t *A; + const CHAMELEON_Complex64_t *A; int LDA; CHAMELEON_Complex64_t beta; CHAMELEON_Complex64_t *B; int LDB; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &LDA, &beta, &LDB); CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB); diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index a614181d0e2b10736313167d073d911dde472031..0d2426400e3eb0216d694b4563543fb3c642aabb 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -32,17 +32,17 @@ * */ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - int displA, const CHAM_desc_t *A, int Am, int An, int lda, - int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb) + cham_uplo_t uplo, int m, int n, int nb, + int displA, const CHAM_desc_t *A, int Am, int An, int lda, + int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb) { (void)nb; struct starpu_codelet *codelet = &cl_zlacpy; void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL; CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_R(A, Am, An); - CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_W( B, Bm, Bn ); CHAMELEON_END_ACCESS_DECLARATION; starpu_insert_task( @@ -65,13 +65,13 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options, } void INSERT_TASK_zlacpy(const RUNTIME_option_t *options, - cham_uplo_t uplo, int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *B, int Bm, int Bn, int ldb) + cham_uplo_t uplo, int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *B, int Bm, int Bn, int ldb) { INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, lda, - 0, B, Bm, Bn, ldb ); + 0, A, Am, An, lda, + 0, B, Bm, Bn, ldb ); } #if !defined(CHAMELEON_SIMULATION) diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c index b65dc00593cb0bd95031371fc1fcc26a750ca0ca..43b4314c6b58b9a3bea0d39b667e6006883e2e6f 100644 --- a/runtime/starpu/codelets/codelet_zlag2c.c +++ b/runtime/starpu/codelets/codelet_zlag2c.c @@ -85,24 +85,25 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options, struct starpu_codelet *codelet = &cl_clag2z; void (*callback)(void*) = options->profiling ? cl_clag2z_callback : NULL; - if ( chameleon_desc_islocal( A, Am, An ) || - chameleon_desc_islocal( B, Bm, Bn ) ) - { - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), - STARPU_VALUE, &lda, sizeof(int), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &ldb, sizeof(int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_W( B, Bm, Bn ); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_VALUE, &ldb, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "clag2z", + STARPU_NAME, "clag2z", #endif - 0); - } + 0); } diff --git a/runtime/starpu/codelets/codelet_zlange.c b/runtime/starpu/codelets/codelet_zlange.c index a90de22caac82801eaf997139c0b91948f8eebbe..f689d82bcb332dc40e32d31c2a43d7cf09398eee 100644 --- a/runtime/starpu/codelets/codelet_zlange.c +++ b/runtime/starpu/codelets/codelet_zlange.c @@ -70,7 +70,7 @@ static void cl_zlange_cpu_func(void *descr[], void *cl_arg) work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &norm, &M, &N, &LDA); - CORE_zlange( norm, M, N, A, LDA, work, normA); + CORE_zlange( norm, M, N, A, LDA, work, normA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -86,34 +86,35 @@ void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, struct starpu_codelet *codelet = &cl_zlange_max; void (*callback)(void*) = options->profiling ? cl_zlange_callback : NULL; - if ( chameleon_desc_islocal( A, Am, An ) || - chameleon_desc_islocal( B, Bm, Bn ) ) - { - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_R, RTBLKADDR(A, double, Am, An), - STARPU_RW, RTBLKADDR(B, double, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_RW( B, Bm, Bn ); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_R, RTBLKADDR(A, double, Am, An), + STARPU_RW, RTBLKADDR(B, double, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zlange_max", + STARPU_NAME, "zlange_max", #endif - 0); - } + 0); } #if !defined(CHAMELEON_SIMULATION) static void cl_zlange_max_cpu_func(void *descr[], void *cl_arg) { double *A; - double *normA; - - A = (double *)STARPU_MATRIX_GET_PTR(descr[0]); - normA = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + double *B; - if ( *A > *normA ) - *normA = *A; + A = (double *)STARPU_MATRIX_GET_PTR(descr[0]); + B = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + if ( *A > *B ) { + *B = *A; + } (void)cl_arg; } #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zplssq.c b/runtime/starpu/codelets/codelet_zplssq.c index eb79e712b3eb323538f1a16906c13449cf8a62c4..4fdbaf6c3d402da601267f52788ce6eeedd82653 100644 --- a/runtime/starpu/codelets/codelet_zplssq.c +++ b/runtime/starpu/codelets/codelet_zplssq.c @@ -55,21 +55,21 @@ * */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, - const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) + const CHAM_desc_t *SCLSSQ_IN, int SCLSSQ_INm, int SCLSSQ_INn, + const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn ) { struct starpu_codelet *codelet = &cl_zplssq; void (*callback)(void*) = options->profiling ? cl_zplssq_callback : NULL; CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_R(SCALESUMSQ, SCALESUMSQm, SCALESUMSQn); - CHAMELEON_ACCESS_RW(SCLSSQ, SCLSSQm, SCLSSQn); + CHAMELEON_ACCESS_R( SCLSSQ_IN, SCLSSQ_INm, SCLSSQ_INn ); + CHAMELEON_ACCESS_RW( SCLSSQ_OUT, SCLSSQ_OUTm, SCLSSQ_OUTn ); CHAMELEON_END_ACCESS_DECLARATION; starpu_insert_task( starpu_mpi_codelet(codelet), - STARPU_R, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), - STARPU_RW, RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn), + STARPU_R, RTBLKADDR( SCLSSQ_IN, double, SCLSSQ_INm, SCLSSQ_INn ), + STARPU_RW, RTBLKADDR( SCLSSQ_OUT, double, SCLSSQ_OUTm, SCLSSQ_OUTn ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) @@ -82,17 +82,20 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, #if !defined(CHAMELEON_SIMULATION) static void cl_zplssq_cpu_func(void *descr[], void *cl_arg) { - double *SCALESUMSQ; - double *SCLSSQ; + double *SCLSSQ_IN; + double *SCLSSQ_OUT; - SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[0]); - SCLSSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + SCLSSQ_IN = (double *)STARPU_MATRIX_GET_PTR(descr[0]); + SCLSSQ_OUT = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - if( SCLSSQ[0] < SCALESUMSQ[0] ) { - SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1] * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] ))); - SCLSSQ[0] = SCALESUMSQ[0]; + assert( SCLSSQ_OUT[0] >= 0. ); + if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) { + SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ))); + SCLSSQ_OUT[0] = SCLSSQ_IN[0]; } else { - SCLSSQ[1] = SCLSSQ[1] + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] ))); + if ( SCLSSQ_OUT[0] > 0 ) { + SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ))); + } } (void)cl_arg; @@ -110,17 +113,19 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, struct starpu_codelet *codelet = &cl_zplssq2; void (*callback)(void*) = options->profiling ? cl_zplssq2_callback : NULL; - if ( chameleon_desc_islocal( RESULT, RESULTm, RESULTn ) ) { - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_RW( RESULT, RESULTm, RESULTn ); + CHAMELEON_END_ACCESS_DECLARATION; + + starpu_insert_task( + starpu_mpi_codelet(codelet), + STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zplssq2", + STARPU_NAME, "zplssq2", #endif - 0); - } + 0); } diff --git a/runtime/starpu/codelets/codelet_ztrasm.c b/runtime/starpu/codelets/codelet_ztrasm.c index 4e0dc05a6db8eb62a2730f56abcc08edb70495ce..c1d154aadc361f391f388514ec155fd798fc1749 100644 --- a/runtime/starpu/codelets/codelet_ztrasm.c +++ b/runtime/starpu/codelets/codelet_ztrasm.c @@ -66,8 +66,8 @@ static void cl_ztrasm_cpu_func(void *descr[], void *cl_arg) int lda; double *work; - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &diag, &M, &N, &lda); CORE_ztrasm(storev, uplo, diag, M, N, A, lda, work); } diff --git a/testing/testing_zauxiliary.c b/testing/testing_zauxiliary.c index 2c0f361dbbba24066065b834ced526a1c5b1976b..6ebe3771565b4227002a8cd0fa6778e2c52bd14b 100644 --- a/testing/testing_zauxiliary.c +++ b/testing/testing_zauxiliary.c @@ -182,6 +182,7 @@ int main (int argc, char **argv) CHAMELEON_Disable(CHAMELEON_AUTOTUNING); CHAMELEON_Set(CHAMELEON_TILE_SIZE, nb ); CHAMELEON_Set(CHAMELEON_INNER_BLOCK_SIZE, ib ); + CHAMELEON_user_tag_size( 64, 54 ); argc -= 6; argv += 6; diff --git a/testing/testing_zlange.c b/testing/testing_zlange.c index 864b1e4ce9e55a85c9050de07d475a790da86a9c..34fde1bc404d355b4083eaeed9234ea2ebc8106b 100644 --- a/testing/testing_zlange.c +++ b/testing/testing_zlange.c @@ -51,11 +51,9 @@ int testing_zlange(int argc, char **argv) /* Allocate Data */ CHAMELEON_Complex64_t *A = (CHAMELEON_Complex64_t *)malloc(LDAxN*sizeof(CHAMELEON_Complex64_t)); - double *work = (double*) malloc(max(M,N)*sizeof(double)); + double *work = (double*) malloc(max(M,N)*sizeof(double)); double normcham, normlapack, result; - RUNTIME_comm_set_tag_sizes( 31, 16 ); - eps = LAPACKE_dlamch_work('e'); printf("\n");