Commit a4f56680 authored by Mathieu Faverge

Fix distributed norms

parent 3c0f1115
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
static inline void static inline void
chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
CHAM_desc_t *Wcol, CHAM_desc_t *Welt, CHAM_desc_t *Wcol, CHAM_desc_t *Welt,
RUNTIME_option_t *options) RUNTIME_option_t *options )
{ {
int m, n; int m, n;
int minMNT = chameleon_min( A->mt, A->nt ); int minMNT = chameleon_min( A->mt, A->nt );
...@@ -58,7 +58,7 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -58,7 +58,7 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
int ldam = BLKLDD( A, m ); int ldam = BLKLDD( A, m );
if ( (n == m) && (uplo != ChamUpperLower) ) { if ( (n == m) && (uplo != ChamUpperLower) ) {
INSERT_TASK_ztrasm( INSERT_TASK_ztrasm(
options, options,
ChamColumnwise, uplo, diag, tempmm, tempnn, ChamColumnwise, uplo, diag, tempmm, tempnn,
...@@ -95,7 +95,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -95,7 +95,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
INSERT_TASK_dlange( INSERT_TASK_dlange(
options, options,
ChamMaxNorm, 1, tempnn, A->nb, ChamMaxNorm, 1, tempnn, A->nb,
W( Wcol, 0, n), 1, W( Welt, 0, n)); W( Wcol, 0, n ), 1,
W( Welt, 0, n ) );
} }
/** /**
...@@ -105,7 +106,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -105,7 +106,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
for(n = Q; n < NT; n++) { for(n = Q; n < NT; n++) {
INSERT_TASK_dlange_max( INSERT_TASK_dlange_max(
options, options,
W( Welt, 0, n), W( Welt, 0, n%Q) ); W( Welt, 0, n ),
W( Welt, 0, n%Q ) );
} }
/** /**
...@@ -115,7 +117,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -115,7 +117,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
for(n = 1; n < Q; n++) { for(n = 1; n < Q; n++) {
INSERT_TASK_dlange_max( INSERT_TASK_dlange_max(
options, options,
W( Welt, 0, n), W( Welt, 0, 0) ); W( Welt, 0, n ),
W( Welt, 0, 0 ) );
} }
} }
...@@ -247,13 +250,14 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -247,13 +250,14 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
INSERT_TASK_zlange( INSERT_TASK_zlange(
options, options,
ChamMaxNorm, tempmm, tempnn, A->nb, ChamMaxNorm, tempmm, tempnn, A->nb,
A(m, n), ldam, W( Welt, m, n)); A(m, n), ldam, W( Welt, m, n ));
} }
if ( n >= Q ) { if ( n >= Q ) {
INSERT_TASK_dlange_max( INSERT_TASK_dlange_max(
options, options,
W( Welt, m, n), W( Welt, m, n%Q) ); W( Welt, m, n ),
W( Welt, m, n%Q ) );
} }
} }
...@@ -264,7 +268,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -264,7 +268,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
for(n = 1; n < Q; n++) { for(n = 1; n < Q; n++) {
INSERT_TASK_dlange_max( INSERT_TASK_dlange_max(
options, options,
W( Welt, m, n), W( Welt, m, 0) ); W( Welt, m, n ),
W( Welt, m, 0 ) );
} }
} }
...@@ -275,7 +280,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -275,7 +280,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
for(m = P; m < MT; m++) { for(m = P; m < MT; m++) {
INSERT_TASK_dlange_max( INSERT_TASK_dlange_max(
options, options,
W( Welt, m, 0), W( Welt, m%P, 0) ); W( Welt, m, 0 ),
W( Welt, m%P, 0 ) );
} }
/** /**
...@@ -285,7 +291,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -285,7 +291,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
for(m = 1; m < P; m++) { for(m = 1; m < P; m++) {
INSERT_TASK_dlange_max( INSERT_TASK_dlange_max(
options, options,
W( Welt, m, 0), W( Welt, 0, 0) ); W( Welt, m, 0 ),
W( Welt, 0, 0 ) );
} }
} }
...@@ -382,7 +389,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -382,7 +389,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
double alpha = 0.0; double alpha = 0.0;
double beta = 0.0; double beta = 0.0;
int workn, workmt, worknt; int workmt, worknt;
int m, n, wcol_init = 0; int m, n, wcol_init = 0;
chamctxt = chameleon_context_self(); chamctxt = chameleon_context_self();
...@@ -395,7 +402,6 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -395,7 +402,6 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
workmt = chameleon_max( A->mt, A->p ); workmt = chameleon_max( A->mt, A->p );
worknt = chameleon_max( A->nt, A->q ); worknt = chameleon_max( A->nt, A->q );
workn = chameleon_max( A->n, A->q );
switch ( norm ) { switch ( norm ) {
case ChamOneNorm: case ChamOneNorm:
...@@ -502,7 +508,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -502,7 +508,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
*/ */
for(m = 0; m < A->p; m++) { for(m = 0; m < A->p; m++) {
for(n = 0; n < A->q; n++) { for(n = 0; n < A->q; n++) {
if ( (m != 0) && (n != 0) ) { if ( (m != 0) || (n != 0) ) {
INSERT_TASK_dlacpy( INSERT_TASK_dlacpy(
&options, &options,
ChamUpperLower, 1, 1, 1, ChamUpperLower, 1, 1, 1,
...@@ -514,7 +520,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -514,7 +520,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
CHAMELEON_Desc_Flush( &Welt, sequence ); CHAMELEON_Desc_Flush( &Welt, sequence );
RUNTIME_sequence_wait(chamctxt, sequence); RUNTIME_sequence_wait(chamctxt, sequence);
*result = *(double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ); *result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ));
if ( wcol_init ) { if ( wcol_init ) {
chameleon_desc_destroy( &Wcol ); chameleon_desc_destroy( &Wcol );
......
...@@ -315,7 +315,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra ...@@ -315,7 +315,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
double alpha = 0.0; double alpha = 0.0;
double beta = 0.0; double beta = 0.0;
int workn, workmt, worknt; int workmt, worknt;
int m, n, wcol_init = 0; int m, n, wcol_init = 0;
chamctxt = chameleon_context_self(); chamctxt = chameleon_context_self();
...@@ -328,7 +328,6 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra ...@@ -328,7 +328,6 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
workmt = chameleon_max( A->mt, A->p ); workmt = chameleon_max( A->mt, A->p );
worknt = chameleon_max( A->nt, A->q ); worknt = chameleon_max( A->nt, A->q );
workn = chameleon_max( A->n, A->q );
switch ( norm ) { switch ( norm ) {
case ChamOneNorm: case ChamOneNorm:
...@@ -415,7 +414,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra ...@@ -415,7 +414,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
*/ */
for(m = 0; m < A->p; m++) { for(m = 0; m < A->p; m++) {
for(n = 0; n < A->q; n++) { for(n = 0; n < A->q; n++) {
if ( (m != 0) && (n != 0) ) { if ( (m != 0) || (n != 0) ) {
INSERT_TASK_dlacpy( INSERT_TASK_dlacpy(
&options, &options,
ChamUpperLower, 1, 1, 1, ChamUpperLower, 1, 1, 1,
......
...@@ -77,11 +77,8 @@ ...@@ -77,11 +77,8 @@
* Global shortcuts * Global shortcuts
*/ */
#define CHAMELEON_RANK chameleon_rank(chamctxt) #define CHAMELEON_RANK chameleon_rank(chamctxt)
#define CHAMELEON_SIZE chamctxt->world_size
#define CHAMELEON_GRPSIZE chamctxt->group_size
#define CHAMELEON_NB chamctxt->nb #define CHAMELEON_NB chamctxt->nb
#define CHAMELEON_IB chamctxt->ib #define CHAMELEON_IB chamctxt->ib
#define CHAMELEON_SCHEDULING chamctxt->scheduling
#define CHAMELEON_RHBLK chamctxt->rhblock #define CHAMELEON_RHBLK chamctxt->rhblock
#define CHAMELEON_TRANSLATION chamctxt->translation #define CHAMELEON_TRANSLATION chamctxt->translation
#define CHAMELEON_PARALLEL chamctxt->parallel_enabled #define CHAMELEON_PARALLEL chamctxt->parallel_enabled
......
...@@ -109,8 +109,6 @@ typedef struct chameleon_context_s { ...@@ -109,8 +109,6 @@ typedef struct chameleon_context_s {
int my_mpi_rank; int my_mpi_rank;
int mpi_comm_size; int mpi_comm_size;
#endif #endif
int world_size;
int group_size;
/* Boolean flags */ /* Boolean flags */
cham_bool_t warnings_enabled; cham_bool_t warnings_enabled;
......
...@@ -57,18 +57,18 @@ ...@@ -57,18 +57,18 @@
* *
*/ */
void INSERT_TASK_zplssq( const RUNTIME_option_t *options, void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *IN, int INm, int INn,
const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) const CHAM_desc_t *OUT, int OUTm, int OUTn )
{ {
double *scalesum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); double *sclssq_in = RTBLKADDR(IN, double, INm, INn );
double *scl = RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn); double *sclssq_out = RTBLKADDR(OUT, double, OUTm, OUTn);
#pragma omp task depend(in: scalesum[0]) depend(inout: scl[0]) #pragma omp task depend(in: sclssq_in[0]) depend(inout: sclssq_out[0])
{ {
if( scl[0] < scalesum[0] ) { if( sclssq_out[0] < sclssq_in[0] ) {
scl[1] = scalesum[1] + (scl[1] * (( scl[0] / scalesum[0] ) * ( scl[0] / scalesum[0] ))); sclssq_out[1] = sclssq_in[1] + (sclssq_out[1] * (( sclssq_out[0] / sclssq_in[0] ) * ( sclssq_out[0] / sclssq_in[0] )));
scl[0] = scalesum[0]; sclssq_out[0] = sclssq_in[0];
} else { } else {
scl[1] = scl[1] + (scalesum[1] * (( scalesum[0] / scl[0] ) * ( scalesum[0] / scl[0] ))); sclssq_out[1] = sclssq_out[1] + (sclssq_in[1] * (( sclssq_in[0] / sclssq_out[0] ) * ( sclssq_in[0] / sclssq_out[0] )));
} }
} }
} }
......
...@@ -56,17 +56,20 @@ static inline int ...@@ -56,17 +56,20 @@ static inline int
CORE_zplssq_parsec( parsec_execution_stream_t *context, CORE_zplssq_parsec( parsec_execution_stream_t *context,
parsec_task_t *this_task ) parsec_task_t *this_task )
{ {
double *SCALESUMSQ; double *SCLSSQ_IN;
double *SCLSSQ; double *SCLSSQ_OUT;
parsec_dtd_unpack_args( parsec_dtd_unpack_args(
this_task, &SCALESUMSQ, &SCLSSQ ); this_task, &SCLSSQ_IN, &SCLSSQ_OUT );
if( SCLSSQ[0] < SCALESUMSQ[0] ) { assert( SCLSSQ_OUT[0] >= 0. );
SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1] * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] ))); if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
SCLSSQ[0] = SCALESUMSQ[0]; SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
SCLSSQ_OUT[0] = SCLSSQ_IN[0];
} else { } else {
SCLSSQ[1] = SCLSSQ[1] + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] ))); if ( SCLSSQ_OUT[0] > 0 ) {
SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
}
} }
(void)context; (void)context;
......
...@@ -26,16 +26,19 @@ ...@@ -26,16 +26,19 @@
void CORE_zplssq_quark(Quark *quark) void CORE_zplssq_quark(Quark *quark)
{ {
double *SCALESUMSQ; double *SCLSSQ_IN;
double *SCLSSQ; double *SCLSSQ_OUT;
quark_unpack_args_2( quark, SCALESUMSQ, SCLSSQ ); quark_unpack_args_2( quark, SCLSSQ_IN, SCLSSQ_OUT );
if( SCLSSQ[0] < SCALESUMSQ[0] ) { assert( SCLSSQ_OUT[0] >= 0. );
SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1] * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] ))); if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
SCLSSQ[0] = SCALESUMSQ[0]; SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
SCLSSQ_OUT[0] = SCLSSQ_IN[0];
} else { } else {
SCLSSQ[1] = SCLSSQ[1] + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] ))); if ( SCLSSQ_OUT[0] > 0 ) {
SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
}
} }
} }
......
...@@ -120,13 +120,13 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) ...@@ -120,13 +120,13 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg)
int M; int M;
int N; int N;
CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t alpha;
CHAMELEON_Complex64_t *A; const CHAMELEON_Complex64_t *A;
int LDA; int LDA;
CHAMELEON_Complex64_t beta; CHAMELEON_Complex64_t beta;
CHAMELEON_Complex64_t *B; CHAMELEON_Complex64_t *B;
int LDB; int LDB;
A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &LDA, &beta, &LDB); starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &LDA, &beta, &LDB);
CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB); CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB);
......
...@@ -32,17 +32,17 @@ ...@@ -32,17 +32,17 @@
* *
*/ */
void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options, void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
cham_uplo_t uplo, int m, int n, int nb, cham_uplo_t uplo, int m, int n, int nb,
int displA, const CHAM_desc_t *A, int Am, int An, int lda, int displA, const CHAM_desc_t *A, int Am, int An, int lda,
int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb) int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb)
{ {
(void)nb; (void)nb;
struct starpu_codelet *codelet = &cl_zlacpy; struct starpu_codelet *codelet = &cl_zlacpy;
void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL; void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL;
CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_BEGIN_ACCESS_DECLARATION;
CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R( A, Am, An );
CHAMELEON_ACCESS_W(B, Bm, Bn); CHAMELEON_ACCESS_W( B, Bm, Bn );
CHAMELEON_END_ACCESS_DECLARATION; CHAMELEON_END_ACCESS_DECLARATION;
starpu_insert_task( starpu_insert_task(
...@@ -65,13 +65,13 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options, ...@@ -65,13 +65,13 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
} }
void INSERT_TASK_zlacpy(const RUNTIME_option_t *options, void INSERT_TASK_zlacpy(const RUNTIME_option_t *options,
cham_uplo_t uplo, int m, int n, int nb, cham_uplo_t uplo, int m, int n, int nb,
const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *A, int Am, int An, int lda,
const CHAM_desc_t *B, int Bm, int Bn, int ldb) const CHAM_desc_t *B, int Bm, int Bn, int ldb)
{ {
INSERT_TASK_zlacpyx( options, uplo, m, n, nb, INSERT_TASK_zlacpyx( options, uplo, m, n, nb,
0, A, Am, An, lda, 0, A, Am, An, lda,
0, B, Bm, Bn, ldb ); 0, B, Bm, Bn, ldb );
} }
#if !defined(CHAMELEON_SIMULATION) #if !defined(CHAMELEON_SIMULATION)
......
...@@ -85,24 +85,25 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options, ...@@ -85,24 +85,25 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options,
struct starpu_codelet *codelet = &cl_clag2z; struct starpu_codelet *codelet = &cl_clag2z;
void (*callback)(void*) = options->profiling ? cl_clag2z_callback : NULL; void (*callback)(void*) = options->profiling ? cl_clag2z_callback : NULL;
if ( chameleon_desc_islocal( A, Am, An ) || CHAMELEON_BEGIN_ACCESS_DECLARATION;
chameleon_desc_islocal( B, Bm, Bn ) ) CHAMELEON_ACCESS_R( A, Am, An );
{ CHAMELEON_ACCESS_W( B, Bm, Bn );
starpu_insert_task( CHAMELEON_END_ACCESS_DECLARATION;
starpu_mpi_codelet(codelet),
STARPU_VALUE, &m, sizeof(int), starpu_insert_task(
STARPU_VALUE, &n, sizeof(int), starpu_mpi_codelet(codelet),
STARPU_R, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), STARPU_VALUE, &m, sizeof(int),
STARPU_VALUE, &lda, sizeof(int), STARPU_VALUE, &n, sizeof(int),
STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An),
STARPU_VALUE, &ldb, sizeof(int), STARPU_VALUE, &lda, sizeof(int),
STARPU_PRIORITY, options->priority, STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
STARPU_CALLBACK, callback, STARPU_VALUE, &ldb, sizeof(int),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
#if defined(CHAMELEON_CODELETS_HAVE_NAME) #if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, "clag2z", STARPU_NAME, "clag2z",
#endif #endif
0); 0);
}
} }
......
...@@ -70,7 +70,7 @@ static void cl_zlange_cpu_func(void *descr[], void *cl_arg) ...@@ -70,7 +70,7 @@ static void cl_zlange_cpu_func(void *descr[], void *cl_arg)
work = (double *)STARPU_MATRIX_GET_PTR(descr[1]); work = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]); normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &norm, &M, &N, &LDA); starpu_codelet_unpack_args(cl_arg, &norm, &M, &N, &LDA);
CORE_zlange( norm, M, N, A, LDA, work, normA); CORE_zlange( norm, M, N, A, LDA, work, normA );
} }
#endif /* !defined(CHAMELEON_SIMULATION) */ #endif /* !defined(CHAMELEON_SIMULATION) */
...@@ -86,34 +86,35 @@ void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, ...@@ -86,34 +86,35 @@ void INSERT_TASK_zlange_max(const RUNTIME_option_t *options,
struct starpu_codelet *codelet = &cl_zlange_max; struct starpu_codelet *codelet = &cl_zlange_max;
void (*callback)(void*) = options->profiling ? cl_zlange_callback : NULL; void (*callback)(void*) = options->profiling ? cl_zlange_callback : NULL;
if ( chameleon_desc_islocal( A, Am, An ) || CHAMELEON_BEGIN_ACCESS_DECLARATION;
chameleon_desc_islocal( B, Bm, Bn ) ) CHAMELEON_ACCESS_R( A, Am, An );
{ CHAMELEON_ACCESS_RW( B, Bm, Bn );
starpu_insert_task( CHAMELEON_END_ACCESS_DECLARATION;
starpu_mpi_codelet(codelet),
STARPU_R, RTBLKADDR(A, double, Am, An), starpu_insert_task(
STARPU_RW, RTBLKADDR(B, double, Bm, Bn), starpu_mpi_codelet(codelet),
STARPU_PRIORITY, options->priority, STARPU_R, RTBLKADDR(A, double, Am, An),
STARPU_CALLBACK, callback, STARPU_RW, RTBLKADDR(B, double, Bm, Bn),
STARPU_PRIORITY, options->priority,
STARPU_CALLBACK, callback,
#if defined(CHAMELEON_CODELETS_HAVE_NAME) #if defined(CHAMELEON_CODELETS_HAVE_NAME)
STARPU_NAME, "zlange_max", STARPU_NAME, "zlange_max",
#endif #endif
0); 0);
}
} }
#if !defined(CHAMELEON_SIMULATION) #if !defined(CHAMELEON_SIMULATION)
static void cl_zlange_max_cpu_func(void *descr[], void *cl_arg) static void cl_zlange_max_cpu_func(void *descr[], void *cl_arg)
{ {
double *A; double *A;
double *normA; double *B;
A = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
normA = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
if ( *A > *normA ) A = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
*normA = *A; B = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
if ( *A > *B ) {
*B = *A;
}
(void)cl_arg; (void)cl_arg;
} }
#endif /* !defined(CHAMELEON_SIMULATION) */ #endif /* !defined(CHAMELEON_SIMULATION) */
......
...@@ -55,21 +55,21 @@ ...@@ -55,21 +55,21 @@
* *
*/ */
void INSERT_TASK_zplssq( const RUNTIME_option_t *options, void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *SCLSSQ_IN, int SCLSSQ_INm, int SCLSSQ_INn,
const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn )
{ {
struct starpu_codelet *codelet = &cl_zplssq; struct starpu_codelet *codelet = &cl_zplssq;
void (*callback)(void*) = options->profiling ? cl_zplssq_callback : NULL; void (*callback)(void*) = options->profiling ? cl_zplssq_callback : NULL;
CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_BEGIN_ACCESS_DECLARATION;
CHAMELEON_ACCESS_R(SCALESUMSQ, SCALESUMSQm, SCALESUMSQn); CHAMELEON_ACCESS_R( SCLSSQ_IN, SCLSSQ_INm, SCLSSQ_INn );
CHAMELEON_ACCESS_RW(SCLSSQ, SCLSSQm, SCLSSQn); CHAMELEON_ACCESS_RW( SCLSSQ_OUT, SCLSSQ_OUTm, SCLSSQ_OUTn );
CHAMELEON_END_ACCESS_DECLARATION; CHAMELEON_END_ACCESS_DECLARATION;
starpu_insert_task( starpu_insert_task(
starpu_mpi_codelet(codelet), starpu_mpi_codelet(codelet),
STARPU_R, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), STARPU_R, RTBLKADDR( SCLSSQ_IN, double, SCLSSQ_INm, SCLSSQ_INn ),
STARPU_RW, RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn), STARPU_RW, RTBLKADDR( SCLSSQ_OUT, double