Commit fa6d78a3 authored by Mathieu Faverge

Merge branch 'starpu/alloc_on_the_fly' into 'master'

Starpu/alloc on the fly

See merge request solverstack/chameleon!140
parents e932eacc f676d524
...@@ -72,11 +72,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -72,11 +72,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
} }
if ( m >= P ) { if ( m >= P ) {
INSERT_TASK_dgeadd( INSERT_TASK_daxpy(
options, options, tempnn, 1.,
ChamNoTrans, 1, tempnn, A->nb, W( Wcol, m, n ), 1,
1.0, W( Wcol, m, n ), 1, W( Wcol, m%P, n ), 1 );
1.0, W( Wcol, m%P, n ), 1 );
} }
} }
...@@ -85,11 +84,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -85,11 +84,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
* For each i, W(i, n) = reduce( W(0..P-1, n) ) * For each i, W(i, n) = reduce( W(0..P-1, n) )
*/ */
for(m = 1; m < P; m++) { for(m = 1; m < P; m++) {
INSERT_TASK_dgeadd( INSERT_TASK_daxpy(
options, options, tempnn, 1.,
ChamNoTrans, 1, tempnn, A->nb, W( Wcol, m, n ), 1,
1.0, W( Wcol, m, n ), 1, W( Wcol, 0, n ), 1 );
1.0, W( Wcol, 0, n ), 1 );
} }
INSERT_TASK_dlange( INSERT_TASK_dlange(
...@@ -165,11 +163,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -165,11 +163,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
} }
if ( n >= Q ) { if ( n >= Q ) {
INSERT_TASK_dgeadd( INSERT_TASK_daxpy(
options, options, tempmm, 1.,
ChamNoTrans, tempmm, 1, A->mb, W( Wcol, m, n ), 1,
1.0, W( Wcol, m, n ), tempmm, W( Wcol, m, n%Q ), 1 );
1.0, W( Wcol, m, n%Q), tempmm );
} }
} }
...@@ -178,11 +175,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, ...@@ -178,11 +175,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
* For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) ) * For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
*/ */
for(n = 1; n < Q; n++) { for(n = 1; n < Q; n++) {
INSERT_TASK_dgeadd( INSERT_TASK_daxpy(
options, options, tempmm, 1.,
ChamNoTrans, tempmm, 1, A->mb, W( Wcol, m, n ), 1,
1.0, W( Wcol, m, n), tempmm, W( Wcol, m, 0 ), 1 );
1.0, W( Wcol, m, 0), tempmm );
} }
INSERT_TASK_dlange( INSERT_TASK_dlange(
...@@ -407,11 +403,14 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -407,11 +403,14 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
case ChamOneNorm: case ChamOneNorm:
RUNTIME_options_ws_alloc( &options, 1, 0 ); RUNTIME_options_ws_alloc( &options, 1, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, A->nb, A->nb, chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 1, A->nb, A->nb,
workmt, worknt * A->nb, 0, 0, workmt, worknt * A->nb, A->p, A->q, workmt, worknt * A->nb, 0, 0, workmt, worknt * A->nb, A->p, A->q,
NULL, NULL, NULL ); NULL, NULL, NULL );
wcol_init = 1; wcol_init = 1;
/*
* Use the global allocator for Welt, otherwise flush may free the data before the result is read.
*/
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1, chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
A->p, worknt, 0, 0, A->p, worknt, A->p, A->q, A->p, worknt, 0, 0, A->p, worknt, A->p, A->q,
NULL, NULL, NULL ); NULL, NULL, NULL );
...@@ -424,7 +423,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -424,7 +423,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
case ChamInfNorm: case ChamInfNorm:
RUNTIME_options_ws_alloc( &options, A->mb, 0 ); RUNTIME_options_ws_alloc( &options, A->mb, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, A->mb, 1, A->mb, chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q, workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
NULL, NULL, NULL ); NULL, NULL, NULL );
wcol_init = 1; wcol_init = 1;
...@@ -522,7 +521,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia ...@@ -522,7 +521,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
} }
CHAMELEON_Desc_Flush( &Welt, sequence ); CHAMELEON_Desc_Flush( &Welt, sequence );
CHAMELEON_Desc_Flush( A, sequence ); CHAMELEON_Desc_Flush( A, sequence );
RUNTIME_sequence_wait(chamctxt, sequence); RUNTIME_sequence_wait( chamctxt, sequence );
*result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q )); *result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ));
......
...@@ -81,11 +81,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A, ...@@ -81,11 +81,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A,
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
for(n = Q; n < NT; n++) { for(n = Q; n < NT; n++) {
INSERT_TASK_dgeadd( INSERT_TASK_daxpy(
options, options, tempmm, 1.,
ChamNoTrans, tempmm, 1, A->nb, W( Wcol, m, n ), 1,
1.0, W( Wcol, m, n ), tempmm, W( Wcol, m, n%Q ), 1 );
1.0, W( Wcol, m, n%Q), tempmm );
} }
/** /**
...@@ -93,11 +92,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A, ...@@ -93,11 +92,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A,
* For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) ) * For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
*/ */
for(n = 1; n < Q; n++) { for(n = 1; n < Q; n++) {
INSERT_TASK_dgeadd( INSERT_TASK_daxpy(
options, options, tempmm, 1.,
ChamNoTrans, tempmm, 1, A->mb, W( Wcol, m, n ), 1,
1.0, W( Wcol, m, n), tempmm, W( Wcol, m, 0 ), 1 );
1.0, W( Wcol, m, 0), tempmm );
} }
INSERT_TASK_dlange( INSERT_TASK_dlange(
...@@ -334,11 +332,14 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra ...@@ -334,11 +332,14 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
case ChamInfNorm: case ChamInfNorm:
RUNTIME_options_ws_alloc( &options, 1, 0 ); RUNTIME_options_ws_alloc( &options, 1, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, A->mb, 1, A->mb, chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q, workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
NULL, NULL, NULL ); NULL, NULL, NULL );
wcol_init = 1; wcol_init = 1;
/*
* Use the global allocator for Welt, otherwise flush may free the data before the result is read.
*/
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1, chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
workmt, A->q, 0, 0, workmt, A->q, A->p, A->q, workmt, A->q, 0, 0, workmt, A->q, A->p, A->q,
NULL, NULL, NULL ); NULL, NULL, NULL );
......
...@@ -466,6 +466,12 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, ...@@ -466,6 +466,12 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
RUNTIME_data_flush( sequence, T(k, n) ); RUNTIME_data_flush( sequence, T(k, n) );
} }
/* Restore the original location of the tiles */
for (m = 0; m < B->mt; m++) {
RUNTIME_data_migrate( sequence, B( m, k ),
B->get_rankof( B, m, k ) );
}
RUNTIME_iteration_pop(chamctxt); RUNTIME_iteration_pop(chamctxt);
} }
} }
......
...@@ -467,6 +467,12 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, ...@@ -467,6 +467,12 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
RUNTIME_data_flush( sequence, T(n, k) ); RUNTIME_data_flush( sequence, T(n, k) );
} }
/* Restore the original location of the tiles */
for (m = 0; m < B->mt; m++) {
RUNTIME_data_migrate( sequence, B(m, k),
B->get_rankof( B, m, k ) );
}
RUNTIME_iteration_pop(chamctxt); RUNTIME_iteration_pop(chamctxt);
} }
} }
......
...@@ -266,7 +266,7 @@ int CHAMELEON_zlaset_Tile_Async( cham_uplo_t uplo, ...@@ -266,7 +266,7 @@ int CHAMELEON_zlaset_Tile_Async( cham_uplo_t uplo,
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
} }
/* Check input arguments */ /* Check input arguments */
if (A->nb != A->mb) { if ( (alpha != beta) && (A->nb != A->mb) ) {
chameleon_error("CHAMELEON_zlaset_Tile_Async", "only square tiles supported"); chameleon_error("CHAMELEON_zlaset_Tile_Async", "only square tiles supported");
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
} }
......
...@@ -134,7 +134,7 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree, ...@@ -134,7 +134,7 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree,
static inline int static inline int
chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int q ) { chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int q ) {
int diag_m = chameleon_min( m, n ); int diag_m = chameleon_min( m, n );
return chameleon_desc_init( descA, CHAMELEON_MAT_ALLOC_GLOBAL, return chameleon_desc_init( descA, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, nb, nb, nb*nb, ChamComplexDouble, nb, nb, nb*nb,
diag_m, nb, 0, 0, diag_m, nb, p, q, diag_m, nb, 0, 0, diag_m, nb, p, q,
chameleon_getaddr_diag, chameleon_getaddr_diag,
...@@ -145,7 +145,7 @@ chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int ...@@ -145,7 +145,7 @@ chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int
#define chameleon_zdesc_alloc( descA, mb, nb, lm, ln, i, j, m, n, free) \ #define chameleon_zdesc_alloc( descA, mb, nb, lm, ln, i, j, m, n, free) \
{ \ { \
int rc; \ int rc; \
rc = chameleon_desc_init( &(descA), CHAMELEON_MAT_ALLOC_GLOBAL, \ rc = chameleon_desc_init( &(descA), CHAMELEON_MAT_ALLOC_TILE, \
ChamComplexDouble, (mb), (nb), ((mb)*(nb)), \ ChamComplexDouble, (mb), (nb), ((mb)*(nb)), \
(m), (n), (i), (j), (m), (n), 1, 1, \ (m), (n), (i), (j), (m), (n), 1, 1, \
NULL, NULL, NULL ); \ NULL, NULL, NULL ); \
...@@ -174,7 +174,7 @@ chameleon_zlap2tile( CHAM_context_t *chamctxt, ...@@ -174,7 +174,7 @@ chameleon_zlap2tile( CHAM_context_t *chamctxt,
if ( CHAMELEON_TRANSLATION == ChamOutOfPlace ) { if ( CHAMELEON_TRANSLATION == ChamOutOfPlace ) {
/* Initialize the tile descriptor */ /* Initialize the tile descriptor */
chameleon_desc_init( descAt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, mb, nb, (mb)*(nb), chameleon_desc_init( descAt, CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, mb, nb, (mb)*(nb),
lm, ln, 0, 0, m, n, 1, 1, lm, ln, 0, 0, m, n, 1, 1,
chameleon_getaddr_ccrb, chameleon_getblkldd_ccrb, NULL ); chameleon_getaddr_ccrb, chameleon_getblkldd_ccrb, NULL );
...@@ -235,6 +235,7 @@ chameleon_ztile2lap( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t ...@@ -235,6 +235,7 @@ chameleon_ztile2lap( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t
static inline void static inline void
chameleon_ztile2lap_cleanup( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t *descAt ) chameleon_ztile2lap_cleanup( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t *descAt )
{ {
(void)chamctxt;
chameleon_desc_destroy( descAl ); chameleon_desc_destroy( descAl );
chameleon_desc_destroy( descAt ); chameleon_desc_destroy( descAt );
} }
......
...@@ -226,26 +226,32 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat, ...@@ -226,26 +226,32 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat,
/* The matrix is allocated tile by tile with out of core */
desc->ooc = 0; desc->ooc = 0;
// Matrix address switch ( (intptr_t)mat ) {
if ( mat == CHAMELEON_MAT_ALLOC_GLOBAL ) { case (intptr_t)CHAMELEON_MAT_ALLOC_TILE:
rc = chameleon_desc_mat_alloc( desc ); if ( chamctxt->scheduler == RUNTIME_SCHED_STARPU ) {
/* Let's use the allocation on the fly as in OOC */
desc->get_blkaddr = chameleon_getaddr_null;
desc->mat = NULL;
break;
}
/* Otherwise we switch back to the full allocation */
desc->alloc_mat = 1; case (intptr_t)CHAMELEON_MAT_ALLOC_GLOBAL:
desc->use_mat = 1;
}
else if ( mat == CHAMELEON_MAT_ALLOC_TILE ) {
//chameleon_error( "chameleon_desc_init", "CHAMELEON_MAT_ALLOC_TILE is not available yet" );
//desc->mat = NULL;
rc = chameleon_desc_mat_alloc( desc ); rc = chameleon_desc_mat_alloc( desc );
desc->alloc_mat = 1;
desc->use_mat = 1; desc->use_mat = 1;
break;
desc->alloc_mat = 1; case (intptr_t)CHAMELEON_MAT_OOC:
} if ( chamctxt->scheduler != RUNTIME_SCHED_STARPU ) {
else if ( mat == CHAMELEON_MAT_OOC ) { chameleon_error("CHAMELEON_Desc_Create", "CHAMELEON Out-of-Core descriptors are supported only with StarPU");
return CHAMELEON_ERR_NOT_SUPPORTED;
}
desc->mat = NULL; desc->mat = NULL;
desc->ooc = 1; desc->ooc = 1;
} break;
else {
default:
/* memory of the matrix is handled by users */ /* memory of the matrix is handled by users */
desc->mat = mat; desc->mat = mat;
desc->use_mat = 1; desc->use_mat = 1;
......
...@@ -74,7 +74,8 @@ int chameleon_alloc_ibnb_tile(int M, int N, cham_tasktype_t func, int type, CHAM ...@@ -74,7 +74,8 @@ int chameleon_alloc_ibnb_tile(int M, int N, cham_tasktype_t func, int type, CHAM
lm = IB * MT; lm = IB * MT;
ln = NB * NT; ln = NB * NT;
return CHAMELEON_Desc_Create( desc, NULL, type, IB, NB, IB*NB, lm, ln, 0, 0, lm, ln, p, q ); return CHAMELEON_Desc_Create( desc, CHAMELEON_MAT_ALLOC_TILE, type, IB, NB, IB*NB,
lm, ln, 0, 0, lm, ln, p, q );
} }
/** /**
...@@ -119,7 +120,8 @@ int chameleon_alloc_ipiv(int M, int N, cham_tasktype_t func, int type, CHAM_desc ...@@ -119,7 +120,8 @@ int chameleon_alloc_ipiv(int M, int N, cham_tasktype_t func, int type, CHAM_desc
/* TODO: Fix the distribution for IPIV */ /* TODO: Fix the distribution for IPIV */
*IPIV = (int*)malloc( size ); *IPIV = (int*)malloc( size );
return CHAMELEON_Desc_Create( desc, NULL, type, IB, NB, IB*NB, lm, ln, 0, 0, lm, ln, p, q ); return CHAMELEON_Desc_Create( desc, CHAMELEON_MAT_ALLOC_TILE, type, IB, NB, IB*NB,
lm, ln, 0, 0, lm, ln, p, q );
} }
/** /**
......
...@@ -67,7 +67,7 @@ ...@@ -67,7 +67,7 @@
* The leading dimension of the array A. LDA >= max(1,M). * The leading dimension of the array A. LDA >= max(1,M).
* *
* @param[out] T * @param[out] T
* The IB-by-N triangular factor T of the block reflector. * The IB-by-M triangular factor T of the block reflector.
* T is upper triangular by block (economic storage); * T is upper triangular by block (economic storage);
* The rest of the array is not referenced. * The rest of the array is not referenced.
* *
......
...@@ -483,6 +483,8 @@ INSERT_TASK_ztsmlq( const RUNTIME_option_t *options, ...@@ -483,6 +483,8 @@ INSERT_TASK_ztsmlq( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv, const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt ) const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{ {
(void)m1;
(void)n1;
return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb, return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt, V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
...@@ -497,6 +499,8 @@ INSERT_TASK_ztsmqr( const RUNTIME_option_t *options, ...@@ -497,6 +499,8 @@ INSERT_TASK_ztsmqr( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv, const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt ) const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{ {
(void)m1;
(void)n1;
return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, nb, return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt, V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
...@@ -511,6 +515,8 @@ INSERT_TASK_zttmlq( const RUNTIME_option_t *options, ...@@ -511,6 +515,8 @@ INSERT_TASK_zttmlq( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv, const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt ) const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{ {
(void)m1;
(void)n1;
return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb, return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt, V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
...@@ -525,6 +531,8 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options, ...@@ -525,6 +531,8 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv, const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt ) const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{ {
(void)m1;
(void)n1;
return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb, return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt, V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 ); A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
......
...@@ -98,10 +98,13 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, ...@@ -98,10 +98,13 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize; int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0])
{ {
CHAMELEON_Complex64_t TAU[ws_size]; CHAMELEON_Complex64_t TAU[ws_size];
CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n ); CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
CORE_zlaset( ChamUpperLower, ib, m, 0., 0., ptrT, ldt );
CORE_zgelqt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work );
} }
} }
...@@ -99,10 +99,13 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, ...@@ -99,10 +99,13 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize; int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0])
{ {
CHAMELEON_Complex64_t TAU[ws_size]; CHAMELEON_Complex64_t TAU[ws_size];
CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n); CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
CORE_zlaset( ChamUpperLower, ib, n, 0., 0., ptrT, ldt );
CORE_zgeqrt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work );
} }
} }
...@@ -31,9 +31,13 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options, ...@@ -31,9 +31,13 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize; int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0], ptrT[0])
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0])
{ {
CHAMELEON_Complex64_t work[ws_size]; CHAMELEON_Complex64_t work[ws_size];
CORE_zlaset( ChamUpperLower, ib, M, 0., 0., ptrT, ldt);
CORE_ztplqt( M, N, L, ib, CORE_ztplqt( M, N, L, ib,
ptrA, lda, ptrB, ldb, ptrT, ldt, work ); ptrA, lda, ptrB, ldb, ptrT, ldt, work );
} }
......
...@@ -30,9 +30,13 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, ...@@ -30,9 +30,13 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn); CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn); CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize; int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0])
{ {
CHAMELEON_Complex64_t tmp[ws_size]; CHAMELEON_Complex64_t tmp[ws_size];
CORE_zlaset( ChamUpperLower, ib, N, 0., 0., ptrT, ldt);
CORE_ztpqrt( M, N, L, ib, CORE_ztpqrt( M, N, L, ib,
ptrA, lda, ptrB, ldb, ptrT, ldt, tmp ); ptrA, lda, ptrB, ldb, ptrT, ldt, tmp );
} }
......
...@@ -98,6 +98,7 @@ CORE_zgelqt_parsec( parsec_execution_stream_t *context, ...@@ -98,6 +98,7 @@ CORE_zgelqt_parsec( parsec_execution_stream_t *context,
parsec_dtd_unpack_args( parsec_dtd_unpack_args(
this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK ); this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK );
CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt );
CORE_zgelqt( m, n, ib, A, lda, T, ldt, TAU, WORK ); CORE_zgelqt( m, n, ib, A, lda, T, ldt, TAU, WORK );
(void)context; (void)context;
......
...@@ -99,6 +99,7 @@ CORE_zgeqrt_parsec ( parsec_execution_stream_t *context, ...@@ -99,6 +99,7 @@ CORE_zgeqrt_parsec ( parsec_execution_stream_t *context,
parsec_dtd_unpack_args( parsec_dtd_unpack_args(
this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK ); this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK );
CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt );
CORE_zgeqrt( m, n, ib, A, lda, T, ldt, TAU, WORK ); CORE_zgeqrt( m, n, ib, A, lda, T, ldt, TAU, WORK );
(void)context; (void)context;
......
...@@ -40,6 +40,7 @@ CORE_ztplqt_parsec( parsec_execution_stream_t *context, ...@@ -40,6 +40,7 @@ CORE_ztplqt_parsec( parsec_execution_stream_t *context,
parsec_dtd_unpack_args( parsec_dtd_unpack_args(
this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK ); this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK );
CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt );
CORE_ztplqt( M, N, L, ib, CORE_ztplqt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK ); A, lda, B, ldb, T, ldt, WORK );
......
...@@ -40,6 +40,7 @@ CORE_ztpqrt_parsec( parsec_execution_stream_t *context, ...@@ -40,6 +40,7 @@ CORE_ztpqrt_parsec( parsec_execution_stream_t *context,
parsec_dtd_unpack_args( parsec_dtd_unpack_args(
this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK ); this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK );
CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
CORE_ztpqrt( M, N, L, ib, CORE_ztpqrt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK ); A, lda, B, ldb, T, ldt, WORK );
......
...@@ -40,6 +40,7 @@ void CORE_zgelqt_quark(Quark *quark) ...@@ -40,6 +40,7 @@ void CORE_zgelqt_quark(Quark *quark)
CHAMELEON_Complex64_t *WORK; CHAMELEON_Complex64_t *WORK;