Commit fa6d78a3 authored by Mathieu Faverge

Merge branch 'starpu/alloc_on_the_fly' into 'master'

Starpu/alloc on the fly

See merge request solverstack/chameleon!140
parents e932eacc f676d524
......@@ -72,11 +72,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
}
if ( m >= P ) {
INSERT_TASK_dgeadd(
options,
ChamNoTrans, 1, tempnn, A->nb,
1.0, W( Wcol, m, n ), 1,
1.0, W( Wcol, m%P, n ), 1 );
INSERT_TASK_daxpy(
options, tempnn, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m%P, n ), 1 );
}
}
......@@ -85,11 +84,10 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
* For each i, W(i, n) = reduce( W(0..P-1, n) )
*/
for(m = 1; m < P; m++) {
INSERT_TASK_dgeadd(
options,
ChamNoTrans, 1, tempnn, A->nb,
1.0, W( Wcol, m, n ), 1,
1.0, W( Wcol, 0, n ), 1 );
INSERT_TASK_daxpy(
options, tempnn, 1.,
W( Wcol, m, n ), 1,
W( Wcol, 0, n ), 1 );
}
INSERT_TASK_dlange(
......@@ -165,11 +163,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
}
if ( n >= Q ) {
INSERT_TASK_dgeadd(
options,
ChamNoTrans, tempmm, 1, A->mb,
1.0, W( Wcol, m, n ), tempmm,
1.0, W( Wcol, m, n%Q), tempmm );
INSERT_TASK_daxpy(
options, tempmm, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m, n%Q ), 1 );
}
}
......@@ -178,11 +175,10 @@ chameleon_pzlange_inf( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
* For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_dgeadd(
options,
ChamNoTrans, tempmm, 1, A->mb,
1.0, W( Wcol, m, n), tempmm,
1.0, W( Wcol, m, 0), tempmm );
INSERT_TASK_daxpy(
options, tempmm, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m, 0 ), 1 );
}
INSERT_TASK_dlange(
......@@ -407,11 +403,14 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
case ChamOneNorm:
RUNTIME_options_ws_alloc( &options, 1, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, A->nb, A->nb,
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 1, A->nb, A->nb,
workmt, worknt * A->nb, 0, 0, workmt, worknt * A->nb, A->p, A->q,
NULL, NULL, NULL );
wcol_init = 1;
/*
* Use the global allocator for Welt, otherwise flush may free the data before the result is read.
*/
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
A->p, worknt, 0, 0, A->p, worknt, A->p, A->q,
NULL, NULL, NULL );
......@@ -424,7 +423,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
case ChamInfNorm:
RUNTIME_options_ws_alloc( &options, A->mb, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, A->mb, 1, A->mb,
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
NULL, NULL, NULL );
wcol_init = 1;
......@@ -522,7 +521,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
}
CHAMELEON_Desc_Flush( &Welt, sequence );
CHAMELEON_Desc_Flush( A, sequence );
RUNTIME_sequence_wait(chamctxt, sequence);
RUNTIME_sequence_wait( chamctxt, sequence );
*result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ));
......
......@@ -81,11 +81,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A,
int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
for(n = Q; n < NT; n++) {
INSERT_TASK_dgeadd(
options,
ChamNoTrans, tempmm, 1, A->nb,
1.0, W( Wcol, m, n ), tempmm,
1.0, W( Wcol, m, n%Q), tempmm );
INSERT_TASK_daxpy(
options, tempmm, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m, n%Q ), 1 );
}
/**
......@@ -93,11 +92,10 @@ chameleon_pzlansy_inf( cham_uplo_t uplo, CHAM_desc_t *A,
* For each j, W(m, j) = reduce( Wcol(m, 0..Q-1) )
*/
for(n = 1; n < Q; n++) {
INSERT_TASK_dgeadd(
options,
ChamNoTrans, tempmm, 1, A->mb,
1.0, W( Wcol, m, n), tempmm,
1.0, W( Wcol, m, 0), tempmm );
INSERT_TASK_daxpy(
options, tempmm, 1.,
W( Wcol, m, n ), 1,
W( Wcol, m, 0 ), 1 );
}
INSERT_TASK_dlange(
......@@ -334,11 +332,14 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
case ChamInfNorm:
RUNTIME_options_ws_alloc( &options, 1, 0 );
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, A->mb, 1, A->mb,
chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, A->mb, 1, A->mb,
workmt * A->mb, worknt, 0, 0, workmt * A->mb, worknt, A->p, A->q,
NULL, NULL, NULL );
wcol_init = 1;
/*
* Use the global allocator for Welt, otherwise flush may free the data before the result is read.
*/
chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 1, 1, 1,
workmt, A->q, 0, 0, workmt, A->q, A->p, A->q,
NULL, NULL, NULL );
......
......@@ -466,6 +466,12 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
RUNTIME_data_flush( sequence, T(k, n) );
}
/* Restore the original location of the tiles */
for (m = 0; m < B->mt; m++) {
RUNTIME_data_migrate( sequence, B( m, k ),
B->get_rankof( B, m, k ) );
}
RUNTIME_iteration_pop(chamctxt);
}
}
......
......@@ -467,6 +467,12 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
RUNTIME_data_flush( sequence, T(n, k) );
}
/* Restore the original location of the tiles */
for (m = 0; m < B->mt; m++) {
RUNTIME_data_migrate( sequence, B(m, k),
B->get_rankof( B, m, k ) );
}
RUNTIME_iteration_pop(chamctxt);
}
}
......
......@@ -266,7 +266,7 @@ int CHAMELEON_zlaset_Tile_Async( cham_uplo_t uplo,
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
}
/* Check input arguments */
if (A->nb != A->mb) {
if ( (alpha != beta) && (A->nb != A->mb) ) {
chameleon_error("CHAMELEON_zlaset_Tile_Async", "only square tiles supported");
return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
}
......
......@@ -134,7 +134,7 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree,
static inline int
chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int q ) {
int diag_m = chameleon_min( m, n );
return chameleon_desc_init( descA, CHAMELEON_MAT_ALLOC_GLOBAL,
return chameleon_desc_init( descA, CHAMELEON_MAT_ALLOC_TILE,
ChamComplexDouble, nb, nb, nb*nb,
diag_m, nb, 0, 0, diag_m, nb, p, q,
chameleon_getaddr_diag,
......@@ -145,7 +145,7 @@ chameleon_zdesc_alloc_diag( CHAM_desc_t *descA, int nb, int m, int n, int p, int
#define chameleon_zdesc_alloc( descA, mb, nb, lm, ln, i, j, m, n, free) \
{ \
int rc; \
rc = chameleon_desc_init( &(descA), CHAMELEON_MAT_ALLOC_GLOBAL, \
rc = chameleon_desc_init( &(descA), CHAMELEON_MAT_ALLOC_TILE, \
ChamComplexDouble, (mb), (nb), ((mb)*(nb)), \
(m), (n), (i), (j), (m), (n), 1, 1, \
NULL, NULL, NULL ); \
......@@ -174,7 +174,7 @@ chameleon_zlap2tile( CHAM_context_t *chamctxt,
if ( CHAMELEON_TRANSLATION == ChamOutOfPlace ) {
/* Initialize the tile descriptor */
chameleon_desc_init( descAt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamComplexDouble, mb, nb, (mb)*(nb),
chameleon_desc_init( descAt, CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, mb, nb, (mb)*(nb),
lm, ln, 0, 0, m, n, 1, 1,
chameleon_getaddr_ccrb, chameleon_getblkldd_ccrb, NULL );
......@@ -235,6 +235,7 @@ chameleon_ztile2lap( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t
/**
 * Destroys the two descriptors handed to it, descAl first and then descAt.
 *
 * NOTE(review): the names suggest descAl is the LAPACK-layout descriptor and
 * descAt the tile-layout one -- confirm against the callers.
 *
 * @param chamctxt  Not used by this function; explicitly cast to void.
 * @param descAl    First descriptor passed to chameleon_desc_destroy().
 * @param descAt    Second descriptor passed to chameleon_desc_destroy().
 */
static inline void
chameleon_ztile2lap_cleanup( CHAM_context_t *chamctxt, CHAM_desc_t *descAl, CHAM_desc_t *descAt )
{
/* The context is kept in the signature only; silence the unused-parameter warning */
(void)chamctxt;
chameleon_desc_destroy( descAl );
chameleon_desc_destroy( descAt );
}
......
......@@ -226,26 +226,32 @@ int chameleon_desc_init( CHAM_desc_t *desc, void *mat,
/* The matrix is allocated tile by tile when out-of-core is used */
desc->ooc = 0;
// Matrix address
if ( mat == CHAMELEON_MAT_ALLOC_GLOBAL ) {
rc = chameleon_desc_mat_alloc( desc );
switch ( (intptr_t)mat ) {
case (intptr_t)CHAMELEON_MAT_ALLOC_TILE:
if ( chamctxt->scheduler == RUNTIME_SCHED_STARPU ) {
/* Let's use the allocation on the fly as in OOC */
desc->get_blkaddr = chameleon_getaddr_null;
desc->mat = NULL;
break;
}
/* Otherwise we switch back to the full allocation */
desc->alloc_mat = 1;
desc->use_mat = 1;
}
else if ( mat == CHAMELEON_MAT_ALLOC_TILE ) {
//chameleon_error( "chameleon_desc_init", "CHAMELEON_MAT_ALLOC_TILE is not available yet" );
//desc->mat = NULL;
case (intptr_t)CHAMELEON_MAT_ALLOC_GLOBAL:
rc = chameleon_desc_mat_alloc( desc );
desc->alloc_mat = 1;
desc->use_mat = 1;
break;
desc->alloc_mat = 1;
}
else if ( mat == CHAMELEON_MAT_OOC ) {
case (intptr_t)CHAMELEON_MAT_OOC:
if ( chamctxt->scheduler != RUNTIME_SCHED_STARPU ) {
chameleon_error("CHAMELEON_Desc_Create", "CHAMELEON Out-of-Core descriptors are supported only with StarPU");
return CHAMELEON_ERR_NOT_SUPPORTED;
}
desc->mat = NULL;
desc->ooc = 1;
}
else {
break;
default:
/* memory of the matrix is handled by users */
desc->mat = mat;
desc->use_mat = 1;
......
......@@ -74,7 +74,8 @@ int chameleon_alloc_ibnb_tile(int M, int N, cham_tasktype_t func, int type, CHAM
lm = IB * MT;
ln = NB * NT;
return CHAMELEON_Desc_Create( desc, NULL, type, IB, NB, IB*NB, lm, ln, 0, 0, lm, ln, p, q );
return CHAMELEON_Desc_Create( desc, CHAMELEON_MAT_ALLOC_TILE, type, IB, NB, IB*NB,
lm, ln, 0, 0, lm, ln, p, q );
}
/**
......@@ -119,7 +120,8 @@ int chameleon_alloc_ipiv(int M, int N, cham_tasktype_t func, int type, CHAM_desc
/* TODO: Fix the distribution for IPIV */
*IPIV = (int*)malloc( size );
return CHAMELEON_Desc_Create( desc, NULL, type, IB, NB, IB*NB, lm, ln, 0, 0, lm, ln, p, q );
return CHAMELEON_Desc_Create( desc, CHAMELEON_MAT_ALLOC_TILE, type, IB, NB, IB*NB,
lm, ln, 0, 0, lm, ln, p, q );
}
/**
......
......@@ -67,7 +67,7 @@
* The leading dimension of the array A. LDA >= max(1,M).
*
* @param[out] T
* The IB-by-N triangular factor T of the block reflector.
* The IB-by-M triangular factor T of the block reflector.
* T is upper triangular by block (economic storage);
* The rest of the array is not referenced.
*
......
......@@ -483,6 +483,8 @@ INSERT_TASK_ztsmlq( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{
(void)m1;
(void)n1;
return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
......@@ -497,6 +499,8 @@ INSERT_TASK_ztsmqr( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{
(void)m1;
(void)n1;
return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
......@@ -511,6 +515,8 @@ INSERT_TASK_zttmlq( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{
(void)m1;
(void)n1;
return INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
......@@ -525,6 +531,8 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options,
const CHAM_desc_t *V, int Vm, int Vn, int ldv,
const CHAM_desc_t *T, int Tm, int Tn, int ldt )
{
(void)m1;
(void)n1;
return INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb,
V, Vm, Vn, ldv, T, Tm, Tn, ldt,
A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
......
......@@ -98,10 +98,13 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0])
{
CHAMELEON_Complex64_t TAU[ws_size];
CHAMELEON_Complex64_t *work = TAU + chameleon_max( m, n );
CORE_zgelqt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
CORE_zlaset( ChamUpperLower, ib, m, 0., 0., ptrT, ldt );
CORE_zgelqt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work );
}
}
......@@ -99,10 +99,13 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(inout:ptrT[0])
#pragma omp task firstprivate(ws_size, m, n, ib, ptrA, lda, ptrT, ldt) depend(inout:ptrA[0]) depend(out:ptrT[0])
{
CHAMELEON_Complex64_t TAU[ws_size];
CHAMELEON_Complex64_t *work = TAU + chameleon_max(m, n);
CORE_zgeqrt(m, n, ib, ptrA, lda, ptrT, ldt, TAU, work);
CORE_zlaset( ChamUpperLower, ib, n, 0., 0., ptrT, ldt );
CORE_zgeqrt( m, n, ib, ptrA, lda, ptrT, ldt, TAU, work );
}
}
......@@ -31,9 +31,13 @@ INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0], ptrT[0])
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrA, lda, ptrB, ldb, ptrT, ldt) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0])
{
CHAMELEON_Complex64_t work[ws_size];
CORE_zlaset( ChamUpperLower, ib, M, 0., 0., ptrT, ldt);
CORE_ztplqt( M, N, L, ib,
ptrA, lda, ptrB, ldb, ptrT, ldt, work );
}
......
......@@ -30,9 +30,13 @@ INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
CHAMELEON_Complex64_t *ptrB = RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn);
CHAMELEON_Complex64_t *ptrT = RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn);
int ws_size = options->ws_wsize;
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(in:ptrT[0]) depend(inout:ptrA[0], ptrB[0])
#pragma omp task firstprivate(ws_size, M, N, L, ib, ptrT, ldt, ptrA, lda, ptrB, ldb) depend(inout:ptrA[0], ptrB[0]) depend(out:ptrT[0])
{
CHAMELEON_Complex64_t tmp[ws_size];
CORE_zlaset( ChamUpperLower, ib, N, 0., 0., ptrT, ldt);
CORE_ztpqrt( M, N, L, ib,
ptrA, lda, ptrB, ldb, ptrT, ldt, tmp );
}
......
......@@ -98,6 +98,7 @@ CORE_zgelqt_parsec( parsec_execution_stream_t *context,
parsec_dtd_unpack_args(
this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK );
CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt );
CORE_zgelqt( m, n, ib, A, lda, T, ldt, TAU, WORK );
(void)context;
......
......@@ -99,6 +99,7 @@ CORE_zgeqrt_parsec ( parsec_execution_stream_t *context,
parsec_dtd_unpack_args(
this_task, &m, &n, &ib, &A, &lda, &T, &ldt, &TAU, &WORK );
CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt );
CORE_zgeqrt( m, n, ib, A, lda, T, ldt, TAU, WORK );
(void)context;
......
......@@ -40,6 +40,7 @@ CORE_ztplqt_parsec( parsec_execution_stream_t *context,
parsec_dtd_unpack_args(
this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK );
CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt );
CORE_ztplqt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
......
......@@ -40,6 +40,7 @@ CORE_ztpqrt_parsec( parsec_execution_stream_t *context,
parsec_dtd_unpack_args(
this_task, &M, &N, &L, &ib, &A, &lda, &B, &ldb, &T, &ldt, &WORK );
CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
CORE_ztpqrt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
......
......@@ -40,6 +40,7 @@ void CORE_zgelqt_quark(Quark *quark)
CHAMELEON_Complex64_t *WORK;
quark_unpack_args_9(quark, m, n, ib, A, lda, T, ldt, TAU, WORK);
CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt );
CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
......
......@@ -40,6 +40,7 @@ void CORE_zgeqrt_quark(Quark *quark)
CHAMELEON_Complex64_t *WORK;
quark_unpack_args_9(quark, m, n, ib, A, lda, T, ldt, TAU, WORK);
CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt );
CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
......
......@@ -39,6 +39,7 @@ CORE_ztplqt_quark( Quark *quark )
quark_unpack_args_11( quark, M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
/*
 * Reset the ib-by-M region of T before the factorization.
 * The column count is M, as in the other runtime backends of ztplqt.
 */
CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt );
CORE_ztplqt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
}
......
......@@ -39,6 +39,7 @@ CORE_ztpqrt_quark( Quark *quark )
quark_unpack_args_11( quark, M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
CORE_ztpqrt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
}
......
......@@ -26,6 +26,36 @@
#include "chameleon_starpu.h"
#include "runtime_codelet_z.h"
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU function of the zgelqt StarPU codelet.
 *
 * Reads three matrix buffers from descr (A, T and a TAU/WORK scratch area),
 * unpacks the scalar arguments from cl_arg, calls CORE_zlaset() with both
 * fill values set to 0. on an ib-by-m region of T, and then calls
 * CORE_zgelqt().
 *
 * @param descr   StarPU buffer array; entries 0, 1 and 2 are read as matrices.
 * @param cl_arg  Packed arguments, unpacked as m, n, ib, lda, ldt, h_work.
 */
static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
{
/* h_work is filled by the unpack call below but not used further here */
CHAMELEON_starpu_ws_t *h_work;
int m;
int n;
int ib;
CHAMELEON_Complex64_t *A;
int lda;
CHAMELEON_Complex64_t *T;
int ldt;
CHAMELEON_Complex64_t *TAU, *WORK;
/* Buffer order in descr: A, T, then the scratch area shared by TAU and WORK */
A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + ib*n */
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* WORK starts max(m, n) elements after TAU inside the same buffer */
WORK = TAU + chameleon_max( m, n );
/*
 * Reset T before the factorization.
 * NOTE(review): presumably needed because T may be a freshly allocated,
 * uninitialized tile -- confirm.
 */
CORE_zlaset( ChamUpperLower, ib, m, 0., 0., T, ldt );
CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
/**
*
* @ingroup INSERT_TASK_Complex64_t
......@@ -87,7 +117,6 @@
* \retval <0 if -i, the i-th argument had an illegal value
*
*/
void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
int m, int n, int ib, int nb,
const CHAM_desc_t *A, int Am, int An, int lda,
......@@ -123,33 +152,3 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options,
#endif
0);
}
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU function of the zgelqt StarPU codelet.
 *
 * Reads three matrix buffers from descr (A, T and a TAU/WORK scratch area),
 * unpacks the scalar arguments from cl_arg and calls CORE_zgelqt().
 *
 * @param descr   StarPU buffer array; entries 0, 1 and 2 are read as matrices.
 * @param cl_arg  Packed arguments, unpacked as m, n, ib, lda, ldt, h_work.
 */
static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
{
/* h_work is filled by the unpack call below but not used further here */
CHAMELEON_starpu_ws_t *h_work;
int m;
int n;
int ib;
CHAMELEON_Complex64_t *A;
int lda;
CHAMELEON_Complex64_t *T;
int ldt;
CHAMELEON_Complex64_t *TAU, *WORK;
/* Buffer order in descr: A, T, then the scratch area shared by TAU and WORK */
A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + ib*n */
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* WORK starts max(m, n) elements after TAU inside the same buffer */
WORK = TAU + chameleon_max( m, n );
CORE_zgelqt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
......@@ -35,7 +35,7 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
cham_trans_t transA, cham_trans_t transB,
int m, int n, int k, int nb,
CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
const CHAM_desc_t *B, int Bm, int Bn, int ldb,
const CHAM_desc_t *B, int Bm, int Bn, int ldb,
CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc)
{
(void)nb;
......
......@@ -26,6 +26,37 @@
#include "chameleon_starpu.h"
#include "runtime_codelet_z.h"
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU function of the zgeqrt StarPU codelet.
 *
 * Reads three matrix buffers from descr (A, T and a TAU/WORK scratch area),
 * unpacks the scalar arguments from cl_arg, calls CORE_zlaset() with both
 * fill values set to 0. on an ib-by-n region of T, and then calls
 * CORE_zgeqrt().
 *
 * @param descr   StarPU buffer array; entries 0, 1 and 2 are read as matrices.
 * @param cl_arg  Packed arguments, unpacked as m, n, ib, lda, ldt, h_work.
 */
static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
{
/* h_work is filled by the unpack call below but not used further here */
CHAMELEON_starpu_ws_t *h_work;
int m;
int n;
int ib;
CHAMELEON_Complex64_t *A;
int lda;
CHAMELEON_Complex64_t *T;
int ldt;
CHAMELEON_Complex64_t *TAU, *WORK;
/* Buffer order in descr: A, T, then the scratch area shared by TAU and WORK */
A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + n * ib */
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* WORK starts max(m, n) elements after TAU inside the same buffer */
WORK = TAU + chameleon_max( m, n );
/*
 * Reset T before the factorization.
 * NOTE(review): presumably needed because T may be a freshly allocated,
 * uninitialized tile -- confirm.
 */
CORE_zlaset( ChamUpperLower, ib, n, 0., 0., T, ldt );
CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
/**
*
* @ingroup INSERT_TASK_Complex64_t
......@@ -88,7 +119,6 @@
* \retval <0 if -i, the i-th argument had an illegal value
*
*/
void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
int m, int n, int ib, int nb,
const CHAM_desc_t *A, int Am, int An, int lda,
......@@ -124,33 +154,3 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options,
#endif
0);
}
#if !defined(CHAMELEON_SIMULATION)
/**
 * CPU function of the zgeqrt StarPU codelet.
 *
 * Reads three matrix buffers from descr (A, T and a TAU/WORK scratch area),
 * unpacks the scalar arguments from cl_arg and calls CORE_zgeqrt().
 *
 * @param descr   StarPU buffer array; entries 0, 1 and 2 are read as matrices.
 * @param cl_arg  Packed arguments, unpacked as m, n, ib, lda, ldt, h_work.
 */
static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
{
/* h_work is filled by the unpack call below but not used further here */
CHAMELEON_starpu_ws_t *h_work;
int m;
int n;
int ib;
CHAMELEON_Complex64_t *A;
int lda;
CHAMELEON_Complex64_t *T;
int ldt;
CHAMELEON_Complex64_t *TAU, *WORK;
/* Buffer order in descr: A, T, then the scratch area shared by TAU and WORK */
A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
T = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); /* max(m,n) + n * ib */
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* WORK starts max(m, n) elements after TAU inside the same buffer */
WORK = TAU + chameleon_max( m, n );
CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
......@@ -24,10 +24,10 @@
#include "chameleon_starpu.h"
#include "runtime_codelet_z.h"
void INSERT_TASK_zlange(const RUNTIME_option_t *options,
cham_normtype_t norm, int M, int N, int NB,
const CHAM_desc_t *A, int Am, int An, int LDA,
const CHAM_desc_t *B, int Bm, int Bn)
void INSERT_TASK_zlange( const RUNTIME_option_t *options,
cham_normtype_t norm, int M, int N, int NB,
const CHAM_desc_t *A, int Am, int An, int LDA,
const CHAM_desc_t *B, int Bm, int Bn )
{
(void)NB;
struct starpu_codelet *codelet = &cl_zlange;
......
......@@ -43,6 +43,7 @@ static void cl_ztplqt_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib,
&lda, &ldb, &ldt );
CORE_zlaset( ChamUpperLower, ib, M, 0., 0., T, ldt );
CORE_ztplqt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
}
......
......@@ -43,6 +43,7 @@ static void cl_ztpqrt_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib,
&lda, &ldb, &ldt );
CORE_zlaset( ChamUpperLower, ib, N, 0., 0., T, ldt );
CORE_ztpqrt( M, N, L, ib,
A, lda, B, ldb, T, ldt, WORK );
}
......
......@@ -238,7 +238,7 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc )
for (m = 0; m < lmt; m++)
{
if (*handle != NULL) {
starpu_data_unregister(*handle);
starpu_data_unregister_submit(*handle);
}
handle++;
}
......
......@@ -49,9 +49,9 @@ int RUNTIME_options_ws_alloc( RUNTIME_option_t *options, size_t worker_size, siz
int ret = 0;
if ( worker_size > 0 ) {
options->ws_wsize = worker_size;
starpu_vector_data_register((starpu_data_handle_t*)(&(options->ws_worker)),
-1, (uintptr_t)NULL,
worker_size, sizeof(char));
starpu_matrix_data_register( (starpu_data_handle_t*)(&(options->ws_worker)),
-1, (uintptr_t)NULL,
worker_size, worker_size, 1, sizeof(char));
}
if ( host_size > 0 ) {
options->ws_hsize = host_size;
......
......@@ -103,7 +103,6 @@ int testing_zgels(int argc, char **argv)
}