Commit 22e8ea8e authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge back the twosided branch

parent fd42e344
......@@ -179,16 +179,16 @@ set(ZSRC
##################
# OTHERS
##################
#pzgebrd_ge2tb.c
#pzgebrd_tb2bd.c
pztile2band.c
#pzgebrd_gb2bd.c
pzgebrd_ge2gb.c
#pzgetmi2.c
#pzgetrf_reclap.c
#pzgetrf_rectil.c
#pzhbcpy_t2bl.c
#pzhegst.c
#pzherbt.c
#pzhetrd_hb2st.c
#pzhetrd_he2hb.c
#pzhetrd_hb2ht.c
pzhetrd_he2hb.c
#pzlarft_blgtrd.c
#pzlaswp.c
#pzlaswpc.c
......@@ -198,16 +198,16 @@ set(ZSRC
#zgebrd.c
#zgecfi.c
#zgecfi2.c
#zgesvd.c
zgesvd.c
#zgetmi.c
#zgetri.c
#zgetrs.c
#zheev.c
#zheevd.c
zheevd.c
#zhegst.c
#zhegv.c
#zhegvd.c
#zhetrd.c
zhetrd.c
#zlaswp.c
#zlaswpc.c
#ztrsmrv.c
......
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2014 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @file pzgebrd_ge2gb.c
*
* PLASMA auxiliary routines
* PLASMA is a software package provided by Univ. of Tennessee,
* Univ. of California Berkeley and Univ. of Colorado Denver
*
* @version 2.8.0
* @author Hatem Ltaief
* @author Azzam Haidar
* @date 2010-11-15
* @precisions normal z -> s d c
*
**/
#include "control/common.h"
void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
MORSE_sequence_t *sequence, MORSE_request_t *request)
{
int k;
int tempkm, tempkn;
if (A.m >= A.n){
for (k = 0; k < A.nt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
morse_pzgeqrf(
morse_desc_submatrix(&A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
sequence, request);
morse_pzunmqr(
MorseLeft,
MorseConjTrans,
morse_desc_submatrix(&A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb),
morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
sequence, request);
if (k+1 < A.nt){
tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb;
morse_pzgelqf(
morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
morse_desc_submatrix(&T, k*T.mb, (k+1)*T.nb, T.mb, T.n-(k+1)*T.nb),
sequence, request);
morse_pzunmlq(
MorseRight, MorseConjTrans,
morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
morse_desc_submatrix(&A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
morse_desc_submatrix(&T, k*T.mb, (k+1)*T.nb, T.mb, T.n-(k+1)*T.nb),
sequence, request);
}
}
}
else{
for (k = 0; k < A.mt; k++) {
tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb;
tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb;
morse_pzgelqf(
morse_desc_submatrix(&A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.mb, T.n-k*T.nb),
sequence, request);
morse_pzunmlq(
MorseRight, MorseConjTrans,
morse_desc_submatrix(&A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb),
morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.mb, T.n-k*T.nb),
sequence, request);
if (k+1 < A.mt){
tempkm = k+1 == A.mt-1 ? A.m-(k+1)*A.mb : A.mb;
morse_pzgeqrf(
morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
morse_desc_submatrix(&T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
sequence, request);
morse_pzunmqr(
MorseLeft, MorseConjTrans,
morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
morse_desc_submatrix(&A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
morse_desc_submatrix(&T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
sequence, request);
}
}
}
}
......@@ -53,7 +53,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
int k, m, n;
int ldak, ldam;
int tempkm, tempkn, tempmm, tempnn;
int ib, minMT;
int ib, minMNT;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -63,9 +63,9 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
ib = MORSE_IB;
if (A->m > A->n) {
minMT = A->nt;
minMNT = A->nt;
} else {
minMT = A->mt;
minMNT = A->mt;
}
/*
......@@ -114,7 +114,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q);
#endif
for (k = 0; k < min(A->mt, A->nt); k++) {
for (k = 0; k < minMNT; k++) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
......@@ -181,4 +181,5 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
(void)DIAG;
}
......@@ -59,7 +59,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
int ldak, ldam;
int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -112,9 +111,11 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
#if defined(CHAMELEON_COPY_DIAG)
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
{
int nblk = ( A->nt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
}
#endif
for (k = 0; k < min(A->mt, A->nt); k++) {
......@@ -212,4 +213,5 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
(void)DIAG;
}
......@@ -175,4 +175,5 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
(void)DIAG;
}
......@@ -57,7 +57,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
int ldaM, ldam, ldaMRD;
int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
int ib;
int nblk;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
......@@ -109,10 +108,12 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
#if defined(CHAMELEON_COPY_DIAG)
/* necessary to avoid dependencies between tasks regarding the diag tile */
nblk = ( A->mt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
{
/* necessary to avoid dependencies between tasks regarding the diag tile */
int nblk = ( A->mt + BS -1 ) / BS;
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
}
#endif
K = min(A->mt, A->nt);
......@@ -211,4 +212,5 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
morse_desc_mat_free(DIAG);
free(DIAG);
#endif
(void)DIAG;
}
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2014 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @file pzhetrd_he2hb.c
*
* PLASMA auxiliary routines
* PLASMA is a software package provided by Univ. of Tennessee,
* Univ. of California Berkeley and Univ. of Colorado Denver
*
* @version 2.7.1
* @author Hatem Ltaief
* @author Azzam Haidar
* @date 2010-11-15
* @precisions normal z -> s d c
*
**/
#include "control/common.h"
#define A(m, n) A, m, n
#define T(m, n) T, m, n
#define D(k) D, (k)-1, 0
#define AT(k) AT, k, 0
#if defined(CHAMELEON_COPY_DIAG)
#define E(m, n) E, m, 0
#else
#define E(m, n) A, m, n
#endif
/***************************************************************************//**
* Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
**/
void morse_pzhetrd_he2hb(MORSE_enum uplo,
MORSE_desc_t *A, MORSE_desc_t *T,
MORSE_sequence_t *sequence, MORSE_request_t *request)
{
MORSE_context_t *morse;
MORSE_option_t options;
MORSE_desc_t *E = NULL;
MORSE_desc_t *D = NULL;
MORSE_desc_t *AT = NULL;
size_t ws_worker = 0;
size_t ws_host = 0;
int k, m, n, i, j;
int ldak, ldak1, ldam, ldan, ldaj, ldai;
int tempkm, tempkn, tempmm, tempnn, tempjj;
int ib;
morse = morse_context_self();
if (sequence->status != MORSE_SUCCESS)
return;
RUNTIME_options_init(&options, morse, sequence, request);
ib = MORSE_IB;
/*
* zgeqrt = A->nb * (ib+1)
* zunmqr = A->nb * ib
* ztsqrt = A->nb * (ib+1)
* ztsmqr = A->nb * ib
* zherfb = A->nb * ib
* ztsmqr_hetra1 = A->nb * ib
*/
ws_worker = A->nb * (ib+1);
/* Allocation of temporary (scratch) working space */
#if defined(CHAMELEON_USE_CUDA)
/* Worker space
*
* zunmqr = A->nb * ib
* ztsmqr = 2 * A->nb * ib
* zherfb = A->nb * ib
*/
ws_worker = max( ws_worker, ib * A->nb * 2 );
#endif
#if defined(CHAMELEON_USE_MAGMA)
/* Worker space
*
* zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
* ztsqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
*/
ws_worker = max( ws_worker, ib * (ib + A->nb) );
/* Host space
*
* zgeqrt = ib * (A->mb+3*ib) + A->mb )
* ztsqrt = 2 * ib * (A->nb+ib) + A->nb
*/
ws_host = max( ws_host, ib * (A->mb + 3 * ib) + A->mb );
ws_host = max( ws_host, 2 * ib * (A->nb + ib) + A->nb );
#endif
ws_worker *= sizeof(MORSE_Complex64_t);
ws_host *= sizeof(MORSE_Complex64_t);
RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
#if defined(CHAMELEON_COPY_DIAG)
/* Copy of the extra-diagonal to generate more parallelism by releasing anti-dependencies on UNMQR/TSMQR triangle conflict */
E = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*E, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q);
#endif
/* Copy of the diagonal tiles to keep the general version of the tile all along the computation */
D = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*D, A->mb, A->nb, min(A->m, A->n) - A->mb, A->nb, 0, 0, min(A->m, A->n) - A->mb, A->nb, A->p, A->q);
AT = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
*AT = morse_desc_init(
MorseComplexDouble, A->mb, A->nb, (A->mb*A->nb),
min(A->mt, A->nt) * A->mb, A->nb, 0, 0, min(A->mt, A->nt) * A->mb, A->nb, 1, 1);
morse_desc_mat_alloc( AT );
/* Let's extract the diagonal in a temporary copy that contains A and A' */
for (k = 1; k < A->nt; k++){
tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
ldak = BLKLDD(A, k);
MORSE_TASK_zhe2ge(&options,
uplo,
tempkn, tempkn, ldak,
A(k, k), ldak,
D(k), ldak);
}
if (uplo == MorseLower) {
for (k = 0; k < A->nt-1; k++){
tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb;
tempkn = k == A->nt-1 ? A->n- k *A->nb : A->nb;
ldak1 = BLKLDD(A, k+1);
MORSE_TASK_zgeqrt(
&options,
tempkm, tempkn, ib, A->nb,
A(k+1, k), ldak1,
T(k+1, k), T->mb);
#if defined(CHAMELEON_COPY_DIAG)
MORSE_TASK_zlacpy(
&options,
MorseLower, tempkm, tempkn, A->nb,
A(k+1, k), ldak1,
E(k+1, k), ldak1 );
#if defined(CHAMELEON_USE_CUDA)
MORSE_TASK_zlaset(
&options,
MorseUpper, tempkm, tempkn,
0., 1.,
E(k+1, k), ldak1 );
#endif
#endif
/* LEFT and RIGHT on the symmetric diagonal block */
MORSE_TASK_zherfb(
&options,
MorseLower,
tempkm, tempkm, ib, A->nb,
E(k+1, k), ldak1,
T(k+1, k), T->mb,
D(k+1), ldak1);
/* RIGHT on the remaining tiles until the bottom */
for (m = k+2; m < A->mt ; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
MORSE_TASK_zunmqr(
&options,
MorseRight, MorseNoTrans,
tempmm, A->nb, tempkm, ib, A->nb,
E(k+1, k), ldak1,
T(k+1, k), T->mb,
A(m, k+1), ldam);
}
for (m = k+2; m < A->mt; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
options.priority = 1;
MORSE_TASK_ztsqrt(
&options,
tempmm, A->nb, ib, A->nb,
A(k+1, k), ldak1,
A(m , k), ldam,
T(m , k), T->mb);
options.priority = 0;
/* LEFT */
for (i = k+2; i < m; i++) {
ldai = BLKLDD(A, i);
MORSE_TASK_ztsmqr_hetra1(
&options,
MorseLeft, MorseConjTrans,
A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
A(i, k+1), ldai,
A(m, i), ldam,
A(m, k), ldam,
T(m, k), T->mb);
}
/* RIGHT */
for (j = m+1; j < A->mt ; j++) {
tempjj = j == A->mt-1 ? A->m-j*A->mb : A->mb;
ldaj = BLKLDD(A, j);
MORSE_TASK_ztsmqr(
&options,
MorseRight, MorseNoTrans,
tempjj, A->nb, tempjj, tempmm, A->nb, ib, A->nb,
A(j, k+1), ldaj,
A(j, m), ldaj,
A(m, k), ldam,
T(m, k), T->mb);
}
/* LEFT->RIGHT */
options.priority = 1;
/**
* Compute the update of the following:
*
* A1 A2'
* A2 A3
*
* with A1 and A3 two diagonal tiles. This is the tsmqr_corner
* from plasma split in 4 tasks
*/
/* Copy the transpose of A2 (m, k+1): AT(k) <- A2' = A2(k+1, m) */
MORSE_TASK_zlatro(
&options,
MorseUpperLower, MorseConjTrans,
tempmm, A->nb, A->nb,
A(m, k+1), ldam,
AT(m), ldak1);
/* Left application on |A1| */
/* |A2| */
MORSE_TASK_ztsmqr(
&options,
MorseLeft, MorseConjTrans,
A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
D(k+1), ldak1,
A(m, k+1), ldam,
A(m, k), ldam,
T(m, k), T->mb);
/* Left application on | A2'| */
/* | A3 | */
MORSE_TASK_ztsmqr(
&options,
MorseLeft, MorseConjTrans,
A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb,
AT(m), ldak1,
D(m) , ldam,
A(m, k), ldam,
T(m, k), T->mb);
/* Right application on | A1 A2' | */
MORSE_TASK_ztsmqr(
&options,
MorseRight, MorseNoTrans,
A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb,
D(k+1), ldak1,
AT(m) , ldak1,
A(m, k), ldam,
T(m, k), T->mb);
/* Right application on | A2 A3 | */
MORSE_TASK_ztsmqr(
&options,
MorseRight, MorseNoTrans,
tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb,
A(m, k+1), ldam,
D(m) , ldam,
A(m, k), ldam,
T(m, k), T->mb);
options.priority = 0;
}
}
}
else {
for (k = 0; k < A->nt-1; k++){
tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb;
tempkm = k == A->mt-1 ? A->m- k *A->mb : A->mb;
ldak = BLKLDD(A, k);
ldak1 = BLKLDD(A, k+1);
MORSE_TASK_zgelqt(
&options,
tempkm, tempkn, ib, A->nb,
A(k, k+1), ldak,
T(k, k+1), T->mb);
#if defined(CHAMELEON_COPY_DIAG)
MORSE_TASK_zlacpy(
&options,
MorseUpper, tempkm, tempkn, A->nb,
A(k, k+1), ldak,
E(k, k+1), ldak );
#if defined(CHAMELEON_USE_CUDA)
MORSE_TASK_zlaset(
&options,
MorseLower, tempkm, tempkn,
0., 1.,
E(k, k+1), ldak );
#endif
#endif
/* RIGHT and LEFT on the symmetric diagonal block */
MORSE_TASK_zherfb(
&options,
MorseUpper,
tempkn, tempkn, ib, A->nb,
E(k, k+1), ldak,
T(k, k+1), T->mb,
D(k+1), ldak1);
/* LEFT on the remaining tiles until the left side */
for (n = k+2; n < A->nt ; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
MORSE_TASK_zunmlq(
&options,
MorseLeft, MorseNoTrans,
A->mb, tempnn, tempkn, ib, A->nb,
E(k, k+1), ldak,
T(k, k+1), T->mb,
A(k+1, n ), ldak1);
}
for (n = k+2; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
ldan = BLKLDD(A, n);
options.priority = 1;
MORSE_TASK_ztslqt(
&options,
A->mb, tempnn, ib, A->nb,
A(k, k+1), ldak,
A(k, n ), ldak,
T(k, n ), T->mb);