diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 8d8fe14aedff319d6c3473ea0ae614d4372f7fea..1d9b97f0432585e291caf7406c07c5d349f4335e 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -179,16 +179,16 @@ set(ZSRC ################## # OTHERS ################## - #pzgebrd_ge2tb.c - #pzgebrd_tb2bd.c + pztile2band.c + #pzgebrd_gb2bd.c + pzgebrd_ge2gb.c #pzgetmi2.c #pzgetrf_reclap.c #pzgetrf_rectil.c - #pzhbcpy_t2bl.c #pzhegst.c #pzherbt.c - #pzhetrd_hb2st.c - #pzhetrd_he2hb.c + #pzhetrd_hb2ht.c + pzhetrd_he2hb.c #pzlarft_blgtrd.c #pzlaswp.c #pzlaswpc.c @@ -198,16 +198,16 @@ set(ZSRC #zgebrd.c #zgecfi.c #zgecfi2.c - #zgesvd.c + zgesvd.c #zgetmi.c #zgetri.c #zgetrs.c #zheev.c - #zheevd.c + zheevd.c #zhegst.c #zhegv.c #zhegvd.c - #zhetrd.c + zhetrd.c #zlaswp.c #zlaswpc.c #ztrsmrv.c diff --git a/compute/pzgebrd_ge2gb.c b/compute/pzgebrd_ge2gb.c new file mode 100644 index 0000000000000000000000000000000000000000..710eb1604a0eaf34b90e858d7ef5a6f85efd9904 --- /dev/null +++ b/compute/pzgebrd_ge2gb.c @@ -0,0 +1,102 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file pzgebrd_ge2gb.c + * + * PLASMA auxiliary routines + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Hatem Ltaief + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> s d c + * + **/ +#include "control/common.h" + +void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T, + MORSE_sequence_t *sequence, MORSE_request_t *request) +{ + int k; + int tempkm, tempkn; + if (A.m >= A.n){ + for (k = 0; k < A.nt; k++) { + tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; + tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; + + morse_pzgeqrf( + morse_desc_submatrix(&A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn), + morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn), + sequence, request); + + morse_pzunmqr( + MorseLeft, + MorseConjTrans, + morse_desc_submatrix(&A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn), + morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb), + morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn), + sequence, request); + + if (k+1 < A.nt){ + tempkn = k+1 == A.nt-1 ? A.n-(k+1)*A.nb : A.nb; + + morse_pzgelqf( + morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb), + morse_desc_submatrix(&T, k*T.mb, (k+1)*T.nb, T.mb, T.n-(k+1)*T.nb), + sequence, request); + + morse_pzunmlq( + MorseRight, MorseConjTrans, + morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb), + morse_desc_submatrix(&A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb), + morse_desc_submatrix(&T, k*T.mb, (k+1)*T.nb, T.mb, T.n-(k+1)*T.nb), + sequence, request); + } + } + } + else{ + for (k = 0; k < A.mt; k++) { + tempkm = k == A.mt-1 ? A.m-k*A.mb : A.mb; + tempkn = k == A.nt-1 ? A.n-k*A.nb : A.nb; + + morse_pzgelqf( + morse_desc_submatrix(&A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb), + morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.mb, T.n-k*T.nb), + sequence, request); + + morse_pzunmlq( + MorseRight, MorseConjTrans, + morse_desc_submatrix(&A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb), + morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb), + morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.mb, T.n-k*T.nb), + sequence, request); + + if (k+1 < A.mt){ + tempkm = k+1 == A.mt-1 ? A.m-(k+1)*A.mb : A.mb; + + morse_pzgeqrf( + morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn), + morse_desc_submatrix(&T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn), + sequence, request); + + morse_pzunmqr( + MorseLeft, MorseConjTrans, + morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn), + morse_desc_submatrix(&A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb), + morse_desc_submatrix(&T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn), + sequence, request); + } + } + } +} diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index cbbb8e44a210af323ee1ddbd538a100a19416a0b..7d59288a91657a6923f6612e09596cc8fd06a909 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -53,7 +53,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, int k, m, n; int ldak, ldam; int tempkm, tempkn, tempmm, tempnn; - int ib, minMT; + int ib, minMNT; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -63,9 +63,9 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, ib = MORSE_IB; if (A->m > A->n) { - minMT = A->nt; + minMNT = A->nt; } else { - minMT = A->mt; + minMNT = A->mt; } /* @@ -114,7 +114,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q); #endif - for (k = 0; k < min(A->mt, A->nt); k++) { + for (k = 0; k < minMNT; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; ldak = BLKLDD(A, k); @@ -181,4 +181,5 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index d108869da821b4e8c7fbebbab0a783d71bca93fa..c358e06c9c2845768d3a97e9f934d6fdc1e486b3 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -59,7 +59,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, int ldak, ldam; int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn; int ib; - int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -112,9 +111,11 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, #if defined(CHAMELEON_COPY_DIAG) /* necessary to avoid dependencies between tasks regarding the diag tile */ - nblk = ( A->nt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + { + int nblk = ( A->nt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + } #endif for (k = 0; k < min(A->mt, A->nt); k++) { @@ -212,4 +213,5 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 9ab9cc1f3d8b9afa51c3ba67ef5141f7c59c4158..ad141a97b1adb407457161e82c0889ee7e4759fd 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -175,4 +175,5 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 22c21f5bb3545b43713880864c0abf04dfd12bae..c95013f89f661bf18f3a8ac88cd5fe7beeea1c2f 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -57,7 +57,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, int ldaM, ldam, ldaMRD; int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm; int ib; - int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -109,10 +108,12 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); #if defined(CHAMELEON_COPY_DIAG) - /* necessary to avoid dependencies between tasks regarding the diag tile */ - nblk = ( A->mt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + { + /* necessary to avoid dependencies between tasks regarding the diag tile */ + int nblk = ( A->mt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + } #endif K = min(A->mt, A->nt); @@ -211,4 +212,5 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c new file mode 100644 index 0000000000000000000000000000000000000000..88e673c59e76c7f962e142c644cf26ccfe1d05e4 --- /dev/null +++ b/compute/pzhetrd_he2hb.c @@ -0,0 +1,469 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file pzhetrd_he2hb.c + * + * PLASMA auxiliary routines + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Hatem Ltaief + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> s d c + * + **/ +#include "control/common.h" + +#define A(m, n) A, m, n +#define T(m, n) T, m, n +#define D(k) D, (k)-1, 0 + +#define AT(k) AT, k, 0 + +#if defined(CHAMELEON_COPY_DIAG) +#define E(m, n) E, m, 0 +#else +#define E(m, n) A, m, n +#endif + +/***************************************************************************//** + * Parallel tile BAND Tridiagonal Reduction - dynamic scheduler + **/ +void morse_pzhetrd_he2hb(MORSE_enum uplo, + MORSE_desc_t *A, MORSE_desc_t *T, + MORSE_sequence_t *sequence, MORSE_request_t *request) +{ + MORSE_context_t *morse; + MORSE_option_t options; + MORSE_desc_t *E = NULL; + MORSE_desc_t *D = NULL; + MORSE_desc_t *AT = NULL; + size_t ws_worker = 0; + size_t ws_host = 0; + + int k, m, n, i, j; + int ldak, ldak1, ldam, ldan, ldaj, ldai; + int tempkm, tempkn, tempmm, tempnn, tempjj; + int ib; + + morse = morse_context_self(); + if (sequence->status != MORSE_SUCCESS) + return; + + RUNTIME_options_init(&options, morse, sequence, request); + ib = MORSE_IB; + + /* + * zgeqrt = A->nb * (ib+1) + * zunmqr = A->nb * ib + * ztsqrt = A->nb * (ib+1) + * ztsmqr = A->nb * ib + * zherfb = A->nb * ib + * ztsmqr_hetra1 = A->nb * ib + */ + ws_worker = A->nb * (ib+1); + + /* Allocation of temporary (scratch) working space */ +#if defined(CHAMELEON_USE_CUDA) + /* Worker space + * + * zunmqr = A->nb * ib + * ztsmqr = 2 * A->nb * ib + * zherfb = A->nb * ib + */ + ws_worker = max( ws_worker, ib * A->nb * 2 ); +#endif + +#if defined(CHAMELEON_USE_MAGMA) + /* Worker space + * + * zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) ) + * ztsqrt = max( A->nb * (ib+1), ib * (ib + A->nb) ) + */ + ws_worker = max( ws_worker, ib * (ib + A->nb) ); + + /* Host space + * + * zgeqrt = ib * (A->mb+3*ib) + A->mb ) + * ztsqrt = 2 * ib * (A->nb+ib) + A->nb + */ + ws_host = max( ws_host, ib * (A->mb + 3 * ib) + A->mb ); + ws_host = max( ws_host, 2 * ib * (A->nb + ib) + A->nb ); +#endif + + ws_worker *= sizeof(MORSE_Complex64_t); + ws_host *= sizeof(MORSE_Complex64_t); + + RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + +#if defined(CHAMELEON_COPY_DIAG) + /* Copy of the extra-diagonal to generate more parallelism by releasing anti-dependencies on UNMQR/TSMQR triangle conflict */ + E = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*E, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q); +#endif + + /* Copy of the diagonal tiles to keep the general version of the tile all along the computation */ + D = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*D, A->mb, A->nb, min(A->m, A->n) - A->mb, A->nb, 0, 0, min(A->m, A->n) - A->mb, A->nb, A->p, A->q); + + AT = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + *AT = morse_desc_init( + MorseComplexDouble, A->mb, A->nb, (A->mb*A->nb), + min(A->mt, A->nt) * A->mb, A->nb, 0, 0, min(A->mt, A->nt) * A->mb, A->nb, 1, 1); + morse_desc_mat_alloc( AT ); + + /* Let's extract the diagonal in a temporary copy that contains A and A' */ + for (k = 1; k < A->nt; k++){ + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + ldak = BLKLDD(A, k); + + MORSE_TASK_zhe2ge(&options, + uplo, + tempkn, tempkn, ldak, + A(k, k), ldak, + D(k), ldak); + } + + if (uplo == MorseLower) { + for (k = 0; k < A->nt-1; k++){ + tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n- k *A->nb : A->nb; + ldak1 = BLKLDD(A, k+1); + + MORSE_TASK_zgeqrt( + &options, + tempkm, tempkn, ib, A->nb, + A(k+1, k), ldak1, + T(k+1, k), T->mb); + +#if defined(CHAMELEON_COPY_DIAG) + MORSE_TASK_zlacpy( + &options, + MorseLower, tempkm, tempkn, A->nb, + A(k+1, k), ldak1, + E(k+1, k), ldak1 ); +#if defined(CHAMELEON_USE_CUDA) + MORSE_TASK_zlaset( + &options, + MorseUpper, tempkm, tempkn, + 0., 1., + E(k+1, k), ldak1 ); +#endif +#endif + + /* LEFT and RIGHT on the symmetric diagonal block */ + MORSE_TASK_zherfb( + &options, + MorseLower, + tempkm, tempkm, ib, A->nb, + E(k+1, k), ldak1, + T(k+1, k), T->mb, + D(k+1), ldak1); + + /* RIGHT on the remaining tiles until the bottom */ + for (m = k+2; m < A->mt ; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + MORSE_TASK_zunmqr( + &options, + MorseRight, MorseNoTrans, + tempmm, A->nb, tempkm, ib, A->nb, + E(k+1, k), ldak1, + T(k+1, k), T->mb, + A(m, k+1), ldam); + } + + for (m = k+2; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + + options.priority = 1; + MORSE_TASK_ztsqrt( + &options, + tempmm, A->nb, ib, A->nb, + A(k+1, k), ldak1, + A(m , k), ldam, + T(m , k), T->mb); + options.priority = 0; + + /* LEFT */ + for (i = k+2; i < m; i++) { + ldai = BLKLDD(A, i); + MORSE_TASK_ztsmqr_hetra1( + &options, + MorseLeft, MorseConjTrans, + A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb, + A(i, k+1), ldai, + A(m, i), ldam, + A(m, k), ldam, + T(m, k), T->mb); + } + + /* RIGHT */ + for (j = m+1; j < A->mt ; j++) { + tempjj = j == A->mt-1 ? A->m-j*A->mb : A->mb; + ldaj = BLKLDD(A, j); + MORSE_TASK_ztsmqr( + &options, + MorseRight, MorseNoTrans, + tempjj, A->nb, tempjj, tempmm, A->nb, ib, A->nb, + A(j, k+1), ldaj, + A(j, m), ldaj, + A(m, k), ldam, + T(m, k), T->mb); + } + + /* LEFT->RIGHT */ + options.priority = 1; + /** + * Compute the update of the following: + * + * A1 A2' + * A2 A3 + * + * with A1 and A3 two diagonal tiles. This is the tsmqr_corner + * from plasma split in 4 tasks + */ + /* Copy the transpose of A2 (m, k+1): AT(k) <- A2' = A2(k+1, m) */ + MORSE_TASK_zlatro( + &options, + MorseUpperLower, MorseConjTrans, + tempmm, A->nb, A->nb, + A(m, k+1), ldam, + AT(m), ldak1); + + /* Left application on |A1| */ + /* |A2| */ + MORSE_TASK_ztsmqr( + &options, + MorseLeft, MorseConjTrans, + A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb, + D(k+1), ldak1, + A(m, k+1), ldam, + A(m, k), ldam, + T(m, k), T->mb); + + /* Left application on | A2'| */ + /* | A3 | */ + MORSE_TASK_ztsmqr( + &options, + MorseLeft, MorseConjTrans, + A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb, + AT(m), ldak1, + D(m) , ldam, + A(m, k), ldam, + T(m, k), T->mb); + + /* Right application on | A1 A2' | */ + MORSE_TASK_ztsmqr( + &options, + MorseRight, MorseNoTrans, + A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb, + D(k+1), ldak1, + AT(m) , ldak1, + A(m, k), ldam, + T(m, k), T->mb); + + /* Right application on | A2 A3 | */ + MORSE_TASK_ztsmqr( + &options, + MorseRight, MorseNoTrans, + tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb, + A(m, k+1), ldam, + D(m) , ldam, + A(m, k), ldam, + T(m, k), T->mb); + options.priority = 0; + } + } + } + else { + for (k = 0; k < A->nt-1; k++){ + tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb; + tempkm = k == A->mt-1 ? A->m- k *A->mb : A->mb; + ldak = BLKLDD(A, k); + ldak1 = BLKLDD(A, k+1); + MORSE_TASK_zgelqt( + &options, + tempkm, tempkn, ib, A->nb, + A(k, k+1), ldak, + T(k, k+1), T->mb); + +#if defined(CHAMELEON_COPY_DIAG) + MORSE_TASK_zlacpy( + &options, + MorseUpper, tempkm, tempkn, A->nb, + A(k, k+1), ldak, + E(k, k+1), ldak ); +#if defined(CHAMELEON_USE_CUDA) + MORSE_TASK_zlaset( + &options, + MorseLower, tempkm, tempkn, + 0., 1., + E(k, k+1), ldak ); +#endif +#endif + + /* RIGHT and LEFT on the symmetric diagonal block */ + MORSE_TASK_zherfb( + &options, + MorseUpper, + tempkn, tempkn, ib, A->nb, + E(k, k+1), ldak, + T(k, k+1), T->mb, + D(k+1), ldak1); + + /* LEFT on the remaining tiles until the left side */ + for (n = k+2; n < A->nt ; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + MORSE_TASK_zunmlq( + &options, + MorseLeft, MorseNoTrans, + A->mb, tempnn, tempkn, ib, A->nb, + E(k, k+1), ldak, + T(k, k+1), T->mb, + A(k+1, n ), ldak1); + } + + for (n = k+2; n < A->nt; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + ldan = BLKLDD(A, n); + options.priority = 1; + MORSE_TASK_ztslqt( + &options, + A->mb, tempnn, ib, A->nb, + A(k, k+1), ldak, + A(k, n ), ldak, + T(k, n ), T->mb); + options.priority = 0; + + /* RIGHT */ + for (i = k+2; i < n; i++) { + ldai = BLKLDD(A, i); + MORSE_TASK_ztsmlq_hetra1( + &options, + MorseRight, MorseConjTrans, + A->mb, A->nb, A->nb, tempnn, A->nb, ib, A->nb, + A(k+1, i), ldak1, + A(i, n), ldai, + A(k, n), ldak, + T(k, n), T->mb); + } + + /* LEFT */ + for (j = n+1; j < A->nt ; j++) { + tempjj = j == A->nt-1 ? A->n-j*A->nb : A->nb; + MORSE_TASK_ztsmlq( + &options, + MorseLeft, MorseNoTrans, + A->nb, tempjj, tempnn, tempjj, A->nb, ib, A->nb, + A(k+1, j), ldak1, + A(n, j), ldan, + A(k, n), ldak, + T(k, n), T->mb); + } + + /* RIGHT->LEFT */ + options.priority = 1; + /** + * Compute the update of the following: + * + * A1 A2 + * A2' A3 + * + * with A1 and A3 two diagonal tiles. This is the tsmqr_corner + * from plasma split in 4 tasks + */ + /* Copy the transpose of A2: AT(k) <- A2' */ + MORSE_TASK_zlatro( + &options, + MorseUpperLower, MorseConjTrans, + A->mb, tempnn, A->nb, + A(k+1, n), ldak1, + AT(n), A->mb); + + /* Right application on | A1 A2 | */ + MORSE_TASK_ztsmlq( + &options, + MorseRight, MorseConjTrans, + A->mb, A->nb, A->mb, tempnn, A->nb, ib, A->nb, + D(k+1), ldak1, + A(k+1, n), ldak1, + A(k, n), ldak, + T(k, n), T->mb); + + /* Right application on | A2' A3 | */ + MORSE_TASK_ztsmlq( + &options, + MorseRight, MorseConjTrans, + tempnn, A->nb, tempnn, tempnn, A->nb, ib, A->nb, + AT(n), A->mb, + D(n), ldan, + A(k, n), ldak, + T(k, n), T->mb); + + /* Left application on |A1 | */ + /* |A2'| */ + MORSE_TASK_ztsmlq( + &options, + MorseLeft, MorseNoTrans, + A->mb, A->nb, tempnn, A->nb, A->nb, ib, A->nb, + D(k+1), ldak1, + AT(n), A->mb, + A(k, n), ldak, + T(k, n), T->mb); + + /* Left application on | A2 | */ + /* | A3 | */ + MORSE_TASK_ztsmlq( + &options, + MorseLeft, MorseNoTrans, + A->mb, tempnn, tempnn, tempnn, A->nb, ib, A->nb, + A(k+1, n), ldak1, + D(n), ldan, + A(k, n), ldak, + T(k, n), T->mb); + } + options.priority = 0; + } + } + + /* Copy-back into A */ + for (k = 1; k < A->nt; k++){ + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + ldak = BLKLDD(A, k); + MORSE_TASK_zlacpy(&options, + uplo, + tempkn, tempkn, ldak, + D(k), ldak, + A(k, k), ldak); + } + + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, morse); + MORSE_TASK_dataflush_all(); + + MORSE_Sequence_Wait(sequence); + morse_desc_mat_free(D); + free(D); + + morse_desc_mat_free(AT); + free(AT); + +#if defined(CHAMELEON_COPY_DIAG) + morse_desc_mat_free(E); + free(E); +#endif + (void)E; +} diff --git a/compute/pztile2band.c b/compute/pztile2band.c new file mode 100644 index 0000000000000000000000000000000000000000..4611d2adce9a7e2679e82d8ec83d3b59aff33063 --- /dev/null +++ b/compute/pztile2band.c @@ -0,0 +1,124 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + + +/** + * + * @file pztile2band.c + * + * PLASMA auxiliary routines + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Azzam Haidar + * @author Gregoire Pichon + * @author Mathieu Faverge + * @date 2010-11-15 + * @precisions normal z -> s d c + * + **/ +#include "control/common.h" + +#define A(m,n) A, m, n +#define B(m, n) B, m, n + +/***************************************************************************//** + * Parallel copy of a band matrix from full NxN tile storage to band storage (LDABxN). + **/ +void morse_pztile2band(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, + MORSE_sequence_t *sequence, MORSE_request_t *request) +{ + MORSE_context_t *morse; + MORSE_option_t options; + + int j; + int ldaj, ldx; + int tempjm, tempjn; + int minmnt = min(A->mt, A->nt); + + morse = morse_context_self(); + if (sequence->status != MORSE_SUCCESS) + return; + RUNTIME_options_init(&options, morse, sequence, request); + + ldx = B->mb-1; + + /* + * MorseLower => Lower Band + */ + if ( uplo == MorseLower ) { + for (j = 0; j < minmnt; j++){ + /* Compute dimension on N with B since it is dimensioned with min(A->m, A->n) */ + assert( A->m == B->n ); + assert( A->n >= B->n ); + + tempjm = j == A->mt-1 ? A->m - j * A->mb : A->mb; + tempjn = j == B->nt-1 ? B->n - j * B->nb : B->nb; + ldaj = BLKLDD(A, j); + + MORSE_TASK_zlaset( + &options, + MorseUpperLower, B->mb, tempjn, + 0., 0., + B(0, j), B->mb ); + + MORSE_TASK_zlacpy( + &options, + MorseLower, tempjm, tempjn, A->nb, + A(j, j), ldaj, + B(0, j), ldx ); + + if( j<minmnt-1 ){ + tempjm = (j+1) == A->mt-1 ? A->m-(j+1)*A->mb : A->mb; + ldaj = BLKLDD(A, j+1); + MORSE_TASK_zlacpyx( + &options, + MorseUpper, tempjm, tempjn, A->nb, + 0, A(j+1, j), ldaj, + A->nb, B(0, j), ldx); + } + } + } + else if ( uplo == MorseUpper ) { + for (j = 0; j < minmnt; j++){ + /* Compute dimension on M with B since it is dimensioned with min(A->m, A->n) */ + assert( A->n == B->n ); + assert( A->m >= B->n ); + tempjn = j == A->nt-1 ? A->n - j * A->nb : A->nb; + ldaj = BLKLDD(A, j); + + MORSE_TASK_zlaset( + &options, + MorseUpperLower, B->mb, tempjn, + 0., 0., + B(0, j), B->mb ); + + if(j > 0){ + MORSE_TASK_zlacpy( + &options, + MorseLower, A->mb, tempjn, A->nb, + A(j-1, j), BLKLDD(A, j-1), + B(0, j), ldx); + } + + tempjm = j == B->nt-1 ? B->n - j * B->nb : B->nb; + MORSE_TASK_zlacpyx( + &options, + MorseUpper, tempjm, tempjn, A->nb, + 0, A(j, j), ldaj, + A->nb, B(0, j), ldx); + } + } + RUNTIME_options_finalize(&options, morse); + MORSE_TASK_dataflush_all(); +} +#undef B +#undef A diff --git a/compute/pzunglq.c b/compute/pzunglq.c index 7b1343866534893e900f092ddc29470c1dc96dad..2356ff6707dde66993eefb9fd228f9f2ecb0dd1a 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -153,4 +153,5 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index cd4e8abc014496e0076f68a05e2720dcd6f81195..30288e5d255567859ead0c4c41592dd112739408 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -59,7 +59,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, int ldqm; int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn; int ib; - int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -90,10 +89,12 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); #if defined(CHAMELEON_COPY_DIAG) - /* necessary to avoid dependencies between tasks regarding the diag tile */ - nblk = ( A->nt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + { + /* necessary to avoid dependencies between tasks regarding the diag tile */ + int nblk = ( A->nt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + } #endif K = min(A->mt, A->nt); @@ -178,4 +179,5 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzungqr.c b/compute/pzungqr.c index 8226c8477bea418ccc59b2e908b78f8e4a4897a7..80e7fd82bc6140056ecfe8e4774d34d998e55fa9 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -154,4 +154,5 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index 8c2f64ddb809336b45cdacb293639e995adcfef9..bd08779f2631654e54b7adbb72416d44469dd845 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -61,7 +61,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, int ldqM, ldqm, ldqMRD; int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin; int ib; - int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -92,10 +91,12 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); #if defined(CHAMELEON_COPY_DIAG) - /* necessary to avoid dependencies between tasks regarding the diag tile */ - nblk = ( A->mt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + { + /* necessary to avoid dependencies between tasks regarding the diag tile */ + int nblk = ( A->mt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + } #endif K = min(A->mt, A->nt); @@ -183,4 +184,5 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index f157fb241b2237671735212a9e793e9b90357dfb..46e35e596e5468e7a12e03129478fee54a9fb92b 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -312,4 +312,5 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index ce3cbd1dba54d21a3834cac5308c42d6c8bec30a..e9a5b3689dd2a9d9bf4b3ea48f36eb0044727618 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -57,11 +57,9 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, int k, m, n; int K, N, RD, lastRD; - int ldaN, ldak; - int ldbN, ldbm, ldbNRD; + int ldak, ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; - int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -91,11 +89,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - /* necessary to avoid dependencies between tasks regarding the diag tile */ #if defined(CHAMELEON_COPY_DIAG) - nblk = ( A->nt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + { + int nblk = ( A->nt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + } #endif K = min(A->mt, A->nt); @@ -110,7 +110,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = min(tempkm,tempNn); - ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); #if defined(CHAMELEON_COPY_DIAG) MORSE_TASK_zlacpy( @@ -206,7 +205,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = min(tempkm,tempNn); - ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); for (m = min(N+BS, A->nt)-1; m > N; m--) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; @@ -339,7 +337,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = min(tempkm,tempNn); - ldaN = BLKLDD(A, N); #if defined(CHAMELEON_COPY_DIAG) MORSE_TASK_zlacpy( &options, @@ -413,4 +410,5 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index 3d53e459cd7d4ddbad667ff88944306796d118b0..7d359f059546d4853771e4bd808a5d7101ea6aad 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -317,4 +317,5 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index aee79ee4a90c1129bdaa6ab89ecea83e431c8b5d..71bb3d6bd60cb2b8c78fa077b201adcc054c208a 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -61,7 +61,6 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, int ldbM, ldbm, ldbMRD; int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin; int ib; - int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -91,11 +90,14 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); - /* necessary to avoid dependencies between tasks regarding the diag tile */ #if defined(CHAMELEON_COPY_DIAG) - nblk = ( A->mt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + { + int nblk = ( A->mt + BS -1 ) / BS; + nblk = ( A->mt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q); + } #endif K = min(A->mt, A->nt); @@ -418,4 +420,5 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, morse_desc_mat_free(DIAG); free(DIAG); #endif + (void)DIAG; } diff --git a/compute/zgesvd.c b/compute/zgesvd.c new file mode 100644 index 0000000000000000000000000000000000000000..6c5263e4c27e2a008d375da266abb54dec0d80bf --- /dev/null +++ b/compute/zgesvd.c @@ -0,0 +1,611 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file zgesvd.c + * + * PLASMA computational routines + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Gregoire Pichon + * @author Mathieu Faverge + * @date 2010-11-15 + * @precisions normal z -> s d c + * + **/ +#include <coreblas/include/lapacke.h> +#include "control/common.h" + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t + * + * MORSE_zgesvd - computes the singular value decomposition (SVD) of a complex + * M-by-N matrix A, optionally computing the left and/or right singular + * vectors. The SVD is written + * + * A = U * SIGMA * transpose(V) + * + * where SIGMA is an M-by-N matrix which is zero except for its + * min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and + * V is an N-by-N orthogonal matrix. The diagonal elements of SIGMA + * are the singular values of A; they are real and non-negative, and + * are returned in descending order. The first min(m,n) columns of + * U and V are the left and right singular vectors of A. + * + * Note that the routine returns V**T, not V. + ******************************************************************************* + * + * @param[in] jobu + * Specifies options for computing all or part of the matrix U. + * Intended usage: + * = MorseVec = 'A'(lapack): all M columns of U are returned + * in array U; + * = MorseNoVec = 'N': no columns of U (no left singular vectors) + * are computed. + * = MorseSVec = 'S': the first min(m,n) columns of U (the left + * singular vectors) are returned in the array U; + * NOT SUPPORTTED YET + * = MorseOVec = 'O': the first min(m,n) columns of U (the left + * singular vectors) are overwritten on the array A; + * NOT SUPPORTTED YET + * + * @param[in] jobvt + * Specifies options for computing all or part of the matrix V**H. + * Intended usage: + * = MorseVec = 'A'(lapack): all N rows of V**H are returned + * in the array VT; + * = MorseNoVec = 'N': no rows of V**H (no right singular vectors) + * are computed. + * = MorseSVec = 'S': the first min(m,n) rows of V**H (the right + * singular vectors) are returned in the array VT; + * NOT SUPPORTTED YET + * = MorseOVec = 'O': the first min(m,n) rows of V**H (the right + * singular vectors) are overwritten on the array A; + * NOT SUPPORTTED YET + * + * Note: jobu and jobvt cannot both be MorseOVec. + * + * @param[in] M + * The number of rows of the matrix A. M >= 0. + * + * @param[in] N + * The number of columns of the matrix A. N >= 0. + * + * @param[in,out] A + * On entry, the M-by-N matrix A. + * On exit, + * if JOBU = 'O', A is overwritten with the first min(m,n) + * columns of U (the left singular vectors, + * stored columnwise); + * if JOBVT = 'O', A is overwritten with the first min(m,n) + * rows of V**H (the right singular vectors, + * stored rowwise); + * if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A + * are destroyed. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + * @param[out] S + * The double precision singular values of A, sorted so that S(i) >= S(i+1). + * + * @param[in, out] descT + * On entry, descriptor as return by MORSE_Alloc_Workspace_zgesvd + * On exit, contains auxiliary factorization data. + * + * @param[out] U + * (LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'. + * If JOBU = 'A', U contains the M-by-M unitary matrix U; + * if JOBU = 'S', U contains the first min(m,n) columns of U + * (the left singular vectors, stored columnwise); + * if JOBU = 'N' or 'O', U is not referenced. + * + * @param[in] LDU + * The leading dimension of the array U. LDU >= 1; if + * JOBU = 'S' or 'A', LDU >= M. + * + * @param[out] VT + * If JOBVT = 'A', VT contains the N-by-N unitary matrix + * V**H; + * if JOBVT = 'S', VT contains the first min(m,n) rows of + * V**H (the right singular vectors, stored rowwise); + * if JOBVT = 'N' or 'O', VT is not referenced. + * + * @param[in] LDVT + * The leading dimension of the array VT. LDVT >= 1; if + * JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N). + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * + ******************************************************************************* + * + * @sa MORSE_zgesvd_Tile + * @sa MORSE_zgesvd_Tile_Async + * @sa MORSE_cgesvd + * @sa MORSE_dgesvd + * @sa MORSE_sgesvd + * + ******************************************************************************/ +int MORSE_zgesvd(MORSE_enum jobu, MORSE_enum jobvt, + int M, int N, + MORSE_Complex64_t *A, int LDA, + double *S, + MORSE_desc_t *descT, + MORSE_Complex64_t *U, int LDU, + MORSE_Complex64_t *VT, int LDVT) +{ + int NB; + int status; + MORSE_context_t *morse; + MORSE_sequence_t *sequence = NULL; + MORSE_request_t request = MORSE_REQUEST_INITIALIZER; + MORSE_desc_t descA; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zgesvd", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + + /* Check input arguments */ + if (jobu != MorseNoVec && jobu != MorseVec) { + morse_error("MORSE_zgesvd", "illegal value of jobu"); + return -1; + } + if (jobvt != MorseNoVec && jobvt != MorseVec) { + morse_error("MORSE_zgesvd", "illegal value of jobvt"); + return -2; + } + if (M < 0) { + morse_error("MORSE_zgesvd", "illegal value of M"); + return -3; + } + if (N < 0) { + morse_error("MORSE_zgesvd", "illegal value of N"); + return -4; + } + if (LDA < max(1, M)) { + morse_error("MORSE_zgesvd", "illegal value of LDA"); + return -6; + } + if (LDU < 1) { + morse_error("MORSE_zgesvd", "illegal value of LDU"); + return -9; + } + if (LDVT < 1) { + morse_error("MORSE_zgesvd", "illegal value of LDVT"); + return -11; + } + /* Quick return */ + if (min(M, N) == 0) { + return MORSE_SUCCESS; + } + + /* Tune NB & IB depending on M & N; Set NBNB */ + status = morse_tune(MORSE_FUNC_ZGESVD, M, N, 0); + if (status != MORSE_SUCCESS) { + morse_error("MORSE_zgesvd", "morse_tune() failed"); + return status; + } + + /* Set MT, NT */ + NB = MORSE_NB; + + morse_sequence_create(morse, &sequence); + + /* if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) { */ + morse_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, M, N, sequence, &request, + morse_desc_mat_free(&(descA)) ); + /* } else { */ + /* morse_ziplap2tile( descA, A, NB, NB, LDA, N, 0, 0, M, N, */ + /* sequence, &request); */ + /* } */ + + /* Call the tile interface */ + MORSE_zgesvd_Tile_Async(jobu, jobvt, &descA, S, descT, U, LDU, VT, LDVT, sequence, &request); + + /* if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) { */ + morse_zooptile2lap( descA, A, NB, NB, LDA, N, sequence, &request); + morse_sequence_wait(morse, sequence); + morse_desc_mat_free(&descA); + /* } else { */ + /* morse_ziptile2lap( descA, A, NB, NB, LDA, N, sequence, &request); */ + /* morse_sequence_wait(morse, sequence); */ + /* } */ + + status = sequence->status; + morse_sequence_destroy(morse, sequence); + return status; +} + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t_Tile + * + * MORSE_zgesvd_Tile - computes the singular value decomposition (SVD) of a complex + * M-by-N matrix A, optionally computing the left and/or right singular + * vectors. + * Tile equivalent of MORSE_zgesvd(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] jobu + * Specifies options for computing all or part of the matrix U. + * Intended usage: + * = MorseVec = 'A'(lapack): all M columns of U are returned + * in array U; + * = MorseNoVec = 'N': no columns of U (no left singular vectors) + * are computed. + * = MorseSVec = 'S': the first min(m,n) columns of U (the left + * singular vectors) are returned in the array U; + * NOT SUPPORTTED YET + * = MorseOVec = 'O': the first min(m,n) columns of U (the left + * singular vectors) are overwritten on the array A; + * NOT SUPPORTTED YET + * + * @param[in] jobvt + * Specifies options for computing all or part of the matrix V**H. + * Intended usage: + * = MorseVec = 'A'(lapack): all N rows of V**H are returned + * in the array VT; + * = MorseNoVec = 'N': no rows of V**H (no right singular vectors) + * are computed. + * = MorseSVec = 'S': the first min(m,n) rows of V**H (the right + * singular vectors) are returned in the array VT; + * NOT SUPPORTTED YET + * = MorseOVec = 'O': the first min(m,n) rows of V**H (the right + * singular vectors) are overwritten on the array A; + * NOT SUPPORTTED YET + * + * Note: jobu and jobvt cannot both be MorseOVec. + * + * @param[in,out] A + * On entry, the M-by-N matrix A. + * On exit, + * if JOBU = 'O', A is overwritten with the first min(m,n) + * columns of U (the left singular vectors, + * stored columnwise); + * if JOBVT = 'O', A is overwritten with the first min(m,n) + * rows of V**H (the right singular vectors, + * stored rowwise); + * if JOBU .ne. 'O' and JOBVT .ne. 'O', the contents of A + * are destroyed. + * + * @param[out] S + * The singular values of A, sorted so that S(i) >= S(i+1). + * + * @param[in, out] T + * On entry, descriptor as return by MORSE_Alloc_Workspace_zgesvd + * On exit, contains auxiliary factorization data. + * + * @param[out] U + * (LDU,M) if JOBU = 'A' or (LDU,min(M,N)) if JOBU = 'S'. + * If JOBU = 'A', U contains the M-by-M unitary matrix U; + * if JOBU = 'S', U contains the first min(m,n) columns of U + * (the left singular vectors, stored columnwise); + * if JOBU = 'N' or 'O', U is not referenced. + * + * @param[in] LDU + * The leading dimension of the array U. LDU >= 1; if + * JOBU = 'S' or 'A', LDU >= M. + * + * @param[out] VT + * If JOBVT = 'A', VT contains the N-by-N unitary matrix + * V**H; + * if JOBVT = 'S', VT contains the first min(m,n) rows of + * V**H (the right singular vectors, stored rowwise); + * if JOBVT = 'N' or 'O', VT is not referenced. + * + * @param[in] LDVT + * The leading dimension of the array VT. LDVT >= 1; if + * JOBVT = 'A', LDVT >= N; if JOBVT = 'S', LDVT >= min(M,N). + * + ******************************************************************************* + * + * @return + * \return MORSE_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa MORSE_zgesvd + * @sa MORSE_zgesvd_Tile_Async + * @sa MORSE_cgesvd_Tile + * @sa MORSE_dgesvd_Tile + * @sa MORSE_sgesvd_Tile + * + ******************************************************************************/ +int MORSE_zgesvd_Tile(MORSE_enum jobu, MORSE_enum jobvt, + MORSE_desc_t *A, + double *S, + MORSE_desc_t *T, + MORSE_Complex64_t *U, int LDU, + MORSE_Complex64_t *VT, int LDVT) +{ + MORSE_context_t *morse; + MORSE_sequence_t *sequence = NULL; + MORSE_request_t request = MORSE_REQUEST_INITIALIZER; + int status; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zgesvd_Tile", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + morse_sequence_create(morse, &sequence); + MORSE_zgesvd_Tile_Async(jobu, jobvt, A, S, T, U, LDU, VT, LDVT, sequence, &request); + morse_sequence_wait(morse, sequence); + status = sequence->status; + morse_sequence_destroy(morse, sequence); + return status; +} + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t_Tile_Async + * + * MORSE_zgesvd_Tile_Async - computes the singular value decomposition (SVD) of a complex + * M-by-N matrix A, optionally computing the left and/or right singular + * vectors. + * Non-blocking equivalent of MORSE_zgesvd_Tile(). + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa MORSE_zgesvd + * @sa MORSE_zgesvd_Tile + * @sa MORSE_cgesvd_Tile_Async + * @sa MORSE_dgesvd_Tile_Async + * @sa MORSE_sgesvd_Tile_Async + * + ******************************************************************************/ +int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt, + MORSE_desc_t *A, + double *S, + MORSE_desc_t *T, + MORSE_Complex64_t *U, int LDU, + MORSE_Complex64_t *VT, int LDVT, + MORSE_sequence_t *sequence, MORSE_request_t *request) +{ + MORSE_desc_t descA; + MORSE_desc_t descT; + MORSE_desc_t descU, descVT; + MORSE_desc_t descAB; + MORSE_desc_t *subA, *subT, *subUVT; + double *E; + int M, N, MINMN, NB, LDAB; + int KL, KU, uplo, nru, ncvt; + int info = 0; + char gbbrd_vect; + + MORSE_context_t *morse; + morse = morse_context_self(); + + if (morse == NULL) { + morse_fatal_error("MORSE_zgesvd_Tile_Async", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + morse_fatal_error("MORSE_zgesvd_Tile_Async", "NULL sequence"); + return MORSE_ERR_UNALLOCATED; + } + if (request == NULL) { + morse_fatal_error("MORSE_zgesvd_Tile_Async", "NULL request"); + return MORSE_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == MORSE_SUCCESS) + request->status = MORSE_SUCCESS; + else + return morse_request_fail(sequence, request, MORSE_ERR_SEQUENCE_FLUSHED); + + /* Check descriptors for correctness */ + if (morse_desc_check(A) != MORSE_SUCCESS) { + morse_error("MORSE_zgesvd_Tile_Async", "invalid first descriptor"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } else { + descA = *A; + } + if (morse_desc_check(T) != MORSE_SUCCESS) { + morse_error("MORSE_zgesvd_Tile_Async", "invalid fourth descriptor"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } else { + descT = *T; + } + /* Check input arguments */ + if (jobu != MorseNoVec && jobu != MorseVec) { + morse_error("MORSE_zgesvd_Tile_Async", "illegal value of jobu"); + return MORSE_ERR_NOT_SUPPORTED; + } + if (jobvt != MorseNoVec && jobvt != MorseVec) { + morse_error("MORSE_zgesvd_Tile_Async", "illegal value of jobvt"); + return MORSE_ERR_NOT_SUPPORTED; + } + if (descA.nb != descA.mb) { + morse_error("MORSE_zgesvd_Tile_Async", "only square tiles supported"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + + M = descA.m; + N = descA.n; + MINMN = min(M, N); + NB = descA.mb; + LDAB = NB + 1; + uplo = M >= N ? MorseUpper : MorseLower; + + /* Reduction to band */ + morse_pzgebrd_ge2gb( descA, descT, + sequence, request ); + + /* Allocate band structure */ + morse_zdesc_alloc_diag( descAB, + LDAB, NB, + LDAB, MINMN, + 0, 0, + LDAB, MINMN, + 1, 1 ); + + /* Convert matrix to band form */ + morse_pztile2band( uplo, + &descA, &descAB, + sequence, request ); + + E = malloc( MINMN * sizeof(double) ); + if (E == NULL) { + morse_error("MORSE_zheevd_Tile_Async", "malloc(E) failed"); + free(E); + return MORSE_ERR_OUT_OF_RESOURCES; + } + memset(E, 0, MINMN * sizeof(double) ); + + /* NCC = 0, C = NULL, we do not update any matrix with new singular vectors */ + /* On exit, AB = U (S +~ E) VT */ + if (uplo == MorseUpper){ + KL = 0; + KU = NB; + } + else{ + KL = NB; + KU = 0; + } + + /* Manage the case where only singular values are required */ + if (jobu == MorseNoVec) { + nru = 0; + if (jobvt == MorseNoVec) { + gbbrd_vect = 'N'; + ncvt = 0; + } + else { + gbbrd_vect = 'P'; + ncvt = N; + } + } + else { + nru = M; + if (jobvt == MorseNoVec) { + gbbrd_vect = 'Q'; + ncvt = 0; + } + else { + gbbrd_vect = 'B'; + ncvt = N; + } + } + + morse_sequence_wait(morse, sequence); + info = LAPACKE_zgbbrd( LAPACK_COL_MAJOR, + gbbrd_vect, + M, N, + 0, KL, KU, + (MORSE_Complex64_t *) descAB.mat, LDAB, + S, E, + U, LDU, + VT, LDVT, + NULL, 1 ); + if (info != 0) { + fprintf(stderr, "MORSE_zgesvd_Tile_Async: LAPACKE_zgbbrd = %d\n", info ); + } + morse_desc_mat_free(&descAB); + + /* Transform U and Vt into tile format */ + if ( jobu != MorseNoVec ) { + morse_zooplap2tile( descU, U, NB, NB, LDU, M, 0, 0, M, M, sequence, request, morse_desc_mat_free(&(descU)) ); + } + if ( jobvt != MorseNoVec ) { + morse_zooplap2tile( descVT, VT, NB, NB, LDVT, N, 0, 0, N, N, sequence, request, morse_desc_mat_free(&(descVT)) ); + } + morse_sequence_wait(morse, sequence); + + subA = NULL; + subT = NULL; + subUVT = NULL; + if ( jobu != MorseNoVec ) { + if ( M < N ){ + subA = morse_desc_submatrix(&descA, descA.mb, 0, descA.m-descA.mb, descA.n-descA.nb); + subUVT = morse_desc_submatrix(&descU, descU.mb, 0, descU.m-descU.mb, descU.n); + subT = morse_desc_submatrix(&descT, descT.mb, 0, descT.m-descT.mb, descT.n-descT.nb); + morse_pzunmqr( MorseLeft, MorseNoTrans, + subA, subUVT, subT, + sequence, request ); + } + else { + morse_pzunmqr( MorseLeft, MorseNoTrans, + &descA, &descU, &descT, + sequence, request ); + } + } + + if ( jobvt != MorseNoVec ) { + if ( M < N ){ + morse_pzunmlq( MorseRight, MorseNoTrans, + &descA, &descVT, &descT, + sequence, request ); + } + else { + subA = morse_desc_submatrix(&descA, 0, descA.nb, descA.m-descA.mb, descA.n -descA.nb); + subUVT = morse_desc_submatrix(&descVT, 0, descVT.nb, descVT.m, descVT.n-descVT.nb); + subT = morse_desc_submatrix(&descT, 0, descT.nb, descT.m-descT.mb, descT.n -descT.nb); + morse_pzunmlq( MorseRight, MorseNoTrans, + subA, subUVT, subT, + sequence, request ); + } + } + + /* Transform U and VT into lapack layout */ + if ( jobu != MorseNoVec ) { + morse_zooptile2lap( descU, U, NB, NB, LDU, M, sequence, request ); + } + if ( jobvt != MorseNoVec ) { + morse_zooptile2lap( descVT, VT, NB, NB, LDVT, N, sequence, request ); + } + + morse_sequence_wait(morse, sequence); + if (subA) { + free(subA); free(subUVT); free(subT); + } + + /* Solve the bidiagonal SVD problem */ + /* On exit, U and VT are updated with bidiagonal matrix singular vectors */ + info = LAPACKE_zbdsqr( LAPACK_COL_MAJOR, 'U', + MINMN, ncvt, nru, 0, + S, E, + VT, LDVT, U, LDU, NULL, 1 ); + if (info != 0) { + fprintf(stderr, "MORSE_zgesvd_Tile_Async: LAPACKE_zbdsqr = %d\n", info ); + } + + if (jobu != MorseNoVec) + morse_desc_mat_free( &descU ); + if (jobvt != MorseNoVec) + morse_desc_mat_free( &descVT ); + free(E); + return MORSE_SUCCESS; +} diff --git a/compute/zheevd.c b/compute/zheevd.c new file mode 100644 index 0000000000000000000000000000000000000000..7272b0c243065b70712c0f5a64b27a3626479764 --- /dev/null +++ b/compute/zheevd.c @@ -0,0 +1,509 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * @file zheevd.c + * + * PLASMA computational routines + * Release Date: November, 15th 2009 + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Azzam Haidar + * @author Hatem Ltaief + * @date 2010-11-15 + * @precisions normal z -> s d c + * + **/ +#include <coreblas/include/lapacke.h> +#include "control/common.h" + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t + * + * MORSE_zheevd - Computes all eigenvalues and, optionally, + * eigenvectors of a complex Hermitian matrix A. The matrix A is + * preliminary reduced to tridiagonal form using a two-stage + * approach: + * First stage: reduction to band tridiagonal form; + * Second stage: reduction from band to tridiagonal form. + * + ******************************************************************************* + * + * @param[in] jobz + * Intended usage: + * = MorseNoVec: computes eigenvalues only; + * = MorseVec: computes eigenvalues and eigenvectors. + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or + * lower triangular: + * = MorseUpper: Upper triangle of A is stored; + * = MorseLower: Lower triangle of A is stored. + * + * @param[in] N + * The order of the matrix A. N >= 0. + * + * @param[in,out] A + * On entry, the symmetric (or Hermitian) matrix A. + * If uplo = MorseUpper, the leading N-by-N upper triangular + * part of A contains the upper triangular part of the matrix + * A, and the strictly lower triangular part of A is not + * referenced. + * If uplo = MorseLower, the leading N-by-N lower triangular + * part of A contains the lower triangular part of the matrix + * A, and the strictly upper triangular part of A is not + * referenced. + * On exit, the lower triangle (if uplo = MorseLower) or the + * upper triangle (if uplo = MorseUpper) of A, including the + * diagonal, is destroyed. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,N). + * + * @param[out] W + * On exit, if info = 0, the eigenvalues. + * + * @param[in, out] descT + * On entry, descriptor as return by MORSE_Alloc_Workspace_zheevd + * On exit, contains auxiliary factorization data. + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * \retval >0 if INFO = i, the algorithm failed to converge; i + * off-diagonal elements of an intermediate tridiagonal + * form did not converge to zero. + * + ******************************************************************************* + * + * @sa MORSE_zheevd_Tile + * @sa MORSE_zheevd_Tile_Async + * @sa MORSE_cheevd + * @sa MORSE_dsyev + * @sa MORSE_ssyev + * + ******************************************************************************/ +int MORSE_zheevd(MORSE_enum jobz, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA, + double *W, + MORSE_desc_t *descT) +{ + int NB; + int status; + MORSE_context_t *morse; + MORSE_sequence_t *sequence = NULL; + MORSE_request_t request = MORSE_REQUEST_INITIALIZER; + MORSE_desc_t descA; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zheevd", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + + /* Check input arguments */ + if (jobz != MorseNoVec && jobz != MorseVec) { + morse_error("MORSE_zheevd", "illegal value of jobz"); + return -1; + } + if (uplo != MorseLower && uplo != MorseUpper) { + morse_error("MORSE_zheevd", "illegal value of uplo"); + return -2; + } + if (N < 0) { + morse_error("MORSE_zheevd", "illegal value of N"); + return -3; + } + if (LDA < max(1, N)) { + morse_error("MORSE_zheevd", "illegal value of LDA"); + return -5; + } + + /* Quick return */ + if (N == 0) + return MORSE_SUCCESS; + + /* Tune NB & IB depending on N; Set NBNB */ + status = morse_tune(MORSE_FUNC_ZHEEVD, N, N, 0); + if (status != MORSE_SUCCESS) { + morse_error("MORSE_zheevd", "morse_tune() failed"); + return status; + } + + /* Set NT */ + NB = MORSE_NB; + + morse_sequence_create(morse, &sequence); + + /* if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) { */ + morse_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N, sequence, &request, + morse_desc_mat_free(&(descA)) ); + /* } else { */ + /* morse_ziplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N, */ + /* sequence, &requoest); */ + /* } */ + + /* Call the tile interface */ + MORSE_zheevd_Tile_Async(jobz, uplo, &descA, W, descT, sequence, &request); + + /* if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) { */ + morse_zooptile2lap( descA, A, NB, NB, LDA, N, sequence, &request); + morse_sequence_wait(morse, sequence); + morse_desc_mat_free(&descA); + /* } else { */ + /* morse_ziptile2lap( descA, A, NB, NB, LDA, N, sequence, &request); */ + /* morse_sequence_wait(morse, sequence); */ + /* } */ + + status = sequence->status; + morse_sequence_destroy(morse, sequence); + return status; +} +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t_Tile + * + * MORSE_zheevd_Tile - Computes all eigenvalues and, optionally, eigenvectors of a + * complex Hermitian matrix A using a two-stage approach: + * First stage: reduction to band tridiagonal form; + * Second stage: reduction from band to tridiagonal form. + * + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] jobz + * Intended usage: + * = MorseNoVec: computes eigenvalues only; + * = MorseVec: computes eigenvalues and eigenvectors. + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or + * lower triangular: + * = MorseUpper: Upper triangle of A is stored; + * = MorseLower: Lower triangle of A is stored. + * + * @param[in,out] A + * On entry, the symmetric (or Hermitian) matrix A. + * If uplo = MorseUpper, the leading N-by-N upper triangular + * part of A contains the upper triangular part of the matrix + * A, and the strictly lower triangular part of A is not + * referenced. + * If UPLO = 'L', the leading N-by-N lower triangular part of + * A contains the lower triangular part of the matrix A, and + * the strictly upper triangular part of A is not referenced. + * On exit, if jobz = MorseVec, then if return value = 0, A + * contains the orthonormal eigenvectors of the matrix A. + * If jobz = MorseNoVec, then on exit the lower triangle (if + * uplo = MorseLower) or the upper triangle (if uplo = + * MorseUpper) of A, including the diagonal, is destroyed.* + * + * @param[out] W + * On exit, if info = 0, the eigenvalues. + * + * @param[in,out] T + * On entry, descriptor as return by + * MORSE_Alloc_Workspace_zheevd + * On exit, contains auxiliary factorization data. + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * \retval >0 if INFO = i, the algorithm failed to converge; i + * off-diagonal elements of an intermediate tridiagonal + * form did not converge to zero. + * + ******************************************************************************* + * + * @sa MORSE_zheevd_Tile + * @sa MORSE_zheevd_Tile_Async + * @sa MORSE_cheevd + * @sa MORSE_dsyev + * @sa MORSE_ssyev + * + ******************************************************************************/ +int MORSE_zheevd_Tile(MORSE_enum jobz, MORSE_enum uplo, + MORSE_desc_t *A, double *W, MORSE_desc_t *T) +{ + MORSE_context_t *morse; + MORSE_sequence_t *sequence = NULL; + MORSE_request_t request = MORSE_REQUEST_INITIALIZER; + int status; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zheevd_Tile", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + morse_sequence_create(morse, &sequence); + MORSE_zheevd_Tile_Async(jobz, uplo, A, W, T, sequence, &request); + morse_sequence_wait(morse, sequence); + + RUNTIME_desc_getoncpu(A); + RUNTIME_desc_getoncpu(T); + + status = sequence->status; + morse_sequence_destroy(morse, sequence); + return status; +} + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t_Tile_Async + * + * MORSE_zheevd_Tile_Async - Computes all eigenvalues and, + * optionally, eigenvectors of a complex Hermitian matrix A using a + * two-stage approach: + * First stage: reduction to band tridiagonal form; + * Second stage: reduction from band to tridiagonal form. + * + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] jobz + * Intended usage: + * = MorseNoVec: computes eigenvalues only; + * = MorseVec: computes eigenvalues and eigenvectors. + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or + * lower triangular: + * = MorseUpper: Upper triangle of A is stored; + * = MorseLower: Lower triangle of A is stored. + * + * @param[in,out] A + * On entry, the symmetric (or Hermitian) matrix A. + * If uplo = MorseUpper, the leading N-by-N upper triangular + * part of A contains the upper triangular part of the matrix + * A, and the strictly lower triangular part of A is not + * referenced. + * If UPLO = 'L', the leading N-by-N lower triangular part of + * A contains the lower triangular part of the matrix A, and + * the strictly upper triangular part of A is not referenced. + * On exit, if jobz = MorseVec, then if return value = 0, A + * contains the orthonormal eigenvectors of the matrix A. + * If jobz = MorseNoVec, then on exit the lower triangle (if + * uplo = MorseLower) or the upper triangle (if uplo = + * MorseUpper) of A, including the diagonal, is destroyed.* + * + * @param[out] W + * On exit, if info = 0, the eigenvalues. + * + * @param[in,out] T + * On entry, descriptor as return by + * MORSE_Alloc_Workspace_zheevd + * On exit, contains auxiliary factorization data. + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa MORSE_zheevd + * @sa MORSE_zheevd_Tile + * @sa MORSE_cheevd_Tile_Async + * @sa MORSE_dsyev_Tile_Async + * @sa MORSE_ssyev_Tile_Async + * + ******************************************************************************/ +int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo, + MORSE_desc_t *A, double *W, MORSE_desc_t *T, + MORSE_sequence_t *sequence, MORSE_request_t *request) +{ + MORSE_context_t *morse; + MORSE_desc_t descA; + MORSE_desc_t descT; + MORSE_Complex64_t *Q2; + int N, NB, status; + double *E; + MORSE_Complex64_t *V; + MORSE_desc_t descQ2, descV; + MORSE_desc_t *subA, *subQ, *subT; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zheevd_Tile_Async", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + morse_fatal_error("MORSE_zheevd_Tile_Async", "NULL sequence"); + return MORSE_ERR_UNALLOCATED; + } + if (request == NULL) { + morse_fatal_error("MORSE_zheevd_Tile_Async", "NULL request"); + return MORSE_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == MORSE_SUCCESS) + request->status = MORSE_SUCCESS; + else + return morse_request_fail(sequence, request, MORSE_ERR_SEQUENCE_FLUSHED); + + /* Check descriptors for correctness */ + if (morse_desc_check(A) != MORSE_SUCCESS) { + morse_error("MORSE_zheevd_Tile_Async", "invalid descriptor"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } else { + descA = *A; + } + if (morse_desc_check(T) != MORSE_SUCCESS) { + morse_error("MORSE_zheevd_Tile_Async", "invalid descriptor"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } else { + descT = *T; + } + /* Check input arguments */ + if (jobz != MorseNoVec && jobz != MorseVec) { + morse_error("MORSE_zheevd_Tile_Async", "illegal value of jobz"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + if (uplo != MorseLower && uplo != MorseUpper) { + morse_error("MORSE_zheevd_Tile_Async", "illegal value of uplo"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + if (descA.m != descA.n) { + morse_error("MORSE_zheevd_Tile_Async", "matrix need to be square"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + if (descA.nb != descA.mb) { + morse_error("MORSE_zheevd_Tile_Async", "only square tiles supported"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + + N = descA.m; + NB = min(descA.mb,descA.m); + + /* Allocate data structures for reduction to tridiagonal form */ + E = malloc( (N - 1) * sizeof(double) ); + if (E == NULL) { + morse_error("MORSE_zheevd_Tile_Async", "malloc(E) failed"); + free(E); + return MORSE_ERR_OUT_OF_RESOURCES; + } + + if (jobz == MorseVec){ + /* Have to synchrone right now */ + Q2 = malloc( N * N * sizeof(MORSE_Complex64_t)); + /* For bug in lapacke */ + memset( Q2, 0, N * N * sizeof(MORSE_Complex64_t)); + } + + status = MORSE_zhetrd_Tile_Async( jobz, uplo, + A, W, E, T, + Q2, N, + sequence, request ); + if (status != 0) { + morse_error("MORSE_zheevd_Tile", "MORSE_zhetrd failed"); + } + + + if (jobz == MorseNoVec){ + /* Tridiagonal eigensolver */ + status = LAPACKE_zstedc( LAPACK_COL_MAJOR, + morse_lapack_const(jobz), + N, W, E, + NULL, N ); + if (status != 0) { + morse_error("MORSE_zheevd_Tile_Async", "LAPACKE_zstedc failed"); + } + free(E); + return MORSE_SUCCESS; + } + + V = malloc( N * N * sizeof(MORSE_Complex64_t) ); + if (V == NULL) { + morse_error("MORSE_zheevd_Tile_Async", "malloc(V) failed"); + free(V); + return MORSE_ERR_OUT_OF_RESOURCES; + } + /* For bug in lapacke */ + memset(V, 0, N * N * sizeof(MORSE_Complex64_t)); + + /* Tridiagonal eigensolver */ + /* V contains the eigenvectors of the tridiagonal system on exit */ + status = LAPACKE_zstedc( LAPACK_COL_MAJOR, + morse_lapack_const(MorseIvec), + N, W, E, + V, N ); + if (status != 0) { + morse_error("MORSE_zheevd_Tile_Async", "LAPACKE_zstedc failed"); + } + + /* Convert matrices in tile format */ + /* A/T from MORSE_zhetrd refer to Q1 (tile layout) */ + /* Q from MORSE_zhetrd refers to Q2 (lapack layout) */ + /* V from LAPACKE_zstedc refers to V (lapack layout) */ + /* The final eigenvectors are (Q1 Q2 V) or (Q1^h Q2 V) */ + morse_zooplap2tile( descQ2, Q2, NB, NB, N, N, 0, 0, N, N, sequence, request, + morse_desc_mat_free(&(descQ2)) ); + morse_zooplap2tile( descV, V, NB, NB, N, N, 0, 0, N, N, sequence, request, + morse_desc_mat_free(&(descQ2)); morse_desc_mat_free(&(descV)) ); + if (uplo == MorseLower) + { + subA = morse_desc_submatrix(&descA, descA.mb, 0, descA.m -descA.mb, descA.n-descA.nb); + subQ = morse_desc_submatrix(&descQ2, descQ2.mb, 0, descQ2.m-descQ2.mb, descQ2.n ); + subT = morse_desc_submatrix(&descT, descT.mb, 0, descT.m -descT.mb, descT.n-descT.nb); + + /* Compute Q2 = Q1 * Q2 */ + morse_pzunmqr( MorseLeft, MorseNoTrans, + subA, subQ, subT, + sequence, request ); + + /* Compute the final eigenvectors A = (Q1 * Q2) * V */ + morse_pzgemm( MorseNoTrans, MorseNoTrans, + 1.0, &descQ2, &descV, + 0.0, &descA, + sequence, request); + + } + else { + subA = morse_desc_submatrix(&descA, 0, descA.nb, descA.m -descA.mb, descA.n -descA.nb ); + subQ = morse_desc_submatrix(&descQ2, descQ2.mb, 0, descQ2.m-descQ2.mb, descQ2.n ); + subT = morse_desc_submatrix(&descT, 0, descT.nb, descT.m -descT.mb, descT.n -descT.nb ); + + /* Compute Q2 = Q1^h * Q2 */ + morse_pzunmlq( MorseLeft, MorseConjTrans, + subA, subQ, subT, + sequence, request ); + + /* Compute the final eigenvectors A = (Q1^h * Q2) * V */ + morse_pzgemm( MorseNoTrans, MorseNoTrans, + 1.0, &descQ2, &descV, + 0.0, &descA, + sequence, request); + } + + morse_sequence_wait(morse, sequence); + + free(subA); free(subQ); free(subT); + morse_desc_mat_free(&descQ2); + free(Q2); + + morse_desc_mat_free(&descV); + free(V); + + free(E); + return MORSE_SUCCESS; +} diff --git a/compute/zhetrd.c b/compute/zhetrd.c new file mode 100644 index 0000000000000000000000000000000000000000..6e11245aa99f5e73c98dfc28cf30647c26c6dde2 --- /dev/null +++ b/compute/zhetrd.c @@ -0,0 +1,423 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * @file zhetrd.c + * + * PLASMA computational routines + * Release Date: November, 15th 2009 + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Azzam Haidar + * @author Hatem Ltaief + * @author Gregoire Pichon + * @author Mathieu Faverge + * @date 2010-11-15 + * @precisions normal z -> s d c + * + **/ +#include <coreblas/include/lapacke.h> +#include "control/common.h" + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t + * + * MORSE_zhetrd - reduces a complex Hermitian matrix A to real symmetric + * tridiagonal form S using a two-stage approach + * First stage: reduction to band tridiagonal form (unitary Q1); + * Second stage: reduction from band to tridiagonal form (unitary + * Q2). Let Q = Q1 * Q2 be the global unitary transformation; Q**H * + * A * Q = S. + * + ******************************************************************************* + * + * @param[in] jobz + * Intended usage: + * = MorseNoVec: computes tridiagonal only; + * = MorseVec: computes tridiagonal and generate the orthogonal matrix Q. + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or + * lower triangular: + * = MorseUpper:: Upper triangle of A is stored; + * = MorseLower: Lower triangle of A is stored. + * + * @param[in] N + * The order of the matrix A. N >= 0. + * + * @param[in,out] A + * On entry, the symmetric (or Hermitian) matrix A. + * If uplo = MorseUpper, the leading N-by-N upper triangular + * part of A contains the upper triangular part of the matrix + * A, and the strictly lower triangular part of A is not + * referenced. + * If uplo = MorseLower, the leading N-by-N lower triangular + * part of A contains the lower triangular part of the matrix + * A, and the strictly upper triangular part of A is not + * referenced. + * On exit, the lower triangle (if uplo = MorseLower) or the + * upper triangle (if uplo = MorseUpper) of A, including the + * diagonal, is destroyed. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,N). + * + * @param[out] D + * On exit, the diagonal elements of the tridiagonal matrix: + * D(i) = A(i,i). + * + * @param[out] E + * On exit, he off-diagonal elements of the tridiagonal matrix: + * E(i) = A(i,i+1) if uplo = MorseUpper, E(i) = A(i+1,i) if uplo = MorseLower. + * + * @param[out] descT + * On entry, descriptor as return by MORSE_Alloc_Workspace_zhetrd + * On exit, contains auxiliary factorization data. + * + * @param[out] Q + * On exit, if jobz = MorseVec, then if return value = 0, Q + * contains the N-by-N unitary matrix Q. + * If jobz = MorseNoVec, then it is not referenced. + * + * @param[in] LDQ + * The leading dimension of the array Q. LDQ >= N. + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * \retval >0 if INFO = i, the algorithm failed to converge; i + * off-diagonal elements of an intermediate tridiagonal + * form did not converge to zero. + * + ******************************************************************************* + * + * @sa MORSE_zhetrd_Tile + * @sa MORSE_zhetrd_Tile_Async + * @sa MORSE_chetrd + * @sa MORSE_dsytrd + * @sa MORSE_ssytrd + * + ******************************************************************************/ +int MORSE_zhetrd(MORSE_enum jobz, MORSE_enum uplo, int N, + MORSE_Complex64_t *A, int LDA, + double *D, + double *E, + MORSE_desc_t *descT, + MORSE_Complex64_t *Q, int LDQ) +{ + int NB; + int status; + MORSE_context_t *morse; + MORSE_sequence_t *sequence = NULL; + MORSE_request_t request = MORSE_REQUEST_INITIALIZER; + MORSE_desc_t descA; + + morse = morse_context_self(); + if (morse == NULL) { + morse_error("MORSE_zhetrd", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + + /* Check input arguments */ + if (jobz != MorseNoVec && jobz != MorseVec) { + morse_error("MORSE_zhetrd", "illegal value of jobz"); + return -1; + } + if (uplo != MorseLower && uplo != MorseUpper) { + morse_error("MORSE_zhetrd", "illegal value of uplo"); + return -1; + } + if (N < 0) { + morse_error("MORSE_zhetrd", "illegal value of N"); + return -2; + } + if (LDA < max(1, N)) { + morse_error("MORSE_zhetrd", "illegal value of LDA"); + return -4; + } + + /* Quick return */ + if (N == 0) + return MORSE_SUCCESS; + + /* Tune NB & IB depending on N; Set NBNB */ + status = morse_tune(MORSE_FUNC_ZHETRD, N, N, 0); + if (status != MORSE_SUCCESS) { + morse_error("MORSE_zhetrd", "morse_tune() failed"); + return status; + } + /* Set NT */ + NB = MORSE_NB; + + morse_sequence_create(morse, &sequence); + + /* if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) { */ + morse_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N, sequence, &request, + morse_desc_mat_free(&(descA)) ); + /* } else { */ + /* morse_ziplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N, */ + /* sequence, &request); */ + /* } */ + + /* Call the tile interface */ + MORSE_zhetrd_Tile_Async(jobz, uplo, &descA, D, E, descT, Q, LDQ, sequence, &request); + + /* if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) { */ + morse_zooptile2lap( descA, A, NB, NB, LDA, N, sequence, &request); + morse_sequence_wait(morse, sequence); + morse_desc_mat_free(&descA); + /* } else { */ + /* morse_ziptile2lap( descA, A, NB, NB, LDA, N, sequence, &request); */ + /* morse_sequence_wait(morse, sequence); */ + /* } */ + + status = sequence->status; + morse_sequence_destroy(morse, sequence); + return status; +} +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t_Tile + * + * MORSE_zhetrd_Tile - reduces a complex Hermitian matrix A to real symmetric + * tridiagonal form S using a two-stage approach + * First stage: reduction to band tridiagonal form (unitary Q1); + * Second stage: reduction from band to tridiagonal form (unitary Q2). + * Let Q = Q1 * Q2 be the global unitary transformation; + * Q**H * A * Q = S. + * Tile equivalent of MORSE_zhetrd(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] jobz + * Intended usage: + * = MorseNoVec: computes tridiagonal only; + * = MorseVec: computes tridiagonal and generate the orthogonal matrix Q. + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or lower triangular: + * = MorseUpper: Upper triangle of A is stored; + * = MorseLower: Lower triangle of A is stored. + * + * @param[in,out] A + * On entry, the symmetric (or Hermitian) matrix A. If uplo + * = MorseUpper, the leading N-by-N upper triangular part of + * A contains the upper triangular part of the matrix A, and + * the strictly lower triangular part of A is not referenced. + * If UPLO = 'L', the leading N-by-N lower triangular part of + * A contains the lower triangular part of the matrix A, and + * the strictly upper triangular part of A is not referenced. + * On exit, if jobz = MorseVec, then if return value = 0, A + * contains the orthonormal eigenvectors of the matrix A. + * If jobz = MorseNoVec, then on exit the lower triangle (if + * uplo = MorseLower) or the upper triangle (if uplo = + * MorseUpper) of A, including the diagonal, is destroyed.* + * + * @param[out] D + * On exit, the diagonal elements of the tridiagonal matrix: + * D(i) = A(i,i). + * + * @param[out] E + * On exit, he off-diagonal elements of the tridiagonal matrix: + * E(i) = A(i,i+1) if uplo = MorseUpper, + * E(i) = A(i+1,i) if uplo = MorseLower. + * + * @param[out] T + * On exit, auxiliary factorization data. + * + * @param[out] Q + * On exit, if jobz = MorseVec, then if return value = 0, Q + * contains the N-by-N unitary matrix Q. + * If jobz = MorseNoVec, then it is not referenced. + * + * @param[in] LDQ + * The leading dimension of the array Q. LDQ >= N. + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * \retval >0 if INFO = i, the algorithm failed to converge; i + * off-diagonal elements of an intermediate tridiagonal + * form did not converge to zero. + * + ******************************************************************************* + * + * @sa MORSE_zhetrd + * @sa MORSE_zhetrd_Tile_Async + * @sa MORSE_chetrd_Tile + * @sa MORSE_dsytrd_Tile + * @sa MORSE_ssytrd_Tile + * @sa MORSE_zhetrd_Tile + * + ******************************************************************************/ +int MORSE_zhetrd_Tile(MORSE_enum jobz, MORSE_enum uplo, + MORSE_desc_t *A, double *D, double *E, + MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ) +{ + MORSE_context_t *morse; + MORSE_sequence_t *sequence = NULL; + MORSE_request_t request = MORSE_REQUEST_INITIALIZER; + int status; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zhetrd_Tile", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + morse_sequence_create(morse, &sequence); + MORSE_zhetrd_Tile_Async(jobz, uplo, A, D, E, T, Q, LDQ, sequence, &request); + morse_sequence_wait(morse, sequence); + status = sequence->status; + morse_sequence_destroy(morse, sequence); + return status; +} + +/***************************************************************************//** + * + * @ingroup MORSE_Complex64_t_Tile_Async + * + * MORSE_zhetrd_Tile_Async - Computes all eigenvalues and, + * optionally, eigenvectors of a complex Hermitian matrix A using a + * two-stage approach: + * First stage: reduction to band tridiagonal form; + * Second stage: reduction from band to tridiagonal form. + * + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa MORSE_zhetrd + * @sa MORSE_zhetrd_Tile + * @sa MORSE_chetrd_Tile_Async + * @sa MORSE_dsytrd_Tile_Async + * @sa MORSE_ssytrd_Tile_Async + * + ******************************************************************************/ +int MORSE_zhetrd_Tile_Async(MORSE_enum jobz, + MORSE_enum uplo, + MORSE_desc_t *A, + double *W, + double *E, + MORSE_desc_t *T, + MORSE_Complex64_t *Q, int LDQ, + MORSE_sequence_t *sequence, MORSE_request_t *request) +{ + MORSE_context_t *morse; + MORSE_desc_t descA; + MORSE_desc_t descAB; + int N, NB, LDAB; + int status; + + morse = morse_context_self(); + if (morse == NULL) { + morse_fatal_error("MORSE_zhetrd_Tile_Async", "MORSE not initialized"); + return MORSE_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + morse_fatal_error("MORSE_zhetrd_Tile_Async", "NULL sequence"); + return MORSE_ERR_UNALLOCATED; + } + if (request == NULL) { + morse_fatal_error("MORSE_zhetrd_Tile_Async", "NULL request"); + return MORSE_ERR_UNALLOCATED; + } + + /* Check sequence status */ + if (sequence->status == MORSE_SUCCESS) + request->status = MORSE_SUCCESS; + else + return morse_request_fail(sequence, request, MORSE_ERR_SEQUENCE_FLUSHED); + + /* Check descriptors for correctness */ + if (morse_desc_check(A) != MORSE_SUCCESS) { + morse_error("MORSE_zhetrd_Tile_Async", "invalid descriptor"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } else { + descA = *A; + } + if (morse_desc_check(T) != MORSE_SUCCESS) { + morse_error("MORSE_zhetrd_Tile_Async", "invalid descriptor"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + + /* Check input arguments */ + if (jobz != MorseNoVec && jobz != MorseVec) { + morse_error("MORSE_zhetrd_Tile_Async", "illegal value of jobz"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + if (uplo != MorseLower && uplo != MorseUpper) { + morse_error("MORSE_zhetrd_Tile_Async", "illegal value of uplo"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + if (descA.m != descA.n) { + morse_error("MORSE_zhetrd_Tile_Async", "matrix need to be square"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + if (descA.nb != descA.mb) { + morse_error("MORSE_zhetrd_Tile_Async", "only square tiles supported"); + return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE); + } + + N = descA.m; + NB = descA.mb; + + /* Reduction to band. On exit, T contains reflectors */ + morse_pzhetrd_he2hb( uplo, A, T, + sequence, request ); + + LDAB = NB+1; + + /* Allocate band structure */ + morse_zdesc_alloc_diag( descAB, + LDAB, NB, /* mb, nb */ + LDAB, N, /* lm, ln */ + 0, 0, /* i, j */ + LDAB, N, /* m, n */ + 1, 1 ); + + /* Copy data into band structure */ + morse_pztile2band( uplo, A, &descAB, + sequence, request ); + morse_sequence_wait(morse, sequence); + + /* Reduce band matrix to tridiagonal matrix */ + status = LAPACKE_zhbtrd( LAPACK_COL_MAJOR, + morse_lapack_const(jobz), + morse_lapack_const(uplo), + N, NB, + (MORSE_Complex64_t *) descAB.mat, LDAB, + W, E, Q, LDQ ); + if (status != 0) { + morse_error("MORSE_zhetrd_Tile_Async", "LAPACKE_zhbtrd failed"); + } + + morse_desc_mat_free(&descAB); + return MORSE_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index 53ee78e874412b9e0f977f44aec14fbc8f32ae41..fd6051a4966326ae600e9904bf7536f4d0dff4c8 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -84,25 +84,22 @@ int morse_zshift(MORSE_context_t *morse, int m, int n, MORSE_Complex64_t *A, /***************************************************************************//** * Declarations of parallel functions (dynamic scheduling) - alphabetical order **/ -void morse_pzbarrier_tl2pnl(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzbarrier_pnl2tl(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzbarrier_tl2row(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzbarrier_row2tl(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzgebrd_tb2bd(MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzgebrd_ge2tb(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); +void morse_pzbarrier_tl2pnl(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); +void morse_pzbarrier_tl2row(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); +void morse_pzgebrd_gb2bd(MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); +void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgemm(MORSE_enum transA, MORSE_enum transB, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzgerbh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzgerbbrh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetmi2(MORSE_enum idep, MORSE_enum odep, MORSE_enum storev, int m, int n, int mb, int nb, MORSE_Complex64_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetrf_nopiv(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetrf_reclap(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetrf_rectil(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzhbcpy_t2bl(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *AB, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzhegst(MORSE_enum itype, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); #ifdef COMPLEX void morse_pzhemm(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request); @@ -136,6 +133,7 @@ void morse_pzsymm(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MOR void morse_pzsyrk(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzsyr2k(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzsytrf(MORSE_enum uplo, MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); +void morse_pztile2band(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *descAB, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pztradd(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pztrmm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pztrsm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index a48dfdddc1f9c7a9bb93a30567ee658a14d3d270..82813231b0cd3e147e1e676142137ed312504136 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -44,6 +44,8 @@ set(ZSRC core_zgetrf.c core_zgetrf_incpiv.c core_zgetrf_nopiv.c + core_zhe2ge.c + core_zherfb.c core_zhemm.c core_zher2k.c core_zherk.c @@ -56,6 +58,7 @@ set(ZSRC core_zlantr.c core_zlaset2.c core_zlaset.c + core_zlatro.c core_zlauum.c core_zpamm.c core_zparfb.c @@ -79,6 +82,8 @@ set(ZSRC core_ztslqt.c core_ztsmlq.c core_ztsmqr.c + core_ztsmlq_hetra1.c + core_ztsmqr_hetra1.c core_ztsqrt.c core_ztstrf.c core_zttlqt.c diff --git a/coreblas/compute/core_zhe2ge.c b/coreblas/compute/core_zhe2ge.c new file mode 100644 index 0000000000000000000000000000000000000000..53b0b24ddf15637459866b578abfae6ee64c6d08 --- /dev/null +++ b/coreblas/compute/core_zhe2ge.c @@ -0,0 +1,95 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zhe2ge.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.5.0 + * @comment This file has been automatically generated + * from Plasma 2.5.0 for MORSE 1.0.0 + * @author Julien Langou + * @author Henricus Bouwmeester + * @author Mathieu Faverge + * @author Emmanuel Agullo + * @author Cedric Castagnede + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "coreblas/include/lapacke.h" +#include "coreblas/include/coreblas.h" + +/***************************************************************************//** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ + +void CORE_zhe2ge(MORSE_enum uplo, int M, int N, + const MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB) +{ + const MORSE_Complex64_t *Aptr; + MORSE_Complex64_t *Bptr, *BTptr; + int i, j; + + Aptr = A; + Bptr = B; + BTptr = B; + + if (uplo == MorseLower){ + for (j = 0; j < N; j++){ + /* Diagonal element */ + *Bptr = *Aptr; + Bptr++; Aptr++; + + /* Outside the diagonal */ + BTptr = B + j + (j+1) * LDB; + for (i = j+1; i < M; i++, Bptr++, Aptr++, BTptr += LDB) { + *Bptr = *Aptr; + *BTptr = conj( *Aptr ); + } + Aptr += (LDA - i + j + 1); + Bptr += (LDB - i + j + 1); + + /* Bptr[ j * LDB + j ] = A[ j * LDA + j ]; */ + + /* for (i = j+1; i < M; i++) { */ + /* Bptr [ j * LDB + i ] = A[ j * LDA + i ]; */ + /* BTptr[ i * LDB + j ] = conj(A[ j * LDA + i ]); */ + /* } */ + } + } + else{ + for (j = 0; j < N; j++){ + /* for (i = 0; i < j; i++) { */ + /* Bptr [ j * LDB + i ] = A[ j * LDA + i ]; */ + /* BTptr[ i * LDB + j ] = conj(A[ j * LDA + i ]); */ + /* } */ + /* Bptr[ j * LDB + j ] = A[ j * LDA + j ]; */ + + BTptr = B + j; + for (i = 0; i < j; i++, Bptr++, Aptr++, BTptr += LDB) { + *Bptr = *Aptr; + *BTptr = conj( *Aptr ); + } + *Bptr = *A; + + Aptr += (LDA - i); + Bptr += (LDB - i); + } + } +} + diff --git a/coreblas/compute/core_zherfb.c b/coreblas/compute/core_zherfb.c new file mode 100644 index 0000000000000000000000000000000000000000..3f2079c20b711e2a3fea1fd05918cf80448a833c --- /dev/null +++ b/coreblas/compute/core_zherfb.c @@ -0,0 +1,174 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zherfb.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Hatem Ltaief + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "coreblas/include/coreblas.h" +#include "coreblas/include/lapacke.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_MORSE_Complex64_t + * + * CORE_zherfb overwrites the symmetric complex N-by-N tile C with + * + * Q**T*C*Q + * + * where Q is a complex unitary matrix defined as the product of k + * elementary reflectors + * + * Q = H(1) H(2) . . . H(k) + * + * as returned by CORE_zgeqrt. Only MorseLower supported! + * + ******************************************************************************* + * + * @param[in] uplo + * TODO comment + * + * @param[in] n + * The number of rows/columns of the tile C. N >= 0. + * + * @param[in] k + * The number of elementary reflectors whose product defines + * the matrix Q. K >= 0. + * + * @param[in] ib + * The inner-blocking size. IB >= 0. + * + * @param[in] nb + * The blocking size. NB >= 0. + * + * @param[in] A + * The i-th column must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_zgeqrt in the first k columns of its array argument A. + * + * @param[in] lda + * The leading dimension of the array A. LDA >= max(1,N). + * + * @param[in] T + * The IB-by-K triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] ldt + * The leading dimension of the array T. LDT >= IB. + * + * @param[in,out] C + * On entry, the symmetric N-by-N tile C. + * On exit, C is overwritten by Q**T*C*Q. + * + * @param[in] ldc + * The leading dimension of the array C. LDC >= max(1,M). + * + * @param[in,out] WORK + * On exit, if INFO = 0, WORK(1) returns the optimal LDWORK. + * + * @param[in] ldwork + * The dimension of the array WORK. LDWORK >= max(1,N); + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * + ******************************************************************************/ +#if defined(MORSE_HAVE_WEAK) +#pragma weak CORE_zherfb = PCORE_zherfb +#define CORE_zherfb PCORE_zherfb +#define CORE_zunmlq PCORE_zunmlq +#define CORE_zunmqr PCORE_zunmqr +int CORE_zunmlq(MORSE_enum side, MORSE_enum trans, + int M, int N, int IB, int K, + const MORSE_Complex64_t *V, int LDV, + const MORSE_Complex64_t *T, int LDT, + MORSE_Complex64_t *C, int LDC, + MORSE_Complex64_t *WORK, int LDWORK); +int CORE_zunmqr(MORSE_enum side, MORSE_enum trans, + int M, int N, int K, int IB, + const MORSE_Complex64_t *V, int LDV, + const MORSE_Complex64_t *T, int LDT, + MORSE_Complex64_t *C, int LDC, + MORSE_Complex64_t *WORK, int LDWORK); +#endif +int CORE_zherfb( MORSE_enum uplo, int n, + int k, int ib, int nb, + const MORSE_Complex64_t *A, int lda, + const MORSE_Complex64_t *T, int ldt, + MORSE_Complex64_t *C, int ldc, + MORSE_Complex64_t *WORK, int ldwork ) +{ + /* Check input arguments */ + if ((uplo != MorseUpper) && (uplo != MorseLower)) { + coreblas_error(1, "Illegal value of uplo"); + return -1; + } + if (n < 0) { + coreblas_error(2, "Illegal value of n"); + return -2; + } + if (k < 0) { + coreblas_error(3, "Illegal value of k"); + return -3; + } + if (ib < 0) { + coreblas_error(4, "Illegal value of ib"); + return -4; + } + if (nb < 0) { + coreblas_error(5, "Illegal value of nb"); + return -5; + } + if ( (lda < max(1,n)) && (n > 0) ) { + coreblas_error(7, "Illegal value of lda"); + return -7; + } + if ( (ldt < max(1,ib)) && (ib > 0) ) { + coreblas_error(9, "Illegal value of ldt"); + return -9; + } + if ( (ldc < max(1,n)) && (n > 0) ) { + coreblas_error(11, "Illegal value of ldc"); + return -11; + } + + if (uplo == MorseLower) { + /* Left */ + CORE_zunmqr(MorseLeft, MorseConjTrans, n, n, k, ib, + A, lda, T, ldt, C, ldc, WORK, ldwork); + /* Right */ + CORE_zunmqr(MorseRight, MorseNoTrans, n, n, k, ib, + A, lda, T, ldt, C, ldc, WORK, ldwork); + } + else { + /* Right */ + CORE_zunmlq(MorseRight, MorseConjTrans, n, n, k, ib, + A, lda, T, ldt, C, ldc, WORK, ldwork); + /* Left */ + CORE_zunmlq(MorseLeft, MorseNoTrans, n, n, k, ib, + A, lda, T, ldt, C, ldc, WORK, ldwork); + } + return 0; +} diff --git a/coreblas/compute/core_zlatro.c b/coreblas/compute/core_zlatro.c new file mode 100644 index 0000000000000000000000000000000000000000..e7449e0ab8a1861283a60bbc9fc08fcd7dace45a --- /dev/null +++ b/coreblas/compute/core_zlatro.c @@ -0,0 +1,164 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zlatro.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "coreblas/include/coreblas.h" +#include "coreblas/include/lapacke.h" + +/***************************************************************************//** + * + * @ingroup CORE_MORSE_Complex64_t + * + * CORE_zlatro transposes a m-by-n matrix out of place. + * + ******************************************************************************* + * + * @param[in] uplo + * Specifies whether the matrix A is upper triangular or lower + * triangular: + * = MorseUpper: the upper triangle of A and the lower triangle of B + * are referenced. + * = MorseLower: the lower triangle of A and the upper triangle of B + * are referenced. + * = MorseUpperLower: All A and B are referenced. + * + * @param[in] trans + * Specifies whether the matrix A is transposed, not transposed or + * conjugate transposed: + * = MorseNoTrans: B is a copy of A (equivalent to zlacpy); + * = MorseTrans: B is the transpose of A; + * = MorseConjTrans: B is the conjugate transpose of A. + * + * @param[in] M + * Number of rows of the matrix A and number of columns of the matrix B, if trans == Pasma[Conj]Trans. + * Number of rows of the matrix A and the matrix B, if trans == PasmaNoTrans. + * + * @param[in] N + * Number of columns of the matrix A and number of rows of the matrix B, if trans == Pasma[Conj]Trans. + * Number of columns of the matrix A and of the matrix B, if trans == MorseNoTrans. + * + * @param[in] A + * Matrix of size LDA-by-N, if trans == Pasma[Conj]Trans. + * Matrix of size LDA-by-M, if trans == PasmaNoTrans. + * + * @param[in] LDA + * The leading dimension of the array A. + * LDA >= max(1,M), if trans == Pasma[Conj]Trans. + * LDA >= max(1,N), if trans == PasmaNoTrans. + * + * @param[out] B + * Matrix of size LDB-by-M, if trans == Pasma[Conj]Trans. + * Matrix of size LDB-by-N, if trans == PasmaNoTrans. + * + * @param[in] LDB + * The leading dimension of the array B. + * LDB >= max(1,N), if trans == Pasma[Conj]Trans. + * LDB >= max(1,M), if trans == PasmaNoTrans. + * + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if INFO = -k, the k-th argument had an illegal value + * + ******************************************************************************/ +#if defined(MORSE_HAVE_WEAK) +#pragma weak CORE_zlatro = PCORE_zlatro +#define CORE_zlatro PCORE_zlatro +#endif +int CORE_zlatro(MORSE_enum uplo, MORSE_enum trans, + int M, int N, + const MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB) +{ + int i, j; + + /* Check input arguments */ + if ((uplo != MorseUpper) && (uplo != MorseLower) && (uplo != MorseUpperLower) ) { + coreblas_error(1, "Illegal value of uplo"); + return -1; + } + if ((trans != MorseConjTrans) && (trans != MorseNoTrans) && (trans != MorseTrans) ) { + coreblas_error(2, "Illegal value of trans"); + return -2; + } + if (M < 0) { + coreblas_error(3, "Illegal value of M"); + return -3; + } + if (N < 0) { + coreblas_error(4, "Illegal value of N"); + return -4; + } + if ( (LDA < max(1,M)) && (M > 0) ) { + coreblas_error(6, "Illegal value of LDA"); + return -6; + } + if ( (LDB < max(1,N)) && (N > 0) ) { + coreblas_error(8, "Illegal value of LDB"); + return -8; + } + + if (trans == MorseNoTrans) { + CORE_zlacpy(uplo, M, N, A, LDA, B, LDB); + } + else { + if (trans == MorseConjTrans) { + if(uplo == MorseUpper) { + for(j=0; j<N; j++) + for(i=0; i<min(j+1,M); i++) + B[j+i*LDB] = conj(A[i+j*LDA]); + } + else if(uplo == MorseLower) { + for(j=0;j<N;j++) + for(i=j;i<M;i++) + B[j+i*LDB] = conj(A[i+j*LDA]); + } + else { + for(j=0;j<N;j++) + for(i=0;i<M;i++) + B[j+i*LDB] = conj(A[i+j*LDA]); + } + } + else { + if(uplo==MorseUpper) { + for(j=0;j<N;j++) + for(i=0;i<min(j+1,M);i++) + B[j+i*LDB] = A[i+j*LDA]; + } + else if(uplo==MorseLower) { + for(j=0;j<N;j++) + for(i=j;i<M;i++) + B[j+i*LDB] = A[i+j*LDA]; + } + else { + for(j=0;j<N;j++) + for(i=0;i<M;i++) + B[j+i*LDB] = A[i+j*LDA]; + } + } + } + + return MORSE_SUCCESS; +} diff --git a/coreblas/compute/core_ztsmlq_hetra1.c b/coreblas/compute/core_ztsmlq_hetra1.c new file mode 100644 index 0000000000000000000000000000000000000000..579ff9fdb4973372f1f9861fcae94fe081205154 --- /dev/null +++ b/coreblas/compute/core_ztsmlq_hetra1.c @@ -0,0 +1,177 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_ztsmlq_hetra1.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include <coreblas/include/lapacke.h> +#include "coreblas/include/coreblas.h" + +/***************************************************************************//** + * + * @ingroup CORE_MORSE_Complex64_t + * + * CORE_ztsmlq_hetra1: see CORE_ztsmlq + * + * This kernel applies a Right transformation on | A1' A2 | + * and does not handle the transpose of A1. + * Needs therefore to make the explicit transpose of A1 before + * and after the application of the block of reflectors + * Can be further optimized by changing accordingly the underneath + * kernel ztsrfb! + * + ******************************************************************************* + * + * @param[in] side + * @arg MorseLeft : apply Q or Q**H from the Left; + * @arg MorseRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg MorseNoTrans : No transpose, apply Q; + * @arg MorseConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] m1 + * The number of rows of the tile A1. m1 >= 0. + * + * @param[in] n1 + * The number of columns of the tile A1. n1 >= 0. + * + * @param[in] m2 + * The number of rows of the tile A2. m2 >= 0. + * m2 = m1 if side == MorseRight. + * + * @param[in] n2 + * The number of columns of the tile A2. n2 >= 0. + * n2 = n1 if side == MorseLeft. + * + * @param[in] k + * The number of elementary reflectors whose product defines + * the matrix Q. + * + * @param[in] ib + * The inner-blocking size. ib >= 0. + * + * @param[in,out] A1 + * On entry, the m1-by-n1 tile A1. + * On exit, A1 is overwritten by the application of Q. + * + * @param[in] lda1 + * The leading dimension of the array A1. lda1 >= max(1,m1). + * + * @param[in,out] A2 + * On entry, the m2-by-n2 tile A2. + * On exit, A2 is overwritten by the application of Q. + * + * @param[in] lda2 + * The leading dimension of the tile A2. lda2 >= max(1,m2). + * + * @param[in] V + * The i-th row must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_ZTSLQT in the first k rows of its array argument V. + *! + * @param[in] ldv + * The leading dimension of the array V. ldv >= max(1,K). + * + * @param[in] T + * The IB-by-n1 triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] ldt + * The leading dimension of the array T. ldt >= IB. + * + * @param[out] WORK + * Workspace array of size + * LDWORK-by-m1 if side == MorseLeft + * LDWORK-by-IB if side == MorseRight + * + * @param[in] ldwork + * The leading dimension of the array WORK. + * LDWORK >= max(1,IB) if side == MorseLeft + * LDWORK >= max(1,n1) if side == MorseRight + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * + ******************************************************************************/ +#if defined(MORSE_HAVE_WEAK) +#pragma weak CORE_ztsmlq_hetra1 = PCORE_ztsmlq_hetra1 +#define CORE_ztsmlq_hetra1 PCORE_ztsmlq_hetra1 +#define CORE_ztsmlq PCORE_ztsmlq +int CORE_ztsmlq(MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, int K, int IB, + MORSE_Complex64_t *A1, int lda1, + MORSE_Complex64_t *A2, int lda2, + const MORSE_Complex64_t *V, int ldv, + const MORSE_Complex64_t *T, int ldt, + MORSE_Complex64_t *WORK, int LDWORK); +#endif +int CORE_ztsmlq_hetra1( MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, + int k, int ib, + MORSE_Complex64_t *A1, int lda1, + MORSE_Complex64_t *A2, int lda2, + const MORSE_Complex64_t *V, int ldv, + const MORSE_Complex64_t *T, int ldt, + MORSE_Complex64_t *WORK, int ldwork) +{ + int i, j; + + if ( (m1 != n1) ) { + coreblas_error(3, "Illegal value of M1, N1"); + return -3; + } + + /* in-place transposition of A1 */ + for (j = 0; j < n1; j++){ + A1[j + j*lda1] = conj(A1[j + j*lda1]); + + for (i = j+1; i < m1; i++){ + *WORK = *(A1 + i + j*lda1); + *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1)); + *(A1 + j + i*lda1) = conj(*WORK); + } + } + + CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, + A1, lda1, A2, lda2, + V, ldv, T, ldt, + WORK, ldwork); + + /* in-place transposition of A1 */ + for (j = 0; j < n1; j++){ + A1[j + j*lda1] = conj(A1[j + j*lda1]); + + for (i = j+1; i < m1; i++){ + *WORK = *(A1 + i + j*lda1); + *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1)); + *(A1 + j + i*lda1) = conj(*WORK); + } + } + + return MORSE_SUCCESS; +} diff --git a/coreblas/compute/core_ztsmqr_hetra1.c b/coreblas/compute/core_ztsmqr_hetra1.c new file mode 100644 index 0000000000000000000000000000000000000000..76e7f65c08eedfa35057cb97c0350ba01b703bbc --- /dev/null +++ b/coreblas/compute/core_ztsmqr_hetra1.c @@ -0,0 +1,176 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_ztsmqr_hetra1.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Jakub Kurzak + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include <coreblas/include/lapacke.h> +#include "coreblas/include/coreblas.h" + +/***************************************************************************//** + * + * @ingroup CORE_MORSE_Complex64_t + * + * CORE_ztsmqr_hetra1: see CORE_ztsmqr + * + * This kernel applies a left transformation on | A1'| + * | A2 | + * + * Needs therefore to make the explicit transpose of A1 before + * and after the application of the block of reflectors + * Can be further optimized by changing accordingly the underneath + * kernel ztsrfb! + * + ******************************************************************************* + * + * @param[in] side + * @arg MorseLeft : apply Q or Q**H from the Left; + * @arg MorseRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg MorseNoTrans : No transpose, apply Q; + * @arg MorseConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] m1 + * The number of rows of the tile A1. M1 >= 0. + * + * @param[in] n1 + * The number of columns of the tile A1. N1 >= 0. + * + * @param[in] m2 + * The number of rows of the tile A2. M2 >= 0. + * M2 = M1 if side == MorseRight. + * + * @param[in] n2 + * The number of columns of the tile A2. N2 >= 0. + * N2 = N1 if side == MorseLeft. + * + * @param[in] k + * The number of elementary reflectors whose product defines + * the matrix Q. + * + * @param[in] ib + * The inner-blocking size. IB >= 0. + * + * @param[in,out] A1 + * On entry, the M1-by-N1 tile A1. + * On exit, A1 is overwritten by the application of Q. + * + * @param[in] lda1 + * The leading dimension of the array A1. LDA1 >= max(1,M1). + * + * @param[in,out] A2 + * On entry, the M2-by-N2 tile A2. + * On exit, A2 is overwritten by the application of Q. + * + * @param[in] lda2 + * The leading dimension of the tile A2. LDA2 >= max(1,M2). + * + * @param[in] V + * The i-th row must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_ZTSQRT in the first k columns of its array argument V. + * + * @param[in] ldv + * The leading dimension of the array V. LDV >= max(1,K). + * + * @param[in] T + * The IB-by-N1 triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] ldt + * The leading dimension of the array T. LDT >= IB. + * + * @param[out] WORK + * Workspace array of size + * LDWORK-by-N1 if side == MorseLeft + * LDWORK-by-IB if side == MorseRight + * + * @param[in] ldwork + * The leading dimension of the array WORK. + * LDWORK >= max(1,IB) if side == MorseLeft + * LDWORK >= max(1,M1) if side == MorseRight + * + ******************************************************************************* + * + * @return + * \retval MORSE_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * + ******************************************************************************/ +#if defined(MORSE_HAVE_WEAK) +#pragma weak CORE_ztsmqr_hetra1 = PCORE_ztsmqr_hetra1 +#define CORE_ztsmqr_hetra1 PCORE_ztsmqr_hetra1 +#define CORE_ztsmqr PCORE_ztsmqr +int CORE_ztsmqr(MORSE_enum side, MORSE_enum trans, + int M1, int N1, int M2, int N2, int K, int IB, + MORSE_Complex64_t *A1, int LDA1, + MORSE_Complex64_t *A2, int LDA2, + const MORSE_Complex64_t *V, int LDV, + const MORSE_Complex64_t *T, int LDT, + MORSE_Complex64_t *WORK, int LDWORK); +#endif +int CORE_ztsmqr_hetra1( MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, + int k, int ib, + MORSE_Complex64_t *A1, int lda1, + MORSE_Complex64_t *A2, int lda2, + const MORSE_Complex64_t *V, int ldv, + const MORSE_Complex64_t *T, int ldt, + MORSE_Complex64_t *WORK, int ldwork) +{ + int i, j; + + if ( (m1 != n1) ) { + coreblas_error(3, "Illegal value of M1, N1"); + return -3; + } + + /* in-place transposition of A1 */ + for (j = 0; j < n1; j++){ + A1[j + j*lda1] = conj(A1[j + j*lda1]); + + for (i = j+1; i < m1; i++){ + *WORK = *(A1 + i + j*lda1); + *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1)); + *(A1 + j + i*lda1) = conj(*WORK); + } + } + + CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + + /* in-place transposition of A1 */ + for (j = 0; j < n1; j++){ + A1[j + j*lda1] = conj(A1[j + j*lda1]); + + for (i = j+1; i < m1; i++){ + *WORK = *(A1 + i + j*lda1); + *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1)); + *(A1 + j + i*lda1) = conj(*WORK); + } + } + + return MORSE_SUCCESS; +} diff --git a/coreblas/include/coreblas_z.h b/coreblas/include/coreblas_z.h index 4c6ee39cae6b975da25162e7edb852db83cb484a..e944555626b2093e1f053ba0bd5a4b7c25fdebd7 100644 --- a/coreblas/include/coreblas_z.h +++ b/coreblas/include/coreblas_z.h @@ -107,6 +107,9 @@ int CORE_zgetrf_reclap(int M, int N, int CORE_zgetrf_rectil(const MORSE_desc_t A, int *IPIV, int *info); void CORE_zgetrip(int m, int n, MORSE_Complex64_t *A, MORSE_Complex64_t *work); +void CORE_zhe2ge(MORSE_enum uplo, int M, int N, + const MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *B, int LDB); int CORE_zhbelr(MORSE_enum uplo, int N, MORSE_desc_t *A, MORSE_Complex64_t *V, MORSE_Complex64_t *TAU, int st, int ed, int eltsize); @@ -315,14 +318,6 @@ int CORE_ztsmlq(MORSE_enum side, MORSE_enum trans, const MORSE_Complex64_t *V, int LDV, const MORSE_Complex64_t *T, int LDT, MORSE_Complex64_t *WORK, int LDWORK); -int CORE_ztsmlq_corner( int m1, int n1, int m2, int n2, int m3, int n3, - int k, int ib, int nb, - MORSE_Complex64_t *A1, int lda1, - MORSE_Complex64_t *A2, int lda2, - MORSE_Complex64_t *A3, int lda3, - const MORSE_Complex64_t *V, int ldv, - const MORSE_Complex64_t *T, int ldt, - MORSE_Complex64_t *WORK, int ldwork); int CORE_ztsmlq_hetra1( MORSE_enum side, MORSE_enum trans, int m1, int n1, int m2, int n2, int k, int ib, @@ -338,14 +333,6 @@ int CORE_ztsmqr(MORSE_enum side, MORSE_enum trans, const MORSE_Complex64_t *V, int LDV, const MORSE_Complex64_t *T, int LDT, MORSE_Complex64_t *WORK, int LDWORK); -int CORE_ztsmqr_corner( int m1, int n1, int m2, int n2, int m3, int n3, - int k, int ib, int nb, - MORSE_Complex64_t *A1, int lda1, - MORSE_Complex64_t *A2, int lda2, - MORSE_Complex64_t *A3, int lda3, - const MORSE_Complex64_t *V, int ldv, - const MORSE_Complex64_t *T, int ldt, - MORSE_Complex64_t *WORK, int ldwork); int CORE_ztsmqr_hetra1( MORSE_enum side, MORSE_enum trans, int m1, int n1, int m2, int n2, int k, int ib, diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index d05705eb7fa4a50aeed8bb7e00b707e89e7bf5ae..3f97d72ddca9ce073357d099d0f308cd602386f8 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -31,6 +31,7 @@ set(ZSRC cuda_zgemm.c cuda_zhemm.c cuda_zher2k.c + cuda_zherfb.c cuda_zherk.c cuda_zlarfb.c cuda_zparfb.c diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h index 6611bf98deea97b4a00e152bf543c13a5555f56f..0ee705938a2a6cd88e52e212275983d2b254ec09 100644 --- a/cudablas/include/cudablas_z.h +++ b/cudablas/include/cudablas_z.h @@ -38,6 +38,7 @@ int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, int M, int N, cuDoubleCompl int CUDA_zgemm( MORSE_enum transa, MORSE_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); int CUDA_zhemm( MORSE_enum side, MORSE_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); int CUDA_zher2k( MORSE_enum uplo, MORSE_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zherfb( MORSE_enum uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); int CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zparfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); diff --git a/include/morse_z.h b/include/morse_z.h index 3d3a2dbf578468d5086cd0895ecd4f91d621c82b..f01b0c778aa1a487999f528e71573589af2c9a57 100644 --- a/include/morse_z.h +++ b/include/morse_z.h @@ -55,7 +55,7 @@ int MORSE_zgeqrs(int M, int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_de //int MORSE_zgesv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, int *IPIV, MORSE_Complex64_t *B, int LDB); int MORSE_zgesv_incpiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descL, int *IPIV, MORSE_Complex64_t *B, int LDB); int MORSE_zgesv_nopiv(int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB); -//int MORSE_zgesvd(MORSE_enum jobu, MORSE_enum jobvt, int M, int N, MORSE_Complex64_t *A, int LDA, double *S, MORSE_Complex64_t *U, int LDU, MORSE_Complex64_t *VT, int LDVT, MORSE_desc_t *descT); +int MORSE_zgesvd(MORSE_enum jobu, MORSE_enum jobvt, int M, int N, MORSE_Complex64_t *A, int LDA, double *S, MORSE_desc_t *descT, MORSE_Complex64_t *U, int LDU, MORSE_Complex64_t *VT, int LDVT); //int MORSE_zgetrf(int M, int N, MORSE_Complex64_t *A, int LDA, int *IPIV); int MORSE_zgetrf_incpiv(int M, int N, MORSE_Complex64_t *A, int LDA, MORSE_desc_t *descL, int *IPIV); int MORSE_zgetrf_nopiv(int M, int N, MORSE_Complex64_t *A, int LDA); @@ -69,11 +69,11 @@ int MORSE_zherk(MORSE_enum uplo, MORSE_enum trans, int N, int K, double alpha, M int MORSE_zher2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, double beta, MORSE_Complex64_t *C, int LDC); #endif //int MORSE_zheev(MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, double *W, MORSE_desc_t *descT, MORSE_Complex64_t *Q, int LDQ); -//int MORSE_zheevd(MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, double *W, MORSE_desc_t *descT, MORSE_Complex64_t *Q, int LDQ); +int MORSE_zheevd(MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, double *W, MORSE_desc_t *descT); //int MORSE_zhegv(MORSE_enum itype, MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, double *W, MORSE_desc_t *descT, MORSE_Complex64_t *Q, int LDQ); //int MORSE_zhegvd(MORSE_enum itype, MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, double *W, MORSE_desc_t *descT, MORSE_Complex64_t *Q, int LDQ); //int MORSE_zhegst(MORSE_enum itype, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB); -//int MORSE_zhetrd(MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, double *D, double *E, MORSE_desc_t *descT, MORSE_Complex64_t *Q, int LDQ); +int MORSE_zhetrd(MORSE_enum jobz, MORSE_enum uplo, int N, MORSE_Complex64_t *A, int LDA, double *D, double *E, MORSE_desc_t *descT, MORSE_Complex64_t *Q, int LDQ); int MORSE_zlacpy(MORSE_enum uplo, int M, int N, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB); double MORSE_zlange(MORSE_enum norm, int M, int N, MORSE_Complex64_t *A, int LDA); #ifdef COMPLEX @@ -132,7 +132,7 @@ int MORSE_zgeqrs_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B); //int MORSE_zgesv_Tile(MORSE_desc_t *A, int *IPIV, MORSE_desc_t *B); int MORSE_zgesv_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_desc_t *B); int MORSE_zgesv_nopiv_Tile(MORSE_desc_t *A, MORSE_desc_t *B); -//int MORSE_zgesvd_Tile(MORSE_enum jobu, MORSE_enum jobvt, MORSE_desc_t *A, double *S, MORSE_desc_t *U, MORSE_desc_t *VT, MORSE_desc_t *T); +int MORSE_zgesvd_Tile(MORSE_enum jobu, MORSE_enum jobvt, MORSE_desc_t *A, double *S, MORSE_desc_t *T, MORSE_Complex64_t *U, int LDU, MORSE_Complex64_t *VT, int LDVT); //int MORSE_zgetrf_Tile(MORSE_desc_t *A, int *IPIV); int MORSE_zgetrf_incpiv_Tile(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV); int MORSE_zgetrf_nopiv_Tile(MORSE_desc_t *A); @@ -146,11 +146,11 @@ int MORSE_zherk_Tile(MORSE_enum uplo, MORSE_enum trans, double alpha, MORSE_desc int MORSE_zher2k_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, double beta, MORSE_desc_t *C); #endif //int MORSE_zheev_Tile(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *W, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ); -//int MORSE_zheevd_Tile(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *W, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ); +int MORSE_zheevd_Tile(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *W, MORSE_desc_t *T); //int MORSE_zhegv_Tile( MORSE_enum itype, MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, double *W, MORSE_desc_t *T, MORSE_desc_t *Q); //int MORSE_zhegvd_Tile(MORSE_enum itype, MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, double *W, MORSE_desc_t *T, MORSE_desc_t *Q); //int MORSE_zhegst_Tile(MORSE_enum itype, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); -//int MORSE_zhetrd_Tile(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ); +int MORSE_zhetrd_Tile(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ); int MORSE_zlacpy_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B); double MORSE_zlange_Tile(MORSE_enum norm, MORSE_desc_t *A); #ifdef COMPLEX @@ -206,7 +206,7 @@ int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B, M //int MORSE_zgesv_Tile_Async(MORSE_desc_t *A, int *IPIV, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); int MORSE_zgesv_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); int MORSE_zgesv_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); -//int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt, MORSE_desc_t *A, double *S, MORSE_desc_t *U, MORSE_desc_t *VT, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); +int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt, MORSE_desc_t *A, double *S, MORSE_desc_t *T, MORSE_Complex64_t *U, int LDU, MORSE_Complex64_t *VT, int LDVT, MORSE_sequence_t *sequence, MORSE_request_t *request); //int MORSE_zgetrf_Tile_Async(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); int MORSE_zgetrf_incpiv_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); int MORSE_zgetrf_nopiv_Tile_Async(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); @@ -220,11 +220,11 @@ int MORSE_zherk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, double alpha, MORS int MORSE_zher2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, double beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request); #endif //int MORSE_zheev_Tile_Async(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *W, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ, MORSE_sequence_t *sequence, MORSE_request_t *request); -//int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *W, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ, MORSE_sequence_t *sequence, MORSE_request_t *request); +int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *W, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request); //int MORSE_zhegv_Tile_Async( MORSE_enum itype, MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, double *W, MORSE_desc_t *T, MORSE_desc_t *Q, MORSE_sequence_t *sequence, MORSE_request_t *request); //int MORSE_zhegvd_Tile_Async(MORSE_enum itype, MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, double *W, MORSE_desc_t *T, MORSE_desc_t *Q, MORSE_sequence_t *sequence, MORSE_request_t *request); //int MORSE_zhegst_Tile_Async(MORSE_enum itype, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); -//int MORSE_zhetrd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ, MORSE_sequence_t *sequence, MORSE_request_t *request); +int MORSE_zhetrd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_Complex64_t *Q, int LDQ, MORSE_sequence_t *sequence, MORSE_request_t *request); int MORSE_zlacpy_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); int MORSE_zlange_Tile_Async(MORSE_enum norm, MORSE_desc_t *A, double *value, MORSE_sequence_t *sequence, MORSE_request_t *request); #ifdef COMPLEX diff --git a/include/runtime.h b/include/runtime.h index 1bc4b9b13b30b2d365867d98761a10013e027a37..95cf19b75674623160da018bda60c1cbd65ef413 100644 --- a/include/runtime.h +++ b/include/runtime.h @@ -60,19 +60,19 @@ void RUNTIME_comm_size (int*); /******************************************************************************* * RUNTIME Descriptor **/ -void* RUNTIME_mat_alloc (size_t); -void RUNTIME_mat_free (void*, size_t); -void RUNTIME_desc_init (MORSE_desc_t*); -void RUNTIME_desc_create (MORSE_desc_t*); -void RUNTIME_desc_destroy (MORSE_desc_t*); -void RUNTIME_desc_submatrix (MORSE_desc_t*); -void* RUNTIME_desc_getaddr (const MORSE_desc_t*, int, int); +void* RUNTIME_mat_alloc (size_t); +void RUNTIME_mat_free (void*, size_t); +void RUNTIME_desc_init (MORSE_desc_t*); +void RUNTIME_desc_create (MORSE_desc_t*); +void RUNTIME_desc_destroy (MORSE_desc_t*); +void RUNTIME_desc_submatrix (MORSE_desc_t*); +void* RUNTIME_desc_getaddr (const MORSE_desc_t*, int, int); /* Acquire in main memory an up-to-date copy of the data described by the descriptor for read-write access. */ -int RUNTIME_desc_acquire (MORSE_desc_t*); +int RUNTIME_desc_acquire (MORSE_desc_t*); /* Release the data described by the descriptor to be used by the StarPU tasks again. */ -int RUNTIME_desc_release (MORSE_desc_t*); -int RUNTIME_desc_getoncpu (MORSE_desc_t*); -void RUNTIME_user_tag_size (int, int) ; +int RUNTIME_desc_release (MORSE_desc_t*); +int RUNTIME_desc_getoncpu (MORSE_desc_t*); +void RUNTIME_user_tag_size (int, int) ; /******************************************************************************* * RUNTIME Options diff --git a/include/runtime_z.h b/include/runtime_z.h index 3ed09528ba9d80d1a42c2b5bc4248da91c7991e7..df6b2f553e17bcd16c872e24b360cafcb0b4f74c 100644 --- a/include/runtime_z.h +++ b/include/runtime_z.h @@ -157,6 +157,11 @@ void MORSE_TASK_zgetrip_f2(const MORSE_option_t *options, int m, int n, const MORSE_desc_t *A, int Am, int An, int szeA, const MORSE_desc_t *fake1, int fake1m, int fake1n, int szeF1, int paramF1, const MORSE_desc_t *fake2, int fake2m, int fake2n, int szeF2, int paramF2); +void MORSE_TASK_zhe2ge(const MORSE_option_t *options, + MORSE_enum uplo, + int m, int n, int mb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb); void MORSE_TASK_zhemm(const MORSE_option_t *options, MORSE_enum side, MORSE_enum uplo, int m, int n, int nb, @@ -167,7 +172,6 @@ void MORSE_TASK_zhegst(const MORSE_option_t *options, int itype, MORSE_enum uplo, int N, const MORSE_desc_t *A, int Am, int An, int LDA, const MORSE_desc_t *B, int Bm, int Bn, int LDB, - int iinfo); void MORSE_TASK_zherk(const MORSE_option_t *options, MORSE_enum uplo, MORSE_enum trans, @@ -190,6 +194,10 @@ void MORSE_TASK_zlacpy(const MORSE_option_t *options, MORSE_enum uplo, int m, int n, int mb, const MORSE_desc_t *A, int Am, int An, int lda, const MORSE_desc_t *B, int Bm, int Bn, int ldb); +void MORSE_TASK_zlacpyx(const MORSE_option_t *options, + MORSE_enum uplo, int m, int n, int mb, + int displA, const MORSE_desc_t *A, int Am, int An, int lda, + int displB, const MORSE_desc_t *B, int Bm, int Bn, int ldb); void MORSE_TASK_zlange(const MORSE_option_t *options, MORSE_enum norm, int M, int N, int NB, const MORSE_desc_t *A, int Am, int An, int LDA, @@ -367,13 +375,6 @@ void MORSE_TASK_ztsmlq_hetra1(const MORSE_option_t *options, const MORSE_desc_t *A2, int A2m, int A2n, int lda2, const MORSE_desc_t *V, int Vm, int Vn, int ldv, const MORSE_desc_t *T, int Tm, int Tn, int ldt); -void MORSE_TASK_ztsmlq_corner(const MORSE_option_t *options, - int m1, int n1, int m2, int n2, int m3, int n3, int k, int ib, int nb, - const MORSE_desc_t *A1, int A1m, int A1n, int lda1, - const MORSE_desc_t *A2, int A2m, int A2n, int lda2, - const MORSE_desc_t *A3, int A3m, int A3n, int lda3, - const MORSE_desc_t *V, int Vm, int Vn, int ldv, - const MORSE_desc_t *T, int Tm, int Tn, int ldt); void MORSE_TASK_ztsmqr(const MORSE_option_t *options, MORSE_enum side, MORSE_enum trans, int m1, int n1, int m2, int n2, int k, int ib, int nb, @@ -388,13 +389,6 @@ void MORSE_TASK_ztsmqr_hetra1(const MORSE_option_t *options, const MORSE_desc_t *A2, int A2m, int A2n, int lda2, const MORSE_desc_t *V, int Vm, int Vn, int ldv, const MORSE_desc_t *T, int Tm, int Tn, int ldt); -void MORSE_TASK_ztsmqr_corner(const MORSE_option_t *options, - int m1, int n1, int m2, int n2, int m3, int n3, int k, int ib, int nb, - const MORSE_desc_t *A1, int A1m, int A1n, int lda1, - const MORSE_desc_t *A2, int A2m, int A2n, int lda2, - const MORSE_desc_t *A3, int A3m, int A3n, int lda3, - const MORSE_desc_t *V, int Vm, int Vn, int ldv, - const MORSE_desc_t *T, int Tm, int Tn, int ldt); void MORSE_TASK_ztsqrt(const MORSE_option_t *options, int m, int n, int ib, int nb, const MORSE_desc_t *A1, int A1m, int A1n, int lda1, diff --git a/runtime/quark/CMakeLists.txt b/runtime/quark/CMakeLists.txt index e67975d6b8b507587fe9978088d4c0d9244ddc05..9366a00d7ea931610daa6792d760bcea5e266c98 100644 --- a/runtime/quark/CMakeLists.txt +++ b/runtime/quark/CMakeLists.txt @@ -3,7 +3,7 @@ # @copyright (c) 2009-2014 The University of Tennessee and The University # of Tennessee Research Foundation. # All rights reserved. -# @copyright (c) 2012-2014 Inria. All rights reserved. +# @copyright (c) 2012-2016 Inria. All rights reserved. # @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. # ### @@ -116,6 +116,8 @@ set(ZSRC codelets/codelet_zgetrf.c codelets/codelet_zgetrf_incpiv.c codelets/codelet_zgetrf_nopiv.c + codelets/codelet_zhe2ge.c + codelets/codelet_zherfb.c codelets/codelet_zhessq.c codelets/codelet_zlacpy.c codelets/codelet_zlange.c @@ -124,6 +126,7 @@ set(ZSRC codelets/codelet_zlantr.c codelets/codelet_zlaset2.c codelets/codelet_zlaset.c + codelets/codelet_zlatro.c codelets/codelet_zlauum.c codelets/codelet_zplghe.c codelets/codelet_zplgsy.c @@ -140,6 +143,8 @@ set(ZSRC codelets/codelet_ztslqt.c codelets/codelet_ztsmlq.c codelets/codelet_ztsmqr.c + codelets/codelet_ztsmlq_hetra1.c + codelets/codelet_ztsmqr_hetra1.c codelets/codelet_ztsqrt.c codelets/codelet_ztstrf.c codelets/codelet_zttlqt.c @@ -180,6 +185,9 @@ add_dependencies(chameleon_quark control_include runtime_quark_include ) +if (NOT CHAMELEON_SIMULATION) + add_dependencies(chameleon_quark coreblas_include) +endif() # installation # ------------ diff --git a/runtime/quark/codelets/codelet_zhe2ge.c b/runtime/quark/codelets/codelet_zhe2ge.c new file mode 100644 index 0000000000000000000000000000000000000000..474ea51dc999960f1652babf576802616000efa8 --- /dev/null +++ b/runtime/quark/codelets/codelet_zhe2ge.c @@ -0,0 +1,62 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zhe2ge.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @precisions normal z -> c d s + * + **/ + +#include "runtime/quark/include/morse_quark.h" + +/** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ +static inline void CORE_zhe2ge_quark(Quark *quark) +{ + MORSE_enum uplo; + int M; + int N; + MORSE_Complex64_t *A; + int LDA; + MORSE_Complex64_t *B; + int LDB; + + quark_unpack_args_7(quark, uplo, M, N, A, LDA, B, LDB); + CORE_zhe2ge(uplo, M, N, A, LDA, B, LDB); +} + + +void MORSE_TASK_zhe2ge(const MORSE_option_t *options, + MORSE_enum uplo, + int m, int n, int mb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + DAG_CORE_LACPY; + QUARK_Insert_Task(opt->quark, CORE_zhe2ge_quark, (Quark_Task_Flags*)opt, + sizeof(MORSE_enum), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(MORSE_Complex64_t)*mb*mb, RTBLKADDR(A, MORSE_Complex64_t, Am, An), INPUT, + sizeof(int), &lda, VALUE, + sizeof(MORSE_Complex64_t)*mb*mb, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), OUTPUT, + sizeof(int), &ldb, VALUE, + 0); +} diff --git a/runtime/quark/codelets/codelet_zherfb.c b/runtime/quark/codelets/codelet_zherfb.c new file mode 100644 index 0000000000000000000000000000000000000000..b254c1ed1439c342fce00883fac2b9d384e92cb1 --- /dev/null +++ b/runtime/quark/codelets/codelet_zherfb.c @@ -0,0 +1,71 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file qwrapper_zherfb.c + * + * PLASMA core_blas quark wrapper + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Hatem Ltaief + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/quark/include/morse_quark.h" + +void CORE_zherfb_quark(Quark *quark) +{ + MORSE_enum uplo; + int n; + int k; + int ib; + int nb; + MORSE_Complex64_t *A; + int lda; + MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *C; + int ldc; + MORSE_Complex64_t *WORK; + int ldwork; + + quark_unpack_args_13(quark, uplo, n, k, ib, nb, A, lda, T, ldt, C, ldc, WORK, ldwork); + CORE_zherfb(uplo, n, k, ib, nb, A, lda, T, ldt, C, ldc, WORK, ldwork); +} + +void MORSE_TASK_zherfb(const MORSE_option_t *options, + MORSE_enum uplo, + int n, int k, int ib, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *T, int Tm, int Tn, int ldt, + const MORSE_desc_t *C, int Cm, int Cn, int ldc) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + + QUARK_Insert_Task(opt->quark, CORE_zherfb_quark, (Quark_Task_Flags*)opt, + sizeof(MORSE_enum), &uplo, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &k, VALUE, + sizeof(int), &ib, VALUE, + sizeof(int), &nb, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A, MORSE_Complex64_t, Am, An), (uplo == MorseUpper) ? INOUT|QUARK_REGION_U : INOUT|QUARK_REGION_L, + sizeof(int), &lda, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), INPUT, + sizeof(int), &ldt, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(C, MORSE_Complex64_t, Cm, Cn), (uplo == MorseUpper) ? INOUT|QUARK_REGION_D|QUARK_REGION_U : INOUT|QUARK_REGION_D|QUARK_REGION_L, + sizeof(int), &ldc, VALUE, + sizeof(MORSE_Complex64_t)*2*nb*nb, NULL, SCRATCH, + sizeof(int), &nb, VALUE, + 0); +} diff --git a/runtime/quark/codelets/codelet_zlacpy.c b/runtime/quark/codelets/codelet_zlacpy.c index 2b2fd9da49a5ace36c50cb0c38f6d8b5d7cbefb9..ee78ca67bd681bda611b3543afef8ec423573cd9 100644 --- a/runtime/quark/codelets/codelet_zlacpy.c +++ b/runtime/quark/codelets/codelet_zlacpy.c @@ -41,30 +41,43 @@ static inline void CORE_zlacpy_quark(Quark *quark) MORSE_enum uplo; int M; int N; + int displA; MORSE_Complex64_t *A; int LDA; + int displB; MORSE_Complex64_t *B; int LDB; - quark_unpack_args_7(quark, uplo, M, N, A, LDA, B, LDB); - CORE_zlacpy(uplo, M, N, A, LDA, B, LDB); + quark_unpack_args_9(quark, uplo, M, N, displA, A, LDA, displB, B, LDB); + CORE_zlacpy(uplo, M, N, A + displA, LDA, B + displB, LDB); } -void MORSE_TASK_zlacpy(const MORSE_option_t *options, - MORSE_enum uplo, int m, int n, int nb, - const MORSE_desc_t *A, int Am, int An, int lda, - const MORSE_desc_t *B, int Bm, int Bn, int ldb) +void MORSE_TASK_zlacpyx(const MORSE_option_t *options, + MORSE_enum uplo, int m, int n, int nb, + int displA, const MORSE_desc_t *A, int Am, int An, int lda, + int displB, const MORSE_desc_t *B, int Bm, int Bn, int ldb) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_LACPY; QUARK_Insert_Task(opt->quark, CORE_zlacpy_quark, (Quark_Task_Flags*)opt, - sizeof(MORSE_enum), &uplo, VALUE, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, - sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A, MORSE_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), OUTPUT, - sizeof(int), &ldb, VALUE, + sizeof(MORSE_enum), &uplo, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(int), &displA, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A, MORSE_Complex64_t, Am, An), INPUT, + sizeof(int), &lda, VALUE, + sizeof(int), &displB, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), OUTPUT, + sizeof(int), &ldb, VALUE, 0); } +void MORSE_TASK_zlacpy(const MORSE_option_t *options, + MORSE_enum uplo, int m, int n, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb) +{ + MORSE_TASK_zlacpyx( options, uplo, m, n, nb, + 0, A, Am, An, lda, + 0, B, Bm, Bn, ldb ); +} diff --git a/runtime/quark/codelets/codelet_zlatro.c b/runtime/quark/codelets/codelet_zlatro.c new file mode 100644 index 0000000000000000000000000000000000000000..342f1b430b1d4c3e60eae7a363e8144b508f4044 --- /dev/null +++ b/runtime/quark/codelets/codelet_zlatro.c @@ -0,0 +1,63 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * @file qwrapper_zlatro.c + * + * PLASMA core_blas quark wrapper + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/quark/include/morse_quark.h" + + +void CORE_zlatro_quark(Quark *quark) +{ + MORSE_enum uplo; + MORSE_enum trans; + int M; + int N; + const MORSE_Complex64_t *A; + int LDA; + MORSE_Complex64_t *B; + int LDB; + + quark_unpack_args_8(quark, uplo, trans, M, N, A, LDA, B, LDB); + CORE_zlatro(uplo, trans, M, N, A, LDA, B, LDB); +} + +/***************************************************************************//** + * + **/ +void MORSE_TASK_zlatro(const MORSE_option_t *options, + MORSE_enum uplo, MORSE_enum trans, + int m, int n, int mb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + + QUARK_Insert_Task(opt->quark, CORE_zlatro_quark, (Quark_Task_Flags*)opt, + sizeof(MORSE_enum), &uplo, VALUE, + sizeof(MORSE_enum), &trans, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, + sizeof(MORSE_Complex64_t)*mb*mb, RTBLKADDR(A, MORSE_Complex64_t, Am, An), INPUT, + sizeof(int), &lda, VALUE, + sizeof(MORSE_Complex64_t)*mb*mb, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), OUTPUT, + sizeof(int), &ldb, VALUE, + 0); +} diff --git a/runtime/quark/codelets/codelet_ztsmlq_hetra1.c b/runtime/quark/codelets/codelet_ztsmlq_hetra1.c new file mode 100644 index 0000000000000000000000000000000000000000..b381e12e2ae9762c283be3d6a081c29affa709f9 --- /dev/null +++ b/runtime/quark/codelets/codelet_ztsmlq_hetra1.c @@ -0,0 +1,86 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztsmlq_hetra1.c + * + * PLASMA core_blas quark wrapper + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Jakub Kurzak + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/quark/include/morse_quark.h" + +void CORE_ztsmlq_hetra1_quark(Quark *quark) +{ + MORSE_enum side; + MORSE_enum trans; + int m1; + int n1; + int m2; + int n2; + int k; + int ib; + MORSE_Complex64_t *A1; + int lda1; + MORSE_Complex64_t *A2; + int lda2; + MORSE_Complex64_t *V; + int ldv; + MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *WORK; + int ldwork; + + quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); +} + +void MORSE_TASK_ztsmlq_hetra1(const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const MORSE_desc_t *A1, int A1m, int A1n, int lda1, + const MORSE_desc_t *A2, int A2m, int A2n, int lda2, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int ldwork = side == MorseLeft ? ib : nb; + + QUARK_Insert_Task(opt->quark, CORE_ztsmlq_hetra1_quark, (Quark_Task_Flags*)opt, + sizeof(MORSE_enum), &side, VALUE, + sizeof(MORSE_enum), &trans, VALUE, + sizeof(int), &m1, VALUE, + sizeof(int), &n1, VALUE, + sizeof(int), &m2, VALUE, + sizeof(int), &n2, VALUE, + sizeof(int), &k, VALUE, + sizeof(int), &ib, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A1, MORSE_Complex64_t, A1m, A1n), INOUT|QUARK_REGION_U|QUARK_REGION_D, + sizeof(int), &lda1, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A2, MORSE_Complex64_t, A2m, A2n), INOUT, + sizeof(int), &lda2, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn), INPUT, + sizeof(int), &ldv, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), INPUT, + sizeof(int), &ldt, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH, + sizeof(int), &ldwork, VALUE, + 0); +} diff --git a/runtime/quark/codelets/codelet_ztsmqr_hetra1.c b/runtime/quark/codelets/codelet_ztsmqr_hetra1.c new file mode 100644 index 0000000000000000000000000000000000000000..5e7277eef07f5a26ff0d7608c59cd71da4c6425e --- /dev/null +++ b/runtime/quark/codelets/codelet_ztsmqr_hetra1.c @@ -0,0 +1,86 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelets_ztsmqr_hetra1.c + * + * PLASMA core_blas quark wrapper + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Jakub Kurzak + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/quark/include/morse_quark.h" + +void CORE_ztsmqr_hetra1_quark(Quark *quark) +{ + MORSE_enum side; + MORSE_enum trans; + int m1; + int n1; + int m2; + int n2; + int k; + int ib; + MORSE_Complex64_t *A1; + int lda1; + MORSE_Complex64_t *A2; + int lda2; + MORSE_Complex64_t *V; + int ldv; + MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *WORK; + int ldwork; + + quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); + CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); +} + +void MORSE_TASK_ztsmqr_hetra1(const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const MORSE_desc_t *A1, int A1m, int A1n, int lda1, + const MORSE_desc_t *A2, int A2m, int A2n, int lda2, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt) +{ + quark_option_t *opt = (quark_option_t*)(options->schedopt); + int ldwork = side == MorseLeft ? ib : nb; + + QUARK_Insert_Task(opt->quark, CORE_ztsmqr_hetra1_quark, (Quark_Task_Flags*)opt, + sizeof(MORSE_enum), &side, VALUE, + sizeof(MORSE_enum), &trans, VALUE, + sizeof(int), &m1, VALUE, + sizeof(int), &n1, VALUE, + sizeof(int), &m2, VALUE, + sizeof(int), &n2, VALUE, + sizeof(int), &k, VALUE, + sizeof(int), &ib, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A1, MORSE_Complex64_t, A1m, A1n), INOUT|QUARK_REGION_L|QUARK_REGION_D, + sizeof(int), &lda1, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(A2, MORSE_Complex64_t, A2m, A2n), INOUT, + sizeof(int), &lda2, VALUE, + sizeof(MORSE_Complex64_t)*nb*nb, RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn), INPUT, + sizeof(int), &ldv, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), INPUT, + sizeof(int), &ldt, VALUE, + sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH, + sizeof(int), &ldwork, VALUE, + 0); +} diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 5dc0bd19b7e01db14e5e05f8f2f78330eb609b5c..08956acafbf1742798bb235a8226b17bf1d73d05 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -136,6 +136,8 @@ set(ZSRC codelets/codelet_zgetrf.c codelets/codelet_zgetrf_incpiv.c codelets/codelet_zgetrf_nopiv.c + codelets/codelet_zhe2ge.c + codelets/codelet_zherfb.c codelets/codelet_zhessq.c codelets/codelet_zlacpy.c codelets/codelet_zlange.c @@ -144,6 +146,7 @@ set(ZSRC codelets/codelet_zlantr.c codelets/codelet_zlaset2.c codelets/codelet_zlaset.c + codelets/codelet_zlatro.c codelets/codelet_zlauum.c codelets/codelet_zplghe.c codelets/codelet_zplgsy.c @@ -160,6 +163,8 @@ set(ZSRC codelets/codelet_ztslqt.c codelets/codelet_ztsmlq.c codelets/codelet_ztsmqr.c + codelets/codelet_ztsmlq_hetra1.c + codelets/codelet_ztsmqr_hetra1.c codelets/codelet_ztsqrt.c codelets/codelet_ztstrf.c codelets/codelet_zttlqt.c @@ -206,7 +211,7 @@ add_dependencies(chameleon_starpu if (NOT CHAMELEON_SIMULATION) add_dependencies(chameleon_starpu coreblas_include) endif() - + if (CHAMELEON_USE_CUDA AND NOT CHAMELEON_SIMULATION) add_dependencies(chameleon_starpu cudablas_include) endif() diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index fa66cff4d9f3be011bfbe7e7a51f6c29ebdde7e4..8af4ec546d5e5c59f4ad01a06dd211258d3e99fd 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -40,6 +40,8 @@ CHAMELEON_CL_CB(zgessq, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(zgetrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K); CHAMELEON_CL_CB(zgetrf_incpiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K); CHAMELEON_CL_CB(zgetrf_nopiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K); +CHAMELEON_CL_CB(zhe2ge, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, (1./2.0)*M*N); +CHAMELEON_CL_CB(zherfb, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 2. *M* M*M); #if defined(PRECISION_z) || defined(PRECISION_c) CHAMELEON_CL_CB(zhemm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0, 2.*M*M *N); CHAMELEON_CL_CB(zher2k, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, ( 1.+2.*M*N)*M); @@ -49,6 +51,7 @@ CHAMELEON_CL_CB(zlacpy, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(zlange, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N); CHAMELEON_CL_CB(zlaset, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N); CHAMELEON_CL_CB(zlaset2, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N); +CHAMELEON_CL_CB(zlatro, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N); CHAMELEON_CL_CB(zlauum, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M* M*M); #if defined(PRECISION_z) || defined(PRECISION_c) CHAMELEON_CL_CB(zplghe, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N); @@ -71,6 +74,8 @@ CHAMELEON_CL_CB(ztrtri, starpu_matrix_get_nx(task->handles[0]), starpu_ma CHAMELEON_CL_CB(ztslqt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M); CHAMELEON_CL_CB(ztsmlq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M); CHAMELEON_CL_CB(ztsmqr, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M); +CHAMELEON_CL_CB(ztsmlq_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M); +CHAMELEON_CL_CB(ztsmqr_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M); CHAMELEON_CL_CB(ztsqrt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2. *M* M*M); CHAMELEON_CL_CB(ztstrf, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), M* M*M); CHAMELEON_CL_CB(zttlqt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 1. *M* M*M); diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 91b9ccdf4d89be209b3e183b2bc0f6aa570a10a9..581d46a1153631294ee2e0239e61de1c510625a0 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -100,8 +100,6 @@ void MORSE_TASK_zgemm(const MORSE_option_t *options, } } - - #if !defined(CHAMELEON_SIMULATION) static void cl_zgemm_cpu_func(void *descr[], void *cl_arg) { @@ -169,7 +167,7 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c new file mode 100644 index 0000000000000000000000000000000000000000..9bb03ebaf9247db83de4e9f346ff3e4edb2a1885 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zhe2ge.c @@ -0,0 +1,82 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zhe2ge.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ + +void MORSE_TASK_zhe2ge(const MORSE_option_t *options, + MORSE_enum uplo, + int m, int n, int mb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb) +{ + (void)mb; + struct starpu_codelet *codelet = &cl_zhe2ge; + void (*callback)(void*) = options->profiling ? cl_zhe2ge_callback : NULL; + + if ( morse_desc_islocal( A, Am, An ) || + morse_desc_islocal( B, Bm, Bn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &uplo, sizeof(MORSE_enum), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, MORSE_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_W, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), + STARPU_VALUE, &ldb, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_NAME, "zhe2ge", + 0); + } +} + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg) +{ + MORSE_enum uplo; + int M; + int N; + const MORSE_Complex64_t *A; + int LDA; + MORSE_Complex64_t *B; + int LDB; + + A = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + B = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &LDA, &LDB); + CORE_zhe2ge(uplo, M, N, A, LDA, B, LDB); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zhe2ge, 2, cl_zhe2ge_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c new file mode 100644 index 0000000000000000000000000000000000000000..9f693a90173728c2e6abb26ef09c509ab231c864 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zherfb.c @@ -0,0 +1,135 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zherfb.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Hatem Ltaief + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ +void MORSE_TASK_zherfb(const MORSE_option_t *options, + MORSE_enum uplo, + int n, int k, int ib, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *T, int Tm, int Tn, int ldt, + const MORSE_desc_t *C, int Cm, int Cn, int ldc) +{ + struct starpu_codelet *codelet = &cl_zherfb; + void (*callback)(void*) = options->profiling ? cl_zherfb_callback : NULL; + + if ( morse_desc_islocal( A, Am, An ) || + morse_desc_islocal( T, Tm, Tn ) || + morse_desc_islocal( C, Cm, Cn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &uplo, sizeof(MORSE_enum), + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &k, sizeof(int), + STARPU_VALUE, &ib, sizeof(int), + STARPU_VALUE, &nb, sizeof(int), + STARPU_R, RTBLKADDR(A, MORSE_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_R, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), + STARPU_VALUE, &ldt, sizeof(int), + STARPU_RW, RTBLKADDR(C, MORSE_Complex64_t, Cm, Cn), + STARPU_VALUE, &ldc, sizeof(int), + STARPU_SCRATCH, options->ws_worker, + STARPU_VALUE, &nb, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_NAME, "zherfb", + 0); + } +} + + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zherfb_cpu_func(void *descr[], void *cl_arg) +{ + MORSE_enum uplo; + int n; + int k; + int ib; + int nb; + const MORSE_Complex64_t *A; + int lda; + const MORSE_Complex64_t *T; + int ldt; + MORSE_Complex64_t *C; + int ldc; + MORSE_Complex64_t *WORK; + int ldwork; + + A = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + C = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ + + starpu_codelet_unpack_args(cl_arg, &uplo, &n, &k, &ib, &nb, &lda, &ldt, &ldc, &ldwork); + + CORE_zherfb(uplo, n, k, ib, nb, A, lda, T, ldt, C, ldc, WORK, ldwork); +} + +#if defined(CHAMELEON_USE_CUDA) +static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) +{ + MORSE_enum uplo; + int n; + int k; + int ib; + int nb; + const cuDoubleComplex *A; + int lda; + const cuDoubleComplex *T; + int ldt; + cuDoubleComplex *C; + int ldc; + cuDoubleComplex *WORK; + int ldwork; + + stream = starpu_cuda_get_local_stream(); + + A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); + WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ + + starpu_codelet_unpack_args(cl_arg, &uplo, &n, &k, &ib, &nb, &lda, &ldt, &ldc, &ldwork); + + CUDA_zherfb(uplo, n, k, ib, nb, A, lda, T, ldt, C, ldc, WORK, ldwork); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS(zherfb, 4, cl_zherfb_cpu_func, cl_zherfb_cuda_func, STARPU_CUDA_ASYNC); diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index b62d2f06e2cd2f2154ebf9e4cd9bbc08e33e952e..c70e6b6805ef7a377f52e6f25acb80b3ab93c8d0 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -38,10 +38,10 @@ * **/ -void MORSE_TASK_zlacpy(const MORSE_option_t *options, - MORSE_enum uplo, int m, int n, int nb, - const MORSE_desc_t *A, int Am, int An, int lda, - const MORSE_desc_t *B, int Bm, int Bn, int ldb) +void MORSE_TASK_zlacpyx(const MORSE_option_t *options, + MORSE_enum uplo, int m, int n, int nb, + int displA, const MORSE_desc_t *A, int Am, int An, int lda, + int displB, const MORSE_desc_t *B, int Bm, int Bn, int ldb) { (void)nb; struct starpu_codelet *codelet = &cl_zlacpy; @@ -52,19 +52,30 @@ void MORSE_TASK_zlacpy(const MORSE_option_t *options, { starpu_insert_task( codelet, - STARPU_VALUE, &uplo, sizeof(MORSE_enum), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, MORSE_Complex64_t, Am, An), - STARPU_VALUE, &lda, sizeof(int), - STARPU_W, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), - STARPU_VALUE, &ldb, sizeof(int), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + STARPU_VALUE, &uplo, sizeof(MORSE_enum), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &displA, sizeof(int), + STARPU_R, RTBLKADDR(A, MORSE_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_VALUE, &displB, sizeof(int), + STARPU_W, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), + STARPU_VALUE, &ldb, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, 0); } } +void MORSE_TASK_zlacpy(const MORSE_option_t *options, + MORSE_enum uplo, int m, int n, int nb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb) +{ + MORSE_TASK_zlacpyx( options, uplo, m, n, nb, + 0, A, Am, An, lda, + 0, B, Bm, Bn, ldb ); +} #if !defined(CHAMELEON_SIMULATION) static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) @@ -72,15 +83,17 @@ static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) MORSE_enum uplo; int M; int N; - MORSE_Complex64_t *A; + int displA; + int displB; + const MORSE_Complex64_t *A; int LDA; MORSE_Complex64_t *B; int LDB; - A = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + A = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); B = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &LDA, &LDB); - CORE_zlacpy(uplo, M, N, A, LDA, B, LDB); + starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N, &displA, &LDA, &displB, &LDB); + CORE_zlacpy(uplo, M, N, A + displA, LDA, B + displB, LDB); } #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zlatro.c b/runtime/starpu/codelets/codelet_zlatro.c new file mode 100644 index 0000000000000000000000000000000000000000..6914020301ecf892ff9ce8bc882eca72393b77cc --- /dev/null +++ b/runtime/starpu/codelets/codelet_zlatro.c @@ -0,0 +1,93 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zlatro.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.5.0 + * @comment This file has been automatically generated + * from Plasma 2.5.0 for MORSE 1.0.0 + * @author Julien Langou + * @author Henricus Bouwmeester + * @author Mathieu Faverge + * @author Emmanuel Agullo + * @author Cedric Castagnede + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ + +void MORSE_TASK_zlatro(const MORSE_option_t *options, + MORSE_enum uplo, MORSE_enum trans, + int m, int n, int mb, + const MORSE_desc_t *A, int Am, int An, int lda, + const MORSE_desc_t *B, int Bm, int Bn, int ldb) +{ + struct starpu_codelet *codelet = &cl_zlatro; + void (*callback)(void*) = NULL; + + if ( morse_desc_islocal( A, Am, An ) || + morse_desc_islocal( B, Bm, Bn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &uplo, sizeof(MORSE_enum), + STARPU_VALUE, &trans, sizeof(MORSE_enum), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, MORSE_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_W, RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn), + STARPU_VALUE, &ldb, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_NAME, "zlatro", + 0); + } + (void)mb; +} + +#if defined(CHAMELEON_SIMULATION) +static void cl_zlatro_cpu_func(void *descr[], void *cl_arg) +{ + MORSE_enum uplo; + MORSE_enum trans; + int M; + int N; + const MORSE_Complex64_t *A; + int LDA; + MORSE_Complex64_t *B; + int LDB; + + A = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + B = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &M, &N, &LDA, &LDB); + CORE_zlatro(uplo, trans, M, N, A, LDA, B, LDB); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(zlatro, 2, cl_zlatro_cpu_func) diff --git a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c new file mode 100644 index 0000000000000000000000000000000000000000..b61a8fc3a5493276a26f866ba5902e4e2d6d3b29 --- /dev/null +++ b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c @@ -0,0 +1,121 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztsmlq_hetra1.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ + +void MORSE_TASK_ztsmlq_hetra1(const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const MORSE_desc_t *A1, int A1m, int A1n, int lda1, + const MORSE_desc_t *A2, int A2m, int A2n, int lda2, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt) +{ + struct starpu_codelet *codelet = &cl_ztsmlq_hetra1; + void (*callback)(void*) = options->profiling ? cl_ztsmlq_hetra1_callback : NULL; + + int ldwork = side == MorseLeft ? ib : nb; + + /* T follows distribution of V */ + if ( morse_desc_islocal( A1, A1m, A1n ) || + morse_desc_islocal( A2, A2m, A2n ) || + morse_desc_islocal( V, Vm, Vn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &side, sizeof(MORSE_enum), + STARPU_VALUE, &trans, sizeof(MORSE_enum), + STARPU_VALUE, &m1, sizeof(int), + STARPU_VALUE, &n1, sizeof(int), + STARPU_VALUE, &m2, sizeof(int), + STARPU_VALUE, &n2, sizeof(int), + STARPU_VALUE, &k, sizeof(int), + STARPU_VALUE, &ib, sizeof(int), + STARPU_VALUE, &nb, sizeof(int), + STARPU_RW, RTBLKADDR(A1, MORSE_Complex64_t, A1m, A1n), + STARPU_VALUE, &lda1, sizeof(int), + STARPU_RW, RTBLKADDR(A2, MORSE_Complex64_t, A2m, A2n), + STARPU_VALUE, &lda2, sizeof(int), + STARPU_R, RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn), + STARPU_VALUE, &ldv, sizeof(int), + STARPU_R, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), + STARPU_VALUE, &ldt, sizeof(int), + STARPU_SCRATCH, options->ws_worker, + STARPU_VALUE, &ldwork, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + 0); + } +} + +#if defined(CHAMELEON_SIMULATION) +static void cl_ztsmlq_hetra1_cpu_func(void *descr[], void *cl_arg) +{ + MORSE_enum side; + MORSE_enum trans; + int m1; + int n1; + int m2; + int n2; + int k; + int ib; + int nb; + MORSE_Complex64_t *A1; + int lda1; + MORSE_Complex64_t *A2; + int lda2; + MORSE_Complex64_t *V; + int ldv; + MORSE_Complex64_t *T; + int ldt; + + MORSE_Complex64_t *WORK; + int ldwork; + + A1 = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + A2 = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + V = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + T = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); + WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */ + + starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, + &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork); + CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k, + ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(ztsmlq_hetra1, 5, cl_ztsmlq_hetra1_cpu_func) diff --git a/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c b/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c new file mode 100644 index 0000000000000000000000000000000000000000..5e305f000b90039c95b78519a77fb12ad35f20d9 --- /dev/null +++ b/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c @@ -0,0 +1,121 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_ztsmqr_hetra1.c + * + * MORSE codelets kernel + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.7.1 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/starpu/include/morse_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_MORSE_Complex64_t + * + **/ + +void MORSE_TASK_ztsmqr_hetra1(const MORSE_option_t *options, + MORSE_enum side, MORSE_enum trans, + int m1, int n1, int m2, int n2, int k, int ib, int nb, + const MORSE_desc_t *A1, int A1m, int A1n, int lda1, + const MORSE_desc_t *A2, int A2m, int A2n, int lda2, + const MORSE_desc_t *V, int Vm, int Vn, int ldv, + const MORSE_desc_t *T, int Tm, int Tn, int ldt) +{ + struct starpu_codelet *codelet = &cl_ztsmqr_hetra1; + void (*callback)(void*) = options->profiling ? cl_ztsmqr_hetra1_callback : NULL; + + int ldwork = side == MorseLeft ? ib : nb; + + /* T follows distribution of V */ + if ( morse_desc_islocal( A1, A1m, A1n ) || + morse_desc_islocal( A2, A2m, A2n ) || + morse_desc_islocal( V, Vm, Vn ) ) + { + starpu_insert_task( + codelet, + STARPU_VALUE, &side, sizeof(MORSE_enum), + STARPU_VALUE, &trans, sizeof(MORSE_enum), + STARPU_VALUE, &m1, sizeof(int), + STARPU_VALUE, &n1, sizeof(int), + STARPU_VALUE, &m2, sizeof(int), + STARPU_VALUE, &n2, sizeof(int), + STARPU_VALUE, &k, sizeof(int), + STARPU_VALUE, &ib, sizeof(int), + STARPU_RW, RTBLKADDR(A1, MORSE_Complex64_t, A1m, A1n), + STARPU_VALUE, &lda1, sizeof(int), + STARPU_RW, RTBLKADDR(A2, MORSE_Complex64_t, A2m, A2n), + STARPU_VALUE, &lda2, sizeof(int), + STARPU_R, RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn), + STARPU_VALUE, &ldv, sizeof(int), + STARPU_R, RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn), + STARPU_VALUE, &ldt, sizeof(int), + STARPU_SCRATCH, options->ws_worker, + STARPU_VALUE, &ldwork, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_NAME, "ztsmqr_hetra1", + 0); + } +} + +#if defined(CHAMELEON_SIMULATION) +static void cl_ztsmqr_hetra1_cpu_func(void *descr[], void *cl_arg) +{ + MORSE_enum side; + MORSE_enum trans; + int m1; + int n1; + int m2; + int n2; + int k; + int ib; + MORSE_Complex64_t *A1; + int lda1; + MORSE_Complex64_t *A2; + int lda2; + MORSE_Complex64_t *V; + int ldv; + MORSE_Complex64_t *T; + int ldt; + + /* TODO: manage workspace */ + MORSE_Complex64_t *WORK; + int ldwork; + + A1 = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + A2 = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + V = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + T = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); + WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); + + starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, + &ib, &lda1, &lda2, &ldv, &ldt, &ldwork); + CORE_ztsmqr_hetra1(side, trans, m1, n1, m2, n2, k, + ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); +} +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS_CPU(ztsmqr_hetra1, 5, cl_ztsmqr_hetra1_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index 4c913d17873ab5f397f040cf035e014ca5be1e04..38678c0fc43b89664626932e8cbf956c2e899a74 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -150,6 +150,7 @@ void MORSE_TASK_zunmlq(const MORSE_option_t *options, STARPU_VALUE, &nb, sizeof(int), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, + STARPU_NAME, "zunmlq", 0); } } @@ -164,17 +165,17 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) int n; int k; int ib; - MORSE_Complex64_t *A; + const MORSE_Complex64_t *A; int lda; - MORSE_Complex64_t *T; + const MORSE_Complex64_t *T; int ldt; MORSE_Complex64_t *C; int ldc; MORSE_Complex64_t *WORK; int ldwork; - A = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); C = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ @@ -195,7 +196,8 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) int n; int k; int ib; - cuDoubleComplex *A, *T, *C, *WORK; + const cuDoubleComplex *A, *T; + cuDoubleComplex *C, *WORK; int lda, ldt, ldc, ldwork; int info = 0; CUstream stream; @@ -203,8 +205,8 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &lda, &ldt, &ldc, &ldwork); - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 61bec4f56863a7160553df46f1025ff3f2461af0..e8b47fbb38d5ea10ab7ccfbe9a427b2b39f9a953 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -190,17 +190,17 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) int n; int k; int ib; - MORSE_Complex64_t *A; + const MORSE_Complex64_t *A; int lda; - MORSE_Complex64_t *T; + const MORSE_Complex64_t *T; int ldt; MORSE_Complex64_t *C; int ldc; MORSE_Complex64_t *WORK; int ldwork; - A = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); C = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ @@ -221,7 +221,8 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) int n; int k; int ib; - cuDoubleComplex *A, *T, *C, *WORK; + const cuDoubleComplex *A, *T; + cuDoubleComplex *C, *WORK; int lda, ldt, ldc, ldwork; int info = 0; CUstream stream; @@ -229,8 +230,8 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &lda, &ldt, &ldc, &ldwork); - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); + A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 7af0ac2e49860997f9a7c1d190b4c2765ace7bd0..0da29addb22d4383b6266e757df2a3fb0d751189 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -65,6 +65,7 @@ ZCODELETS_HEADER(gessq) ZCODELETS_HEADER(getrf) ZCODELETS_HEADER(getrf_incpiv) ZCODELETS_HEADER(getrf_nopiv) +ZCODELETS_HEADER(herfb) ZCODELETS_HEADER(lauum) ZCODELETS_HEADER(potrf) ZCODELETS_HEADER(ssssm) @@ -75,6 +76,8 @@ ZCODELETS_HEADER(trtri) ZCODELETS_HEADER(tslqt) ZCODELETS_HEADER(tsmlq) ZCODELETS_HEADER(tsmqr) +ZCODELETS_HEADER(tsmlq_hetra1) +ZCODELETS_HEADER(tsmqr_hetra1) ZCODELETS_HEADER(tsqrt) ZCODELETS_HEADER(tstrf) ZCODELETS_HEADER(ttlqt) @@ -88,6 +91,7 @@ ZCODELETS_HEADER(unmqr) * Auxiliary functions */ ZCODELETS_HEADER(geadd) +ZCODELETS_HEADER(he2ge) ZCODELETS_HEADER(lascal) ZCODELETS_HEADER(tradd) ZCODELETS_HEADER(lacpy) @@ -97,6 +101,7 @@ ZCODELETS_HEADER(lansy) ZCODELETS_HEADER(lantr) ZCODELETS_HEADER(laset) ZCODELETS_HEADER(laset2) +ZCODELETS_HEADER(latro) ZCODELETS_HEADER(plssq) ZCODELETS_HEADER(plssq2) diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index e4de172912b9ab9b42449bac31934c7ad8246d3d..fe9c243f374a1210b8394491c77bd2e6465e3756 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -82,10 +82,10 @@ set(ZSRC ################## testing_zgeadd.c #testing_zgecfi.c - #testing_zgesvd.c + testing_zgesvd.c #testing_zgetmi.c #testing_zheev.c - #testing_zheevd.c + testing_zheevd.c #testing_zhegst.c #testing_zhegv.c #testing_zhegvd.c @@ -138,6 +138,7 @@ if(NOT CHAMELEON_SIMULATION) list(APPEND libs_for_tests coreblas ${LAPACKE_LIBRARIES} + ${TMG_LIBRARIES} ${LAPACK_LIBRARIES} ${CBLAS_LIBRARIES} ${BLAS_LIBRARIES} @@ -146,6 +147,7 @@ if(NOT CHAMELEON_SIMULATION) ) link_directories(${LAPACKE_LIBRARY_DIRS}) + link_directories(${TMG_LIBRARY_DIRS}) link_directories(${LAPACK_LIBRARY_DIRS}) link_directories(${CBLAS_LIBRARY_DIRS}) link_directories(${BLAS_LIBRARY_DIRS}) diff --git a/testing/testing_zauxiliary.c b/testing/testing_zauxiliary.c index 503fa34b2540cec94e6aa92674cedcb97df22589..aa73b238279ecac1614e926aa1ab4c4f5f3781d0 100644 --- a/testing/testing_zauxiliary.c +++ b/testing/testing_zauxiliary.c @@ -194,99 +194,117 @@ int main (int argc, char **argv) */ if ( strcmp(func, "LANGE") == 0 ) { info += testing_zlange( argc, argv ); - /* - * Blas Level 3 - */ - } else if ( strcmp(func, "GEMM") == 0 ) { + } + /* + * Blas Level 3 + */ + else if ( strcmp(func, "GEMM") == 0 ) { info += testing_zgemm( argc, argv ); + } #ifdef COMPLEX - } else if ( strcmp(func, "HEMM") == 0 ) { + else if ( strcmp(func, "HEMM") == 0 ) { info += testing_zhemm( argc, argv ); - } else if ( strcmp(func, "HERK") == 0 ) { + } + else if ( strcmp(func, "HERK") == 0 ) { info += testing_zherk( argc, argv ); - } else if ( strcmp(func, "HER2K") == 0 ) { + } + else if ( strcmp(func, "HER2K") == 0 ) { info += testing_zher2k( argc, argv ); + } #endif - } else if ( strcmp(func, "SYMM") == 0 ) { + else if ( strcmp(func, "SYMM") == 0 ) { info += testing_zsymm( argc, argv ); - } else if ( strcmp(func, "SYRK") == 0 ) { + } + else if ( strcmp(func, "SYRK") == 0 ) { info += testing_zsyrk( argc, argv ); - } else if ( strcmp(func, "SYR2K") == 0 ) { + } + else if ( strcmp(func, "SYR2K") == 0 ) { info += testing_zsyr2k( argc, argv ); - } else if ( strcmp(func, "TRMM") == 0 ) { + } + else if ( strcmp(func, "TRMM") == 0 ) { info += testing_ztrmm( argc, argv ); - } else if ( strcmp(func, "TRSM") == 0 ) { + } + else if ( strcmp(func, "TRSM") == 0 ) { info += testing_ztrsm( argc, argv ); - } else if ( strcmp(func, "PEMV") == 0 ) { + } + else if ( strcmp(func, "PEMV") == 0 ) { info += testing_zpemv( argc, argv ); - } else if ( strcmp(func, "GEADD") == 0 ) { + } + else if ( strcmp(func, "GEADD") == 0 ) { info = testing_zgeadd( argc, argv ); - /* - * Linear system - */ - } else if ( strcmp(func, "POSV") == 0 ) { + } + /* + * Linear system + */ + else if ( strcmp(func, "POSV") == 0 ) { info += testing_zposv( argc, argv ); - } else if ( strcmp(func, "GELS") == 0 ) { + } + else if ( strcmp(func, "GELS") == 0 ) { info += testing_zgels( argc, argv ); - } else if ( strcmp(func, "GESV_INCPIV") == 0 ) { + } + else if ( strcmp(func, "GESV_INCPIV") == 0 ) { info += testing_zgesv_incpiv( argc, argv ); - /* - } else if ( strcmp(func, "GESV") == 0 ) { - info += testing_zgesv( argc, argv ); - */ - /* - * Matrix inversion - */ - - } else if ( strcmp(func, "POTRI") == 0 ) { + } + /* else if ( strcmp(func, "GESV") == 0 ) { */ + /* info += testing_zgesv( argc, argv ); */ + /* } */ + /* + * Matrix inversion + */ + else if ( strcmp(func, "POTRI") == 0 ) { info += testing_zpotri( argc, argv ); - /* - } else if ( strcmp(func, "GETRI") == 0 ) { - info += testing_zgetri( argc, argv ); - */ - /* - * Eigenvalue Problems - */ - /* - } else if ( strcmp(func, "HEEV") == 0 ) { - info += testing_zheev( argc, argv ); - } else if ( strcmp(func, "HEEVD") == 0 ) { - info += testing_zheevd( argc, argv ); - } else if ( strcmp(func, "HEGV") == 0 ) { - info += testing_zhegv( argc, argv ); - } else if ( strcmp(func, "HEGVD") == 0 ) { - info += testing_zhegv( argc, argv ); - } else if ( strcmp(func, "HEGST") == 0 ) { - info += testing_zhegst( argc, argv ); - */ - /* - * Singular Value Decomposition - */ - /* - } else if ( strcmp(func, "GESVD") == 0 ) { - info += testing_zgesvd( argc, argv ); - */ + } + /* else if ( strcmp(func, "GETRI") == 0 ) { */ + /* info += testing_zgetri( argc, argv ); */ + /* } */ + /* + * Eigenvalue Problems + */ + /* else if ( strcmp(func, "HEEV") == 0 ) { */ + /* info += testing_zheev( argc, argv ); */ + /* } */ + else if ( strcmp(func, "HEEVD") == 0 ) { + info += testing_zheevd( argc, argv ); + } + /* else if ( strcmp(func, "HEGV") == 0 ) { */ + /* info += testing_zhegv( argc, argv ); */ + /* } */ + /* else if ( strcmp(func, "HEGVD") == 0 ) { */ + /* info += testing_zhegv( argc, argv ); */ + /* } */ + /* else if ( strcmp(func, "HEGST") == 0 ) { */ + /* info += testing_zhegst( argc, argv ); */ + /* } */ + /* + * Singular Value Decomposition + */ + else if ( strcmp(func, "GESVD") == 0 ) { + info += testing_zgesvd( argc, argv ); + } #ifdef DOUBLE - /* - * Mixed precision - */ - /* - } else if ( strcmp(func, "CPOSV") == 0 ) { - info += testing_zcposv( argc, argv ); - } else if ( strcmp(func, "CGESV") == 0 ) { - info += testing_zcgesv( argc, argv ); - } else if ( strcmp(func, "CUNGESV") == 0 ) { - info += testing_zcungesv( argc, argv ); - */ + /* + * Mixed precision + */ + /* else if ( strcmp(func, "CPOSV") == 0 ) { */ + /* info += testing_zcposv( argc, argv ); */ + /* } */ + /* else if ( strcmp(func, "CGESV") == 0 ) { */ + /* info += testing_zcgesv( argc, argv ); */ + /* } */ + /* else if ( strcmp(func, "CUNGESV") == 0 ) { */ + /* info += testing_zcungesv( argc, argv ); */ + /* } */ #endif - /* Layout Transformation */ - /* - } else if ( strcmp(func, "GECFI") == 0 ) { - info += testing_zgecfi( argc, argv ); - } else if ( strcmp(func, "GETMI") == 0 ) { - info += testing_zgetmi( argc, argv ); - */ - } else { + /* + * Layout Transformation + */ + /* else if ( strcmp(func, "GECFI") == 0 ) { */ + /* info += testing_zgecfi( argc, argv ); */ + /* } */ + /* else if ( strcmp(func, "GETMI") == 0 ) { */ + /* info += testing_zgetmi( argc, argv ); */ + /* } */ + else { fprintf(stderr, "Function unknown\n"); } diff --git a/testing/testing_zgesvd.c b/testing/testing_zgesvd.c new file mode 100644 index 0000000000000000000000000000000000000000..83f784e2df5dc45508a4c2f3a2ed2e3a89a735bd --- /dev/null +++ b/testing/testing_zgesvd.c @@ -0,0 +1,364 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file testing_zgesvd.c + * + * PLASMA testing routines + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Azzam Haidar + * @author Hatem Ltaief + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> + +#include <morse.h> +#include <coreblas/include/cblas.h> +#include <coreblas/include/lapacke.h> +#include <coreblas/include/coreblas.h> +#include "testing_zauxiliary.h" + +static int check_orthogonality(int, int, int, MORSE_Complex64_t*, int, double); +static int check_reduction(int, int, double*, MORSE_Complex64_t*, int, MORSE_Complex64_t*, int, MORSE_Complex64_t*, int, double); +static int check_solution(int, double*, double*, double); + +int testing_zgesvd(int argc, char **argv) +{ + int tree = 0; + + if ( argc < 1 ){ + goto usage; + } else { + tree = atoi(argv[0]); + } + + /* Check for number of arguments*/ + if ( ((tree == 0) && (argc != 4)) || + ((tree != 0) && (argc != 5)) ){ + usage: + USAGE("GESVD", "MODE M N LDA [RH]", + " - MODE : 0: flat, 1: tree (RH needed)\n" + " - M : number of rows of the matrix A\n" + " - N : number of columns of the matrix A\n" + " - LDA : leading dimension of the matrix A\n" + " - RH : Size of each subdomains\n"); + return -1; + } + + int M = atoi(argv[1]); + int N = atoi(argv[2]); + int LDA = atoi(argv[3]); + int rh; + if ( tree ) { + rh = atoi(argv[4]); + MORSE_Set(MORSE_HOUSEHOLDER_MODE, MORSE_TREE_HOUSEHOLDER); + MORSE_Set(MORSE_HOUSEHOLDER_SIZE, rh); + } + + if (LDA < M){ + printf("LDA should be >= M !\n"); + return -1; + } + + double eps = LAPACKE_dlamch_work('e'); + double dmax = 1.0; + MORSE_enum jobu = MorseVec; + MORSE_enum jobvt = MorseVec; + int info_orthou = 0; + int info_orthovt = 0; + int info_solution = 0; + int info_reduction = 0; + int minMN = min(M, N); + int mode = 4; + double rcond = (double) minMN; + int INFO=-1; + + MORSE_Complex64_t *A1 = (MORSE_Complex64_t *)malloc(LDA*N*sizeof(MORSE_Complex64_t)); + double *S1 = (double *) malloc(minMN*sizeof(double)); + double *S2 = (double *) malloc(minMN*sizeof(double)); + MORSE_Complex64_t *work = (MORSE_Complex64_t *)malloc(3*max(M, N)* sizeof(MORSE_Complex64_t)); + MORSE_Complex64_t *A2 = NULL; + MORSE_Complex64_t *U = NULL; + MORSE_Complex64_t *VT = NULL; + MORSE_desc_t *T; + + /* Check if unable to allocate memory */ + if ( (!A1) || (!S1) || (!S2) || (!work) ) { + printf("Out of Memory \n "); + return -2; + } + + /* TODO: check problem with workspace!!! */ + MORSE_Alloc_Workspace_zgesvd(M, N, &T, 1, 1); + + /*---------------------------------------------------------- + * TESTING ZGESVD + */ + /* Initialize A1 */ + LAPACKE_zlatms_work( LAPACK_COL_MAJOR, M, N, + morse_lapack_const(MorseDistUniform), ISEED, + morse_lapack_const(MorseNonsymPosv), S1, mode, rcond, + dmax, M, N, + morse_lapack_const(MorseNoPacking), A1, LDA, work ); + free(work); + + /* Copy A1 for check */ + if ( (jobu == MorseVec) && (jobvt == MorseVec) ) { + A2 = (MORSE_Complex64_t *)malloc(LDA*N*sizeof(MORSE_Complex64_t)); + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'A', M, N, A1, LDA, A2, LDA); + } + if ( jobu == MorseVec ) { + U = (MORSE_Complex64_t *)malloc(M*M*sizeof(MORSE_Complex64_t)); + LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', M, M, 0., 1., U, M); + } + if ( jobvt == MorseVec ) { + VT = (MORSE_Complex64_t *)malloc(N*N*sizeof(MORSE_Complex64_t)); + LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', N, N, 0., 1., VT, N); + } + + /* MORSE ZGESVD */ + INFO = MORSE_zgesvd(jobu, jobvt, M, N, A1, LDA, S2, T, U, M, VT, N); + if( INFO != 0 ){ + printf(" MORSE_zgesvd returned with error code %d\n",INFO); + goto fin; + } + + printf("\n"); + printf("------ TESTS FOR MORSE ZGESVD ROUTINE ------- \n"); + printf(" Size of the Matrix %d by %d\n", M, N); + printf("\n"); + printf(" The matrix A is randomly generated for each test.\n"); + printf("============\n"); + printf(" The relative machine precision (eps) is to be %e \n",eps); + printf(" Computational tests pass if scaled residuals are less than 60.\n"); + + + /* Check the orthogonality, reduction and the singular values */ + if ( jobu == MorseVec ) + info_orthou = check_orthogonality(MorseLeft, M, M, U, M, eps); + + if ( jobvt == MorseVec ) + info_orthovt = check_orthogonality(MorseRight, N, N, VT, N, eps); + + if ( (jobu == MorseVec) && (jobvt == MorseVec) ) + info_reduction = check_reduction(M, N, S2, A2, LDA, U, M, VT, N, eps); + + info_solution = check_solution(minMN, S1, S2, eps); + + if ( (info_solution == 0) & (info_orthou == 0) & + (info_orthovt == 0) & (info_reduction == 0) ) { + if (M >= N) { + printf("***************************************************\n"); + printf(" ---- TESTING ZGESVD .. M >= N ........... PASSED !\n"); + printf("***************************************************\n"); + } + else { + printf("***************************************************\n"); + printf(" ---- TESTING ZGESVD .. M < N ............ PASSED !\n"); + printf("***************************************************\n"); + } + } + else { + if (M >= N) { + printf("************************************************\n"); + printf(" - TESTING ZGESVD .. M >= N .. FAILED !\n"); + printf("************************************************\n"); + } + else { + printf("************************************************\n"); + printf(" - TESTING ZGESVD .. M < N .. FAILED !\n"); + printf("************************************************\n"); + } + } + +fin: + if ( A2 != NULL ) free(A2); + if ( U != NULL ) free(U); + if ( VT != NULL ) free(VT); + free(A1); free(S1); free(S2); + MORSE_Dealloc_Workspace(&T); + + return 0; +} + +/*------------------------------------------------------------------- + * Check the orthogonality of U VT + */ +static int check_orthogonality(int side, int M, int N, MORSE_Complex64_t *Q, int LDQ, double eps) +{ + double done = 1.0; + double mdone = -1.0; + double normQ, result; + int info_ortho; + int minMN = min(M, N); + double *work = (double *)malloc(minMN*sizeof(double)); + + /* Build the idendity matrix */ + MORSE_Complex64_t *Id = (MORSE_Complex64_t *) malloc(minMN*minMN*sizeof(MORSE_Complex64_t)); + LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', minMN, minMN, 0., 1., Id, minMN); + + /* Perform Id - Q'Q */ + if (M >= N) + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, N, M, done, Q, LDQ, mdone, Id, N); + else + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, M, N, done, Q, LDQ, mdone, Id, M); + + normQ = LAPACKE_zlansy_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseInfNorm), 'U', minMN, Id, minMN, work); + + if (getenv("MORSE_TESTING_VERBOSE")) + printf( "||Q||_oo=%e\n", normQ ); + + result = normQ / (M * eps); + if (side == MorseLeft) + { + printf(" ======================================================\n"); + printf(" ||Id-U'*U||_oo / (M*eps) : %e \n", result ); + printf(" ======================================================\n"); + } + else + { + printf(" ======================================================\n"); + printf(" ||Id-VT'*VT||_oo / (M*eps) : %e \n", result ); + printf(" ======================================================\n"); + } + + if ( isnan(result) || isinf(result) || (result > 60.0) ) { + printf("-- Orthogonality is suspicious ! \n"); + info_ortho = 1; + } + else { + printf("-- Orthogonality is CORRECT ! \n"); + info_ortho = 0; + } + + free(work); free(Id); + + return info_ortho; +} + +/*------------------------------------------------------------ + * Check the bidiagonal reduction + */ +static int check_reduction(int M, int N, double *S, MORSE_Complex64_t *A, int LDA, + MORSE_Complex64_t *U, int LDU, MORSE_Complex64_t *VT, int LDVT, double eps ) +{ + MORSE_Complex64_t zone = 1.0; + MORSE_Complex64_t mzone = -1.0; + MORSE_Complex64_t zzero = 0.0; + double Anorm, Rnorm, result; + int info_reduction; + int i; + int maxMN = max(M, N); + int minMN = min(M, N); + + MORSE_Complex64_t *TEMP = (MORSE_Complex64_t *)malloc(minMN*minMN*sizeof(MORSE_Complex64_t)); + MORSE_Complex64_t *Residual = (MORSE_Complex64_t *)malloc(M *N *sizeof(MORSE_Complex64_t)); + double *work = (double *)malloc(maxMN*sizeof(double)); + + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseUpperLower), M, N, A, LDA, Residual, M); + + if ( M >= N ) { + /* Compute TEMP = SIGMA * Vt */ + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'A', N, N, VT, LDVT, TEMP, N); + for (i = 0; i < minMN; i++){ + cblas_zdscal(N, S[i], TEMP + i, N); + } + + /* Compute Residual = A - U * (SIGMA * VT) */ + cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, N, + CBLAS_SADDR(mzone), U, LDU, + TEMP, minMN, + CBLAS_SADDR(zone), Residual, M); + } + else { + /* Compute TEMP = U * SIGMA */ + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'A', M, M, U, LDU, TEMP, M); + for (i = 0; i < minMN; i++){ + cblas_zdscal(M, S[i], TEMP + i*M, 1); + } + + /* Compute Residual = A - (U * SIGMA) * VT */ + cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, M, + CBLAS_SADDR(mzone), TEMP, M, + VT, LDVT, + CBLAS_SADDR(zone), Residual, M); + } + + /* Compute the norms */ + Rnorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseOneNorm), M, N, Residual, M, work); + Anorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseOneNorm), M, N, A, LDA, work); + + if (getenv("MORSE_TESTING_VERBOSE")) + printf( "||A||_oo=%e\n||A - U*SIGMA*VT||_oo=%e\n", Anorm, Rnorm ); + + result = Rnorm / ( Anorm * maxMN * eps); + printf(" ======================================================\n"); + printf(" ||A-U*SIGMA*V'||_oo/(||A||_oo.N.eps) : %e \n", result ); + printf(" ======================================================\n"); + + if ( isnan(result) || isinf(result) || (result > 60.0) ) { + printf("-- Reduction is suspicious ! \n"); + info_reduction = 1; + } + else { + printf("-- Reduction is CORRECT ! \n"); + info_reduction = 0; + } + + free(TEMP); + free(Residual); + free(work); + + return info_reduction; +} + +/*------------------------------------------------------------ + * Check the eigenvalues + */ +static int check_solution(int N, double *E1, double *E2, double eps) +{ + int info_solution, i; + double resid; + double maxtmp; + double maxel = fabs( fabs(E1[0]) - fabs(E2[0]) ); + double maxeig = max( fabs(E1[0]), fabs(E2[0]) ); + for (i = 1; i < N; i++){ + resid = fabs(fabs(E1[i])-fabs(E2[i])); + maxtmp = max(fabs(E1[i]), fabs(E2[i])); + + /* Update */ + maxeig = max(maxtmp, maxeig); + maxel = max(resid, maxel ); + } + + maxel = maxel / (maxeig * N * eps); + printf(" ======================================================\n"); + printf(" | S - singularcomputed | / (|S| * N * eps) : %e \n", maxel ); + printf(" ======================================================\n"); + + if ( isnan(maxel) || isinf(maxel) || (maxel > 100) ) { + printf("-- The singular values are suspicious ! \n"); + info_solution = 1; + } + else{ + printf("-- The singular values are CORRECT ! \n"); + info_solution = 0; + } + return info_solution; +} diff --git a/testing/testing_zheevd.c b/testing/testing_zheevd.c new file mode 100644 index 0000000000000000000000000000000000000000..0837a58c799f38798e21a6b3889e95c030f5c2f8 --- /dev/null +++ b/testing/testing_zheevd.c @@ -0,0 +1,293 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file testing_zheevd.c + * + * PLASMA testing routines + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.8.0 + * @author Hatem Ltaief + * @author Azzam Haidar + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> + +#include <morse.h> +#include <coreblas/include/cblas.h> +#include <coreblas/include/lapacke.h> +#include <coreblas/include/coreblas.h> +#include "testing_zauxiliary.h" + +static int check_orthogonality(int, int, MORSE_Complex64_t*, int, double); +static int check_reduction(int, int, int, MORSE_Complex64_t*, double*, int, MORSE_Complex64_t*, double); +static int check_solution(int, double*, double*, double); + +int testing_zheevd(int argc, char **argv) +{ + /* Check for number of arguments*/ + if (argc != 3) { + USAGE("HEEVD", "MODE N LDA", + " - MODE : mode used to generate the matrix\n" + " - N : size of the matrix A\n" + " - LDA : leading dimension of the matrix A\n"); + return -1; + } + + int mode = atoi(argv[0]); + int N = atoi(argv[1]); + int LDA = atoi(argv[2]); + double eps = LAPACKE_dlamch_work('e'); + double dmax = 1.0; + double rcond = 1.0e6; + int INFO = -1; + + MORSE_enum uplo = MorseUpper; + MORSE_enum vec = MorseVec; + int info_ortho = 0; + int info_solution = 0; + int info_reduction = 0; + int LDAxN = LDA * N; + + MORSE_Complex64_t *A1 = NULL; + MORSE_Complex64_t *A2 = (MORSE_Complex64_t *)malloc(LDAxN * sizeof(MORSE_Complex64_t)); + double *W1 = (double *)malloc(N * sizeof(double)); + double *W2 = (double *)malloc(N * sizeof(double)); + MORSE_Complex64_t *work = (MORSE_Complex64_t *)malloc(3* N * sizeof(MORSE_Complex64_t)); + MORSE_desc_t *T; + + /* Check if unable to allocate memory */ + if ( (!A2) || (!W1) || (!W2) ){ + printf("Out of Memory \n "); + return -2; + } + + MORSE_Alloc_Workspace_zheevd(N, N, &T, 1, 1); + + /*---------------------------------------------------------- + * TESTING ZHEEVD + */ + /* Initialize A1 */ + if (mode == 0){ + int i; + for (i=0; i<N; i++){ + W1[i] = (double )i+1; + } + } + LAPACKE_zlatms_work( LAPACK_COL_MAJOR, N, N, + morse_lapack_const(MorseDistSymmetric), ISEED, + morse_lapack_const(MorseHermGeev), W1, mode, rcond, + dmax, N, N, + morse_lapack_const(MorseNoPacking), A2, LDA, work ); + + /* + * Sort the eigenvalue because when computing the tridiag + * and then the eigenvalue of the DSTQR are sorted. + * So to avoid testing fail when having good results W1 should be sorted + */ + LAPACKE_dlasrt_work( 'I', N, W1 ); + + if ( vec == MorseVec ) { + A1 = (MORSE_Complex64_t *)malloc(LDAxN*sizeof(MORSE_Complex64_t)); + + /* Copy A2 into A1 */ + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, 'A', N, N, A2, LDA, A1, LDA); + } + + /* + * MORSE ZHEEVD + */ + INFO = MORSE_zheevd(vec, uplo, N, A2, LDA, W2, T); + + if (INFO != 0) { + printf(" ERROR OCCURED INFO %d\n", INFO); + goto fin; + } + + printf("\n"); + printf("------ TESTS FOR MORSE ZHEEVD ROUTINE ------- \n"); + printf(" Size of the Matrix %d by %d\n", N, N); + printf("\n"); + printf(" The matrix A is randomly generated for each test.\n"); + printf("============\n"); + printf(" The relative machine precision (eps) is to be %e \n", eps); + printf(" Computational tests pass if scaled residuals are less than 60.\n"); + + /* Check the orthogonality, reduction and the eigen solutions */ + if (vec == MorseVec) { + info_ortho = check_orthogonality(N, N, A2, LDA, eps); + info_reduction = check_reduction(uplo, N, 1, A1, W2, LDA, A2, eps); + } + info_solution = check_solution(N, W1, W2, eps); + + if ( (info_solution == 0) & (info_ortho == 0) & (info_reduction == 0) ) { + printf("***************************************************\n"); + printf(" ---- TESTING ZHEEVD ...................... PASSED !\n"); + printf("***************************************************\n"); + } + else { + printf("************************************************\n"); + printf(" - TESTING ZHEEVD ... FAILED !\n"); + printf("************************************************\n"); + } + + fin: + MORSE_Dealloc_Workspace(&T); + free(A2); + free(W1); + free(W2); + free(work); + if (A1 != NULL) free(A1); + + return 0; +} + +/*------------------------------------------------------------------- + * Check the orthogonality of Q + */ +static int check_orthogonality(int M, int N, MORSE_Complex64_t *Q, int LDQ, double eps) +{ + double done = 1.0; + double mdone = -1.0; + double normQ, result; + int info_ortho; + int minMN = min(M, N); + double *work = (double *)malloc(minMN*sizeof(double)); + + /* Build the idendity matrix */ + MORSE_Complex64_t *Id = (MORSE_Complex64_t *) malloc(minMN*minMN*sizeof(MORSE_Complex64_t)); + LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', minMN, minMN, 0., 1., Id, minMN); + + /* Perform Id - Q'Q */ + if (M >= N) + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, N, M, done, Q, LDQ, mdone, Id, minMN); + else + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, M, N, done, Q, LDQ, mdone, Id, minMN); + + normQ = LAPACKE_zlanhe_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseInfNorm), 'U', minMN, Id, minMN, work); + + result = normQ / (minMN * eps); + printf(" ======================================================\n"); + printf(" ||Id-Q'*Q||_oo / (minMN*eps) : %15.3E \n", result ); + printf(" ======================================================\n"); + + if ( isnan(result) || isinf(result) || (result > 60.0) ) { + printf("-- Orthogonality is suspicious ! \n"); + info_ortho=1; + } + else { + printf("-- Orthogonality is CORRECT ! \n"); + info_ortho=0; + } + free(work); free(Id); + return info_ortho; +} + +/*------------------------------------------------------------ + * Check the reduction + */ +static int check_reduction(int uplo, int N, int bw, MORSE_Complex64_t *A, double *D, int LDA, MORSE_Complex64_t *Q, double eps ) +{ + (void) bw; + MORSE_Complex64_t zone = 1.0; + MORSE_Complex64_t mzone = -1.0; + MORSE_Complex64_t *TEMP = (MORSE_Complex64_t *)malloc(N*N*sizeof(MORSE_Complex64_t)); + MORSE_Complex64_t *Residual = (MORSE_Complex64_t *)malloc(N*N*sizeof(MORSE_Complex64_t)); + double *work = (double *)malloc(N*sizeof(double)); + double Anorm, Rnorm, result; + int info_reduction; + int i; + + /* Compute TEMP = Q * LAMBDA */ + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseUpperLower), N, N, Q, LDA, TEMP, N); + + for (i = 0; i < N; i++){ + cblas_zdscal(N, D[i], &(TEMP[i*N]), 1); + } + /* Compute Residual = A - Q * LAMBDA * Q^H */ + /* A is Hermetian but both upper and lower + * are assumed valable here for checking + * otherwise it need to be symetrized before + * checking. + */ + LAPACKE_zlacpy_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseUpperLower), N, N, A, LDA, Residual, N); + cblas_zgemm(CblasColMajor, CblasNoTrans, CblasConjTrans, N, N, N, CBLAS_SADDR(mzone), TEMP, N, Q, LDA, CBLAS_SADDR(zone), Residual, N); + + Rnorm = LAPACKE_zlange_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseOneNorm), N, N, Residual, N, work); + Anorm = LAPACKE_zlanhe_work(LAPACK_COL_MAJOR, morse_lapack_const(MorseOneNorm), morse_lapack_const(uplo), N, A, LDA, work); + + result = Rnorm / ( Anorm * N * eps); + if ( uplo == MorseLower ){ + printf(" ======================================================\n"); + printf(" ||A-Q*LAMBDA*Q'||_oo/(||A||_oo.N.eps) : %15.3E \n", result ); + printf(" ======================================================\n"); + }else{ + printf(" ======================================================\n"); + printf(" ||A-Q'*LAMBDA*Q||_oo/(||A||_oo.N.eps) : %15.3E \n", result ); + printf(" ======================================================\n"); + } + + if ( isnan(result) || isinf(result) || (result > 60.0) ) { + printf("-- Reduction is suspicious ! \n"); + info_reduction = 1; + } + else { + printf("-- Reduction is CORRECT ! \n"); + info_reduction = 0; + } + + free(TEMP); free(Residual); + free(work); + + return info_reduction; +} +/*------------------------------------------------------------ + * Check the eigenvalues + */ +static int check_solution(int N, double *E1, double *E2, double eps) +{ + int info_solution, i; + double resid; + double maxtmp; + double maxel = fabs( fabs(E1[0]) - fabs(E2[0]) ); + double maxeig = max( fabs(E1[0]), fabs(E2[0]) ); + + for (i = 1; i < N; i++){ + resid = fabs(fabs(E1[i])-fabs(E2[i])); + maxtmp = max(fabs(E1[i]), fabs(E2[i])); + + /* Update */ + maxeig = max(maxtmp, maxeig); + maxel = max(resid, maxel ); + } + + maxel = maxel / (maxeig * N * eps); + printf(" ======================================================\n"); + printf(" | D - eigcomputed | / (|D| * N * eps) : %15.3E \n", maxel ); + printf(" ======================================================\n"); + + if ( isnan(maxel) || isinf(maxel) || (maxel > 100) ) { + printf("-- The eigenvalues are suspicious ! \n"); + info_solution = 1; + } + else{ + printf("-- The eigenvalues are CORRECT ! \n"); + info_solution = 0; + } + return info_solution; +} diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt index bd29812209892b41796228844ee31a2da5e5368c..d30584ac43575b725abc8d244c92385a3d77bc17 100644 --- a/timing/CMakeLists.txt +++ b/timing/CMakeLists.txt @@ -118,10 +118,10 @@ if (NOT CHAMELEON_SIMULATION) time_zlange_tile.c #time_zgebrd_tile.c #time_zgecfi.c - #time_zgesvd_tile.c + time_zgesvd_tile.c #time_zgetrf_reclap.c #time_zgetrf_rectil.c - #time_zheevd_tile.c + time_zheevd_tile.c #time_zheev_tile.c #time_zhegv_tile.c #time_zlapack2tile.c @@ -237,6 +237,7 @@ if(NOT CHAMELEON_SIMULATION) list(APPEND libs_for_timings coreblas ${LAPACKE_LIBRARIES} + ${TMG_LIBRARIES} ${CBLAS_LIBRARIES} ${LAPACK_SEQ_LIBRARIES} ${BLAS_SEQ_LIBRARIES} @@ -245,6 +246,7 @@ if(NOT CHAMELEON_SIMULATION) ) link_directories(${LAPACKE_LIBRARY_DIRS}) + link_directories(${TMG_LIBRARY_DIRS}) link_directories(${LAPACK_LIBRARY_DIRS}) link_directories(${CBLAS_LIBRARY_DIRS}) link_directories(${BLAS_LIBRARY_DIRS}) diff --git a/timing/time_zgesvd_tile.c b/timing/time_zgesvd_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..e4cb71a7ee97fb73435b65ec3f5110b69da4e231 --- /dev/null +++ b/timing/time_zgesvd_tile.c @@ -0,0 +1,75 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @precisions normal z -> c d s + * + **/ +#define _TYPE MORSE_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "PLASMA_zheev_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_GEBRD( M, N ) +#define _FADDS FADDS_GEBRD( M, N ) + +#include "./timing.c" + +static int +RunTest(int *iparam, double *dparam, morse_time_t *t_) +{ + PASTE_CODE_IPARAM_LOCALS( iparam ); + MORSE_desc_t *descT; + int jobu = MorseVec; + int jobvt = MorseVec; + int INFO; + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, MORSE_Complex64_t, MorseComplexDouble, LDA, M, N ); + PASTE_CODE_ALLOCATE_MATRIX( VT, (jobvt == MorseVec), MORSE_Complex64_t, N, N ); + PASTE_CODE_ALLOCATE_MATRIX( U, (jobu == MorseVec), MORSE_Complex64_t, M, M ); + PASTE_CODE_ALLOCATE_MATRIX( S, 1, double, N, 1 ); + + /* Initialiaze Data */ + MORSE_zplrnt_Tile(descA, 51 ); + + /* Allocate Workspace */ + MORSE_Alloc_Workspace_zgesvd(N, N, &descT, 1, 1); + + if ( jobu == MorseVec ) { + LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', M, M, 0., 1., U, M); + } + if ( jobvt == MorseVec ) { + LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', N, N, 0., 1., VT, N); + } + + START_TIMING(); + INFO = MORSE_zgesvd_Tile(jobu, jobvt, descA, S, descT, U, M, VT, N); + STOP_TIMING(); + + if(INFO!=0){ + printf(" ERROR OCCURED INFO %d\n",INFO); + } + + /* DeAllocate Workspace */ + MORSE_Dealloc_Workspace(&descT); + + if (jobu == MorseVec) { + free( U ); + } + if (jobvt == MorseVec) { + free( VT ); + } + PASTE_CODE_FREE_MATRIX( descA ); + free( S ); + return 0; +} diff --git a/timing/time_zheevd_tile.c b/timing/time_zheevd_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..d659b82138de39592c6925f0f2ce709560e15f77 --- /dev/null +++ b/timing/time_zheevd_tile.c @@ -0,0 +1,60 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2014 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @precisions normal z -> c d s + * + **/ +#define _TYPE MORSE_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "PLASMA_zheevd_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS ((2. / 3.) * ((double)N * (double)N * (double)N)) +#define _FADDS ((2. / 3.) * ((double)N * (double)N * (double)N)) + +#include "./timing.c" +/* #include <mkl_service.h> */ + +static int +RunTest(int *iparam, double *dparam, morse_time_t *t_) +{ + PASTE_CODE_IPARAM_LOCALS( iparam ); + MORSE_desc_t *descT; + int uplo = MorseLower; + int vec = MorseVec; + int INFO; + + LDA = max(LDA, N); + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX( A, 1, MORSE_Complex64_t, LDA, N); + PASTE_CODE_ALLOCATE_MATRIX( S, 1, double, N, 1 ); + + /* Allocate Workspace */ + MORSE_zplghe( (double)N, MorseUpperLower, N, A, LDA, 51 ); + MORSE_Alloc_Workspace_zheevd(N, N, &descT, 1, 1); + + START_TIMING(); + INFO = MORSE_zheevd(vec, uplo, N, A, LDA, S, descT); + STOP_TIMING(); + + if (INFO != 0){ + printf(" ERROR OCCURED INFO %d\n", INFO); + } + + /* DeAllocate Workspace */ + MORSE_Dealloc_Workspace(&descT); + + free( A ); + return 0; +} diff --git a/timing/timing.c b/timing/timing.c index 831f99bbe97f62a2c15c4ecd9549243edcc2e2e8..28bb1c23a6e2b514d101fe3fe44177819492a460 100644 --- a/timing/timing.c +++ b/timing/timing.c @@ -491,6 +491,7 @@ main(int argc, char *argv[]) { iparam[IPARAM_NX ] = -1; iparam[IPARAM_RHBLK ] = 0; iparam[IPARAM_INPLACE ] = MORSE_OUTOFPLACE; + iparam[IPARAM_MODE ] = 0; iparam[IPARAM_INVERSE ] = 0; iparam[IPARAM_NCUDAS ] = 0; @@ -613,6 +614,12 @@ main(int argc, char *argv[]) { iparam[IPARAM_BOUND] = 1; } else if (startswith( argv[i], "--p=" )) { sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_P]) ); + } else if (startswith( argv[i], "--mode=" )) { + sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_MODE]) ); + if (iparam[IPARAM_MODE] < 0 || iparam[IPARAM_MODE] > 20){ + fprintf( stderr, "Invalid mode: %s from 0 to 20\n", argv[i] ); + exit(0); + } } else { fprintf( stderr, "Unknown option: %s\n", argv[i] ); } diff --git a/timing/timing.h b/timing/timing.h index 11019ab02268eafa6a9413658cba5bb1176fdf79..948866a778d05dd648a393b2a35b357851d35b2f 100644 --- a/timing/timing.h +++ b/timing/timing.h @@ -42,6 +42,7 @@ enum iparam_timing { IPARAM_NX, /* */ IPARAM_RHBLK, /* Householder reduction parameter for QR/LQ */ IPARAM_INPLACE, /* InPlace/OutOfPlace translation mode */ + IPARAM_MODE, /* Eigenvalue generation mode */ IPARAM_INVERSE, IPARAM_NCUDAS, @@ -101,7 +102,7 @@ enum dparam_timing { (void)M;(void)N;(void)K;(void)NRHS; \ (void)LDA;(void)LDB;(void)LDC; \ (void)IB;(void)MB;(void)NB;(void)P;(void)Q; \ - (void)MT;(void)NT;(void)check;(void)loud; + (void)MT;(void)NT;(void)check;(void)loud;(void)bigmat; /* Paste code to allocate a matrix in desc if cond_init is true */ #define PASTE_CODE_ALLOCATE_MATRIX_TILE(_desc_, _cond_, _type_, _type2_, _lda_, _m_, _n_) \