diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index a2dbeb4857b3c5b39d41555b35c8edf471c70db4..98312113fb288cd692b363919bc1410210bcdea3 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -32,11 +32,8 @@ #define A(m,n) A, m, n #define T(m,n) T, m, n -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(k) DIAG, k, 0 -#else -#define DIAG(k) A, k, k -#endif + /***************************************************************************//** * Parallel tile LQ factorization - dynamic scheduling **/ @@ -77,9 +74,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_MAGMA) - /* necessary to use UNMLQ on GPU */ - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb); /* Worker space * * zgelqt = max( A->nb * (ib+1), ib * (ib + A->nb) ) @@ -104,6 +98,10 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tslqt and unmlq tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb); + for (k = 0; k < min(A->mt, A->nt); k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -113,20 +111,20 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, tempkm, tempkn, ib, T->nb, A(k, k), ldak, T(k, k), T->mb); -#if defined(CHAMELEON_USE_MAGMA) if ( k < (A->mt-1) ) { MORSE_TASK_zlacpy( &options, MorseUpper, A->mb, A->nb, A->nb, A(k, k), ldak, DIAG(k), A->mb ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, A->mb, A->nb, 0., 1., DIAG(k), A->mb ); - } #endif + } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; ldam = BLKLDD(A, m); @@ -164,8 +162,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index 7304c96671df425597e76b5f1d4bdb85a5fdfa1a..04d389694c0ed92c898150452656d296b893b4b7 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -36,11 +36,8 @@ #define A(m,n) A, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), (n)+A->nt -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(m,n) DIAG, ((n)/BS), 0 -#else -#define DIAG(m,n) A, (m), (n) -#endif + /***************************************************************************//** * Parallel tile LQ factorization (reduction Householder) - dynamic scheduling **/ @@ -58,6 +55,7 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, int ldak, ldam; int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn; int ib; + int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -78,12 +76,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_MAGMA) - { - /* necessary to use UNMLQ on GPU */ - int nblk = ( A->nt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); - } /* Worker space * * zgelqt = max( A->nb * (ib+1), ib * (ib + A->nb) ) @@ -108,6 +100,11 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + nblk = ( A->nt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); + for (k = 0; k < min(A->mt, A->nt); k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); @@ -119,20 +116,20 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, tempkm, tempNn, ib, T->nb, A(k, N), ldak, T(k, N), T->mb); -#if defined(CHAMELEON_USE_MAGMA) if ( k < (A->mt-1) ) { MORSE_TASK_zlacpy( &options, MorseUpper, tempkm, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkm, tempNn, 0., 1., DIAG(k, N), ldak ); - } #endif + } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; ldam = BLKLDD(A, m); @@ -196,8 +193,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 048376d8683ebc13f0b2bd02ef42abc4349a34f9..88f274636efbf2869e24fc07b52f83ed4652d583 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -32,11 +32,8 @@ #define A(m,n) A, m, n #define T(m,n) T, m, n -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(k) DIAG, k, 0 -#else -#define DIAG(k) A, k, k -#endif + /***************************************************************************//** * Parallel tile QR factorization - dynamic scheduling **/ @@ -72,9 +69,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_MAGMA) - /* necessary to use UNMQR on GPU */ - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); /* Worker space * * zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) ) @@ -99,6 +93,10 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tsqrt and unmqr tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); + for (k = 0; k < minMNT; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -108,20 +106,20 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, tempkm, tempkn, ib, T->nb, A(k, k), ldak, T(k, k), T->mb); -#if defined(CHAMELEON_USE_MAGMA) if ( k < (A->nt-1) ) { MORSE_TASK_zlacpy( &options, MorseLower, A->mb, A->nb, A->nb, A(k, k), ldak, DIAG(k), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, A->mb, A->nb, 0., 1., DIAG(k), ldak ); - } #endif + } for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; MORSE_TASK_zunmqr( @@ -158,8 +156,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 56ce1357a221e678f185f78f7e29a9779a4a927d..bbcb6414c636851b6c4ab3ede20bcedbb53fc4cd 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -34,11 +34,7 @@ #define A(m,n) A, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), ((n)+A->nt) -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(m,n) DIAG, ((m)/BS), 0 -#else -#define DIAG(m,n) A, (m), (n) -#endif /***************************************************************************//** * Parallel tile QR factorization (reduction Householder) - dynamic scheduling @@ -57,6 +53,7 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, int ldaM, ldam, ldaMRD; int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm; int ib; + int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -77,12 +74,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_MAGMA) - { - int nblk = ( A->mt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); - } - /* Worker space * * zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) ) @@ -107,6 +98,11 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + nblk = ( A->mt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); + K = min(A->mt, A->nt); for (k = 0; k < K; k++) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -119,20 +115,20 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, tempMm, tempkn, ib, T->nb, A(M, k), ldaM, T(M, k), T->mb); -#if defined(CHAMELEON_USE_MAGMA) if ( k < (A->nt-1) ) { MORSE_TASK_zlacpy( &options, MorseLower, tempMm, A->nb, A->nb, A(M, k), ldaM, DIAG(M, k), ldaM ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempMm, A->nb, 0., 1., DIAG(M, k), ldaM ); - } #endif + } for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; MORSE_TASK_zunmqr( @@ -196,8 +192,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c index 813a59f2ea18d1f561043266a9a9b1d5df9f562a..08b90104ab9d97356d68c654979a2853aa10d4c2 100644 --- a/compute/pzgetrf_incpiv.c +++ b/compute/pzgetrf_incpiv.c @@ -33,11 +33,7 @@ #include "common.h" #define A(_m_,_n_) A, _m_, _n_ -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(_k_) DIAG, _k_, 0 -#else -#define DIAG(_k_) A, _k_, _k_ -#endif #define L(_m_,_n_) L, _m_, _n_ #define IPIV(_m_,_n_) &(IPIV[(int64_t)A->mb*((int64_t)(_m_)+(int64_t)A->mt*(int64_t)(_n_))]) @@ -65,9 +61,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, ib = MORSE_IB; #if defined(CHAMELEON_USE_MAGMA) - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); - h_work_size = sizeof(MORSE_Complex64_t)*( 2*ib + 2*L->nb )*2*A->mb; d_work_size = sizeof(MORSE_Complex64_t)*( ib )*2*A->mb; #else @@ -76,6 +69,10 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, #endif RUNTIME_options_ws_alloc( &options, h_work_size, d_work_size ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); + for (k = 0; k < minMNT; k++) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -88,7 +85,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, IPIV(k, k), k == A->mt-1, A->nb*k); -#if defined(CHAMELEON_USE_MAGMA) if ( k < (minMNT-1) ) { MORSE_TASK_zlacpy( &options, @@ -96,7 +92,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, A(k, k), ldak, DIAG(k), ldak); } -#endif for (n = k+1; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -137,8 +132,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzunglq.c b/compute/pzunglq.c index bdaf41676b82c43bf8c9348307422982a8bf2f5f..8372ddaf02d525ae302582c09b025227c7b9e096 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -33,11 +33,8 @@ #define A(m,n) A, m, n #define Q(m,n) Q, m, n #define T(m,n) T, m, n -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(k) DIAG, k, 0 -#else -#define DIAG(k) A, k, k -#endif + /***************************************************************************//** * Parallel construction of Q using tile V (application to identity) - dynamic scheduling **/ @@ -77,8 +74,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_MAGMA) - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb); /* Worker space * * zunmlq = A->nb * ib @@ -92,6 +87,10 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb); + for (k = min(A->mt, A->nt)-1; k >= 0; k--) { tempAkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempAkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -113,12 +112,12 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, T(k, n), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, DIAG(k), A->mb ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempkn, @@ -141,8 +140,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index f158c828561e775012f7f0e6b348514149a0f248..1991b531f6d2d34f5de629c5cc20247ec22d0b85 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -33,11 +33,8 @@ #define Q(m,n) Q, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), (n)+(A->nt) -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(m,n) DIAG, ((n)/BS), 0 -#else -#define DIAG(m,n) A, (m), (n) -#endif + /** * Parallel construction of Q using tile V (application to identity; * reduction Householder) - dynamic scheduling @@ -58,6 +55,7 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, int ldqm; int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn; int ib; + int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -74,12 +72,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_MAGMA) - { - /* necessary to use UNMLQ on GPU */ - int nblk = ( A->nt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); - } /* Worker space * * zunmqr = A->nb * ib @@ -93,6 +85,11 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + nblk = ( A->nt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); + K = min(A->mt, A->nt); for (k = K-1; k >= 0; k--) { tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; @@ -138,12 +135,12 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, T(k, n), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -168,8 +165,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzungqr.c b/compute/pzungqr.c index 1e287e9fe778da3f7e03238a5b5054706413da3b..a67a27696bc6a44b8ef0c9241cdeaab0fe1b3c90 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -33,11 +33,8 @@ #define A(m,n) A, m, n #define Q(m,n) Q, m, n #define T(m,n) T, m, n -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(k) DIAG, k, 0 -#else -#define DIAG(k) A, k, k -#endif + /***************************************************************************//** * Parallel construction of Q using tile V (application to identity) - dynamic scheduling **/ @@ -71,8 +68,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, /* Allocation of temporary (scratch) working space */ #if defined(CHAMELEON_USE_MAGMA) - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); /* Worker space * * zunmqr = A->nb * ib @@ -86,6 +81,10 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); + for (k = min(A->mt, A->nt)-1; k >= 0; k--) { tempAkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempAkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -109,12 +108,12 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, T(m, k), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempkm, tempkmin, A->nb, A(k, k), ldak, DIAG(k), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempkm, tempkmin, @@ -136,8 +135,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index 036fe35aec813686807afd7b90ae8bd2fac9ae98..c3fc863a3f8115cda1a8d48f2f5b35efc93efac2 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -35,11 +35,7 @@ #define Q(m,n) Q, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), (n)+(A->nt) -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(m,n) DIAG, ((m)/BS), 0 -#else -#define DIAG(m,n) A, (m), (n) -#endif /** * Parallel construction of Q using tile V (application to identity; @@ -61,6 +57,7 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, int ldbM, ldbm, ldbMRD; int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin; int ib; + int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -77,12 +74,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_MAGMA) - { - int nblk = ( A->mt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); - } - /* Worker space * * zunmqr = A->nb * ib @@ -96,6 +87,11 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + nblk = ( A->mt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); + K = min(A->mt, A->nt); for (k = K-1; k >= 0; k--) { tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -145,12 +141,12 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, T(m, k), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, DIAG(M, k), ldaM ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempMm, tempkmin, @@ -174,8 +170,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index 6a96c0a5def422ad5ce2a82bfb9e0b4c5c29dcbd..4592ab327288c40ab0cab0370b79a5cc573f55d5 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -34,11 +34,8 @@ #define A(m,n) A, m, n #define B(m,n) B, m, n #define T(m,n) T, m, n -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(k) DIAG, k, 0 -#else -#define DIAG(k) A, k, k -#endif + /***************************************************************************//** * Parallel application of Q using tile V - LQ factorization - dynamic scheduling **/ @@ -79,9 +76,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_MAGMA) - /* necessary to use UNMLQ on GPU */ - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb); /* Worker space * * zunmlq = A->nb * ib @@ -95,6 +89,10 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb); + if (side == MorseLeft ) { if (trans == MorseNoTrans) { /* @@ -105,12 +103,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempkm, A->nb, A(k, k), ldak, DIAG(k), A->mb ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempkm, @@ -168,12 +166,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, T(k, m), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempkm, A->nb, A(k, k), ldak, DIAG(k), A->mb ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempkm, @@ -217,12 +215,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, T(k, n), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, DIAG(k), A->mb ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempkn, @@ -250,12 +248,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, tempkn = k == B->nt -1 ? B->n -k*B->nb : B->nb; tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb; ldak = BLKLDD(A, k); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, DIAG(k), A->mb ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempkn, @@ -295,8 +293,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 277b5d11772ad6326d004ed5ebb0aceda71a2059..e2ad8eb63cf65b601f6235313d063142ba84ccc3 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -35,11 +35,8 @@ #define B(m,n) B, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), (n)+A->nt -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(m,n) DIAG, ((n)/BS), 0 -#else -#define DIAG(m,n) A, (m), (n) -#endif + /***************************************************************************//** * Parallel application of Q using tile V - LQ factorization (reduction * Householder) - dynamic scheduling @@ -60,6 +57,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, int ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; + int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -76,12 +74,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_MAGMA) - { - /* necessary to use UNMLQ on GPU */ - int nblk = ( A->nt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); - } /* Worker space * * zunmlq = A->nb * ib @@ -95,6 +87,11 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + nblk = ( A->nt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); + K = min(A->mt, A->nt); if (side == MorseLeft ) { if (trans == MorseNoTrans) { @@ -109,12 +106,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); ldbN = BLKLDD(B, N); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -219,12 +216,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, T(k, m), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -294,12 +291,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, T(k, n), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, DIAG(k, N), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -331,12 +328,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = min(tempkm,tempNn); ldaN = BLKLDD(A, N); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseUpper, tempkmin, tempNn, A->nb, A(k, N), ldaN, DIAG(k, N), ldaN ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseLower, tempkmin, tempNn, @@ -397,8 +394,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index 3605e86ee76316410a71ecc883a89feaa87f55d9..d7c426415a2edb3fd542de5ba4c31177504a92b3 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -34,11 +34,8 @@ #define A(m,n) A, m, n #define B(m,n) B, m, n #define T(m,n) T, m, n -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(k) DIAG, k, 0 -#else -#define DIAG(k) A, k, k -#endif + /***************************************************************************//** * Parallel application of Q using tile V - QR factorization - dynamic scheduling **/ @@ -71,9 +68,6 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_MAGMA) - /* necessary to use UNMQR on GPU */ - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); /* Worker space * * zunmqr = A->nb * ib @@ -87,6 +81,10 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb); + if (A->m > A->n) { minM = A->n; minMT = A->nt; @@ -105,12 +103,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempkm, tempkmin, A->nb, A(k, k), ldak, DIAG(k), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempkm, tempkmin, @@ -170,12 +168,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, T(m, k), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempkm, tempkmin, A->nb, A(k, k), ldak, DIAG(k), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempkm, tempkmin, @@ -221,12 +219,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, T(n, k), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempkn, tempkmin, A->nb, A(k, k), ldak, DIAG(k), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempkn, tempkmin, @@ -254,12 +252,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempkn, tempkmin, A->nb, A(k, k), ldak, DIAG(k), ldak ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempkn, tempkmin, @@ -301,8 +299,6 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif } diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 0c77fe6be0309d9f4274cc54ecb411c5bd7d92a2..fbfb4496c4d7744164de0c36fd4c3337f7ec5385 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -35,11 +35,8 @@ #define B(m,n) B, (m), (n) #define T(m,n) T, (m), (n) #define T2(m,n) T, (m), ((n)+A->nt) -#if defined(CHAMELEON_USE_MAGMA) #define DIAG(m,n) DIAG, ((m)/BS), 0 -#else -#define DIAG(m,n) A, (m), (n) -#endif + /***************************************************************************//** * Parallel application of Q using tile V - QR factorization (reduction * Householder) - dynamic scheduling @@ -60,6 +57,7 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, int ldbM, ldbm, ldbMRD; int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin; int ib; + int nblk; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -76,12 +74,6 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, ws_worker = A->nb * ib; #if defined(CHAMELEON_USE_MAGMA) - { - int nblk = ( A->mt + BS -1 ) / BS; - DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); - morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); - } - /* Worker space * * zunmqr = A->nb * ib @@ -95,6 +87,11 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + /* necessary to avoid dependencies between tasks regarding the diag tile */ + nblk = ( A->mt + BS -1 ) / BS; + DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t)); + morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb); + K = min(A->mt, A->nt); if (side == MorseLeft ) { if (trans == MorseConjTrans) { @@ -108,12 +105,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, tempkmin = min(tempMm, tempkn); ldaM = BLKLDD(A, M); ldbM = BLKLDD(B, M); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, DIAG(M, k), ldaM ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempMm, tempkmin, @@ -221,12 +218,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, T(m, k), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, DIAG(M, k), ldaM ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempMm, tempkmin, @@ -298,12 +295,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, T(n, k), T->mb); } } -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, DIAG(M, k), ldaM ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempMm, tempkmin, @@ -334,12 +331,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = min(tempMm, tempkn); ldaM = BLKLDD(A, M); -#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlacpy( &options, MorseLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, DIAG(M, k), ldaM ); +#if defined(CHAMELEON_USE_MAGMA) MORSE_TASK_zlaset( &options, MorseUpper, tempMm, tempkmin, @@ -402,8 +399,6 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, RUNTIME_options_finalize(&options, morse); MORSE_TASK_dataflush_all(); -#if defined(CHAMELEON_USE_MAGMA) morse_desc_mat_free(DIAG); free(DIAG); -#endif }