diff --git a/compute/pzgebrd.c b/compute/pzgebrd.c index 273824cd6251cd86b12555cd9d12157faeb3597b..cbfa67824f3032c94279739ea2e6b5533734dcb2 100644 --- a/compute/pzgebrd.c +++ b/compute/pzgebrd.c @@ -25,112 +25,153 @@ #include "coreblas/lapacke.h" #endif -void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +#define A(m,n) A, (m), (n) + +static inline void +chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { - int k; - int tempkm, tempkn; - CHAM_desc_t *A1, *A2, *T1, *D1 = NULL; + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + size_t ws_worker = 0; + size_t ws_host = 0; + int k, m, n, ib; + CHAM_desc_t *Abis = NULL; + CHAM_desc_t *Tbis = NULL; + CHAM_desc_t *Dbis = NULL; - if ( A->m >= A->n ){ - for ( k = 0; k < A->nt; k++ ) { - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init( &options, chamctxt, sequence, request ); - A1 = chameleon_desc_submatrix( A, k*A->mb, k*A->nb, A->m-k*A->mb, tempkn ); - A2 = chameleon_desc_submatrix( A, k*A->mb, (k+1)*A->nb, A->m-k*A->mb, A->n-(k+1)*A->nb ); - T1 = chameleon_desc_submatrix( T, k*T->mb, k*T->nb, T->m-k*T->mb, T->nb ); - if ( D != NULL ) { - D1 = chameleon_desc_submatrix( D, k*D->mb, k*D->nb, D->m-k*D->mb, tempkn ); - } + if ( D == NULL ) { + D = A; + genD = 0; + } + ib = CHAMELEON_IB; + + /* + * zgeqrt/zgelqt = A->nb * (ib+1) + * zunmqr/zunmlq = A->nb * ib + * ztpqrt/ztplqt = A->nb * (ib+1) + * ztpmqrt/ztpmlqt = A->nb * ib + */ + ws_worker = A->nb * (ib+1); + + /* Allocation of temporary (scratch) working space */ +#if defined(CHAMELEON_USE_CUDA) + /* Worker space + * + * zunmqr/zunmlq = A->nb * ib + * ztpmqrt/ztpmlqt = 2 * A->nb * ib + */ + ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 ); +#endif - chameleon_pzgeqrf( genD, A1, T1, D1, sequence, request ); - chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans, A1, A2, T1, D1, sequence, request ); + ws_worker *= sizeof(CHAMELEON_Complex64_t); + ws_host *= sizeof(CHAMELEON_Complex64_t); - free( A1 ); - free( A2 ); - free( T1 ); + RUNTIME_options_ws_alloc( &options, ws_worker, ws_host ); + + if ( A->m >= A->n ){ + /* + * Create submatrix descriptor of the 3 matrices without the first column + */ + if ( A->nt > 1 ) { + Abis = chameleon_desc_submatrix( A, 0, A->nb, A->m, chameleon_max( A->n - A->nb, 0 ) ); + Tbis = chameleon_desc_submatrix( T, 0, T->nb, T->m, chameleon_max( T->n - T->nb, 0 ) ); if ( D != NULL ) { - free( D1 ); + Dbis = chameleon_desc_submatrix( D, 0, D->nb, D->m, chameleon_max( D->n - D->nb, 0 ) ); } + } - if ( k+1 < A->nt ) { - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + for ( k = 0; k < A->nt; k++ ) { + RUNTIME_iteration_push(chamctxt, k); - A1 = chameleon_desc_submatrix( A, k*A->mb, (k+1)*A->nb, tempkm, A->n-(k+1)*A->nb ); - A2 = chameleon_desc_submatrix( A, (k+1)*A->mb, (k+1)*A->nb, A->m-(k+1)*A->mb, A->n-(k+1)*A->nb ); - T1 = chameleon_desc_submatrix( T, k*T->mb, (k+1)*T->nb, T->mb, T->n-(k+1)*T->nb ); - if ( D != NULL ) { - D1 = chameleon_desc_submatrix( D, k*D->mb, (k+1)*D->nb, tempkm, D->n-(k+1)*D->nb ); - } + chameleon_pzgeqrf_step( genD, k, ib, A, T, D, &options, sequence ); - chameleon_pzgelqf( genD, A1, T1, D1, sequence, request ); - chameleon_pzunmlq( 0, ChamRight, ChamConjTrans, A1, A2, T1, D1, sequence, request ); + /* Restore the original location of the tiles */ + for (n = k; n < A->nt; n++) { + RUNTIME_data_migrate( sequence, A(k, n), + A->get_rankof( A, k, n ) ); + } + + if ( k+1 < A->nt ) { + chameleon_pzgelqf_step( genD, k, ib, Abis, Tbis, Dbis, &options, sequence ); - free( A1 ); - free( A2 ); - free( T1 ); - if ( D != NULL ) { - free( D1 ); + /* Restore the original location of the tiles */ + for (m = k; m < A->mt; m++) { + RUNTIME_data_migrate( sequence, A( m, k+1 ), + A->get_rankof( A, m, k+1 ) ); } } + + RUNTIME_iteration_pop(chamctxt); } } else { - for ( k = 0; k < A->mt; k++ ) { - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - - A1 = chameleon_desc_submatrix( A, k*A->mb, k*A->nb, tempkm, A->n-k*A->nb ); - A2 = chameleon_desc_submatrix( A, (k+1)*A->mb, k*A->nb, A->m-(k+1)*A->mb, A->n-k*A->nb ); - T1 = chameleon_desc_submatrix( T, k*T->mb, k*T->nb, T->mb, T->n-k*T->nb ); + /* + * Create submatrix descriptor of the 3 matrices without the first row + */ + if ( A->mt > 1 ) { + Abis = chameleon_desc_submatrix( A, A->mb, 0, chameleon_max( A->m - A->mb, 0 ), A->n ); + Tbis = chameleon_desc_submatrix( T, T->mb, 0, chameleon_max( T->m - T->mb, 0 ), T->n ); if ( D != NULL ) { - D1 = chameleon_desc_submatrix( D, k*D->mb, k*D->nb, tempkm, D->n-k*D->nb ); + Dbis = chameleon_desc_submatrix( D, D->mb, 0, chameleon_max( D->m - D->mb, 0 ), D->n ); } + } - chameleon_pzgelqf( genD, A1, T1, D1, sequence, request ); - chameleon_pzunmlq( 0, ChamRight, ChamConjTrans, A1, A2, T1, D1, sequence, request ); + for ( k = 0; k < A->mt; k++ ) { + RUNTIME_iteration_push(chamctxt, k); - free( A1 ); - free( A2 ); - free( T1 ); - if ( D != NULL ) { - free( D1 ); + chameleon_pzgelqf_step( genD, k, ib, A, T, D, &options, sequence ); + + /* Restore the original location of the tiles */ + for (m = k; m < A->mt; m++) { + RUNTIME_data_migrate( sequence, A( m, k ), + A->get_rankof( A, m, k ) ); } if ( k+1 < A->mt ) { - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - - A1 = chameleon_desc_submatrix( A, (k+1)*A->mb, k*A->nb, A->m-(k+1)*A->mb, tempkn ); - A2 = chameleon_desc_submatrix( A, (k+1)*A->mb, (k+1)*A->nb, A->m-(k+1)*A->mb, A->n-(k+1)*A->nb ); - T1 = chameleon_desc_submatrix( T, (k+1)*T->mb, k*T->nb, T->m-(k+1)*T->mb, T->nb ); - if ( D != NULL ) { - D1 = chameleon_desc_submatrix( D, (k+1)*D->mb, k*D->nb, D->m-(k+1)*D->mb, tempkn ); - } - - chameleon_pzgeqrf( genD, A1, T1, D1, sequence, request ); - chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans, A1, A2, T1, D1, sequence, request ); + chameleon_pzgeqrf_step( genD, k, ib, Abis, Tbis, Dbis, &options, sequence ); - free( A1 ); - free( A2 ); - free( T1 ); - if ( D != NULL ) { - free( D1 ); + /* Restore the original location of the tiles */ + for (n = k; n < A->nt; n++) { + RUNTIME_data_migrate( sequence, A( k+1, n ), + A->get_rankof( A, k+1, n ) ); } } + RUNTIME_iteration_pop(chamctxt); } } + if ( Abis != NULL ) { + free( Abis ); + } + if ( Tbis != NULL ) { + free( Tbis ); + } + if ( Dbis != NULL ) { + free( Dbis ); + } CHAMELEON_Desc_Flush( A, sequence ); CHAMELEON_Desc_Flush( T, sequence ); if ( D != NULL ) { CHAMELEON_Desc_Flush( D, sequence ); } + + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); } -int chameleon_pzgebrd_gb2bd( cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, - CHAMELEON_Complex64_t *U, int LDU, - CHAMELEON_Complex64_t *VT, int LDVT, - double *E, double *S, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +static inline int +chameleon_pzgebrd_gb2bd( cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, + CHAMELEON_Complex64_t *U, int LDU, + CHAMELEON_Complex64_t *VT, int LDVT, + double *E, double *S, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; @@ -219,7 +260,7 @@ int chameleon_pzgebrd( int genD, cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *subA, *subT, *subUVT, *subD; CHAM_desc_t descUl, descUt; CHAM_desc_t descVTl, descVTt; - int M, N, NB; + int M, N, NB, ib; chamctxt = chameleon_context_self(); if ( sequence->status != CHAMELEON_SUCCESS ) { diff --git a/control/compute_z.h b/control/compute_z.h index 295c64264eed47ed5a67ead3d2ff16d423175105..d7953e9045a3e967781e8985f2ed851f2a529eae 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -64,11 +64,10 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64 /** * Declarations of parallel functions (dynamic scheduling) - alphabetical order */ -int chameleon_pzgebrd_gb2bd( cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT, - double *E, double *S, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -void chameleon_pzgebrd_ge2gb( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int chameleon_pzgebrd( int genD, cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT, - double *E, double *S, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +int chameleon_pzgebrd( int genD, cham_job_t jobu, cham_job_t jobvt, + CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, + CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT, + double *E, double *S, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );