diff --git a/CTestConfig.cmake b/CTestConfig.cmake index e79cbcc92dc25229e5b07e7c33c9d07f2b89de25..c3b485e175923b43ba70a11f6270ec5098d28b97 100644 --- a/CTestConfig.cmake +++ b/CTestConfig.cmake @@ -49,7 +49,7 @@ if(NOT BUILDNAME) if(CHAMELEON_SCHED_PARSEC) set(BUILDNAME "${BUILDNAME}-PaRSEC") - endif(CHAMELEON_SCHED_STARPU) + endif(CHAMELEON_SCHED_PARSEC) if(CHAMELEON_SIMULATION) set(BUILDNAME "${BUILDNAME}-SimGrid") diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index 31b8bff089877521a0749982fdf0aa7c7c7bdf94..55e16627424850f10510d91a56296b0422bd6326 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -41,7 +41,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D size_t ws_host = 0; int k, m, n; - int ldak, ldam; + int ldak, ldam, lddk; int tempkm, tempkn, tempmm, tempnn; int ib, minMNT; @@ -92,6 +92,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); INSERT_TASK_zgelqt( &options, tempkm, tempkn, ib, T->nb, @@ -100,15 +101,15 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D if ( genD ) { INSERT_TASK_zlacpy( &options, - ChamUpper, A->mb, A->nb, A->nb, + ChamUpper, tempkm, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, A->mb, A->nb, + ChamLower, tempkm, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (m = k+1; m < A->mt; m++) { @@ -118,7 +119,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D &options, ChamRight, ChamConjTrans, tempmm, tempkn, tempkn, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, A(m, k), ldam); } diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index b1415f2bbd8ea53ff88d6daf2e301ea3a31d2254..9f853ef7ae1ef6bb9d86c86b586d335ea3ebd18d 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -42,7 +42,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int k, m, n, i, p; int K, L; - int ldak, ldam; + int ldak, ldam, lddk; int tempkmin, tempkm, tempnn, tempmm, temppn; int ib; int *tiles; @@ -96,6 +96,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); T = TS; /* The number of geqrt to apply */ @@ -114,13 +115,13 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamUpper, tempkm, temppn, A->nb, A(k, p), ldak, - D(k, p), ldak ); + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkm, temppn, 0., 1., - D(k, p), ldak ); + D(k, p), lddk ); #endif } for (m = k+1; m < A->mt; m++) { @@ -130,7 +131,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamRight, ChamConjTrans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), ldak, + D(k, p), lddk, T(k, p), T->mb, A(m, p), ldam); } diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c index f6ac81f523ddbf3f3923936b99a825eab23591aa..6dd19a90eec6ae92ad7a49ce97b7dab86466a938 100644 --- a/compute/pzgelqfrh.c +++ b/compute/pzgelqfrh.c @@ -44,7 +44,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM int k, m, n; int K, N, RD; - int ldak, ldam; + int ldak, ldam, lddk; int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn; int ib; @@ -91,6 +91,8 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); + for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = chameleon_min(tempkm, tempNn); @@ -104,13 +106,13 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamUpper, tempkm, tempNn, A->nb, A(k, N), ldak, - D(k, N), ldak ); + D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkm, tempNn, 0., 1., - D(k, N), ldak ); + D(k, N), lddk ); #endif } for (m = k+1; m < A->mt; m++) { @@ -120,7 +122,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamRight, ChamConjTrans, tempmm, tempNn, tempkmin, ib, T->nb, - D(k, N), ldak, + D(k, N), lddk, T(k, N), T->mb, A(m, N), ldam); } diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c index 2bf10f0f9375ff740dd214f9e2219ea2d7831805..7c38e4773c99b8214d8b82776f63604061851890 100644 --- a/compute/pzgeqrf.c +++ b/compute/pzgeqrf.c @@ -41,7 +41,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D size_t ws_host = 0; int k, m, n; - int ldak, ldam; + int ldak, ldam, lddk; int tempkm, tempkn, tempnn, tempmm; int ib; int minMNT = chameleon_min(A->mt, A->nt); @@ -87,6 +87,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); INSERT_TASK_zgeqrt( &options, tempkm, tempkn, ib, T->nb, @@ -95,15 +96,15 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D if ( genD ) { INSERT_TASK_zlacpy( &options, - ChamLower, A->mb, A->nb, A->nb, + ChamLower, tempkm, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, A->mb, A->nb, + ChamUpper, tempkm, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (n = k+1; n < A->nt; n++) { @@ -112,7 +113,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D &options, ChamLeft, ChamConjTrans, tempkm, tempnn, tempkm, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, A(k, n), ldak); } diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index 8afa3938ada8bb59dbfa0215b95181d8c0925fcd..01c0a816f24440ea4cb830e39a909504f117aa36 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -41,7 +41,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int k, m, n, i, p; int K, L, nbgeqrt; - int ldap, ldam; + int ldap, ldam, lddm; int tempkmin, tempkn, tempnn, tempmm; int ib; int *tiles; @@ -97,6 +97,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkmin = chameleon_min(tempmm, tempkn); ldam = BLKLDD(A, m); + lddm = BLKLDD(D, m); T = TS; @@ -108,15 +109,15 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t if ( genD ) { INSERT_TASK_zlacpy( &options, - ChamLower, tempmm, A->nb, A->nb, + ChamLower, tempmm, tempkn, A->nb, A(m, k), ldam, - D(m, k), ldam ); + D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempmm, A->nb, + ChamUpper, tempmm, tempkn, 0., 1., - D(m, k), ldam ); + D(m, k), lddm ); #endif } for (n = k+1; n < A->nt; n++) { @@ -125,7 +126,7 @@ void chameleon_pzgeqrf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamLeft, ChamConjTrans, tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), ldam, + D(m, k), lddm, T(m, k), T->mb, A(m, n), ldam); } diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index fb10c11c1af6327ca87198437553738afd495b30..472e77e7e96654023aaba277cda5d30e28543feb 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -44,7 +44,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM int k, m, n; int K, M, RD; - int ldaM, ldam, ldaMRD; + int ldaM, ldam, ldaMRD, lddM; int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm; int ib; @@ -92,6 +92,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); ldaM = BLKLDD(A, M); + lddM = BLKLDD(D, M); INSERT_TASK_zgeqrt( &options, @@ -101,15 +102,15 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM if ( genD ) { INSERT_TASK_zlacpy( &options, - ChamLower, tempMm, A->nb, A->nb, + ChamLower, tempMm, tempkn, A->nb, A(M, k), ldaM, - D(M, k), ldaM ); + D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamUpper, tempMm, A->nb, + ChamUpper, tempMm, tempkn, 0., 1., - D(M, k), ldaM ); + D(M, k), lddM ); #endif } for (n = k+1; n < A->nt; n++) { @@ -118,7 +119,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM &options, ChamLeft, ChamConjTrans, tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), ldaM, + D(M, k), lddM, T(M, k), T->mb, A(M, n), ldaM); } diff --git a/compute/pzlantr.c b/compute/pzlantr.c index cbdefa954a5d97293bf9277895a499de3300a6a2..acf27965580c15df4fd4535249a8112e47e07fdb 100644 --- a/compute/pzlantr.c +++ b/compute/pzlantr.c @@ -24,18 +24,18 @@ #include "control/common.h" #define A(m, n) A, m, n -#define VECNORMS_STEP1(m, n) VECNORMS_STEP1, m, n -#define VECNORMS_STEP2(m, n) VECNORMS_STEP2, m, n +#define W1(m, n) W1, m, n +#define W2(m, n) W2, m, n #define RESULT(m, n) RESULT, m, n /** * */ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, - CHAM_desc_t *A, double *result, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) + CHAM_desc_t *A, double *result, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) { - CHAM_desc_t *VECNORMS_STEP1 = NULL; - CHAM_desc_t *VECNORMS_STEP2 = NULL; + CHAM_desc_t *W1 = NULL; + CHAM_desc_t *W2 = NULL; CHAM_desc_t *RESULT = NULL; CHAM_context_t *chamctxt; RUNTIME_option_t options; @@ -58,46 +58,47 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, *result = 0.0; switch ( norm ) { - /* - * ChamOneNorm - */ + /* + * ChamOneNorm + */ case ChamOneNorm: /* Init workspace handle for the call to zlange but unused */ RUNTIME_options_ws_alloc( &options, 1, 0 ); workm = chameleon_max( A->mt, A->p ); - workn = A->n; - CHAMELEON_Desc_Create(&(VECNORMS_STEP1), NULL, ChamRealDouble, 1, A->nb, A->nb, - workm, workn, 0, 0, workm, workn, A->p, A->q); + workn = ( uplo == ChamLower ) ? chameleon_min( A->m, A->n ) : A->n; + + CHAMELEON_Desc_Create(&(W1), NULL, ChamRealDouble, 1, A->nb, A->nb, + workm, workn, 0, 0, workm, workn, A->p, A->q); - CHAMELEON_Desc_Create(&(VECNORMS_STEP2), NULL, ChamRealDouble, 1, A->nb, A->nb, - 1, workn, 0, 0, 1, workn, A->p, A->q); + CHAMELEON_Desc_Create(&(W2), NULL, ChamRealDouble, 1, A->nb, A->nb, + 1, workn, 0, 0, 1, workn, A->p, A->q); CHAMELEON_Desc_Create(&(RESULT), NULL, ChamRealDouble, 1, 1, 1, - 1, 1, 0, 0, 1, 1, 1, 1); + 1, 1, 0, 0, 1, 1, 1, 1); /* * ChamUpper */ if (uplo == ChamUpper) { /* Zeroes intermediate vector */ - for(n = 0; n < A->nt; n++) { - tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + for(n = 0; n < W2->nt; n++) { + tempkn = n == W2->nt-1 ? W2->n-n*W2->nb : W2->nb; INSERT_TASK_dlaset( &options, ChamUpperLower, 1, tempkn, 0., 0., - VECNORMS_STEP2(0, n), 1); + W2(0, n), 1); } for(m = 0; m < minMNT; m++) { /* Zeroes intermediate vectors */ - for(n = m; n < A->nt; n++) { - tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + for(n = m; n < W1->nt; n++) { + tempkn = n == W1->nt-1 ? W1->n-n*W1->nb : W1->nb; INSERT_TASK_dlaset( &options, ChamUpperLower, 1, tempkn, 0., 0., - VECNORMS_STEP1(m, n), 1); + W1(m, n), 1); } tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkn = m == A->nt-1 ? A->n-m*A->nb : A->nb; @@ -107,7 +108,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamColumnwise, uplo, diag, tempkm, tempkn, A(m, m), ldam, - VECNORMS_STEP1(m, m)); + W1(m, m)); /* compute sums of absolute values on columns of each tile */ for(n = m+1; n < A->nt; n++) { @@ -115,17 +116,17 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dzasum( &options, ChamColumnwise, ChamUpperLower, tempkm, tempkn, - A(m, n), ldam, VECNORMS_STEP1(m, n)); + A(m, n), ldam, W1(m, n)); } /* Compute vector sums between tiles in columns */ - for(n = m; n < A->nt; n++) { - tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + for(n = m; n < W1->nt; n++) { + tempkn = n == W1->nt-1 ? W1->n-n*W1->nb : W1->nb; INSERT_TASK_dgeadd( &options, - ChamNoTrans, 1, tempkn, A->mb, - 1.0, VECNORMS_STEP1(m, n), 1, - 1.0, VECNORMS_STEP2(0, n), 1); + ChamNoTrans, 1, tempkn, W1->mb, + 1.0, W1(m, n), 1, + 1.0, W2(0, n), 1); } } } @@ -143,21 +144,21 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, 1, tempkn, 0., 0., - VECNORMS_STEP1(m, n), 1); + W1(m, n), 1); } /* Zeroes the second intermediate vector */ INSERT_TASK_dlaset( &options, ChamUpperLower, 1, tempkn, 0., 0., - VECNORMS_STEP2(0, n), 1); + W2(0, n), 1); /* compute sums of absolute values on columns of diag tile */ INSERT_TASK_ztrasm( &options, ChamColumnwise, uplo, diag, tempkm, tempkn, A(n, n), ldan, - VECNORMS_STEP1(n, n)); + W1(n, n)); /* compute sums of absolute values on columns of each tile */ for(m = n+1; m < A->mt; m++) { @@ -166,7 +167,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dzasum( &options, ChamColumnwise, ChamUpperLower, tempkm, tempkn, - A(m, n), ldam, VECNORMS_STEP1(m, n)); + A(m, n), ldam, W1(m, n)); } /* Compute vector sums between tiles in columns */ @@ -174,8 +175,8 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dgeadd( &options, ChamNoTrans, 1, tempkn, A->mb, - 1.0, VECNORMS_STEP1(m, n), 1, - 1.0, VECNORMS_STEP2(0, n), 1); + 1.0, W1(m, n), 1, + 1.0, W2(0, n), 1); } } } @@ -189,8 +190,8 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dlange( &options, ChamMaxNorm, 1, tempkn, A->nb, - VECNORMS_STEP2(0, n), 1, - VECNORMS_STEP1(0, n)); + W2(0, n), 1, + W1(0, n)); } /* Initialize RESULT array */ @@ -205,7 +206,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, for(n = 0; n < A->nt; n++) { INSERT_TASK_dlange_max( &options, - VECNORMS_STEP1(0, n), + W1(0, n), RESULT(0,0)); } } @@ -217,35 +218,35 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, 1, 1, 1, RESULT(0,0), 1, - VECNORMS_STEP1(m, n), 1 ); + W1(m, n), 1 ); } } - CHAMELEON_Desc_Flush( VECNORMS_STEP2, sequence ); - CHAMELEON_Desc_Flush( VECNORMS_STEP1, sequence ); + CHAMELEON_Desc_Flush( W2, sequence ); + CHAMELEON_Desc_Flush( W1, sequence ); CHAMELEON_Desc_Flush( RESULT, sequence ); RUNTIME_sequence_wait(chamctxt, sequence); - *result = *(double *)VECNORMS_STEP1->get_blkaddr(VECNORMS_STEP1, A->myrank / A->q, A->myrank % A->q ); - CHAMELEON_Desc_Destroy( &(VECNORMS_STEP1) ); - CHAMELEON_Desc_Destroy( &(VECNORMS_STEP2) ); + *result = *(double *)W1->get_blkaddr(W1, A->myrank / A->q, A->myrank % A->q ); + CHAMELEON_Desc_Destroy( &(W1) ); + CHAMELEON_Desc_Destroy( &(W2) ); CHAMELEON_Desc_Destroy( &(RESULT) ); break; - /* - * ChamInfNorm - */ + /* + * ChamInfNorm + */ case ChamInfNorm: /* Init workspace handle for the call to zlange */ RUNTIME_options_ws_alloc( &options, A->mb, 0 ); - workm = A->m; + workm = ( uplo == ChamUpper ) ? chameleon_min( A->m, A->n ) : A->m; workn = chameleon_max( A->nt, A->q ); - CHAMELEON_Desc_Create(&(VECNORMS_STEP1), NULL, ChamRealDouble, A->mb, 1, A->mb, - workm, workn, 0, 0, workm, workn, A->p, A->q); + CHAMELEON_Desc_Create(&(W1), NULL, ChamRealDouble, A->mb, 1, A->mb, + workm, workn, 0, 0, workm, workn, A->p, A->q); - CHAMELEON_Desc_Create(&(VECNORMS_STEP2), NULL, ChamRealDouble, A->mb, 1, A->mb, - workm, 1, 0, 0, workm, 1, A->p, A->q); + CHAMELEON_Desc_Create(&(W2), NULL, ChamRealDouble, A->mb, 1, A->mb, + workm, 1, 0, 0, workm, 1, A->p, A->q); CHAMELEON_Desc_Create(&(RESULT), NULL, ChamRealDouble, 1, 1, 1, - 1, 1, 0, 0, 1, 1, 1, 1); + 1, 1, 0, 0, 1, 1, 1, 1); /* * ChamUpper @@ -261,21 +262,21 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, tempkm, 1, 0., 0., - VECNORMS_STEP1(m, n), 1); + W1(m, n), 1); } /* Zeroes intermediate vector */ INSERT_TASK_dlaset( &options, ChamUpperLower, tempkm, 1, 0., 0., - VECNORMS_STEP2(m, 0), 1); + W2(m, 0), 1); /* compute sums of absolute values on rows of diag tile */ INSERT_TASK_ztrasm( &options, ChamRowwise, uplo, diag, tempkm, tempkn, A(m, m), ldam, - VECNORMS_STEP1(m, m)); + W1(m, m)); /* compute sums of absolute values on rows of each tile */ for(n = m+1; n < A->nt; n++) { @@ -283,7 +284,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dzasum( &options, ChamRowwise, ChamUpperLower, tempkm, tempkn, - A(m, n), ldam, VECNORMS_STEP1(m, n)); + A(m, n), ldam, W1(m, n)); } /* Compute vector sums between tiles in rows */ @@ -291,8 +292,8 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dgeadd( &options, ChamNoTrans, tempkm, 1, A->mb, - 1.0, VECNORMS_STEP1(m, n), tempkm, - 1.0, VECNORMS_STEP2(m, 0), tempkm); + 1.0, W1(m, n), tempkm, + 1.0, W2(m, 0), tempkm); } } @@ -308,7 +309,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, tempkm, 1, 0., 0., - VECNORMS_STEP2(m, 0), 1); + W2(m, 0), 1); } for(n = 0; n < minMNT; n++) { /* Zeroes intermediate vectors */ @@ -318,7 +319,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, tempkm, 1, 0., 0., - VECNORMS_STEP1(m, n), tempkm); + W1(m, n), tempkm); } tempkm = n == A->mt-1 ? A->m-n*A->mb : A->mb; tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -328,7 +329,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamRowwise, uplo, diag, tempkm, tempkn, A(n, n), ldan, - VECNORMS_STEP1(n, n)); + W1(n, n)); /* compute sums of absolute values on rows of each tile */ for(m = n+1; m < A->mt; m++) { @@ -337,7 +338,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dzasum( &options, ChamRowwise, ChamUpperLower, tempkm, tempkn, - A(m, n), ldam, VECNORMS_STEP1(m, n)); + A(m, n), ldam, W1(m, n)); } /* Compute vector sums between tiles in rows */ @@ -346,8 +347,8 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, INSERT_TASK_dgeadd( &options, ChamNoTrans, tempkm, 1, A->mb, - 1.0, VECNORMS_STEP1(m, n), tempkm, - 1.0, VECNORMS_STEP2(m, 0), tempkm); + 1.0, W1(m, n), tempkm, + 1.0, W2(m, 0), tempkm); } } } @@ -356,13 +357,13 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, * Compute max norm of each segment of the final vector in the * previous workspace */ - for(m = 0; m < A->mt; m++) { + for(m = 0; m < W1->mt; m++) { tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; INSERT_TASK_dlange( &options, ChamMaxNorm, tempkm, 1, A->nb, - VECNORMS_STEP2(m, 0), 1, - VECNORMS_STEP1(m, 0)); + W2(m, 0), 1, + W1(m, 0)); } /* Initialize RESULT array */ @@ -374,10 +375,10 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, /* compute max norm between tiles in the column */ if (A->myrank % A->q == 0) { - for(m = 0; m < A->mt; m++) { + for(m = 0; m < W1->mt; m++) { INSERT_TASK_dlange_max( &options, - VECNORMS_STEP1(m, 0), + W1(m, 0), RESULT(0,0)); } } @@ -389,29 +390,29 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, 1, 1, 1, RESULT(0,0), 1, - VECNORMS_STEP1(m, n), 1 ); + W1(m, n), 1 ); } } - CHAMELEON_Desc_Flush( VECNORMS_STEP2, sequence ); - CHAMELEON_Desc_Flush( VECNORMS_STEP1, sequence ); + CHAMELEON_Desc_Flush( W2, sequence ); + CHAMELEON_Desc_Flush( W1, sequence ); CHAMELEON_Desc_Flush( RESULT, sequence ); RUNTIME_sequence_wait(chamctxt, sequence); - *result = *(double *)VECNORMS_STEP1->get_blkaddr(VECNORMS_STEP1, A->myrank / A->q, A->myrank % A->q ); - CHAMELEON_Desc_Destroy( &(VECNORMS_STEP1) ); - CHAMELEON_Desc_Destroy( &(VECNORMS_STEP2) ); + *result = *(double *)W1->get_blkaddr(W1, A->myrank / A->q, A->myrank % A->q ); + CHAMELEON_Desc_Destroy( &(W1) ); + CHAMELEON_Desc_Destroy( &(W2) ); CHAMELEON_Desc_Destroy( &(RESULT) ); break; - /* - * ChamFrobeniusNorm - */ + /* + * ChamFrobeniusNorm + */ case ChamFrobeniusNorm: workm = chameleon_max( A->mt, A->p ); workn = chameleon_max( A->nt, A->q ); - CHAMELEON_Desc_Create(&(VECNORMS_STEP1), NULL, ChamRealDouble, 1, 2, 2, - workm, 2*workn, 0, 0, workm, 2*workn, A->p, A->q); + CHAMELEON_Desc_Create(&(W1), NULL, ChamRealDouble, 1, 2, 2, + workm, 2*workn, 0, 0, workm, 2*workn, A->p, A->q); CHAMELEON_Desc_Create(&(RESULT), NULL, ChamRealDouble, 1, 2, 2, - 1, 2, 0, 0, 1, 2, 1, 1); + 1, 2, 0, 0, 1, 2, 1, 1); /* * ChamLower @@ -428,14 +429,14 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, 1, 2, 1., 0., - VECNORMS_STEP1(m,n), 1); + W1(m,n), 1); } /* Compute local norm of the diagonal tile */ INSERT_TASK_ztrssq( &options, uplo, diag, tempkm, tempkn, A(n, n), ldan, - VECNORMS_STEP1(n, n)); + W1(n, n)); /* Compute local norm to each tile */ for(m = n+1; m < A->mt; m++) { tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; @@ -444,7 +445,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, tempkm, tempkn, A(m, n), ldam, - VECNORMS_STEP1(m, n)); + W1(m, n)); } } } @@ -463,14 +464,14 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, 1, 2, 1., 0., - VECNORMS_STEP1(m,n), 1); + W1(m,n), 1); } /* Compute local norm of the diagonal tile */ INSERT_TASK_ztrssq( &options, uplo, diag, tempkm, tempkn, A(m, m), ldam, - VECNORMS_STEP1(m, m)); + W1(m, m)); /* Compute local norm to each tile */ for(n = m+1; n < A->nt; n++) { tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -478,7 +479,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, tempkm, tempkn, A(m, n), ldam, - VECNORMS_STEP1(m, n)); + W1(m, n)); } } } @@ -499,7 +500,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, for(m = n; m < A->mt; m++) { INSERT_TASK_dplssq( &options, - VECNORMS_STEP1(m, n), + W1(m, n), RESULT(0,0)); } } @@ -513,7 +514,7 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, for(n = m; n < A->nt; n++) { INSERT_TASK_dplssq( &options, - VECNORMS_STEP1(m, n), + W1(m, n), RESULT(0,0)); } } @@ -531,142 +532,142 @@ void chameleon_pzlantr(cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag, &options, ChamUpperLower, 1, 1, 1, RESULT(0,0), 1, - VECNORMS_STEP1(m, n), 1 ); + W1(m, n), 1 ); } } - CHAMELEON_Desc_Flush( VECNORMS_STEP1, sequence ); + CHAMELEON_Desc_Flush( W1, sequence ); CHAMELEON_Desc_Flush( RESULT, sequence ); RUNTIME_sequence_wait(chamctxt, sequence); - *result = *(double *)VECNORMS_STEP1->get_blkaddr(VECNORMS_STEP1, A->myrank / A->q, A->myrank % A->q ); - CHAMELEON_Desc_Destroy( &(VECNORMS_STEP1) ); + *result = *(double *)W1->get_blkaddr(W1, A->myrank / A->q, A->myrank % A->q ); + CHAMELEON_Desc_Destroy( &(W1) ); CHAMELEON_Desc_Destroy( &(RESULT) ); break; /* * ChamMaxNorm */ - case ChamMaxNorm: - default: - /* Init workspace handle for the call to zlange but unused */ - RUNTIME_options_ws_alloc( &options, 1, 0 ); + case ChamMaxNorm: + default: + /* Init workspace handle for the call to zlange but unused */ + RUNTIME_options_ws_alloc( &options, 1, 0 ); - workm = chameleon_max( A->mt, A->p ); - workn = chameleon_max( A->nt, A->q ); + workm = chameleon_max( A->mt, A->p ); + workn = chameleon_max( A->nt, A->q ); - CHAMELEON_Desc_Create(&(VECNORMS_STEP1), NULL, ChamRealDouble, 1, 1, 1, + CHAMELEON_Desc_Create(&(W1), NULL, ChamRealDouble, 1, 1, 1, workm, workn, 0, 0, workm, workn, A->p, A->q); - CHAMELEON_Desc_Create(&(RESULT), NULL, ChamRealDouble, 1, 1, 1, + CHAMELEON_Desc_Create(&(RESULT), NULL, ChamRealDouble, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1); - /* - * ChamLower - */ - if (uplo == ChamLower) { - /* Compute local maximum to each tile */ - for(n = 0; n < minMNT; n++) { - tempkm = n == A->mt-1 ? A->m-n*A->mb : A->mb; - tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - ldan = BLKLDD(A, n); + /* + * ChamLower + */ + if (uplo == ChamLower) { + /* Compute local maximum to each tile */ + for(n = 0; n < minMNT; n++) { + tempkm = n == A->mt-1 ? A->m-n*A->mb : A->mb; + tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + ldan = BLKLDD(A, n); + + INSERT_TASK_zlantr( + &options, + ChamMaxNorm, uplo, diag, + tempkm, tempkn, A->nb, + A(n, n), ldan, + W1(n, n)); - INSERT_TASK_zlantr( + for(m = n+1; m < A->mt; m++) { + tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + INSERT_TASK_zlange( &options, - ChamMaxNorm, uplo, diag, - tempkm, tempkn, A->nb, - A(n, n), ldan, - VECNORMS_STEP1(n, n)); - - for(m = n+1; m < A->mt; m++) { - tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); - INSERT_TASK_zlange( - &options, - ChamMaxNorm, tempkm, tempkn, A->nb, - A(m, n), ldam, - VECNORMS_STEP1(m, n)); - } + ChamMaxNorm, tempkm, tempkn, A->nb, + A(m, n), ldam, + W1(m, n)); } } - /* - * ChamUpper - */ - else { - /* Compute local maximum to each tile */ - for(m = 0; m < minMNT; m++) { - tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - tempkn = m == A->nt-1 ? A->n-m*A->nb : A->nb; - ldam = BLKLDD(A, m); + } + /* + * ChamUpper + */ + else { + /* Compute local maximum to each tile */ + for(m = 0; m < minMNT; m++) { + tempkm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + tempkn = m == A->nt-1 ? A->n-m*A->nb : A->nb; + ldam = BLKLDD(A, m); - INSERT_TASK_zlantr( + INSERT_TASK_zlantr( + &options, + ChamMaxNorm, uplo, diag, + tempkm, tempkn, A->nb, + A(m, m), ldam, + W1(m, m)); + + for(n = m+1; n < A->nt; n++) { + tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + INSERT_TASK_zlange( &options, - ChamMaxNorm, uplo, diag, - tempkm, tempkn, A->nb, - A(m, m), ldam, - VECNORMS_STEP1(m, m)); - - for(n = m+1; n < A->nt; n++) { - tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - INSERT_TASK_zlange( - &options, - ChamMaxNorm, tempkm, tempkn, A->nb, - A(m, n), ldam, - VECNORMS_STEP1(m, n)); - } + ChamMaxNorm, tempkm, tempkn, A->nb, + A(m, n), ldam, + W1(m, n)); } } + } - /* Initialize RESULT array */ - INSERT_TASK_dlaset( - &options, - ChamUpperLower, 1, 1, - 0., 0., - RESULT(0,0), 1); - - /* - * ChamLower - */ - if (uplo == ChamLower) { - /* Compute max norm between tiles */ - for(n = 0; n < minMNT; n++) { - for(m = n; m < A->mt; m++) { - INSERT_TASK_dlange_max( - &options, - VECNORMS_STEP1(m, n), - RESULT(0,0)); - } + /* Initialize RESULT array */ + INSERT_TASK_dlaset( + &options, + ChamUpperLower, 1, 1, + 0., 0., + RESULT(0,0), 1); + + /* + * ChamLower + */ + if (uplo == ChamLower) { + /* Compute max norm between tiles */ + for(n = 0; n < minMNT; n++) { + for(m = n; m < A->mt; m++) { + INSERT_TASK_dlange_max( + &options, + W1(m, n), + RESULT(0,0)); } } - /* - * ChamUpper - */ - else { - /* Compute max norm between tiles */ - for(m = 0; m < minMNT; m++) { - for(n = m; n < A->nt; n++) { - INSERT_TASK_dlange_max( - &options, - VECNORMS_STEP1(m, n), - RESULT(0,0)); - } + } + /* + * ChamUpper + */ + else { + /* Compute max norm between tiles */ + for(m = 0; m < minMNT; m++) { + for(n = m; n < A->nt; n++) { + INSERT_TASK_dlange_max( + &options, + W1(m, n), + RESULT(0,0)); } } + } - /* Copy max norm in tiles to dispatch on every nodes */ - for(m = 0; m < A->p; m++) { - for(n = 0; n < A->q; n++) { - INSERT_TASK_dlacpy( - &options, - ChamUpperLower, 1, 1, 1, - RESULT(0,0), 1, - VECNORMS_STEP1(m, n), 1 ); - } + /* Copy max norm in tiles to dispatch on every nodes */ + for(m = 0; m < A->p; m++) { + for(n = 0; n < A->q; n++) { + INSERT_TASK_dlacpy( + &options, + ChamUpperLower, 1, 1, 1, + RESULT(0,0), 1, + W1(m, n), 1 ); } + } - CHAMELEON_Desc_Flush( VECNORMS_STEP1, sequence ); - CHAMELEON_Desc_Flush( RESULT, sequence ); - RUNTIME_sequence_wait(chamctxt, sequence); - *result = *(double *)VECNORMS_STEP1->get_blkaddr(VECNORMS_STEP1, A->myrank / A->q, A->myrank % A->q ); - CHAMELEON_Desc_Destroy( &(VECNORMS_STEP1) ); - CHAMELEON_Desc_Destroy( &(RESULT) ); + CHAMELEON_Desc_Flush( W1, sequence ); + CHAMELEON_Desc_Flush( RESULT, sequence ); + RUNTIME_sequence_wait(chamctxt, sequence); + *result = *(double *)W1->get_blkaddr(W1, A->myrank / A->q, A->myrank % A->q ); + CHAMELEON_Desc_Destroy( &(W1) ); + CHAMELEON_Desc_Destroy( &(RESULT) ); } RUNTIME_options_ws_free(&options); RUNTIME_options_finalize(&options, chamctxt); diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c index e159c781a5c6e2cbe72a0f08847ada39e06b3638..f2a68870afb8922e0ef76953c25750ce65ce5d11 100644 --- a/compute/pztpgqrt.c +++ b/compute/pztpgqrt.c @@ -44,7 +44,7 @@ void chameleon_pztpgqrt( int genD, int L, size_t ws_host = 0; int k, m, n; - int ldvk, ldvm; + int ldvk, ldvm, lddk; int ldqk, ldqm; int tempkm, tempkn, tempkk, tempnn, tempmm, templm; int ib; @@ -92,6 +92,7 @@ void chameleon_pztpgqrt( int genD, int L, tempkk = k == V1->nt-1 ? V1->n-k*V1->nb : V1->nb; tempkn = k == Q1->nt-1 ? Q1->n-k*Q1->nb : Q1->nb; ldvk = BLKLDD(V1, k); + lddk = BLKLDD(D, k); ldqk = BLKLDD(Q1, k); /* Equivalent to the tsmqr step on Q1,Q2 */ @@ -139,13 +140,13 @@ void chameleon_pztpgqrt( int genD, int L, &options, ChamLower, tempkm, tempkk, V1->nb, V1(k, k), ldvk, - D(k), ldvk ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkm, tempkk, 0., 1., - D(k), ldvk ); + D(k), lddk ); #endif } for (n = k; n < Q1->nt; n++) { @@ -154,7 +155,7 @@ void chameleon_pztpgqrt( int genD, int L, &options, ChamLeft, ChamNoTrans, tempkm, tempnn, tempkk, ib, T1->nb, - D(k), ldvk, + D(k), lddk, T1(k, k), T1->mb, Q1(k, n), ldqk); } diff --git a/compute/pzunglq.c b/compute/pzunglq.c index 9b754c679b47057fb1f60f94b387f33db3dfa23b..17dc51b0a719b8bf628743b8f09d978e05a800c6 100644 --- a/compute/pzunglq.c +++ b/compute/pzunglq.c @@ -42,7 +42,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T size_t ws_host = 0; int k, m, n; - int ldak, ldqm; + int ldak, ldqm, lddk; int tempnn, tempmm, tempkmin, tempkn; int tempAkm, tempAkn; int ib, minMT; @@ -94,6 +94,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T tempkmin = chameleon_min( tempAkn, tempAkm ); tempkn = k == Q->nt-1 ? Q->n-k*Q->nb : Q->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); + for (n = Q->nt-1; n > k; n--) { tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb; for (m = 0; m < Q->mt; m++) { @@ -117,19 +119,18 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T RUNTIME_data_flush( sequence, T(k, n) ); } - if ( genD ) { INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (m = k; m < Q->mt; m++) { @@ -143,7 +144,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T &options, ChamRight, ChamNoTrans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, Q(m, k), ldqm); } diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c index 564d23c605bd03d0a250aa4a64e7afa947282045..7e8d35051d8514ff79d68cac5e6c4d03ef429d66 100644 --- a/compute/pzunglq_param.c +++ b/compute/pzunglq_param.c @@ -41,7 +41,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t int k, m, n, i, p; int K, L; - int ldak, ldqm; + int ldak, ldqm, lddk; int tempkm, tempkmin, temppn, tempnn, tempmm; int ib; int *tiles; @@ -93,6 +93,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ libhqr_walk_stepk(qrtree, k, tiles + (k+1)); @@ -147,13 +148,13 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamUpper, tempkmin, temppn, A->nb, A(k, p), ldak, - D(k, p), ldak ); + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, temppn, 0., 1., - D(k, p), ldak ); + D(k, p), lddk ); #endif } for (m = k; m < Q->mt; m++) { @@ -167,7 +168,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t &options, ChamRight, ChamNoTrans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), ldak, + D(k, p), lddk, T(k, p), T->mb, Q(m, p), ldqm); } diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c index 878c8763010c2557a78d47e4bd110a15e635da20..e1d94006617ca35a9372701fd52151e4d9751617 100644 --- a/compute/pzunglqrh.c +++ b/compute/pzunglqrh.c @@ -46,8 +46,7 @@ void chameleon_pzunglqrh( int genD, int BS, int k, m, n; int K, N, RD, lastRD; - int ldak; - int ldqm; + int ldak, lddk, ldqm; int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn; int ib; @@ -89,6 +88,7 @@ void chameleon_pzunglqrh( int genD, int BS, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); lastRD = 0; for (RD = BS; RD < A->nt-k; RD *= 2) lastRD = RD; @@ -154,13 +154,13 @@ void chameleon_pzunglqrh( int genD, int BS, &options, ChamUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, - D(k, N), ldak ); + D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempNn, 0., 1., - D(k, N), ldak ); + D(k, N), lddk ); #endif } for (m = k; m < Q->mt; m++) { @@ -175,7 +175,7 @@ void chameleon_pzunglqrh( int genD, int BS, ChamRight, ChamNoTrans, tempmm, tempNn, tempkmin, ib, T->nb, - D(k, N), ldak, + D(k, N), lddk, T(k, N), T->mb, Q(m, N), ldqm); } diff --git a/compute/pzungqr.c b/compute/pzungqr.c index ed0e4ed9b1352f6529300e846e9e2dbe38401339..b4aca1d4a2c2f58cfa3fd4416fc3cfee2f564a72 100644 --- a/compute/pzungqr.c +++ b/compute/pzungqr.c @@ -43,7 +43,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, size_t ws_host = 0; int k, m, n; - int ldak, ldqk, ldam, ldqm; + int ldak, ldqk, ldam, ldqm, lddk; int tempmm, tempnn, tempkmin, tempkm; int tempAkm, tempAkn; int ib, minMT; @@ -95,6 +95,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, tempkmin = chameleon_min( tempAkn, tempAkm ); tempkm = k == Q->mt-1 ? Q->m-k*Q->mb : Q->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); ldqk = BLKLDD(Q, k); for (m = Q->mt - 1; m > k; m--) { tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb; @@ -125,13 +126,13 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, &options, ChamLower, tempkm, tempkmin, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkm, tempkmin, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (n = k; n < Q->nt; n++) { @@ -145,7 +146,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, &options, ChamLeft, ChamNoTrans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, Q(k, n), ldqk); } diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c index 6c1cd3697667a5377a0dc1101b272775596db8a0..beaa6a8c626b3637090bef97fbebdf0906d29910 100644 --- a/compute/pzungqr_param.c +++ b/compute/pzungqr_param.c @@ -41,7 +41,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p, L; - int ldam, ldqm, ldqp; + int ldam, ldqm, ldqp, lddm; int tempmm, tempnn, tempkmin, tempkn; int ib, minMT; int *tiles; @@ -145,6 +145,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkmin = chameleon_min(tempmm, tempkn); ldam = BLKLDD(A, m); + lddm = BLKLDD(D, m); ldqm = BLKLDD(Q, m); if ( genD ) { @@ -152,13 +153,13 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, &options, ChamLower, tempmm, tempkmin, A->nb, A(m, k), ldam, - D(m, k), ldam ); + D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempmm, tempkmin, 0., 1., - D(m, k), ldam ); + D(m, k), lddm ); #endif } @@ -173,7 +174,7 @@ void chameleon_pzungqr_param( int genD, const libhqr_tree_t *qrtree, &options, ChamLeft, ChamNoTrans, tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), ldam, + D(m, k), lddm, T(m, k), T->mb, Q(m, n), ldqm); } diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c index d22e6e052b14454b3ac9971d60c2ce8d1824b337..40c182370a4890eac0a5ab12efd79e9fe014460f 100644 --- a/compute/pzungqrrh.c +++ b/compute/pzungqrrh.c @@ -48,7 +48,7 @@ void chameleon_pzungqrrh( int genD, int BS, int k, m, n; int K, M, RD, lastRD; - int ldaM, ldam, ldaMRD; + int ldaM, ldam, ldaMRD, lddM; int ldqM, ldqm, ldqMRD; int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin; int ib; @@ -127,6 +127,7 @@ void chameleon_pzungqrrh( int genD, int BS, tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); ldaM = BLKLDD(A, M); + lddM = BLKLDD(D, M); ldqM = BLKLDD(Q, M); for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; @@ -160,13 +161,13 @@ void chameleon_pzungqrrh( int genD, int BS, &options, ChamLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, - D(M, k), ldaM ); + D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempMm, tempkmin, 0., 1., - D(M, k), ldaM ); + D(M, k), lddM ); #endif } for (n = k; n < Q->nt; n++) { @@ -181,7 +182,7 @@ void chameleon_pzungqrrh( int genD, int BS, ChamLeft, ChamNoTrans, tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), ldaM, + D(M, k), lddM, T(M, k), T->mb, Q(M, n), ldqM); } diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c index a7691d927f17d7156e758abfbeb80a28eb993937..74b583011aba123c2e17e809844387f34b982cfc 100644 --- a/compute/pzunmlq.c +++ b/compute/pzunmlq.c @@ -44,7 +44,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, size_t ws_host = 0; int k, m, n; - int ldak, ldbk, ldbm; + int ldak, ldbk, ldbm, lddk; int tempmm, tempnn, tempkn, tempkm, tempkmin; int ib, minMT, minM; @@ -97,21 +97,24 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, RUNTIME_iteration_push(chamctxt, k); tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); + lddk = BLKLDD(D, k); + if ( genD ) { INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkm, A->nb, + ChamUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkm, + ChamLower, tempkmin, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -120,7 +123,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(k, n), ldbk); } @@ -168,10 +171,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, for (k = minMT-1; k >= 0; k--) { RUNTIME_iteration_push(chamctxt, k); + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); ldbk = BLKLDD(B, k); + lddk = BLKLDD(D, k); + for (m = B->mt-1; m > k; m--) { tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb; ldbm = BLKLDD(B, m); @@ -198,15 +204,15 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, if ( genD ) { INSERT_TASK_zlacpy( &options, - ChamUpper, tempkmin, tempkm, A->nb, + ChamUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, - ChamLower, tempkmin, tempkm, + ChamLower, tempkmin, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -219,7 +225,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(k, n), ldbk); } @@ -240,6 +246,8 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == B->nt - 1 ? B->n - k * B->nb : B->nb; tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); + for (n = B->nt-1; n > k; n--) { tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; for (m = 0; m < B->mt; m++) { @@ -268,13 +276,13 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, ChamUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -288,7 +296,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(m, k), ldbm); } @@ -309,18 +317,20 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, tempkn = k == B->nt-1 ? B->n-k*B->nb : B->nb; tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); + if ( genD ) { INSERT_TASK_zlacpy( &options, ChamUpper, tempkmin, tempkn, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempkn, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -330,7 +340,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempmm, tempkn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(m, k), ldbm); } diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c index ac54595cf83d6f9da759bc2c3d30aa6603220f12..d818c192cd724e8e06d39e518e5bffcefe4433f0 100644 --- a/compute/pzunmlq_param.c +++ b/compute/pzunmlq_param.c @@ -42,7 +42,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p; - int ldbm, ldak, ldbp; + int ldbm, ldak, ldbp, lddk; int tempnn, temppn, tempkmin, tempmm, tempkm; int ib, K, L; int *tiles; @@ -95,6 +95,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { @@ -109,13 +110,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, ChamUpper, tempkmin, temppn, A->nb, A(k, p), ldak, - D(k, p), ldak ); + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, temppn, 0., 1., - D(k, p), ldak ); + D(k, p), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -124,7 +125,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, temppn, tempnn, tempkmin, ib, T->nb, - D(k, p), ldak, + D(k, p), lddk, T(k, p), T->mb, B(p, n), ldbp); } @@ -193,6 +194,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ libhqr_walk_stepk(qrtree, k, tiles + (k+1)); @@ -249,13 +251,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, ChamUpper, tempkmin, temppn, A->nb, A(k, p), ldak, - D(k, p), ldak ); + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, temppn, 0., 1., - D(k, p), ldak ); + D(k, p), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -268,7 +270,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, temppn, tempnn, tempkmin, ib, T->nb, - D(k, p), ldak, + D(k, p), lddk, T(k, p), T->mb, B(p, n), ldbp); } @@ -291,6 +293,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); /* Setting the order of the tiles*/ libhqr_walk_stepk(qrtree, k, tiles + (k+1)); @@ -345,13 +348,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, ChamUpper, tempkmin, temppn, A->nb, A(k, p), ldak, - D(k, p), ldak ); + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, temppn, 0., 1., - D(k, p), ldak ); + D(k, p), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -365,7 +368,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), ldak, + D(k, p), lddk, T(k, p), T->mb, B(m, p), ldbm); } @@ -386,6 +389,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); T = TS; for (i = 0; i < qrtree->getnbgeqrf(qrtree, k); i++) { @@ -399,13 +403,13 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, ChamUpper, tempkmin, temppn, A->nb, A(k, p), ldak, - D(k, p), ldak ); + D(k, p), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, temppn, 0., 1., - D(k, p), ldak ); + D(k, p), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -415,7 +419,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, tempmm, temppn, tempkmin, ib, T->nb, - D(k, p), ldak, + D(k, p), lddk, T(k, p), TS->mb, B(m, p), ldbm); } diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c index 375cf8680e3b4a319ee18abf155e4a63a06cfa7c..ad8ebfc5cec51012ea4cc7260464627dd7ee8721 100644 --- a/compute/pzunmlqrh.c +++ b/compute/pzunmlqrh.c @@ -47,7 +47,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans int k, m, n; int K, N, RD, lastRD; - int ldak, ldbN, ldbm, ldbNRD; + int ldak, lddk, ldbN, ldbm, ldbNRD; int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin; int ib; @@ -95,6 +95,8 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); + for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = chameleon_min(tempkm,tempNn); @@ -104,13 +106,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, ChamUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, - D(k, N), ldak ); + D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempNn, 0., 1., - D(k, N), ldak ); + D(k, N), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -120,7 +122,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans side, trans, tempNn, tempnn, tempkmin, ib, T->nb, - D(k, N), ldak, + D(k, N), lddk, T(k, N), T->mb, B(N, n), ldbN); } @@ -196,6 +198,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); lastRD = 0; for (RD = BS; RD < A->nt-k; RD *= 2) lastRD = RD; @@ -259,13 +262,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, ChamUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, - D(k, N), ldak ); + D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempNn, 0., 1., - D(k, N), ldak ); + D(k, N), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -279,7 +282,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans side, trans, tempNn, tempnn, tempkmin, ib, T->nb, - D(k, N), ldak, + D(k, N), lddk, T(k, N), T->mb, B(N, n), ldbN); } @@ -300,6 +303,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); lastRD = 0; for (RD = BS; RD < A->nt-k; RD *= 2) lastRD = RD; @@ -361,13 +365,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, ChamUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, - D(k, N), ldak ); + D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempNn, 0., 1., - D(k, N), ldak ); + D(k, N), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -382,7 +386,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans side, trans, tempmm, tempNn, tempkmin, ib, T->nb, - D(k, N), ldak, + D(k, N), lddk, T(k, N), T->mb, B(m, N), ldbm); } @@ -401,6 +405,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); for (N = k; N < A->nt; N += BS) { tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb; tempkmin = chameleon_min(tempkm,tempNn); @@ -409,13 +414,13 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, ChamUpper, tempkmin, tempNn, A->nb, A(k, N), ldak, - D(k, N), ldak ); + D(k, N), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamLower, tempkmin, tempNn, 0., 1., - D(k, N), ldak ); + D(k, N), lddk ); #endif } for (m = 0; m < B->mt; m++) { @@ -426,7 +431,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans side, trans, tempmm, tempNn, tempkmin, ib, T->nb, - D(k, N), ldak, + D(k, N), lddk, T(k, N), T->mb, B(m, N), ldbm); } diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c index d72a830afdc97dfaea70b9c24da8e910e26095ff..a06e0e13e41eefcd571719845b1369be3d375762 100644 --- a/compute/pzunmqr.c +++ b/compute/pzunmqr.c @@ -44,7 +44,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, size_t ws_host = 0; int k, m, n; - int ldak, ldbk, ldam, ldan, ldbm; + int ldak, ldbk, ldam, ldan, ldbm, lddk; int tempkm, tempnn, tempkmin, tempmm, tempkn; int ib, minMT, minM; @@ -99,19 +99,20 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, tempkm = k == B->mt-1 ? B->m-k*B->mb : B->mb; tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb; ldak = BLKLDD(A, k); + lddk = BLKLDD(D, k); ldbk = BLKLDD(B, k); if ( genD ) { INSERT_TASK_zlacpy( &options, ChamLower, tempkm, tempkmin, A->nb, A(k, k), ldak, - D(k), ldak ); + D(k), lddk ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempkm, tempkmin, 0., 1., - D(k), ldak ); + D(k), lddk ); #endif } for (n = 0; n < B->nt; n++) { @@ -120,7 +121,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans, &options, side, trans, tempkm, tempnn, tempkmin, ib, T->nb, - D(k), ldak, + D(k), lddk, T(k, k), T->mb, B(k, n), ldbk); } diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c index f5e8017e6ed999a2168d9f981c8aeca922239fe1..584829e255eada9a11e8d3d8059850c5c6849d06 100644 --- a/compute/pzunmqr_param.c +++ b/compute/pzunmqr_param.c @@ -42,7 +42,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, size_t ws_host = 0; int k, m, n, i, p; - int ldam, ldan, ldbm, ldbp; + int ldam, ldan, ldbm, ldbp, lddn, lddm; int tempnn, tempkmin, tempmm, tempkn; int ib, K, L; int *tiles; @@ -102,6 +102,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkmin = chameleon_min(tempmm, tempkn); ldam = BLKLDD(A, m); + lddm = BLKLDD(D, m); ldbm = BLKLDD(B, m); if ( genD ) { @@ -109,13 +110,13 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, &options, ChamLower, tempmm, tempkmin, A->nb, A(m, k), ldam, - D(m, k), ldam ); + D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempmm, tempkmin, 0., 1., - D(m, k), ldam ); + D(m, k), lddm ); #endif } for (n = 0; n < B->nt; n++) { @@ -124,7 +125,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), ldam, + D(m, k), lddm, T(m, k), T->mb, B(m, n), ldbm); } @@ -243,6 +244,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; tempkmin = chameleon_min(tempmm, tempkn); ldam = BLKLDD(A, m); + lddm = BLKLDD(D, m); ldbm = BLKLDD(B, m); if ( genD ) { @@ -250,13 +252,13 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, &options, ChamLower, tempmm, tempkmin, A->nb, A(m, k), ldam, - D(m, k), ldam ); + D(m, k), lddm ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempmm, tempkmin, 0., 1., - D(m, k), ldam ); + D(m, k), lddm ); #endif } for (n = 0; n < B->nt; n++) { @@ -269,7 +271,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), ldam, + D(m, k), lddm, T(m, k), T->mb, B(m, n), ldbm); } @@ -342,19 +344,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; tempkmin = chameleon_min(tempnn, tempkn); ldan = BLKLDD(A, n); + lddn = BLKLDD(D, n); if ( genD ) { INSERT_TASK_zlacpy( &options, ChamLower, tempnn, tempkmin, A->nb, A(n, k), ldan, - D(n, k), ldan ); + D(n, k), lddn ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempnn, tempkmin, 0., 1., - D(n, k), ldan ); + D(n, k), lddn ); #endif } for (m = 0; m < B->mt; m++) { @@ -368,7 +371,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, tempmm, tempnn, tempkmin, ib, T->nb, - D(n, k), ldan, + D(n, k), lddn, T(n, k), T->mb, B(m, n), ldbm); } @@ -394,19 +397,20 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; tempkmin = chameleon_min(tempnn, tempkn); ldan = BLKLDD(A, n); + lddn = BLKLDD(D, n); if ( genD ) { INSERT_TASK_zlacpy( &options, ChamLower, tempnn, tempkmin, A->nb, A(n, k), ldan, - D(n, k), ldan ); + D(n, k), lddn ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempnn, tempkmin, 0., 1., - D(n, k), ldan ); + D(n, k), lddn ); #endif } for (m = 0; m < B->mt; m++) { @@ -416,7 +420,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree, &options, side, trans, tempmm, tempnn, tempkmin, ib, T->nb, - D(n, k), ldan, + D(n, k), lddn, T(n, k), T->mb, B(m, n), ldbm); } diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c index 282f675876a7b4aa60a5a95fc02ba6804c5bb948..767630ae6349375da2695f42333c0a2e2559dee5 100644 --- a/compute/pzunmqrrh.c +++ b/compute/pzunmqrrh.c @@ -47,7 +47,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans int k, m, n; int K, M, RD, lastRD; - int ldaM, ldam, ldan, ldaMRD; + int ldaM, ldam, ldan, ldaMRD, lddM; int ldbM, ldbm, ldbMRD; int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin; int ib; @@ -99,19 +99,20 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); ldaM = BLKLDD(A, M); + lddM = BLKLDD(D, M); ldbM = BLKLDD(B, M); if ( genD ) { INSERT_TASK_zlacpy( &options, ChamLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, - D(M, k), ldaM ); + D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempMm, tempkmin, 0., 1., - D(M, k), ldaM ); + D(M, k), lddM ); #endif } for (n = 0; n < B->nt; n++) { @@ -120,7 +121,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), ldaM, + D(M, k), lddM, T(M, k), T->mb, B(M, n), ldbM); } @@ -230,6 +231,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); ldaM = BLKLDD(A, M); + lddM = BLKLDD(D, M); ldbM = BLKLDD(B, M); for (m = chameleon_min(M+BS, A->mt)-1; m > M; m--) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; @@ -260,13 +262,13 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, ChamLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, - D(M, k), ldaM ); + D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempMm, tempkmin, 0., 1., - D(M, k), ldaM ); + D(M, k), lddM ); #endif } for (n = 0; n < B->nt; n++) { @@ -278,7 +280,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans INSERT_TASK_zunmqr( &options, side, trans, tempMm, tempnn, tempkmin, ib, T->nb, - D(M, k), ldaM, + D(M, k), lddM, T(M, k), T->mb, B(M, n), ldbM); } @@ -331,6 +333,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); ldaM = BLKLDD(A, M); + lddM = BLKLDD(D, M); for (n = chameleon_min(M+BS, A->mt)-1; n > M; n--) { ldan = BLKLDD(A, n); tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb; @@ -360,13 +363,13 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, ChamLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, - D(M, k), ldaM ); + D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempMm, tempkmin, 0., 1., - D(M, k), ldaM ); + D(M, k), lddM ); #endif } for (m = 0; m < B->mt; m++) { @@ -380,7 +383,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, tempmm, tempMm, tempkmin, ib, T->nb, - D(M, k), ldaM, + D(M, k), lddM, T(M, k), T->mb, B(m, M), ldbm); } @@ -402,18 +405,19 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans tempMm = M == A->mt-1 ? A->m-M*A->mb : A->mb; tempkmin = chameleon_min(tempMm, tempkn); ldaM = BLKLDD(A, M); + lddM = BLKLDD(D, M); if ( genD ) { INSERT_TASK_zlacpy( &options, ChamLower, tempMm, tempkmin, A->nb, A(M, k), ldaM, - D(M, k), ldaM ); + D(M, k), lddM ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, ChamUpper, tempMm, tempkmin, 0., 1., - D(M, k), ldaM ); + D(M, k), lddM ); #endif } for (m = 0; m < B->mt; m++) { @@ -423,7 +427,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans &options, side, trans, tempmm, tempMm, tempkmin, ib, T->nb, - D(M, k), ldaM, + D(M, k), lddM, T(M, k), T->mb, B(m, M), ldbm); } diff --git a/compute/zgelqf.c b/compute/zgelqf.c index 5a0e2cd7815a5f78e890c3cab40fae540e51a1b9..f64b4ee0ca8a4d0d0cad5f32fb527a48420d83f2 100644 --- a/compute/zgelqf.c +++ b/compute/zgelqf.c @@ -280,7 +280,7 @@ int CHAMELEON_zgelqf_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, */ #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zgelqf_param.c b/compute/zgelqf_param.c index c686323ed04565cade1b1d6797ba00f91291c0b1..031f2f921c1e52bda0e15b664a37df4f57771180 100644 --- a/compute/zgelqf_param.c +++ b/compute/zgelqf_param.c @@ -284,7 +284,7 @@ int CHAMELEON_zgelqf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t */ #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zgelqs.c b/compute/zgelqs.c index 46fd149c9c763b9ed2f805980738e055bb56b51e..d2795a97e71d2e471059186491f4ff6493505f10 100644 --- a/compute/zgelqs.c +++ b/compute/zgelqs.c @@ -320,7 +320,7 @@ int CHAMELEON_zgelqs_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zgelqs_param.c b/compute/zgelqs_param.c index 70627ae7434aacad4d09f61d333155219b46e86c..b24d70740a12989054a4ccb1f61f16a44518b975 100644 --- a/compute/zgelqs_param.c +++ b/compute/zgelqs_param.c @@ -332,7 +332,7 @@ int CHAMELEON_zgelqs_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zgels.c b/compute/zgels.c index e49f27f9c8f987f8715037e1b47cb976923588a1..bf620d79d333bd0ea5a66c68f74ea5be6f6df0d4 100644 --- a/compute/zgels.c +++ b/compute/zgels.c @@ -373,10 +373,9 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A, } */ if (A->m >= A->n) { - #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } @@ -401,7 +400,7 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A, free(subB); */ #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zgels_param.c b/compute/zgels_param.c index c7e0f790354a1d54e8daaeb8f2b31efab1e39bb9..9ee807f0c47f93fa6667a6fd94d1884acfaad53b 100644 --- a/compute/zgels_param.c +++ b/compute/zgels_param.c @@ -387,7 +387,7 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } @@ -403,7 +403,7 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t else { #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zgeqrf.c b/compute/zgeqrf.c index b64b6d105e912d4688ef18471e6e60856944c310..ed7f122f66f88e1471059c890cf6878b2057e877 100644 --- a/compute/zgeqrf.c +++ b/compute/zgeqrf.c @@ -279,7 +279,7 @@ int CHAMELEON_zgeqrf_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, */ #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zgeqrf_param.c b/compute/zgeqrf_param.c index 4565554b3a8643c5858ef6f6ba57ce298751d655..5915c3c9fbea12eef035d256e04c7921efa72675 100644 --- a/compute/zgeqrf_param.c +++ b/compute/zgeqrf_param.c @@ -295,7 +295,7 @@ int CHAMELEON_zgeqrf_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t */ #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zgeqrs.c b/compute/zgeqrs.c index bd1a839e24a03c0a89385580d6f8af719e78d818..bfbf1355356484e7f23589d6c47a9651ec67d0ad 100644 --- a/compute/zgeqrs.c +++ b/compute/zgeqrs.c @@ -309,7 +309,7 @@ int CHAMELEON_zgeqrs_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *B, */ #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zgeqrs_param.c b/compute/zgeqrs_param.c index 131992abdc141e09b76f25a6371725060b904042..a26d4d55086828f22e73c5e52825994f576b583f 100644 --- a/compute/zgeqrs_param.c +++ b/compute/zgeqrs_param.c @@ -313,7 +313,7 @@ int CHAMELEON_zgeqrs_param_Tile_Async( const libhqr_tree_t *qrtree, */ #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zunglq.c b/compute/zunglq.c index 0693e929f1eb362882bfd0b2f3299b435c9dc0d1..6b851c40fe07fc56cdf74539d0a225e71a202ff1 100644 --- a/compute/zunglq.c +++ b/compute/zunglq.c @@ -302,7 +302,7 @@ int CHAMELEON_zunglq_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *Q, #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zunglq_param.c b/compute/zunglq_param.c index b89d0d5c8dd345c1030c4f50bb8b276598b00c57..5b2856c241681e86448cdb8093a692546065b4b3 100644 --- a/compute/zunglq_param.c +++ b/compute/zunglq_param.c @@ -305,7 +305,7 @@ int CHAMELEON_zunglq_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t */ #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zungqr.c b/compute/zungqr.c index 372aebe46298c48c5fd1390a0c9fee437082fae5..afbb19dbc81e571ab421ca88800f3ae434b7f5a0 100644 --- a/compute/zungqr.c +++ b/compute/zungqr.c @@ -299,7 +299,7 @@ int CHAMELEON_zungqr_Tile_Async( CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *Q, */ #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zungqr_param.c b/compute/zungqr_param.c index de8caa0638e130f6fd11749c313c040af026b3e3..fa2bcc46db1bca49c72defe44101f332a386b9a4 100644 --- a/compute/zungqr_param.c +++ b/compute/zungqr_param.c @@ -303,7 +303,7 @@ int CHAMELEON_zungqr_param_Tile_Async( const libhqr_tree_t *qrtree, CHAM_desc_t */ #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zunmlq.c b/compute/zunmlq.c index 1448429838d34d5a7b1a3075280a4045b5331779..50763f9f572030ed801f3b05f78ae4307900b638 100644 --- a/compute/zunmlq.c +++ b/compute/zunmlq.c @@ -362,7 +362,7 @@ int CHAMELEON_zunmlq_Tile_Async( cham_side_t side, cham_trans_t trans, */ #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zunmlq_param.c b/compute/zunmlq_param.c index affdaff790e0c7137ef67e3c24e57cdbef9230ce..623a54ee9f5f8e49a24514ec0857c1ec987aedb1 100644 --- a/compute/zunmlq_param.c +++ b/compute/zunmlq_param.c @@ -367,7 +367,7 @@ int CHAMELEON_zunmlq_param_Tile_Async( const libhqr_tree_t *qrtree, cham_side_t #if defined(CHAMELEON_COPY_DIAG) { - int m = chameleon_min(A->mt, A->nt) * A->mb; + int m = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, ); Dptr = &D; } diff --git a/compute/zunmqr.c b/compute/zunmqr.c index 18d5a0ff336b8924281fb613a6e770740e952a58..5f20272584e13f37031aad737186b5747371969f 100644 --- a/compute/zunmqr.c +++ b/compute/zunmqr.c @@ -365,7 +365,7 @@ int CHAMELEON_zunmqr_Tile_Async( cham_side_t side, cham_trans_t trans, #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/compute/zunmqr_param.c b/compute/zunmqr_param.c index 8751255da2eba667ad427c90ccb8e9a3821c1231..78d59cc037adf0fcb826240bfa2ef53faf8fc82f 100644 --- a/compute/zunmqr_param.c +++ b/compute/zunmqr_param.c @@ -374,7 +374,7 @@ int CHAMELEON_zunmqr_param_Tile_Async( const libhqr_tree_t *qrtree, #if defined(CHAMELEON_COPY_DIAG) { - int n = chameleon_min(A->mt, A->nt) * A->nb; + int n = chameleon_min(A->m, A->n); chameleon_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, ); Dptr = &D; } diff --git a/control/workspace.c b/control/workspace.c index 28ffa861552114188a5814670d681842f258265e..e29a9358c70fa6a89973f9d14d0fbd92f029f11c 100644 --- a/control/workspace.c +++ b/control/workspace.c @@ -67,7 +67,9 @@ int chameleon_alloc_ibnb_tile(int M, int N, cham_tasktype_t func, int type, CHAM (func == CHAMELEON_FUNC_DGESVD) || (func == CHAMELEON_FUNC_CGESVD) || (func == CHAMELEON_FUNC_ZGESVD))) + { NT *= 2; + } lm = IB * MT; ln = NB * NT; diff --git a/testing/testing_zlange.c b/testing/testing_zlange.c index b0aaf5cd50025ca1e037d0626561409c771a8104..5aff7317778d37ad70541698c1f742b24c355b4e 100644 --- a/testing/testing_zlange.c +++ b/testing/testing_zlange.c @@ -54,6 +54,8 @@ int testing_zlange(int argc, char **argv) double *work = (double*) malloc(max(M,N)*sizeof(double)); double normcham, normlapack, result; + RUNTIME_comm_set_tag_sizes( 31, 16 ); + eps = LAPACKE_dlamch_work('e'); printf("\n"); @@ -238,5 +240,5 @@ int testing_zlange(int argc, char **argv) free(A); free(work); - return 0 /*hres*/; + return hres; }