diff --git a/compute/pzlange.c b/compute/pzlange.c index b5f58e47d25fc1c231faefa374e85b9b8e250b79..f909fa25fa34ff17bc6d0c72b811916a187d284e 100644 --- a/compute/pzlange.c +++ b/compute/pzlange.c @@ -329,13 +329,14 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ else { INSERT_TASK_zgessq( options, + ChamEltwise, tempmm, tempnn, A(m, n), ldam, W( Welt, m, n) ); } if ( n >= Q ) { INSERT_TASK_dplssq( - options, W( Welt, m, n), W( Welt, m, n%Q) ); + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, n%Q) ); } } @@ -345,7 +346,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ */ for(n = 1; n < Q; n++) { INSERT_TASK_dplssq( - options, W( Welt, m, n), W( Welt, m, 0) ); + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) ); } } @@ -355,7 +356,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ */ for(m = P; m < MT; m++) { INSERT_TASK_dplssq( - options, W( Welt, m, 0), W( Welt, m%P, 0) ); + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) ); } /** @@ -364,11 +365,11 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ */ for(m = 1; m < P; m++) { INSERT_TASK_dplssq( - options, W( Welt, m, 0), W( Welt, 0, 0) ); + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) ); } INSERT_TASK_dplssq2( - options, W( Welt, 0, 0) ); + options, 1, W( Welt, 0, 0) ); } /** diff --git a/compute/pzlansy.c b/compute/pzlansy.c index 71189ef578ffa2fd5f5b3d3feac28508d7bb86c5..e799f57285430e09eccd348abdca54d0c31d6cdc 100644 --- a/compute/pzlansy.c +++ b/compute/pzlansy.c @@ -241,21 +241,21 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, if ( n == m ) { if ( trans == ChamConjTrans) { INSERT_TASK_zhessq( - options, uplo, tempmm, + options, ChamEltwise, uplo, tempmm, A(m, n), ldam, W( Welt, m, n) ); } else { INSERT_TASK_zsyssq( - options, uplo, tempmm, + options, ChamEltwise, uplo, tempmm, A(m, n), ldam, W( Welt, m, n) ); } } else { INSERT_TASK_zgessq( - options, tempmm, tempnn, + options, ChamEltwise, tempmm, tempnn, A(m, n), ldam, W( Welt, m, n) ); INSERT_TASK_zgessq( - options, tempmm, tempnn, + options, ChamEltwise, tempmm, tempnn, A(m, n), ldam, W( Welt, n, m) ); } } @@ -264,7 +264,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, for(m = 0; m < MT; m++) { for(n = Q; n < NT; n++) { INSERT_TASK_dplssq( - options, W( Welt, m, n), W( Welt, m, n%Q) ); + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, n%Q) ); } /** @@ -273,7 +273,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, */ for(n = 1; n < Q; n++) { INSERT_TASK_dplssq( - options, W( Welt, m, n), W( Welt, m, 0) ); + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) ); } } @@ -283,7 +283,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, */ for(m = P; m < MT; m++) { INSERT_TASK_dplssq( - options, W( Welt, m, 0), W( Welt, m%P, 0) ); + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) ); } /** @@ -292,11 +292,11 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, */ for(m = 1; m < P; m++) { INSERT_TASK_dplssq( - options, W( Welt, m, 0), W( Welt, 0, 0) ); + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) ); } INSERT_TASK_dplssq2( - options, W( Welt, 0, 0) ); + options, 1, W( Welt, 0, 0) ); } /** diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 009cd37dbb5112325e7622b5eef71715d661fde1..d44a83fd7a2480d0c5197f4b4d4477aebfa4be16 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -66,6 +66,7 @@ set(ZSRC core_zplghe.c core_zplgsy.c core_zplrnt.c + core_zplssq.c core_zpotrf.c core_zssssm.c core_zsymm.c diff --git a/coreblas/compute/core_zgessq.c b/coreblas/compute/core_zgessq.c index aba33e5f5102baf1632332be7cbbc2d97d2259c5..71aa407bdebde121a438604a3d08a9145b802e2f 100644 --- a/coreblas/compute/core_zgessq.c +++ b/coreblas/compute/core_zgessq.c @@ -15,23 +15,98 @@ * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge + * @author Florent Pruvost * @date 2014-11-16 * @precisions normal z -> c d s * */ #include <math.h> #include "coreblas/lapacke.h" +#include "coreblas/sumsq_update.h" #include "coreblas.h" -#define UPDATE( __nb, __value ) \ - if (__value != 0. ){ \ - if ( *scale < __value ) { \ - *sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \ - *scale = __value; \ - } else { \ - *sumsq = *sumsq + __nb * ( __value / *scale ) * ( __value / *scale ); \ - } \ +/** + * @brief Subcase storev == ChamColumnwise of CORE_zgessq() + */ +static inline int +CORE_zgessq_col( int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr, *tmpScale, *tmpSumsq; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * LDA ); + tmpScale = sclssq+2*j; + tmpSumsq = sclssq+2*j+1; + for(i=0; i<M; i++, ptr++) { + tmp = fabs(*ptr); + sumsq_update( 1., tmpScale, tmpSumsq, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., tmpScale, tmpSumsq, &tmp ); +#endif + } } + return CHAMELEON_SUCCESS; +} +/** + * @brief Subcase storev == ChamRowwise of CORE_zgessq() + */ +static inline int +CORE_zgessq_row( int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr, *tmpScale, *tmpSumsq; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * LDA ); + tmpScale = sclssq; + tmpSumsq = sclssq+1; + for(i=0; i<M; i++, ptr++, tmpScale+=2, tmpSumsq+=2) { + tmp = fabs(*ptr); + sumsq_update( 1., tmpScale, tmpSumsq, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., tmpScale, tmpSumsq, &tmp ); +#endif + } + } + return CHAMELEON_SUCCESS; +} +/** + * @brief Subcase storev == ChamEltwise of CORE_zgessq() + */ +static inline int +CORE_zgessq_elt( int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * LDA ); + for(i=0; i<M; i++, ptr++) { + tmp = fabs(*ptr); + sumsq_update( 1., sclssq, sclssq+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq, sclssq+1, &tmp ); +#endif + } + } + return CHAMELEON_SUCCESS; +} /** * @@ -60,6 +135,12 @@ * ******************************************************************************* * + * @param[in] storev + * Specifies whether the sums are made per column or row. + * = ChamColumnwise: Computes the sum of squares on each column + * = ChamRowwise: Computes the sum of squares on each row + * = ChamEltwise: Computes the sum of squares on all the matrix + * * @param[in] M * The number of rows in the tile A. * @@ -72,13 +153,16 @@ * @param[in] LDA * The leading dimension of the tile A. LDA >= max(1,M). * - * @param[in,out] scale - * On entry, the value scale in the equation above. - * On exit, scale is overwritten with the value scl. - * - * @param[in,out] sumsq - * On entry, the value sumsq in the equation above. - * On exit, SUMSQ is overwritten with the value ssq. + * @param[in,out] sclssq + * Dimension: (2,K) + * if storev == ChamColumnwise, K = N + * if storev == ChamRowwise, K = M + * if storev == ChamEltwise, K = 1 + * On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1]) + * which corresponds to (scale, sumsq) in the equation below + * ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq, + * respectively for the columns, the rows and the full matrix + * On exit, each couple is overwritten with the final result (scl, ssq). * ******************************************************************************* * @@ -86,26 +170,18 @@ * @retval -k, the k-th argument had an illegal value * */ -int CORE_zgessq(int M, int N, - const CHAMELEON_Complex64_t *A, int LDA, - double *scale, double *sumsq) +int CORE_zgessq( cham_store_t storev, int M, int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) { - int i, j; - double tmp; - double *ptr; - - for(j=0; j<N; j++) { - ptr = (double*) ( A + j * LDA ); - for(i=0; i<M; i++, ptr++) { - tmp = fabs(*ptr); - UPDATE( 1., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; - tmp = fabs(*ptr); - UPDATE( 1., tmp ); -#endif - } + if (storev == ChamColumnwise) { + CORE_zgessq_col( M, N, A, LDA, sclssq ); + } + else if (storev == ChamRowwise) { + CORE_zgessq_row( M, N, A, LDA, sclssq ); + } + else { + CORE_zgessq_elt( M, N, A, LDA, sclssq ); } return CHAMELEON_SUCCESS; } diff --git a/coreblas/compute/core_zhessq.c b/coreblas/compute/core_zhessq.c index 482cc6fb42d14acf3f945801f645e734001ce317..255e7e7f716f6b2cd3475c11aefac373814a2ae8 100644 --- a/coreblas/compute/core_zhessq.c +++ b/coreblas/compute/core_zhessq.c @@ -19,20 +19,8 @@ * @precisions normal z -> c * */ -#include <math.h> -#include "coreblas/lapacke.h" #include "coreblas.h" -#define UPDATE( __nb, __value ) \ - if (__value != 0. ){ \ - if ( *scale < __value ) { \ - *sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \ - *scale = __value; \ - } else { \ - *sumsq = *sumsq + __nb * ( __value / *scale ) * ( __value / *scale ); \ - } \ - } - /** * * @ingroup CORE_CHAMELEON_Complex64_t @@ -56,18 +44,24 @@ * SCALE and SUMSQ are overwritten by scl and ssq respectively. * * The routine makes only one pass through the tile triangular part of the - * hermitian tile A defined by uplo. + * symmetric tile A defined by uplo. * See also LAPACK zlassq.f * ******************************************************************************* * + * @param[in] storev + * Specifies whether the sums are made per column or row. + * = ChamColumnwise: Computes the sum of squares on each column + * = ChamRowwise: Computes the sum of squares on each row + * = ChamEltwise: Computes the sum of squares on all the matrix + * * @param[in] uplo * Specifies whether the upper or lower triangular part of - * the hermitian matrix A is to be referenced as follows: + * the symmetric matrix A is to be referenced as follows: * = ChamLower: Only the lower triangular part of the - * hermitian matrix A is to be referenced. + * symmetric matrix A is to be referenced. * = ChamUpper: Only the upper triangular part of the - * hermitian matrix A is to be referenced. + * symmetric matrix A is to be referenced. * * @param[in] N * The number of columns and rows in the tile A. @@ -78,13 +72,16 @@ * @param[in] LDA * The leading dimension of the tile A. LDA >= max(1,N). * - * @param[in,out] scale - * On entry, the value scale in the equation above. - * On exit, scale is overwritten with the value scl. - * - * @param[in,out] sumsq - * On entry, the value sumsq in the equation above. - * On exit, SUMSQ is overwritten with the value ssq. + * @param[in,out] sclssq + * Dimension: (2,K) + * if storev == ChamColumnwise, K = N + * if storev == ChamRowwise, K = N + * if storev == ChamEltwise, K = 1 + * On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1]) + * which corresponds to (scale, sumsq) in the equation below + * ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq, + * respectively for the columns, the rows and the full matrix + * On exit, each couple is overwritten with the final result (scl, ssq). * ******************************************************************************* * @@ -92,65 +89,9 @@ * @retval -k, the k-th argument had an illegal value * */ - -int CORE_zhessq(cham_uplo_t uplo, int N, - const CHAMELEON_Complex64_t *A, int LDA, - double *scale, double *sumsq) +int CORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) { - int i, j; - double tmp; - double *ptr; - - if ( uplo == ChamUpper ) { - for(j=0; j<N; j++) { - ptr = (double*) ( A + j * LDA ); - - for(i=0; i<j; i++, ptr++) { - - tmp = fabs(*ptr); - UPDATE( 2., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; - tmp = fabs(*ptr); - UPDATE( 2., tmp ); -#endif - } - - /* Diagonal */ - tmp = fabs(*ptr); - UPDATE( 1., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; -#endif - } - } else { - - for(j=0; j<N; j++) { - ptr = (double*) ( A + j * LDA + j); - - /* Diagonal */ - tmp = fabs(*ptr); - UPDATE( 1., tmp ); - ptr++; - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; -#endif - - for(i=j+1; i<N; i++, ptr++) { - - tmp = fabs(*ptr); - UPDATE( 2., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; - tmp = fabs(*ptr); - UPDATE( 2., tmp ); -#endif - } - } - } - return CHAMELEON_SUCCESS; + return CORE_zsyssq( storev, uplo, N, A, LDA, sclssq ); } diff --git a/coreblas/compute/core_zplssq.c b/coreblas/compute/core_zplssq.c new file mode 100644 index 0000000000000000000000000000000000000000..9549453dcd50df72fc2780eadc80d81ceeac9a83 --- /dev/null +++ b/coreblas/compute/core_zplssq.c @@ -0,0 +1,193 @@ +/** + * + * @file core_zplssq.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon core_zplssq CPU kernel + * + * @version 0.9.2 + * @author Mathieu Faverge + * @author Florent Pruvost + * @date 2019-04-01 + * @precisions normal z -> c d s + * + */ +#include <math.h> +#include "coreblas/lapacke.h" +#include "coreblas/sumsq_update.h" +#include "coreblas.h" + +/** + * @brief Subcase storev == ChamColumnwise of CORE_zplssq() + */ +static inline int +CORE_zplssq_col( int M, int N, + double *sclssqin, + double *sclssqout ) +{ + int i, j; + double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut; + + tmpScaleIn = sclssqin; + tmpSumsqIn = sclssqin+1; + tmpScaleOut = sclssqout; + tmpSumsqOut = sclssqout+1; + + for(j=0; j<N; j++) { + for(i=0; i<M; i++) { + sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); + tmpScaleIn+=2; + tmpSumsqIn+=2; + } + tmpScaleOut+=2; + tmpSumsqOut+=2; + } + return CHAMELEON_SUCCESS; +} +/** + * @brief Subcase storev == ChamRowwise of CORE_zplssq() + */ +static inline int +CORE_zplssq_row( int M, int N, + double *sclssqin, + double *sclssqout ) +{ + int i, j; + double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut; + + tmpScaleIn = sclssqin; + tmpSumsqIn = sclssqin+1; + + for(j=0; j<N; j++) { + tmpScaleOut = sclssqout; + tmpSumsqOut = sclssqout+1; + for(i=0; i<M; i++) { + sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); + tmpScaleIn+=2; + tmpSumsqIn+=2; + tmpScaleOut+=2; + tmpSumsqOut+=2; + } + } + return CHAMELEON_SUCCESS; +} +/** + * @brief Subcase storev == ChamEltwise of CORE_zplssq() + */ +static inline int +CORE_zplssq_elt( int M, int N, + double *sclssqin, + double *sclssqout ) +{ + int i, j; + double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut; + + tmpScaleIn = sclssqin; + tmpSumsqIn = sclssqin+1; + tmpScaleOut = sclssqout; + tmpSumsqOut = sclssqout+1; + + for(j=0; j<N; j++) { + for(i=0; i<M; i++) { + sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut ); + tmpScaleIn+=2; + tmpSumsqIn+=2; + } + } + return CHAMELEON_SUCCESS; +} + +/** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zplssq adds the values ssq ensuring scl >= 0 + * + ******************************************************************************* + * + * @param[in] storev + * Specifies whether the sums are made per column or row. + * = ChamColumnwise: Computes the sum of squares on each column + * = ChamRowwise: Computes the sum of squares on each row + * = ChamEltwise: Computes the sum of squares on all the matrix + * + * @param[in] M + * The number of rows in the tile sclssqin. + * + * @param[in] N + * The number of columns in the tile sclssqin. + * + * @param[in] sclssqin + * The 2*M-by-2*N matrix on which to compute the sums. + * + * @param[in,out] sclssqout + * Dimension: (2,K) + * if storev == ChamColumnwise, K = N + * if storev == ChamRowwise, K = M + * if storev == ChamEltwise, K = 1 + * On entry, sclssqout contains M-by-N couples (sclssqout[2*i], sclssqout[2*i+1]) + * which corresponds to (scale, sumsq) in the equation below + * ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq, + * respectively for the columns, the rows and the full matrix + * On exit, each couple is overwritten with the final result (scl, ssq). + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval -k, the k-th argument had an illegal value + * + */ +int CORE_zplssq( cham_store_t storev, int M, int N, + double *sclssqin, + double *sclssqout ) +{ + if (storev == ChamColumnwise) { + CORE_zplssq_col( M, N, sclssqin, sclssqout ); + } + else if (storev == ChamRowwise) { + CORE_zplssq_row( M, N, sclssqin, sclssqout ); + } + else { + CORE_zplssq_elt( M, N, sclssqin, sclssqout ); + } + return CHAMELEON_SUCCESS; +} + +/** + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * CORE_zplssq2 computes scl*sqrt(ssq) for each couple in a vector + * + ******************************************************************************* + * + * @param[in] N + * The number of columns in the tile sclssq. + * + * @param[in,out] sclssq + * The 2*N matrix on which to compute scl*sqrt(ssq) + * On entry contains all couple (scl,ssq) in (sclssq[i],sclssq[i+1]) + * On exist returns sclssq[i]=sclssq[i]*sqrt(sclssq[i+1]) + * so that the result is stored in the first line. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval -k, the k-th argument had an illegal value + * + */ +int CORE_zplssq2( int N, + double *sclssq ) +{ + int i; + for (i=0; i<N; i+=2) { + sclssq[i] *= sqrt ( sclssq[i+1] ); + } + return CHAMELEON_SUCCESS; +} diff --git a/coreblas/compute/core_zsyssq.c b/coreblas/compute/core_zsyssq.c index c7abf5a71d25a37c353d0bcb57409c694d89c63a..dfad10e3e6d90b61e737c6a283452c791c37bea6 100644 --- a/coreblas/compute/core_zsyssq.c +++ b/coreblas/compute/core_zsyssq.c @@ -21,17 +21,155 @@ */ #include <math.h> #include "coreblas/lapacke.h" +#include "coreblas/sumsq_update.h" #include "coreblas.h" -#define UPDATE( __nb, __value ) \ - if (__value != 0. ){ \ - if ( *scale < __value ) { \ - *sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \ - *scale = __value; \ - } else { \ - *sumsq = *sumsq + __nb * ( __value / *scale ) * ( __value / *scale ); \ - } \ +/** + * @brief Subcase uplo == ChamUpper, storev == ChamColumnwise || ChamRowwise of CORE_zsyssq() + */ +static inline int +CORE_zsyssq_up_col( int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * LDA ); + for(i=0; i<j; i++, ptr++) { + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp ); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp ); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#endif + } + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#endif + } + return CHAMELEON_SUCCESS; +} + +/** + * @brief Subcase uplo == ChamLower, storev == ChamColumnwise || ChamRowwise of CORE_zsyssq() + */ +static inline int +CORE_zsyssq_lo_col( int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * (LDA + 1) ); + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#endif + ptr++; + for(i=j+1; i<N; i++, ptr++) { + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp ); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp ); + sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp ); +#endif + } } + return CHAMELEON_SUCCESS; +} + +/** + * @brief Subcase uplo == ChamUpper, storev == ChamEltwise of CORE_zsyssq() + */ +static inline int +CORE_zsyssq_up_elt( int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * LDA ); + + for(i=0; i<j; i++, ptr++) { + tmp = fabs(*ptr); + sumsq_update( 2., sclssq, sclssq+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 2., sclssq, sclssq+1, &tmp ); +#endif + } + + tmp = fabs(*ptr); + sumsq_update( 1., sclssq, sclssq+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq, sclssq+1, &tmp ); +#endif + } + return CHAMELEON_SUCCESS; +} + +/** + * @brief Subcase uplo == ChamLower, storev == ChamEltwise of CORE_zsyssq() + */ +static inline int +CORE_zsyssq_lo_elt( int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) +{ + int i, j; + double tmp; + double *ptr; + + for(j=0; j<N; j++) { + ptr = (double*) ( A + j * (LDA + 1) ); + + + tmp = fabs(*ptr); + sumsq_update( 1., sclssq, sclssq+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 1., sclssq, sclssq+1, &tmp ); +#endif + ptr++; + + for(i=j+1; i<N; i++, ptr++) { + tmp = fabs(*ptr); + sumsq_update( 2., sclssq, sclssq+1, &tmp ); +#if defined(PRECISION_z) || defined(PRECISION_c) + ptr++; + tmp = fabs(*ptr); + sumsq_update( 2., sclssq, sclssq+1, &tmp ); +#endif + } + } + return CHAMELEON_SUCCESS; +} /** * @@ -61,6 +199,12 @@ * ******************************************************************************* * + * @param[in] storev + * Specifies whether the sums are made per column or row. + * = ChamColumnwise: Computes the sum of squares on each column + * = ChamRowwise: Computes the sum of squares on each row + * = ChamEltwise: Computes the sum of squares on all the matrix + * * @param[in] uplo * Specifies whether the upper or lower triangular part of * the symmetric matrix A is to be referenced as follows: @@ -78,13 +222,16 @@ * @param[in] LDA * The leading dimension of the tile A. LDA >= max(1,N). * - * @param[in,out] scale - * On entry, the value scale in the equation above. - * On exit, scale is overwritten with the value scl. - * - * @param[in,out] sumsq - * On entry, the value sumsq in the equation above. - * On exit, SUMSQ is overwritten with the value ssq. + * @param[in,out] sclssq + * Dimension: (2,K) + * if storev == ChamColumnwise, K = N + * if storev == ChamRowwise, K = N + * if storev == ChamEltwise, K = 1 + * On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1]) + * which corresponds to (scale, sumsq) in the equation below + * ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq, + * respectively for the columns, the rows and the full matrix + * On exit, each couple is overwritten with the final result (scl, ssq). * ******************************************************************************* * @@ -93,67 +240,21 @@ * */ -int CORE_zsyssq(cham_uplo_t uplo, int N, - const CHAMELEON_Complex64_t *A, int LDA, - double *scale, double *sumsq) +int CORE_zsyssq( cham_store_t storev, cham_uplo_t uplo, int N, + const CHAMELEON_Complex64_t *A, int LDA, + double *sclssq ) { - int i, j; - double tmp; - double *ptr; - if ( uplo == ChamUpper ) { - for(j=0; j<N; j++) { - ptr = (double*) ( A + j * LDA ); - - for(i=0; i<j; i++, ptr++) { - - tmp = fabs(*ptr); - UPDATE( 2., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; - tmp = fabs(*ptr); - UPDATE( 2., tmp ); -#endif - } - - /* Diagonal */ - tmp = fabs(*ptr); - UPDATE( 1., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; - tmp = fabs(*ptr); - UPDATE( 1., tmp ); -#endif + if ( storev == ChamEltwise ) { + CORE_zsyssq_up_elt( N, A, LDA, sclssq ); + } else { + CORE_zsyssq_up_col( N, A, LDA, sclssq ); } } else { - - for(j=0; j<N; j++) { - ptr = (double*) ( A + j * LDA + j); - - /* Diagonal */ - tmp = fabs(*ptr); - UPDATE( 1., tmp ); - ptr++; - -#if defined(PRECISION_z) || defined(PRECISION_c) - tmp = fabs(*ptr); - UPDATE( 1., tmp ); - ptr++; -#endif - - for(i=j+1; i<N; i++, ptr++) { - - tmp = fabs(*ptr); - UPDATE( 2., tmp ); - -#if defined(PRECISION_z) || defined(PRECISION_c) - ptr++; - tmp = fabs(*ptr); - UPDATE( 2., tmp ); -#endif - } + if ( storev == ChamEltwise ) { + CORE_zsyssq_lo_elt( N, A, LDA, sclssq ); + } else { + CORE_zsyssq_lo_col( N, A, LDA, sclssq ); } } return CHAMELEON_SUCCESS; diff --git a/coreblas/eztrace_module/coreblas_eztrace_module b/coreblas/eztrace_module/coreblas_eztrace_module index 3bca3cbf12c4763eec665bf20a02a23d38ee72ae..389858563abc4cfe8cca9e5d40109f6c2a461e3d 100644 --- a/coreblas/eztrace_module/coreblas_eztrace_module +++ b/coreblas/eztrace_module/coreblas_eztrace_module @@ -32,9 +32,9 @@ int CORE_cgessm(int M, int N, int K, int IB, const int *IPIV, void *L, int LDL, void *A, int LDA); -int CORE_cgessq(int M, int N, +int CORE_cgessq(cham_store_t storev, int M, int N, void *A, int LDA, - float *scale, float *sumsq); + float *sclssq); int CORE_cgetf2_nopiv(int M, int N, void *A, int LDA); int CORE_cgetrf(int M, int N, @@ -70,9 +70,9 @@ void CORE_cher2k(int uplo, int trans, void *alpha, void *A, int LDA, void *B, int LDB, float beta, void *C, int LDC); -int CORE_chessq(int uplo, int N, +int CORE_chessq(cham_store_t storev, int uplo, int N, void *A, int LDA, - float *scale, float *sumsq); + float *sclssq); int CORE_cherfb(int uplo, int N, int K, int IB, int NB, void *A, int LDA, @@ -190,9 +190,9 @@ void CORE_csyr2k(int uplo, int trans, void *alpha, void *A, int LDA, void *B, int LDB, void *beta, void *C, int LDC); -int CORE_csyssq(int uplo, int N, +int CORE_csyssq(cham_store_t storev, int uplo, int N, void *A, int LDA, - float *scale, float *sumsq); + float *sclssq); int CORE_csytf2_nopiv(int uplo, int n, void *A, int lda); void CORE_ctrmm(int side, int uplo, int transA, int diag, @@ -334,9 +334,9 @@ int CORE_dgessm(int M, int N, int K, int IB, const int *IPIV, const double *L, int LDL, double *A, int LDA); -int CORE_dgessq(int M, int N, +int CORE_dgessq(cham_store_t storev, int M, int N, const double *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_dgetf2_nopiv(int M, int N, double *A, int LDA); int CORE_dgetrf(int M, int N, @@ -372,9 +372,9 @@ void CORE_dsyr2k(int uplo, int trans, double alpha, const double *A, int LDA, const double *B, int LDB, double beta, double *C, int LDC); -int CORE_dsyssq(int uplo, int N, +int CORE_dsyssq(cham_store_t storev, int uplo, int N, const double *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_dsyrfb(int uplo, int N, int K, int IB, int NB, const double *A, int LDA, @@ -621,9 +621,9 @@ int CORE_sgessm(int M, int N, int K, int IB, const int *IPIV, const float *L, int LDL, float *A, int LDA); -int CORE_sgessq(int M, int N, +int CORE_sgessq(cham_store_t storev, int M, int N, const float *A, int LDA, - float *scale, float *sumsq); + float *sclssq); int CORE_sgetf2_nopiv(int M, int N, float *A, int LDA); int CORE_sgetrf(int M, int N, @@ -659,9 +659,9 @@ void CORE_ssyr2k(int uplo, int trans, float alpha, const float *A, int LDA, const float *B, int LDB, float beta, float *C, int LDC); -int CORE_ssyssq(int uplo, int N, +int CORE_ssyssq(cham_store_t storev, int uplo, int N, const float *A, int LDA, - float *scale, float *sumsq); + float *sclssq); int CORE_ssyrfb(int uplo, int N, int K, int IB, int NB, const float *A, int LDA, @@ -901,9 +901,9 @@ int CORE_zgessm(int M, int N, int K, int IB, const int *IPIV, void *L, int LDL, void *A, int LDA); -int CORE_zgessq(int M, int N, +int CORE_zgessq(cham_store_t storev, int M, int N, void *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_zgetf2_nopiv(int M, int N, void *A, int LDA); int CORE_zgetrf(int M, int N, @@ -939,9 +939,9 @@ void CORE_zher2k(int uplo, int trans, void *alpha, void *A, int LDA, void *B, int LDB, double beta, void *C, int LDC); -int CORE_zhessq(int uplo, int N, +int CORE_zhessq(cham_store_t storev, int uplo, int N, void *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_zherfb(int uplo, int N, int K, int IB, int NB, void *A, int LDA, @@ -1028,6 +1028,9 @@ void CORE_zplgsy(void *bump, int m, int n, void *A, int lda, int bigM, int m0, int n0, unsigned long long int seed ); void CORE_zplrnt(int m, int n, void *A, int lda, int bigM, int m0, int n0, unsigned long long int seed ); +int CORE_zplssq( cham_store_t storev, int M, int N, + double *sclssqin, double *sclssqout ); +int CORE_zplssq2( int N, double *sclssq ); void CORE_zpotrf(int uplo, int N, void *A, int LDA, int *INFO); void CORE_zshift(int s, int m, int n, int L, void *A); @@ -1053,9 +1056,9 @@ void CORE_zsyr2k(int uplo, int trans, void *alpha, void *A, int LDA, void *B, int LDB, void *beta, void *C, int LDC); -int CORE_zsyssq(int uplo, int N, +int CORE_zsyssq(cham_store_t storev, int uplo, int N, void *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_zsytf2_nopiv(int uplo, int n, void *A, int lda); void CORE_ztrmm(int side, int uplo, int transA, int diag, diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index ee434de43d2da6c4219b00f8b8e4236a419adb86..2bd8c01067594c0c85ee2d43aeb6432d5b9ba031 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -64,9 +64,9 @@ int CORE_zgessm(int M, int N, int K, int IB, const int *IPIV, const CHAMELEON_Complex64_t *L, int LDL, CHAMELEON_Complex64_t *A, int LDA); -int CORE_zgessq(int M, int N, +int CORE_zgessq(cham_store_t storev, int M, int N, const CHAMELEON_Complex64_t *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_zgetf2_nopiv(int M, int N, CHAMELEON_Complex64_t *A, int LDA); int CORE_zgetrf(int M, int N, @@ -104,9 +104,9 @@ void CORE_zher2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, const CHAMELEON_Complex64_t *B, int LDB, double beta, CHAMELEON_Complex64_t *C, int LDC); -int CORE_zhessq(cham_uplo_t uplo, int N, +int CORE_zhessq(cham_store_t storev, cham_uplo_t uplo, int N, const CHAMELEON_Complex64_t *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_zherfb(cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAMELEON_Complex64_t *A, int LDA, const CHAMELEON_Complex64_t *T, int LDT, @@ -196,6 +196,9 @@ void CORE_zplgsy(CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t int bigM, int m0, int n0, unsigned long long int seed ); void CORE_zplrnt(int m, int n, CHAMELEON_Complex64_t *A, int lda, int bigM, int m0, int n0, unsigned long long int seed ); +int CORE_zplssq( cham_store_t storev, int M, int N, + double *sclssqin, double *sclssqout ); +int CORE_zplssq2( int N, double *sclssq ); void CORE_zpotrf(cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA, int *INFO); void CORE_zshift(int s, int m, int n, int L, CHAMELEON_Complex64_t *A); @@ -221,9 +224,9 @@ void CORE_zsyr2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, const CHAMELEON_Complex64_t *B, int LDB, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC); -int CORE_zsyssq(cham_uplo_t uplo, int N, +int CORE_zsyssq(cham_store_t storev, cham_uplo_t uplo, int N, const CHAMELEON_Complex64_t *A, int LDA, - double *scale, double *sumsq); + double *sclssq); int CORE_zsytf2_nopiv(cham_uplo_t uplo, int n, CHAMELEON_Complex64_t *A, int lda); int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t alpha, diff --git a/coreblas/include/coreblas/sumsq_update.h b/coreblas/include/coreblas/sumsq_update.h new file mode 100644 index 0000000000000000000000000000000000000000..1c94e85ba13d540c5b72ad3277937829b9375c5b --- /dev/null +++ b/coreblas/include/coreblas/sumsq_update.h @@ -0,0 +1,132 @@ +/** + * + * @file sumsq_update.h + * + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon CPU auxiliary sumsq_update routine + * + * @version 0.9.2 + * @author Mathieu Faverge + * @date 2019-03-27 + * + */ +#ifndef _sumsq_update_h_ +#define _sumsq_update_h_ + +/** + ******************************************************************************* + * + * @brief Update the couple (scale, sumsq) with one element when + * computing the Froebnius norm. + * + * The frobenius norm is equal to scale * sqrt( sumsq ), this method allows to + * avoid overflow in the sum square computation. + * + ******************************************************************************* + * + * @param[inout] scale + * On entry, the former scale + * On exit, the update scale to take into account the value + * + * @param[inout] sumsq + * On entry, the former sumsq + * On exit, the update sumsq to take into account the value + * + * @param[in] value + * The value to integrate into the couple (scale, sumsq) + * + *******************************************************************************/ +static inline void +#if defined(PRECISION_d) || defined(PRECISION_z) +sumsq_update( int nb, double *scale, double *sumsq, const double *value ) +{ + double absval = fabs(*value); + double ratio; + if ( absval != 0. ){ + if ( (*scale) < absval ) { + ratio = (*scale) / absval; + *sumsq = (double)nb + (*sumsq) * ratio * ratio; + *scale = absval; + } else { + ratio = absval / (*scale); + *sumsq = (*sumsq) + (double)nb * ratio * ratio; + } + } +} +#elif defined(PRECISION_s) || defined(PRECISION_c) +sumsq_update( int nb, float *scale, float *sumsq, const float *value ) +{ + float absval = fabs(*value); + float ratio; + if ( absval != 0. ){ + if ( (*scale) < absval ) { + ratio = (*scale) / absval; + *sumsq = (float)nb + (*sumsq) * ratio * ratio; + *scale = absval; + } else { + ratio = absval / (*scale); + *sumsq = (*sumsq) + (float)nb * ratio * ratio; + } + } +} +#endif + +/** + ******************************************************************************* + * + * @brief Update the couple (scale, sumsq) by adding another couple when + * computing the Froebnius norm. + * + * The frobenius norm is equal to scale * sqrt( sumsq ), this method allows to + * avoid overflow in the sum square computation. + * + ******************************************************************************* + * + * @param[inout] scale + * On entry, the former scale + * On exit, the update scale to take into account the value + * + * @param[inout] sumsq + * On entry, the former sumsq + * On exit, the update sumsq to take into account the value + * + * @param[in] value + * The value to integrate into the couple (scale, sumsq) + * + *******************************************************************************/ +static inline void +#if defined(PRECISION_d) || defined(PRECISION_z) +sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *scaleout, double *sumsqout ) +{ + if (*scaleout >= 0.) { + if ( (*scaleout) < (*scalein) ) { + *sumsqout = *sumsqin + (*sumsqout * (( *scaleout / *scalein ) * ( *scaleout / *scalein ))); + *scaleout = *scalein; + } else { + if ( (*scaleout) > 0. ){ + *sumsqout = *sumsqout + (*sumsqin * (( *scalein / *scaleout ) * ( *scalein / *scaleout ))); + } + } + } +} +#elif defined(PRECISION_s) || defined(PRECISION_c) +sumsq_update_2( int nb, const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout ) +{ + if (*scaleout >= 0.) { + if ( (*scaleout) < (*scalein) ) { + *sumsqout = *sumsqin + (*sumsqout * (( *scaleout / *scalein ) * ( *scaleout / *scalein ))); + *scaleout = *scalein; + } else { + if ( (*scaleout) > 0. ){ + *sumsqout = *sumsqout + (*sumsqin * (( *scalein / *scaleout ) * ( *scalein / *scaleout ))); + } + } + } +} +#endif + +#endif /* _sumsq_update_h_ */ diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index 7e623aebf155ab457a8fd15f85e55539553a6967..ec7ea257144d0f1fc0730feb2f5817b98cd956f0 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -151,6 +151,7 @@ typedef enum chameleon_dir_e { typedef enum chameleon_store_e { ChamColumnwise = 401, /**< Column wise storage */ ChamRowwise = 402, /**< Row wise storage */ + ChamEltwise = 403, /**< Element by element storage */ } cham_store_t; diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index c82829c9e78db4c216de1d41eaa0d8793384f37f..2d8ec988208595f60f34b9355fa6f192ba89db6b 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -66,7 +66,7 @@ void INSERT_TASK_zgessm( const RUNTIME_option_t *options, const CHAM_desc_t *D, int Dm, int Dn, int ldd, const CHAM_desc_t *A, int Am, int An, int lda ); void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - int m, int n, + cham_store_t storev, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, @@ -112,7 +112,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, double alpha, const CHAM_desc_t *A, int Am, int An, int lda, double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zhessq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, + cham_store_t storev, cham_uplo_t uplo, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, @@ -172,9 +172,10 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, int bigM, int m0, int n0, unsigned long long int seed ); void INSERT_TASK_zplssq( const RUNTIME_option_t *options, + cham_store_t storev, int M, int N, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ); -void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, +void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ); void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, @@ -205,7 +206,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc ); void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, + cham_store_t storev, cham_uplo_t uplo, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ); void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, diff --git a/runtime/openmp/codelets/codelet_zgessq.c b/runtime/openmp/codelets/codelet_zgessq.c index ae15172d441dcd6bbf8475b9c3d841a5acfc2be3..42453eaacfde2ddc798a70059df8d417c7bd1edb 100644 --- a/runtime/openmp/codelets/codelet_zgessq.c +++ b/runtime/openmp/codelets/codelet_zgessq.c @@ -25,12 +25,12 @@ #include "coreblas/coreblas_z.h" void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - int m, int n, + cham_store_t storev, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); double *ptrScaleSum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); -#pragma omp task firstprivate(m, n, ptrA, lda, ptrScaleSum) depend(in:ptrA[0]) depend(inout:ptrScaleSum[0]) - CORE_zgessq( m, n, ptrA, lda, &ptrScaleSum[0], &ptrScaleSum[1] ); +#pragma omp task firstprivate(storev, m, n, ptrA, lda, ptrScaleSum) depend(in:ptrA[0]) depend(inout:ptrScaleSum[0]) + CORE_zgessq( storev, m, n, ptrA, lda, ptrScaleSum ); } diff --git a/runtime/openmp/codelets/codelet_zhessq.c b/runtime/openmp/codelets/codelet_zhessq.c index 702095aed7388e453b0b9696b0291fc090392bd8..85232d001a1b29286796d1a2e17e77eb095f310c 100644 --- a/runtime/openmp/codelets/codelet_zhessq.c +++ b/runtime/openmp/codelets/codelet_zhessq.c @@ -22,15 +22,13 @@ */ #include "chameleon_openmp.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" void INSERT_TASK_zhessq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) + cham_store_t storev, cham_uplo_t uplo, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); - double *ptrScaleSum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); -#pragma omp task firstprivate(uplo, n, ptrA, lda, ptrScaleSum) depend(in:ptrScaleSum[0]) depend(inout:ptrA[0]) - CORE_zhessq( uplo, n, ptrA, lda, &ptrScaleSum[0], &ptrScaleSum[1] ); + INSERT_TASK_zsyssq( options, storev, uplo, n, + A, Am, An, lda, + SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c index e41484c09702a4f9deddbf2645f904b03ad0a6e4..61196102c2d1ba7646925bb4ff27c2e858e08dd2 100644 --- a/runtime/openmp/codelets/codelet_zplssq.c +++ b/runtime/openmp/codelets/codelet_zplssq.c @@ -25,61 +25,22 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup CORE_CHAMELEON_Complex64_t - * - * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq) - * - * with scl and ssq such that - * - * ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) ) - * i - * - * The values of A(2*i+1) are assumed to be at least unity. - * The values of A(2*i) are assumed to be non-negative and scl is - * - * scl = max( A( 2*i ) ), - * i - * - * The routine makes only one pass through the matrix A. - * - ******************************************************************************* - * - * @param[in] M - * The number of couple (scale, sumsq) in the matrix A. - * - * @param[in] A - * The 2-by-M matrix. - * - * @param[out] result - * On exit, result contains scl * sqrt( ssq ) - * - */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, + cham_store_t storev, int M, int N, const CHAM_desc_t *IN, int INm, int INn, const CHAM_desc_t *OUT, int OUTm, int OUTn ) { double *sclssq_in = RTBLKADDR(IN, double, INm, INn ); double *sclssq_out = RTBLKADDR(OUT, double, OUTm, OUTn); -#pragma omp task depend(in: sclssq_in[0]) depend(inout: sclssq_out[0]) - { - if( sclssq_out[0] < sclssq_in[0] ) { - sclssq_out[1] = sclssq_in[1] + (sclssq_out[1] * (( sclssq_out[0] / sclssq_in[0] ) * ( sclssq_out[0] / sclssq_in[0] ))); - sclssq_out[0] = sclssq_in[0]; - } else { - sclssq_out[1] = sclssq_out[1] + (sclssq_in[1] * (( sclssq_in[0] / sclssq_out[0] ) * ( sclssq_in[0] / sclssq_out[0] ))); - } - } +#pragma omp task firstprivate(storev, M, N) depend(in: sclssq_in[0]) depend(inout: sclssq_out[0]) + CORE_zplssq(storev, M, N, sclssq_in, sclssq_out); } -void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, +void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { - CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, CHAMELEON_Complex64_t, RESULTm, RESULTn); + CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn); -#pragma omp task depend(inout: res[0]) - { - res[0] = res[0] * sqrt( res[1] ); - } +#pragma omp task firstprivate(N) depend(inout: res[0]) + CORE_zplssq2(N, res); } diff --git a/runtime/openmp/codelets/codelet_zsyssq.c b/runtime/openmp/codelets/codelet_zsyssq.c index bc616c81beccd9a2bc9e3711991fe1d2d7d587b6..32a7dc9d52775570d9575755b8d5c05d53945b70 100644 --- a/runtime/openmp/codelets/codelet_zsyssq.c +++ b/runtime/openmp/codelets/codelet_zsyssq.c @@ -23,12 +23,12 @@ #include "chameleon/tasks_z.h" void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, + cham_store_t storev, cham_uplo_t uplo, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An); double *ptrSCALESUMSQ = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn); -#pragma omp task firstprivate(uplo, n, ptrA, lda, ptrSCALESUMSQ) depend(in:ptrA[0]) depend(inout:ptrSCALESUMSQ[0]) - CORE_zsyssq( uplo, n, ptrA, lda, &ptrSCALESUMSQ[0], &ptrSCALESUMSQ[1] ); +#pragma omp task firstprivate(storev, uplo, n, ptrA, lda, ptrSCALESUMSQ) depend(in:ptrA[0]) depend(inout:ptrSCALESUMSQ[0]) + CORE_zsyssq( storev, uplo, n, ptrA, lda, ptrSCALESUMSQ ); } diff --git a/runtime/parsec/codelets/codelet_zgessq.c b/runtime/parsec/codelets/codelet_zgessq.c index eb1ffd6f66d86816206c819274f48265be2f6e88..93b332cbf9117375286077075141c3f4a7f14560 100644 --- a/runtime/parsec/codelets/codelet_zgessq.c +++ b/runtime/parsec/codelets/codelet_zgessq.c @@ -25,6 +25,7 @@ static inline int CORE_zgessq_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) { + cham_store_t storev; int m; int n; CHAMELEON_Complex64_t *A; @@ -32,16 +33,16 @@ CORE_zgessq_parsec( parsec_execution_stream_t *context, double *SCALESUMSQ; parsec_dtd_unpack_args( - this_task, &m, &n, &A, &lda, &SCALESUMSQ ); + this_task, &storev, &m, &n, &A, &lda, &SCALESUMSQ ); - CORE_zgessq( m, n, A, lda, SCALESUMSQ, SCALESUMSQ+1 ); + CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ ); (void)context; return PARSEC_HOOK_RETURN_DONE; } void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - int m, int n, + cham_store_t storev, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { @@ -49,10 +50,11 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zgessq_parsec, options->priority, "gessq", - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, + sizeof(cham_store_t), &storev, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &lda, VALUE, PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zhessq.c b/runtime/parsec/codelets/codelet_zhessq.c index 238e56bcbc80cb0a74b6d55b0049bb71d32a3072..2229c62eb91c03e52cbc848f482b2702a40a4c0c 100644 --- a/runtime/parsec/codelets/codelet_zhessq.c +++ b/runtime/parsec/codelets/codelet_zhessq.c @@ -19,40 +19,13 @@ */ #include "chameleon_parsec.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -static inline int -CORE_zhessq_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) -{ - cham_uplo_t uplo; - int n; - CHAMELEON_Complex64_t *A; - int lda; - double *SCALESUMSQ; - - parsec_dtd_unpack_args( - this_task, &uplo, &n, &A, &lda, &SCALESUMSQ ); - - CORE_zhessq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]); - - (void)context; - return PARSEC_HOOK_RETURN_DONE; -} void INSERT_TASK_zhessq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) + cham_store_t storev, cham_uplo_t uplo, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); - - parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_zhessq_parsec, options->priority, "hessq", - sizeof(int), &uplo, VALUE, - sizeof(int), &n, VALUE, - PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, - sizeof(int), &lda, VALUE, - PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY, - PARSEC_DTD_ARG_END ); + INSERT_TASK_zsyssq( options, storev, uplo, n, + A, Am, An, lda, + SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c index 6c797829395817c83819a5cca0ab53601d25e295..16f19a21fad3581ec4a170d7fae3a90b657d5350 100644 --- a/runtime/parsec/codelets/codelet_zplssq.c +++ b/runtime/parsec/codelets/codelet_zplssq.c @@ -26,58 +26,23 @@ static inline int CORE_zplssq_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) { + cham_store_t storev; + int M; + int N; double *SCLSSQ_IN; double *SCLSSQ_OUT; parsec_dtd_unpack_args( - this_task, &SCLSSQ_IN, &SCLSSQ_OUT ); + this_task, &storev, &M, &N, &SCLSSQ_IN, &SCLSSQ_OUT ); - assert( SCLSSQ_OUT[0] >= 0. ); - if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) { - SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ))); - SCLSSQ_OUT[0] = SCLSSQ_IN[0]; - } else { - if ( SCLSSQ_OUT[0] > 0 ) { - SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ))); - } - } + CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT); (void)context; return PARSEC_HOOK_RETURN_DONE; } -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq) - * - * with scl and ssq such that - * - * ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) ) - * i - * - * The values of A(2*i+1) are assumed to be at least unity. - * The values of A(2*i) are assumed to be non-negative and scl is - * - * scl = max( A( 2*i ) ), - * i - * - * The routine makes only one pass through the matrix A. - * - ******************************************************************************* - * - * @param[in] M - * The number of couple (scale, sumsq) in the matrix A. - * - * @param[in] A - * The 2-by-M matrix. - * - * @param[out] result - * On exit, result contains scl * sqrt( ssq ) - * - */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, + cham_store_t storev, int M, int N, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) { @@ -85,6 +50,9 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplssq_parsec, options->priority, "plssq", + sizeof(int), &storev, VALUE, + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INPUT, PASSED_BY_REF, RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ), INOUT | AFFINITY, PARSEC_DTD_ARG_END ); @@ -94,24 +62,26 @@ static inline int CORE_zplssq2_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) { + int N; double *RESULT; parsec_dtd_unpack_args( - this_task, &RESULT ); + this_task, &N, &RESULT ); - RESULT[0] = RESULT[0] * sqrt( RESULT[1] ); + CORE_zplssq2(N, RESULT); (void)context; return PARSEC_HOOK_RETURN_DONE; } -void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, +void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zplssq2_parsec, options->priority, "plssq2", + sizeof(int), &N, VALUE, PASSED_BY_REF, RTBLKADDR( RESULT, double, RESULTm, RESULTn ), INOUT | AFFINITY, PARSEC_DTD_ARG_END ); } diff --git a/runtime/parsec/codelets/codelet_zsyssq.c b/runtime/parsec/codelets/codelet_zsyssq.c index 9f06c6d4c35769048c1817d92fafe694d5543f09..8e86035ccbaafd400985c1ffea92b9e27643e588 100644 --- a/runtime/parsec/codelets/codelet_zsyssq.c +++ b/runtime/parsec/codelets/codelet_zsyssq.c @@ -25,6 +25,7 @@ static inline int CORE_zsyssq_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) { + cham_store_t storev; cham_uplo_t uplo; int n; CHAMELEON_Complex64_t *A; @@ -32,16 +33,16 @@ CORE_zsyssq_parsec( parsec_execution_stream_t *context, double *SCALESUMSQ; parsec_dtd_unpack_args( - this_task, &uplo, &n, &A, &lda, &SCALESUMSQ ); + this_task, &storev, &uplo, &n, &A, &lda, &SCALESUMSQ ); - CORE_zsyssq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]); + CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ ); (void)context; return PARSEC_HOOK_RETURN_DONE; } void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, + cham_store_t storev, cham_uplo_t uplo, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { @@ -49,7 +50,8 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, parsec_dtd_taskpool_insert_task( PARSEC_dtd_taskpool, CORE_zsyssq_parsec, options->priority, "syssq", - sizeof(int), &uplo, VALUE, + sizeof(cham_store_t), &storev, VALUE, + sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, PASSED_BY_REF, RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT, sizeof(int), &lda, VALUE, diff --git a/runtime/quark/codelets/codelet_zgessq.c b/runtime/quark/codelets/codelet_zgessq.c index f63b6d99a6fc597a6b97549d24860a839f6364eb..bf5396e4206dbd5f9389377ac11a16c45b4e24d7 100644 --- a/runtime/quark/codelets/codelet_zgessq.c +++ b/runtime/quark/codelets/codelet_zgessq.c @@ -25,27 +25,29 @@ void CORE_zgessq_quark(Quark *quark) { + cham_store_t storev; int m; int n; CHAMELEON_Complex64_t *A; int lda; double *SCALESUMSQ; - quark_unpack_args_5( quark, m, n, A, lda, SCALESUMSQ ); - CORE_zgessq( m, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]); + quark_unpack_args_6( quark, storev, m, n, A, lda, SCALESUMSQ ); + CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ ); } void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - int m, int n, + cham_store_t storev, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); QUARK_Insert_Task(opt->quark, CORE_zgessq_quark, (Quark_Task_Flags*)opt, - sizeof(int), &m, VALUE, - sizeof(int), &n, VALUE, + sizeof(cham_store_t), &storev, VALUE, + sizeof(int), &m, VALUE, + sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, + sizeof(int), &lda, VALUE, sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zhessq.c b/runtime/quark/codelets/codelet_zhessq.c index 5d797c984297b70b6bbbe570825ba76e069859f9..b66dfd60a9dcda3747993cacec4b4d33b22e6370 100644 --- a/runtime/quark/codelets/codelet_zhessq.c +++ b/runtime/quark/codelets/codelet_zhessq.c @@ -21,31 +21,13 @@ */ #include "chameleon_quark.h" #include "chameleon/tasks_z.h" -#include "coreblas/coreblas_z.h" - -void CORE_zhessq_quark(Quark *quark) -{ - cham_uplo_t uplo; - int n; - CHAMELEON_Complex64_t *A; - int lda; - double *SCALESUMSQ; - - quark_unpack_args_5( quark, uplo, n, A, lda, SCALESUMSQ ); - CORE_zhessq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]); -} void INSERT_TASK_zhessq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) + cham_store_t storev, cham_uplo_t uplo, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - quark_option_t *opt = (quark_option_t*)(options->schedopt); - QUARK_Insert_Task(opt->quark, CORE_zhessq_quark, (Quark_Task_Flags*)opt, - sizeof(int), &uplo, VALUE, - sizeof(int), &n, VALUE, - sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, - sizeof(int), &lda, VALUE, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT, - 0); + INSERT_TASK_zsyssq( options, storev, uplo, n, + A, Am, An, lda, + SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c index 8744b7b3ec319ce347abd8f40d101cb1a92b3e07..70810682c9a89cfba122782dbdff65149a17a09e 100644 --- a/runtime/quark/codelets/codelet_zplssq.c +++ b/runtime/quark/codelets/codelet_zplssq.c @@ -26,20 +26,15 @@ void CORE_zplssq_quark(Quark *quark) { + cham_store_t storev; + int M; + int N; double *SCLSSQ_IN; double *SCLSSQ_OUT; - quark_unpack_args_2( quark, SCLSSQ_IN, SCLSSQ_OUT ); + quark_unpack_args_5( quark, storev, M, N, SCLSSQ_IN, SCLSSQ_OUT ); - assert( SCLSSQ_OUT[0] >= 0. ); - if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) { - SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ))); - SCLSSQ_OUT[0] = SCLSSQ_IN[0]; - } else { - if ( SCLSSQ_OUT[0] > 0 ) { - SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ))); - } - } + CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT); } /** @@ -74,30 +69,48 @@ void CORE_zplssq_quark(Quark *quark) * */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, + cham_store_t storev, int M, int N, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn, const CHAM_desc_t *SCLSSQ, int SCLSSQm, int SCLSSQn ) { + int sizein = M*N; + int sizeout; + + if ( storev == ChamColumnwise ) { + sizeout = 2*N; + } else if ( storev == ChamRowwise ) { + sizeout = 2*M; + } else { + sizeout = 2; + } + quark_option_t *opt = (quark_option_t*)(options->schedopt); QUARK_Insert_Task(opt->quark, CORE_zplssq_quark, (Quark_Task_Flags*)opt, - sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INPUT, - sizeof(double)*2, RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn), INOUT, + sizeof(int), &storev, VALUE, + sizeof(int), &M, VALUE, + sizeof(int), &N, VALUE, + sizeof(double)*sizein, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INPUT, + sizeof(double)*sizeout, RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn), INOUT, 0); } void CORE_zplssq2_quark(Quark *quark) { + int N; double *RESULT; - quark_unpack_args_1( quark, RESULT ); - RESULT[0] = RESULT[0] * sqrt( RESULT[1] ); + quark_unpack_args_2( quark, N, RESULT ); + + CORE_zplssq2(N, RESULT); } -void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, +void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); QUARK_Insert_Task(opt->quark, CORE_zplssq2_quark, (Quark_Task_Flags*)opt, - sizeof(double)*2, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT, + sizeof(int), &N, VALUE, + sizeof(double)*2*N, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT, 0); } diff --git a/runtime/quark/codelets/codelet_zsyssq.c b/runtime/quark/codelets/codelet_zsyssq.c index 9611e2082f9fd7c971f196d3318c66edb4353a31..cf1f39442c015139d869ae1fc08a5e4a79bd83a9 100644 --- a/runtime/quark/codelets/codelet_zsyssq.c +++ b/runtime/quark/codelets/codelet_zsyssq.c @@ -25,24 +25,26 @@ void CORE_zsyssq_quark(Quark *quark) { + cham_store_t storev; cham_uplo_t uplo; int n; CHAMELEON_Complex64_t *A; int lda; double *SCALESUMSQ; - quark_unpack_args_5( quark, uplo, n, A, lda, SCALESUMSQ ); - CORE_zsyssq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]); + quark_unpack_args_6( quark, storev, uplo, n, A, lda, SCALESUMSQ ); + CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ ); } void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, + cham_store_t storev, cham_uplo_t uplo, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); QUARK_Insert_Task(opt->quark, CORE_zsyssq_quark, (Quark_Task_Flags*)opt, - sizeof(int), &uplo, VALUE, + sizeof(cham_store_t), &storev, VALUE, + sizeof(int), &uplo, VALUE, sizeof(int), &n, VALUE, sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT, sizeof(int), &lda, VALUE, diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 35aea3122a2aa896ef9044c3a37c13191235296f..dddcb6994da86b889f7d484828d86a9d71cc9814 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -54,8 +54,8 @@ CHAMELEON_CL_CB(zsytrf_nopiv, starpu_matrix_get_nx(task->handles[0]), 0, 0, CHAMELEON_CL_CB(zplgsy, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zplrnt, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zbuild, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) -CHAMELEON_CL_CB(zplssq, 1, 1, 0, 4) -CHAMELEON_CL_CB(zplssq2, 1, 1, 0, 1) +CHAMELEON_CL_CB(zplssq, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zplssq2, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0, 2*N) CHAMELEON_CL_CB(zpotrf, starpu_matrix_get_nx(task->handles[0]), 0, 0, (1./3.)*M* M*M) CHAMELEON_CL_CB(zssssm, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), M*M*(2.*M+starpu_matrix_get_nx(task->handles[2]))) CHAMELEON_CL_CB(zsymm, starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0, 2.*M*M *N) diff --git a/runtime/starpu/codelets/codelet_zgessq.c b/runtime/starpu/codelets/codelet_zgessq.c index 3c37cb1c6858ab4af43475fd19e17f199d2cea65..c9141aefbd8aaa67be87429b7e0f29a34e2344eb 100644 --- a/runtime/starpu/codelets/codelet_zgessq.c +++ b/runtime/starpu/codelets/codelet_zgessq.c @@ -25,6 +25,7 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zgessq_cpu_func(void *descr[], void *cl_arg) { + cham_store_t storev; int m; int n; CHAMELEON_Complex64_t *A; @@ -33,8 +34,8 @@ static void cl_zgessq_cpu_func(void *descr[], void *cl_arg) A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n, &lda); - CORE_zgessq( m, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1] ); + starpu_codelet_unpack_args(cl_arg, &storev, &m, &n, &lda); + CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -44,7 +45,7 @@ static void cl_zgessq_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zgessq, 2, cl_zgessq_cpu_func) void INSERT_TASK_zgessq( const RUNTIME_option_t *options, - int m, int n, + cham_store_t storev, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { @@ -58,6 +59,7 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, starpu_insert_task( starpu_mpi_codelet(codelet), + STARPU_VALUE, &storev, sizeof(cham_store_t), STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), diff --git a/runtime/starpu/codelets/codelet_zhessq.c b/runtime/starpu/codelets/codelet_zhessq.c index 724f5e6fba78f76a62541b8e1a0e40f207716fe8..c1b18c8bc388df6890ff44a1e74058036987451c 100644 --- a/runtime/starpu/codelets/codelet_zhessq.c +++ b/runtime/starpu/codelets/codelet_zhessq.c @@ -22,51 +22,12 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zhessq_cpu_func(void *descr[], void *cl_arg) -{ - cham_uplo_t uplo; - int n; - CHAMELEON_Complex64_t *A; - int lda; - double *SCALESUMSQ; - - A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); - SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda); - CORE_zhessq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1] ); -} -#endif /* !defined(CHAMELEON_SIMULATION) */ - -/* - * Codelet definition - */ -CODELETS_CPU(zhessq, 2, cl_zhessq_cpu_func) - void INSERT_TASK_zhessq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, - const CHAM_desc_t *A, int Am, int An, int lda, - const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) + cham_store_t storev, cham_uplo_t uplo, int n, + const CHAM_desc_t *A, int Am, int An, int lda, + const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { - struct starpu_codelet *codelet = &cl_zhessq; - void (*callback)(void*) = options->profiling ? cl_zgessq_callback : NULL; - - CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_R(A, Am, An); - CHAMELEON_ACCESS_RW(SCALESUMSQ, SCALESUMSQm, SCALESUMSQn); - CHAMELEON_END_ACCESS_DECLARATION; - - starpu_insert_task( - starpu_mpi_codelet(codelet), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_VALUE, &lda, sizeof(int), - STARPU_RW, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, -#if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zhessq", -#endif - 0); + INSERT_TASK_zsyssq( options, storev, uplo, n, + A, Am, An, lda, + SCALESUMSQ, SCALESUMSQm, SCALESUMSQn ); } diff --git a/runtime/starpu/codelets/codelet_zplssq.c b/runtime/starpu/codelets/codelet_zplssq.c index 683aedf2f070bccb037af1139abdd9d3c24ab15c..96a41eb75bf9e30d9ad939acd8c29f89c71cc7c0 100644 --- a/runtime/starpu/codelets/codelet_zplssq.c +++ b/runtime/starpu/codelets/codelet_zplssq.c @@ -26,21 +26,17 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zplssq_cpu_func(void *descr[], void *cl_arg) { + cham_store_t storev; + int M; + int N; double *SCLSSQ_IN; double *SCLSSQ_OUT; + starpu_codelet_unpack_args(cl_arg, &storev, &M, &N); SCLSSQ_IN = (double *)STARPU_MATRIX_GET_PTR(descr[0]); SCLSSQ_OUT = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - assert( SCLSSQ_OUT[0] >= 0. ); - if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) { - SCLSSQ_OUT[1] = SCLSSQ_IN[1] + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ))); - SCLSSQ_OUT[0] = SCLSSQ_IN[0]; - } else { - if ( SCLSSQ_OUT[0] > 0 ) { - SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1] * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ))); - } - } + CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT); (void)cl_arg; } @@ -51,40 +47,10 @@ static void cl_zplssq_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zplssq, 2, cl_zplssq_cpu_func) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq) - * - * with scl and ssq such that - * - * ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) ) - * i - * - * The values of A(2*i+1) are assumed to be at least unity. - * The values of A(2*i) are assumed to be non-negative and scl is - * - * scl = max( A( 2*i ) ), - * i - * - * The routine makes only one pass through the matrix A. - * - ******************************************************************************* - * - * @param[in] M - * The number of couple (scale, sumsq) in the matrix A. - * - * @param[in] A - * The 2-by-M matrix. - * - * @param[out] result - * On exit, result contains scl * sqrt( ssq ) - * - */ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, - const CHAM_desc_t *SCLSSQ_IN, int SCLSSQ_INm, int SCLSSQ_INn, - const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn ) + cham_store_t storev, int M, int N, + const CHAM_desc_t *SCLSSQ_IN, int SCLSSQ_INm, int SCLSSQ_INn, + const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn ) { struct starpu_codelet *codelet = &cl_zplssq; void (*callback)(void*) = options->profiling ? cl_zplssq_callback : NULL; @@ -96,6 +62,9 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, starpu_insert_task( starpu_mpi_codelet(codelet), + STARPU_VALUE, &storev, sizeof(int), + STARPU_VALUE, &M, sizeof(int), + STARPU_VALUE, &N, sizeof(int), STARPU_R, RTBLKADDR( SCLSSQ_IN, double, SCLSSQ_INm, SCLSSQ_INn ), STARPU_RW, RTBLKADDR( SCLSSQ_OUT, double, SCLSSQ_OUTm, SCLSSQ_OUTn ), STARPU_PRIORITY, options->priority, @@ -109,11 +78,13 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, #if !defined(CHAMELEON_SIMULATION) static void cl_zplssq2_cpu_func(void *descr[], void *cl_arg) { + int N; double *RESULT; + starpu_codelet_unpack_args(cl_arg, &N); RESULT = (double *)STARPU_MATRIX_GET_PTR(descr[0]); - RESULT[0] = RESULT[0] * sqrt( RESULT[1] ); + CORE_zplssq2(N, RESULT); (void)cl_arg; } @@ -124,7 +95,7 @@ static void cl_zplssq2_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU(zplssq2, 1, cl_zplssq2_cpu_func) -void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, +void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, const CHAM_desc_t *RESULT, int RESULTm, int RESULTn ) { struct starpu_codelet *codelet = &cl_zplssq2; @@ -136,6 +107,7 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, starpu_insert_task( starpu_mpi_codelet(codelet), + STARPU_VALUE, &N, sizeof(int), STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_zsyssq.c b/runtime/starpu/codelets/codelet_zsyssq.c index 543b6e1b6b5ef298c47f9dee64c1481596df35a5..30980d669565b05567026c20649459be00ac8549 100644 --- a/runtime/starpu/codelets/codelet_zsyssq.c +++ b/runtime/starpu/codelets/codelet_zsyssq.c @@ -25,6 +25,7 @@ #if !defined(CHAMELEON_SIMULATION) static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg) { + cham_store_t storev; cham_uplo_t uplo; int n; CHAMELEON_Complex64_t *A; @@ -33,8 +34,8 @@ static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg) A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda); - CORE_zsyssq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1] ); + starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &n, &lda); + CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -44,7 +45,7 @@ static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg) CODELETS_CPU(zsyssq, 2, cl_zsyssq_cpu_func) void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, - cham_uplo_t uplo, int n, + cham_store_t storev, cham_uplo_t uplo, int n, const CHAM_desc_t *A, int Am, int An, int lda, const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn ) { @@ -58,6 +59,7 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, starpu_insert_task( starpu_mpi_codelet(codelet), + STARPU_VALUE, &storev, sizeof(cham_store_t), STARPU_VALUE, &uplo, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),