Commit 2d0a51cb authored by PRUVOST Florent's avatar PRUVOST Florent Committed by Mathieu Faverge

update ge/he/syssq algos to make them more generic and get the possibility to...

update ge/he/syssq algos to make them more generic and get the possibility to accumulate squares over columns or rows
parent 3ca82fe8
...@@ -329,13 +329,14 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -329,13 +329,14 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
else { else {
INSERT_TASK_zgessq( INSERT_TASK_zgessq(
options, options,
ChamEltwise,
tempmm, tempnn, tempmm, tempnn,
A(m, n), ldam, W( Welt, m, n) ); A(m, n), ldam, W( Welt, m, n) );
} }
if ( n >= Q ) { if ( n >= Q ) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, n), W( Welt, m, n%Q) ); options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, n%Q) );
} }
} }
...@@ -345,7 +346,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -345,7 +346,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
*/ */
for(n = 1; n < Q; n++) { for(n = 1; n < Q; n++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, n), W( Welt, m, 0) ); options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
} }
} }
...@@ -355,7 +356,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -355,7 +356,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
*/ */
for(m = P; m < MT; m++) { for(m = P; m < MT; m++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, 0), W( Welt, m%P, 0) ); options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
} }
/** /**
...@@ -364,11 +365,11 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_ ...@@ -364,11 +365,11 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
*/ */
for(m = 1; m < P; m++) { for(m = 1; m < P; m++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, 0), W( Welt, 0, 0) ); options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
} }
INSERT_TASK_dplssq2( INSERT_TASK_dplssq2(
options, W( Welt, 0, 0) ); options, 1, W( Welt, 0, 0) );
} }
/** /**
......
...@@ -241,21 +241,21 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, ...@@ -241,21 +241,21 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
if ( n == m ) { if ( n == m ) {
if ( trans == ChamConjTrans) { if ( trans == ChamConjTrans) {
INSERT_TASK_zhessq( INSERT_TASK_zhessq(
options, uplo, tempmm, options, ChamEltwise, uplo, tempmm,
A(m, n), ldam, W( Welt, m, n) ); A(m, n), ldam, W( Welt, m, n) );
} }
else { else {
INSERT_TASK_zsyssq( INSERT_TASK_zsyssq(
options, uplo, tempmm, options, ChamEltwise, uplo, tempmm,
A(m, n), ldam, W( Welt, m, n) ); A(m, n), ldam, W( Welt, m, n) );
} }
} }
else { else {
INSERT_TASK_zgessq( INSERT_TASK_zgessq(
options, tempmm, tempnn, options, ChamEltwise, tempmm, tempnn,
A(m, n), ldam, W( Welt, m, n) ); A(m, n), ldam, W( Welt, m, n) );
INSERT_TASK_zgessq( INSERT_TASK_zgessq(
options, tempmm, tempnn, options, ChamEltwise, tempmm, tempnn,
A(m, n), ldam, W( Welt, n, m) ); A(m, n), ldam, W( Welt, n, m) );
} }
} }
...@@ -264,7 +264,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, ...@@ -264,7 +264,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
for(m = 0; m < MT; m++) { for(m = 0; m < MT; m++) {
for(n = Q; n < NT; n++) { for(n = Q; n < NT; n++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, n), W( Welt, m, n%Q) ); options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, n%Q) );
} }
/** /**
...@@ -273,7 +273,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, ...@@ -273,7 +273,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
*/ */
for(n = 1; n < Q; n++) { for(n = 1; n < Q; n++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, n), W( Welt, m, 0) ); options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
} }
} }
...@@ -283,7 +283,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, ...@@ -283,7 +283,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
*/ */
for(m = P; m < MT; m++) { for(m = P; m < MT; m++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, 0), W( Welt, m%P, 0) ); options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
} }
/** /**
...@@ -292,11 +292,11 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo, ...@@ -292,11 +292,11 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
*/ */
for(m = 1; m < P; m++) { for(m = 1; m < P; m++) {
INSERT_TASK_dplssq( INSERT_TASK_dplssq(
options, W( Welt, m, 0), W( Welt, 0, 0) ); options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
} }
INSERT_TASK_dplssq2( INSERT_TASK_dplssq2(
options, W( Welt, 0, 0) ); options, 1, W( Welt, 0, 0) );
} }
/** /**
......
...@@ -66,6 +66,7 @@ set(ZSRC ...@@ -66,6 +66,7 @@ set(ZSRC
core_zplghe.c core_zplghe.c
core_zplgsy.c core_zplgsy.c
core_zplrnt.c core_zplrnt.c
core_zplssq.c
core_zpotrf.c core_zpotrf.c
core_zssssm.c core_zssssm.c
core_zsymm.c core_zsymm.c
......
...@@ -15,23 +15,98 @@ ...@@ -15,23 +15,98 @@
* @comment This file has been automatically generated * @comment This file has been automatically generated
* from Plasma 2.6.0 for CHAMELEON 0.9.2 * from Plasma 2.6.0 for CHAMELEON 0.9.2
* @author Mathieu Faverge * @author Mathieu Faverge
* @author Florent Pruvost
* @date 2014-11-16 * @date 2014-11-16
* @precisions normal z -> c d s * @precisions normal z -> c d s
* *
*/ */
#include <math.h> #include <math.h>
#include "coreblas/lapacke.h" #include "coreblas/lapacke.h"
#include "coreblas/sumsq_update.h"
#include "coreblas.h" #include "coreblas.h"
#define UPDATE( __nb, __value ) \ /**
if (__value != 0. ){ \ * @brief Subcase storev == ChamColumnwise of CORE_zgessq()
if ( *scale < __value ) { \ */
*sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \ static inline int
*scale = __value; \ CORE_zgessq_col( int M, int N,
} else { \ const CHAMELEON_Complex64_t *A, int LDA,
*sumsq = *sumsq + __nb * ( __value / *scale ) * ( __value / *scale ); \ double *sclssq )
} \ {
int i, j;
double tmp;
double *ptr, *tmpScale, *tmpSumsq;
for(j=0; j<N; j++) {
ptr = (double*) ( A + j * LDA );
tmpScale = sclssq+2*j;
tmpSumsq = sclssq+2*j+1;
for(i=0; i<M; i++, ptr++) {
tmp = fabs(*ptr);
sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
tmp = fabs(*ptr);
sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
#endif
}
} }
return CHAMELEON_SUCCESS;
}
/**
* @brief Subcase storev == ChamRowwise of CORE_zgessq()
*/
static inline int
CORE_zgessq_row( int M, int N,
const CHAMELEON_Complex64_t *A, int LDA,
double *sclssq )
{
int i, j;
double tmp;
double *ptr, *tmpScale, *tmpSumsq;
for(j=0; j<N; j++) {
ptr = (double*) ( A + j * LDA );
tmpScale = sclssq;
tmpSumsq = sclssq+1;
for(i=0; i<M; i++, ptr++, tmpScale+=2, tmpSumsq+=2) {
tmp = fabs(*ptr);
sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
tmp = fabs(*ptr);
sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
#endif
}
}
return CHAMELEON_SUCCESS;
}
/**
* @brief Subcase storev == ChamEltwise of CORE_zgessq()
*/
static inline int
CORE_zgessq_elt( int M, int N,
const CHAMELEON_Complex64_t *A, int LDA,
double *sclssq )
{
int i, j;
double tmp;
double *ptr;
for(j=0; j<N; j++) {
ptr = (double*) ( A + j * LDA );
for(i=0; i<M; i++, ptr++) {
tmp = fabs(*ptr);
sumsq_update( 1., sclssq, sclssq+1, &tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
tmp = fabs(*ptr);
sumsq_update( 1., sclssq, sclssq+1, &tmp );
#endif
}
}
return CHAMELEON_SUCCESS;
}
/** /**
* *
...@@ -60,6 +135,12 @@ ...@@ -60,6 +135,12 @@
* *
******************************************************************************* *******************************************************************************
* *
* @param[in] storev
* Specifies whether the sums are made per column or row.
* = ChamColumnwise: Computes the sum of squares on each column
* = ChamRowwise: Computes the sum of squares on each row
* = ChamEltwise: Computes the sum of squares on all the matrix
*
* @param[in] M * @param[in] M
* The number of rows in the tile A. * The number of rows in the tile A.
* *
...@@ -72,13 +153,16 @@ ...@@ -72,13 +153,16 @@
* @param[in] LDA * @param[in] LDA
* The leading dimension of the tile A. LDA >= max(1,M). * The leading dimension of the tile A. LDA >= max(1,M).
* *
* @param[in,out] scale * @param[in,out] sclssq
* On entry, the value scale in the equation above. * Dimension: (2,K)
* On exit, scale is overwritten with the value scl. * if storev == ChamColumnwise, K = N
* * if storev == ChamRowwise, K = M
* @param[in,out] sumsq * if storev == ChamEltwise, K = 1
* On entry, the value sumsq in the equation above. * On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1])
* On exit, SUMSQ is overwritten with the value ssq. * which corresponds to (scale, sumsq) in the equation below
* ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq,
* respectively for the columns, the rows and the full matrix
* On exit, each couple is overwritten with the final result (scl, ssq).
* *
******************************************************************************* *******************************************************************************
* *
...@@ -86,26 +170,18 @@ ...@@ -86,26 +170,18 @@
* @retval -k, the k-th argument had an illegal value * @retval -k, the k-th argument had an illegal value
* *
*/ */
int CORE_zgessq(int M, int N, int CORE_zgessq( cham_store_t storev, int M, int N,
const CHAMELEON_Complex64_t *A, int LDA, const CHAMELEON_Complex64_t *A, int LDA,
double *scale, double *sumsq) double *sclssq )
{ {
int i, j; if (storev == ChamColumnwise) {
double tmp; CORE_zgessq_col( M, N, A, LDA, sclssq );
double *ptr; }
else if (storev == ChamRowwise) {
for(j=0; j<N; j++) { CORE_zgessq_row( M, N, A, LDA, sclssq );
ptr = (double*) ( A + j * LDA ); }
for(i=0; i<M; i++, ptr++) { else {
tmp = fabs(*ptr); CORE_zgessq_elt( M, N, A, LDA, sclssq );
UPDATE( 1., tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
tmp = fabs(*ptr);
UPDATE( 1., tmp );
#endif
}
} }
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -19,20 +19,8 @@ ...@@ -19,20 +19,8 @@
* @precisions normal z -> c * @precisions normal z -> c
* *
*/ */
#include <math.h>
#include "coreblas/lapacke.h"
#include "coreblas.h" #include "coreblas.h"
#define UPDATE( __nb, __value ) \
if (__value != 0. ){ \
if ( *scale < __value ) { \
*sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \
*scale = __value; \
} else { \
*sumsq = *sumsq + __nb * ( __value / *scale ) * ( __value / *scale ); \
} \
}
/** /**
* *
* @ingroup CORE_CHAMELEON_Complex64_t * @ingroup CORE_CHAMELEON_Complex64_t
...@@ -56,18 +44,24 @@ ...@@ -56,18 +44,24 @@
* SCALE and SUMSQ are overwritten by scl and ssq respectively. * SCALE and SUMSQ are overwritten by scl and ssq respectively.
* *
* The routine makes only one pass through the tile triangular part of the * The routine makes only one pass through the tile triangular part of the
* hermitian tile A defined by uplo. * symmetric tile A defined by uplo.
* See also LAPACK zlassq.f * See also LAPACK zlassq.f
* *
******************************************************************************* *******************************************************************************
* *
* @param[in] storev
* Specifies whether the sums are made per column or row.
* = ChamColumnwise: Computes the sum of squares on each column
* = ChamRowwise: Computes the sum of squares on each row
* = ChamEltwise: Computes the sum of squares on all the matrix
*
* @param[in] uplo * @param[in] uplo
* Specifies whether the upper or lower triangular part of * Specifies whether the upper or lower triangular part of
* the hermitian matrix A is to be referenced as follows: * the symmetric matrix A is to be referenced as follows:
* = ChamLower: Only the lower triangular part of the * = ChamLower: Only the lower triangular part of the
* hermitian matrix A is to be referenced. * symmetric matrix A is to be referenced.
* = ChamUpper: Only the upper triangular part of the * = ChamUpper: Only the upper triangular part of the
* hermitian matrix A is to be referenced. * symmetric matrix A is to be referenced.
* *
* @param[in] N * @param[in] N
* The number of columns and rows in the tile A. * The number of columns and rows in the tile A.
...@@ -78,13 +72,16 @@ ...@@ -78,13 +72,16 @@
* @param[in] LDA * @param[in] LDA
* The leading dimension of the tile A. LDA >= max(1,N). * The leading dimension of the tile A. LDA >= max(1,N).
* *
* @param[in,out] scale * @param[in,out] sclssq
* On entry, the value scale in the equation above. * Dimension: (2,K)
* On exit, scale is overwritten with the value scl. * if storev == ChamColumnwise, K = N
* * if storev == ChamRowwise, K = N
* @param[in,out] sumsq * if storev == ChamEltwise, K = 1
* On entry, the value sumsq in the equation above. * On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1])
* On exit, SUMSQ is overwritten with the value ssq. * which corresponds to (scale, sumsq) in the equation below
* ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq,
* respectively for the columns, the rows and the full matrix
* On exit, each couple is overwritten with the final result (scl, ssq).
* *
******************************************************************************* *******************************************************************************
* *
...@@ -92,65 +89,9 @@ ...@@ -92,65 +89,9 @@
* @retval -k, the k-th argument had an illegal value * @retval -k, the k-th argument had an illegal value
* *
*/ */
int CORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N,
int CORE_zhessq(cham_uplo_t uplo, int N, const CHAMELEON_Complex64_t *A, int LDA,
const CHAMELEON_Complex64_t *A, int LDA, double *sclssq )
double *scale, double *sumsq)
{ {
int i, j; return CORE_zsyssq( storev, uplo, N, A, LDA, sclssq );
double tmp;
double *ptr;
if ( uplo == ChamUpper ) {
for(j=0; j<N; j++) {
ptr = (double*) ( A + j * LDA );
for(i=0; i<j; i++, ptr++) {
tmp = fabs(*ptr);
UPDATE( 2., tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
tmp = fabs(*ptr);
UPDATE( 2., tmp );
#endif
}
/* Diagonal */
tmp = fabs(*ptr);
UPDATE( 1., tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
#endif
}
} else {
for(j=0; j<N; j++) {
ptr = (double*) ( A + j * LDA + j);
/* Diagonal */
tmp = fabs(*ptr);
UPDATE( 1., tmp );
ptr++;
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
#endif
for(i=j+1; i<N; i++, ptr++) {
tmp = fabs(*ptr);
UPDATE( 2., tmp );
#if defined(PRECISION_z) || defined(PRECISION_c)
ptr++;
tmp = fabs(*ptr);
UPDATE( 2., tmp );
#endif
}
}
}
return CHAMELEON_SUCCESS;
} }
/**
*
* @file core_zplssq.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon core_zplssq CPU kernel
*
* @version 0.9.2
* @author Mathieu Faverge
* @author Florent Pruvost
* @date 2019-04-01
* @precisions normal z -> c d s
*
*/
#include <math.h>
#include "coreblas/lapacke.h"
#include "coreblas/sumsq_update.h"
#include "coreblas.h"
/**
* @brief Subcase storev == ChamColumnwise of CORE_zplssq()
*/
static inline int
CORE_zplssq_col( int M, int N,
double *sclssqin,
double *sclssqout )
{
int i, j;
double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut;
tmpScaleIn = sclssqin;
tmpSumsqIn = sclssqin+1;
tmpScaleOut = sclssqout;
tmpSumsqOut = sclssqout+1;
for(j=0; j<N; j++) {
for(i=0; i<M; i++) {
sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
tmpScaleIn+=2;
tmpSumsqIn+=2;
}
tmpScaleOut+=2;
tmpSumsqOut+=2;
}
return CHAMELEON_SUCCESS;
}
/**
* @brief Subcase storev == ChamRowwise of CORE_zplssq()
*/
static inline int
CORE_zplssq_row( int M, int N,
double *sclssqin,
double *sclssqout )
{
int i, j;
double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut;
tmpScaleIn = sclssqin;
tmpSumsqIn = sclssqin+1;
for(j=0; j<N; j++) {
tmpScaleOut = sclssqout;
tmpSumsqOut = sclssqout+1;
for(i=0; i<M; i++) {
sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
tmpScaleIn+=2;
tmpSumsqIn+=2;
tmpScaleOut+=2;
tmpSumsqOut+=2;
}
}
return CHAMELEON_SUCCESS;
}
/**
* @brief Subcase storev == ChamEltwise of CORE_zplssq()