Mentions légales du service

Skip to content
Snippets Groups Projects
Commit e526d7df authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'hotfix/laxxx' into 'master'

Fix auxiliary routines

See merge request !169
parents 125faf10 89116d0a
No related branches found
No related tags found
1 merge request!169Fix auxiliary routines
...@@ -29,10 +29,10 @@ ...@@ -29,10 +29,10 @@
* Parallel initialization a 2-D array A to BETA on the diagonal and * Parallel initialization a 2-D array A to BETA on the diagonal and
* ALPHA on the offdiagonals. * ALPHA on the offdiagonals.
*/ */
void chameleon_pzlaset(cham_uplo_t uplo, void chameleon_pzlaset( cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta,
CHAM_desc_t *A, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
...@@ -71,18 +71,28 @@ void chameleon_pzlaset(cham_uplo_t uplo, ...@@ -71,18 +71,28 @@ void chameleon_pzlaset(cham_uplo_t uplo,
} }
} }
else if (uplo == ChamUpper) { else if (uplo == ChamUpper) {
for (j = 0; j < A->nt; j++){ for (i = 0; i < A->mt; i++) {
tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb;
for (i = 0; i < chameleon_min(j+1, A->mt); i++){ ldai = BLKLDD(A, i);
tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb;
ldai = BLKLDD(A, i); if ( i < A->nt ) {
INSERT_TASK_zlaset( j = i;
&options, tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb;
ChamUpperLower, tempim, tempjn,
alpha, (i == j) ? beta : alpha, INSERT_TASK_zlaset(
A(i, j), ldai); &options,
} uplo, tempim, tempjn,
} alpha, beta, A(i, j), ldai);
}
for (j = i+1; j < A->nt; j++) {
tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb;
INSERT_TASK_zlaset(
&options,
ChamUpperLower, tempim, tempjn,
alpha, alpha, A(i, j), ldai);
}
}
} }
else { else {
for (i = 0; i < A->mt; i++){ for (i = 0; i < A->mt; i++){
......
...@@ -12,9 +12,6 @@ ...@@ -12,9 +12,6 @@
* @brief Chameleon zplghe parallel algorithm * @brief Chameleon zplghe parallel algorithm
* *
* @version 0.9.2 * @version 0.9.2
* @comment This file is a copy from pzplghe.c
* wich has been automatically generated
* from Plasma 2.5.0 for CHAMELEON 0.9.2
* @author Mathieu Faverge * @author Mathieu Faverge
* @author Emmanuel Agullo * @author Emmanuel Agullo
* @author Cedric Castagnede * @author Cedric Castagnede
...@@ -31,13 +28,13 @@ ...@@ -31,13 +28,13 @@
* chameleon_pzplghe - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles. * chameleon_pzplghe - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles.
*/ */
void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A,
unsigned long long int seed, unsigned long long int seed,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
int m, n; int m, n, minmn;
int ldam; int ldam;
int tempmm, tempnn; int tempmm, tempnn;
...@@ -47,16 +44,15 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, ...@@ -47,16 +44,15 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A,
} }
RUNTIME_options_init(&options, chamctxt, sequence, request); RUNTIME_options_init(&options, chamctxt, sequence, request);
for (m = 0; m < A->mt; m++) { minmn = chameleon_min( A->mt, A->nt );
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; switch ( uplo ) {
ldam = BLKLDD(A, m); case ChamLower:
for (n = 0; n < minmn; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
/* for (m = n; m < A->mt; m++) {
* ChamLower tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
*/ ldam = BLKLDD(A, m);
if (uplo == ChamLower) {
for (n = 0; n <= m; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
options.priority = m + n; options.priority = m + n;
INSERT_TASK_zplghe( INSERT_TASK_zplghe(
...@@ -65,10 +61,13 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, ...@@ -65,10 +61,13 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A,
A->m, m*A->mb, n*A->nb, seed ); A->m, m*A->mb, n*A->nb, seed );
} }
} }
/* break;
* ChamUpper
*/ case ChamUpper:
else if (uplo == ChamUpper) { for (m = 0; m < minmn; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
for (n = m; n < A->nt; n++) { for (n = m; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
...@@ -79,13 +78,17 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, ...@@ -79,13 +78,17 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A,
A->m, m*A->mb, n*A->nb, seed ); A->m, m*A->mb, n*A->nb, seed );
} }
} }
/*
* ChamUpperLower case ChamUpperLower:
*/ default:
else { for (m = 0; m < A->mt; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
for (n = 0; n < A->nt; n++) { for (n = 0; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
options.priority = m + n;
INSERT_TASK_zplghe( INSERT_TASK_zplghe(
&options, &options,
bump, tempmm, tempnn, A(m, n), ldam, bump, tempmm, tempnn, A(m, n), ldam,
......
...@@ -12,9 +12,6 @@ ...@@ -12,9 +12,6 @@
* @brief Chameleon zplgsy parallel algorithm * @brief Chameleon zplgsy parallel algorithm
* *
* @version 0.9.2 * @version 0.9.2
* @comment This file is a copy of pzplgsy.c,
wich has been automatically generated
* from Plasma 2.5.0 for CHAMELEON 0.9.2
* @author Mathieu Faverge * @author Mathieu Faverge
* @author Emmanuel Agullo * @author Emmanuel Agullo
* @author Cedric Castagnede * @author Cedric Castagnede
...@@ -31,13 +28,13 @@ ...@@ -31,13 +28,13 @@
* chameleon_pzplgsy - Generate a random symmetric (positive definite if 'bump' is large enough) half-matrix by tiles. * chameleon_pzplgsy - Generate a random symmetric (positive definite if 'bump' is large enough) half-matrix by tiles.
*/ */
void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_t *A, void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_t *A,
unsigned long long int seed, unsigned long long int seed,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_option_t options; RUNTIME_option_t options;
int m, n; int m, n, minmn;
int ldam; int ldam;
int tempmm, tempnn; int tempmm, tempnn;
...@@ -47,16 +44,15 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ ...@@ -47,16 +44,15 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_
} }
RUNTIME_options_init(&options, chamctxt, sequence, request); RUNTIME_options_init(&options, chamctxt, sequence, request);
for (m = 0; m < A->mt; m++) { minmn = chameleon_min( A->mt, A->nt );
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; switch ( uplo ) {
ldam = BLKLDD(A, m); case ChamLower:
for (n = 0; n < minmn; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
/* for (m = n; m < A->mt; m++) {
* ChamLower tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
*/ ldam = BLKLDD(A, m);
if (uplo == ChamLower) {
for (n = 0; n <= m; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
options.priority = m + n; options.priority = m + n;
INSERT_TASK_zplgsy( INSERT_TASK_zplgsy(
...@@ -65,10 +61,13 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ ...@@ -65,10 +61,13 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_
A->m, m*A->mb, n*A->nb, seed ); A->m, m*A->mb, n*A->nb, seed );
} }
} }
/* break;
* ChamUpper
*/ case ChamUpper:
else if (uplo == ChamUpper) { for (m = 0; m < minmn; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
for (n = m; n < A->nt; n++) { for (n = m; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
...@@ -79,13 +78,17 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ ...@@ -79,13 +78,17 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_
A->m, m*A->mb, n*A->nb, seed ); A->m, m*A->mb, n*A->nb, seed );
} }
} }
/*
* ChamUpperLower case ChamUpperLower:
*/ default:
else { for (m = 0; m < A->mt; m++) {
tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
ldam = BLKLDD(A, m);
for (n = 0; n < A->nt; n++) { for (n = 0; n < A->nt; n++) {
tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
options.priority = m + n;
INSERT_TASK_zplgsy( INSERT_TASK_zplgsy(
&options, &options,
bump, tempmm, tempnn, A(m, n), ldam, bump, tempmm, tempnn, A(m, n), ldam,
......
...@@ -179,7 +179,7 @@ int CHAMELEON_zlascal( cham_uplo_t uplo, int M, int N, ...@@ -179,7 +179,7 @@ int CHAMELEON_zlascal( cham_uplo_t uplo, int M, int N,
* *
*/ */
int CHAMELEON_zlascal_Tile( cham_uplo_t uplo, int CHAMELEON_zlascal_Tile( cham_uplo_t uplo,
CHAMELEON_Complex64_t alpha, CHAM_desc_t *A ) CHAMELEON_Complex64_t alpha, CHAM_desc_t *A )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
RUNTIME_sequence_t *sequence = NULL; RUNTIME_sequence_t *sequence = NULL;
......
...@@ -243,7 +243,7 @@ int CHAMELEON_zpotrf_Tile( cham_uplo_t uplo, CHAM_desc_t *A ) ...@@ -243,7 +243,7 @@ int CHAMELEON_zpotrf_Tile( cham_uplo_t uplo, CHAM_desc_t *A )
* *
*/ */
int CHAMELEON_zpotrf_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, int CHAMELEON_zpotrf_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A,
RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
{ {
CHAM_context_t *chamctxt; CHAM_context_t *chamctxt;
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
* *
*/ */
#include "coreblas.h" #include "coreblas.h"
#include "coreblas/lapacke.h"
/** /**
****************************************************************************** ******************************************************************************
...@@ -112,12 +113,21 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N, ...@@ -112,12 +113,21 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N,
return -8; return -8;
} }
if ( beta == 0. ) {
LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A',
M, N, 0., 0., B, LDB );
}
else if ( beta != 1. ) {
LAPACKE_zlascl_work( LAPACK_COL_MAJOR, 'G',
0, 0, 1., beta, M, N, B, LDB );
}
switch( trans ) { switch( trans ) {
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
case ChamConjTrans: case ChamConjTrans:
for (j=0; j<N; j++, A++) { for (j=0; j<N; j++, A++) {
for(i=0; i<M; i++, B++) { for(i=0; i<M; i++, B++) {
*B = beta * (*B) + alpha * conj(A[LDA*i]); *B += alpha * conj(A[LDA*i]);
} }
B += LDB-M; B += LDB-M;
} }
...@@ -127,7 +137,7 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N, ...@@ -127,7 +137,7 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N,
case ChamTrans: case ChamTrans:
for (j=0; j<N; j++, A++) { for (j=0; j<N; j++, A++) {
for(i=0; i<M; i++, B++) { for(i=0; i<M; i++, B++) {
*B = beta * (*B) + alpha * A[LDA*i]; *B += alpha * A[LDA*i];
} }
B += LDB-M; B += LDB-M;
} }
...@@ -137,7 +147,7 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N, ...@@ -137,7 +147,7 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N,
default: default:
for (j=0; j<N; j++) { for (j=0; j<N; j++) {
for(i=0; i<M; i++, B++, A++) { for(i=0; i<M; i++, B++, A++) {
*B = beta * (*B) + alpha * (*A); *B += alpha * (*A);
} }
A += LDA-M; A += LDA-M;
B += LDB-M; B += LDB-M;
......
...@@ -78,10 +78,21 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, ...@@ -78,10 +78,21 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda,
* Tile diagonal * Tile diagonal
*/ */
if ( m0 == n0 ) { if ( m0 == n0 ) {
for (j = 0; j < n; j++) { int minmn = chameleon_min( m, n );
/* Lower part */
for (j = 0; j < minmn; j++) {
ran = Rnd64_jump( NBELEM * jump, seed ); ran = Rnd64_jump( NBELEM * jump, seed );
for (i = j; i < m; i++) { *tmp = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
#if defined(PRECISION_z) || defined(PRECISION_c)
ran = Rnd64_A * ran + Rnd64_C;
#endif
*tmp = creal( *tmp + bump );
tmp++;
for (i = j+1; i < m; i++) {
*tmp = 0.5f - ran * RndF_Mul; *tmp = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C; ran = Rnd64_A * ran + Rnd64_C;
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
...@@ -94,16 +105,21 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, ...@@ -94,16 +105,21 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda,
jump += bigM + 1; jump += bigM + 1;
} }
for (j = 0; j < n; j++) { /* Upper part */
jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)bigM;
for (i = 0; i < minmn; i++) {
ran = Rnd64_jump( NBELEM * (jump+i+1), seed );
for (j = i+1; j < n; j++) {
A[j*lda+i] = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
A[j+j*lda] += bump - I*cimag( A[j+j*lda] ); A[j*lda+i] -= I*(0.5f - ran * RndF_Mul);
#else ran = Rnd64_A * ran + Rnd64_C;
A[j+j*lda] += bump;
#endif #endif
for (i=0; i<j; i++) {
A[lda*j+i] = conj( A[lda*i+j] );
} }
jump += bigM;
} }
} }
/* /*
...@@ -148,6 +164,3 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, ...@@ -148,6 +164,3 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda,
} }
} }
} }
...@@ -63,7 +63,6 @@ Rnd64_jump(unsigned long long int n, unsigned long long int seed ) { ...@@ -63,7 +63,6 @@ Rnd64_jump(unsigned long long int n, unsigned long long int seed ) {
return ran; return ran;
} }
// CORE_zplgsy - Generate a tile for random symmetric (positive definite if 'bump' is large enough) matrix. // CORE_zplgsy - Generate a tile for random symmetric (positive definite if 'bump' is large enough) matrix.
void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t *A, int lda,
...@@ -79,10 +78,22 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_ ...@@ -79,10 +78,22 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_
* Tile diagonal * Tile diagonal
*/ */
if ( m0 == n0 ) { if ( m0 == n0 ) {
for (j = 0; j < n; j++) { int minmn = chameleon_min( m, n );
/* Lower part */
for (j = 0; j < minmn; j++) {
ran = Rnd64_jump( NBELEM * jump, seed ); ran = Rnd64_jump( NBELEM * jump, seed );
for (i = j; i < m; i++) { *tmp = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
#if defined(PRECISION_z) || defined(PRECISION_c)
*tmp += I*(0.5f - ran * RndF_Mul);
ran = Rnd64_A * ran + Rnd64_C;
#endif
*tmp = *tmp + bump;
tmp++;
for (i = j+1; i < m; i++) {
*tmp = 0.5f - ran * RndF_Mul; *tmp = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C; ran = Rnd64_A * ran + Rnd64_C;
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
...@@ -95,12 +106,21 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_ ...@@ -95,12 +106,21 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_
jump += bigM + 1; jump += bigM + 1;
} }
for (j = 0; j < n; j++) { /* Upper part */
A[j+j*lda] += bump; jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)bigM;
for (i=0; i<j; i++) { for (i = 0; i < minmn; i++) {
A[lda*j+i] = A[lda*i+j]; ran = Rnd64_jump( NBELEM * (jump+i+1), seed );
for (j = i+1; j < n; j++) {
A[j*lda+i] = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
#if defined(PRECISION_z) || defined(PRECISION_c)
A[j*lda+i] += I*(0.5f - ran * RndF_Mul);
ran = Rnd64_A * ran + Rnd64_C;
#endif
} }
jump += bigM;
} }
} }
/* /*
...@@ -145,6 +165,3 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_ ...@@ -145,6 +165,3 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_
} }
} }
} }
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
* *
*/ */
#include "coreblas.h" #include "coreblas.h"
#include "coreblas/lapacke.h"
/** /**
****************************************************************************** ******************************************************************************
...@@ -95,7 +96,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -95,7 +96,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t beta,
CHAMELEON_Complex64_t *B, int LDB) CHAMELEON_Complex64_t *B, int LDB)
{ {
int i, j; int i, j, minMN;
if (uplo == ChamUpperLower){ if (uplo == ChamUpperLower){
int rc = CORE_zgeadd( trans, M, N, alpha, A, LDA, beta, B, LDB ); int rc = CORE_zgeadd( trans, M, N, alpha, A, LDA, beta, B, LDB );
...@@ -137,6 +138,17 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -137,6 +138,17 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
return -9; return -9;
} }
minMN = chameleon_min( M, N );
if ( beta == 0. ) {
LAPACKE_zlaset_work( LAPACK_COL_MAJOR, chameleon_lapack_const(uplo),
M, N, 0., 0., B, LDB );
}
else if ( beta != 1. ) {
LAPACKE_zlascl_work( LAPACK_COL_MAJOR, chameleon_lapack_const(uplo),
0, 0, 1., beta, M, N, B, LDB );
}
/** /**
* ChamLower * ChamLower
*/ */
...@@ -144,9 +156,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -144,9 +156,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
switch( trans ) { switch( trans ) {
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
case ChamConjTrans: case ChamConjTrans:
for (j=0; j<N; j++, A++) { for (j=0; j<minMN; j++, A++) {
for(i=j; i<M; i++, B++) { for(i=j; i<M; i++, B++) {
*B = beta * (*B) + alpha * conj(A[LDA*i]); *B += alpha * conj(A[LDA*i]);
} }
B += LDB-M+j+1; B += LDB-M+j+1;
} }
...@@ -154,9 +166,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -154,9 +166,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ #endif /* defined(PRECISION_z) || defined(PRECISION_c) */
case ChamTrans: case ChamTrans:
for (j=0; j<N; j++, A++) { for (j=0; j<minMN; j++, A++) {
for(i=j; i<M; i++, B++) { for(i=j; i<M; i++, B++) {
*B = beta * (*B) + alpha * A[LDA*i]; *B += alpha * A[LDA*i];
} }
B += LDB-M+j+1; B += LDB-M+j+1;
} }
...@@ -164,9 +176,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -164,9 +176,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
case ChamNoTrans: case ChamNoTrans:
default: default:
for (j=0; j<N; j++) { for (j=0; j<minMN; j++) {
for(i=j; i<M; i++, B++, A++) { for(i=j; i<M; i++, B++, A++) {
*B = beta * (*B) + alpha * (*A); *B += alpha * (*A);
} }
B += LDB-M+j+1; B += LDB-M+j+1;
A += LDA-M+j+1; A += LDA-M+j+1;
...@@ -183,7 +195,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -183,7 +195,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
for (j=0; j<N; j++, A++) { for (j=0; j<N; j++, A++) {
int mm = chameleon_min( j+1, M ); int mm = chameleon_min( j+1, M );
for(i=0; i<mm; i++, B++) { for(i=0; i<mm; i++, B++) {
*B = beta * (*B) + alpha * conj(A[LDA*i]); *B += alpha * conj(A[LDA*i]);
} }
B += LDB-mm; B += LDB-mm;
} }
...@@ -194,7 +206,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -194,7 +206,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
for (j=0; j<N; j++, A++) { for (j=0; j<N; j++, A++) {
int mm = chameleon_min( j+1, M ); int mm = chameleon_min( j+1, M );
for(i=0; i<mm; i++, B++) { for(i=0; i<mm; i++, B++) {
*B = beta * (*B) + alpha * (A[LDA*i]); *B += alpha * (A[LDA*i]);
} }
B += LDB-mm; B += LDB-mm;
} }
...@@ -205,7 +217,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, ...@@ -205,7 +217,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
for (j=0; j<N; j++) { for (j=0; j<N; j++) {
int mm = chameleon_min( j+1, M ); int mm = chameleon_min( j+1, M );
for(i=0; i<mm; i++, B++, A++) { for(i=0; i<mm; i++, B++, A++) {
*B = beta * (*B) + alpha * (*A); *B += alpha * (*A);
} }
B += LDB-mm; B += LDB-mm;
A += LDA-mm; A += LDA-mm;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment