Commit 32de9c8d authored by PRUVOST Florent's avatar PRUVOST Florent

fix bug when minMT=1 in pzgelqf

parent 0f27a0da
......@@ -104,7 +104,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
/* necessary to avoid dependencies between tslqt and unmlq tasks regarding the diag tile */
DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb, A->p, A->q);
morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q);
for (k = 0; k < min(A->mt, A->nt); k++) {
tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
......
......@@ -156,6 +156,22 @@ int CORE_zgelqt(int M, int N, int IB,
WORK, M-i-sb);
}
}
// print the matrices
// int i1, i2;
// printf("\nV \n");
// for (i1=0; i1<M; i1++){
// for (i2=0; i2<N; i2++){
// printf("%f ", A[i1+i2*LDA]);
// }
// printf("\n");
// }
// printf("\nT \n");
// for (i1=0; i1<IB; i1++){
// for (i2=0; i2<N; i2++){
// printf("%f ", T[i1+i2*LDT]);
// }
// printf("\n");
// }
return MORSE_SUCCESS;
}
......
......@@ -43,7 +43,7 @@ int CUDA_zgelqt(
#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
#define t_ref(a_1,a_2) ( t+(a_2)*(ldt) + (a_1))
int i, k, ib, lddwork, old_i, old_ib, rows, cols;
int i, k, ib, old_i, old_ib, rows, cols;
double _Complex one=1.;
if (m < 0) {
......@@ -60,107 +60,110 @@ int CUDA_zgelqt(
return MAGMA_SUCCESS;
}
lddwork= m;
/* lower parts of little T must be zero: memset to 0 for simplicity */
memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
/* copy first panel of A on the host */
cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
da_ref(0, 0), ldda,
v, ldv);
/* Use blocked code initially */
for (i = 0; i < k; i += nb) {
ib = min(k-i, nb);
if (i+nb >= m) ib = min(m-i, nb);
cols = n-i;
if (i > 0){
/* copy panel of A from device to host */
cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
da_ref(i, 0), ldda,
v, ldv);
/* Apply H' to A(i+2*ib:m, i:n) from the right */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, n-old_i, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, lddwork);
}
/* copy the lower diag tile into d_A */
CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream);
}
/* Form the triangular factor of the block reflector on the host
H = H'(i+ib-1) . . . H(i+1) H(i) */
CORE_zgelqt(ib, cols, ib,
(double _Complex*) v_ref(0,i), ldv,
(double _Complex*) t_ref(0,0), ldt,
(double _Complex*) tau+i,
(double _Complex*) hwork);
if ( i + ib < m ){
/* put 0s in the lower triangular part of a panel (and 1s on the
diagonal); copy the lower triangular in d */
CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
(double _Complex*) v_ref(0,i), ldv,
(double _Complex*) d, ldd);
/* copy from host to device a tile diag */
cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
d, ldd, dd, ldd );
}
/* Send the triangular factor T to the GPU */
cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
t_ref(0,0), ldt, dt_ref(0,i), lddt );
/* A panel (with zeros in lower tri of its diag) is ready to be used
in input of zlarfb_gpu: we send the panel to the gpu */
cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
v_ref(0,i), ldv, da_ref(i,i), ldda );
if (i + ib < m) {
if (i+2*ib < m){
rows = ib;
}
else{
rows = m-i-ib;
}
/* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
cudaThreadSynchronize();
old_i = i;
old_ib = ib;
if (i+nb >= k){
/* Apply H' to A(i+2*ib:m, i:n) from the right */
rows = m-old_i-2*old_ib;
if (rows > 0){
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, lddwork);
}
/* copy the upper diag tile into d_A */
CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
dd, ldd, da_ref(old_i, old_i), ldda, stream);
}
cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream);
if ( (nb > 1) && (nb < k) ) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nb; i += nb) {
ib = min(k-i, nb);
cols = n-i;
magma_zgetmatrix_async( ib, cols,
da_ref(i,i), ldda,
v_ref(0,i), ib, stream );
if (i>0){
/* Apply H' to A(i+2*ib:m, i:n) from the right */
rows = m-old_i-2*old_ib;
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, n-old_i, old_ib,
da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
da_ref(old_i+2*old_ib, old_i), ldda,
dwork, rows);
/* store the diagonal */
magma_zsetmatrix_async( old_ib, old_ib,
d, old_ib,
da_ref(old_i, old_i), ldda, stream );
}
magma_queue_sync( stream );
/* Form the triangular factor of the block reflector on the host
H = H'(i+ib-1) . . . H(i+1) H(i) */
CORE_zgelqt(ib, cols, ib,
(double _Complex*) v_ref(0,i), ib,
(double _Complex*) t_ref(0,0), ib,
(double _Complex*) tau+i,
(double _Complex*) hwork);
/* put 0s in the lower triangular part of a panel (and 1s on the
diagonal); copy the lower triangular in d */
CORE_zgesplit(MorseRight, MorseUnit, ib, min(ib,cols),
(double _Complex*) v_ref(0,i), ib,
(double _Complex*) d, ib);
/* send the custom panel to the GPU */
magma_zsetmatrix( ib, cols,
v_ref(0, i), ib,
da_ref(i, i), ldda );
if ( i + ib < n ){
/* Send the triangular factor T to the GPU */
magma_zsetmatrix( ib, ib,
t_ref(0, 0), ib,
dt_ref(0, i), lddt );
if (i+nb < k-nb) {
/* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
ib, cols, ib,
da_ref(i, i), ldda, dt_ref(0,i), lddt,
da_ref(i+ib,i), ldda, dwork, ib);
}
else {
rows = m-i-ib;
magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
rows, cols, ib,
da_ref(i, i), ldda, dt_ref(0,i), lddt,
da_ref(i+ib,i), ldda, dwork, rows);
cudaThreadSynchronize();
/* Fix the diagonal block */
magma_zsetmatrix_async( ib, ib,
d, ib,
da_ref(i, i), ldda,
stream );
}
old_i = i;
old_ib = ib;
}
}
} else {
i = 0;
}
/* Use unblocked code to factor the last or only block. */
if (i < k) {
ib = m-i;
cols = n-i;
magma_zgetmatrix( ib, cols,
da_ref(i,i), ldda,
v_ref(0,i), ib );
CORE_zgelqt(ib, cols, ib,
(double _Complex*) v_ref(0,i), ib,
(double _Complex*) t_ref(0,0), ib,
(double _Complex*) tau+i,
(double _Complex*) hwork);
/* send the last factorized panel to the GPU */
magma_zsetmatrix( ib, cols,
v_ref(0, i), ib,
da_ref(i, i), ldda );
/* Send the triangular factor T to the GPU */
magma_zsetmatrix( ib, cols,
t_ref(0, 0), ib,
dt_ref(0, i), lddt );
}
#undef da_ref
......
......@@ -24,7 +24,7 @@
**/
#include "cudablas/include/cudablas.h"
#if defined(CHAMELEON_USE_MAGMA)
#if defined(CHAMELEON_USE_MAGMA) && 0
int CUDA_ztslqt(
magma_int_t m, magma_int_t n, magma_int_t nb,
magmaDoubleComplex *da1, magma_int_t ldda1,
......@@ -70,16 +70,16 @@ int CUDA_ztslqt(
memset(t, 0, nb*n*sizeof(magmaDoubleComplex));
cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
k = min(m, nb); // m can be lower than IB
//k = min(m, nb); // m can be lower than IB
/* copy the first diag tile of A1 from device to host: da1 -> d */
cublasGetMatrix(k, k, sizeof(magmaDoubleComplex),
cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex),
da1_ref(0, 0), ldda1,
d, ldd);
d, nb);
/* copy first panel of A2 from device to host: da2 -> a2 */
cublasGetMatrix(k, n, sizeof(magmaDoubleComplex),
cublasGetMatrix(nb, n, sizeof(magmaDoubleComplex),
da2_ref(0, 0), ldda2,
a2, lda2);
a2, nb);
/* This is only blocked code for now */
for (i = 0; i < m; i += nb) {
......@@ -95,12 +95,12 @@ int CUDA_ztslqt(
/* copy the diag tile of A1 from device to host: da1 -> d */
cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
da1_ref(i, i), ldda1,
d, ldd);
d, ib);
/* copy panel of A2 from device to host: da2 -> a2 */
cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex),
da2_ref(i, 0), ldda2,
a2, lda2);
a2, ib);
/* Apply H' to A(i+2*ib:m,i:n) from the left */
rows = m-old_i-2*old_ib;
......@@ -121,25 +121,25 @@ int CUDA_ztslqt(
/* compute LQ factorization of the panel of A2 ib x cols */
CORE_ztslqt(ib, cols, ib,
(double _Complex*) d, ldd,
(double _Complex*) a2, lda2,
(double _Complex*) t, ldt,
(double _Complex*) d, ib,
(double _Complex*) a2, ib,
(double _Complex*) t, ib,
(double _Complex*) tau,
(double _Complex*) hwork);
/* Send the panel from A2 back to the GPU */
cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
a2, lda2,
a2, ib,
da2_ref(i, 0), ldda2);
/* Send the triangular factor T from hwork to the GPU */
cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
t, ldt,
cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
t, ib,
dt_ref(0, i), lddt);
/* get back the diag tile in A1 from host to device: d -> da1 */
cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
d, ldd,
d, ib,
da1_ref(i, i), ldda1);
/* tsmlq update on one panel forward (look ahead 1) */
......
......@@ -170,7 +170,7 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA)
#if defined(CHAMELEON_USE_MAGMA) && 0
static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
......@@ -216,7 +216,7 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA) || defined(CHAMELEON_SIMULATION)
#if (defined(CHAMELEON_USE_MAGMA) && 0) || defined(CHAMELEON_SIMULATION)
CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0)
#else
CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func)
......
......@@ -83,6 +83,8 @@ set(ZSRC
time_zgels_tile.c
time_zgeqrf.c
time_zgeqrf_tile.c
time_zgelqf.c
time_zgelqf_tile.c
time_zgeqrs_tile.c
time_zgetrf_incpiv.c
time_zgetrf_incpiv_tile.c
......
......@@ -23,6 +23,7 @@ set(TESTLIST
getrf_incpiv
getrf_nopiv
geqrf
gelqf
posv
potrf
potri
......
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2014 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @precisions normal z -> c d s
*
**/
#define _TYPE MORSE_Complex64_t
#define _PREC double
#define _LAMCH LAPACKE_dlamch_work
#define _NAME "MORSE_zgelqf"
/* See Lawn 41 page 120 */
#define _FMULS FMULS_GELQF(M, N)
#define _FADDS FADDS_GELQF(M, N)
#include "./timing.c"
#include "timing_zauxiliary.h"
static int
RunTest(int *iparam, double *dparam, morse_time_t *t_)
{
MORSE_desc_t *T;
PASTE_CODE_IPARAM_LOCALS( iparam );
if ( M != N && check ) {
fprintf(stderr, "Check cannot be perfomed with M != N\n");
check = 0;
}
/* Allocate Data */
PASTE_CODE_ALLOCATE_MATRIX( A, 1, MORSE_Complex64_t, LDA, N );
/* Initialize Data */
MORSE_zplrnt(M, N, A, LDA, 3456);
/* Allocate Workspace */
MORSE_Alloc_Workspace_zgels(M, N, &T, P, Q);
memset(T->mat, 0, (T->llm*T->lln)*sizeof(MorseComplexDouble));
/* Save AT in lapack layout for check */
PASTE_CODE_ALLOCATE_COPY( Acpy, check, MORSE_Complex64_t, A, LDA, N );
START_TIMING();
MORSE_zgelqf( M, N, A, LDA, T );
STOP_TIMING();
/* Check the solution */
if ( check )
{
PASTE_CODE_ALLOCATE_MATRIX( X, 1, MORSE_Complex64_t, LDB, NRHS );
MORSE_zplrnt( N, NRHS, X, LDB, 5673 );
PASTE_CODE_ALLOCATE_COPY( B, 1, MORSE_Complex64_t, X, LDB, NRHS );
MORSE_zgelqs(M, N, NRHS, A, LDA, T, X, LDB);
dparam[IPARAM_RES] = z_check_solution(M, N, NRHS, Acpy, LDA, B, X, LDB,
&(dparam[IPARAM_ANORM]),
&(dparam[IPARAM_BNORM]),
&(dparam[IPARAM_XNORM]));
free( Acpy );
free( B );
free( X );
}
/* Free Workspace */
MORSE_Dealloc_Workspace( &T );
free( A );
return 0;
}
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2014 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @precisions normal z -> c d s
*
**/
#define _TYPE MORSE_Complex64_t
#define _PREC double
#define _LAMCH LAPACKE_dlamch_work
#define _NAME "MORSE_zgelqf_Tile"
/* See Lawn 41 page 120 */
#define _FMULS FMULS_GELQF( M, N )
#define _FADDS FADDS_GELQF( M, N )
#include "./timing.c"
static int
RunTest(int *iparam, double *dparam, morse_time_t *t_)
{
MORSE_desc_t *descT;
PASTE_CODE_IPARAM_LOCALS( iparam );
/* Allocate Data */
PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, MORSE_Complex64_t, MorseComplexDouble, LDA, M, N );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descX, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDA, M, N );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS );
MORSE_zplrnt_Tile( descA, 5373 );
/* Save A for check */
if (check == 1 && M == N){
MORSE_zlacpy_Tile(MorseUpperLower, descA, descAC);
}
/* Allocate Workspace */
MORSE_Alloc_Workspace_zgels_Tile(M, N, &descT, P, Q);
memset(descT->mat, 0, (descT->llm*descT->lln)*sizeof(MorseComplexDouble));
/* MORSE ZGEQRF */
START_TIMING();
MORSE_zgelqf_Tile( descA, descT );
STOP_TIMING();
/* Check the solution */
if ( check && M == N )
{
/* Initialize and save B */
MORSE_zplrnt_Tile( descX, 2264 );
MORSE_zlacpy_Tile(MorseUpperLower, descX, descB);
/* Compute the solution */
MORSE_zgelqs_Tile( descA, descT, descX );
/* Check solution */
dparam[IPARAM_ANORM] = MORSE_zlange_Tile(MorseInfNorm, descAC);
dparam[IPARAM_BNORM] = MORSE_zlange_Tile(MorseInfNorm, descB);
dparam[IPARAM_XNORM] = MORSE_zlange_Tile(MorseInfNorm, descX);
MORSE_zgemm_Tile( MorseNoTrans, MorseNoTrans, 1.0, descAC, descX, -1.0, descB );
dparam[IPARAM_RES] = MORSE_zlange_Tile(MorseInfNorm, descB);
PASTE_CODE_FREE_MATRIX( descX )
PASTE_CODE_FREE_MATRIX( descAC )
PASTE_CODE_FREE_MATRIX( descB )
}
/* Free data */
MORSE_Dealloc_Workspace(&descT);
PASTE_CODE_FREE_MATRIX( descA )
return 0;
}
......@@ -23,6 +23,7 @@
#define _FADDS FADDS_GEQRF( M, N )
#include "./timing.c"
#include <starpu.h>
static int
RunTest(int *iparam, double *dparam, morse_time_t *t_)
......@@ -36,6 +37,14 @@ RunTest(int *iparam, double *dparam, morse_time_t *t_)
PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDA, M, N );
PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, ( check && M == N ), MORSE_Complex64_t, MorseComplexDouble, LDB, M, NRHS );
// RUNTIME_zlocality_onerestrict( MORSE_GEQRT, STARPU_CUDA );
// RUNTIME_zlocality_onerestrict( MORSE_GEQRT, STARPU_CPU );
// RUNTIME_zlocality_onerestrict( MORSE_UNMQR, STARPU_CUDA );
// RUNTIME_zlocality_onerestrict( MORSE_TSQRT, STARPU_CPU );
// RUNTIME_zlocality_onerestrict( MORSE_TSMQR, STARPU_CUDA );
// RUNTIME_zlocality_allrestrict( STARPU_CUDA );
MORSE_zplrnt_Tile( descA, 5373 );
/* Save A for check */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment