Commit 4819e83d authored by Mathieu Faverge's avatar Mathieu Faverge

Remove magma codelets for StarPU

parent f6cad5a7
......@@ -155,55 +155,7 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
}
#endif /* !defined(CHAMELEON_SIMULATION) */
#if defined(CHAMELEON_USE_MAGMA)
#if !defined(CHAMELEON_SIMULATION)
static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
int lda, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* Gather pointer to data on device */
d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* m*ib + ib*ib*/
d_D = d_W + m*ib;
/* scratch data on host */
/* ib*n + ib*ib + max(m,n) + ib*ib + ib*ib */
h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
/* Gather pointer to scratch data on host */
h_T = h_A + ib*n;
h_TAU = h_T + ib*ib;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*ib;
RUNTIME_getStream(stream);
CUDA_zgelqt(
m, n, ib,
d_A, lda, h_A, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream );
cudaThreadSynchronize();
}
#endif /* defined(CHAMELEON_USE_MAGMA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zgelqt, 3, cl_zgelqt_cpu_func, cl_zgelqt_cuda_func, 0)
#else
CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
#endif
......@@ -154,56 +154,9 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
WORK = TAU + chameleon_max( m, n );
CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
int lda, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* Gather pointer to data on device */
d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* ib*n + ib * ib*/
d_D = d_W + ib*n;
/* scratch data on host */
/* m*ib + ib*ib + max(m,n) + ib*ib + ib*ib */
h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
/* Gather pointer to scratch data on host */
h_T = h_A + m*ib;
h_TAU = h_T + ib*ib;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*ib;
RUNTIME_getStream(stream);
CUDA_zgeqrt(
m, n, ib,
d_A, lda, h_A, m,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
#endif /* defined(CHAMELEON_USE_MAGMA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zgeqrt, 3, cl_zgeqrt_cpu_func, cl_zgeqrt_cuda_func, 0)
#else
CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
#endif
......@@ -137,42 +137,9 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &ldl, &ldd, &lda);
CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda);
}
#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
{
int m;
int n;
int k;
int ib;
int *IPIV;
cuDoubleComplex *dL, *dD, *dA;
int lddl, lddd, ldda;
int info = 0;
/*
* hwork => nb*nb
*/
dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dD = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda);
CUDA_zgessm(
MagmaColMajor, m, n, k, ib,
IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
cudaThreadSynchronize();
return;
}
#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU))
CODELETS(zgessm, 3, cl_zgessm_cpu_func, cl_zgessm_cuda_func, 0)
#else
CODELETS_CPU(zgessm, 3, cl_zgessm_cpu_func)
#endif
......@@ -147,99 +147,10 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work);
CORE_zgetrf_incpiv(m, n, ib, A, lda, IPIV, &info);
#if defined(CHAMELEON_USE_MAGMA)
{
MORSE_Complex64_t *L = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
/*
* L stores:
* L1 L2 L3 ...
* L1^-1 L2^-1 L3^-1 ...
*/
/* Compute L-1 in lower rectangle of L */
if ( ldl >= 2*ib )
{
int i, sb;
L += ib;
for (i=0; i<n; i+=ib) {
sb = chameleon_min( ib, n-i );
CORE_zlacpy(MorseUpperLower, sb, sb, A+(i*lda+i), lda, L+(i*ldl), ldl );
CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl), ldl, &info );
if (info != 0 ) {
fprintf(stderr, "ERROR, trtri returned with info = %d\n", info);
}
}
}
}
#endif
}
/*
* Codelet GPU
*/
#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
{
int m;
int n;
int ib;
cuDoubleComplex *hA, *dA;
cuDoubleComplex *hL, *dL;
cuDoubleComplex *dwork;
MORSE_starpu_ws_t *h_work;
int lda, ldl;
int *IPIV;
MORSE_bool check_info;
int iinfo;
int info;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
/*
* hwork => at least (IB+NB)*IB contains all hA and hL
* dwork => at least IB*NB
*/
hA = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
dwork = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
hL = hA + lda*ib;
/* Initialize L to 0 */
memset(hL, 0, ib*ib*sizeof(cuDoubleComplex));
if ( ldl >= 2*ib ) {
/* Let's compute the inverses in the bottom part of L */
dL += ib;
} else {
/* We prefer to stick with TRSM */
dL = NULL;
hL = NULL;
}
CUDA_zgetrf_incpiv(
MagmaColMajor, m, n, ib,
hA, lda, dA, lda,
hL, ib, dL, ldl,
IPIV,
dwork, lda,
&info );
cudaThreadSynchronize();
}
#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) && ( defined(CHAMELEON_USE_MAGMA) )
CODELETS(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func, cl_zgetrf_incpiv_cuda_func, 0)
#else
CODELETS_CPU(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func)
#endif
......@@ -123,35 +123,9 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
CORE_zgetrf_nopiv(m, n, ib, A, lda, &info);
}
/*
* Codelet GPU
*/
#if defined(CHAMELEON_USE_MAGMA)
static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
{
int m;
int n;
int ib;
cuDoubleComplex *dA;
int lda;
int iinfo;
int info = 0;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
CUDA_zgetrf_nopiv( m, n, dA, lda, &info );
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func, cl_zgetrf_nopiv_cuda_func, 0)
#else
CODELETS_CPU(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func)
#endif
......@@ -77,30 +77,9 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
CORE_zlauum(uplo, N, A, LDA);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
int info = 0;
int N;
cuDoubleComplex *A;
int LDA;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
CUDA_zlauum( uplo, N, A, LDA, &info);
cudaThreadSynchronize();
return;
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zlauum, 1, cl_zlauum_cpu_func, cl_zlauum_cuda_func, 0)
#else
CODELETS_CPU(zlauum, 1, cl_zlauum_cpu_func)
#endif
......@@ -83,48 +83,10 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
CORE_zpotrf(uplo, n, A, lda, &info);
}
#ifdef CHAMELEON_USE_MAGMA
static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
{
cudaStream_t stream[2], currentt_stream;
MORSE_enum uplo;
int n;
cuDoubleComplex *A;
/* cuDoubleComplex *hA; */
int lda;
int iinfo;
int info = 0;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
/* /\* */
/* * hwork => nb*nb */
/* *\/ */
/* hA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); */
/* stream[0] = starpu_cuda_get_local_stream(); */
/* if ( cudaStreamCreate( stream+1 ) != CUDA_SUCCESS ){ */
/* fprintf(stderr, "Error while creating stream in codelet_zpotrf\n"); */
/* exit(-1); */
/* } */
CUDA_zpotrf( uplo, n, A, lda, &info);
cudaThreadSynchronize();
/* cudaStreamDestroy( stream[1] ); */
return;
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined CHAMELEON_USE_MAGMA
CODELETS(zpotrf, 1, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, 0)
#else
CODELETS_CPU(zpotrf, 1, cl_zpotrf_cpu_func)
#endif
......@@ -174,56 +174,10 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV);
}
#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
{
int m1;
int n1;
int m2;
int n2;
int k;
int ib;
cuDoubleComplex *dA1;
int lda1;
cuDoubleComplex *dA2;
int lda2;
cuDoubleComplex *dL1;
int ldl1;
cuDoubleComplex *dL2;
int ldl2;
int *IPIV;
int info;
starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
dA1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dA2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
dL1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
dL2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
if ( ldl1 >= 2*ib ) {
/* dL1 stores L and invL and the kernel is just using the inverted part */
dL1 += ib;
}
CUDA_zssssm(
MagmaColMajor, m1, n1, m2, n2, k, ib,
dA1, lda1, dA2, lda2,
dL1, ldl1, dL2, ldl2,
IPIV, &info);
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU))
CODELETS(zssssm, 4, cl_zssssm_cpu_func, cl_zssssm_cuda_func, 0)
#else
CODELETS_CPU(zssssm, 4, cl_zssssm_cpu_func)
#endif
......@@ -85,32 +85,9 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
CORE_ztrtri(uplo, diag, N, A, LDA, &info);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
MORSE_enum diag;
int N;
cuDoubleComplex *A;
int LDA;
int iinfo;
int info = 0;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
CUDA_ztrtri( uplo, diag, N, A, LDA, &info);
cudaThreadSynchronize();
return;
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(ztrtri, 1, cl_ztrtri_cpu_func, cl_ztrtri_cuda_func, 0)
#else
CODELETS_CPU(ztrtri, 1, cl_ztrtri_cpu_func)
#endif
......@@ -174,54 +174,9 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
WORK = TAU + chameleon_max( m, n );
CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA) && 0
static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
int lda1, lda2, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
/* Gather pointer to data on device */
d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*m + ib*ib */
d_D = d_W + 2*ib*m;
/* scratch data on host */
/* ib*n + ib*n + max(m,n) + ib*m + ib*ib */
h_A2 = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
h_T = h_A2 + ib*n;
h_TAU = h_T + ib*n;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*m;
RUNTIME_getStream(stream);
CUDA_ztslqt(
m, n, ib,
d_A1, lda1, d_A2, lda2,
h_A2, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA)) && 0
CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0)
#else
CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func)
#endif
......@@ -165,53 +165,9 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg)
WORK = TAU + chameleon_max( m, n );
CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
int lda1, lda2, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
/* Gather pointer to data on device */
d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*n + ib*ib */
d_D = d_W + 2*ib*n;
/* scratch data on host */
/* m*ib + ib*ib + max(m,n) + ib*n + ib*ib */
h_A2 = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
h_T = h_A2 + m*ib;
h_TAU = h_T + ib*ib;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*n;
RUNTIME_getStream(stream);
CUDA_ztsqrt(
m, n, ib,
d_A1, lda1, d_A2, lda2,
h_A2, lda2,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(ztsqrt, 4, cl_ztsqrt_cpu_func, cl_ztsqrt_cuda_func, 0)
#else
CODELETS_CPU(ztsqrt, 4, cl_ztsqrt_cpu_func)
#endif
......@@ -176,102 +176,11 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &IPIV, &d_work, &ldwork, &check_info, &iinfo);
CORE_ztstrf(m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, &info);
#if defined(CHAMELEON_USE_MAGMA)
/*
* L stores the following if enough place:
* L1 L2 L3 ...
* L1^-1 L2^-1 L3^-1 ...
*/
/* Compute L-1 in lower rectangle of L */
if ( ldl >= 2*ib )
{
int i, sb;
for (i=0; i<n; i+=ib) {
sb = chameleon_min( ib, n-i );
CORE_zlacpy(MorseUpperLower, sb, sb, L+(i*ldl), ldl, L+(i*ldl)+ib, ldl );
CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl)+ib, ldl, &info );
if (info != 0 ) {
fprintf(stderr, "ERROR, trtri returned with info = %d\n", info);
}
}
}
#endif
}
/*
* Codelet GPU
*/
/* TODO/WARNING: tstrf is not working on GPU for now */
#if defined(CHAMELEON_USE_MAGMA) && 0
static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *d_work;
int m;
int n;
int ib;
int nb;
cuDoubleComplex *hU, *dU;
int ldu;
cuDoubleComplex *hA, *dA;
int lda;
cuDoubleComplex *hL, *dL;
int ldl;
int *ipiv;
cuDoubleComplex *hw2, *hw, *dw;
int ldwork;
MORSE_bool check_info;
int iinfo;
int info;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &ipiv,
&d_work, &ldwork, &check_info, &iinfo);
dU = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
/*
* hwork => 2*nb*(2*ib+2nb)
* dwork => 2*ib*nb
*/
hw2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
dw = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(d_work);
hU = hw2;
hA = hU + ldu * nb;
hL = hA + lda * nb;
hw = hL + ldl * nb;
/* Download first panel from A and U */
cublasGetMatrix( nb, n, sizeof(cuDoubleComplex), dU, ldu, hU, ldu );
cublasGetMatrix( m, ib, sizeof(cuDoubleComplex), dA, lda, hA, lda );
/* Initialize L to 0 */
memset(hL, 0, ldl*nb*sizeof(cuDoubleComplex));
CUDA_ztstrf(
MagmaColMajor, m, n, ib, nb,
hU, ldu, dU, ldu,
hA, lda, dA, lda,
hL, ldl, dL, ldl,
ipiv,
hw, ldwork, dw, lda,
&info );
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA) && 0)
CODELETS(ztstrf, 4, cl_ztstrf_cpu_func, cl_ztstrf_cuda_func, 0)
#else
CODELETS_CPU(ztstrf, 4, cl_ztstrf_cpu_func)
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment