Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 4819e83d authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Remove magma codelets for StarPU

parent f6cad5a7
No related branches found
No related tags found
1 merge request!44Magma
Showing with 1 addition and 552 deletions
......@@ -155,55 +155,7 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
}
#endif /* !defined(CHAMELEON_SIMULATION) */
#if defined(CHAMELEON_USE_MAGMA)
#if !defined(CHAMELEON_SIMULATION)
static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
int lda, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* Gather pointer to data on device */
d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* m*ib + ib*ib*/
d_D = d_W + m*ib;
/* scratch data on host */
/* ib*n + ib*ib + max(m,n) + ib*ib + ib*ib */
h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
/* Gather pointer to scratch data on host */
h_T = h_A + ib*n;
h_TAU = h_T + ib*ib;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*ib;
RUNTIME_getStream(stream);
CUDA_zgelqt(
m, n, ib,
d_A, lda, h_A, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream );
cudaThreadSynchronize();
}
#endif /* defined(CHAMELEON_USE_MAGMA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zgelqt, 3, cl_zgelqt_cpu_func, cl_zgelqt_cuda_func, 0)
#else
CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
#endif
......@@ -154,56 +154,9 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
WORK = TAU + chameleon_max( m, n );
CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
int lda, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
/* Gather pointer to data on device */
d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* ib*n + ib * ib*/
d_D = d_W + ib*n;
/* scratch data on host */
/* m*ib + ib*ib + max(m,n) + ib*ib + ib*ib */
h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
/* Gather pointer to scratch data on host */
h_T = h_A + m*ib;
h_TAU = h_T + ib*ib;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*ib;
RUNTIME_getStream(stream);
CUDA_zgeqrt(
m, n, ib,
d_A, lda, h_A, m,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
#endif /* defined(CHAMELEON_USE_MAGMA) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zgeqrt, 3, cl_zgeqrt_cpu_func, cl_zgeqrt_cuda_func, 0)
#else
CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
#endif
......@@ -137,42 +137,9 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &ldl, &ldd, &lda);
CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda);
}
#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
{
int m;
int n;
int k;
int ib;
int *IPIV;
cuDoubleComplex *dL, *dD, *dA;
int lddl, lddd, ldda;
int info = 0;
/*
* hwork => nb*nb
*/
dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dD = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda);
CUDA_zgessm(
MagmaColMajor, m, n, k, ib,
IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
cudaThreadSynchronize();
return;
}
#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU))
CODELETS(zgessm, 3, cl_zgessm_cpu_func, cl_zgessm_cuda_func, 0)
#else
CODELETS_CPU(zgessm, 3, cl_zgessm_cpu_func)
#endif
......@@ -147,99 +147,10 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work);
CORE_zgetrf_incpiv(m, n, ib, A, lda, IPIV, &info);
#if defined(CHAMELEON_USE_MAGMA)
{
MORSE_Complex64_t *L = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
/*
* L stores:
* L1 L2 L3 ...
* L1^-1 L2^-1 L3^-1 ...
*/
/* Compute L-1 in lower rectangle of L */
if ( ldl >= 2*ib )
{
int i, sb;
L += ib;
for (i=0; i<n; i+=ib) {
sb = chameleon_min( ib, n-i );
CORE_zlacpy(MorseUpperLower, sb, sb, A+(i*lda+i), lda, L+(i*ldl), ldl );
CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl), ldl, &info );
if (info != 0 ) {
fprintf(stderr, "ERROR, trtri returned with info = %d\n", info);
}
}
}
}
#endif
}
/*
* Codelet GPU
*/
#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
{
int m;
int n;
int ib;
cuDoubleComplex *hA, *dA;
cuDoubleComplex *hL, *dL;
cuDoubleComplex *dwork;
MORSE_starpu_ws_t *h_work;
int lda, ldl;
int *IPIV;
MORSE_bool check_info;
int iinfo;
int info;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
/*
* hwork => at least (IB+NB)*IB contains all hA and hL
* dwork => at least IB*NB
*/
hA = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
dwork = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
hL = hA + lda*ib;
/* Initialize L to 0 */
memset(hL, 0, ib*ib*sizeof(cuDoubleComplex));
if ( ldl >= 2*ib ) {
/* Let's compute the inverses in the bottom part of L */
dL += ib;
} else {
/* We prefer to stick with TRSM */
dL = NULL;
hL = NULL;
}
CUDA_zgetrf_incpiv(
MagmaColMajor, m, n, ib,
hA, lda, dA, lda,
hL, ib, dL, ldl,
IPIV,
dwork, lda,
&info );
cudaThreadSynchronize();
}
#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) && ( defined(CHAMELEON_USE_MAGMA) )
CODELETS(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func, cl_zgetrf_incpiv_cuda_func, 0)
#else
CODELETS_CPU(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func)
#endif
......@@ -123,35 +123,9 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
CORE_zgetrf_nopiv(m, n, ib, A, lda, &info);
}
/*
* Codelet GPU
*/
#if defined(CHAMELEON_USE_MAGMA)
static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
{
int m;
int n;
int ib;
cuDoubleComplex *dA;
int lda;
int iinfo;
int info = 0;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
CUDA_zgetrf_nopiv( m, n, dA, lda, &info );
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func, cl_zgetrf_nopiv_cuda_func, 0)
#else
CODELETS_CPU(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func)
#endif
......@@ -77,30 +77,9 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
CORE_zlauum(uplo, N, A, LDA);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
int info = 0;
int N;
cuDoubleComplex *A;
int LDA;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
CUDA_zlauum( uplo, N, A, LDA, &info);
cudaThreadSynchronize();
return;
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(zlauum, 1, cl_zlauum_cpu_func, cl_zlauum_cuda_func, 0)
#else
CODELETS_CPU(zlauum, 1, cl_zlauum_cpu_func)
#endif
......@@ -83,48 +83,10 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
CORE_zpotrf(uplo, n, A, lda, &info);
}
#ifdef CHAMELEON_USE_MAGMA
static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
{
cudaStream_t stream[2], currentt_stream;
MORSE_enum uplo;
int n;
cuDoubleComplex *A;
/* cuDoubleComplex *hA; */
int lda;
int iinfo;
int info = 0;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
/* /\* */
/* * hwork => nb*nb */
/* *\/ */
/* hA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); */
/* stream[0] = starpu_cuda_get_local_stream(); */
/* if ( cudaStreamCreate( stream+1 ) != CUDA_SUCCESS ){ */
/* fprintf(stderr, "Error while creating stream in codelet_zpotrf\n"); */
/* exit(-1); */
/* } */
CUDA_zpotrf( uplo, n, A, lda, &info);
cudaThreadSynchronize();
/* cudaStreamDestroy( stream[1] ); */
return;
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined CHAMELEON_USE_MAGMA
CODELETS(zpotrf, 1, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, 0)
#else
CODELETS_CPU(zpotrf, 1, cl_zpotrf_cpu_func)
#endif
......@@ -174,56 +174,10 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV);
}
#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
{
int m1;
int n1;
int m2;
int n2;
int k;
int ib;
cuDoubleComplex *dA1;
int lda1;
cuDoubleComplex *dA2;
int lda2;
cuDoubleComplex *dL1;
int ldl1;
cuDoubleComplex *dL2;
int ldl2;
int *IPIV;
int info;
starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
dA1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dA2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
dL1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
dL2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
if ( ldl1 >= 2*ib ) {
/* dL1 stores L and invL and the kernel is just using the inverted part */
dL1 += ib;
}
CUDA_zssssm(
MagmaColMajor, m1, n1, m2, n2, k, ib,
dA1, lda1, dA2, lda2,
dL1, ldl1, dL2, ldl2,
IPIV, &info);
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU))
CODELETS(zssssm, 4, cl_zssssm_cpu_func, cl_zssssm_cuda_func, 0)
#else
CODELETS_CPU(zssssm, 4, cl_zssssm_cpu_func)
#endif
......@@ -85,32 +85,9 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
CORE_ztrtri(uplo, diag, N, A, LDA, &info);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
{
MORSE_enum uplo;
MORSE_enum diag;
int N;
cuDoubleComplex *A;
int LDA;
int iinfo;
int info = 0;
A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
CUDA_ztrtri( uplo, diag, N, A, LDA, &info);
cudaThreadSynchronize();
return;
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(ztrtri, 1, cl_ztrtri_cpu_func, cl_ztrtri_cuda_func, 0)
#else
CODELETS_CPU(ztrtri, 1, cl_ztrtri_cpu_func)
#endif
......@@ -174,54 +174,9 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
WORK = TAU + chameleon_max( m, n );
CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA) && 0
static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
int lda1, lda2, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
/* Gather pointer to data on device */
d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*m + ib*ib */
d_D = d_W + 2*ib*m;
/* scratch data on host */
/* ib*n + ib*n + max(m,n) + ib*m + ib*ib */
h_A2 = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
h_T = h_A2 + ib*n;
h_TAU = h_T + ib*n;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*m;
RUNTIME_getStream(stream);
CUDA_ztslqt(
m, n, ib,
d_A1, lda1, d_A2, lda2,
h_A2, ib,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA)) && 0
CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0)
#else
CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func)
#endif
......@@ -165,53 +165,9 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg)
WORK = TAU + chameleon_max( m, n );
CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
}
#if defined(CHAMELEON_USE_MAGMA)
static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *h_work;
int m;
int n;
int ib;
cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
int lda1, lda2, ldt;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
/* Gather pointer to data on device */
d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*n + ib*ib */
d_D = d_W + 2*ib*n;
/* scratch data on host */
/* m*ib + ib*ib + max(m,n) + ib*n + ib*ib */
h_A2 = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
h_T = h_A2 + m*ib;
h_TAU = h_T + ib*ib;
h_W = h_TAU + chameleon_max(m,n);
h_D = h_W + ib*n;
RUNTIME_getStream(stream);
CUDA_ztsqrt(
m, n, ib,
d_A1, lda1, d_A2, lda2,
h_A2, lda2,
d_T, ldt, h_T, ib,
d_D, h_D, ib, h_TAU,
h_W, d_W, stream);
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if defined(CHAMELEON_USE_MAGMA)
CODELETS(ztsqrt, 4, cl_ztsqrt_cpu_func, cl_ztsqrt_cuda_func, 0)
#else
CODELETS_CPU(ztsqrt, 4, cl_ztsqrt_cpu_func)
#endif
......@@ -176,102 +176,11 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &IPIV, &d_work, &ldwork, &check_info, &iinfo);
CORE_ztstrf(m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, &info);
#if defined(CHAMELEON_USE_MAGMA)
/*
* L stores the following if enough place:
* L1 L2 L3 ...
* L1^-1 L2^-1 L3^-1 ...
*/
/* Compute L-1 in lower rectangle of L */
if ( ldl >= 2*ib )
{
int i, sb;
for (i=0; i<n; i+=ib) {
sb = chameleon_min( ib, n-i );
CORE_zlacpy(MorseUpperLower, sb, sb, L+(i*ldl), ldl, L+(i*ldl)+ib, ldl );
CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl)+ib, ldl, &info );
if (info != 0 ) {
fprintf(stderr, "ERROR, trtri returned with info = %d\n", info);
}
}
}
#endif
}
/*
* Codelet GPU
*/
/* TODO/WARNING: tstrf is not working on GPU for now */
#if defined(CHAMELEON_USE_MAGMA) && 0
static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg)
{
MORSE_starpu_ws_t *d_work;
int m;
int n;
int ib;
int nb;
cuDoubleComplex *hU, *dU;
int ldu;
cuDoubleComplex *hA, *dA;
int lda;
cuDoubleComplex *hL, *dL;
int ldl;
int *ipiv;
cuDoubleComplex *hw2, *hw, *dw;
int ldwork;
MORSE_bool check_info;
int iinfo;
int info;
starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &ipiv,
&d_work, &ldwork, &check_info, &iinfo);
dU = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
/*
* hwork => 2*nb*(2*ib+2nb)
* dwork => 2*ib*nb
*/
hw2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
dw = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(d_work);
hU = hw2;
hA = hU + ldu * nb;
hL = hA + lda * nb;
hw = hL + ldl * nb;
/* Download first panel from A and U */
cublasGetMatrix( nb, n, sizeof(cuDoubleComplex), dU, ldu, hU, ldu );
cublasGetMatrix( m, ib, sizeof(cuDoubleComplex), dA, lda, hA, lda );
/* Initialize L to 0 */
memset(hL, 0, ldl*nb*sizeof(cuDoubleComplex));
CUDA_ztstrf(
MagmaColMajor, m, n, ib, nb,
hU, ldu, dU, ldu,
hA, lda, dA, lda,
hL, ldl, dL, ldl,
ipiv,
hw, ldwork, dw, lda,
&info );
cudaThreadSynchronize();
}
#endif
#endif /* !defined(CHAMELEON_SIMULATION) */
/*
* Codelet definition
*/
#if (defined(CHAMELEON_USE_MAGMA) && 0)
CODELETS(ztstrf, 4, cl_ztstrf_cpu_func, cl_ztstrf_cuda_func, 0)
#else
CODELETS_CPU(ztstrf, 4, cl_ztstrf_cpu_func)
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment