From 4819e83d68f79d2bfb87affd52ea2a18ad58c43f Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Tue, 2 May 2017 18:02:25 +0200 Subject: [PATCH] Remove magma codelets for StarPU --- runtime/starpu/codelets/codelet_zgelqt.c | 48 ---------- runtime/starpu/codelets/codelet_zgeqrt.c | 47 ---------- runtime/starpu/codelets/codelet_zgessm.c | 33 ------- .../starpu/codelets/codelet_zgetrf_incpiv.c | 89 ------------------ .../starpu/codelets/codelet_zgetrf_nopiv.c | 26 ------ runtime/starpu/codelets/codelet_zlauum.c | 21 ----- runtime/starpu/codelets/codelet_zpotrf.c | 40 +------- runtime/starpu/codelets/codelet_zssssm.c | 46 ---------- runtime/starpu/codelets/codelet_ztrtri.c | 23 ----- runtime/starpu/codelets/codelet_ztslqt.c | 45 --------- runtime/starpu/codelets/codelet_ztsqrt.c | 44 --------- runtime/starpu/codelets/codelet_ztstrf.c | 91 ------------------- 12 files changed, 1 insertion(+), 552 deletions(-) diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c index 223559778..e215ce20f 100644 --- a/runtime/starpu/codelets/codelet_zgelqt.c +++ b/runtime/starpu/codelets/codelet_zgelqt.c @@ -155,55 +155,7 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg) } #endif /* !defined(CHAMELEON_SIMULATION) */ -#if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) -static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_starpu_ws_t *h_work; - int m; - int n; - int ib; - cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU; - cuDoubleComplex *d_A, *d_T, *d_D, *d_W; - int lda, ldt; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work); - - /* Gather pointer to data on device */ - d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* m*ib + ib*ib*/ - d_D = d_W + m*ib; - - /* scratch data on host */ - /* ib*n + ib*ib + max(m,n) + ib*ib + ib*ib */ - h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work); - - /* Gather pointer to scratch data on host */ - h_T = h_A + ib*n; - h_TAU = h_T + ib*ib; - h_W = h_TAU + chameleon_max(m,n); - h_D = h_W + ib*ib; - - RUNTIME_getStream(stream); - - CUDA_zgelqt( - m, n, ib, - d_A, lda, h_A, ib, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W, stream ); - - cudaThreadSynchronize(); -} -#endif /* defined(CHAMELEON_USE_MAGMA) */ -#endif /* !defined(CHAMELEON_SIMULATION) */ - /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) -CODELETS(zgelqt, 3, cl_zgelqt_cpu_func, cl_zgelqt_cuda_func, 0) -#else CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c index 868c10c16..595bafa1b 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -154,56 +154,9 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg) WORK = TAU + chameleon_max( m, n ); CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK); } - - -#if defined(CHAMELEON_USE_MAGMA) -static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_starpu_ws_t *h_work; - int m; - int n; - int ib; - cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU; - cuDoubleComplex *d_A, *d_T, *d_D, *d_W; - int lda, ldt; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work); - - /* Gather pointer to data on device */ - d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* ib*n + ib * ib*/ - d_D = d_W + ib*n; - - /* scratch data on host */ - /* m*ib + ib*ib + max(m,n) + ib*ib + ib*ib */ - h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work); - - /* Gather pointer to scratch data on host */ - h_T = h_A + m*ib; - h_TAU = h_T + ib*ib; - h_W = h_TAU + chameleon_max(m,n); - h_D = h_W + ib*ib; - - RUNTIME_getStream(stream); - - CUDA_zgeqrt( - m, n, ib, - d_A, lda, h_A, m, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W, stream); - - cudaThreadSynchronize(); -} -#endif /* defined(CHAMELEON_USE_MAGMA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) -CODELETS(zgeqrt, 3, cl_zgeqrt_cpu_func, cl_zgeqrt_cuda_func, 0) -#else CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index 547111784..1b5a72908 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -137,42 +137,9 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &ldl, &ldd, &lda); CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda); } - -#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) -static void cl_zgessm_cuda_func(void *descr[], void *cl_arg) -{ - int m; - int n; - int k; - int ib; - int *IPIV; - cuDoubleComplex *dL, *dD, *dA; - int lddl, lddd, ldda; - int info = 0; - /* - * hwork => nb*nb - */ - dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - dD = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda); - - CUDA_zgessm( - MagmaColMajor, m, n, k, ib, - IPIV, dL, lddl, dD, lddd, dA, ldda, &info ); - - cudaThreadSynchronize(); - - return; -} -#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)) -CODELETS(zgessm, 3, cl_zgessm_cpu_func, cl_zgessm_cuda_func, 0) -#else CODELETS_CPU(zgessm, 3, cl_zgessm_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c index ad3475cb7..bc91e972a 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c @@ -147,99 +147,10 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work); CORE_zgetrf_incpiv(m, n, ib, A, lda, IPIV, &info); - -#if defined(CHAMELEON_USE_MAGMA) - { - MORSE_Complex64_t *L = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); - /* - * L stores: - * L1 L2 L3 ... - * L1^-1 L2^-1 L3^-1 ... - */ - /* Compute L-1 in lower rectangle of L */ - if ( ldl >= 2*ib ) - { - int i, sb; - - L += ib; - for (i=0; i<n; i+=ib) { - sb = chameleon_min( ib, n-i ); - CORE_zlacpy(MorseUpperLower, sb, sb, A+(i*lda+i), lda, L+(i*ldl), ldl ); - - CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl), ldl, &info ); - if (info != 0 ) { - fprintf(stderr, "ERROR, trtri returned with info = %d\n", info); - } - } - } - } -#endif -} - - -/* - * Codelet GPU - */ -#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) -static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg) -{ - int m; - int n; - int ib; - cuDoubleComplex *hA, *dA; - cuDoubleComplex *hL, *dL; - cuDoubleComplex *dwork; - MORSE_starpu_ws_t *h_work; - int lda, ldl; - int *IPIV; - MORSE_bool check_info; - int iinfo; - int info; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work); - - dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - /* - * hwork => at least (IB+NB)*IB contains all hA and hL - * dwork => at least IB*NB - */ - hA = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work); - dwork = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - - hL = hA + lda*ib; - - /* Initialize L to 0 */ - memset(hL, 0, ib*ib*sizeof(cuDoubleComplex)); - - if ( ldl >= 2*ib ) { - /* Let's compute the inverses in the bottom part of L */ - dL += ib; - } else { - /* We prefer to stick with TRSM */ - dL = NULL; - hL = NULL; - } - - CUDA_zgetrf_incpiv( - MagmaColMajor, m, n, ib, - hA, lda, dA, lda, - hL, ib, dL, ldl, - IPIV, - dwork, lda, - &info ); - - cudaThreadSynchronize(); } -#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */ #endif /* !defined(CHAMELEON_SIMULATION) */ - /* * Codelet definition */ -#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) && ( defined(CHAMELEON_USE_MAGMA) ) -CODELETS(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func, cl_zgetrf_incpiv_cuda_func, 0) -#else CODELETS_CPU(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c index ab9d87ace..8ca85664b 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c @@ -123,35 +123,9 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo); CORE_zgetrf_nopiv(m, n, ib, A, lda, &info); } - -/* - * Codelet GPU - */ -#if defined(CHAMELEON_USE_MAGMA) -static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg) -{ - int m; - int n; - int ib; - cuDoubleComplex *dA; - int lda; - int iinfo; - - int info = 0; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo); - dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - CUDA_zgetrf_nopiv( m, n, dA, lda, &info ); - cudaThreadSynchronize(); -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) -CODELETS(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func, cl_zgetrf_nopiv_cuda_func, 0) -#else CODELETS_CPU(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index f578695ff..7f23c5682 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -77,30 +77,9 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA); CORE_zlauum(uplo, N, A, LDA); } - -#if defined(CHAMELEON_USE_MAGMA) -static void cl_zlauum_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_enum uplo; - int info = 0; - int N; - cuDoubleComplex *A; - int LDA; - - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA); - CUDA_zlauum( uplo, N, A, LDA, &info); - cudaThreadSynchronize(); - return; -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) -CODELETS(zlauum, 1, cl_zlauum_cpu_func, cl_zlauum_cuda_func, 0) -#else CODELETS_CPU(zlauum, 1, cl_zlauum_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 857cf3356..686814274 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -83,48 +83,10 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo); CORE_zpotrf(uplo, n, A, lda, &info); } - -#ifdef CHAMELEON_USE_MAGMA -static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg) -{ - cudaStream_t stream[2], currentt_stream; - MORSE_enum uplo; - int n; - cuDoubleComplex *A; - /* cuDoubleComplex *hA; */ - int lda; - int iinfo; - int info = 0; - - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo); - - /* /\* */ - /* * hwork => nb*nb */ - /* *\/ */ - /* hA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); */ - -/* stream[0] = starpu_cuda_get_local_stream(); */ -/* if ( cudaStreamCreate( stream+1 ) != CUDA_SUCCESS ){ */ -/* fprintf(stderr, "Error while creating stream in codelet_zpotrf\n"); */ -/* exit(-1); */ -/* } */ - - CUDA_zpotrf( uplo, n, A, lda, &info); - - cudaThreadSynchronize(); -/* cudaStreamDestroy( stream[1] ); */ - - return; -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined CHAMELEON_USE_MAGMA -CODELETS(zpotrf, 1, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, 0) -#else CODELETS_CPU(zpotrf, 1, cl_zpotrf_cpu_func) -#endif + diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c index a1fae08bb..6d1c3ee08 100644 --- a/runtime/starpu/codelets/codelet_zssssm.c +++ b/runtime/starpu/codelets/codelet_zssssm.c @@ -174,56 +174,10 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV); CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV); } - -#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) -static void cl_zssssm_cuda_func(void *descr[], void *cl_arg) -{ - int m1; - int n1; - int m2; - int n2; - int k; - int ib; - cuDoubleComplex *dA1; - int lda1; - cuDoubleComplex *dA2; - int lda2; - cuDoubleComplex *dL1; - int ldl1; - cuDoubleComplex *dL2; - int ldl2; - int *IPIV; - int info; - - starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV); - - dA1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - dA2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - dL1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - dL2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - - if ( ldl1 >= 2*ib ) { - /* dL1 stores L and invL and the kernel is just using the inverted part */ - dL1 += ib; - } - - CUDA_zssssm( - MagmaColMajor, m1, n1, m2, n2, k, ib, - dA1, lda1, dA2, lda2, - dL1, ldl1, dL2, ldl2, - IPIV, &info); - - cudaThreadSynchronize(); -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)) -CODELETS(zssssm, 4, cl_zssssm_cpu_func, cl_zssssm_cuda_func, 0) -#else CODELETS_CPU(zssssm, 4, cl_zssssm_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 1f619cd09..73d1a439e 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -85,32 +85,9 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo); CORE_ztrtri(uplo, diag, N, A, LDA, &info); } - -#if defined(CHAMELEON_USE_MAGMA) -static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_enum uplo; - MORSE_enum diag; - int N; - cuDoubleComplex *A; - int LDA; - int iinfo; - int info = 0; - - A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo); - CUDA_ztrtri( uplo, diag, N, A, LDA, &info); - cudaThreadSynchronize(); - return; -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) -CODELETS(ztrtri, 1, cl_ztrtri_cpu_func, cl_ztrtri_cuda_func, 0) -#else CODELETS_CPU(ztrtri, 1, cl_ztrtri_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c index 101feebfe..56c278094 100644 --- a/runtime/starpu/codelets/codelet_ztslqt.c +++ b/runtime/starpu/codelets/codelet_ztslqt.c @@ -174,54 +174,9 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg) WORK = TAU + chameleon_max( m, n ); CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); } - -#if defined(CHAMELEON_USE_MAGMA) && 0 -static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_starpu_ws_t *h_work; - int m; - int n; - int ib; - cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W; - cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W; - int lda1, lda2, ldt; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work); - - /* Gather pointer to data on device */ - d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*m + ib*ib */ - d_D = d_W + 2*ib*m; - - /* scratch data on host */ - /* ib*n + ib*n + max(m,n) + ib*m + ib*ib */ - h_A2 = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work); - h_T = h_A2 + ib*n; - h_TAU = h_T + ib*n; - h_W = h_TAU + chameleon_max(m,n); - h_D = h_W + ib*m; - - RUNTIME_getStream(stream); - CUDA_ztslqt( - m, n, ib, - d_A1, lda1, d_A2, lda2, - h_A2, ib, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W, stream); - - cudaThreadSynchronize(); -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if (defined(CHAMELEON_USE_MAGMA)) && 0 -CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0) -#else CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c index 4c5d03fe2..969da0611 100644 --- a/runtime/starpu/codelets/codelet_ztsqrt.c +++ b/runtime/starpu/codelets/codelet_ztsqrt.c @@ -165,53 +165,9 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg) WORK = TAU + chameleon_max( m, n ); CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); } - -#if defined(CHAMELEON_USE_MAGMA) -static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_starpu_ws_t *h_work; - int m; - int n; - int ib; - cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W; - cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W; - int lda1, lda2, ldt; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work); - - /* Gather pointer to data on device */ - d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*n + ib*ib */ - d_D = d_W + 2*ib*n; - - /* scratch data on host */ - /* m*ib + ib*ib + max(m,n) + ib*n + ib*ib */ - h_A2 = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work); - h_T = h_A2 + m*ib; - h_TAU = h_T + ib*ib; - h_W = h_TAU + chameleon_max(m,n); - h_D = h_W + ib*n; - - RUNTIME_getStream(stream); - CUDA_ztsqrt( - m, n, ib, - d_A1, lda1, d_A2, lda2, - h_A2, lda2, - d_T, ldt, h_T, ib, - d_D, h_D, ib, h_TAU, - h_W, d_W, stream); - cudaThreadSynchronize(); -} -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_MAGMA) -CODELETS(ztsqrt, 4, cl_ztsqrt_cpu_func, cl_ztsqrt_cuda_func, 0) -#else CODELETS_CPU(ztsqrt, 4, cl_ztsqrt_cpu_func) -#endif diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c index eae5108ee..59f742807 100644 --- a/runtime/starpu/codelets/codelet_ztstrf.c +++ b/runtime/starpu/codelets/codelet_ztstrf.c @@ -176,102 +176,11 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &IPIV, &d_work, &ldwork, &check_info, &iinfo); CORE_ztstrf(m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, &info); - -#if defined(CHAMELEON_USE_MAGMA) - /* - * L stores the following if enough place: - * L1 L2 L3 ... - * L1^-1 L2^-1 L3^-1 ... - */ - /* Compute L-1 in lower rectangle of L */ - if ( ldl >= 2*ib ) - { - int i, sb; - for (i=0; i<n; i+=ib) { - sb = chameleon_min( ib, n-i ); - CORE_zlacpy(MorseUpperLower, sb, sb, L+(i*ldl), ldl, L+(i*ldl)+ib, ldl ); - - CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl)+ib, ldl, &info ); - if (info != 0 ) { - fprintf(stderr, "ERROR, trtri returned with info = %d\n", info); - } - } - } -#endif -} - - -/* - * Codelet GPU - */ -/* TODO/WARNING: tstrf is not working on GPU for now */ -#if defined(CHAMELEON_USE_MAGMA) && 0 -static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg) -{ - MORSE_starpu_ws_t *d_work; - int m; - int n; - int ib; - int nb; - cuDoubleComplex *hU, *dU; - int ldu; - cuDoubleComplex *hA, *dA; - int lda; - cuDoubleComplex *hL, *dL; - int ldl; - int *ipiv; - cuDoubleComplex *hw2, *hw, *dw; - int ldwork; - MORSE_bool check_info; - int iinfo; - int info; - - starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &ipiv, - &d_work, &ldwork, &check_info, &iinfo); - - dU = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); - dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); - dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); - - /* - * hwork => 2*nb*(2*ib+2nb) - * dwork => 2*ib*nb - */ - hw2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); - dw = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(d_work); - - hU = hw2; - hA = hU + ldu * nb; - hL = hA + lda * nb; - hw = hL + ldl * nb; - - /* Download first panel from A and U */ - cublasGetMatrix( nb, n, sizeof(cuDoubleComplex), dU, ldu, hU, ldu ); - cublasGetMatrix( m, ib, sizeof(cuDoubleComplex), dA, lda, hA, lda ); - - /* Initialize L to 0 */ - memset(hL, 0, ldl*nb*sizeof(cuDoubleComplex)); - - CUDA_ztstrf( - MagmaColMajor, m, n, ib, nb, - hU, ldu, dU, ldu, - hA, lda, dA, lda, - hL, ldl, dL, ldl, - ipiv, - hw, ldwork, dw, lda, - &info ); - - cudaThreadSynchronize(); } -#endif #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if (defined(CHAMELEON_USE_MAGMA) && 0) -CODELETS(ztstrf, 4, cl_ztstrf_cpu_func, cl_ztstrf_cuda_func, 0) -#else CODELETS_CPU(ztstrf, 4, cl_ztstrf_cpu_func) -#endif -- GitLab