From 1841686627f2d91cfa6234ea0ea947140557c4dc Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Thu, 1 Dec 2016 16:54:43 +0000 Subject: [PATCH] It seems that almost everything is back --- CMakeLists.txt | 8 +- compute/CMakeLists.txt | 12 +++ compute/pzgelqf.c | 8 +- compute/pzgeqrfrh.c | 2 - control/common.h | 7 -- control/compute_z.h | 2 +- control/control.c | 3 +- control/descriptor.h | 1 + cudablas/compute/CMakeLists.txt | 25 +++--- cudablas/compute/cuda_zgemerge.c | 79 +++---------------- cudablas/compute/cuda_ztsmlq.c | 2 +- runtime/starpu/codelets/codelet_zgelqt.c | 2 +- runtime/starpu/codelets/codelet_zgemm.c | 6 +- runtime/starpu/codelets/codelet_zgeqrt.c | 5 +- runtime/starpu/codelets/codelet_zgessm.c | 4 +- .../starpu/codelets/codelet_zgetrf_incpiv.c | 5 +- .../starpu/codelets/codelet_zgetrf_nopiv.c | 4 +- runtime/starpu/codelets/codelet_zhemm.c | 6 +- runtime/starpu/codelets/codelet_zher2k.c | 6 +- runtime/starpu/codelets/codelet_zherk.c | 6 +- runtime/starpu/codelets/codelet_zlauum.c | 4 +- runtime/starpu/codelets/codelet_zpotrf.c | 4 +- runtime/starpu/codelets/codelet_zssssm.c | 4 +- runtime/starpu/codelets/codelet_zsymm.c | 6 +- runtime/starpu/codelets/codelet_zsyr2k.c | 6 +- runtime/starpu/codelets/codelet_zsyrk.c | 6 +- runtime/starpu/codelets/codelet_ztrmm.c | 6 +- runtime/starpu/codelets/codelet_ztrsm.c | 6 +- runtime/starpu/codelets/codelet_ztrtri.c | 4 +- runtime/starpu/codelets/codelet_ztslqt.c | 4 +- runtime/starpu/codelets/codelet_ztsmlq.c | 4 +- runtime/starpu/codelets/codelet_ztsmqr.c | 5 +- runtime/starpu/codelets/codelet_ztsqrt.c | 5 +- runtime/starpu/codelets/codelet_ztstrf.c | 5 +- runtime/starpu/codelets/codelet_zunmlq.c | 4 +- runtime/starpu/codelets/codelet_zunmqr.c | 4 +- runtime/starpu/control/runtime_descriptor.c | 17 ++-- runtime/starpu/control/runtime_workspace.c | 8 +- 38 files changed, 95 insertions(+), 200 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb17b06b3..ff0813519 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,15 +161,15 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/cmake_modules/") endif() # Use intermediate variable since cmake_dependent_option doesn't have OR conditions - set(CHAMELEON_ENABLE_MPI OFF CACHE BOOL "Tells if MPI might be supported by the runtime") + set(CHAMELEON_ENABLE_MPI OFF CACHE INTERNAL "Tells if MPI might be supported by the runtime") if ( CHAMELEON_SCHED_PARSEC OR CHAMELEON_SCHED_STARPU ) - set(CHAMELEON_ENABLE_MPI ON) + set(CHAMELEON_ENABLE_MPI ON FORCE) endif() # Use intermediate variable since cmake_dependent_option doesn't have OR conditions - set(CHAMELEON_ENABLE_CUDA OFF CACHE BOOL "Tells if CUDA might be supported by the runtime") + set(CHAMELEON_ENABLE_CUDA OFF CACHE INTERNAL "Tells if CUDA might be supported by the runtime") if ( CHAMELEON_SCHED_PARSEC OR CHAMELEON_SCHED_STARPU ) - set(CHAMELEON_ENABLE_CUDA ON) + set(CHAMELEON_ENABLE_CUDA ON FORCE) endif() # Additional options diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 165e60480..229ff8102 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -26,6 +26,18 @@ # ### +option(CHAMELEON_COPY_DIAG + "This options enables the duplication of the diagonal tiles in some algorithm to avoid anti-dependencies on lower/upper triangular parts (Might be useful to StarPU)" ON) +mark_as_advanced(CHAMELEON_COPY_DIAG) + +if ( CHAMELEON_SCHED_QUARK ) + # No need for those extra diagonale tiles + set( CHAMELEON_COPY_DIAG OFF ) +endif() + +if (CHAMELEON_COPY_DIAG) + add_definitions(-DCHAMELEON_COPY_DIAG) +endif() # Define the list of sources # -------------------------- diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c index 74a07ffba..99c5a88c7 100644 --- a/compute/pzgelqf.c +++ b/compute/pzgelqf.c @@ -53,7 +53,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, int k, m, n; int ldak, ldam; int tempkm, tempkn, tempmm, tempnn; - int ib; + int ib, minMT; morse = morse_context_self(); if (sequence->status != MORSE_SUCCESS) @@ -62,6 +62,12 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, ib = MORSE_IB; + if (A->m > A->n) { + minMT = A->nt; + } else { + minMT = A->mt; + } + /* * zgelqt = A->nb * (ib+1) * zunmlq = A->nb * ib diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c index 199539ad5..e4c1ba524 100644 --- a/compute/pzgeqrfrh.c +++ b/compute/pzgeqrfrh.c @@ -87,9 +87,7 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, */ ws_worker = max( ws_worker, ib * (ib + A->nb) ); ws_worker = max( ws_worker, ib * A->nb * 2 ); -#endif -#if defined(CHAMELEON_USE_MAGMA) /* Host space * * zgeqrt = ib * (A->nb+3*ib) + A->nb ) diff --git a/control/common.h b/control/common.h index 1f52e0567..7f9a6beaf 100644 --- a/control/common.h +++ b/control/common.h @@ -143,13 +143,6 @@ #define MORSE_MPI_SIZE morse->mpi_comm_size #endif -/******************************************************************************* - * Activate copy of diagonal tile (StarPU only) for some tile algorithms (pz) - **/ -#if defined(CHAMELEON_SCHED_STARPU) -#define CHAMELEON_COPY_DIAG -#endif - /******************************************************************************* * IPT internal define **/ diff --git a/control/compute_z.h b/control/compute_z.h index 78656825a..53ee78e87 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -102,7 +102,7 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_seq void morse_pzgetrf_nopiv(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetrf_reclap(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzgetrf_rectil(MORSE_desc_t *A, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request); -void morse_pzhbcpy_t2bl(MORSE_enum uplo, MORSE_desc_t *A, MORSE_Complex64_t *AB, int LDAB, MORSE_sequence_t *sequence, MORSE_request_t *request); +void morse_pzhbcpy_t2bl(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *AB, MORSE_sequence_t *sequence, MORSE_request_t *request); void morse_pzhegst(MORSE_enum itype, MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request); #ifdef COMPLEX void morse_pzhemm(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request); diff --git a/control/control.c b/control/control.c index 900551a56..8098178bb 100644 --- a/control/control.c +++ b/control/control.c @@ -145,12 +145,13 @@ int MORSE_Finalize(void) #if defined(CHAMELEON_USE_MAGMA) && !defined(CHAMELEON_SIMULATION) magma_finalize(); #endif - morse_context_destroy(); + #if defined(CHAMELEON_USE_MPI) if (!morse->mpi_outer_init) MPI_Finalize(); #endif + morse_context_destroy(); return MORSE_SUCCESS; } diff --git a/control/descriptor.h b/control/descriptor.h index cea048776..09d3825ae 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -177,6 +177,7 @@ inline static int morse_getrankof_2d(const MORSE_desc_t *desc, int m, int n) **/ inline static int morse_getrankof_2d_diag(const MORSE_desc_t *desc, int m, int n) { + assert( n == 0 ); return (m % desc->p) * desc->q + (m % desc->q); } diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index 91550a049..7ef91aee0 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -27,33 +27,40 @@ # ------------------------------------------------------ set(CUDABLAS_SRCS_GENERATED "") set(ZSRC - cuda_zgelqt.c cuda_zgemerge.c cuda_zgemm.c - cuda_zgeqrt.c - cuda_zgessm.c - cuda_zgetrf.c cuda_zhemm.c cuda_zher2k.c cuda_zherk.c - cuda_zlauum.c cuda_zparfb.c - cuda_zpotrf.c - cuda_zssssm.c cuda_zsymm.c cuda_zsyr2k.c cuda_zsyrk.c cuda_ztrmm.c cuda_ztrsm.c - cuda_ztrtri.c - cuda_ztslqt.c cuda_ztsmlq.c cuda_ztsmqr.c + ) + +if( CHAMELEON_USE_MAGMA ) + set(ZSRC + ${ZSRC} + cuda_zgelqt.c + cuda_zgeqrt.c + cuda_zgessm.c + cuda_zgetrf.c + cuda_zlauum.c + cuda_zparfb.c + cuda_zpotrf.c + cuda_zssssm.c + cuda_ztrtri.c + cuda_ztslqt.c cuda_ztsqrt.c cuda_ztstrf.c cuda_zunmlqt.c cuda_zunmqrt.c ) +endif() precisions_rules_py(CUDABLAS_SRCS_GENERATED "${ZSRC}" PRECISIONS "${CHAMELEON_PRECISION}") diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c index 76b0771ca..f04561eaf 100644 --- a/cudablas/compute/cuda_zgemerge.c +++ b/cudablas/compute/cuda_zgemerge.c @@ -24,28 +24,15 @@ **/ #include "cudablas/include/cudablas.h" -#if defined(CHAMELEON_USE_MAGMA) -#if defined(CHAMELEON_USE_CUBLAS_V2) -int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, - int M, int N, cuDoubleComplex *A, int LDA, - cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM) +int +CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, + int M, int N, + cuDoubleComplex *A, int LDA, + cuDoubleComplex *B, int LDB, + CUBLAS_STREAM_PARAM) { int i, j; - magmaDoubleComplex *cola, *colb; - cublasHandle_t handle; - cublasStatus_t stat; - - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("CUBLAS initialization failed\n"); - assert( stat == CUBLAS_STATUS_SUCCESS ); - } - - stat = cublasSetStream(handle, stream); - if (stat != CUBLAS_STATUS_SUCCESS) { - printf ("cublasSetStream failed\n"); - assert( stat == CUBLAS_STATUS_SUCCESS ); - } + cuDoubleComplex *cola, *colb; if (M < 0) { return -1; @@ -60,55 +47,9 @@ int CUDA_zgemerge( MORSE_enum side, MORSE_enum diag, return -7; } - if (side == MagmaLeft){ - for(i=0; i<N; i++){ - cola = A + i*LDA; - colb = B + i*LDB; -// cublasZcopy(handle, i+1, cola, 1, colb, 1); - cudaMemcpyAsync(colb , cola, - (i+1)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); - } - }else{ - for(i=0; i<N; i++){ - cola = A + i*LDA; - colb = B + i*LDB; -// cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1); - cudaMemcpyAsync(colb+i , cola+i, - (M-i)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); - } - } - - cublasDestroy(handle); - - return MORSE_SUCCESS; -} -#else /* CHAMELEON_USE_CUBLAS_V2 */ -int CUDA_zgemerge( - magma_side_t side, magma_diag_t diag, - magma_int_t M, magma_int_t N, - magmaDoubleComplex *A, magma_int_t LDA, - magmaDoubleComplex *B, magma_int_t LDB, - CUstream stream) -{ - int i, j; - magmaDoubleComplex *cola, *colb; - - if (M < 0) { - return -1; - } - if (N < 0) { - return -2; - } - if ( (LDA < max(1,M)) && (M > 0) ) { - return -5; - } - if ( (LDB < max(1,M)) && (M > 0) ) { - return -7; - } + CUBLAS_GET_STREAM; - if (side == MagmaLeft){ + if (side == MorseLeft){ for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; @@ -128,5 +69,3 @@ int CUDA_zgemerge( return MORSE_SUCCESS; } -#endif /* CHAMELEON_USE_CUBLAS_V2 */ -#endif diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c index 6c525138a..0a44d5800 100644 --- a/cudablas/compute/cuda_ztsmlq.c +++ b/cudablas/compute/cuda_ztsmlq.c @@ -55,7 +55,7 @@ int CUDA_ztsmlq( NW = IB; } else { - NW = M1; + NW = N1; } if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) { diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c index 987b9264a..8e7dbdcba 100644 --- a/runtime/starpu/codelets/codelet_zgelqt.c +++ b/runtime/starpu/codelets/codelet_zgelqt.c @@ -194,8 +194,8 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif //defined(CHAMELEON_USE_MAGMA) +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index da14986cd..1ff16ebf7 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -129,11 +129,9 @@ static void cl_zgemm_cpu_func(void *descr[], void *cl_arg) B, ldb, beta, C, ldc); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum transA; @@ -172,9 +170,7 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum transA; @@ -213,9 +209,9 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c index 88675d977..c375bf281 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -151,10 +151,9 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg) WORK = TAU + max( m, n ); CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK); } -#endif //!defined(CHAMELEON_SIMULATION) + #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -196,8 +195,8 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif //defined(CHAMELEON_USE_MAGMA) +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index 3c7c1f052..a32507894 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -133,10 +133,8 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &ldl, &ldd, &lda); CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) -#if !defined(CHAMELEON_SIMULATION) static void cl_zgessm_cuda_func(void *descr[], void *cl_arg) { int m; @@ -163,8 +161,8 @@ static void cl_zgessm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif //defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c index ad7b32dcf..5b0505322 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c @@ -171,13 +171,12 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg) } #endif } -#endif //!defined(CHAMELEON_SIMULATION) + /* * Codelet GPU */ #if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) -#if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg) { int m; @@ -228,8 +227,8 @@ static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif //defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) +#endif //!defined(CHAMELEON_SIMULATION) /* diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c index 275ff280a..fcfd96fc3 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c @@ -119,13 +119,11 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo); CORE_zgetrf_nopiv(m, n, ib, A, lda, &info); } -#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet GPU */ #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg) { int m; @@ -142,8 +140,8 @@ static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg) CUDA_zgetrf_nopiv( m, n, dA, lda, &info ); cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 9accae112..705c3341f 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -99,11 +99,9 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg) B, LDB, beta, C, LDC); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -143,9 +141,7 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -183,9 +179,9 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 794c474ea..03e9babea 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -96,11 +96,9 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg) CORE_zher2k(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -135,9 +133,7 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -171,9 +167,9 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 754b61ff8..385645231 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -90,11 +90,9 @@ static void cl_zherk_cpu_func(void *descr[], void *cl_arg) alpha, A, lda, beta, C, ldc); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zherk_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -132,9 +130,7 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zherk_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -168,9 +164,9 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index 27a888b28..a5272248b 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -73,10 +73,8 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA); CORE_zlauum(uplo, N, A, LDA); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_zlauum_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -91,8 +89,8 @@ static void cl_zlauum_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 13aedc1fb..d0a658443 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -79,10 +79,8 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo); CORE_zpotrf(uplo, n, A, lda, &info); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_MAGMA -#if !defined(CHAMELEON_SIMULATION) static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg) { cudaStream_t stream[2], currentt_stream; @@ -115,8 +113,8 @@ static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c index 0bd1944b9..cf766396b 100644 --- a/runtime/starpu/codelets/codelet_zssssm.c +++ b/runtime/starpu/codelets/codelet_zssssm.c @@ -170,10 +170,8 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV); CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) -#if !defined(CHAMELEON_SIMULATION) static void cl_zssssm_cuda_func(void *descr[], void *cl_arg) { int m1; @@ -213,8 +211,8 @@ static void cl_zssssm_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index 02be8a966..c193f200d 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -99,11 +99,9 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg) B, LDB, beta, C, LDC); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -141,9 +139,7 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -181,9 +177,9 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index b07e5456d..e07c22f0d 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -96,11 +96,9 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg) CORE_zsyr2k(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -135,9 +133,7 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -171,9 +167,9 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 6d29911c1..a4c12e551 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -91,11 +91,9 @@ static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg) alpha, A, lda, beta, C, ldc); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -125,9 +123,7 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -161,9 +157,9 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index f353f6827..bd9787389 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -94,11 +94,9 @@ static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg) alpha, A, LDA, B, LDB); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -133,9 +131,7 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -171,9 +167,9 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 8796ca621..9d9138990 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -116,11 +116,9 @@ static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg) alpha, A, lda, B, ldb); } -#endif //!defined(CHAMELEON_SIMULATION) #ifdef CHAMELEON_USE_CUDA #if defined(CHAMELEON_USE_CUBLAS_V2) -#if !defined(CHAMELEON_SIMULATION) static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -156,9 +154,7 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #else /* CHAMELEON_USE_CUBLAS_V2 */ -#if !defined(CHAMELEON_SIMULATION) static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -193,9 +189,9 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* CHAMELEON_USE_CUBLAS_V2 */ #endif /* CHAMELEON_USE_CUDA */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 516a4db4e..f5bfbfe4d 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -81,10 +81,8 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo); CORE_ztrtri(uplo, diag, N, A, LDA, &info); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg) { MORSE_enum uplo; @@ -101,8 +99,8 @@ static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); return; } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c index a9b1fed72..e97dad0d6 100644 --- a/runtime/starpu/codelets/codelet_ztslqt.c +++ b/runtime/starpu/codelets/codelet_ztslqt.c @@ -170,10 +170,8 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg) WORK = TAU + max( m, n ); CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) && 0 -#if !defined(CHAMELEON_SIMULATION) static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -213,8 +211,8 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c index d582c0d99..cba63b524 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq.c +++ b/runtime/starpu/codelets/codelet_ztsmlq.c @@ -212,10 +212,8 @@ static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg) CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_CUDA) -#if !defined(CHAMELEON_SIMULATION) static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -262,8 +260,8 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) cudaStreamSynchronize( stream ); #endif } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* defined(CHAMELEON_USE_CUDA) */ +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index 9f9528d7b..d83d1d2f4 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -242,10 +242,9 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg) CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork); } -#endif //!defined(CHAMELEON_SIMULATION) + #if defined(CHAMELEON_USE_CUDA) -#if !defined(CHAMELEON_SIMULATION) static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) { MORSE_enum side; @@ -293,8 +292,8 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) cudaStreamSynchronize( stream ); #endif } -#endif //!defined(CHAMELEON_SIMULATION) #endif /* defined(CHAMELEON_USE_CUDA) */ +#endif //!defined(CHAMELEON_SIMULATION) /* diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c index 9fdc44931..7cbff9edd 100644 --- a/runtime/starpu/codelets/codelet_ztsqrt.c +++ b/runtime/starpu/codelets/codelet_ztsqrt.c @@ -161,10 +161,8 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg) WORK = TAU + max( m, n ); CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *h_work; @@ -203,8 +201,9 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) h_W, d_W, stream); cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) + /* * Codelet definition */ diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c index cc83c63e8..4ce5e815c 100644 --- a/runtime/starpu/codelets/codelet_ztstrf.c +++ b/runtime/starpu/codelets/codelet_ztstrf.c @@ -195,14 +195,13 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg) } #endif } -#endif //!defined(CHAMELEON_SIMULATION) + /* * Codelet GPU */ /* TODO/WARNING: tstrf is not working on GPU for now */ #if defined(CHAMELEON_USE_MAGMA) && 0 -#if !defined(CHAMELEON_SIMULATION) static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -260,8 +259,8 @@ static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index 4f03d8e70..fac849c0d 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -184,10 +184,8 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) CORE_zunmlq(side, trans, m, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -215,8 +213,8 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 306e4dc14..b4c14f7b8 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -210,10 +210,8 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) CORE_zunmqr(side, trans, m, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork); } -#endif //!defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_MAGMA) -#if !defined(CHAMELEON_SIMULATION) static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) { MORSE_starpu_ws_t *d_work; @@ -241,8 +239,8 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) cudaThreadSynchronize(); } -#endif //!defined(CHAMELEON_SIMULATION) #endif +#endif //!defined(CHAMELEON_SIMULATION) /* * Codelet definition diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 0e86da381..d6a2c670b 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -295,37 +295,40 @@ int RUNTIME_desc_getoncpu( MORSE_desc_t *desc ) void *RUNTIME_desc_getaddr( const MORSE_desc_t *desc, int m, int n ) { + int64_t im = m + (desc->i / desc->mb); + int64_t jn = n + (desc->j / desc->nb); + starpu_data_handle_t *ptrtile = (starpu_data_handle_t*)(desc->schedopt); - ptrtile += ((int64_t)(desc->lmt) * (int64_t)n + (int64_t)m); + ptrtile += ((int64_t)(desc->lmt) * (int64_t)jn + (int64_t)im); if (*ptrtile == NULL) { int64_t eltsze = MORSE_Element_Size(desc->dtyp); int myrank = desc->myrank; int owner = desc->get_rankof( desc, m, n ); - int tempmm = (m == desc->lmt-1) ? (desc->lm - m * desc->mb) : desc->mb; - int tempnn = (n == desc->lnt-1) ? (desc->ln - n * desc->nb) : desc->nb; + int tempmm = (im == desc->lmt-1) ? (desc->lm - im * desc->mb) : desc->mb; + int tempnn = (jn == desc->lnt-1) ? (desc->ln - jn * desc->nb) : desc->nb; if ( myrank == owner ) { if ( desc->get_blkaddr(desc, m, n) == (void*)NULL ) { starpu_matrix_data_register(ptrtile, -1, (uintptr_t) NULL, - BLKLDD(desc, m), tempmm, tempnn, eltsze); + BLKLDD(desc, im), tempmm, tempnn, eltsze); } else { starpu_matrix_data_register(ptrtile, STARPU_MAIN_RAM, (uintptr_t)desc->get_blkaddr(desc, m, n), - BLKLDD(desc, m), tempmm, tempnn, eltsze); + BLKLDD(desc, im), tempmm, tempnn, eltsze); } } else { starpu_matrix_data_register(ptrtile, -1, (uintptr_t) NULL, - BLKLDD(desc, m), tempmm, tempnn, eltsze); + BLKLDD(desc, im), tempmm, tempnn, eltsze); } #if defined(CHAMELEON_USE_MPI) { - int64_t block_ind = desc->lmt * n + m; + int64_t block_ind = desc->lmt * jn + im; starpu_mpi_data_register(*ptrtile, (desc->id << tag_sep) | (block_ind), owner); } #endif /* defined(CHAMELEON_USE_MPI) */ diff --git a/runtime/starpu/control/runtime_workspace.c b/runtime/starpu/control/runtime_workspace.c index 711652562..d67f95a82 100644 --- a/runtime/starpu/control/runtime_workspace.c +++ b/runtime/starpu/control/runtime_workspace.c @@ -52,14 +52,12 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg) } } else - { #endif + { /* This buffer should only be used within the CPU kernel, so * there is no point in using pinned memory here. */ workspace->workspaces[id] = malloc(workspace->size); -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) } -#endif assert(workspace->workspaces[id]); } @@ -87,12 +85,10 @@ static void RUNTIME_free_workspace_on_workers(void *arg) } } else - { #endif + { free(workspace->workspaces[id]); -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) } -#endif workspace->workspaces[id] = NULL; } -- GitLab