diff --git a/CMakeLists.txt b/CMakeLists.txt index 0501e92baa1ee73ba77ecba6de6c5eb086aab5d4..66952aa9d463f50636f5c7edd2891a29f7727465 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,8 @@ include(GenPkgConfig) # Parameters/Options # ###################### +set(CHAMELEON_DEFINITIONS_LIST "") + # Add define for Fortran Mangling (should be defined somewhere else) # ------------------------------------------------------------------ add_definitions(-DADD_) @@ -686,6 +688,7 @@ endif() ############################################################################### # Print Options # ################# +get_directory_property( CHAMELEON_DEFINITIONS_LIST DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS ) include(PrintOpts) ### diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index ca94fdd206bf1d4e8748981b79cb0cd3a89e1bfc..8e52cb94b6268db109386e6957c39a5ac70709a5 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -121,27 +121,30 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int ldc; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasOperation_t cublasTransA; + cublasOperation_t cublasTransB; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &lda, &ldb, &beta, &ldc); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasOperation_t cublasTransA; if (transA == MorseNoTrans){ cublasTransA = CUBLAS_OP_N; }else if(transA == MorseTrans){ @@ -151,7 +154,6 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zgemm_cuda_func: bad transA parameter %d\n", transA); } - cublasOperation_t cublasTransB; if (transB == MorseNoTrans){ cublasTransB = CUBLAS_OP_N; }else if(transB == MorseTrans){ diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c index f22e424249237e57ebafe08cfa71ce5dfc2d8368..4171aa70e77fd6dd073fbe4d3795e875a8d74179 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -162,14 +162,17 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag, { int i, j; magmaDoubleComplex *cola, *colb; + CUstream stream; cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + cublasStatus_t stat; + + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); @@ -234,13 +237,19 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag, for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; - cublasZcopy(i+1, cola, 1, colb, 1); + //cublasZcopy(i+1, cola, 1, colb, 1); + cudaMemcpy(colb , cola, + (i+1)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice ); } }else{ for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; - cublasZcopy(M-i, cola + i, 1, colb + i, 1); + //cublasZcopy(M-i, cola + i, 1, colb + i, 1); + cudaMemcpy(colb+i , cola+i, + (M-i)*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToDevice ); } } @@ -291,9 +300,13 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex)); /* copy first panel of A on the host */ - cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex), - da_ref(0, 0), ldda, - v, ldv); +// cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex), +// da_ref(0, 0), ldda, +// v, ldv); + /* copy first panel of A on the host */ + cudaMemcpy( v, da_ref(0,0), + m*min(nb,n)*sizeof(magmaDoubleComplex), + cudaMemcpyDeviceToHost ); /* Use blocked code initially */ for (i = 0; i < k; i += nb) { @@ -305,9 +318,13 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, if (i>0){ /* copy panel of A from device to host */ - cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex), - da_ref(0, i), ldda, - v, ldv); +// cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex), +// da_ref(0, i), ldda, +// v, ldv); + /* copy panel of A from device to host */ + cudaMemcpy( v, da_ref(0,i), + m*ib*sizeof(magmaDoubleComplex), + cudaMemcpyDeviceToHost ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index e7c297dc5ce8ff90c876977db4437af58ae80f1f..1b4b1241b630ef62e11c2af655dc3fd4118e7adc 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -117,27 +117,30 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int LDC; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasSideMode_t cublasSide; + cublasFillMode_t cublasUplo; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasSideMode_t cublasSide; if (side == MorseLeft){ cublasSide = CUBLAS_SIDE_LEFT; }else if (side == MorseRight){ @@ -145,7 +148,6 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zhemm_cuda_func: bad side parameter %d\n", side); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 4755ca80ef182c1b858770c5f60995f34ff30ec2..bd7da41cf61552d661b0ade9a5dc0e62277bc4ea 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -112,27 +112,30 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) double beta; cuDoubleComplex *C; int ldc; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasFillMode_t cublasUplo; + cublasOperation_t cublasTrans; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ @@ -142,8 +145,6 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zher2k_cuda_func: bad uplo parameter %d\n", uplo); } - - cublasOperation_t cublasTrans; if (trans == MorseNoTrans){ cublasTrans = CUBLAS_OP_N; }else if(trans == MorseTrans){ diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 18584b8636cb8a67211cfefb9b89eea6e70b5daa..44ae4f252a1ac2edcf76ae10fcd1838a3ea696fe 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -106,26 +106,29 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) double beta; cuDoubleComplex *C; int ldc; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasFillMode_t cublasUplo; + cublasOperation_t cublasTrans; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ @@ -135,8 +138,6 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zherk_cuda_func: bad uplo parameter %d\n", uplo); } - - cublasOperation_t cublasTrans; if (trans == MorseNoTrans){ cublasTrans = CUBLAS_OP_N; }else if(trans == MorseTrans){ diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index ae1903daabf8b3bae8333d2bdfa31894c546697a..173d6d3b85562aea11b53a2f7214674e5d0ac0ec 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -117,27 +117,30 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int LDC; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasSideMode_t cublasSide; + cublasFillMode_t cublasUplo; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasSideMode_t cublasSide; if (side == MorseLeft){ cublasSide = CUBLAS_SIDE_LEFT; }else if (side == MorseRight){ @@ -145,7 +148,6 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zsymm_cuda_func: bad side parameter %d\n", side); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 2d99bdb82e8d11a97e699f71fb33b72fd834b6a3..2922de6fc788f852d80ee13756ee553274d52247 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -112,27 +112,30 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int ldc; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasFillMode_t cublasUplo; + cublasOperation_t cublasTrans; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ @@ -142,8 +145,6 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zsyr2k_cuda_func: bad uplo parameter %d\n", uplo); } - - cublasOperation_t cublasTrans; if (trans == MorseNoTrans){ cublasTrans = CUBLAS_OP_N; }else if(trans == MorseTrans){ diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 8803124586868e7629669f18ad9aa132a31a1c2b..0f69498a4e77cd61b5c9515560c4a6d7d6d770c2 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -107,26 +107,29 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int ldc; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasFillMode_t cublasUplo; + cublasOperation_t cublasTrans; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ @@ -136,8 +139,6 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_zsyrk_cuda_func: bad uplo parameter %d\n", uplo); } - - cublasOperation_t cublasTrans; if (trans == MorseNoTrans){ cublasTrans = CUBLAS_OP_N; }else if(trans == MorseTrans){ diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 497ca32ef4801416429d041159ee601a82fd41b9..510005d40d382230dcf86d242e4f28d69e06c5d0 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -111,26 +111,31 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) int LDA; cuDoubleComplex *B; int LDB; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasSideMode_t cublasSide; + cublasFillMode_t cublasUplo; + cublasOperation_t cublasTransA; + cublasDiagType_t cublasDiag; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha, &LDA, &LDB); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasSideMode_t cublasSide; if (side == MorseLeft){ cublasSide = CUBLAS_SIDE_LEFT; }else if (side == MorseRight){ @@ -138,7 +143,6 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad side parameter %d\n", side); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ @@ -148,7 +152,6 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad uplo parameter %d\n", uplo); } - cublasOperation_t cublasTransA; if (transA == MorseNoTrans){ cublasTransA = CUBLAS_OP_N; }else if(transA == MorseTrans){ @@ -158,7 +161,6 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad transA parameter %d\n", transA); } - cublasDiagType_t cublasDiag; if (diag == MorseNonUnit){ cublasDiag = CUBLAS_DIAG_NON_UNIT; }else if(diag == MorseUnit){ diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index cca877b5635e4cf71d6479078011d1f6dd748c35..0b100e81a01d055a030dd5f0be086ff5894d20d6 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -111,26 +111,31 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) int lda; cuDoubleComplex *B; int ldb; + CUstream stream; + cublasHandle_t handle; + cublasStatus_t stat; + cublasSideMode_t cublasSide; + cublasFillMode_t cublasUplo; + cublasOperation_t cublasTransA; + cublasDiagType_t cublasDiag; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha, &lda, &ldb); - cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - CUstream stream = starpu_cuda_get_local_stream(); + stream = starpu_cuda_get_local_stream(); stat = cublasSetStream(handle, stream); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("cublasSetStream failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasSideMode_t cublasSide; if (side == MorseLeft){ cublasSide = CUBLAS_SIDE_LEFT; }else if (side == MorseRight){ @@ -138,7 +143,6 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad side parameter %d\n", side); } - cublasFillMode_t cublasUplo; if (uplo == MorseUpper){ cublasUplo = CUBLAS_FILL_MODE_UPPER; }else if(uplo == MorseLower){ @@ -148,7 +152,6 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad uplo parameter %d\n", uplo); } - cublasOperation_t cublasTransA; if (transA == MorseNoTrans){ cublasTransA = CUBLAS_OP_N; }else if(transA == MorseTrans){ @@ -158,7 +161,6 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) }else{ fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad transA parameter %d\n", transA); } - cublasDiagType_t cublasDiag; if (diag == MorseNonUnit){ cublasDiag = CUBLAS_DIAG_NON_UNIT; }else if(diag == MorseUnit){ diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index 8cfa12803a19361918145499979d0732df07f5bb..e88ca291d05880f79ded733d97daf0c942ef48a8 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -245,7 +245,12 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, magma_trans_t transW; magma_trans_t transA2; cublasHandle_t handle; - cublasStatus_t stat = cublasCreate(&handle); + cublasStatus_t stat; + cublasOperation_t cublasTrans; + cublasOperation_t cublasTransW; + cublasOperation_t cublasTransA2; + + stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); assert( stat == CUBLAS_STATUS_SUCCESS ); @@ -257,7 +262,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, assert( stat == CUBLAS_STATUS_SUCCESS ); } - cublasOperation_t cublasTrans; if (trans == MagmaNoTrans){ cublasTrans = CUBLAS_OP_N; }else if(trans == MagmaTrans){ @@ -329,7 +333,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, transW = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; - cublasOperation_t cublasTransW; if (transW == MagmaNoTrans){ cublasTransW = CUBLAS_OP_N; }else if(transW == MagmaTrans){ @@ -339,7 +342,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, }else{ fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW); } - cublasOperation_t cublasTransA2; if (transA2 == MagmaNoTrans){ cublasTransA2 = CUBLAS_OP_N; }else if(transA2 == MagmaTrans){ @@ -358,7 +360,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, (const cuDoubleComplex *) &zone, (cuDoubleComplex*)WORK /* K*N1 */, LDWORK); - WORKC = NULL; if (WORKC == NULL) { /* W = op(T) * W */ cublasZtrmm( handle, @@ -437,7 +438,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, transW = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans; transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans; - cublasOperation_t cublasTransW; if (transW == MagmaNoTrans){ cublasTransW = CUBLAS_OP_N; }else if(transW == MagmaTrans){ @@ -447,7 +447,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, }else{ fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW); } - cublasOperation_t cublasTransA2; if (transA2 == MagmaNoTrans){ cublasTransA2 = CUBLAS_OP_N; }else if(transA2 == MagmaTrans){ @@ -466,7 +465,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, (const cuDoubleComplex *) &zone, (cuDoubleComplex*)WORK /* M1*K */, LDWORK); - WORKC = NULL; if (WORKC == NULL) { /* W = W * op(T) */ cublasZtrmm( handle, @@ -548,7 +546,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans, magmaDoubleComplex *WORK, magma_int_t LDWORK, magmaDoubleComplex *WORKC, magma_int_t LDWORKC, CUstream stream) - { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c index 0aa41ebdfef0fd13ae1834241c2b455179cadd1d..9eedc071b00c1bac821a21741b27df3723c8fc22 100644 --- a/runtime/starpu/codelets/codelet_ztsqrt.c +++ b/runtime/starpu/codelets/codelet_ztsqrt.c @@ -213,11 +213,17 @@ magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex), da1_ref(0, 0), ldda1, d, ldd); +// cudaMemcpy( d, da1_ref(0,0), +// nb*nb*sizeof(cuDoubleComplex), +// cudaMemcpyDeviceToHost ); /* copy first panel of A2 from device to host: da2 -> a2 */ - cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex), - da2_ref(0, 0), ldda2, - a2, lda2); +// cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex), +// da2_ref(0, 0), ldda2, +// a2, lda2); + cudaMemcpy( a2, da2_ref(0, 0), + m*nb*sizeof(cuDoubleComplex), + cudaMemcpyDeviceToHost ); /* This is only blocked code for now */ for (i = 0; i < n; i += nb) { @@ -234,11 +240,17 @@ magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex), da1_ref(i, i), ldda1, d, ldd); +// cudaMemcpy( d, da1_ref(i,i), +// ib*ib*sizeof(cuDoubleComplex), +// cudaMemcpyDeviceToHost ); /* copy panel of A2 from device to host: da2 -> a2 */ cublasGetMatrix(rows, ib, sizeof(magmaDoubleComplex), da2_ref(0, i), ldda2, a2, lda2); +// cudaMemcpy( a2, da2_ref(0,i), +// rows*ib*sizeof(cuDoubleComplex), +// cudaMemcpyDeviceToHost ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; @@ -268,16 +280,25 @@ magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, cublasSetMatrix(rows, ib, sizeof(magmaDoubleComplex), a2, lda2, da2_ref(0, i), ldda2); +// cudaMemcpy( da2_ref(0,i), a2, +// rows*ib*sizeof(cuDoubleComplex), +// cudaMemcpyHostToDevice ); /* Send the triangular factor T from hwork to the GPU */ cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), t, ldt, dt_ref(0, i), lddt); +// cudaMemcpy( dt_ref(0,i), t, +// ib*ib*sizeof(cuDoubleComplex), +// cudaMemcpyHostToDevice ); /* get back the diag tile in A1 from host to device: d -> da1 */ cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex), d, ldd, da1_ref(i, i), ldda1); +// cudaMemcpy( da1_ref(i, i), d, +// ib*ib*sizeof(cuDoubleComplex), +// cudaMemcpyHostToDevice ); /* tsmqr update on one panel forward (look ahead 1) */ if (i + ib < n) {