diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0501e92baa1ee73ba77ecba6de6c5eb086aab5d4..66952aa9d463f50636f5c7edd2891a29f7727465 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -60,6 +60,8 @@ include(GenPkgConfig)
 # Parameters/Options #
 ######################
 
+set(CHAMELEON_DEFINITIONS_LIST "")
+
 # Add define for Fortran Mangling (should be defined somewhere else)
 # ------------------------------------------------------------------
 add_definitions(-DADD_)
@@ -686,6 +688,7 @@ endif()
 ###############################################################################
 # Print Options #
 #################
+get_directory_property( CHAMELEON_DEFINITIONS_LIST DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS )
 include(PrintOpts)
 
 ###
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index ca94fdd206bf1d4e8748981b79cb0cd3a89e1bfc..8e52cb94b6268db109386e6957c39a5ac70709a5 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -121,27 +121,30 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int ldc;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasOperation_t cublasTransA;
+    cublasOperation_t cublasTransB;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
             printf ("cublasSetStream failed\n");
             assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasOperation_t cublasTransA;
     if (transA == MorseNoTrans){
         cublasTransA = CUBLAS_OP_N;
     }else if(transA == MorseTrans){
@@ -151,7 +154,6 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zgemm_cuda_func: bad transA parameter %d\n", transA);
     }
-    cublasOperation_t cublasTransB;
     if (transB == MorseNoTrans){
         cublasTransB = CUBLAS_OP_N;
     }else if(transB == MorseTrans){
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index f22e424249237e57ebafe08cfa71ce5dfc2d8368..4171aa70e77fd6dd073fbe4d3795e875a8d74179 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -162,14 +162,17 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
 {
     int i, j;
     magmaDoubleComplex *cola, *colb;
+    CUstream stream;
     cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    cublasStatus_t stat;
+
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
@@ -234,13 +237,19 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-            cublasZcopy(i+1, cola, 1, colb, 1);
+            //cublasZcopy(i+1, cola, 1, colb, 1);
+            cudaMemcpy(colb , cola,
+                       (i+1)*sizeof(cuDoubleComplex),
+                       cudaMemcpyDeviceToDevice );
         }
     }else{
         for(i=0; i<N; i++){
             cola = A + i*LDA;
             colb = B + i*LDB;
-            cublasZcopy(M-i, cola + i, 1, colb + i, 1);
+            //cublasZcopy(M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpy(colb+i , cola+i,
+                       (M-i)*sizeof(cuDoubleComplex),
+                       cudaMemcpyDeviceToDevice );
         }
     }
 
@@ -291,9 +300,13 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
     cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
 
     /* copy first panel of A on the host */
-	cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex),
-                    da_ref(0, 0), ldda,
-                    v, ldv);
+//	cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex),
+//                    da_ref(0, 0), ldda,
+//                    v, ldv);
+    /* copy first panel of A on the host */
+    cudaMemcpy( v, da_ref(0,0),
+                m*min(nb,n)*sizeof(magmaDoubleComplex),
+                cudaMemcpyDeviceToHost );
 
     /* Use blocked code initially */
     for (i = 0; i < k; i += nb) {
@@ -305,9 +318,13 @@ magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
         if (i>0){
 
             /* copy panel of A from device to host */
-        	cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex),
-                            da_ref(0, i), ldda,
-                            v, ldv);
+//        	cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex),
+//                            da_ref(0, i), ldda,
+//                            v, ldv);
+            /* copy panel of A from device to host */
+            cudaMemcpy( v, da_ref(0,i),
+                        m*ib*sizeof(magmaDoubleComplex),
+                        cudaMemcpyDeviceToHost );
 
             /* Apply H' to A(i:m,i+2*ib:n) from the left */
             cols = n-old_i-2*old_ib;
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index e7c297dc5ce8ff90c876977db4437af58ae80f1f..1b4b1241b630ef62e11c2af655dc3fd4118e7adc 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -117,27 +117,30 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int LDC;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasSideMode_t cublasSide;
+    cublasFillMode_t cublasUplo;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasSideMode_t cublasSide;
     if (side == MorseLeft){
         cublasSide = CUBLAS_SIDE_LEFT;
     }else if (side == MorseRight){
@@ -145,7 +148,6 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zhemm_cuda_func: bad side parameter %d\n", side);
     }
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 4755ca80ef182c1b858770c5f60995f34ff30ec2..bd7da41cf61552d661b0ade9a5dc0e62277bc4ea 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -112,27 +112,30 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
     double beta;
     cuDoubleComplex *C;
     int ldc;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasFillMode_t cublasUplo;
+    cublasOperation_t cublasTrans;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
@@ -142,8 +145,6 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zher2k_cuda_func: bad uplo parameter %d\n", uplo);
     }
-
-    cublasOperation_t cublasTrans;
     if (trans == MorseNoTrans){
         cublasTrans = CUBLAS_OP_N;
     }else if(trans == MorseTrans){
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 18584b8636cb8a67211cfefb9b89eea6e70b5daa..44ae4f252a1ac2edcf76ae10fcd1838a3ea696fe 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -106,26 +106,29 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
     double beta;
     cuDoubleComplex *C;
     int ldc;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasFillMode_t cublasUplo;
+    cublasOperation_t cublasTrans;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
@@ -135,8 +138,6 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zherk_cuda_func: bad uplo parameter %d\n", uplo);
     }
-
-    cublasOperation_t cublasTrans;
     if (trans == MorseNoTrans){
         cublasTrans = CUBLAS_OP_N;
     }else if(trans == MorseTrans){
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index ae1903daabf8b3bae8333d2bdfa31894c546697a..173d6d3b85562aea11b53a2f7214674e5d0ac0ec 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -117,27 +117,30 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int LDC;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasSideMode_t cublasSide;
+    cublasFillMode_t cublasUplo;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasSideMode_t cublasSide;
     if (side == MorseLeft){
         cublasSide = CUBLAS_SIDE_LEFT;
     }else if (side == MorseRight){
@@ -145,7 +148,6 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zsymm_cuda_func: bad side parameter %d\n", side);
     }
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index 2d99bdb82e8d11a97e699f71fb33b72fd834b6a3..2922de6fc788f852d80ee13756ee553274d52247 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -112,27 +112,30 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int ldc;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasFillMode_t cublasUplo;
+    cublasOperation_t cublasTrans;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
@@ -142,8 +145,6 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zsyr2k_cuda_func: bad uplo parameter %d\n", uplo);
     }
-
-    cublasOperation_t cublasTrans;
     if (trans == MorseNoTrans){
         cublasTrans = CUBLAS_OP_N;
     }else if(trans == MorseTrans){
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index 8803124586868e7629669f18ad9aa132a31a1c2b..0f69498a4e77cd61b5c9515560c4a6d7d6d770c2 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -107,26 +107,29 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int ldc;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasFillMode_t cublasUplo;
+    cublasOperation_t cublasTrans;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
@@ -136,8 +139,6 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_zsyrk_cuda_func: bad uplo parameter %d\n", uplo);
     }
-
-    cublasOperation_t cublasTrans;
     if (trans == MorseNoTrans){
         cublasTrans = CUBLAS_OP_N;
     }else if(trans == MorseTrans){
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index 497ca32ef4801416429d041159ee601a82fd41b9..510005d40d382230dcf86d242e4f28d69e06c5d0 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -111,26 +111,31 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
     int LDA;
     cuDoubleComplex *B;
     int LDB;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasSideMode_t cublasSide;
+    cublasFillMode_t cublasUplo;
+    cublasOperation_t cublasTransA;
+    cublasDiagType_t cublasDiag;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha, &LDA, &LDB);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasSideMode_t cublasSide;
     if (side == MorseLeft){
         cublasSide = CUBLAS_SIDE_LEFT;
     }else if (side == MorseRight){
@@ -138,7 +143,6 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad side parameter %d\n", side);
     }
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
@@ -148,7 +152,6 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad uplo parameter %d\n", uplo);
     }
-    cublasOperation_t cublasTransA;
     if (transA == MorseNoTrans){
         cublasTransA = CUBLAS_OP_N;
     }else if(transA == MorseTrans){
@@ -158,7 +161,6 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad transA parameter %d\n", transA);
     }
-    cublasDiagType_t cublasDiag;
     if (diag == MorseNonUnit){
         cublasDiag = CUBLAS_DIAG_NON_UNIT;
     }else if(diag == MorseUnit){
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index cca877b5635e4cf71d6479078011d1f6dd748c35..0b100e81a01d055a030dd5f0be086ff5894d20d6 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -111,26 +111,31 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
     int lda;
     cuDoubleComplex *B;
     int ldb;
+    CUstream stream;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasSideMode_t cublasSide;
+    cublasFillMode_t cublasUplo;
+    cublasOperation_t cublasTransA;
+    cublasDiagType_t cublasDiag;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha, &lda, &ldb);
 
-    cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    CUstream stream = starpu_cuda_get_local_stream();
+    stream = starpu_cuda_get_local_stream();
     stat = cublasSetStream(handle, stream);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("cublasSetStream failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasSideMode_t cublasSide;
     if (side == MorseLeft){
         cublasSide = CUBLAS_SIDE_LEFT;
     }else if (side == MorseRight){
@@ -138,7 +143,6 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad side parameter %d\n", side);
     }
-    cublasFillMode_t cublasUplo;
     if (uplo == MorseUpper){
         cublasUplo = CUBLAS_FILL_MODE_UPPER;
     }else if(uplo == MorseLower){
@@ -148,7 +152,6 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad uplo parameter %d\n", uplo);
     }
-    cublasOperation_t cublasTransA;
     if (transA == MorseNoTrans){
         cublasTransA = CUBLAS_OP_N;
     }else if(transA == MorseTrans){
@@ -158,7 +161,6 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
     }else{
         fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad transA parameter %d\n", transA);
     }
-    cublasDiagType_t cublasDiag;
     if (diag == MorseNonUnit){
         cublasDiag = CUBLAS_DIAG_NON_UNIT;
     }else if(diag == MorseUnit){
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index 8cfa12803a19361918145499979d0732df07f5bb..e88ca291d05880f79ded733d97daf0c942ef48a8 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -245,7 +245,12 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
     magma_trans_t transW;
     magma_trans_t transA2;
     cublasHandle_t handle;
-    cublasStatus_t stat = cublasCreate(&handle);
+    cublasStatus_t stat;
+    cublasOperation_t cublasTrans;
+    cublasOperation_t cublasTransW;
+    cublasOperation_t cublasTransA2;
+
+    stat = cublasCreate(&handle);
     if (stat != CUBLAS_STATUS_SUCCESS) {
         printf ("CUBLAS initialization failed\n");
         assert( stat == CUBLAS_STATUS_SUCCESS );
@@ -257,7 +262,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
         assert( stat == CUBLAS_STATUS_SUCCESS );
     }
 
-    cublasOperation_t cublasTrans;
     if (trans == MagmaNoTrans){
         cublasTrans = CUBLAS_OP_N;
     }else if(trans == MagmaTrans){
@@ -329,7 +333,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
             transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
             transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
 
-            cublasOperation_t cublasTransW;
             if (transW == MagmaNoTrans){
                 cublasTransW = CUBLAS_OP_N;
             }else if(transW == MagmaTrans){
@@ -339,7 +342,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
             }else{
                 fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW);
             }
-            cublasOperation_t cublasTransA2;
             if (transA2 == MagmaNoTrans){
                 cublasTransA2 = CUBLAS_OP_N;
             }else if(transA2 == MagmaTrans){
@@ -358,7 +360,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
                         (const cuDoubleComplex *) &zone,
                         (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
 
-            WORKC = NULL;
             if (WORKC == NULL) {
                 /* W = op(T) * W */
                 cublasZtrmm( handle,
@@ -437,7 +438,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
             transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
             transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
 
-            cublasOperation_t cublasTransW;
             if (transW == MagmaNoTrans){
                 cublasTransW = CUBLAS_OP_N;
             }else if(transW == MagmaTrans){
@@ -447,7 +447,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
             }else{
                 fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW);
             }
-            cublasOperation_t cublasTransA2;
             if (transA2 == MagmaNoTrans){
                 cublasTransA2 = CUBLAS_OP_N;
             }else if(transA2 == MagmaTrans){
@@ -466,7 +465,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
                         (const cuDoubleComplex *) &zone,
                         (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
 
-            WORKC = NULL;
             if (WORKC == NULL) {
                 /* W = W * op(T) */
                 cublasZtrmm( handle,
@@ -548,7 +546,6 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
                        magmaDoubleComplex *WORK, magma_int_t LDWORK,
                        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
                        CUstream stream)
-
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c
index 0aa41ebdfef0fd13ae1834241c2b455179cadd1d..9eedc071b00c1bac821a21741b27df3723c8fc22 100644
--- a/runtime/starpu/codelets/codelet_ztsqrt.c
+++ b/runtime/starpu/codelets/codelet_ztsqrt.c
@@ -213,11 +213,17 @@ magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 	cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex),
                     da1_ref(0, 0), ldda1,
                     d, ldd);
+//	cudaMemcpy( d, da1_ref(0,0),
+//	            nb*nb*sizeof(cuDoubleComplex),
+//	            cudaMemcpyDeviceToHost );
 
 	/* copy first panel of A2 from device to host: da2 -> a2 */
-	cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex),
-                    da2_ref(0, 0), ldda2,
-                    a2, lda2);
+//	cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex),
+//                    da2_ref(0, 0), ldda2,
+//                    a2, lda2);
+	cudaMemcpy( a2, da2_ref(0, 0),
+	            m*nb*sizeof(cuDoubleComplex),
+	            cudaMemcpyDeviceToHost );
 
 	/* This is only blocked code for now */
 	for (i = 0; i < n; i += nb) {
@@ -234,11 +240,17 @@ magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 			cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
 		                    da1_ref(i, i), ldda1,
 		                    d, ldd);
+//		    cudaMemcpy( d, da1_ref(i,i),
+//		        ib*ib*sizeof(cuDoubleComplex),
+//		        cudaMemcpyDeviceToHost );
 
 			/* copy panel of A2 from device to host: da2 -> a2 */
 			cublasGetMatrix(rows, ib, sizeof(magmaDoubleComplex),
                             da2_ref(0, i), ldda2,
                             a2, lda2);
+//            cudaMemcpy( a2, da2_ref(0,i),
+//                rows*ib*sizeof(cuDoubleComplex),
+//                cudaMemcpyDeviceToHost );
 
 			/* Apply H' to A(i:m,i+2*ib:n) from the left */
 			cols = n-old_i-2*old_ib;
@@ -268,16 +280,25 @@ magma_ztsqrt2_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
 		cublasSetMatrix(rows, ib, sizeof(magmaDoubleComplex),
                         a2, lda2,
                         da2_ref(0, i), ldda2);
+//        cudaMemcpy( da2_ref(0,i), a2,
+//            rows*ib*sizeof(cuDoubleComplex),
+//            cudaMemcpyHostToDevice );
 
 		/* Send the triangular factor T from hwork to the GPU */
 		cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
                         t, ldt,
                         dt_ref(0, i), lddt);
+//        cudaMemcpy( dt_ref(0,i), t,
+//            ib*ib*sizeof(cuDoubleComplex),
+//            cudaMemcpyHostToDevice );
 
 		/* get back the diag tile in A1 from host to device: d -> da1 */
 		cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
                         d, ldd,
                         da1_ref(i, i), ldda1);
+//        cudaMemcpy( da1_ref(i, i), d,
+//            ib*ib*sizeof(cuDoubleComplex),
+//            cudaMemcpyHostToDevice );
 
 		/* tsmqr update on one panel forward (look ahead 1) */
 		if (i + ib < n) {