diff --git a/control/common.h b/control/common.h
index 32ebb8c8bd7189ff011dd754fb7a77901033340c..8be1ac62403a74600dc7838e4527ad46b4a22808 100644
--- a/control/common.h
+++ b/control/common.h
@@ -50,13 +50,12 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
-//#if defined(CHAMELEON_USE_CUBLAS_V2)
-//#include <cublas_v2.h>
-//#else
-//#include <cublas.h>
-//#endif
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+#include <cublas_v2.h>
+#else
 #include <cublas.h>
 #endif
+#endif
 
 #if defined(CHAMELEON_USE_OPENCL)
 #include <OpenCL/cl.h>
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 915b0d181523f7e70f078df35208b5e53329d413..ca94fdd206bf1d4e8748981b79cb0cd3a89e1bfc 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -105,6 +105,84 @@ static void cl_zgemm_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum transA;
+    MORSE_enum transB;
+    int m;
+    int n;
+    int k;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    const cuDoubleComplex *B;
+    int ldb;
+    cuDoubleComplex beta;
+    cuDoubleComplex *C;
+    int ldc;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+            printf ("cublasSetStream failed\n");
+            assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasOperation_t cublasTransA;
+    if (transA == MorseNoTrans){
+        cublasTransA = CUBLAS_OP_N;
+    }else if(transA == MorseTrans){
+        cublasTransA = CUBLAS_OP_T;
+    }else if(transA == MorseConjTrans){
+        cublasTransA = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_zgemm_cuda_func: bad transA parameter %d\n", transA);
+    }
+    cublasOperation_t cublasTransB;
+    if (transB == MorseNoTrans){
+        cublasTransB = CUBLAS_OP_N;
+    }else if(transB == MorseTrans){
+        cublasTransB = CUBLAS_OP_T;
+    }else if(transB == MorseConjTrans){
+        cublasTransB = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_zgemm_cuda_func: bad transB parameter %d\n", transB);
+    }
+
+    stat = cublasZgemm(handle,
+        cublasTransA, cublasTransB,
+        m, n, k,
+        (const cuDoubleComplex *) &alpha, A, lda,
+        B, ldb,
+        (const cuDoubleComplex *) &beta,  C, ldc);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZgemm failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum transA;
@@ -135,6 +213,7 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
         alpha, A, lda,
                B, ldb,
         beta,  C, ldc);
+
     assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
 
 #ifndef STARPU_CUDA_ASYNC
@@ -143,7 +222,8 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index 10b84ad4a57cad4162871a686054bdd7c30f2b21..f22e424249237e57ebafe08cfa71ce5dfc2d8368 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -153,6 +153,61 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
 
 #if defined(CHAMELEON_USE_MAGMA)
 
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+magma_int_t
+magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
+                   magma_int_t M, magma_int_t N,
+                   magmaDoubleComplex *A, magma_int_t LDA,
+                   magmaDoubleComplex *B, magma_int_t LDB)
+{
+    int i, j;
+    magmaDoubleComplex *cola, *colb;
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    if (M < 0) {
+        return -1;
+    }
+    if (N < 0) {
+        return -2;
+    }
+    if ( (LDA < max(1,M)) && (M > 0) ) {
+        return -5;
+    }
+    if ( (LDB < max(1,M)) && (M > 0) ) {
+        return -7;
+    }
+
+    if (side == MagmaLeft){
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+            cublasZcopy(handle, i+1, cola, 1, colb, 1);
+        }
+    }else{
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
+        }
+    }
+
+    cublasDestroy(handle);
+
+    return MAGMA_SUCCESS;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 magma_int_t
 magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
                    magma_int_t M, magma_int_t N,
@@ -191,7 +246,7 @@ magma_zgemerge_gpu(magma_side_t side, magma_diag_t diag,
 
     return MAGMA_SUCCESS;
 }
-
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
 
 magma_int_t
 magma_zgeqrt_gpu( magma_int_t m, magma_int_t n, magma_int_t nb,
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index d1ae45edd37775acd67c22f5c1c74339afe478d4..e7c297dc5ce8ff90c876977db4437af58ae80f1f 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -102,6 +102,81 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum uplo;
+    int M;
+    int N;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int LDA;
+    const cuDoubleComplex *B;
+    int LDB;
+    cuDoubleComplex beta;
+    cuDoubleComplex *C;
+    int LDC;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasSideMode_t cublasSide;
+    if (side == MorseLeft){
+        cublasSide = CUBLAS_SIDE_LEFT;
+    }else if (side == MorseRight){
+        cublasSide = CUBLAS_SIDE_RIGHT;
+    }else{
+        fprintf(stderr, "Error in cl_zhemm_cuda_func: bad side parameter %d\n", side);
+    }
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_zhemm_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+
+    stat = cublasZhemm(handle,
+        cublasSide, cublasUplo,
+        M, N,
+        (const cuDoubleComplex *) &alpha, A, LDA,
+        B, LDB,
+        (const cuDoubleComplex *) &beta, C, LDC);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZhemm failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -138,7 +213,8 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index e57c9bcbc78391438efcd4c7dc4324edb33af3af..4755ca80ef182c1b858770c5f60995f34ff30ec2 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -97,6 +97,81 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum uplo;
+    MORSE_enum trans;
+    int n;
+    int k;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    const cuDoubleComplex *B;
+    int ldb;
+    double beta;
+    cuDoubleComplex *C;
+    int ldc;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_zher2k_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+
+    cublasOperation_t cublasTrans;
+    if (trans == MorseNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MorseTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MorseConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_zher2k_cuda_func: bad trans parameter %d\n", trans);
+    }
+
+    stat = cublasZher2k( handle, cublasUplo, cublasTrans,
+        n, k, (const cuDoubleComplex *) &alpha, A, lda, B, ldb,
+        (const double *) &beta, C, ldc);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZher2k failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -129,7 +204,8 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 4d2c3596b22606d77dcf175378c0c1ac5e9f6931..18584b8636cb8a67211cfefb9b89eea6e70b5daa 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -93,6 +93,80 @@ static void cl_zherk_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum uplo;
+    MORSE_enum trans;
+    int n;
+    int k;
+    double alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    double beta;
+    cuDoubleComplex *C;
+    int ldc;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_zherk_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+
+    cublasOperation_t cublasTrans;
+    if (trans == MorseNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MorseTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MorseConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_zherk_cuda_func: bad trans parameter %d\n", trans);
+    }
+
+    stat = cublasZherk(handle,
+        cublasUplo, cublasTrans,
+        n, k,
+        (const double *) &alpha, A, lda,
+        (const double *) &beta, C, ldc);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZherk failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -125,7 +199,8 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 46e0abd6799c1f45b5127c002697289a5fa5e1d6..ae1903daabf8b3bae8333d2bdfa31894c546697a 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -102,6 +102,81 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum uplo;
+    int M;
+    int N;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int LDA;
+    const cuDoubleComplex *B;
+    int LDB;
+    cuDoubleComplex beta;
+    cuDoubleComplex *C;
+    int LDC;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasSideMode_t cublasSide;
+    if (side == MorseLeft){
+        cublasSide = CUBLAS_SIDE_LEFT;
+    }else if (side == MorseRight){
+        cublasSide = CUBLAS_SIDE_RIGHT;
+    }else{
+        fprintf(stderr, "Error in cl_zsymm_cuda_func: bad side parameter %d\n", side);
+    }
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_zsymm_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+
+    stat = cublasZsymm(handle,
+        cublasSide, cublasUplo,
+        M, N,
+        (const cuDoubleComplex *) &alpha, A, LDA,
+        B, LDB,
+        (const cuDoubleComplex *) &beta, C, LDC);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZsymm failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -138,7 +213,8 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index 4c6bfff742729f5f284a2cbb087ebe5281c9c494..2d99bdb82e8d11a97e699f71fb33b72fd834b6a3 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -97,6 +97,81 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum uplo;
+    MORSE_enum trans;
+    int n;
+    int k;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    const cuDoubleComplex *B;
+    int ldb;
+    cuDoubleComplex beta;
+    cuDoubleComplex *C;
+    int ldc;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_zsyr2k_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+
+    cublasOperation_t cublasTrans;
+    if (trans == MorseNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MorseTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MorseConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_zsyr2k_cuda_func: bad trans parameter %d\n", trans);
+    }
+
+    stat = cublasZsyr2k( handle, cublasUplo, cublasTrans,
+        n, k, (const cuDoubleComplex *) &alpha, A, lda, B, ldb,
+        (const cuDoubleComplex *) &beta, C, ldc);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZsyr2k failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -129,7 +204,8 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index afd4260c21483742b3e4c19ed815da0a6a9c454a..8803124586868e7629669f18ad9aa132a31a1c2b 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -94,6 +94,80 @@ static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum uplo;
+    MORSE_enum trans;
+    int n;
+    int k;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    cuDoubleComplex beta;
+    cuDoubleComplex *C;
+    int ldc;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_zsyrk_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+
+    cublasOperation_t cublasTrans;
+    if (trans == MorseNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MorseTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MorseConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_zsyrk_cuda_func: bad trans parameter %d\n", trans);
+    }
+
+    stat = cublasZsyrk(handle,
+        cublasUplo, cublasTrans,
+        n, k,
+        (const cuDoubleComplex *) &alpha, A, lda,
+        (const cuDoubleComplex *) &beta, C, ldc);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZsyrk failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum uplo;
@@ -126,7 +200,8 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index a0cb4717d8741391f3071dbbcc7839ffba6db003..497ca32ef4801416429d041159ee601a82fd41b9 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -97,6 +97,96 @@ static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum uplo;
+    MORSE_enum transA;
+    MORSE_enum diag;
+    int M;
+    int N;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int LDA;
+    cuDoubleComplex *B;
+    int LDB;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha, &LDA, &LDB);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasSideMode_t cublasSide;
+    if (side == MorseLeft){
+        cublasSide = CUBLAS_SIDE_LEFT;
+    }else if (side == MorseRight){
+        cublasSide = CUBLAS_SIDE_RIGHT;
+    }else{
+        fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad side parameter %d\n", side);
+    }
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+    cublasOperation_t cublasTransA;
+    if (transA == MorseNoTrans){
+        cublasTransA = CUBLAS_OP_N;
+    }else if(transA == MorseTrans){
+        cublasTransA = CUBLAS_OP_T;
+    }else if(transA == MorseConjTrans){
+        cublasTransA = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad transA parameter %d\n", transA);
+    }
+    cublasDiagType_t cublasDiag;
+    if (diag == MorseNonUnit){
+        cublasDiag = CUBLAS_DIAG_NON_UNIT;
+    }else if(diag == MorseUnit){
+        cublasDiag = CUBLAS_DIAG_UNIT;
+    }else{
+        fprintf(stderr, "Error in cl_ztrmm_cuda_func: bad diag parameter %d\n", diag);
+    }
+
+    stat = cublasZtrmm( handle,
+        cublasSide, cublasUplo, cublasTransA, cublasDiag,
+        M, N,
+        (const cuDoubleComplex *) &alpha, A, LDA,
+        B, LDB, B, LDB);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZtrmm failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -131,8 +221,8 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
-
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 
 /*
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 141e0076bf743908b8170ccc3ca91bdc4b6939c7..cca877b5635e4cf71d6479078011d1f6dd748c35 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -97,6 +97,96 @@ static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg)
 }
 
 #ifdef CHAMELEON_USE_CUDA
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum uplo;
+    MORSE_enum transA;
+    MORSE_enum diag;
+    int m;
+    int n;
+    cuDoubleComplex alpha;
+    const cuDoubleComplex *A;
+    int lda;
+    cuDoubleComplex *B;
+    int ldb;
+
+    A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha, &lda, &ldb);
+
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    CUstream stream = starpu_cuda_get_local_stream();
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasSideMode_t cublasSide;
+    if (side == MorseLeft){
+        cublasSide = CUBLAS_SIDE_LEFT;
+    }else if (side == MorseRight){
+        cublasSide = CUBLAS_SIDE_RIGHT;
+    }else{
+        fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad side parameter %d\n", side);
+    }
+    cublasFillMode_t cublasUplo;
+    if (uplo == MorseUpper){
+        cublasUplo = CUBLAS_FILL_MODE_UPPER;
+    }else if(uplo == MorseLower){
+        cublasUplo = CUBLAS_FILL_MODE_LOWER;
+    }else if(uplo == MorseUpperLower){
+        cublasUplo = 0;
+    }else{
+        fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad uplo parameter %d\n", uplo);
+    }
+    cublasOperation_t cublasTransA;
+    if (transA == MorseNoTrans){
+        cublasTransA = CUBLAS_OP_N;
+    }else if(transA == MorseTrans){
+        cublasTransA = CUBLAS_OP_T;
+    }else if(transA == MorseConjTrans){
+        cublasTransA = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad transA parameter %d\n", transA);
+    }
+    cublasDiagType_t cublasDiag;
+    if (diag == MorseNonUnit){
+        cublasDiag = CUBLAS_DIAG_NON_UNIT;
+    }else if(diag == MorseUnit){
+        cublasDiag = CUBLAS_DIAG_UNIT;
+    }else{
+        fprintf(stderr, "Error in cl_ztrsm_cuda_func: bad diag parameter %d\n", diag);
+    }
+
+    stat = cublasZtrsm( handle,
+        cublasSide, cublasUplo, cublasTransA, cublasDiag,
+        m, n,
+        (const cuDoubleComplex *) &alpha, A, lda,
+        B, ldb);
+    if (stat != CUBLAS_STATUS_SUCCESS){
+        printf ("cublasZtrsm failed");
+        cublasDestroy(handle);
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasDestroy(handle);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
 static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_enum side;
@@ -131,7 +221,8 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif /* CHAMELEON_USE_CUDA */
 
 /*
  * Codelet definition
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index 0c283302fa408539bca68495e022fc85c1d0412b..f814bf858d8e4367bf86ca25cd3266efc675779d 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -215,6 +215,7 @@ static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg)
 
 #if defined(CHAMELEON_USE_MAGMA)
 
+#if defined(CHAMELEON_USE_CUBLAS_V2)
 magma_int_t
 magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
 		         magma_direct_t direct, magma_storev_t storev,
@@ -229,6 +230,325 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
 		               magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
 		               CUstream stream)
 
+{
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
+    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
+    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
+#else
+    double zzero = 0.0;
+    double zone  = 1.0;
+    double mzone = -1.0;
+#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
+
+    int j;
+    magma_trans_t transW;
+    magma_trans_t transA2;
+    cublasHandle_t handle;
+    cublasStatus_t stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    cublasOperation_t cublasTrans;
+    if (trans == MagmaNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MagmaTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MagmaConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in magma_zparfb_gpu: bad trans parameter %d\n", trans);
+    }
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
+        return -3;
+    }
+    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
+        return -4;
+    }
+    if (M1 < 0) {
+        return -5;
+    }
+    if (N1 < 0) {
+        return -6;
+    }
+    if ((M2 < 0) ||
+        ( (side == MagmaRight) && (M1 != M2) ) ) {
+        return -7;
+    }
+    if ((N2 < 0) ||
+        ( (side == MagmaLeft) && (N1 != N2) ) ) {
+        return -8;
+    }
+    if (K < 0) {
+        return -9;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (direct == MagmaForward) {
+
+        if (side == MagmaLeft) {
+
+            /*
+             * Column or Rowwise / Forward / Left
+             * ----------------------------------
+             *
+             * Form  H * A  or  H' * A  where  A = ( A1 )
+             *                                     ( A2 )
+             */
+
+            /*
+             * W = A1 + V' * A2:
+             *      W = A1
+             *      W = W + V' * A2
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               K * sizeof(cuDoubleComplex), N1,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+
+            cublasOperation_t cublasTransW;
+            if (transW == MagmaNoTrans){
+                cublasTransW = CUBLAS_OP_N;
+            }else if(transW == MagmaTrans){
+                cublasTransW = CUBLAS_OP_T;
+            }else if(transW == MagmaConjTrans){
+                cublasTransW = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW);
+            }
+            cublasOperation_t cublasTransA2;
+            if (transA2 == MagmaNoTrans){
+                cublasTransA2 = CUBLAS_OP_N;
+            }else if(transA2 == MagmaTrans){
+                cublasTransA2 = CUBLAS_OP_T;
+            }else if(transA2 == MagmaConjTrans){
+                cublasTransA2 = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in magma_zparfb_gpu: bad transA2 parameter %d\n", transA2);
+            }
+
+            cublasZgemm(handle, cublasTransW, CUBLAS_OP_N,
+                        K, N1, M2,
+                        (const cuDoubleComplex *) &zone,
+                        (const cuDoubleComplex*)V     /* K*M2  */, LDV,
+                        (const cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
+                        (const cuDoubleComplex *) &zone,
+                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
+
+            WORKC = NULL;
+            if (WORKC == NULL) {
+                /* W = op(T) * W */
+                cublasZtrmm( handle,
+                    CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
+                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
+                    K, N2,
+                    (const cuDoubleComplex *) &zone,
+                    (const cuDoubleComplex*)T, LDT,
+                    (cuDoubleComplex*)WORK, LDWORK,
+                    (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - op(T) * W */
+                for(j = 0; j < N1; j++) {
+                    cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone,
+                                (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - op(V) * W  */
+                cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N,
+                            M2, N2, K,
+                            (const cuDoubleComplex *) &mzone,
+                            (const cuDoubleComplex*)V     /* M2*K  */, LDV,
+                            (const cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
+                            (const cuDoubleComplex *) &zone,
+                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
+
+            } else {
+                /* Wc = V * op(T) */
+                cublasZgemm( handle, cublasTransA2, cublasTrans,
+                             M2, K, K,
+                             (const cuDoubleComplex *) &zone,  V,     LDV,
+                                    T,     LDT,
+                             (const cuDoubleComplex *) &zzero, WORKC, LDWORKC );
+
+                /* A1 = A1 - opt(T) * W */
+                cublasZgemm( handle, cublasTrans, CUBLAS_OP_N,
+                             K, N1, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)T,    LDT,
+                             (const cuDoubleComplex *)WORK, LDWORK,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex*)A1,   LDA1 );
+
+                /* A2 = A2 - Wc * W */
+                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             M2, N2, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)WORKC, LDWORKC,
+                             (const cuDoubleComplex *)WORK,  LDWORK,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex *)A2,    LDA2 );
+            }
+        }
+        else {
+            /*
+             * Column or Rowwise / Forward / Right
+             * -----------------------------------
+             *
+             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
+             *
+             */
+
+            /*
+             * W = A1 + A2 * V':
+             *      W = A1
+             *      W = W + A2 * V'
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               M1 * sizeof(cuDoubleComplex), K,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+
+            cublasOperation_t cublasTransW;
+            if (transW == MagmaNoTrans){
+                cublasTransW = CUBLAS_OP_N;
+            }else if(transW == MagmaTrans){
+                cublasTransW = CUBLAS_OP_T;
+            }else if(transW == MagmaConjTrans){
+                cublasTransW = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in magma_zparfb_gpu: bad transW parameter %d\n", transW);
+            }
+            cublasOperation_t cublasTransA2;
+            if (transA2 == MagmaNoTrans){
+                cublasTransA2 = CUBLAS_OP_N;
+            }else if(transA2 == MagmaTrans){
+                cublasTransA2 = CUBLAS_OP_T;
+            }else if(transA2 == MagmaConjTrans){
+                cublasTransA2 = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in magma_zparfb_gpu: bad transA2 parameter %d\n", transA2);
+            }
+
+            cublasZgemm(handle, CUBLAS_OP_N, cublasTransW,
+                        M1, K, N2,
+                        (const cuDoubleComplex *) &zone,
+                        (const cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
+                        (const cuDoubleComplex*)V     /* N2*K  */, LDV,
+                        (const cuDoubleComplex *) &zone,
+                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
+
+            WORKC = NULL;
+            if (WORKC == NULL) {
+                /* W = W * op(T) */
+                cublasZtrmm( handle,
+                    CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
+                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
+                    M2, K,
+                    (const cuDoubleComplex *) &zone,
+                    (const cuDoubleComplex*)T, LDT,
+                    (cuDoubleComplex*)WORK, LDWORK,
+                    (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - W * op(T) */
+                for(j = 0; j < K; j++) {
+                    cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone,
+                        (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                        (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - W * op(V)  */
+                cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2,
+                            M2, N2, K,
+                            (const cuDoubleComplex *) &mzone,
+                            (const cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
+                            (const cuDoubleComplex*)V     /* K*N2  */, LDV,
+                            (const cuDoubleComplex *) &zone,
+                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
+
+            } else {
+                /* A1 = A1 - W * opt(T) */
+                cublasZgemm( handle, CUBLAS_OP_N, cublasTrans,
+                    M1, K, K,
+                    (const cuDoubleComplex *) &mzone,
+                    (const cuDoubleComplex *)WORK, LDWORK,
+                    (const cuDoubleComplex *)T,    LDT,
+                    (const cuDoubleComplex *) &zone,
+                    (cuDoubleComplex *)A1,   LDA1 );
+
+                /* Wc = op(T) * V */
+                cublasZgemm( handle, cublasTrans, cublasTransA2,
+                             K, N2, K,
+                             (const cuDoubleComplex *) &zone,
+                             (const cuDoubleComplex *)T,     LDT,
+                             (const cuDoubleComplex *)V,     LDV,
+                             (const cuDoubleComplex *) &zzero,
+                             (cuDoubleComplex *)WORKC, LDWORKC );
+
+                /* A2 = A2 - W * Wc */
+                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             M2, N2, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)WORK,  LDWORK,
+                             (const cuDoubleComplex *)WORKC, LDWORKC,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex *)A2,    LDA2 );
+            }
+        }
+    }
+    else {
+        fprintf(stderr, "Not implemented (Backward / Left or Right)");
+        return MAGMA_ERR_NOT_SUPPORTED;
+    }
+
+    cublasDestroy(handle);
+
+    return MAGMA_SUCCESS;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
+magma_int_t
+magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
+                 magma_direct_t direct, magma_storev_t storev,
+                 magma_int_t M1, magma_int_t N1,
+                 magma_int_t M2, magma_int_t N2,
+                 magma_int_t K, magma_int_t L,
+                       magmaDoubleComplex *A1, magma_int_t LDA1,
+                       magmaDoubleComplex *A2, magma_int_t LDA2,
+                 const magmaDoubleComplex *V, magma_int_t LDV,
+                 const magmaDoubleComplex *T, magma_int_t LDT,
+                       magmaDoubleComplex *WORK, magma_int_t LDWORK,
+                       magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+                       CUstream stream)
+
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
@@ -452,6 +772,7 @@ magma_zparfb_gpu(magma_side_t side, magma_trans_t trans,
 
     return MAGMA_SUCCESS;
 }
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
 
 magma_int_t
 magma_ztsmqr_gpu( magma_side_t side, magma_trans_t trans,