diff --git a/compute/pzgemm.c b/compute/pzgemm.c
index f0c77ad3f4773654701a41b2aae4c757e5a7d41a..5c6563d2d6d6280f0f08a35b9a25f266eb5e9764 100644
--- a/compute/pzgemm.c
+++ b/compute/pzgemm.c
@@ -233,7 +233,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                     options,
                     ChamUpperLower, tempkk, tempmm,
                     A(  k,  m ),
-                    WA( m, (k % C->q) + lq ) );
+                    WA( m, (m % C->q) + lq ) );
 
                 RUNTIME_data_flush( sequence, A( k, m ) );
 
@@ -241,8 +241,8 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                     INSERT_TASK_zlacpy(
                         options,
                         ChamUpperLower, tempkk, tempmm,
-                        WA( m, ((k+q-1) % C->q) + lq ),
-                        WA( m, ((k+q)   % C->q) + lq ) );
+                        WA( m, ((m+q-1) % C->q) + lq ),
+                        WA( m, ((m+q)   % C->q) + lq ) );
                 }
             }
         }
@@ -273,7 +273,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                     options,
                     ChamUpperLower, tempnn, tempkk,
                     B(   n,              k ),
-                    WB( (k % C->p) + lp, n ) );
+                    WB( (n % C->p) + lp, n ) );
 
                 RUNTIME_data_flush( sequence, B( n, k ) );
 
@@ -281,8 +281,8 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                     INSERT_TASK_zlacpy(
                         options,
                         ChamUpperLower, tempnn, tempkk,
-                        WB( ((k+p-1) % C->p) + lp, n ),
-                        WB( ((k+p)   % C->p) + lp, n ) );
+                        WB( ((n+p-1) % C->p) + lp, n ),
+                        WB( ((n+p)   % C->p) + lp, n ) );
                 }
             }
         }
diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c
index de7a82cb1a1f8e99c79bbea4a3551e8c5549d835..420ad6f502ceccea3a253363e5b00333c5f49aea 100644
--- a/compute/pzgepdf_qdwh.c
+++ b/compute/pzgepdf_qdwh.c
@@ -36,7 +36,7 @@ static int _zgepdf_qdwh_opt_genD = 0;
 #endif
 
 static int _zgepdf_qdwh_opt_qr = 1;
-static int _zgepdf_qdwh_opt_id = 1;
+static int _zgepdf_qdwh_opt_id = 1; // There is a numerical issue when combining this optimization and the StarPU lacpy
 static int _zgepdf_qdwh_verbose = 0;
 
 /**
@@ -719,13 +719,13 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t
         it++;
         last = ( it >= itconv );
 
+        chameleon_sequence_wait( chamctxt, sequence_it );
         if ( params[2] > 100 ) {
             int do_qr = (!_zgepdf_qdwh_opt_qr) || (it > 1);
 
             if ( (chamctxt->scheduler == RUNTIME_SCHED_PARSEC) &&
                  ( sequence_it != sequence_qr ) )
             {
-                chameleon_sequence_wait( chamctxt, sequence_it );
                 sequence_it = sequence_qr;
                 request_it = &request_qr;
             }
@@ -753,7 +753,6 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t
             if ( (chamctxt->scheduler == RUNTIME_SCHED_PARSEC) &&
                  ( sequence_it != sequence_po ) )
             {
-                chameleon_sequence_wait( chamctxt, sequence_it );
                 sequence_it = sequence_po;
                 request_it = &request_po;
             }
@@ -796,10 +795,10 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t
         }
     }
 
+    chameleon_sequence_wait( chamctxt, sequence_it );
     if ( (chamctxt->scheduler == RUNTIME_SCHED_PARSEC) &&
          ( sequence_it != sequence ) )
     {
-        chameleon_sequence_wait( chamctxt, sequence_it );
         chameleon_sequence_destroy( chamctxt, sequence_qr );
         chameleon_sequence_destroy( chamctxt, sequence_po );
     }
diff --git a/compute/pzhemm.c b/compute/pzhemm.c
index b47dda5baa2ac08d4904d564a73a98b4cecdf515..12269d34a3abf4de45163eb75657441cceac5e74 100644
--- a/compute/pzhemm.c
+++ b/compute/pzhemm.c
@@ -339,7 +339,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 options,
                 ChamUpperLower, tempam, tempak,
                 A( Am, Ak ),
-                WA( m, (k % C->q) + lq ) );
+                WA( m, (Ak % C->q) + lq ) );
 
             RUNTIME_data_flush( sequence, A( Am, Ak ) );
 
@@ -347,8 +347,8 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 INSERT_TASK_zlacpy(
                     options,
                     ChamUpperLower, tempam, tempak,
-                    WA( m, ((k+q-1) % C->q) + lq ),
-                    WA( m, ((k+q)   % C->q) + lq ) );
+                    WA( m, ((Ak+q-1) % C->q) + lq ),
+                    WA( m, ((Ak+q)   % C->q) + lq ) );
             }
         }
 
@@ -496,7 +496,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 options,
                 ChamUpperLower, tempak, tempan,
                 A(  Ak,              An ),
-                WB( (k % C->p) + lp, n  ) );
+                WB( (Ak % C->p) + lp, n  ) );
 
             RUNTIME_data_flush( sequence, A( Ak, An ) );
 
@@ -504,8 +504,8 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 INSERT_TASK_zlacpy(
                     options,
                     ChamUpperLower, tempak, tempan,
-                    WB( ((k+p-1) % C->p) + lp, n ),
-                    WB( ((k+p)   % C->p) + lp, n ) );
+                    WB( ((Ak+p-1) % C->p) + lp, n ),
+                    WB( ((Ak+p)   % C->p) + lp, n ) );
             }
         }
 
diff --git a/compute/pzsymm.c b/compute/pzsymm.c
index 47632f6ce07bd4a0254cff94ad7b1a258f3d14e4..944ac75ea2daef41f206207d420e9d0543fd0ae1 100644
--- a/compute/pzsymm.c
+++ b/compute/pzsymm.c
@@ -340,7 +340,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 options,
                 ChamUpperLower, tempam, tempak,
                 A( Am, Ak ),
-                WA( m, (k % C->q) + lq ) );
+                WA( m, (Ak % C->q) + lq ) );
 
             RUNTIME_data_flush( sequence, A( Am, Ak ) );
 
@@ -348,8 +348,8 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 INSERT_TASK_zlacpy(
                     options,
                     ChamUpperLower, tempam, tempak,
-                    WA( m, ((k+q-1) % C->q) + lq ),
-                    WA( m, ((k+q)   % C->q) + lq ) );
+                    WA( m, ((Ak+q-1) % C->q) + lq ),
+                    WA( m, ((Ak+q)   % C->q) + lq ) );
             }
         }
 
@@ -497,7 +497,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 options,
                 ChamUpperLower, tempak, tempan,
                 A(  Ak,              An ),
-                WB( (k % C->p) + lp, n  ) );
+                WB( (Ak % C->p) + lp, n  ) );
 
             RUNTIME_data_flush( sequence, A( Ak, An ) );
 
@@ -505,8 +505,8 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
                 INSERT_TASK_zlacpy(
                     options,
                     ChamUpperLower, tempak, tempan,
-                    WB( ((k+p-1) % C->p) + lp, n ),
-                    WB( ((k+p)   % C->p) + lp, n ) );
+                    WB( ((Ak+p-1) % C->p) + lp, n ),
+                    WB( ((Ak+p)   % C->p) + lp, n ) );
             }
         }
 
diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c
index 2d227b37b7167fa28f687652cdcc4b5e734b3299..aa8d73ed3774d60b34f1b9d6641c984db59fe820 100644
--- a/runtime/starpu/codelets/codelet_zlacpy.c
+++ b/runtime/starpu/codelets/codelet_zlacpy.c
@@ -151,7 +151,7 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
     if ( (uplo == ChamUpperLower) &&
          (tileA->m == m) && (tileA->n == n) &&
          (tileB->m == m) && (tileB->n == n) &&
-         (displA == 0) && (displB == 0) && 0 )
+         (displA == 0) && (displB == 0) )
     {
 #if defined(CHAMELEON_USE_MPI)
         insert_task_zlacpy_on_remote_node( options,
@@ -227,7 +227,7 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
     /* Insert the task */
     if ( (uplo == ChamUpperLower) &&
          (tileA->m == m) && (tileA->n == n) &&
-         (tileB->m == m) && (tileB->n == n) && 0 )
+         (tileB->m == m) && (tileB->n == n) )
     {
 #if defined(CHAMELEON_USE_MPI)
         insert_task_zlacpy_on_remote_node( options,
diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake
index c185e50b525c719a7b62422f89d5d5b9a259c435..c8d012141de283ea45c4ebe3a4ae3f9270d7e435 100644
--- a/testing/CTestLists.cmake
+++ b/testing/CTestLists.cmake
@@ -21,6 +21,10 @@ if (CHAMELEON_SIMULATION)
   endif()
 endif()
 
+set( SINGLE_PRECISIONS s c )
+# List every test that has a dedicated input/<test>_32.in file for the single-precision (s, c) runs
+set( SINGLE_TESTS gepdf_qdwh genm2 )
+
 if (NOT CHAMELEON_SIMULATION)
 
   foreach(prec ${CHAMELEON_PRECISION})
@@ -82,7 +86,11 @@ if (NOT CHAMELEON_SIMULATION)
         endif()
 
         foreach( test ${TESTSTMP} )
-          add_test( test_${cat}_${prec}${test} ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/${test}.in )
+          if ( ${test} IN_LIST SINGLE_TESTS AND ${prec} IN_LIST SINGLE_PRECISIONS )
+            add_test( test_${cat}_${prec}${test} ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/${test}_32.in )
+          else()
+            add_test( test_${cat}_${prec}${test} ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/${test}.in )
+          endif()
         endforeach()
 
         if ( CHAMELEON_SCHED_STARPU )
@@ -111,11 +119,15 @@ if (NOT CHAMELEON_SIMULATION)
 
         list( REMOVE_ITEM TESTSTMP print gepdf_qr )
 
-        foreach( test ${TESTSTMP} )
-          if ( NOT (${cat} STREQUAL "mpi"))
-            add_test( test_${cat}_${prec}${test}_std ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/${test}.in --api=1 )
-          endif()
-        endforeach()
+        if ( NOT (${cat} STREQUAL "mpi"))
+          foreach( test ${TESTSTMP} )
+            if ( ${test} IN_LIST SINGLE_TESTS AND ${prec} IN_LIST SINGLE_PRECISIONS )
+              add_test( test_${cat}_${prec}${test}_std ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/${test}_32.in --api=1 )
+            else()
+              add_test( test_${cat}_${prec}${test}_std ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/${test}.in --api=1 )
+            endif()
+          endforeach()
+        endif()
       endforeach()
     endforeach()
   endforeach()
diff --git a/testing/input/genm2_32.in b/testing/input/genm2_32.in
new file mode 100644
index 0000000000000000000000000000000000000000..3d2a04f54b973ff0c0a78b2228ccfc4649b64839
--- /dev/null
+++ b/testing/input/genm2_32.in
@@ -0,0 +1,19 @@
+# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step]
+# Parameters that are not given receive their default values
+
+# GENM2
+# mtxfmt
+# nb: Tile size
+# m: Number of rows of matrix A
+# n: Number of columns of matrix A
+# lda: Leading dimension of matrix A
+# cond: The condition number
+# mode: the mode values for latms
+
+op = genm2
+nb = 16, 17
+m = 15, 25, 37
+n = 13, 23, 35
+lda = 41
+cond = 1., 1.e6
+mode = 1:6
diff --git a/testing/input/gepdf_qdwh_32.in b/testing/input/gepdf_qdwh_32.in
new file mode 100644
index 0000000000000000000000000000000000000000..80de2094db4a8f7ea63f49cfa4b0a57ed65fd835
--- /dev/null
+++ b/testing/input/gepdf_qdwh_32.in
@@ -0,0 +1,23 @@
+# You can enumerate each parameter's values as an explicit list separated by commas or by a range start:end[:step]
+# Parameters that are not given receive their default values
+
+# GEPDF_QDWH
+
+# nb: Tile size
+# ib: Inner tile size
+# m: Number of rows of the A matrix
+# n: Number of columns of the A matrix
+# lda: Leading dimension of the A matrix
+# ldb: Leading dimension of the H matrix
+# cond: The condition number
+# mode: the mode values for latms
+
+op = gepdf_qdwh
+nb = 8
+ib = 3
+m = 8, 32, 64
+n = 8, 16, 32
+lda = 79
+ldb = 78
+cond = 1., 1.e6
+mode = 1:6