diff --git a/.clang_complete b/.clang_complete
new file mode 100644
index 0000000000000000000000000000000000000000..fcc47adf3deff54f683d088e2c498eac0c122b6b
--- /dev/null
+++ b/.clang_complete
@@ -0,0 +1,32 @@
+-DCBLAS_HAS_CGEMM3M
+-DCBLAS_HAS_ZGEMM3M
+-DCHAMELEON_COPY_DIAG
+-DPRECISION_c
+-DPRECISION_d
+-DPRECISION_s
+-DPRECISION_z
+-I.
+-I./runtime/starpu/include
+-I./example/basic_zposv
+-I./example/lapack_to_chameleon
+-I./example/out_of_core
+-I./include
+-I./testing
+-I./timing
+-I./control
+-I./coreblas/include
+-I./cudablas/include
+-I./build/runtime/starpu/include
+-I./build/example/basic_zposv
+-I./build/example/lapack_to_chameleon
+-I./build/example/out_of_core
+-I./build/include
+-I./build/testing
+-I./build/timing
+-I./build/control
+-I./build/coreblas/include
+-I./build/cudablas/include
+-I/opt/intel/compilers_and_libraries/linux/mkl/include
+-I/usr/include/libxml2
+-I/usr/local/include
+-I/usr/local/include/starpu/1.3
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index bed4df6f140c9449d7c7201bf4249373c14d6146..b1c733b9826bde9225d13343da6ea4f1a007e366 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -101,6 +101,8 @@ set(ZSRC
     pzgetrf_incpiv.c
     pzgetrf_nopiv.c
     pzgetrf.c
+    pzgetrf_sp.c
+    pzgetrf_pp.c
     pzlacpy.c
     pzlange.c
     pzlansy.c
@@ -152,6 +154,8 @@ set(ZSRC
     zgetrf_incpiv.c
     zgetrf_nopiv.c
     zgetrf.c
+    zgetrf_sp.c
+    zgetrf_pp.c
     zgetrs_incpiv.c
     zgetrs_nopiv.c
     zlacpy.c
diff --git a/compute/pzgetrf_pp.c b/compute/pzgetrf_pp.c
new file mode 100644
index 0000000000000000000000000000000000000000..44f6f84bbb10665962877d4313c6ad3b287023b8
--- /dev/null
+++ b/compute/pzgetrf_pp.c
@@ -0,0 +1,667 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ *  @file pzgetrf.c
+ *
+ *  CHAMELON compute
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @author Omar Zenati
+ *  @author Terry Cojean
+ *  @date 2011-07-06
+ *  @precisions normal z -> c d s
+ *
+ **/
+#include "control/common.h"
+
+#define A(m, n) A, m, n
+#define Acopy(m, n) Acopy, m, n
+#define IPIV_h(it) IPIV+it
+#define perm_to_upper_h(it) perm_to_upper+it
+#define perm_from_upper_h(it) perm_from_upper+it
+#define IPIV_p(it, index) (((int*)(IPIV+it)->ptr)+index)
+#define perm_to_upper_p(it, index) (((int*)(perm_to_upper+it)->ptr)+index)
+#define perm_from_upper_p(it, index) (((int*)(perm_from_upper+it)->ptr)+index)
+
+#define ALLOCATE_DYNAMIC_HANDLE(_handle_, _type_, _size_)               \
+    CHAMELON_data_handle_t *_handle_;                                      \
+    if (_size_ > 0) {                                                   \
+        _handle_ = malloc(sizeof(CHAMELON_data_handle_t));                 \
+        RUNTIME_data_alloc(_handle_);                                   \
+        RUNTIME_vector_data_register(_handle_, -1, 0, _size_, sizeof(_type_));}
+
+
+#define ALLOCATE_DYNAMIC_HANDLE_PTR(_handle_, _type_, _size_)               \
+	CHAMELON_data_handle_t *_handle_; \
+	if (_size_ > 0) { \
+		_handle_ = malloc(sizeof(CHAMELON_data_handle_t)); \
+		_handle_->ptr = malloc(sizeof(_type_) * _size_); \
+		RUNTIME_data_alloc(_handle_); \
+		RUNTIME_vector_data_register(_handle_, 0, (uintptr_t)_handle_->ptr, _size_, sizeof(_type_));}
+
+static inline void pivot_to_permutation(int diag_size, int height,
+                                        int *IPIV, int *source, int *dest)
+{
+    int i;
+    int pivoted, inverse;
+
+    for(i=0; i < diag_size; i++)
+        source[i] = dest[i] = i + height;
+
+    for(i = 0; i < diag_size; i++) {
+        int ip = IPIV[i]-1;
+        assert( ip - height >= i );
+        if ( ip-height > i ) {
+            pivoted = source[i];
+
+            if (ip-height < diag_size) {
+                inverse = source[ip-height];
+                source[ip-height] = pivoted;
+            } else {
+                inverse = ip;
+                int j;
+                for(j=0; j < diag_size; j++) {
+                    if( dest[j] == ip ) {
+                        inverse = j + height;
+                        break;
+                    }
+                }
+            }
+
+            source[i] = inverse;
+
+            pivoted -= height;
+            inverse -= height;
+
+            if (pivoted < diag_size) dest[pivoted] = ip;
+            if (inverse < diag_size) dest[inverse] = i+height;
+        }
+    }
+}
+
+static void submit_swaps(CHAMELON_desc_t *A, int k, int n,
+                         CHAMELON_data_handle_t *IPIV_handle,
+                         CHAMELON_data_handle_t *source_handle, CHAMELON_data_handle_t *dest_handle,
+                         int *TILES_SRC, int *TILES_DST,
+                         CHAMELON_data_handle_t* idxs_from_upper_h,
+                         CHAMELON_data_handle_t* idxs_to_upper_h, CHAMELON_option_t *options)
+{
+    int tempk  = k * A->mb;
+    int tempkmt = k * A->mt;
+    int tempm  = A->m - tempk;
+    int tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+    int tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+    int tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+    CHAMELON_desc_t *updatePanel = chameleon_desc_submatrix(A, tempk, n*A->nb, tempm, tempnn);
+
+    if (options->swap_grain == CHAMELON_LINE_SWAPS) {
+        int m;
+
+        int nb_source = TILES_SRC[tempkmt];
+        ALLOCATE_DYNAMIC_HANDLE_PTR(handle_upper_to_itself, CHAMELON_Complex64_t, nb_source * tempnn);
+        /* ALLOCATE_DYNAMIC_HANDLE(handle_idxs_upper_to_itself, int, nb_source); */
+        if (nb_source > 0) {
+            m = 0;
+            int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+            CHAMELON_TASK_zswp_extract_rows2(options, nb_source, A->mb, k, k, k, tempmm, tempnn, IPIV_handle, source_handle, A(k,n), handle_upper_to_itself, &idxs_from_upper_h[tempkmt]);
+        }
+
+        for (m = 1 ; m < updatePanel->mt ; m++)
+        {
+            int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+            nb_source = TILES_SRC[tempkmt+m];
+            int nb_dest = TILES_DST[tempkmt+m];
+            if (nb_source > 0 || nb_dest > 0) {
+                /* ALLOC */
+                ALLOCATE_DYNAMIC_HANDLE_PTR(handle_to_upper_tile, CHAMELON_Complex64_t, nb_source * tempnn);
+                ALLOCATE_DYNAMIC_HANDLE_PTR(handle_from_upper_tile, CHAMELON_Complex64_t, nb_dest * tempnn);
+                /* ALLOCATE_DYNAMIC_HANDLE(handle_idxs_to_upper_tile, int, nb_source); */
+                /* ALLOCATE_DYNAMIC_HANDLE(handle_idxs_from_upper_tile, int, nb_dest); */
+
+                CHAMELON_TASK_zswp_extract_rows2(options, nb_dest, A->mb, k, k, k+m, tempmm, tempnn, IPIV_handle, dest_handle, A(k,n), handle_from_upper_tile, &idxs_to_upper_h[tempkmt+m]);
+                CHAMELON_TASK_zswp_extract_rows2(options, nb_source, A->mb, k, k+m, k, tempmm, tempnn, IPIV_handle, source_handle, A(k+m,n), handle_to_upper_tile, &idxs_from_upper_h[tempkmt+m]);
+
+                //Pour réinjecter ce que l'on a récup dans chacune
+                CHAMELON_TASK_zswp_insert_rows2(options, nb_source, A->mb, k, k+m, k, tempmm, tempnn, source_handle, A(k,n), handle_to_upper_tile, &idxs_from_upper_h[tempkmt+m]);
+                CHAMELON_TASK_zswp_insert_rows2(options, nb_dest, A->mb, k, k, k+m, tempmm, tempnn, dest_handle, A(k+m,n), handle_from_upper_tile, &idxs_to_upper_h[tempkmt+m]);
+
+                /* TODO: make these evolve to free the handle and its data */
+                /* RUNTIME_data_flush_handle(options, handle_to_upper_tile); */
+                /* RUNTIME_data_flush_handle(options, handle_from_upper_tile); */
+                RUNTIME_data_unregister_free_async(handle_to_upper_tile);
+                RUNTIME_data_unregister_free_async(handle_from_upper_tile);
+                /* RUNTIME_data_flush_handle(options, handle_idxs_to_upper_tile); */
+                /* RUNTIME_data_flush_handle(options, handle_idxs_from_upper_tile); */
+            }
+        }
+
+        nb_source = TILES_SRC[tempkmt];
+        if (nb_source > 0) {
+            m = 0;
+            int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+            CHAMELON_TASK_zswp_insert_rows2(options, nb_source, A->mb, k, k, k, tempmm, tempnn, source_handle, A(k,n), handle_upper_to_itself, &idxs_from_upper_h[tempkmt]);
+
+            /* RUNTIME_data_flush_handle(options, handle_upper_to_itself); */
+            RUNTIME_data_unregister_free_async(handle_upper_to_itself);
+            /* RUNTIME_data_flush_handle(options, handle_idxs_upper_to_itself); */
+        }
+    } else if (options->swap_grain == CHAMELON_TILE_SWAPS) {
+        int m;
+
+        for (m = 0 ; m < updatePanel->mt ; m++)
+        {
+            if (TILES_SRC[tempkmt+m] >= 1) {
+                CHAMELON_TASK_zswpp(options,
+                                   updatePanel, A(k, n), A(k+m, n),
+                                   1, tempkm,
+                                   IPIV_handle, 1);
+            }
+        }
+    } else if (options->swap_grain == CHAMELON_COLUMN_SWAPS) {
+        CHAMELON_TASK_zlaswp_ontile(options,
+                                 updatePanel, A(k, n), 1,
+                                 chameleon_min(tempkm, tempkn), IPIV_handle, 1);
+    } else {
+        /*
+         * Using something other than a DSD matrix without SWAPs produces
+         * unstable behaviour. Remove this assert with caution.
+         */
+        assert(options->dsd_matrix == 1);
+    }
+    /* TODO: free updatePanel asynchronously */
+}
+
+static void compute_swap_amount(CHAMELON_option_t *options, int k,
+                                CHAMELON_desc_t *A, CHAMELON_data_handle_t *IPIV,
+                                CHAMELON_data_handle_t *perm_from_upper,
+                                CHAMELON_data_handle_t *perm_to_upper,
+                                int* nbswaps_from_upper, int* nbswaps_to_upper,
+                                int* idxs_from_upper[], int* idxs_to_upper[],
+                                CHAMELON_data_handle_t* idxs_from_upper_h,
+                                CHAMELON_data_handle_t* idxs_to_upper_h)
+{
+    int tempk  = k * A->mb;
+    int tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+    int tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+    int minkmn = ((tempkm < tempkn)?tempkm:tempkn);
+
+    if (options->swap_grain == CHAMELON_LINE_SWAPS) {
+        int i;
+        pivot_to_permutation(tempkm, k*A->mb, IPIV_p(k,0),
+                             perm_from_upper_p(k,0), perm_to_upper_p(k,0));
+
+        for (i=0 ; i<tempkm ; i++) {
+            int index = (*perm_from_upper_p(k,i)-tempk)/((int)A->mb);
+            if (*perm_from_upper_p(k,i) != tempk+i) {
+                nbswaps_from_upper[A->mt*k+index] += 1;
+            }
+            index = (*perm_to_upper_p(k,i)-tempk)/((int)A->mb);
+            if (*perm_to_upper_p(k,i) != tempk+i) {
+                nbswaps_to_upper[A->mt*k+index] += 1;
+            }
+        }
+        int count_from_upper[A->mt], count_to_upper[A->mt];
+        for (i=0; i < A->mt; i++) {
+	        if (nbswaps_from_upper[A->nt*k+i] || nbswaps_to_upper[A->nt*k+i])
+	        {
+		        idxs_from_upper[A->nt*k+i] = (int*)malloc(nbswaps_from_upper[A->nt*k+i]*sizeof(int));
+		        /* RUNTIME_data_alloc(&idxs_from_upper_h[A->nt*k+i]); */
+		        /* RUNTIME_vector_data_register(&idxs_from_upper_h[A->nt*k+i], 0, */
+		        /*                              (uintptr_t)idxs_from_upper[A->nt*k+i], */
+		        /*                              nbswaps_from_upper[A->nt*k+i], sizeof(int)); */
+
+		        idxs_to_upper[A->nt*k+i] = (int*) malloc(nbswaps_to_upper[A->nt*k+i]*sizeof(int));
+		        /* RUNTIME_data_alloc(&idxs_to_upper_h[A->nt*k+i]); */
+		        /* RUNTIME_vector_data_register(&idxs_to_upper_h[A->nt*k+i], 0, */
+		        /*                              (uintptr_t)idxs_to_upper[A->nt*k+i], */
+		        /*                              nbswaps_to_upper[A->nt*k+i], sizeof(int)); */
+	        }
+	        count_from_upper[i] = 0;
+	        count_to_upper[i] = 0;
+        }
+
+        for (i=0 ; i < tempkm; i++)
+        {
+	        int index = (*perm_from_upper_p(k,i)-tempk)/((int)A->mb);
+	        if (*perm_from_upper_p(k,i) != tempk+i)
+		        idxs_from_upper[A->nt*k+index][count_from_upper[index]++] = i;
+
+	        index = (*perm_to_upper_p(k,i)-tempk)/((int)A->mb);
+	        if (*perm_to_upper_p(k,i) != tempk+i)
+		        idxs_to_upper[A->nt*k+index][count_to_upper[index]++] = i;
+        }
+
+        for (i=0; i < A->mt; i++) {
+	        idxs_from_upper_h[A->nt*k+i].data_handle = NULL;
+	        idxs_to_upper_h[A->nt*k+i].data_handle = NULL;
+	        if (nbswaps_from_upper[A->nt*k+i] || nbswaps_to_upper[A->nt*k+i])
+	        {
+		        RUNTIME_data_alloc(&idxs_from_upper_h[A->nt*k+i]);
+		        RUNTIME_vector_data_register(&idxs_from_upper_h[A->nt*k+i], 0,
+		                                     (uintptr_t)idxs_from_upper[A->nt*k+i],
+		                                     nbswaps_from_upper[A->nt*k+i], sizeof(int));
+
+		        RUNTIME_data_alloc(&idxs_to_upper_h[A->nt*k+i]);
+		        RUNTIME_vector_data_register(&idxs_to_upper_h[A->nt*k+i], 0,
+		                                     (uintptr_t)idxs_to_upper[A->nt*k+i],
+		                                     nbswaps_to_upper[A->nt*k+i], sizeof(int));
+	        }
+        }
+    } else if (options->swap_grain == CHAMELON_TILE_SWAPS) {
+        int i;
+        for (i=0 ; i<tempkm ; i++) {
+            int index = (*IPIV_p(k,i)-tempk-1)/((int)A->mb);
+            if (*IPIV_p(k,i) != tempk+i+1)
+                nbswaps_from_upper[A->mt*k+index] += 1;
+        }
+    }
+}
+
+static void factorize_panel(int rectil, CHAMELON_option_t *options,
+                            int k, CHAMELON_desc_t *A, CHAMELON_data_handle_t *IPIV,
+                            CHAMELON_data_handle_t *workspace)
+ {
+     int tempk  = k * A->mb;
+     int tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+     int tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+     int minkmn = ((tempkm < tempkn)?tempkm:tempkn);
+
+     if (rectil)
+     {
+         CHAMELON_TASK_zgetrf_rectil(
+             options,
+             chameleon_desc_submatrix(A, tempk, k*A->nb, A->m - tempk, tempkn),
+             IPIV);
+     }
+     else
+     {
+         int h, m, d, r;
+         int height, dmax, pui;
+         CHAMELON_data_handle_t *top;
+         CHAMELON_data_handle_t *bottom;
+         for(h = 0; h <= minkmn; h++) {
+            /* Apply last update and extract local max */
+            for (m = 0; m < A->mt; m++) {
+                int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+                CHAMELON_TASK_zgetrf_extract_max(options,
+                                              k,tempmm,tempkn,h,A->nb,
+                                              A(m,k),
+                                              workspace + m);
+            }
+            if(h != minkmn) {
+                /* Reduce max */
+                height = A->mt-k;
+                dmax = (int)ceil( log ( height ) / log(2.) );
+                /* if(h==0) */
+                /*   printf("k=%d, height=%d, dmax=%d\n",k,height,dmax); */
+                for(d = 0; d < dmax; d++){
+                    pui = (1 << d);
+                    for(r = 0; (r+pui) < height; r+=pui*2) {
+                        /* if(h == 0) */
+                        /*   printf("reduction entre %d et %d a %d\n",r+k,r+k+pui,k); */
+                        top = workspace + r + k;
+                        bottom = workspace + r + k + pui;
+                        CHAMELON_TASK_zgetrf_compare_workspace(options,
+                                                            tempkn, h, A->nb,
+                                                            top, bottom);
+                    }
+                }
+                /* Broadcast global max */
+                for (m = k+1; m < A->mt; m++) {
+                    top = workspace + m;
+                    bottom = workspace + k;
+                    CHAMELON_TASK_zgetrf_copy_workspace(options,
+                                                     tempkn, h, A->nb,
+                                                     top, bottom);
+
+                }
+                /* Save pivot */
+                CHAMELON_TASK_zgetrf_save_pivots(options, h, IPIV, workspace + k);
+            }
+        }
+
+     }
+
+ }
+
+void chameleon_pzgetrf_pp(CHAMELON_desc_t *A,
+                      CHAMELON_data_handle_t *IPIV,
+                      CHAMELON_data_handle_t *perm_from_upper,
+                      CHAMELON_data_handle_t *perm_to_upper,
+                      int *nbswaps_from_upper, int *nbswaps_to_upper,
+                      CHAMELON_data_handle_t *workspace,
+                      CHAMELON_sequence_t *sequence,
+                      CHAMELON_request_t *request)
+{
+    CHAMELON_context_t *chameleon;
+    CHAMELON_option_t options;
+    int k, m, n;
+    int tempkm, tempkn, tempmm, tempnn, minmnt;
+    int ldak, ldam;
+    int rectil = 0, ncores = -1;
+    int *cores = NULL;
+    int* idxs_from_upper[A->nt*A->mt];
+    int* idxs_to_upper[A->nt*A->mt];
+    CHAMELON_data_handle_t idxs_from_upper_h[A->nt*A->mt];
+    CHAMELON_data_handle_t idxs_to_upper_h[A->nt*A->mt];
+
+    CHAMELON_Complex64_t zone  = (CHAMELON_Complex64_t) 1.0;
+    CHAMELON_Complex64_t mzone = (CHAMELON_Complex64_t)-1.0;
+
+    chameleon = chameleon_context_self();
+    if (sequence->status != CHAMELON_SUCCESS)
+        return;
+
+    RUNTIME_options_init( &options, chameleon, sequence, request );
+
+    minmnt = (A->mt < A->nt)? A->mt : A->nt;
+
+    /* Treat the related chameleon context options */
+    if (CHAMELON_RECTIL_PANEL)
+    {
+        rectil = 1;
+        /* TODO: Fix it. This doesn't work with GPUs, it's too simple:
+         * we need to fetch a CPU list from the Runtime, and put ids from this list
+         * To do this, the best thing would be to push this into the runtime alloc function
+         */
+        if (CHAMELON_PANEL_THREADS > 0)
+        {
+            ncores = CHAMELON_PANEL_THREADS;
+            cores = (int*)malloc(ncores * sizeof(int));
+            for (k=0; k<ncores; k++)
+                cores[k] = k;
+        }
+        RUNTIME_alloc_threads(&options, ncores, cores);
+        free(cores);
+        CORE_zgetrf_rectil_init();
+    }
+
+    if (options.swap_grain == CHAMELON_LINE_SWAPS) {
+        assert(A->m%A->mb == 0);
+        assert(A->n%A->nb == 0);
+    }
+
+    /***********************/
+    /* Panel factorization */
+    /***********************/
+    options.priority = 2*A->nt;
+    factorize_panel(rectil, &options, 0, A,
+                    IPIV_h(0), workspace);
+
+    for (k = 0; k < minmnt; k++) {
+        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+        ldak = BLKLDD(A, k);
+        options.priority = 2*A->nt - k;
+
+        /***************************/
+        /* Compute amount of swaps */
+        /***************************/
+        RUNTIME_data_acquire(IPIV_h(k));
+        /* RUNTIME_data_acquire_write(perm_from_upper_h(k)); */
+        /* RUNTIME_data_acquire_write(perm_to_upper_h(k)); */
+        compute_swap_amount(&options, k, A, IPIV, perm_from_upper, perm_to_upper,
+                            nbswaps_from_upper, nbswaps_to_upper,
+                            idxs_from_upper, idxs_to_upper,
+                            idxs_from_upper_h, idxs_to_upper_h);
+        /* RUNTIME_data_release(perm_to_upper_h(k)); */
+        /* RUNTIME_data_release(perm_from_upper_h(k)); */
+        RUNTIME_data_release(IPIV_h(k));
+
+        /* TRSM_U */
+        for (n = k+1; n < A->nt; n++) {
+            tempnn = ((n == A->nt-1)? A->n-n*A->nb : A->nb);
+            options.priority = 2*A->nt-n-k;
+
+            /* options.priority = 2*A->nt - 2*k - n; */
+            unsigned old_swap = options.swap_grain;
+            if (n > k + 10)
+            /* if (n == k+1 || k == 0) */
+            /* if (k == 0) */
+	            /* options.swap_grain = CHAMELON_COLUMN_SWAPS; */
+            submit_swaps(A, k, n, IPIV_h(k), perm_from_upper_h(k), perm_to_upper_h(k), nbswaps_from_upper, nbswaps_to_upper, idxs_from_upper_h, idxs_to_upper_h, &options);
+            options.swap_grain = old_swap;
+
+            CHAMELON_TASK_ztrsm(
+                &options,
+                ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
+                tempkm, tempnn, tempnn,
+                zone, A(k, k), ldak,
+                A(k, n), ldak);
+
+            /* GEMM */
+            for (m = k+1; m < A->mt; m++) {
+	            /* options.priority = A->mt*A->nt - 2*k  - n - m; */
+                tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+                ldam = BLKLDD(A, m);
+
+                CHAMELON_TASK_zgemm(
+                    &options,
+                    ChamNoTrans, ChamNoTrans,
+                    tempmm, tempnn, tempkm, tempkn,
+                    mzone, A(m, k), ldam,
+                    A(k, n), ldak,
+                    zone,  A(m, n), ldam);
+            }
+
+
+            if (n==k+1)
+            {
+	            /* options.priority = 2*A->nt - 2*n; */
+                factorize_panel(rectil, &options, n, A,
+                                IPIV_h(n), workspace);
+            }
+        }
+
+        /* Backward swaps */
+        unsigned old_swap = options.swap_grain;
+        options.swap_grain = CHAMELON_COLUMN_SWAPS;
+        /* options.priority = 0; */
+        for (n = 0 ; n < k ; n++) {
+	        /* options.priority = 2*A->nt - 2*k  - n; */
+	        options.priority = 2*A->nt - k - n;
+	        submit_swaps(A, k, n, IPIV_h(k), perm_from_upper_h(k), perm_to_upper_h(k),
+	                     nbswaps_from_upper, nbswaps_to_upper,
+	                     idxs_from_upper_h, idxs_to_upper_h, &options);
+        }
+        options.swap_grain = old_swap;
+
+        /*******************************************/
+        /* Let's properly cleanup to help StarPU ! */
+        /*******************************************/
+
+        /* for (n = 0; n < A->nt; n++) */
+	      /*   RUNTIME_data_flush( &options, A(k, n) ); */
+
+        RUNTIME_data_flush_handle(&options, IPIV_h(k));
+        if (options.swap_grain == CHAMELON_LINE_SWAPS)
+	        for (m = 0 ; m < A->mt ; m++) {
+		        RUNTIME_data_flush_handle(&options, perm_to_upper_h(k));
+		        RUNTIME_data_flush_handle(&options, perm_from_upper_h(k));
+		        RUNTIME_data_flush_handle(&options, &idxs_from_upper_h[k*A->nt+m]);
+		        RUNTIME_data_flush_handle(&options, &idxs_to_upper_h[k*A->nt+m]);
+		        RUNTIME_data_unregister_free_async(&idxs_from_upper_h[k*A->nt+m]);
+		        RUNTIME_data_unregister_free_async(&idxs_to_upper_h[k*A->nt+m]);
+	        }
+    }
+
+    if (CHAMELON_RECTIL_PANEL)
+    {
+        RUNTIME_unalloc_threads(&options);
+    }
+    RUNTIME_options_finalize( &options, chameleon );
+    RUNTIME_flush();
+}
+
+
+/***********************************************************
+ * I do not know what the following version is.
+ * Omar also had this version so I will just leave it here
+ **********************************************************/
+
+/*  void chameleon_pzgetrf_pp_v2(CHAMELON_desc_t *A, CHAMELON_desc_t *Acopy, */
+/*                          CHAMELON_data_handle_t *IPIV, */
+/*                          CHAMELON_data_handle_t *perm_to_upper, */
+/*                          CHAMELON_data_handle_t *perm_from_upper, */
+/*                          CHAMELON_data_handle_t *workspace, */
+/*                          CHAMELON_sequence_t *sequence, */
+/*                          CHAMELON_request_t *request) */
+/* { */
+/*     CHAMELON_context_t *chameleon; */
+/*     CHAMELON_option_t options; */
+/*     int k, m, n, h, d, r; */
+/*     int height, pui, dmax; */
+/*     int tempkm, tempkn, tempmm,tempnn; */
+/*     int minkmn, minmnt; */
+/*     int *pivots; */
+/*     int ldak, ldam; */
+/*     CHAMELON_data_handle_t *top, *bottom; */
+/*     int nb_pivots; */
+
+/*     CHAMELON_Complex64_t zone  = (CHAMELON_Complex64_t) 1.0; */
+/*     CHAMELON_Complex64_t mzone = (CHAMELON_Complex64_t)-1.0; */
+
+/*     chameleon = chameleon_context_self(); */
+/*     if (sequence->status != CHAMELON_SUCCESS) */
+/*         return; */
+
+/*     RUNTIME_options_init( &options, chameleon, sequence, request ); */
+
+/*     for (n = 0; n < A->nt; n++) { */
+
+/*         tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; */
+/*         minmnt = (A->mt < n)? A->mt : n; */
+
+/*         /\***********************\/ */
+/*         /\*        Update       *\/ */
+/*         /\***********************\/ */
+/*         for (k = 0; k < minmnt; k++) { */
+/*             tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; */
+/*             tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; */
+/*             minkmn = ((tempkm < A->nb) ? tempkm : A->nb); */
+/*             ldak = BLKLDD(A, k); */
+
+/*             /\* SWAP *\/ */
+/*             CHAMELON_TASK_zlacpy( &options, ChamUpperLower, */
+/*                                tempkm, tempnn, tempkn, */
+/*                                A(k,n), ldak, */
+/*                                Acopy(0,n), ldak); */
+
+/*             pivots = perm_to_upper_p(k); */
+/*             for (m = k+1; m < A->mt; m++) { */
+/*                 tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; */
+/*                 ldam = BLKLDD(A, m); */
+
+/*                 nb_pivots = how_much_swap_to(tempkm, A->mb, m, pivots); */
+/*                 /\* printf("nb_pivots=%d between A(%d,%d) and A(%d,%d)\n",nb_pivots,k,n,m,n); *\/ */
+/*                 if(nb_pivots > 0) { */
+/*                     ALLOCATE_DYNAMIC_HANDLE(handle_to_upper_tile, CHAMELON_Complex64_t, nb_pivots * A->nb); */
+/*                     ALLOCATE_DYNAMIC_HANDLE(handle_from_upper_tile, CHAMELON_Complex64_t, nb_pivots * A->nb); */
+
+/*                     CHAMELON_TASK_zswp_extract_rows(&options, k, k, m, tempkm, tempmm, tempnn, A->mb, perm_from_upper_h(k), Acopy(0,n), handle_from_upper_tile); */
+/*                     CHAMELON_TASK_zswp_extract_rows(&options, k, m, k, tempkm, tempmm, tempnn, A->mb, perm_to_upper_h(k), A(m,n), handle_to_upper_tile); */
+
+/*                     CHAMELON_TASK_zswp_insert_rows(&options, k, m, k, tempkm, tempmm, tempnn, A->mb, perm_to_upper_h(k), A(k,n), handle_to_upper_tile); */
+/*                     CHAMELON_TASK_zswp_insert_rows(&options, k, k, m, tempkm, tempmm, tempnn, A->mb, perm_from_upper_h(k), A(m,n), handle_from_upper_tile); */
+/*                 } */
+/*             } */
+
+/*             nb_pivots = how_much_swap_to(tempkm, A->mb, k, pivots); */
+
+/*             /\* printf("nb_pivots=%d in A(%d,%d)\n",nb_pivots,k,n); *\/ */
+/*             if(nb_pivots > 0) { */
+/*                 ALLOCATE_DYNAMIC_HANDLE(handle_to_upper_tile, CHAMELON_Complex64_t, nb_pivots * A->nb); */
+/*                 CHAMELON_TASK_zswp_extract_rows(&options, k, k, k, tempkm, tempkm, tempnn, A->mb, perm_to_upper_h(k), Acopy(0,n), handle_to_upper_tile); */
+/*                 CHAMELON_TASK_zswp_insert_rows(&options, k, k, k, tempkm, tempkm, tempnn, A->mb, perm_to_upper_h(k), A(k,n), handle_to_upper_tile); */
+/*             } */
+
+/*             /\* TRSM_U *\/ */
+/*             CHAMELON_TASK_ztrsm( */
+/*                 &options, */
+/*                 ChamLeft, ChamLower, ChamNoTrans, ChamUnit, */
+/*                 tempkm, tempnn, tempkm, */
+/*                 zone, A(k, k), ldak, */
+/*                 A(k, n), ldak); */
+
+/*             /\* GEMM *\/ */
+/*             for (m = k+1; m < A->mt; m++) { */
+/*                 tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; */
+/*                 CHAMELON_TASK_zgemm( */
+/* 		    &options, */
+/* 		    ChamNoTrans, ChamNoTrans, */
+/* 		    tempmm, tempnn, tempkm, tempkn, */
+/* 		    mzone, A(m, k), ldam, */
+/* 		    A(k, n), ldak, */
+/* 		    zone,  A(m, n), ldam); */
+/*             } */
+/*         } */
+
+/*         if (n < A->mt) { */
+/*             /\***********************\/ */
+/*             /\* Panel factorization *\/ */
+/*             /\***********************\/ */
+
+/*             tempkm = n == A->mt-1 ? A->m-n*A->mb : A->mb; */
+/*             tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; */
+/*             minkmn = ((tempkm < tempkn)?tempkm:tempkn); */
+
+/*             for(h = 0; h <= minkmn; h++) { */
+/*                 /\* Apply last update and extract local max *\/ */
+/*                 for (m = n; m < A->mt; m++) { */
+/*                     tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; */
+/*                     CHAMELON_TASK_zgetrf_extract_max(&options, */
+/*                                                   n,tempmm,tempkn,h,A->nb, */
+/*                                                   A(m,n), */
+/*                                                   workspace + m); */
+/*                 } */
+/*                 if(h != minkmn) { */
+/*                     /\* Reduce max *\/ */
+/*                     height = A->mt-n; */
+/*                     dmax = (int)ceil( log ( height ) / log(2.) ); */
+/*                     /\* if(h==0) *\/ */
+/*                     /\*   printf("k=%d, height=%d, dmax=%d\n",k,height,dmax); *\/ */
+/*                     for(d = 0; d < dmax; d++){ */
+/*                         pui = (1 << d); */
+/*                         for(r = 0; (r+pui) < height; r+=pui*2) { */
+/*                             /\* if(h == 0) *\/ */
+/*                             /\*   printf("reduction entre %d et %d a %d\n",r+k,r+k+pui,k); *\/ */
+/*                             top = workspace + r + n; */
+/*                             bottom = workspace + r + n + pui; */
+/*                             CHAMELON_TASK_zgetrf_compare_workspace(&options, */
+/*                                                                 tempkn, h, A->nb, */
+/*                                                                 top, bottom); */
+/*                         } */
+/*                     } */
+/*                     /\* Broadcast global max *\/ */
+/*                     for (m = n+1; m < A->mt; m++) { */
+/*                         top = workspace + m; */
+/*                         bottom = workspace + n; */
+/*                         CHAMELON_TASK_zgetrf_copy_workspace(&options, */
+/*                                                          tempkn, h, A->nb, */
+/*                                                          top, bottom); */
+
+/*                     } */
+/*                     /\* Save pivot *\/ */
+/*                     CHAMELON_TASK_zgetrf_save_pivots(&options, h, IPIV_h(n), workspace + n); */
+/*                 } */
+/*             } */
+
+/*             RUNTIME_data_acquire(IPIV_h(k)); */
+/*             pivot_to_permutation(minkmn, k*A->mb, IPIV_p(k), perm_to_upper_p(k), perm_from_upper_p(k)); */
+/*             RUNTIME_data_release(IPIV_h(k)); */
+/*         } */
+/*     } */
+
+/*     RUNTIME_options_finalize(&options, chameleon); */
+/* } */
diff --git a/compute/pzgetrf_sp.c b/compute/pzgetrf_sp.c
new file mode 100644
index 0000000000000000000000000000000000000000..76555adae0a147c61775e359fc9bf00be185dbf0
--- /dev/null
+++ b/compute/pzgetrf_sp.c
@@ -0,0 +1,117 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file pzgetrf_sp.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @date 2011-07-06
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+#define A(m,n) A,  m,  n
+
+/***************************************************************************//**
+ *  Parallel tile LU decomposition - dynamic scheduling
+ **/
+void chameleon_pzgetrf_sp(CHAMELON_desc_t *A,
+		      double criteria, CHAMELON_data_handle_t *nb_pivot_handle,
+                      CHAMELON_sequence_t *sequence, CHAMELON_request_t *request)
+{
+    CHAMELON_context_t *chameleon;
+    CHAMELON_option_t options;
+
+    int k, m, n;
+    int tempkm, tempkn, tempmm,tempnn;
+    int minmnt;
+    int ldak, ldam;
+    size_t ws_host   = 0;
+
+    CHAMELON_Complex64_t zone  = (CHAMELON_Complex64_t) 1.0;
+    CHAMELON_Complex64_t mzone = (CHAMELON_Complex64_t)-1.0;
+
+    chameleon = chameleon_context_self();
+    if (sequence->status != CHAMELON_SUCCESS)
+        return;
+    RUNTIME_options_init(&options, chameleon, sequence, request);
+
+#ifdef CHAMELEON_USE_MAGMA
+    if (0) /* Disable the workspace as long as it is is not used (See StarPU codelet) */
+    {
+        int nb = CHAMELON_IB; /* Approximate nb for simulation */
+#if !defined(CHAMELEON_SIMULATION)
+        nb = magma_get_zpotrf_nb(A->nb);
+#endif
+        ws_host = sizeof(CHAMELON_Complex64_t)*nb*nb;
+    }
+#endif
+    RUNTIME_options_ws_alloc( &options, 0, ws_host );
+
+    minmnt = (A->mt < A->nt)? A->mt : A->nt;
+    for (k = 0; k < minmnt; k++) {
+        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+        ldak = BLKLDD(A, k);
+
+        CHAMELON_TASK_zgetrf_sp(
+            &options,
+            tempkm, tempkn,
+            A(k, k), criteria, nb_pivot_handle);
+        /* TRSM_L */
+        for (m = k+1; m < A->mt; m++) {
+            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+            ldam = BLKLDD(A, m);
+            CHAMELON_TASK_ztrsm(
+                &options,
+                ChamRight, ChamUpper, ChamNoTrans, ChamNonUnit,
+                tempkm, tempmm, A->mb,
+                zone, A(k, k), ldak,
+                      A(m, k), ldam);
+        }
+
+        /* TRSM_U */
+        for (n = k+1; n < A->nt; n++) {
+            tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+            CHAMELON_TASK_ztrsm(
+                &options,
+                ChamLeft, ChamLower, ChamNoTrans, ChamUnit,
+                tempkm, tempnn, A->nb,
+                zone, A(k, k), ldak,
+                      A(k, n), ldak);
+        }
+
+        /* GEMM */
+        for (m = k+1; m < A->mt; m++) {
+            tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
+            ldam = BLKLDD(A, m);
+            for (n = k+1; n < A->nt; n++) {
+                tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
+                CHAMELON_TASK_zgemm(
+		    &options,
+		    ChamNoTrans, ChamNoTrans,
+		    tempmm, tempnn, A->mb, A->mb,
+		    mzone, A(m, k), ldam,
+		           A(k, n), ldak,
+		    zone,  A(m, n), ldam);
+            }
+        }
+    }
+
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, chameleon);
+}
diff --git a/compute/pzplrnt.c b/compute/pzplrnt.c
index b6fba32067df9ea3a9487302f90569238003efd2..fddb3b6c8e321d91e90b44164011c8e3ebb0f1e3 100644
--- a/compute/pzplrnt.c
+++ b/compute/pzplrnt.c
@@ -53,7 +53,7 @@ void chameleon_pzplrnt( CHAM_desc_t *A,
             INSERT_TASK_zplrnt(
                 &options,
                 tempmm, tempnn, A(m, n),
-                bigM, m*A->mb + m0, n*A->nb + n0, seed );
+                bigM, m*A->mb + m0, n*A->nb + n0, seed, options.dsd_matrix );
         }
     }
     RUNTIME_options_finalize(&options, chamctxt);
diff --git a/compute/zgetrf_pp.c b/compute/zgetrf_pp.c
new file mode 100644
index 0000000000000000000000000000000000000000..cd09d6ca1c8e9a18385b5b77550455db680b8b17
--- /dev/null
+++ b/compute/zgetrf_pp.c
@@ -0,0 +1,392 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file zgetrf_pp.c
+ *
+ *  CHAMELON computational routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 2.6.0
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @date 2011-07-06
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+/***************************************************************************//**
+ *
+ * @ingroup CHAMELON_Complex64_t
+ *
+ *  CHAMELON_zgetrf_pp - Computes an LU factorization of a general M-by-N matrix A
+ *  using the tile LU algorithm with partial pivoting with row interchanges.
+ *
+ *******************************************************************************
+ *
+ * @param[in] M
+ *          The number of rows of the matrix A. M >= 0.
+ *
+ * @param[in] N
+ *          The number of columns of the matrix A. N >= 0.
+ *
+ * @param[in,out] A
+ *          On entry, the M-by-N matrix to be factored.
+ *          On exit, the tile factors L and U from the factorization.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,M).
+ *
+ * @param[out] IPIV
+ *          The pivot indices that define the permutations (not equivalent to LAPACK).
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval CHAMELON_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *          \retval >0 if i, U(i,i) is exactly zero. The factorization has been completed,
+ *               but the factor U is exactly singular, and division by zero will occur
+ *               if it is used to solve a system of equations.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELON_zgetrf_Tile
+ * @sa CHAMELON_zgetrf_Tile_Async
+ * @sa CHAMELON_cgetrf
+ * @sa CHAMELON_dgetrf
+ * @sa CHAMELON_sgetrf
+ * @sa CHAMELON_zgetrs
+ *
+ ******************************************************************************/
+/* TODO: Fix the untiled version !! */
+int CHAMELON_zgetrf_pp(int M, int N,
+                    CHAMELON_Complex64_t *A, int LDA, int *IPIV)
+{
+    int NB;
+    int status;
+    CHAMELON_desc_t descAl, descAt;
+    CHAMELON_context_t *chameleon;
+    CHAMELON_sequence_t *sequence = NULL;
+    CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER;
+
+    chameleon = chameleon_context_self();
+    if (chameleon == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_pp", "CHAMELON not initialized");
+        return CHAMELON_ERR_NOT_INITIALIZED;
+    }
+    /* Check input arguments */
+    if (M < 0) {
+        chameleon_error("CHAMELON_zgetrf_pp", "illegal value of M");
+        return -1;
+    }
+    if (N < 0) {
+        chameleon_error("CHAMELON_zgetrf_pp", "illegal value of N");
+        return -2;
+    }
+    if (LDA < chameleon_max(1, M)) {
+        chameleon_error("CHAMELON_zgetrf_pp", "illegal value of LDA");
+        return -4;
+    }
+    /* Quick return */
+    if (chameleon_min(M, N) == 0)
+        return CHAMELON_SUCCESS;
+
+    chameleon_sequence_create(chameleon, &sequence);
+
+    /* Set NT & NTRHS */
+    NB   = CHAMELON_NB;
+
+    /* Submit the matrix conversion */
+    chameleon_zlap2tile( chameleon, &descAl, &descAt, ChamDescInout, ChamUpperLower,
+                     A, NB, NB, LDA, N, N, N, sequence, &request);
+
+    int minmn = (M < N) ? M : N;
+    int nb_pivots = minmn / NB + ((minmn % NB) ? 1 : 0);
+    int minmnt = nb_pivots * NB;
+    int *nbswaps_from_upper = (int*)calloc(minmn, sizeof(int));
+    int *nbswaps_to_upper = (int*)calloc(minmn, sizeof(int));
+    int *swaps_from_upper = (int*)calloc(minmnt * descAt.nb, sizeof(int));
+    int *swaps_to_upper = (int*)calloc(minmnt * descAt.nb, sizeof(int));
+
+    CHAMELON_data_handle_t IPIV_h[nb_pivots];
+    CHAMELON_data_handle_t swaps_from_upper_h[nb_pivots];
+    CHAMELON_data_handle_t swaps_to_upper_h[nb_pivots];
+
+    int i;
+    for(i = 0; i < nb_pivots; i++) {
+	    RUNTIME_data_alloc(IPIV_h + i);
+	    RUNTIME_vector_data_register(IPIV_h + i, 0, (uintptr_t)(IPIV + i * descAt.nb), descAt.nb, sizeof(int));
+
+	    RUNTIME_data_alloc(swaps_from_upper_h + i);
+	    RUNTIME_vector_data_register(swaps_from_upper_h + i, 0,
+	                                 (uintptr_t)(swaps_from_upper + i * descAt.nb),
+	                                 descAt.nb, sizeof(int));
+
+	    RUNTIME_data_alloc(swaps_to_upper_h + i);
+	    RUNTIME_vector_data_register(swaps_to_upper_h + i, 0,
+	                                 (uintptr_t)(swaps_to_upper+ i * descAt.nb),
+	                                 descAt.nb, sizeof(int));
+    }
+
+    int nb_workspace = descAt.mt;
+    int workspace_size = NB * 2 + 1;
+    CHAMELON_data_handle_t *max_workspace = malloc(sizeof(CHAMELON_data_handle_t) * nb_workspace);
+    CHAMELON_Complex64_t **workspaces = malloc(sizeof(CHAMELON_Complex64_t *) * nb_workspace);
+
+    for(i = 0; i < nb_workspace; i++) {
+	    workspaces[i] = malloc(sizeof(CHAMELON_Complex64_t) * workspace_size);
+	    RUNTIME_vector_data_register(max_workspace + i, 0, (uintptr_t)(workspaces[i]),
+	                                 workspace_size, sizeof(CHAMELON_Complex64_t));
+    }
+
+    /* Call the tile interface */
+    CHAMELON_zgetrf_pp_Tile_Async(&descAt, IPIV_h, swaps_from_upper_h, swaps_to_upper_h,
+                               nbswaps_from_upper, nbswaps_to_upper, max_workspace,
+                               sequence, &request);
+
+
+    chameleon_ztile2lap( chameleon, &descAl, &descAt,
+                     ChamDescInout, ChamUpperLower, sequence, &request );
+
+    chameleon_sequence_wait( chameleon, sequence );
+
+    /* Cleanup the temporary data */
+    chameleon_ztile2lap_cleanup( chameleon, &descAl, &descAt );
+
+    for (i = 0 ; i < nb_pivots ; i++)  {
+	    RUNTIME_data_unregister(IPIV_h+i);
+	    RUNTIME_data_unregister(swaps_from_upper_h+i);
+	    RUNTIME_data_unregister(swaps_to_upper_h+i);
+    }
+
+    free(nbswaps_from_upper);
+    free(nbswaps_to_upper);
+    free(swaps_from_upper);
+    free(swaps_to_upper);
+    free(max_workspace);
+    for(i = 0; i < nb_workspace; i++) {
+	    free(workspaces[i]);
+    }
+    free(workspaces);
+
+    status = sequence->status;
+    chameleon_sequence_destroy( chameleon, sequence );
+
+    return status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CHAMELON_Complex64_t_Tile
+ *
+ *  CHAMELON_zgetrf_pp_Tile - Computes the tile LU factorization of a matrix.
+ *  Tile equivalent of CHAMELON_zgetrf().
+ *  Operates on matrices stored by tiles.
+ *  All matrices are passed through descriptors.
+ *  All dimensions are taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] A
+ *          On entry, the M-by-N matrix to be factored.
+ *          On exit, the tile factors L and U from the factorization.
+ *
+ * @param[out] IPIV
+ *          The pivot indices that define the permutations (not equivalent to LAPACK).
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval CHAMELON_SUCCESS successful exit
+ *          \retval >0 if i, U(i,i) is exactly zero. The factorization has been completed,
+ *               but the factor U is exactly singular, and division by zero will occur
+ *               if it is used to solve a system of equations.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELON_zgetrf
+ * @sa CHAMELON_zgetrf_Tile_Async
+ * @sa CHAMELON_cgetrf_Tile
+ * @sa CHAMELON_dgetrf_Tile
+ * @sa CHAMELON_sgetrf_Tile
+ * @sa CHAMELON_zgetrs_Tile
+ *
+ ******************************************************************************/
+int CHAMELON_zgetrf_pp_Tile(CHAMELON_desc_t *A, int *IPIV)
+{
+    /* FIXME: Actually use correct nb/mb/min(...) scheme for computing pivot amount */
+    int minmn = chameleon_min(A->m, A->n);
+    int minmnt = (A->mt < A->nt) ? A->mt : A->nt;
+    int nb_pivots = minmn / A->nb + ((minmn % A->nb) ? 1 : 0);
+    int status;
+    int i;
+    CHAMELON_context_t *chameleon = chameleon_context_self();
+    CHAMELON_sequence_t *sequence = NULL;
+    CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER;
+
+    int *swaps_from_upper = (int*)calloc(chameleon_max(A->m, A->n), sizeof(int));
+    int *swaps_to_upper = (int*)calloc(chameleon_max(A->m, A->n), sizeof(int));
+    CHAMELON_data_handle_t IPIV_h[nb_pivots];
+    CHAMELON_data_handle_t swaps_from_upper_h[nb_pivots];
+    CHAMELON_data_handle_t swaps_to_upper_h[nb_pivots];
+
+    int *nbswaps_from_upper = (int*)calloc(nb_pivots * A->mt, sizeof(int));
+    int *nbswaps_to_upper = (int*)calloc(nb_pivots * A->mt, sizeof(int));
+
+    int nb_workspace = A->mt;
+    int workspace_size = A->nb * 2 + 1;
+    CHAMELON_data_handle_t *max_workspace = malloc(sizeof(CHAMELON_data_handle_t) * nb_workspace);
+    CHAMELON_Complex64_t *workspaces = malloc(sizeof(CHAMELON_Complex64_t) * nb_workspace * workspace_size);
+
+    for(i = 0; i < nb_workspace; i++) {
+        RUNTIME_data_alloc(max_workspace + i);
+        RUNTIME_vector_data_register(max_workspace + i, 0,
+                                     (uintptr_t)(workspaces + i * workspace_size),
+                                     workspace_size, sizeof(CHAMELON_Complex64_t));
+    }
+
+    for(i = 0; i < nb_pivots; i++) {
+        int tempin = i == A->nt-1 ? A->n-i*A->nb : A->nb;
+        RUNTIME_data_alloc(IPIV_h + i);
+        RUNTIME_vector_data_register(IPIV_h + i, 0,
+                                     (uintptr_t)(IPIV + i * A->nb),
+                                     tempin, sizeof(int));
+
+        RUNTIME_data_alloc(swaps_from_upper_h + i);
+        RUNTIME_vector_data_register(swaps_from_upper_h + i, 0,
+                                     (uintptr_t)(swaps_from_upper + i * A->nb),
+                                     tempin, sizeof(int));
+
+        RUNTIME_data_alloc(swaps_to_upper_h + i);
+        RUNTIME_vector_data_register(swaps_to_upper_h + i, 0,
+                                     (uintptr_t)(swaps_to_upper+ i * A->nb),
+                                     tempin, sizeof(int));
+    }
+
+
+    chameleon_sequence_create(chameleon, &sequence);
+
+    CHAMELON_zgetrf_pp_Tile_Async(A, IPIV_h, swaps_from_upper_h, swaps_to_upper_h,
+                               nbswaps_from_upper, nbswaps_to_upper,
+                               max_workspace, sequence, &request);
+
+    CHAMELON_Desc_Flush( A, sequence );
+    chameleon_sequence_wait(chameleon, sequence);
+
+    for (i = 0 ; i < nb_pivots ; i++)  {
+        RUNTIME_data_unregister(IPIV_h+i);
+        RUNTIME_data_free(IPIV_h+i);
+
+        RUNTIME_data_unregister(swaps_from_upper_h+i);
+        RUNTIME_data_free(swaps_from_upper_h+i);
+
+        RUNTIME_data_unregister(swaps_to_upper_h+i);
+        RUNTIME_data_free(swaps_to_upper_h+i);
+    }
+
+    free(nbswaps_from_upper);
+    free(nbswaps_to_upper);
+    free(swaps_from_upper);
+    free(swaps_to_upper);
+
+    for(i = 0; i < nb_workspace; i++) {
+        RUNTIME_data_unregister(max_workspace+i);
+        RUNTIME_data_free(max_workspace+i);
+    }
+    free(max_workspace);
+    free(workspaces);
+
+    status = sequence->status;
+    chameleon_sequence_destroy(chameleon, sequence);
+    return status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CHAMELON_Complex64_t_Tile_Async
+ *
+ *  CHAMELON_zgetrf_pp_Tile_Async - Computes the tile LU factorization of a matrix.
+ *  Non-blocking equivalent of CHAMELON_zgetrf_Tile().
+ *  May return before the computation is finished.
+ *  Allows for pipelining of operations at runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELON_zgetrf
+ * @sa CHAMELON_zgetrf_Tile
+ * @sa CHAMELON_cgetrf_Tile_Async
+ * @sa CHAMELON_dgetrf_Tile_Async
+ * @sa CHAMELON_sgetrf_Tile_Async
+ * @sa CHAMELON_zgetrs_Tile_Async
+ *
+ ******************************************************************************/
+int CHAMELON_zgetrf_pp_Tile_Async(CHAMELON_desc_t *A,
+                               CHAMELON_data_handle_t *IPIV_h,
+                               CHAMELON_data_handle_t *swaps_from_upper_h,
+                               CHAMELON_data_handle_t *swaps_to_upper_h,
+                               int *nbswaps_from_upper,
+                               int *nbswaps_to_upper,
+                               CHAMELON_data_handle_t *workspace_h,
+                               CHAMELON_sequence_t *sequence,
+                               CHAMELON_request_t *request)
+{
+     CHAMELON_context_t *chameleon;
+
+    chameleon = chameleon_context_self();
+    if (chameleon == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_pp_Tile", "CHAMELON not initialized");
+        return CHAMELON_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_pp_Tile", "NULL sequence");
+        return CHAMELON_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_pp_Tile", "NULL request");
+        return CHAMELON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if (sequence->status == CHAMELON_SUCCESS)
+        request->status = CHAMELON_SUCCESS;
+    else
+        return chameleon_request_fail(sequence, request, CHAMELON_ERR_SEQUENCE_FLUSHED);
+
+    /* Check descriptors for correctness */
+    if (chameleon_desc_check(A) != CHAMELON_SUCCESS) {
+        chameleon_error("CHAMELON_zgetrf_pp_Tile", "invalid first descriptor");
+        return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE);
+    }
+
+    /* Check input arguments */
+    if (A->nb != A->mb) {
+        chameleon_error("CHAMELON_zgetrf_pp_Tile", "only square tiles supported");
+        return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE);
+    }
+
+    chameleon_pzgetrf_pp(A, IPIV_h, swaps_from_upper_h, swaps_to_upper_h,
+                     nbswaps_from_upper, nbswaps_to_upper, workspace_h,
+                     sequence, request);
+
+    return CHAMELON_SUCCESS;
+}
diff --git a/compute/zgetrf_sp.c b/compute/zgetrf_sp.c
new file mode 100644
index 0000000000000000000000000000000000000000..df4ae6fdede7d972f3fbbb3dde2d0265ae336120
--- /dev/null
+++ b/compute/zgetrf_sp.c
@@ -0,0 +1,273 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file zgetrf_sp.c
+ *
+ *  CHAMELON computational routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 2.6.0
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @date 2011-07-06
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+/***************************************************************************//**
+ *
+ * @ingroup CHAMELON_Complex64_t
+ *
+ *  CHAMELON_zgetrf_sp - Computes the LU decomposition of a matrix A.
+ *  The factorization has the form
+ *
+ *    \f[ A = \{_{L\times U} \f]
+ *
+ *  where U is an upper triangular matrix and L is a lower triangular matrix.
+ *
+ *******************************************************************************
+ *
+ * @param[in] M
+ *          The order of the matrix A. M >= 0.
+ *
+ * @param[in] N
+ *          The order of the matrix A. N >= 0.
+ *
+ * @param[in,out] A
+ *          On entry, the matrix A.
+ *          On exit, if return value = 0, the factor U or L from the LU decomposition
+ *          A = LU.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,N).
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval CHAMELON_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *          \retval >0 if i, the leading minor of order i of A is not positive definite, so the
+ *               factorization could not be completed, and the solution has not been computed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELON_zgetrf_sp_Tile
+ * @sa CHAMELON_zgetrf_sp_Tile_Async
+ * @sa CHAMELON_cgetrf_sp
+ * @sa CHAMELON_dgetrf_sp
+ * @sa CHAMELON_sgetrf_sp
+ *
+ ******************************************************************************/
+int CHAMELON_zgetrf_sp(int M, int N, CHAMELON_Complex64_t *A, int LDA,
+                    double criteria, int *nb_pivot)
+{
+    int NB;
+    int status;
+    CHAMELON_context_t *chameleon;
+    CHAMELON_sequence_t *sequence = NULL;
+    CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER;
+    CHAMELON_desc_t descAl, descAt;
+
+    chameleon = chameleon_context_self();
+    if (chameleon == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_sp", "CHAMELON not initialized");
+        return CHAMELON_ERR_NOT_INITIALIZED;
+    }
+    /* Check input arguments */
+    if (M < 0) {
+        chameleon_error("CHAMELON_zgetrf_sp", "illegal value of M");
+        return -2;
+    }
+    if (N < 0) {
+        chameleon_error("CHAMELON_zgetrf_sp", "illegal value of N");
+        return -2;
+    }
+    if (LDA < chameleon_max(1, M)) {
+        chameleon_error("CHAMELON_zgetrf_sp", "illegal value of LDA");
+        return -4;
+    }
+    /* Quick return */
+    if ((chameleon_max(M, 0) == 0) || (chameleon_max(N, 0) == 0))
+        return CHAMELON_SUCCESS;
+
+    /* Set NT */
+    NB = CHAMELON_NB;
+
+    /* Registering nb_pivot */
+    int nb = 0;
+    CHAMELON_data_handle_t nb_pivot_handle;
+    RUNTIME_vector_data_register(&nb_pivot_handle, 0, (uintptr_t)&nb, 1, sizeof(int));
+
+    chameleon_sequence_create(chameleon, &sequence);
+
+    chameleon_zlap2tile( chameleon, &descAl, &descAt, ChamDescInout, ChamUpperLower,
+                     A, NB, NB, LDA, N, N, N, sequence, &request);
+
+    /* Call the tile interface */
+    CHAMELON_zgetrf_sp_Tile_Async(&descAt, sequence, &request, criteria, &nb_pivot_handle);
+
+    chameleon_ztile2lap( chameleon, &descAl, &descAt,
+                     ChamDescInout, ChamUpperLower, sequence, &request );
+    chameleon_sequence_wait(chameleon, sequence);
+
+    /* Cleanup the temporary data */
+    chameleon_ztile2lap_cleanup( chameleon, &descAl, &descAt );
+
+    status = sequence->status;
+    chameleon_sequence_destroy(chameleon, sequence);
+
+    RUNTIME_data_unregister(&nb_pivot_handle);
+
+    RUNTIME_data_acquire(&nb_pivot_handle);
+    RUNTIME_data_release(&nb_pivot_handle);
+    *nb_pivot = nb;
+
+    return status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CHAMELON_Complex64_t_Tile
+ *
+ *  CHAMELON_zgetrf_sp_Tile - Computes the LU decomposition of matrix.
+ *  Tile equivalent of CHAMELON_zgetrf_sp().
+ *  Operates on matrices stored by tiles.
+ *  All matrices are passed through descriptors.
+ *  All dimensions are taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ *
+ * @param[in] A
+ *          On entry, the matrix A.
+ *          On exit, if return value = 0, the factor U or L from the LU decomposition
+ *          A = LU.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval CHAMELON_SUCCESS successful exit
+ *          \retval >0 if i, the leading minor of order i of A is not positive definite, so the
+ *               factorization could not be completed, and the solution has not been computed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELON_zgetrf_sp
+ * @sa CHAMELON_zgetrf_sp_Tile_Async
+ * @sa CHAMELON_cgetrf_sp_Tile
+ * @sa CHAMELON_dgetrf_sp_Tile
+ * @sa CHAMELON_sgetrf_sp_Tile
+ *
+ ******************************************************************************/
+int CHAMELON_zgetrf_sp_Tile(CHAMELON_desc_t *A, double criteria)
+{
+    CHAMELON_context_t *chameleon;
+    CHAMELON_sequence_t *sequence = NULL;
+    CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER;
+    int status;
+
+    chameleon = chameleon_context_self();
+    if (chameleon== NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "CHAMELON not initialized");
+        return CHAMELON_ERR_NOT_INITIALIZED;
+    }
+
+    int nb = 0;
+    CHAMELON_data_handle_t nb_pivot_handle;
+    RUNTIME_data_alloc(&nb_pivot_handle);
+    RUNTIME_vector_data_register(&nb_pivot_handle, 0, (uintptr_t)&nb, 1, sizeof(int));
+
+    chameleon_sequence_create(chameleon, &sequence);
+    CHAMELON_zgetrf_sp_Tile_Async(A, sequence, &request, criteria, &nb_pivot_handle);
+    CHAMELON_Desc_Flush( A, sequence );
+    chameleon_sequence_wait(chameleon, sequence);
+
+    status = sequence->status;
+    chameleon_sequence_destroy(chameleon, sequence);
+
+    RUNTIME_data_unregister(&nb_pivot_handle);
+    RUNTIME_data_free(&nb_pivot_handle);
+
+    return status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CHAMELON_Complex64_t_Tile_Async
+ *
+ *  CHAMELON_zgetrf_sp_Tile_Async - Computes the LU decomposition
+ *  Non-blocking equivalent of CHAMELON_zgetrf_sp_Tile().
+ *  May return before the computation is finished.
+ *  Allows for pipelining of operations ar runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELON_zgetrf_sp
+ * @sa CHAMELON_zgetrf_sp_Tile
+ * @sa CHAMELON_cgetrf_sp_Tile_Async
+ * @sa CHAMELON_dgetrf_sp_Tile_Async
+ * @sa CHAMELON_sgetrf_sp_Tile_Async
+ * @sa CHAMELON_zpotrs_Tile_Async
+ *
+ ******************************************************************************/
+int CHAMELON_zgetrf_sp_Tile_Async(CHAMELON_desc_t *A,
+			       CHAMELON_sequence_t *sequence, CHAMELON_request_t *request,
+			       double criteria, CHAMELON_data_handle_t *nb_pivot_handle)
+{
+    CHAMELON_context_t *chameleon;
+
+    chameleon = chameleon_context_self();
+    if (chameleon == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "CHAMELON not initialized");
+        return CHAMELON_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "NULL sequence");
+        return CHAMELON_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "NULL request");
+        return CHAMELON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if (sequence->status == CHAMELON_SUCCESS)
+        request->status = CHAMELON_SUCCESS;
+    else
+        return chameleon_request_fail(sequence, request, CHAMELON_ERR_SEQUENCE_FLUSHED);
+
+    /* Check descriptors for correctness */
+    if (chameleon_desc_check(A) != CHAMELON_SUCCESS) {
+        chameleon_error("CHAMELON_zgetrf_nopiv_Tile", "invalid first descriptor");
+        return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE);
+    }
+
+    if (A->nb != A->mb) {
+        chameleon_error("CHAMELON_zgetrf_sp_Tile", "only square tiles supported");
+        return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE);
+    }
+
+    chameleon_pzgetrf_sp(A, criteria, nb_pivot_handle, sequence, request);
+
+    return CHAMELON_SUCCESS;
+}
diff --git a/control/common.h b/control/common.h
index f31ec1af3bf5f0fd5af475e2630d1cd9c575bcd2..f5a34c8f3095b055137af132300e47f0d3a45722 100644
--- a/control/common.h
+++ b/control/common.h
@@ -84,6 +84,10 @@
 #define CHAMELEON_TRANSLATION chamctxt->translation
 #define CHAMELEON_PARALLEL    chamctxt->parallel_enabled
 #define CHAMELEON_STATISTICS  chamctxt->statistics_enabled
+#define CHAMELEON_DSDMATRIX     chamctxt->dsdmatrix_enabled
+#define CHAMELEON_SWAP_TYPE     chamctxt->swap_type
+#define CHAMELEON_PANEL_THREADS chamctxt->nb_panel_threads
+#define CHAMELEON_RECTIL_PANEL  chamctxt->rectil_panel_enabled
 
 /**
  *  IPT internal define
diff --git a/control/context.c b/control/context.c
index 66b844ccfacb5a86dda0ec95a0596f2a5a54b775..4f7804b80aeb7f7aff1ff95692c20ae43a709a4d 100644
--- a/control/context.c
+++ b/control/context.c
@@ -141,6 +141,11 @@ CHAM_context_t *chameleon_context_create()
     chamctxt->generic_enabled    = chameleon_env_on_off( "CHAMELEON_GENERIC",         CHAMELEON_FALSE );
     chamctxt->autominmax_enabled = chameleon_env_on_off( "CHAMELEON_AUTOMINMAX",      CHAMELEON_TRUE  );
 
+    chameleon->dsdmatrix_enabled    = CHAMELON_FALSE;
+    chameleon->swap_type            = CHAMELON_TILE_SWAPS;
+    chameleon->rectil_panel_enabled = CHAMELON_FALSE;
+    chameleon->nb_panel_threads     = -1;
+
     chamctxt->runtime_paused     = CHAMELEON_FALSE;
 
     chamctxt->householder = chameleon_getenv_householder( "CHAMELEON_HOUSEHOLDER_MODE", ChamFlatHouseholder );
@@ -224,6 +229,12 @@ int CHAMELEON_Enable(int option)
         case CHAMELEON_PROGRESS:
             chamctxt->progress_enabled = CHAMELEON_TRUE;
             break;
+        case CHAMELON_DSDMATRIX_MODE:
+            chameleon->dsdmatrix_enabled = CHAMELON_TRUE;
+            break;
+        case CHAMELON_RECTIL_PANEL_MODE:
+            chameleon->rectil_panel_enabled = CHAMELON_TRUE;
+            break;
         case CHAMELEON_GEMM3M:
 #if defined(CBLAS_HAS_ZGEMM3M) && !defined(CHAMELEON_SIMULATION)
             set_coreblas_gemm3m_enabled(1);
@@ -309,6 +320,12 @@ int CHAMELEON_Disable(int option)
         case CHAMELEON_GENERIC:
             chamctxt->generic_enabled = CHAMELEON_FALSE;
             break;
+        case CHAMELON_DSDMATRIX_MODE:
+            chameleon->dsdmatrix_enabled = CHAMELON_FALSE;
+            break;
+        case CHAMELON_RECTIL_PANEL_MODE:
+            chameleon->rectil_panel_enabled = CHAMELON_FALSE;
+            break;
         default:
             chameleon_error("CHAMELEON_Disable", "illegal parameter value");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
@@ -413,6 +430,23 @@ int CHAMELEON_Set( int param, int value )
             }
             chamctxt->lookahead = value;
             break;
+        case CHAMELON_SWAP_TYPE_SET:
+            if (value < CHAMELON_NO_SWAPS && value >= CHAMELON_INVALID_SWAPS)
+            {
+                chameleon_error("CHAMELON_Set", "illegal value of CHAMELON_NEW_SWAP_TYPE");
+                return CHAMELON_ERR_ILLEGAL_VALUE;
+            }
+            chameleon->swap_type = value;
+            break;
+        case CHAMELON_NBPANEL_THREADS_SET:
+            if (value < 1)
+            {
+                chameleon_error("CHAMELON_Set", "illegal value of CHAMELON_NBPANEL_THREADS_SET");
+                return CHAMELON_ERR_ILLEGAL_VALUE;
+            }
+            CHAMELON_Enable(CHAMELON_RECTIL_PANEL_MODE);
+            chameleon->nb_panel_threads = value;
+            break;
         default:
             chameleon_error("CHAMELEON_Set", "unknown parameter");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
@@ -479,6 +513,12 @@ int CHAMELEON_Get( int param, int *value )
         case CHAMELEON_RUNTIME:
             *value = chamctxt->scheduler;
             return CHAMELEON_SUCCESS;
+        case CHAMELON_SWAP_TYPE_SET:
+            *value = chameleon->swap_type;
+            return CHAMELON_SUCCESS;
+        case CHAMELON_NBPANEL_THREADS_SET:
+            *value = chameleon->nb_panel_threads;
+            return CHAMELON_SUCCESS;
         default:
             chameleon_error("CHAMELEON_Get", "unknown parameter");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt
index 40baddc598df9da3ce04b58a54b7fe645435b564..00bb961f90ba4577c5456be235c645ea25af6c4b 100644
--- a/coreblas/compute/CMakeLists.txt
+++ b/coreblas/compute/CMakeLists.txt
@@ -51,6 +51,9 @@ set(ZSRC
     core_zgessq.c
     core_zgetf2_nopiv.c
     core_zgetrf.c
+    core_zgetrf_extract_max.c
+    core_zgetrf_rectil.c
+    core_zgetrf_sp.c
     core_zgetrf_incpiv.c
     core_zgetrf_nopiv.c
     core_zgram.c
@@ -68,6 +71,7 @@ set(ZSRC
     core_zlantr.c
     core_zlaset2.c
     core_zlaset.c
+    core_zlaswp.c
     core_zlatro.c
     core_zlauum.c
     core_zpamm.c
@@ -79,6 +83,7 @@ set(ZSRC
     core_zplssq.c
     core_zpotrf.c
     core_zssssm.c
+    core_zswp.c
     core_zsymm.c
     core_zsyr2k.c
     core_zsyrk.c
diff --git a/coreblas/compute/core_zgetrf_extract_max.c b/coreblas/compute/core_zgetrf_extract_max.c
new file mode 100644
index 0000000000000000000000000000000000000000..2119b666bf1417b3ca1253d230456f2ae69860a1
--- /dev/null
+++ b/coreblas/compute/core_zgetrf_extract_max.c
@@ -0,0 +1,94 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_zgetrf_extract_max.c
+ *
+ *  CHAMELON core_blas kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @version 1.0.0
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @date 2011-11-11
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "coreblas/include/coreblas.h"
+
+#define get_pivot(_a_) (*((int *) (_a_)))
+#define get_diagonal(_a_) (_a_ + 1)
+#define get_maxrow(_a_) (_a_ + NB + 1)
+
+static CHAMELON_Complex64_t mzone = -1.;
+
+void CORE_zgetrf_extract_max(CHAMELON_Complex64_t *A, CHAMELON_Complex64_t *mw,
+                             int K, int M, int N, int H, int NB,
+                             int AM, int LDA)
+{
+    int p;
+    p = (AM == K)? H : 0;
+    M -= p;
+
+    if(H != 0) {
+        CHAMELON_Complex64_t alpha;
+        CHAMELON_Complex64_t *rmax = get_maxrow(mw);
+        int pivot = get_pivot(mw);
+        int owner_last_index = pivot / NB;
+        int last_index       = pivot % NB;
+
+        /* Applying the previous swap */
+        if( AM == K /* && (pivot != H-1) */ ) {
+            cblas_zcopy(N, get_maxrow(mw)       , 1,
+                        A + H-1 , LDA);
+        }
+
+        /* Copy the diagonal in place */
+        if( owner_last_index == AM /* && (pivot != H-1) */ ) {
+            cblas_zcopy(N, get_diagonal(mw)             ,  1,
+                        A + last_index, LDA);
+        }
+
+        /* Applying the update */
+        alpha = 1. / rmax[H-1];
+
+        /* printf("Le pivot a h=%d est localement a %d et globalement a %d et sa valeur est %e\n",H-1,last_index,get_pivot(mw), rmax[H-1]); */
+
+        cblas_zscal(M, CBLAS_SADDR( alpha ), A + (LDA*(H-1) + p), 1 );
+
+        cblas_zgeru(CblasColMajor, M, N-H,
+                    CBLAS_SADDR(mzone),
+                    A + (LDA*(H-1) + p) , 1   ,
+                    get_maxrow(mw) + H               , 1   ,
+                    A + (LDA*H + p)     , LDA);
+    }
+
+
+    /* if(H != (((M+p)<N)?M+p:N)) { */
+    int index_max = cblas_izamax( M, A + LDA*H + p, 1 ) + p;
+
+    get_pivot(mw) = index_max + AM * NB;
+
+    /* Copy the diagonal row */
+    if(K == AM)
+        cblas_zcopy(N, A                + H, LDA,
+                    get_diagonal(mw)    , 1);
+    /* Copy the maximum row */
+    cblas_zcopy(N, A              + index_max,  LDA,
+                get_maxrow(mw)            , 1);
+    /* CHAMELON_Complex64_t *rmax = get_maxrow(mw); */
+    /* if(H == 0) */
+    /* printf("m=%d, max in %d = %e, M=%d, p=%d\n",AM,get_pivot(mw),rmax[H],M,p); */
+    /* } */
+
+}
diff --git a/coreblas/compute/core_zgetrf_rectil.c b/coreblas/compute/core_zgetrf_rectil.c
new file mode 100644
index 0000000000000000000000000000000000000000..c58271e91f164ce6f3d3ffe101ee04b2d1437838
--- /dev/null
+++ b/coreblas/compute/core_zgetrf_rectil.c
@@ -0,0 +1,733 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_zgetrf_rectil.c
+ *
+ *  PLASMA core_blas kernel
+ *  PLASMA is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 2.6.0
+ * @author Hatem Ltaief
+ * @author Mathieu Faverge
+ * @author Piotr Luszczek
+ * @date 2009-11-15
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "coreblas/include/coreblas.h"
+#include "control/descriptor.h"
+
+#define A(m, n) ((CHAMELON_Complex64_t *)chameleon_getaddr_ccrb(A, m, n))
+
+void
+CORE_zgetrf_rectil_init(void);
+
+static inline void
+CORE_zgetrf_rectil_rec(const CHAMELON_desc_t *A, int *IPIV, int *info,
+                       CHAMELON_Complex64_t *pivot,
+                       int thidx,  int thcnt,
+                       int column, int width,
+                       int ft,     int lt);
+static inline void
+CORE_zgetrf_rectil_update(const CHAMELON_desc_t *A, int *IPIV,
+                          int column, int n1,     int n2,
+                          int thidx,  int thcnt,
+                          int ft,     int lt);
+
+/*****************************************************************************
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zgetrf_rectil computes a LU factorization of a general M-by-N
+ *  matrix A stored in CCRB layout using partial pivoting with row
+ *  interchanges.
+ *
+ *  The factorization has the form
+ *
+ *    A = P * L * U
+ *
+ *  where P is a permutation matrix, L is lower triangular with unit
+ *  diagonal elements (lower trapezoidal if m > n), and U is upper
+ *  triangular (upper trapezoidal if m < n).
+ *
+ *  This is the recursive version of the algorithm applied on tile layout.
+ *
+ *  WARNINGS:
+ *     - The function CORE_zgetrf_rectil_init has to be called prior
+ *     to any call to this function.
+ *     - You cannot call this kernel on different matrices at the same
+ *     time.
+ *     - The matrix A cannot be more than one tile wide.
+ *     - The number of threads calling this function has to be excatly
+ *     the number defined by info[2] with each one of them a different
+ *     index between 0 included and info[2] excluded.
+ *
+ *******************************************************************************
+ *
+ *  @param[in,out] A
+ *          PLASMA descriptor of the matrix A to be factorized.
+ *          On entry, the M-by-N matrix to be factorized.
+ *          On exit, the factors L and U from the factorization
+ *          A = P*L*U; the unit diagonal elements of L are not stored.
+ *
+ *  @param[out] IPIV
+ *          The pivot indices; for 0 <= i < min(M,N) stored in Fortran
+ *          mode (starting at 1), row i of the matrix was interchanged
+ *          with row IPIV(i).
+ *          On exit, each value IPIV[i] for 0 <= i < min(M,N) is
+ *          increased by A->i, which means A->i < IPIV[i] <= A->i+M.
+ *
+ *  @param[in,out] info
+ *          Array of 3 integers.
+ *          - info[0], see returned value
+ *          - info[1], is the thread index 0 <= info[0] < info[2]
+ *          - info[2], on entry is the number of threads trying to
+ *                     participate to the factorization,
+ *                     on exit is the real number of threads used to
+ *                     perform the factorization.
+ *    Info[2] threads, and exactly info[2], have to call this function
+ *    to avoid dead lock.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval CHAMELON_SUCCESS successful exit
+ *          \retval -k, the k-th argument had an illegal value
+ *          \retval k if U(k,k) is exactly zero. The factorization
+ *                  has been completed, but the factor U is exactly
+ *                  singular, and division by zero will occur if it is used
+ *                  to solve a system of equations.
+ *
+ */
+
+int CORE_zgetrf_rectil(const CHAMELON_desc_t *A, int *IPIV, int *info)
+{
+    int ft, lt;
+    int thidx = info[1];
+    int thcnt = chameleon_min( info[2], A->mt );
+    int minMN = chameleon_min( A->m, A->n );
+    CHAMELON_Complex64_t pivot;
+
+    info[0] = 0;
+    info[2] = thcnt;
+
+    assert( A->styp == ChamCCRB );
+
+    if ( A->nt > 1 ) {
+        coreblas_error(1, "Illegal value of A->nt");
+        info[0] = -1;
+        return -1;
+    }
+
+    if ( thidx >= thcnt )
+      return 0;
+
+    int q = A->mt / thcnt;
+    int r = A->mt % thcnt;
+
+    if (thidx < r) {
+        q++;
+        ft = thidx * q;
+        lt = ft + q;
+    } else {
+        ft = r * (q + 1) + (thidx - r) * q;
+        lt = ft + q;
+        lt = chameleon_min( lt, A->mt );
+    }
+
+    if (0)
+    {
+        int i,j,k;
+
+        for (k=ft ; k<lt ; k++) {
+            fprintf(stderr, "\n\t\tTile %d\n", k);
+            int ld = BLKLDD(A,0);
+            for (j=0 ; j < A->mb ; j++) {
+                for (i=0 ; i < A->nb ; i++)
+                    fprintf(stderr, "\t%.2f", *(A(k, 0) + j + i*ld));
+                fprintf(stderr, "\n");
+            }
+        }
+    }
+
+    CORE_zgetrf_rectil_rec( A, IPIV, info, &pivot,
+                            thidx, thcnt, 0, minMN, ft, lt);
+
+    if ( A->n > minMN ) {
+        CORE_zgetrf_rectil_update( A, IPIV,
+                                   0, minMN, A->n-minMN,
+                                   thidx, thcnt,
+                                   ft, lt);
+    }
+
+    return info[0];
+}
+
+/*******************************************************************
+ *   Additional routines
+ */
+#define AMAX1BUF_SIZE (48 << 1)
+
+/* 48 threads should be enough for everybody */
+static volatile CHAMELON_Complex64_t CORE_zamax1buf[AMAX1BUF_SIZE];
+static double sfmin;
+
+void
+CORE_zgetrf_rectil_init(void)
+{
+    int i;
+    for (i = 0; i < AMAX1BUF_SIZE; ++i) CORE_zamax1buf[i] = -1.0;
+    sfmin =  LAPACKE_dlamch_work('S');
+}
+
+static void
+CORE_zamax1_thread(CHAMELON_Complex64_t localamx,
+                   int thidx, int thcnt, int *thwinner,
+                   CHAMELON_Complex64_t *diagvalue,
+                   CHAMELON_Complex64_t *globalamx,
+                   int pividx, int *ipiv)
+{
+    if (thidx == 0) {
+        int i, j = 0;
+        CHAMELON_Complex64_t curval = localamx, tmp;
+        double curamx = cabs(localamx);
+
+        /* make sure everybody filled in their value */
+        for (i = 1; i < thcnt; ++i) {
+            while (CORE_zamax1buf[i << 1] == -1.0) { /* wait for thread i to store its value */
+            }
+        }
+
+        /* better not fuse the loop above and below to make sure data is sync'd */
+
+        for (i = 1; i < thcnt; ++i) {
+            tmp = CORE_zamax1buf[ (i << 1) + 1];
+            if (cabs(tmp) > curamx) {
+                curamx = cabs(tmp);
+                curval = tmp;
+                j = i;
+            }
+        }
+
+        if (0 == j)
+            ipiv[0] = pividx;
+
+        /* make sure everybody knows the amax value */
+        for (i = 1; i < thcnt; ++i)
+            CORE_zamax1buf[ (i << 1) + 1] = curval;
+
+        CORE_zamax1buf[0] = -j - 2.0; /* set the index of the winning thread */
+        CORE_zamax1buf[1] = *diagvalue; /* set the index of the winning thread */
+
+        *thwinner = j;
+        *globalamx = curval;
+
+        for (i = 1; i < thcnt; ++i)
+            CORE_zamax1buf[i << 1] = -3.0;
+
+        /* make sure everybody read the max value */
+        for (i = 1; i < thcnt; ++i) {
+            while (CORE_zamax1buf[i << 1] != -1.0) {
+            }
+        }
+
+        CORE_zamax1buf[0] = -1.0;
+    } else {
+        CORE_zamax1buf[(thidx << 1) + 1] = localamx;
+        CORE_zamax1buf[thidx << 1] = -2.0;  /* announce to thread 0 that local amax was stored */
+        while (CORE_zamax1buf[0] == -1.0) { /* wait for thread 0 to finish calculating the global amax */
+        }
+        while (CORE_zamax1buf[thidx << 1] != -3.0) { /* wait for thread 0 to store amax */
+        }
+        *thwinner  = -CORE_zamax1buf[0] - 2.0;
+        *diagvalue = CORE_zamax1buf[1];
+        *globalamx = CORE_zamax1buf[(thidx << 1) + 1]; /* read the amax from the location adjacent to the one in the above loop */
+        CORE_zamax1buf[thidx << 1] = -1.0;  /* signal thread 0 that this thread is done reading */
+
+        if (thidx == *thwinner)
+            ipiv[0] = pividx;
+
+        while (CORE_zamax1buf[0] != -1.0) { /* wait for thread 0 to finish */
+        }
+    }
+}
+
+static void
+CORE_zbarrier_thread(int thidx,
+                     int thcnt)
+{
+    int idum1, idum2;
+    CHAMELON_Complex64_t ddum1 = 0.;
+    CHAMELON_Complex64_t ddum2 = 0.;
+    /* it's probably faster to implement a dedicated barrier */
+    CORE_zamax1_thread( 1.0, thidx, thcnt, &idum1, &ddum1, &ddum2, 0, &idum2 );
+}
+
+static inline void
+CORE_zgetrf_rectil_update(const CHAMELON_desc_t *A, int *IPIV,
+                          int column, int n1,     int n2,
+                          int thidx,  int thcnt,
+                          int ft,     int lt)
+{
+    int ld, lm, tmpM;
+    int ip, j, it, i, ldft;
+    CHAMELON_Complex64_t zone  = 1.0;
+    CHAMELON_Complex64_t mzone = -1.0;
+    CHAMELON_Complex64_t *Atop, *Atop2, *U, *L;
+    int offset = A->i;
+
+    ldft = BLKLDD(A, 0);
+    Atop  = A(0, 0) + column * ldft;
+    Atop2 = Atop    + n1     * ldft;
+
+    if (thidx == 0)
+    {
+        /* Swap to the right */
+        int *lipiv = IPIV+column;
+        int idxMax = column+n1;
+        for (j = column; j < idxMax; ++j, ++lipiv) {
+            ip = (*lipiv) - offset - 1;
+            if ( ip != j )
+            {
+                it = ip / A->mb;
+                i  = ip % A->mb;
+                ld = BLKLDD(A, it);
+                cblas_zswap(n2, Atop2                     + j, ldft,
+                            A(it, 0) + (column+n1)*ld + i, ld   );
+            }
+        }
+
+        /* Trsm on the upper part */
+        U = Atop2 + column;
+        cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower,
+                     CblasNoTrans, CblasUnit,
+                     n1, n2, CBLAS_SADDR(zone),
+                     Atop  + column, ldft,
+                     U,              ldft );
+
+        /* Signal to other threads that they can start update */
+        CORE_zbarrier_thread( thidx, thcnt );
+
+        /* First tile */
+        L = Atop + column + n1;
+        tmpM = chameleon_min(ldft, A->m) - column - n1;
+
+        /* Apply the GEMM */
+        cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                     tmpM, n2, n1,
+                     CBLAS_SADDR(mzone), L,      ldft,
+                     U,      ldft,
+                     CBLAS_SADDR(zone),  U + n1, ldft );
+
+    }
+    else
+    {
+        ld = BLKLDD( A, ft );
+        L  = A( ft, 0 ) + column * ld;
+        lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb;
+
+        U = Atop2 + column;
+
+        /* Wait for pivoting and triangular solve to be finished
+         * before to really start the update */
+        CORE_zbarrier_thread( thidx, thcnt );
+
+        /* First tile */
+        /* Apply the GEMM */
+        cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                     lm, n2, n1,
+                     CBLAS_SADDR(mzone), L,         ld,
+                                         U,         ldft,
+                     CBLAS_SADDR(zone),  L + n1*ld, ld );
+    }
+
+    /* Update the other blocks */
+    for( it = ft+1; it < lt; it++)
+    {
+        ld = BLKLDD( A, it );
+        L  = A( it, 0 ) + column * ld;
+        lm = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+
+        /* Apply the GEMM */
+        cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                     lm, n2, n1,
+                     CBLAS_SADDR(mzone), L,          ld,
+                                         U,          ldft,
+                     CBLAS_SADDR(zone),  L + n1*ld,  ld );
+    }
+}
+
+static void
+CORE_zgetrf_rectil_rec(const CHAMELON_desc_t *A, int *IPIV, int *info,
+                       CHAMELON_Complex64_t *pivot,
+                       int thidx,  int thcnt,
+                       int column, int width,
+                       int ft,     int lt)
+{
+    int ld, jp, n1, n2, lm, tmpM, piv_sf;
+    int ip, j, it, i, ldft;
+    int max_i, max_it, thwin;
+    CHAMELON_Complex64_t zone  = 1.0;
+    CHAMELON_Complex64_t mzone = -1.0;
+    CHAMELON_Complex64_t tmp1;
+    CHAMELON_Complex64_t tmp2 = 0.;
+    CHAMELON_Complex64_t pivval;
+    CHAMELON_Complex64_t *Atop, *Atop2, *U, *L;
+    double             abstmp1;
+    int offset = A->i;
+
+    ldft = BLKLDD(A, 0);
+    Atop = A(0, 0) + column * ldft;
+
+    if ( width > 1 ) {
+        /* Assumption: N = min( M, N ); */
+        n1 = width / 2;
+        n2 = width - n1;
+
+        Atop2 = Atop + n1 * ldft;
+
+        CORE_zgetrf_rectil_rec( A, IPIV, info, pivot,
+                                thidx, thcnt, column, n1, ft, lt );
+
+        if ( *info != 0 )
+            return;
+
+        if (thidx == 0)
+        {
+            /* Swap to the right */
+            int *lipiv = IPIV+column;
+            int idxMax = column+n1;
+            for (j = column; j < idxMax; ++j, ++lipiv) {
+                ip = (*lipiv) - offset - 1;
+                if ( ip != j )
+                {
+                    it = ip / A->mb;
+                    i  = ip % A->mb;
+                    ld = BLKLDD(A, it);
+                    cblas_zswap(n2, Atop2                     + j, ldft,
+                                    A(it, 0) + (column+n1)*ld + i, ld   );
+                }
+            }
+            /* Trsm on the upper part */
+            U = Atop2 + column;
+            cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower,
+                         CblasNoTrans, CblasUnit,
+                         n1, n2, CBLAS_SADDR(zone),
+                         Atop  + column, ldft,
+                         U,              ldft );
+
+            /* SIgnal to other threads that they can start update */
+            CORE_zbarrier_thread( thidx, thcnt );
+            pivval = *pivot;
+            if ( pivval == 0.0 ) {
+                *info = column+n1;
+                return;
+            } else {
+                if ( cabs(pivval) >= sfmin ) {
+                    piv_sf = 1;
+                    pivval = 1.0 / pivval;
+                } else {
+                    piv_sf = 0;
+                }
+            }
+
+            /* First tile */
+            {
+                L = Atop + column + n1;
+                tmpM = chameleon_min(ldft, A->m) - column - n1;
+
+                /* Scale last column of L */
+                if ( piv_sf ) {
+                    cblas_zscal( tmpM, CBLAS_SADDR(pivval), L+(n1-1)*ldft, 1 );
+                } else {
+                    int i;
+                    Atop2 = L+(n1-1)*ldft;
+                    for( i=0; i < tmpM; i++, Atop2++)
+                        *Atop2 = *Atop2 / pivval;
+                }
+
+                /* Apply the GEMM */
+                cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                             tmpM, n2, n1,
+                             CBLAS_SADDR(mzone), L,      ldft,
+                                                 U,      ldft,
+                             CBLAS_SADDR(zone),  U + n1, ldft );
+
+                /* Search Max in first column of U+n1 */
+                tmp2    = U[n1];
+                max_it  = ft;
+                max_i   = cblas_izamax( tmpM, U+n1, 1 ) + n1;
+                tmp1    = U[max_i];
+                abstmp1 = cabs(tmp1);
+                max_i   += column;
+            }
+        }
+        else
+        {
+            pivval = *pivot;
+            if ( pivval == 0.0 ) {
+                CORE_zbarrier_thread( thidx, thcnt );
+                *info = column+n1;
+                return;
+            } else {
+                if ( cabs(pivval) >= sfmin ) {
+                    piv_sf = 1;
+                    pivval = 1.0 / pivval;
+                } else {
+                    piv_sf = 0;
+                }
+            }
+
+            ld = BLKLDD( A, ft );
+            L  = A( ft, 0 ) + column * ld;
+            lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb;
+
+            U = Atop2 + column;
+
+            /* First tile */
+            /* Scale last column of L */
+            if ( piv_sf ) {
+                cblas_zscal( lm, CBLAS_SADDR(pivval), L+(n1-1)*ld, 1 );
+            } else {
+                int i;
+                Atop2 = L+(n1-1)*ld;
+                for( i=0; i < lm; i++, Atop2++)
+                    *Atop2 = *Atop2 / pivval;
+            }
+
+            /* Wait for pivoting and triangular solve to be finished
+             * before to really start the update */
+            CORE_zbarrier_thread( thidx, thcnt );
+
+            /* Apply the GEMM */
+            cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                         lm, n2, n1,
+                         CBLAS_SADDR(mzone), L,         ld,
+                                             U,         ldft,
+                         CBLAS_SADDR(zone),  L + n1*ld, ld );
+
+            /* Search Max in first column of L+n1*ld */
+            max_it  = ft;
+            max_i   = cblas_izamax( lm, L+n1*ld, 1 );
+            tmp1    = L[n1*ld+max_i];
+            abstmp1 = cabs(tmp1);
+        }
+
+        /* Update the other blocks */
+        for( it = ft+1; it < lt; it++)
+        {
+            ld = BLKLDD( A, it );
+            L  = A( it, 0 ) + column * ld;
+            lm = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+
+            /* Scale last column of L */
+            if ( piv_sf ) {
+                cblas_zscal( lm, CBLAS_SADDR(pivval), L+(n1-1)*ld, 1 );
+            } else {
+                int i;
+                Atop2 = L+(n1-1)*ld;
+                for( i=0; i < lm; i++, Atop2++)
+                    *Atop2 = *Atop2 / pivval;
+            }
+
+            /* Apply the GEMM */
+            cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                         lm, n2, n1,
+                         CBLAS_SADDR(mzone), L,          ld,
+                                             U,          ldft,
+                         CBLAS_SADDR(zone),  L + n1*ld,  ld );
+
+            /* Search the max on the first column of L+n1*ld */
+            jp = cblas_izamax( lm, L+n1*ld, 1 );
+            if ( cabs( L[n1*ld+jp] ) > abstmp1 ) {
+                tmp1 = L[n1*ld+jp];
+                abstmp1 = cabs(tmp1);
+                max_i  = jp;
+                max_it = it;
+            }
+        }
+
+        jp = offset + max_it*A->mb + max_i;
+        CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin,
+                            &tmp2, pivot, jp + 1, IPIV + column + n1 );
+
+        if ( thidx == 0 ) {
+            U[n1] = *pivot; /* all threads have the pivot element: no need for synchronization */
+        }
+        if (thwin == thidx) { /* the thread that owns the best pivot */
+            if ( jp-offset != column+n1 ) /* if there is a need to exchange the pivot */
+            {
+                ld = BLKLDD(A, max_it);
+                Atop2 = A( max_it, 0 ) + (column + n1 )* ld + max_i;
+                *Atop2 = tmp2;
+            }
+        }
+
+        CORE_zgetrf_rectil_rec( A, IPIV, info, pivot,
+                                thidx, thcnt, column+n1, n2, ft, lt );
+        if ( *info != 0 )
+            return;
+
+        if ( thidx == 0 )
+        {
+            /* Swap to the left */
+            int *lipiv = IPIV+column+n1;
+            int idxMax = column+width;
+            for (j = column+n1; j < idxMax; ++j, ++lipiv) {
+                ip = (*lipiv) - offset - 1;
+                if ( ip != j )
+                {
+                    it = ip / A->mb;
+                    i  = ip % A->mb;
+                    ld = BLKLDD(A, it);
+                    cblas_zswap(n1, Atop + j,                 ldft,
+                                    A(it, 0) + column*ld + i, ld  );
+                }
+            }
+        }
+
+    } else if ( width == 1 ) {
+
+        /* Search maximum for column 0 */
+        if ( column == 0 )
+        {
+            if ( thidx == 0 )
+              tmp2 = Atop[column];
+
+            /* First tmp1 */
+            ld = BLKLDD(A, ft);
+            Atop2   = A( ft, 0 );
+            lm      = ft == A->mt-1 ? A->m - ft * A->mb : A->mb;
+            max_it  = ft;
+            max_i   = cblas_izamax( lm, Atop2, 1 );
+            tmp1    = Atop2[max_i];
+            abstmp1 = cabs(tmp1);
+
+            /* Update */
+            for( it = ft+1; it < lt; it++)
+            {
+                Atop2= A( it, 0 );
+                lm   = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+                jp   = cblas_izamax( lm, Atop2, 1 );
+                if (  cabs(Atop2[jp]) > abstmp1 ) {
+                    tmp1 = Atop2[jp];
+                    abstmp1 = cabs(tmp1);
+                    max_i  = jp;
+                    max_it = it;
+                }
+            }
+
+            jp = offset + max_it*A->mb + max_i;
+            //fprintf(stderr, " column:%d ; jp:%d ; offset:%d ; max_it:%d ; max_i:%d ; thid:%d && tiles [%d,%d]\n", column, jp, offset, max_it, max_i, thidx, ft, lt);
+            CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin,
+                                &tmp2, pivot, jp + 1, IPIV + column );
+
+            if ( thidx == 0 ) {
+                Atop[0] = *pivot; /* all threads have the pivot element: no need for synchronization */
+            }
+            if (thwin == thidx) { /* the thread that owns the best pivot */
+                if ( jp-offset != 0 ) /* if there is a need to exchange the pivot */
+                {
+                    Atop2 = A( max_it, 0 ) + max_i;
+                    *Atop2 = tmp2;
+                }
+            }
+        }
+
+        CORE_zbarrier_thread( thidx, thcnt );
+
+        /* If it is the last column, we just scale */
+        if ( column == (chameleon_min(A->m, A->n))-1 ) {
+
+            pivval = *pivot;
+            if ( pivval != 0.0 ) {
+                if ( thidx == 0 ) {
+                    if ( cabs(pivval) >= sfmin ) {
+                        pivval = 1.0 / pivval;
+
+                        /*
+                         * We guess than we never enter the function with m == A->mt-1
+                         * because it means that there is only one thread
+                         */
+                        lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb;
+                        cblas_zscal( lm - column - 1, CBLAS_SADDR(pivval), Atop+column+1, 1 );
+
+                        for( it = ft+1; it < lt; it++)
+                        {
+                            ld = BLKLDD(A, it);
+                            Atop2 = A( it, 0 ) + column * ld;
+                            lm = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+                            cblas_zscal( lm, CBLAS_SADDR(pivval), Atop2, 1 );
+                        }
+                    } else {
+                        /*
+                         * We guess than we never enter the function with m == A->mt-1
+                         * because it means that there is only one thread
+                         */
+                        int i;
+                        Atop2 = Atop + column + 1;
+                        lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb;
+
+                        for( i=0; i < lm-column-1; i++, Atop2++)
+                            *Atop2 = *Atop2 / pivval;
+
+                        for( it = ft+1; it < lt; it++)
+                            {
+                                ld = BLKLDD(A, it);
+                                Atop2 = A( it, 0 ) + column * ld;
+                                lm = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+
+                                for( i=0; i < lm; i++, Atop2++)
+                                    *Atop2 = *Atop2 / pivval;
+                            }
+                    }
+                }
+                else
+                {
+                    if ( cabs(pivval) >= sfmin ) {
+                        pivval = 1.0 / pivval;
+
+                        for( it = ft; it < lt; it++)
+                        {
+                            ld = BLKLDD(A, it);
+                            Atop2 = A( it, 0 ) + column * ld;
+                            lm = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+                            cblas_zscal( lm, CBLAS_SADDR(pivval), Atop2, 1 );
+                        }
+                    } else {
+                        /*
+                         * We guess than we never enter the function with m == A->mt-1
+                         * because it means that there is only one thread
+                         */
+                        int i;
+                        for( it = ft; it < lt; it++)
+                        {
+                            ld = BLKLDD(A, it);
+                            Atop2 = A( it, 0 ) + column * ld;
+                            lm = it == A->mt-1 ? A->m - it * A->mb : A->mb;
+
+                            for( i=0; i < lm; i++, Atop2++)
+                                *Atop2 = *Atop2 / pivval;
+                        }
+                    }
+                }
+            }
+            else {
+                *info = column + 1;
+                return;
+            }
+        }
+    }
+}
diff --git a/coreblas/compute/core_zgetrf_sp.c b/coreblas/compute/core_zgetrf_sp.c
new file mode 100644
index 0000000000000000000000000000000000000000..8b9d7e18bca20cee4d6da50574631f7d6225f5ff
--- /dev/null
+++ b/coreblas/compute/core_zgetrf_sp.c
@@ -0,0 +1,204 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_zgetrf_sp.c
+ *
+ *  PLASMA core_blas kernel
+ *  PLASMA is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @author Pierre Ramet
+ * @author Xavier Lacoste
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @date 2011-11-11
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "coreblas/include/coreblas.h"
+
+/*
+ Constant: MAXSIZEOFBLOCKS
+ Maximum size of blocks given to blas in factorisation
+ */
+
+#define MAXSIZEOFBLOCKS 32 /*64 in LAPACK*/
+
+static CHAMELON_Complex64_t zone  = 1.;
+static CHAMELON_Complex64_t mzone = -1.;
+
+/*
+ Function: FactorizationLU
+
+ LU Factorisation of one (diagonal) block
+ $A = LU$
+
+ For each column :
+ - Divide the column by the diagonal element.
+ - Substract the product of the subdiagonal part by
+ the line after the diagonal element from the
+ matrix under the diagonal element.
+
+ Parameters:
+ A       - Matrix to factorize
+ m       - number of rows of the Matrix A
+ n       - number of cols of the Matrix A
+ stride  - Stide between 2 columns of the matrix
+ nbpivot - IN/OUT pivot number.
+ critere - Pivoting threshold.
+ */
+static void CORE_zgetf2_sp(int  m,
+                           int  n,
+                           CHAMELON_Complex64_t * A,
+                           int  stride,
+                           double criteria,
+                           int *nbpivot )
+{
+    int k, minMN;
+    CHAMELON_Complex64_t *Akk, *Aik, alpha;
+
+    minMN = chameleon_min( m, n );
+
+    Akk = A;
+    for (k=0; k<minMN; k++) {
+        Aik = Akk + 1;
+
+        if ( cabs(*Akk) < criteria ) {
+            (*Akk) = (CHAMELON_Complex64_t)criteria;
+            (*nbpivot)++;
+        }
+
+        /* A_ik = A_ik / A_kk, i = k+1 .. n */
+        alpha = 1. / (*Akk);
+        cblas_zscal(m-k-1, CBLAS_SADDR(alpha), Aik, 1 );
+
+        if ( k+1 < minMN ) {
+            /* A_ij = A_ij - A_ik * A_kj, i,j = k+1..n */
+            cblas_zgeru(CblasColMajor, m-k-1, n-k-1,
+                        CBLAS_SADDR(mzone), Aik, 1,
+                        Akk+stride, stride,
+                        Aik+stride, stride);
+        }
+
+        Akk += stride+1;
+    }
+}
+
+/*
+ Function: FactorizationLU_block
+
+ Block LU Factorisation of one (diagonal) big block
+ > A = LU
+
+ Parameters:
+ A       - Matrix to factorize
+ n       - Size of A
+ stride  - Stride between 2 columns of the matrix
+ nbpivot - IN/OUT pivot number.
+ critere - Pivoting threshold.
+ */
+void CORE_zgetrf_sp(int m, int  n,
+                    CHAMELON_Complex64_t *A,
+                    int  stride,
+                    double  criteria,
+                    int *nbpivot)
+{
+    int k, blocknbr, blocksize, u_size, l_size, tempm, tempn;
+    CHAMELON_Complex64_t *Akk, *Lik, *Ukj, *Aij;
+
+    blocknbr = (int) ceil( (double)chameleon_min(m,n)/(double)MAXSIZEOFBLOCKS );
+
+    Akk = A; /* Lk,k     */
+
+    for (k=0; k<blocknbr; k++) {
+        tempm = m - k * MAXSIZEOFBLOCKS;
+        tempn = n - k * MAXSIZEOFBLOCKS;
+
+        blocksize = chameleon_min(tempm, tempn);
+        blocksize = chameleon_min(MAXSIZEOFBLOCKS, blocksize);
+
+        Lik = Akk + blocksize;
+        Ukj = Akk + blocksize*stride;
+        Aij = Ukj + blocksize;
+
+        /* Factorize the diagonal block Akk */
+        CORE_zgetf2_sp( blocksize, blocksize, Akk, stride, criteria, nbpivot );
+
+        u_size = tempn - blocksize;
+        l_size = tempm - blocksize;
+        if ( u_size > 0 && l_size > 0) {
+
+            /* Compute the column Ukk+1 */
+            CORE_ztrsm(ChamLeft, ChamLower,
+                        ChamNoTrans, ChamUnit,
+                        blocksize, u_size,
+                        zone, Akk, stride,
+                        Ukj, stride);
+
+            /* Compute the column Lk+1k */
+            CORE_ztrsm(ChamRight, ChamUpper,
+                       ChamNoTrans, ChamNonUnit,
+                       l_size, blocksize,
+                       zone, Akk, stride,
+                       Lik, stride);
+
+            /* Update Ak+1,k+1 = Ak+1,k+1 - Lk+1,k*Uk,k+1 */
+            CORE_zgemm(ChamNoTrans, ChamNoTrans,
+                       l_size, u_size, blocksize,
+                       mzone, Lik, stride,
+                       Ukj, stride,
+                       zone,  Aij, stride);
+
+        }
+
+        Akk += blocksize * (stride+1);
+    }
+}
+
+void CORE_zgetrf_sp_rec(int m, int n,
+                        CHAMELON_Complex64_t *A,
+                        int  stride,
+                        double criteria,
+                        int *nbpivot)
+{
+    if(m > 0) {
+        if(n == 1) {
+            if ( cabs(*A) < criteria ) {
+                (*A) = (CHAMELON_Complex64_t)criteria;
+                (*nbpivot)++;
+            }
+            CHAMELON_Complex64_t alpha = 1. / (*A);
+            cblas_zscal(m-1, CBLAS_SADDR(alpha), A+1, 1);
+        }
+        else {
+            CORE_zgetrf_sp_rec(m, n/2, A, stride, criteria, nbpivot);
+
+            CORE_ztrsm(ChamLeft, ChamLower,
+                       ChamNoTrans, ChamUnit,
+                       n/2, n/2+n%2,
+                       zone, A, stride,
+                       &A[(n/2)*stride], stride);
+
+            CORE_zgemm(ChamNoTrans, ChamNoTrans,
+                       m-n/2, n/2+n%2, n/2,
+                       mzone, &A[n/2],              stride,
+                       &A[(n/2)*stride],     stride,
+                       zone,  &A[(n/2)*stride+n/2], stride);
+
+            CORE_zgetrf_sp_rec(m-n/2, n/2+n%2,
+                               &A[((n/2)*stride)+(n/2)],
+                               stride, criteria, nbpivot);
+        }
+    }
+}
diff --git a/coreblas/compute/core_zlaswp.c b/coreblas/compute/core_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..5cc7feb811f31e8699c9b23ee7928e40d3183757
--- /dev/null
+++ b/coreblas/compute/core_zlaswp.c
@@ -0,0 +1,286 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_zlaswp.c
+ *
+ *  PLASMA core_blas kernel
+ *  PLASMA is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 2.6.0
+ * @comment This file has been automatically generated
+ *          from Plasma 2.6.0 for CHAMELON 1.0.0
+ * @author Mathieu Faverge
+ * @author Terry Cojean
+ * @date 2010-11-15
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "coreblas/include/coreblas/lapacke.h"
+#include "coreblas/include/coreblas.h"
+#include "control/descriptor.h"
+
+#define A(m, n) ((CHAMELON_Complex64_t *)chameleon_getaddr_ccrb(descA, m, n))
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zlaswp performs a series of row interchanges on the matrix A.
+ *  One row interchange is initiated for each of rows I1 through I2 of A.
+ *
+ *******************************************************************************
+ *
+ *  @param[in] N
+ *          The number of columns in the matrix A. N >= 0.
+ *
+ *  @param[in,out] A
+ *         On entry, the matrix of column dimension N to which the row
+ *         interchanges will be applied.
+ *         On exit, the permuted matrix.
+ *
+ *  @param[in] LDA
+ *         The leading dimension of the array A.  LDA >= max(1,max(IPIV[I1..I2])).
+ *
+ *  @param[in] I1
+ *          The first element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] I2
+ *          The last element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] IPIV
+ *          The pivot indices; Only the element in position i1 to i2
+ *          are accessed. The pivot are offset by A.i.
+ *
+ *  @param[in] INC
+ *          The increment between successive values of IPIV.  If IPIV
+ *          is negative, the pivots are applied in reverse order.
+ *
+ *******************************************************************************
+ */
+
+void CORE_zlaswp(int N, CHAMELON_Complex64_t *A, int LDA, int I1, int I2, const int *IPIV, int INC)
+{
+    LAPACKE_zlaswp_work( LAPACK_COL_MAJOR, N, A, LDA, I1, I2, IPIV, INC );
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zlaswp_ontile apply the zlaswp function on a matrix stored in
+ *  tile layout
+ *
+ *******************************************************************************
+ *
+ *  @param[in,out] descA
+ *          The descriptor of the matrix A to permute.
+ *
+ *  @param[in] i1
+ *          The first element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] i2
+ *          The last element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] ipiv
+ *          The pivot indices; Only the element in position i1 to i2
+ *          are accessed. The pivot are offset by A.i.
+ *
+ *  @param[in] inc
+ *          The increment between successive values of IPIV.  If IPIV
+ *          is negative, the pivots are applied in reverse order.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *         \retval CHAMELON_SUCCESS successful exit
+ *         \retval <0 if INFO = -k, the k-th argument had an illegal value
+ *
+ *******************************************************************************
+ */
+
+int CORE_zlaswp_ontile(CHAMELON_desc_t *descA, int i1, int i2, const int *ipiv, int inc)
+{
+    int i, j, ip, it;
+    CHAMELON_Complex64_t *A1;
+    int lda1, lda2;
+
+    /* Change i1 to C notation */
+    i1--;
+
+    /* Check parameters */
+    if ( descA->nt > 1 ) {
+        coreblas_error(1, "Illegal value of (A|B)->(nt|mt)");
+        return -1;
+    }
+    if ( i1 < 0 ) {
+        coreblas_error(2, "Illegal value of i1");
+        return -2;
+    }
+    if ( (i2 <= i1) || (i2 > descA->m) ) {
+        coreblas_error(3, "Illegal value of i2");
+        return -3;
+    }
+    if ( ! ( (i2 - i1 - i1%descA->mb -1) < descA->mb ) ) {
+        coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block.");
+        return -3;
+    }
+
+    if (inc > 0) {
+        it = i1 / descA->mb;
+        A1 = A(it, 0);
+        lda1 = BLKLDD(descA, 0);
+        /* printf("(i1;i2) :: (%d;%d) && descA->i:%d && it=%d\n",i1, i2, descA->i, it); */
+        for (j = i1; j < i2; ++j, ipiv+=inc) {
+            ip = (*ipiv) - descA->i - 1;
+            /* printf("(ipiv[%d],ip)=(%d,%d) ",j,*ipiv,ip); */
+            if ( ip != j )
+            {
+                it = ip / descA->mb;
+                i  = ip % descA->mb;
+                lda2 = BLKLDD(descA, it);
+                cblas_zswap(descA->n, A1       + j, lda1,
+                                     A(it, 0) + i, lda2 );
+            }
+        }
+        /* printf("\n"); */
+    }
+    else
+    {
+        it = (i2-1) / descA->mb;
+        A1 = A(it, 0);
+        lda1 = BLKLDD(descA, it);
+
+        i1--;
+        ipiv = &ipiv[(1-i2)*inc];
+        for (j = i2-1; j > i1; --j, ipiv+=inc) {
+            ip = (*ipiv) - descA->i - 1;
+            if ( ip != j )
+            {
+                it = ip / descA->mb;
+                i  = ip % descA->mb;
+                lda2 = BLKLDD(descA, it);
+                cblas_zswap(descA->n, A1       + j, lda1,
+                                     A(it, 0) + i, lda2 );
+            }
+        }
+    }
+
+    return CHAMELON_SUCCESS;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zlaswpc_ontile apply the zlaswp function on a matrix stored in
+ *  tile layout
+ *
+ *******************************************************************************
+ *
+ *  @param[in,out] descA
+ *          The descriptor of the matrix A to permute.
+ *
+ *  @param[in] i1
+ *          The first element of IPIV for which a column interchange will
+ *          be done.
+ *
+ *  @param[in] i2
+ *          The last element of IPIV for which a column interchange will
+ *          be done.
+ *
+ *  @param[in] ipiv
+ *          The pivot indices; Only the element in position i1 to i2
+ *          are accessed. The pivot are offset by A.i.
+ *
+ *  @param[in] inc
+ *          The increment between successive values of IPIV.  If IPIV
+ *          is negative, the pivots are applied in reverse order.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *         \retval CHAMELON_SUCCESS successful exit
+ *         \retval <0 if INFO = -k, the k-th argument had an illegal value
+ *
+ *******************************************************************************
+ */
+
+int CORE_zlaswpc_ontile(CHAMELON_desc_t *descA, int i1, int i2, const int *ipiv, int inc)
+{
+    int i, j, ip, it;
+    CHAMELON_Complex64_t *A1;
+    int lda;
+
+    /* Change i1 to C notation */
+    i1--;
+
+    /* Check parameters */
+    if ( descA->mt > 1 ) {
+        coreblas_error(1, "Illegal value of descA->mt");
+        return -1;
+    }
+    if ( i1 < 0 ) {
+        coreblas_error(2, "Illegal value of i1");
+        return -2;
+    }
+    if ( (i2 <= i1) || (i2 > descA->n) ) {
+        coreblas_error(3, "Illegal value of i2");
+        return -3;
+    }
+    if ( ! ( (i2 - i1 - i1%descA->nb -1) < descA->nb ) ) {
+        coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block.");
+        return -3;
+    }
+
+    lda = BLKLDD(descA, 0);
+
+    if (inc > 0) {
+        it = i1 / descA->nb;
+        A1 = A(0, it);
+
+        for (j = i1-1; j < i2; ++j, ipiv+=inc) {
+            ip = (*ipiv) - descA->j - 1;
+            if ( ip != j )
+            {
+                it = ip / descA->nb;
+                i  = ip % descA->nb;
+                cblas_zswap(descA->m, A1       + j*lda, 1,
+                                     A(0, it) + i*lda, 1 );
+            }
+        }
+    }
+    else
+    {
+        it = (i2-1) / descA->mb;
+        A1 = A(0, it);
+
+        i1--;
+        ipiv = &ipiv[(1-i2)*inc];
+        for (j = i2-1; j > i1; --j, ipiv+=inc) {
+            ip = (*ipiv) - descA->j - 1;
+            if ( ip != j )
+            {
+                it = ip / descA->nb;
+                i  = ip % descA->nb;
+                cblas_zswap(descA->m, A1       + j*lda, 1,
+                                     A(0, it) + i*lda, 1 );
+            }
+        }
+    }
+
+    return CHAMELON_SUCCESS;
+}
diff --git a/coreblas/compute/core_zplrnt.c b/coreblas/compute/core_zplrnt.c
index e5c7f2914cb6101c8f05e125bf40a5555adb23f1..6dca2ae257b0fab0ed5724766cc7309bf7cb322c 100644
--- a/coreblas/compute/core_zplrnt.c
+++ b/coreblas/compute/core_zplrnt.c
@@ -34,7 +34,7 @@
 #endif
 
 void CORE_zplrnt( int m, int n, CHAMELEON_Complex64_t *A, int lda,
-                  int bigM, int m0, int n0, unsigned long long int seed )
+                  int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix )
 {
     CHAMELEON_Complex64_t *tmp = A;
     int64_t i, j;
@@ -50,5 +50,8 @@ void CORE_zplrnt( int m, int n, CHAMELEON_Complex64_t *A, int lda,
         }
         tmp  += lda-i;
         jump += bigM;
+        if (dsdmatrix && m0 == n0) {
+            A[j+j*lda] += bigM;
+        }
     }
 }
diff --git a/coreblas/compute/core_zswp.c b/coreblas/compute/core_zswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..5cc50b878039815659d454f5d3ae81b28a9683ac
--- /dev/null
+++ b/coreblas/compute/core_zswp.c
@@ -0,0 +1,320 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_zlaswp.c
+ *
+ *  PLASMA core_blas kernel
+ *  PLASMA is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 2.6.0
+ * @comment This file has been automatically generated
+ *          from Plasma 2.6.0 for CHAMELON 1.0.0
+ * @author Mathieu Faverge
+ * @author Terry Cojean
+ * @date 2010-11-15
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "coreblas/include/coreblas.h"
+#include "control/descriptor.h"
+
+#define A(m, n) ((CHAMELON_Complex64_t *)chameleon_getaddr_ccrb(descA, m, n))
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zswpp apply the zlaswp function between a pair of tiles
+ *
+ *******************************************************************************
+ *
+ *  @param[in,out] descA
+ *          The descriptor of the matrix A to permute, containing both tiles.
+ *
+ *  @param[in] Bm
+ *          The indice of the destination tile
+ *
+ *  @param[in] i1
+ *          The first element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] i2
+ *          The last element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] ipiv
+ *          The pivot indices; Only the element in position i1 to i2
+ *          are accessed. The pivot are offset by A.i.
+ *
+ *  @param[in] inc
+ *          The increment between successive values of IPIV.  If IPIV
+ *          is negative, the pivots are applied in reverse order.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *         \retval CHAMELON_SUCCESS successful exit
+ *         \retval <0 if INFO = -k, the k-th argument had an illegal value
+ *
+ *******************************************************************************
+ */
+int CORE_zswpp(CHAMELON_desc_t* descA, int Bm, int i1, int i2, const int *ipiv, int inc)
+{
+    /* Change i1 to C notation */
+    i1--;
+
+    /* Check parameters */
+    if ( descA->nt > 1 ) {
+        coreblas_error(1, "Illegal value of descA->nt");
+        return -1;
+    }
+    if ( i1 < 0 ) {
+        coreblas_error(2, "Illegal value of i1");
+        return -2;
+    }
+    if ( (i2 <= i1) || (i2 > descA->m) ) {
+        coreblas_error(3, "Illegal value of i2");
+        return -3;
+    }
+    if ( ! ( (i2 - i1 - i1%descA->mb -1) < descA->mb ) ) {
+        coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block.");
+        return -3;
+    }
+
+    if (inc > 0) {
+        int it, j, lda1;
+        CHAMELON_Complex64_t *A1;
+        it = i1 / descA->mb;
+        A1 = A(it, 0);
+        lda1 = BLKLDD(descA, 0);
+
+        for (j = i1; j < i2; ++j, ipiv+=inc) {
+            int ip = (*ipiv) - descA->i - 1;
+            it = ip / descA->mb;
+
+            if ( it == Bm && ip != j)
+            {
+                int i  = ip % descA->mb;
+                int lda2 = BLKLDD(descA, it );
+                cblas_zswap(descA->n, A1 + j, lda1,
+                            A(it,0) + i, lda2 );
+            }
+        }
+    }
+
+    return CHAMELON_SUCCESS;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zswptr_ontile apply the zlaswp function on a matrix stored in
+ *  tile layout, followed by a ztrsm on the first tile of the panel.
+ *
+ *******************************************************************************
+ *
+ *  @param[in,out] descA
+ *          The descriptor of the matrix A to permute.
+ *
+ *  @param[in] i1
+ *          The first element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] i2
+ *          The last element of IPIV for which a row interchange will
+ *          be done.
+ *
+ *  @param[in] ipiv
+ *          The pivot indices; Only the element in position i1 to i2
+ *          are accessed. The pivot are offset by A.i.
+ *
+ *  @param[in] inc
+ *          The increment between successive values of IPIV.  If IPIV
+ *          is negative, the pivots are applied in reverse order.
+ *
+ *  @param[in] Akk
+ *          The triangular matrix Akk. The leading descA->nb-by-descA->nb
+ *          lower triangular part of the array Akk contains the lower triangular matrix, and the
+ *          strictly upper triangular part of A is not referenced. The
+ *          diagonal elements of A are also not referenced and are assumed to be 1.
+ *
+ * @param[in] ldak
+ *          The leading dimension of the array Akk. ldak >= max(1,descA->nb).
+ *
+ *******************************************************************************
+ *
+ * @return
+ *         \retval CHAMELON_SUCCESS successful exit
+ *         \retval <0 if INFO = -k, the k-th argument had an illegal value
+ *
+ *******************************************************************************
+ */
+
+int CORE_zswptr_ontile(CHAMELON_desc_t *descA, int i1, int i2, const int *ipiv, int inc,
+                       const CHAMELON_Complex64_t *Akk, int ldak)
+{
+    CHAMELON_Complex64_t zone  = 1.0;
+    int lda;
+    int m = descA->mt == 1 ? descA->m : descA->mb;
+
+    if ( descA->nt > 1 ) {
+        coreblas_error(1, "Illegal value of descA->nt");
+        return -1;
+    }
+    if ( i1 < 1 ) {
+        coreblas_error(2, "Illegal value of i1");
+        return -2;
+    }
+    if ( (i2 < i1) || (i2 > m) ) {
+        coreblas_error(3, "Illegal value of i2");
+        return -3;
+    }
+
+    CORE_zlaswp_ontile(descA, i1, i2, ipiv, inc);
+
+    lda = BLKLDD(descA, 0);
+    cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower,
+                 CblasNoTrans, CblasUnit,
+                 m, descA->n, CBLAS_SADDR(zone),
+                 Akk,     ldak,
+                 A(0, 0), lda );
+
+    return CHAMELON_SUCCESS;
+}
+
+void CORE_zswp_insert_rows(int nb_pivots, int global_MB,
+                           int K, int SRC_M, int DST_M,
+                           CHAMELON_Complex64_t *A,
+                           int *perm,
+                           CHAMELON_Complex64_t *rows,
+                           int NB, int MB, int LDA)
+{
+    int i, offset=0;
+
+    for(i = 0; i < MB && offset < nb_pivots * NB ; i++) {
+        int owner_tile = perm[i] / global_MB;
+        int row_index  = perm[i] % global_MB;
+
+        if (perm[i] != i + global_MB*K) {
+            if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) {
+                int insertion_index;
+                //redudant but both use case depend on this
+                if (DST_M == K || (DST_M == SRC_M && DST_M == K))
+                    insertion_index = i;
+                else
+                    insertion_index = row_index;
+                cblas_zcopy(NB, rows + offset, 1,
+                            A + insertion_index, LDA);
+                offset += NB;
+            }
+        }
+    }
+}
+
+void CORE_zswp_extract_rows(int nb_pivots, int global_MB,
+                            int K, int SRC_M, int DST_M,
+                            CHAMELON_Complex64_t *A,
+                            int *perm,
+                            CHAMELON_Complex64_t *rows,
+                            int NB, int MB, int LDA)
+{
+    int i, offset=0;
+
+    for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) {
+        int owner_tile = perm[i] / global_MB;
+        int row_index  = perm[i] % global_MB;
+
+        /* Do we need to extract something ? */
+        if (perm[i] != i + global_MB*K) {
+            /* Concerns us somehow */
+            if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) {
+                int extraction_index;
+                if (SRC_M == DST_M && SRC_M == K)
+                    extraction_index = row_index;
+                else if (SRC_M == K) //tile superior
+                    extraction_index = i;
+                else //tile inferior
+                    extraction_index = row_index;
+                cblas_zcopy(NB, A + extraction_index, LDA,
+                                rows + offset, 1);
+                offset += NB;
+            }
+        }
+    }
+}
+
+void CORE_zswp_insert_rows2(int nb_pivots, int global_MB,
+                           int K, int SRC_M, int DST_M,
+                           CHAMELON_Complex64_t *A,
+                           int *perm,
+                           CHAMELON_Complex64_t *rows,
+                           int *idxs, int NB, int MB, int LDA)
+{
+    int i, offset=0;
+
+    /* printf("%d, %d, %d -------------\n", K, SRC_M, nb_pivots); */
+    for(i = 0; i < nb_pivots; i++) {
+	    int index=idxs[i];
+	    /* printf("(k, index, perm) = (%d, %d, %d)\n", K, index, perm[index]); */
+        /* int owner_tile = perm[index] / global_MB; */
+        int row_index  = perm[index] % global_MB;
+        int insertion_index;
+
+        /* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */
+	        //redudant but both use case depend on this
+	        if (DST_M == K || (DST_M == SRC_M && DST_M == K))
+		        insertion_index = index;
+	        else
+		        insertion_index = row_index;
+	        cblas_zcopy(NB, rows + offset, 1,
+	                    A + insertion_index, LDA);
+	        offset += NB;
+        /* } */
+    }
+    /* printf("%d, %d, %d -------------\n", K, SRC_M, nb_pivots); */
+}
+
+void CORE_zswp_extract_rows2(int nb_pivots, int global_MB,
+                            int K, int SRC_M, int DST_M,
+                            CHAMELON_Complex64_t *A,
+                            int *perm,
+                            CHAMELON_Complex64_t *rows,
+                            int *idxs, int NB, int MB, int LDA)
+{
+	int i, offset=0, count=0, index;
+
+    for (i = 0 ; i < nb_pivots ; i++) {
+	    index = idxs[i];
+        int owner_tile = perm[index] / global_MB;
+        int row_index  = perm[index] % global_MB;
+
+        /* Do we need to extract something ? */
+        /* if (perm[i] != i + global_MB*K) { */
+            /* Concerns us somehow */
+            /* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */
+                int extraction_index;
+                if (SRC_M == DST_M && SRC_M == K)
+                    extraction_index = row_index;
+                else if (SRC_M == K) //tile superior
+                    extraction_index = index;
+                else //tile inferior
+                    extraction_index = row_index;
+                cblas_zcopy(NB, A + extraction_index, LDA,
+                                rows + offset, 1);
+                /* idxs[count++] = i; */
+                offset += NB;
+            /* } */
+        /* } */
+    }
+}
diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h
index 050a7968769a5c37a138faee18ef0a577ee6be19..a4e55a9b930d07be3668920ce9a7508ec93e8023 100644
--- a/coreblas/include/coreblas/coreblas_z.h
+++ b/coreblas/include/coreblas/coreblas_z.h
@@ -80,6 +80,9 @@ int CORE_zgetf2_nopiv(int M, int N,
 int  CORE_zgetrf(int M, int N,
                  CHAMELEON_Complex64_t *A, int LDA,
                  int *IPIV, int *INFO);
+void CORE_zgetrf_extract_max(CHAMELON_Complex64_t *A, CHAMELON_Complex64_t *mw,
+                             int K, int M, int N, int H, int NB,
+                             int AM, int LDA);
 int  CORE_zgetrf_incpiv(int M, int N, int IB,
                         CHAMELEON_Complex64_t *A, int LDA,
                         int *IPIV, int *INFO);
@@ -206,7 +209,7 @@ void CORE_zplghe(double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda,
 void CORE_zplgsy(CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t *A, int lda,
                  int bigM, int m0, int n0, unsigned long long int seed );
 void CORE_zplrnt(int m, int n, CHAMELEON_Complex64_t *A, int lda,
-                 int bigM, int m0, int n0, unsigned long long int seed );
+                 int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix  );
 int CORE_zplssq( cham_store_t storev, int M, int N,
                  double *sclssqin, double *sclssqout );
 int CORE_zplssq2( int N, double *sclssq );
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 5389d35fdcc459ebb8783635eed79dc840160e64..a051c3766f1838b73fdc7261441f5714f6c56e6b 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -116,6 +116,37 @@ install(TARGETS cudablas
         LIBRARY DESTINATION lib
         )
 
+
+# Actual CUDA source files
+# ------------
+set(CUDA_ZSRC
+  cuda_zswp.cu
+  )
+
+precisions_rules_py(CUDABLAS_CUSRCS_GENERATED "${CUDA_ZSRC}"
+  PRECISIONS "${CHAMELEON_PRECISION}")
+
+unset(_tmp)
+foreach(element ${CUDABLAS_CUSRCS_GENERATED})
+  set(_tmp
+    ${_tmp}
+    ${CMAKE_CURRENT_BINARY_DIR}/${element}
+    )
+endforeach(element)
+set(CUDABLAS_CUSRCS
+  ${_tmp}
+  )
+
+set(CUDA_SERPARABLE_COMPILATION ON)
+# CUDA_ADD_LIBRARY(cudablas_cuda ${CUDABLAS_CUSRCS} SHARED)
+CUDA_ADD_LIBRARY(cudablas_cuda ${CUDABLAS_CUSRCS})
+add_dependencies(cudablas_cuda cudablas_include)
+set_property(TARGET cudablas_cuda PROPERTY LINKER_LANGUAGE Fortran)
+
+target_link_libraries(cudablas_cuda coreblas ${CUDA_LIBRARIES})
+install(TARGETS cudablas_cuda
+  DESTINATION lib)
+
 ###
 ### END CMakeLists.txt
 ###
diff --git a/cudablas/compute/cuda_zgetrf_sp.c b/cudablas/compute/cuda_zgetrf_sp.c
new file mode 100644
index 0000000000000000000000000000000000000000..033cc42fea7ff9c71f595d243a7fb73d6441c14d
--- /dev/null
+++ b/cudablas/compute/cuda_zgetrf_sp.c
@@ -0,0 +1,42 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zgetrf_ps.c
+ *
+ *  CHAMELON cudablas kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Terry Cojean
+ * @date 2017-02-08
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+#include "cudablas/include/cudablas_z.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zgetrf_sp(magma_int_t m, magma_int_t n,
+                   magmaDoubleComplex *dA, magma_int_t ldda,
+                   magma_double_t criteria, magma_int_t *nb_pivot,
+                   magma_int_t *info)
+{
+    int ret;
+    ret = magma_zgetrf_sp_gpu(m, n, dA, ldda, criteria, nb_pivot, &info);
+    if (ret != MAGMA_SUCCESS) {
+        fprintf(stderr, "Error in MAGMA: %d\n", ret);
+        exit(-1);
+    }
+    return CHAMELON_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zpotrf.c b/cudablas/compute/cuda_zpotrf.c
index 7c6c86c1456b4fc804fb31732c3e57db454bcc2e..f7efae99ef0286758fa860b738b07e7a3d41030f 100644
--- a/cudablas/compute/cuda_zpotrf.c
+++ b/cudablas/compute/cuda_zpotrf.c
@@ -22,15 +22,15 @@
 
 #if defined(CHAMELEON_USE_MAGMA)
 int CUDA_zpotrf(
-        magma_uplo_t uplo,  magma_int_t n,
-        magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
+    magma_uplo_t uplo,  magma_int_t n,
+    magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
 {
     int ret;
     ret = magma_zpotrf_gpu(
         uplo,
         n, dA, ldda, info);
-/*  hA, stream );*/
-     if (ret != MAGMA_SUCCESS) {
+    /*  hA, stream );*/
+    if (ret != MAGMA_SUCCESS) {
         fprintf(stderr, "Error in MAGMA: %d\n", ret);
         exit(-1);
     }
diff --git a/cudablas/compute/cuda_zswp.cu b/cudablas/compute/cuda_zswp.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3020f5608333ec52c1e8c159c753564714b1c500
--- /dev/null
+++ b/cudablas/compute/cuda_zswp.cu
@@ -0,0 +1,305 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zswp.cu
+ *
+ *  CHAMELON cudablas kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Terry Cojean
+ * @date 2017-02-10
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+#include "cudablas/include/cudablas/cudablas_z.h"
+
+
+__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS)
+	void CUDA_zswp_insert_rows_kernel(int nb_pivots, int global_MB,
+	                                  const int *__restrict__ perm,
+	                                  cuDoubleComplex *__restrict__ A,
+	                                  const cuDoubleComplex *__restrict__ rows,
+	                                  int K, int SRC_M, int DST_M,
+	                                  int NB, int MB, int LDA)
+{
+	int n = blockIdx.x * blockDim.x + threadIdx.x;
+	int i, owner_tile, row_index, offset=0;
+
+	if ( n < NB ) {
+		A = A + n*LDA;
+		if (DST_M == K) {
+			for (i = 0 ; i < MB && offset < nb_pivots * NB; i++) {
+				owner_tile = perm[i] / global_MB;
+				row_index  = perm[i] % global_MB;
+
+				if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) {
+					A[i] = rows[offset+n];
+					offset += NB;
+				}
+			}
+		} else {
+			for (i = 0 ; i < MB && offset < nb_pivots * NB; i++) {
+				owner_tile = perm[i] / global_MB;
+				row_index  = perm[i] % global_MB;
+
+				if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) {
+					A[row_index] = rows[offset+n];
+					offset += NB;
+				}
+			}
+		}
+	}
+}
+
+void CUDA_zswp_insert_rows(int nb_pivots, int global_MB, const int *perm,
+                           cuDoubleComplex *A, const cuDoubleComplex *rows,
+                           int K, int SRC_M, int DST_M, int NB, int MB, int LDA,
+                           cudaStream_t stream)
+{
+#if !defined(CHAMELEON_USE_CUBLAS_V2)
+	cublasSetKernelStream( stream );
+#endif
+
+	dim3 grid( NB / CHAMELEON_CUDA_NTHREADS );
+	dim3 threads( CHAMELEON_CUDA_NTHREADS );
+	CUDA_zswp_insert_rows_kernel<<<grid, threads, 0, stream>>>
+		(nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA);
+
+	/* cudaError_t cures; */
+	/* cures = cudaStreamSynchronize(stream); */
+	/* if (cures) */
+  /*       { */
+  /*           const char *errormsg = cudaGetErrorString(cures); */
+  /*           printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */
+  /*       } */
+}
+
+__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS)
+	void CUDA_zswp_extract_rows_kernel(int nb_pivots, int global_MB,
+	                                   const int *__restrict__ perm,
+	                                   const cuDoubleComplex *__restrict__ A,
+	                                   cuDoubleComplex *__restrict__ rows,
+	                                   int K, int SRC_M, int DST_M,
+	                                   int NB, int MB, int LDA)
+{
+	int n = blockIdx.x * blockDim.x + threadIdx.x;
+    int i, owner_tile, row_index, offset=0;
+
+    if ( n < NB ) {
+        A = A + n * LDA;
+        if (SRC_M == DST_M && SRC_M == K)
+        {
+	        for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) {
+		        owner_tile = perm[i] / global_MB;
+		        row_index  = perm[i] % global_MB;
+
+		        if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M ==K ))) {
+				        rows[n + offset] = A[row_index];
+				        offset += NB;
+		        }
+	        }
+        } else if (SRC_M == K) {
+	        for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) {
+		        owner_tile = perm[i] / global_MB;
+		        row_index  = perm[i] % global_MB;
+
+		        if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) {
+				        rows[n + offset] = A[i];
+				        offset += NB;
+		        }
+	        }
+        } else {
+	        for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) {
+		        owner_tile = perm[i] / global_MB;
+		        row_index  = perm[i] % global_MB;
+
+		        if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) {
+				        rows[n + offset] = A[row_index];
+				        offset += NB;
+		        }
+	        }
+        }
+    }
+}
+
+void CUDA_zswp_extract_rows(int nb_pivots, int global_MB, const int *perm,
+                            const cuDoubleComplex *A, cuDoubleComplex *rows,
+                            int K, int SRC_M, int DST_M, int NB, int MB, int LDA,
+                            cudaStream_t stream)
+{
+#if !defined(CHAMELEON_USE_CUBLAS_V2)
+	cublasSetKernelStream( stream );
+#endif
+
+	dim3 grid( NB / CHAMELEON_CUDA_NTHREADS );
+	dim3 threads( CHAMELEON_CUDA_NTHREADS );
+    /* dim3 grid( NB ); */
+    /* dim3 threads( 1 ); */
+
+    CUDA_zswp_extract_rows_kernel<<<grid, threads, 0, stream>>>
+	    (nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA);
+
+    /* cudaError_t cures; */
+    /* cures = cudaStreamSynchronize(stream); */
+    /* if (cures) */
+    /* { */
+    /*     const char *errormsg = cudaGetErrorString(cures); */
+    /*     printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */
+    /* } */
+}
+
+__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS)
+	void CUDA_zswp_insert_rows2_kernel(int nb_pivots, int global_MB,
+	                                  const int *__restrict__ perm,
+	                                  cuDoubleComplex *__restrict__ A,
+	                                  const cuDoubleComplex *__restrict__ rows,
+	                                  const int *__restrict__ idxs,
+	                                  int K, int SRC_M, int DST_M,
+	                                  int NB, int MB, int LDA)
+{
+	int n = blockIdx.y * blockDim.y + threadIdx.y;
+	int i = blockIdx.x, row_index, index;
+
+	if ( n < NB ) {
+		A = A + n*LDA;
+		if (DST_M == K) {
+			/* for (i = 0 ; i < nb_pivots ; i++) { */
+				index = idxs[i];
+				/* owner_tile = perm[index] / global_MB; */
+				row_index  = perm[index] % global_MB;
+
+				/* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */
+					A[index] = rows[i*NB+n];
+					/* offset += NB; */
+				/* } */
+			/* } */
+		} else {
+			/* for (i = 0 ; i < nb_pivots ; i++) { */
+				index = idxs[i];
+				/* owner_tile = perm[index] / global_MB; */
+				row_index  = perm[index] % global_MB;
+
+				/* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */
+					A[row_index] = rows[i*NB+n];
+					/* offset += NB; */
+				/* } */
+			/* } */
+		}
+	}
+}
+
+void CUDA_zswp_insert_rows2(int nb_pivots, int global_MB, const int *perm,
+                            cuDoubleComplex *A, const cuDoubleComplex *rows,
+                            const int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA,
+                            cudaStream_t stream)
+{
+#if !defined(CHAMELEON_USE_CUBLAS_V2)
+	cublasSetKernelStream( stream );
+#endif
+
+	dim3 grid(  nb_pivots, NB / CHAMELEON_CUDA_NTHREADS);
+	dim3 threads( 1, CHAMELEON_CUDA_NTHREADS );
+	CUDA_zswp_insert_rows2_kernel<<<grid, threads, 0, stream>>>
+		(nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA);
+
+	/* cudaError_t cures; */
+	/* cures = cudaStreamSynchronize(stream); */
+	/* if (cures) */
+  /*       { */
+  /*           const char *errormsg = cudaGetErrorString(cures); */
+  /*           printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */
+  /*       } */
+}
+
+__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS)
+	void CUDA_zswp_extract_rows2_kernel(int nb_pivots, int global_MB,
+	                                   const int *__restrict__ perm,
+	                                   const cuDoubleComplex *__restrict__ A,
+	                                   cuDoubleComplex *__restrict__ rows,
+	                                   int *idxs,
+	                                   int K, int SRC_M, int DST_M,
+	                                   int NB, int MB, int LDA)
+{
+	int n = blockIdx.y * blockDim.y + threadIdx.y;
+	int i = blockIdx.x, owner_tile, row_index, offset=0, index;
+
+    if ( n < NB ) {
+        A = A + n * LDA;
+        if (SRC_M == DST_M && SRC_M == K)
+        {
+	        /* for (i = 0 ; i < nb_pivots ; i++) { */
+		        index = idxs[i];
+		        owner_tile = perm[index] / global_MB;
+		        row_index  = perm[index] % global_MB;
+
+		        /* if (perm[index] != index + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M ==K ))) { */
+				        rows[n + NB*i] = A[row_index];
+				        /* offset += NB; */
+				        /* if (n == 0)idxs[count++] = i; */
+		        /* } */
+	        /* } */
+        } else if (SRC_M == K) {
+	        /* for (i = 0 ; i < nb_pivots ; i++) { */
+		        index = idxs[i];
+		        owner_tile = perm[index] / global_MB;
+		        row_index  = perm[index] % global_MB;
+
+		        /* if (perm[index] != index + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { */
+				        rows[n + NB*i] = A[index];
+				        /* offset += NB; */
+				        /* if (n == 0)idxs[count++] = i; */
+		        /* } */
+	        /* } */
+        } else {
+	        /* for (i = 0 ; i < nb_pivots ; i++) { */
+		        index = idxs[i];
+		        owner_tile = perm[index] / global_MB;
+		        row_index  = perm[index] % global_MB;
+
+		        /* if (perm[index] != index + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { */
+				        rows[n + NB*i] = A[row_index];
+				        /* offset += NB; */
+				        /* if (n == 0)idxs[count++] = i; */
+		        /* } */
+	        /* } */
+        }
+    }
+}
+
+void CUDA_zswp_extract_rows2(int nb_pivots, int global_MB, const int *perm,
+                            const cuDoubleComplex *A, cuDoubleComplex *rows,
+                            int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA,
+                            cudaStream_t stream)
+{
+#if !defined(CHAMELEON_USE_CUBLAS_V2)
+	cublasSetKernelStream( stream );
+#endif
+
+	dim3 grid( nb_pivots, NB / CHAMELEON_CUDA_NTHREADS );
+	dim3 threads( 1, CHAMELEON_CUDA_NTHREADS );
+    /* dim3 grid( NB ); */
+    /* dim3 threads( 1 ); */
+
+    CUDA_zswp_extract_rows2_kernel<<<grid, threads, 0, stream>>>
+	    (nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA);
+
+    /* cudaError_t cures; */
+    /* cures = cudaStreamSynchronize(stream); */
+    /* if (cures) */
+    /* { */
+    /*     const char *errormsg = cudaGetErrorString(cures); */
+    /*     printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */
+    /* } */
+}
diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h
index fc6fe7a713e22de8a968678fb2311d69909f23fb..38b1f0d9a0df0f44bd1d016eccba1190466245d6 100644
--- a/cudablas/include/cudablas.h
+++ b/cudablas/include/cudablas.h
@@ -36,6 +36,8 @@
 #include <cuda.h>
 #include <cuComplex.h>
 
+#define CHAMELEON_CUDA_NTHREADS 64
+
 #include <cublas.h>
 #include <cublas_v2.h>
 
diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h
index d0fa9be789de00afe44ae5c53172ace18ccac4ce..777a62aa1d839130a4c5ac45aee883f60717f325 100644
--- a/cudablas/include/cudablas/cudablas_z.h
+++ b/cudablas/include/cudablas/cudablas_z.h
@@ -24,6 +24,11 @@
 /**
  *  Declarations of cuda kernels - alphabetical order
  */
+void CUDA_zswp_extract_rows(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+void CUDA_zswp_insert_rows(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+void CUDA_zswp_extract_rows2(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+void CUDA_zswp_insert_rows2(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, const int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+
 int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle );
 int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle );
 int CUDA_zgemm(  cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle );
diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa946ea37978ca64fbcbd82c3a8154bef65f23a6
--- /dev/null
+++ b/cudablas/include/cudablas_z.h
@@ -0,0 +1,81 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cudablas_z.h
+ *
+ *  CHAMELON cudablas headers
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#ifndef _CHAMELON_CUDA_ZBLAS_H_
+#define _CHAMELON_CUDA_ZBLAS_H_
+
+#include "include/chameleon_config.h"
+#include "chameleon_types.h"
+#include "cudablas/include/cudablas.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** ****************************************************************************
+ *  Declarations of cuda kernels - alphabetical order
+ **/
+int CUDA_zgemerge( CHAMELON_enum side, CHAMELON_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM);
+int CUDA_zgemm(  CHAMELON_enum transa, CHAMELON_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zhemm(  CHAMELON_enum side, CHAMELON_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zher2k( CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zherfb( CHAMELON_enum uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM );
+int CUDA_zherk(  CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
+int CUDA_zlarfb(CHAMELON_enum side, CHAMELON_enum trans, CHAMELON_enum direct, CHAMELON_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
+int CUDA_zparfb(CHAMELON_enum side, CHAMELON_enum trans, CHAMELON_enum direct, CHAMELON_enum storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_zsymm(  CHAMELON_enum side, CHAMELON_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zsyr2k( CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+int CUDA_zsyrk(  CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM);
+void CUDA_zswp_extract_rows(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+void CUDA_zswp_insert_rows(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+void CUDA_zswp_extract_rows2(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+void CUDA_zswp_insert_rows2(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, const int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream);
+int CUDA_ztpmqrt( CHAMELON_enum side, CHAMELON_enum trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM );
+int CUDA_ztrmm(  CHAMELON_enum side, CHAMELON_enum uplo, CHAMELON_enum transa, CHAMELON_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
+int CUDA_ztrsm(  CHAMELON_enum side, CHAMELON_enum uplo, CHAMELON_enum transa, CHAMELON_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
+int CUDA_ztsmlq( CHAMELON_enum side, CHAMELON_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
+int CUDA_ztsmqr( CHAMELON_enum side, CHAMELON_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
+int CUDA_zunmlqt(CHAMELON_enum side, CHAMELON_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
+int CUDA_zunmqrt(CHAMELON_enum side, CHAMELON_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
+int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
+int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info);
+int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info);
+int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
+int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM);
+int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/example/lapack_to_chameleon/step0.c b/example/lapack_to_chameleon/step0.c
index ad6c57a627072b00a74e94aa5ff92f96e9195de0..53b436563f89c06424a08b7a5751d09f17b77e0c 100644
--- a/example/lapack_to_chameleon/step0.c
+++ b/example/lapack_to_chameleon/step0.c
@@ -79,7 +79,7 @@ int main(int argc, char *argv[]) {
     CORE_dplgsy( (double)N, N, N, A, N, N, N, N, 51 );
 
     /* generate RHS */
-    CORE_dplrnt( N, NRHS, B, N, N, N, NRHS, 5673 );
+    CORE_dplrnt( N, NRHS, B, N, N, N, NRHS, 5673, 0 );
 
     /* copy A before facto. in order to check the result */
     memcpy(Acpy, A, N * N * sizeof(double));
diff --git a/example/lapack_to_chameleon/step7.h b/example/lapack_to_chameleon/step7.h
index 752e7fd957f0816f3d09d2291162e9a82393d288..f87adc2bd37349d5f912d7e28a55e499f3bfee01 100644
--- a/example/lapack_to_chameleon/step7.h
+++ b/example/lapack_to_chameleon/step7.h
@@ -86,7 +86,7 @@ static void Cham_build_callback_plgsy(int row_min, int row_max, int col_min, int
 
 static void Cham_build_callback_plrnt(int row_min, int row_max, int col_min, int col_max, void *buffer, int ld, void *user_data) {
   struct data_pl *data=(struct data_pl *)user_data;
-  CORE_dplrnt(row_max-row_min+1, col_max-col_min+1, buffer, ld, data->bigM, row_min, col_min, data->seed);
+  CORE_dplrnt(row_max-row_min+1, col_max-col_min+1, buffer, ld, data->bigM, row_min, col_min, data->seed, 0);
 }
 
 /**
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index ba252005b72f862af1896ca1cac14ab0742f62a3..6c457effb681c36c4d8b7d0399b5fad3cc8dab93 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -225,6 +225,8 @@ typedef enum chameleon_gemm_e {
 #define CHAMELEON_PROGRESS            9
 #define CHAMELEON_GEMM3M             10
 #define CHAMELEON_GENERIC            11
+#define CHAMELEON_DSDMATRIX_MODE     12
+#define CHAMELEON_RECTIL_PANEL_MODE  13
 
 /**
  *  CHAMELEON constants - configuration parameters
@@ -259,6 +261,17 @@ typedef enum chameleon_translation_e {
     ChamOutOfPlace = 2,
 } cham_translation_t;
 
+/**
+ * @brief Swap types
+ **/
+typedef enum chameleon_swap_e {
+    ChamSwapNo,
+    ChamSwapLine,
+    ChamSwapTile,
+    ChamSwapColumn,
+    ChamSwapInvalid
+} cham_swap_t;
+
 /**
  * @brief Constant to describe how to initialize the mat pointer in descriptors
  */
diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h
index 8d4570f3c24d3146cef2a206448be07b80ba504f..3ba9fe41e09a05c59c959cd9b19c91ce100799d9 100644
--- a/include/chameleon/runtime.h
+++ b/include/chameleon/runtime.h
@@ -522,6 +522,18 @@ void
 RUNTIME_data_flush( const RUNTIME_sequence_t *sequence,
                     const CHAM_desc_t *A, int Am, int An );
 
+/**
+ * @brief Flushes a CHAMELON handle. TODO: Make it work with descs and "tiles".
+ *
+ * @param[in] options
+ *            CHAMELON options used for the call
+ *
+ * @param[in] handle
+ *            The handle to flush
+ */
+void RUNTIME_data_flush_handle(const CHAMELON_option_t *options,
+                               const CHAMELON_data_handle_t *handle);
+
 /**
  * @brief Migrate a single piece of data.
  *
@@ -666,6 +678,39 @@ RUNTIME_options_ws_free( RUNTIME_option_t *options );
 void
 RUNTIME_set_minmax_submitted_tasks( int min, int max );
 
+/**
+ * @}
+ *
+ * @name RUNTIME Panel thread allocation management
+ * @{
+ */
+void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores);
+void RUNTIME_unalloc_threads(CHAMELON_option_t *options);
+
+
+/**
+ * @}
+ *
+ * @name RUNTIME data_handle utilites.
+ * TODO: Remove it and make all calls to these use descs and "tiles"
+ * @{
+ */
+void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle);
+void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle);
+void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t nx, size_t elemsize);
+void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t ld, uint32_t nx,
+                                  uint32_t ny, size_t elemsize);
+void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle);
+void RUNTIME_data_acquire_write(CHAMELON_data_handle_t *data_handle);
+void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle);
+void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle);
+void RUNTIME_data_unregister_async(CHAMELON_data_handle_t* data_handle);
+void RUNTIME_data_unregister_free_async(CHAMELON_data_handle_t* data_handle);
+void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode);
+
+
 /**
  * @}
  *
diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h
index e5df8fa6d030c831598cce505de55981bea35cca..d0f0d39563bcfad687fde4b18ccd16c3eb4aba0d 100644
--- a/include/chameleon/struct.h
+++ b/include/chameleon/struct.h
@@ -143,6 +143,7 @@ typedef struct chameleon_context_s {
     cham_bool_t        generic_enabled;
     cham_bool_t        autominmax_enabled;
     cham_bool_t        runtime_paused;
+    cham_bool_t        dsdmatrix_enabled;
 
     cham_householder_t householder;        // "domino" (flat) or tree-based (reduction) Householder
     cham_translation_t translation;        // In place or Out of place layout conversion
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 125e16fd9fd65ceed38ad1e7271a64b43c01dfd1..058683d9ce5695206d8ca51e09b373c84a37208c 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -33,6 +33,16 @@ set(CODELETS_ZSRC
     codelets/codelet_zlag2c.c
     codelets/codelet_dlag2z.c
     codelets/codelet_dzasum.c
+    codelets/codelet_zgetrf_compare_workspace.c
+    codelets/codelet_zgetrf_copy_workspace.c
+    codelets/codelet_zgetrf_extract_max.c
+    codelets/codelet_zgetrf_rectil.c
+    codelets/codelet_zgetrf_save_pivots.c
+    codelets/codelet_zgetrf_sp.c
+    codelets/codelet_zlaswp.c
+    codelets/codelet_zswp.c
+    codelets/codelet_zswp_extract_rows.c
+    codelets/codelet_zswp_insert_rows.c
     ##################
     # BLAS 1
     ##################
diff --git a/runtime/parsec/CMakeLists.txt b/runtime/parsec/CMakeLists.txt
index 2355aa2069ee1ebbe6ce5ad33276660cc260a382..d0b150910015fd21c0029353c04582337af41f5c 100644
--- a/runtime/parsec/CMakeLists.txt
+++ b/runtime/parsec/CMakeLists.txt
@@ -93,6 +93,12 @@ set(RUNTIME_COMMON
   ${RUNTIME_COMMON_GENERATED}
   )
 
+ if (CHAMELEON_USE_CUDA)
+   set(RUNTIME_COMMON
+     control/runtime_cuda.c
+     ${RUNTIME_COMMON})
+ endif()
+
 # Generate the Chameleon sources for all possible precisions
 # ----------------------------------------------------------
 set(RUNTIME_SRCS_GENERATED "")
diff --git a/runtime/parsec/codelets/codelet_zplrnt.c b/runtime/parsec/codelets/codelet_zplrnt.c
index f1fec1593123d7eabaf6343f435110eabdd4dfdf..ff24b73802052d5b80083df8a35f0c6d5a60556f 100644
--- a/runtime/parsec/codelets/codelet_zplrnt.c
+++ b/runtime/parsec/codelets/codelet_zplrnt.c
@@ -34,11 +34,12 @@ CORE_zplrnt_parsec( parsec_execution_stream_t *context,
     int m0;
     int n0;
     unsigned long long int seed;
+    unsigned dsdmatrix;
 
     parsec_dtd_unpack_args(
-        this_task, &m, &n, &A, &lda, &bigM, &m0, &n0, &seed );
+        this_task, &m, &n, &A, &lda, &bigM, &m0, &n0, &seed, &dsdmatrix );
 
-    CORE_zplrnt( m, n, A, lda, bigM, m0, n0, seed );
+    CORE_zplrnt( m, n, A, lda, bigM, m0, n0, seed, dsdmatrix );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
@@ -46,7 +47,7 @@ CORE_zplrnt_parsec( parsec_execution_stream_t *context,
 
 void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                         int m, int n, const CHAM_desc_t *A, int Am, int An,
-                        int bigM, int m0, int n0, unsigned long long int seed )
+                        int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
@@ -61,5 +62,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
         sizeof(int),       &m0,                         VALUE,
         sizeof(int),       &n0,                         VALUE,
         sizeof(unsigned long long int),       &seed,    VALUE,
-        PARSEC_DTD_ARG_END );
+        sizeof(int),       &dsdmatrix,                  VALUE,
+        PARSEC_DTD_ARG_END);
 }
diff --git a/runtime/parsec/control/runtime_cuda.c b/runtime/parsec/control/runtime_cuda.c
new file mode 100644
index 0000000000000000000000000000000000000000..609454a50eadfaeea1d2160e927b2cad8917b2b3
--- /dev/null
+++ b/runtime/parsec/control/runtime_cuda.c
@@ -0,0 +1,31 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_cuda.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Terry Cojean
+ * @date 2017-02-10
+ *
+ **/
+#include <stdio.h>
+#include <stdlib.h>
+#include "runtime/parsec/include/chameleon_parsec.h"
+
+cudaStream_t RUNTIME_cuda_get_local_stream()
+{
+    return NULL;
+}
diff --git a/runtime/parsec/control/runtime_data_handle.c b/runtime/parsec/control/runtime_data_handle.c
new file mode 100644
index 0000000000000000000000000000000000000000..382f28f7a2f8bd2e5b0d89560ceb5a51b31af8ff
--- /dev/null
+++ b/runtime/parsec/control/runtime_data_handle.c
@@ -0,0 +1,58 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Inria. All rights reserved.
+ * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_data_handle.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @author Terry Cojean
+ * @date 2017-02-11
+ *
+ **/
+#include "runtime/parsec/include/chameleon_parsec.h"
+
+void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle)
+{
+}
+
+void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle)
+{
+}
+
+void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t nx, size_t elemsize)
+{
+}
+
+void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t ld, uint32_t nx,
+                                  uint32_t ny, size_t elemsize)
+{
+}
+
+void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle)
+{
+}
+
+void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle)
+{
+}
+
+void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle)
+{
+}
+
+void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode)
+{
+}
diff --git a/runtime/parsec/control/runtime_multithreading.c b/runtime/parsec/control/runtime_multithreading.c
new file mode 100644
index 0000000000000000000000000000000000000000..505c798cadaa95f6e3ede50f842ae6f2894ab906
--- /dev/null
+++ b/runtime/parsec/control/runtime_multithreading.c
@@ -0,0 +1,35 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_options.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @version 0.9.0
+ * @author Terry Cojean
+ * @date 2017-02-13
+ *
+ **/
+#include "runtime/parsec/include/chameleon_parsec.h"
+
+void RUNTIME_unalloc_threads(CHAMELON_option_t *options)
+{
+    return;
+}
+
+void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores)
+{
+    return;
+}
diff --git a/runtime/parsec/control/runtime_options.c b/runtime/parsec/control/runtime_options.c
index 34f18da839e8b58011fe3138b012444710c0121b..575dca6d5b9926cb87f7c14f28a02a0ebd255ab1 100644
--- a/runtime/parsec/control/runtime_options.c
+++ b/runtime/parsec/control/runtime_options.c
@@ -33,6 +33,9 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->ws_hsize   = 0;
     options->ws_worker  = NULL;
     options->ws_host    = NULL;
+    options->swap_grain = CHAMELON_SWAP_TYPE;
+    options->dsd_matrix = CHAMELON_DSDMATRIX == CHAMELON_TRUE;
+    options->panel_threads = -1;
     return;
 }
 
diff --git a/runtime/quark/codelets/codelet_zplrnt.c b/runtime/quark/codelets/codelet_zplrnt.c
index 0e597b6a20fc93ba3194973771f6d225ed8c3929..26fc05d8626a6c38b2ee784bd9f827681c554114 100644
--- a/runtime/quark/codelets/codelet_zplrnt.c
+++ b/runtime/quark/codelets/codelet_zplrnt.c
@@ -35,14 +35,15 @@ void CORE_zplrnt_quark(Quark *quark)
     int m0;
     int n0;
     unsigned long long int seed;
+    int dsdmatrix;
 
-    quark_unpack_args_7( quark, m, n, tileA, bigM, m0, n0, seed );
-    TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed );
+    quark_unpack_args_8( quark, m, n, tileA, bigM, m0, n0, seed );
+    TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed, dsdmatrix );
 }
 
 void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                         int m, int n, const CHAM_desc_t *A, int Am, int An,
-                        int bigM, int m0, int n0, unsigned long long int seed )
+                        int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     DAG_CORE_PLRNT;
@@ -54,5 +55,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
         sizeof(int),                      &m0,   VALUE,
         sizeof(int),                      &n0,   VALUE,
         sizeof(unsigned long long int),   &seed, VALUE,
+        sizeof(int),                      &dsdmatrix, VALUE,
         0);
 }
diff --git a/runtime/quark/control/runtime_cuda.c b/runtime/quark/control/runtime_cuda.c
new file mode 100644
index 0000000000000000000000000000000000000000..666f298bbfe133fb1e992722a25d8d64442b564c
--- /dev/null
+++ b/runtime/quark/control/runtime_cuda.c
@@ -0,0 +1,32 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_cuda.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Terry Cojean
+ * @date 2017-02-10
+ *
+ **/
+#include <stdio.h>
+#include <stdlib.h>
+#include "runtime/quark/include/chameleon_quark.h"
+
+cudaStream_t RUNTIME_cuda_get_local_stream()
+{
+    #warning I do not think quark supports cuda
+    return 0xdeadbeef;
+}
diff --git a/runtime/quark/control/runtime_data_handle.c b/runtime/quark/control/runtime_data_handle.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea23951adb81aa61c6600de738ed7f43b3c078dc
--- /dev/null
+++ b/runtime/quark/control/runtime_data_handle.c
@@ -0,0 +1,58 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Inria. All rights reserved.
+ * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_data_handle.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @author Terry Cojean
+ * @date 2017-02-11
+ *
+ **/
+#include "runtime/quark/include/chameleon_quark.h"
+
+void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle)
+{
+}
+
+void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle)
+{
+}
+
+void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t nx, size_t elemsize)
+{
+}
+
+void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t ld, uint32_t nx,
+                                  uint32_t ny, size_t elemsize)
+{
+}
+
+void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle)
+{
+}
+
+void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle)
+{
+}
+
+void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle)
+{
+}
+
+void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode)
+{
+}
diff --git a/runtime/quark/control/runtime_multithreading.c b/runtime/quark/control/runtime_multithreading.c
new file mode 100644
index 0000000000000000000000000000000000000000..d6be38f0063c0afda797aa39433718cbace99554
--- /dev/null
+++ b/runtime/quark/control/runtime_multithreading.c
@@ -0,0 +1,35 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_options.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @version 0.9.0
+ * @author Terry Cojean
+ * @date 2017-02-13
+ *
+ **/
+#include "runtime/quark/include/chameleon_quark.h"
+
+void RUNTIME_unalloc_threads(CHAMELON_option_t *options)
+{
+    return;
+}
+
+void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores)
+{
+    return;
+}
diff --git a/runtime/quark/control/runtime_options.c b/runtime/quark/control/runtime_options.c
index 8d655d524f82d164acccee9d6d77c50a5a7d161b..b074f71d08924fc948a1051f5b598c9bc5c9ef91 100644
--- a/runtime/quark/control/runtime_options.c
+++ b/runtime/quark/control/runtime_options.c
@@ -47,6 +47,11 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->ws_worker  = NULL;
     options->ws_host    = NULL;
 
+    options->swap_grain = CHAMELON_COLUMN_SWAPS;
+    options->dsd_matrix = CHAMELON_DSDMATRIX == CHAMELON_TRUE;
+    /* Unused here because of their internal structure */
+    options->panel_threads = -1;
+
     /* quark in options */
     qopt->quark = (Quark*)(chamctxt->schedopt);
     options->schedopt = qopt;
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 0968de66d35b777f2efbbc2d75ebfdbc9ccdb475..87e0560d16f34b31da66a6d11acbb07cd78e4afb 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -221,7 +221,9 @@ set(RUNTIME_COMMON
   control/runtime_async.c
   control/runtime_context.c
   control/runtime_control.c
+  control/runtime_data_handle.c
   control/runtime_descriptor.c
+  control/runtime_multithreading.c
   control/runtime_options.c
   control/runtime_profiling.c
   control/runtime_workspace.c
@@ -229,6 +231,12 @@ set(RUNTIME_COMMON
   ${RUNTIME_COMMON_GENERATED}
   )
 
+ if (CHAMELEON_USE_CUDA)
+   set(RUNTIME_COMMON
+     control/runtime_cuda.c
+     ${RUNTIME_COMMON})
+ endif()
+
 set(flags_to_add "")
 foreach(_prec ${CHAMELEON_PRECISION})
   set(flags_to_add "${flags_to_add} -DPRECISION_${_prec}")
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index 8f05509f70faf0d971c15ecb42aa064738611631..3e1b4b9b6f6e8215a692fcf4211430a05c0fc7a5 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -40,6 +40,12 @@ CHAMELEON_CL_CB(zgessq,        cti_handle_get_m(task->handles[0]), cti_handle_ge
 CHAMELEON_CL_CB(zgetrf,        cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K)
 CHAMELEON_CL_CB(zgetrf_incpiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K)
 CHAMELEON_CL_CB(zgetrf_nopiv,  cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K)
+CHAMELEON_CL_CB(zgetrf_compare_workspace,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), N)
+CHAMELEON_CL_CB(zgetrf_copy_workspace,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2.*N)
+CHAMELEON_CL_CB(zgetrf_extract_max,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2.*M+4.*N)
+CHAMELEON_CL_CB(zgetrf_rectil,        starpu_matrix_get_nx(task->dyn_handles[1]), starpu_matrix_get_nx(task->dyn_handles[1]), starpu_matrix_get_nx(task->dyn_handles[1]), (2./3.)*M*N*K)
+CHAMELEON_CL_CB(zgetrf_save_pivots,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 1)
+CHAMELEON_CL_CB(zgetrf_sp,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K)
 CHAMELEON_CL_CB(zgesum,        cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zcesca,        cti_handle_get_m(task->handles[5]), cti_handle_get_n(task->handles[5]), 0,                                      8.*M*N)
 CHAMELEON_CL_CB(zgram,         cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), 0,                                      8.*M*N)
diff --git a/runtime/starpu/codelets/codelet_zgetrf_compare_workspace.c b/runtime/starpu/codelets/codelet_zgetrf_compare_workspace.c
new file mode 100644
index 0000000000000000000000000000000000000000..75b0df10f414e2ebaf39ac7277e87c61b5fda999
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_compare_workspace.c
@@ -0,0 +1,90 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ *  @file codelet_zgetrf_compare_workspace.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @version 2.3.1
+ *  @author Omar Zenati
+ *  @author Terry Cojean
+ *  @date 2011-06-01
+ *  @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+#define get_pivot(_a_) (*((int *) (_a_)))
+#define get_diagonal(_a_) (_a_ + 1)
+#define get_maxrow(_a_) (_a_ + NB + 1)
+
+void CHAMELON_TASK_zgetrf_compare_workspace(CHAMELON_option_t *options,
+                                         int n, int h, /* int r, int p, */ int nb,
+                                         CHAMELON_data_handle_t *top,
+                                         CHAMELON_data_handle_t *bottom)
+{
+  struct starpu_codelet *codelet = &cl_zgetrf_compare_workspace;
+  void (*callback)(void*) = options->profiling ? cl_zgetrf_compare_workspace_callback : NULL;
+
+  starpu_insert_task(codelet,
+                     STARPU_VALUE,    &n,        sizeof(int),
+                     STARPU_VALUE,    &h,        sizeof(int),
+                     /* STARPU_VALUE,    &r,        sizeof(int), */
+                     /* STARPU_VALUE,    &p,        sizeof(int), */
+                     STARPU_VALUE,    &nb,       sizeof(int),
+                     STARPU_RW,       *(starpu_data_handle_t *)top->data_handle,
+                     STARPU_R,        *(starpu_data_handle_t *)bottom->data_handle,
+                     STARPU_PRIORITY, options->priority,
+                     STARPU_CALLBACK, callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                     STARPU_NAME, "zgetrf_compare_workspace",
+#endif
+                     0);
+}
+
+static void cl_zgetrf_compare_workspace_cpu_func(void *descr[], void *cl_arg)
+{
+  int N,H,/* R,P, */NB;
+  CHAMELON_Complex64_t *top, *bottom;
+
+  top    = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[0]);
+  bottom = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]);
+
+  starpu_codelet_unpack_args(cl_arg, &N, &H, /* &R, &P, */ &NB);
+
+  /* The received buffer contains the diagonal line */
+  /* if( (P <= R) && (R < 2*P) ) { */
+  /*   cblas_zcopy(N, get_diagonal(bottom), 1, */
+  /* 		   get_diagonal(top)   , 1); */
+  /* } */
+
+  CHAMELON_Complex64_t *mT = get_maxrow( top );
+  CHAMELON_Complex64_t *mB = get_maxrow( bottom );
+  /* printf("compare %e and %e\n",  cabs( mB[H] ) , cabs( mT[H] )); */
+  if( cabs( mB[H] ) > cabs( mT[H] ) )
+    {
+      get_pivot( top ) = get_pivot( bottom );
+      cblas_zcopy(N, mB, 1,
+		     mT, 1);
+    }
+}
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_compare_workspace, 2, cl_zgetrf_compare_workspace_cpu_func)
+
diff --git a/runtime/starpu/codelets/codelet_zgetrf_copy_workspace.c b/runtime/starpu/codelets/codelet_zgetrf_copy_workspace.c
new file mode 100644
index 0000000000000000000000000000000000000000..beb5cf31b961630335f20dad32964adac026f666
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_copy_workspace.c
@@ -0,0 +1,87 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ *  @file codelet_zgetrf_copy_workspace.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @version 2.3.1
+ *  @author Omar Zenati
+ *  @author Terry Cojean
+ *  @date 2011-06-01
+ *  @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+#define get_pivot(_a_) (*((int *) (_a_)))
+#define get_diagonal(_a_) (_a_ + 1)
+#define get_maxrow(_a_) (_a_ + NB + 1)
+
+void CHAMELON_TASK_zgetrf_copy_workspace(CHAMELON_option_t *options,
+                                      int n, int h, /* int r, int p, */ int nb,
+                                      CHAMELON_data_handle_t *top,
+                                      CHAMELON_data_handle_t *bottom)
+{
+    struct starpu_codelet *codelet = &cl_zgetrf_copy_workspace;
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_copy_workspace_callback : NULL;
+
+    starpu_insert_task(codelet,
+		       STARPU_VALUE,    &n,        sizeof(int),
+                       STARPU_VALUE,    &h,        sizeof(int),
+                       /* STARPU_VALUE,    &r,        sizeof(int), */
+		       /* STARPU_VALUE,    &p,        sizeof(int), */
+                       STARPU_VALUE,    &nb,       sizeof(int),
+                       STARPU_RW,       *(starpu_data_handle_t *)top->data_handle,
+                       STARPU_R,        *(starpu_data_handle_t *)bottom->data_handle,
+                       STARPU_PRIORITY, options->priority,
+                       STARPU_CALLBACK, callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                       STARPU_NAME, "zgetrf_copy_workspace",
+#endif
+                       0);
+
+}
+
+static void cl_zgetrf_copy_workspace_cpu_func(void *descr[], void *cl_arg)
+{
+  int N,H,/* R,P, */NB;
+  CHAMELON_Complex64_t *top, *bottom;
+
+  top    = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[0]);
+  bottom = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]);
+
+  starpu_codelet_unpack_args(cl_arg, &N, &H, /* &R, &P, */ &NB);
+
+  CHAMELON_Complex64_t *mT = get_maxrow( top );
+  CHAMELON_Complex64_t *mB = get_maxrow( bottom );
+  CHAMELON_Complex64_t *dT = get_diagonal( top );
+  CHAMELON_Complex64_t *dB = get_diagonal( bottom );
+
+
+  get_pivot( top ) = get_pivot( bottom );
+  cblas_zcopy(N, mB, 1,
+	         mT, 1);
+  cblas_zcopy(N, dB, 1,
+	         dT, 1);
+}
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_copy_workspace, 2, cl_zgetrf_copy_workspace_cpu_func)
+
diff --git a/runtime/starpu/codelets/codelet_zgetrf_extract_max.c b/runtime/starpu/codelets/codelet_zgetrf_extract_max.c
new file mode 100644
index 0000000000000000000000000000000000000000..5f19e3ac06665650ebafc2a0bf41b2ebdf9ac752
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_extract_max.c
@@ -0,0 +1,80 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ *  @file codelet_zgetrf_extract_max.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @version 2.3.1
+ *  @author Omar Zenati
+ *  @author Terry Cojean
+ *  @date 2011-06-01
+ *  @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+void CHAMELON_TASK_zgetrf_extract_max(const CHAMELON_option_t *options,
+                                   int k, int m, int n, int h, int nb,
+                                   CHAMELON_desc_t *A, int Am, int An,
+                                   CHAMELON_data_handle_t *max_workspace)
+{
+    struct starpu_codelet *codelet = &cl_zgetrf_extract_max;
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_extract_max_callback : NULL;
+    int lda = BLKLDD( A, Am );
+
+    if (chameleon_desc_islocal(A, Am, An))
+    {
+        starpu_insert_task(codelet,
+                           STARPU_VALUE,    &k,        sizeof(int),
+                           STARPU_VALUE,    &m,        sizeof(int),
+                           STARPU_VALUE,    &n,        sizeof(int),
+                           STARPU_VALUE,    &h,        sizeof(int),
+                           STARPU_VALUE,    &nb,       sizeof(int),
+                           STARPU_VALUE,    &Am,       sizeof(int),
+                           STARPU_RW,       RTBLKADDR( A, CHAMELON_Complex64_t, Am, An ),
+                           STARPU_VALUE,    &lda,      sizeof(int),
+                           STARPU_RW,    *(starpu_data_handle_t *)max_workspace->data_handle,
+                           STARPU_PRIORITY, options->priority,
+                           STARPU_CALLBACK, callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                           STARPU_NAME, "zgetrf_extract_max",
+#endif
+                           0);
+    }
+}
+
+static void cl_zgetrf_extract_max_cpu_func(void *descr[], void *cl_arg)
+{
+  int K,M,N,H,NB,AM;
+  CHAMELON_Complex64_t *A;
+  int LDA;
+  CHAMELON_Complex64_t *mw;
+
+  A  = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[0]);
+  mw = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]);
+
+  starpu_codelet_unpack_args(cl_arg, &K, &M, &N, &H, &NB, &AM, &LDA);
+
+  CORE_zgetrf_extract_max(A, mw, K, M, N, H, NB, AM, LDA);
+
+}
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_extract_max, 2, cl_zgetrf_extract_max_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zgetrf_rectil.c b/runtime/starpu/codelets/codelet_zgetrf_rectil.c
new file mode 100644
index 0000000000000000000000000000000000000000..95a4584aaaf92c040836b8715bce2997d98ec82c
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_rectil.c
@@ -0,0 +1,122 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_zgetrf_nopiv.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 2.6.0
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @author Mathieu Faverge
+ * @author Emmanuel Agullo
+ * @author Cedric Castagnede
+ * @date 2013-02-01
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+/**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ *  CORE_zgetrf_rectil computes an LU factorization of a general diagonal
+ *  dominant M-by-N matrix A with pivoting right-looking.
+ *
+ *  The factorization has the form
+ *     A = L * U
+ *  where L is lower triangular with unit
+ *  diagonal elements (lower trapezoidal if m > n), and U is upper
+ *  triangular (upper trapezoidal if m < n).
+ *
+ *  This is the right-looking Level 3 BLAS version of the algorithm.
+ *  WARNING: Your matrix need to be diagonal dominant if you want to call this
+ *  routine safely.
+ *
+ *******************************************************************************
+ *
+ *  @param[in,out] A
+ *          On entry, the M-by-N matrix to be factored.
+ *          On exit, the factors L and U from the factorization
+ *          A = P*L*U; the unit diagonal elements of L are not stored.
+ *
+ * @param[out] IPIV
+ *         The pivot indices; for 1 <= i <= min(M,N), row i of the
+ *         tile was interchanged with row IPIV(i).
+ *******************************************************************************
+ *
+ * @return
+ *         \retval CHAMELON_SUCCESS successful exit
+ *         \retval <0 if INFO = -k, the k-th argument had an illegal value
+ *         \retval >0 if INFO = k, U(k,k) is exactly zero. The factorization
+ *              has been completed, but the factor U is exactly
+ *              singular, and division by zero will occur if it is used
+ *              to solve a system of equations.
+ *
+ ******************************************************************************/
+
+void CHAMELON_TASK_zgetrf_rectil(const CHAMELON_option_t *options,
+                              const CHAMELON_desc_t *A, CHAMELON_data_handle_t *IPIV_h)
+{
+    int i;
+    struct starpu_codelet *codelet = &cl_zgetrf_rectil;
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_rectil_callback : NULL;
+    struct starpu_data_descr data[A->mt+1];
+
+    data[0].handle = *(starpu_data_handle_t*)(IPIV_h->data_handle);
+    data[0].mode   = STARPU_RW;
+    for (i=0 ; i< A->mt ; i++) {
+        data[i+1].handle = RTBLKADDR(A, CHAMELON_Complex64_t, i, 0);
+        data[i+1].mode   = STARPU_RW;
+    }
+
+    starpu_insert_task(codelet,
+                       STARPU_VALUE,     &A,            sizeof(CHAMELON_desc_t*),
+                       STARPU_DATA_MODE_ARRAY, &data, A->mt+1,
+                       STARPU_PRIORITY,  options->priority,
+                       STARPU_SCHED_CTX, options->panel_threads,
+                       STARPU_CALLBACK,  callback,
+                       STARPU_POSSIBLY_PARALLEL, 1,
+                       0);
+}
+
+/*
+ * Codelet CPU
+ */
+static void cl_zgetrf_rectil_cpu_func(void *descr[], void *cl_arg)
+{
+    CHAMELON_desc_t *A = NULL;
+    int *IPIV = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+    unsigned sched_ctx = starpu_task_get_current()->sched_ctx;
+    int info[3];
+
+    starpu_codelet_unpack_args(cl_arg, &A);
+
+    int rank = starpu_sched_ctx_get_worker_rank(sched_ctx);
+    info[1] = rank == -1 ? 0 : rank;
+    info[2] = rank == -1 ? 1 : starpu_sched_ctx_get_nworkers(sched_ctx);
+
+    CORE_zgetrf_rectil(A, IPIV, info );
+    if (*info!=0) {
+        fprintf(stderr, "Core_zgetrf_rectil failed at column/line %d.\n", *info);
+        exit(-1);
+    }
+}
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_rectil, STARPU_VARIABLE_NBUFFERS, cl_zgetrf_rectil_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zgetrf_save_pivots.c b/runtime/starpu/codelets/codelet_zgetrf_save_pivots.c
new file mode 100644
index 0000000000000000000000000000000000000000..88314d1913980e689e10aa56b05597f748d42d8e
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_save_pivots.c
@@ -0,0 +1,63 @@
+/**
+ *
+ *  @file codelet_zgetrf_save_pivots.c
+ *
+ *  MAGMA codelets kernel
+ *  MAGMA is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @version 2.3.1
+ *  @author Omar Zenati
+ *  @date 2011-06-01
+ *  @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+#define get_pivot(_a_) (*((int *) (_a_)))
+
+void CHAMELON_TASK_zgetrf_save_pivots(CHAMELON_option_t *options,
+                                   int h,
+                                   CHAMELON_data_handle_t *IPIV,
+                                   CHAMELON_data_handle_t *mw)
+{
+    struct starpu_codelet *codelet = &cl_zgetrf_save_pivots;
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_save_pivots_callback : NULL;
+
+
+    starpu_insert_task(codelet,
+                       STARPU_VALUE,    &h,        sizeof(int),
+                       STARPU_RW,       *(starpu_data_handle_t *)IPIV->data_handle,
+                       STARPU_R,        *(starpu_data_handle_t *)mw->data_handle,
+                       STARPU_PRIORITY, options->priority,
+                       STARPU_CALLBACK, callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                       STARPU_NAME, "zgetrf_save_pivots",
+#endif
+                       0);
+
+}
+
+static void cl_zgetrf_save_pivots_cpu_func(void *descr[], void *cl_arg)
+{
+  int H;
+  int *pivots;
+  CHAMELON_Complex64_t *mw;
+
+  pivots = (int *)                STARPU_VECTOR_GET_PTR(descr[0]);
+  mw     = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]);
+
+  starpu_codelet_unpack_args(cl_arg, &H);
+
+  pivots[H] = get_pivot( mw ) + 1;
+
+}
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgetrf_save_pivots, 2, cl_zgetrf_save_pivots_cpu_func)
+
diff --git a/runtime/starpu/codelets/codelet_zgetrf_sp.c b/runtime/starpu/codelets/codelet_zgetrf_sp.c
new file mode 100644
index 0000000000000000000000000000000000000000..f52a1e5d8659c730fcde61f1e37fa8962e237b52
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgetrf_sp.c
@@ -0,0 +1,109 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017, 2016, 2017, Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_zgetrf_sp.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 1.0.0
+ * @author Omar Zenati
+ * @author Terry Cojean
+ * @date 2011-06-01
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+/**
+ *
+ * @ingroup CORE_CHAMELON_Complex64_t
+ *
+ **/
+void CHAMELON_TASK_zgetrf_sp( CHAMELON_option_t *options,
+                           int m, int n,
+                           CHAMELON_desc_t *A, int Am, int An,
+                           double criteria, CHAMELON_data_handle_t *nb_pivot)
+{
+    struct starpu_codelet *zgetrf_sp_codelet;
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_sp_callback : NULL;
+    int lda = BLKLDD( A, Am );
+
+    zgetrf_sp_codelet = &cl_zgetrf_sp;
+
+    if ( chameleon_desc_islocal( A, Am, An ) ) {
+        starpu_insert_task(zgetrf_sp_codelet,
+                           STARPU_VALUE,    &m,        sizeof(int),
+                           STARPU_VALUE,    &n,        sizeof(int),
+                           STARPU_RW,       RTBLKADDR( A, CHAMELON_Complex64_t, Am, An ),
+                           STARPU_VALUE,    &lda,      sizeof(int),
+                           STARPU_VALUE,    &criteria, sizeof(int),
+                           STARPU_RW,       *(starpu_data_handle_t *)nb_pivot->data_handle,
+                           STARPU_PRIORITY, options->priority,
+                           STARPU_CALLBACK, callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                           STARPU_NAME, "zgetrf_sp",
+#endif
+                           0);
+    }
+}
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgetrf_sp_cpu_func(void *descr[], void *cl_arg)
+{
+    int M,N;
+    CHAMELON_Complex64_t *A;
+    int LDA;
+    double CRITERIA;
+    int *nb_pivot;
+
+    A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    nb_pivot = (int *) STARPU_MATRIX_GET_PTR(descr[1]);
+
+    starpu_codelet_unpack_args(cl_arg, &M, &N, &LDA, &CRITERIA);
+    CORE_zgetrf_sp( M, N, A, LDA, CRITERIA, nb_pivot );
+}
+
+#ifdef CHAMELEON_USE_MAGMA
+static void cl_zgetrf_sp_cuda_func(void *descr[], void *cl_arg)
+{
+    int M;
+    int N;
+    cuDoubleComplex *A;
+    int LDA;
+    double CRITERIA;
+    int *nb_pivot;
+    int info = 0;
+
+    A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    nb_pivot = (int *) STARPU_MATRIX_GET_PTR(descr[1]);
+
+    starpu_codelet_unpack_args(cl_arg, &M, &N, &LDA, &CRITERIA);
+
+    CUDA_zgetrf_sp(M, N, A, LDA, CRITERIA, nb_pivot, &info);
+
+    cudaThreadSynchronize();
+}
+#endif
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+#if defined CHAMELEON_USE_MAGMA
+CODELETS(zgetrf_sp, 2, cl_zgetrf_sp_cpu_func, cl_zgetrf_sp_cuda_func, 0)
+#else
+CODELETS_CPU(zgetrf_sp, 2, cl_zgetrf_sp_cpu_func)
+#endif
diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..9fda53877f07c414b6f12796895756a40f9ed37e
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zlaswp.c
@@ -0,0 +1,168 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 IPB. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_zlaswp.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 2.6.0
+ * @author Mathieu Faverge
+ * @author Terry Cojean
+ * @date 2010-11-15
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+void CHAMELON_TASK_zlaswp(const CHAMELON_option_t *options,
+                       int n, const CHAMELON_desc_t *A, int Am, int An, int lda,
+                       int i1,  int i2,
+                       CHAMELON_data_handle_t *ipiv, int inc)
+{
+    struct starpu_codelet *codelet = &cl_zlaswp;
+    void (*callback)(void*) = options->profiling ? cl_zlaswp_callback : NULL;
+    starpu_insert_task(codelet,
+                       STARPU_VALUE,    &n,                      sizeof(int),
+                       STARPU_RW,        RTBLKADDR(A, CHAMELON_Complex64_t, Am, An),
+                       STARPU_VALUE,  &lda,                      sizeof(int),
+                       STARPU_VALUE,   &i1,                      sizeof(int),
+                       STARPU_VALUE,   &i2,                      sizeof(int),
+                       STARPU_R,      *(starpu_data_handle_t*)(ipiv->data_handle),
+                       STARPU_VALUE,  &inc,                      sizeof(int),
+                       STARPU_PRIORITY,    options->priority,
+                       STARPU_CALLBACK,    callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                       STARPU_NAME, "zlaswp",
+#endif
+                       0);
+}
+
+static void cl_zlaswp_cpu_func(void *descr[], void *cl_arg)
+{
+    int n, lda, i1, i2, inc;
+    int *ipiv;
+    CHAMELON_Complex64_t *A;
+
+    A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &n, &lda, &i1, &i2, &inc);
+    CORE_zlaswp(n, A, lda, i1, i2, ipiv, inc);
+}
+
+/* descA is a submatrix representing a panel (with multiple subtiles) */
+void CHAMELON_TASK_zlaswp_ontile(const CHAMELON_option_t *options,
+                              const CHAMELON_desc_t *descA,
+                              const CHAMELON_desc_t *Aij, int Aijm, int Aijn,
+                              int i1,  int i2,
+                              CHAMELON_data_handle_t *ipiv, int inc)
+{
+    int i;
+    struct starpu_codelet *codelet = &cl_zlaswp_ontile;
+    void (*callback)(void*) = options->profiling ? cl_zlaswp_ontile_callback : NULL;
+    struct starpu_data_descr data[descA->mt+1];
+
+    data[0].handle = *(starpu_data_handle_t*)(ipiv->data_handle);
+    data[0].mode   = STARPU_R;
+    for (i=0 ; i< descA->mt ; i++) {
+        data[i+1].handle = RTBLKADDR(descA, CHAMELON_Complex64_t, i, 0);
+        data[i+1].mode   = STARPU_RW;
+    }
+
+    starpu_insert_task(codelet,
+                       STARPU_VALUE,     &descA,            sizeof(CHAMELON_desc_t*),
+                       STARPU_VALUE,        &i1,                      sizeof(int),
+                       STARPU_VALUE,        &i2,                      sizeof(int),
+                       STARPU_VALUE,       &inc,                      sizeof(int),
+                       STARPU_VALUE,      &Aijm,                      sizeof(int),
+                       STARPU_VALUE,      &Aijn,                      sizeof(int),
+                       STARPU_DATA_MODE_ARRAY, &data, descA->mt+1,
+                       STARPU_PRIORITY,    options->priority,
+                       STARPU_CALLBACK,    callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                       STARPU_NAME, "zlaswp_ontile",
+#endif
+                       0);
+}
+
+
+static void cl_zlaswp_ontile_cpu_func(void *descr[], void *cl_arg)
+{
+    int i1, i2, inc;
+    int *ipiv;
+    int Am, An;
+    CHAMELON_Complex64_t *A;
+    CHAMELON_desc_t *descA;
+
+    ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
+    A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &inc, &Am, &An);
+
+    CORE_zlaswp_ontile(descA, i1, i2, ipiv, inc);
+}
+
+/* descA is a submatrix representing a panel (with multiple subtiles) */
+void CHAMELON_TASK_zlaswpc_ontile(const CHAMELON_option_t *options,
+                               const CHAMELON_desc_t *descA,
+                               const CHAMELON_desc_t *Aij, int Aijm, int Aijn,
+                               int i1,  int i2,
+                               CHAMELON_data_handle_t *ipiv, int inc)
+{
+    int i;
+    struct starpu_codelet *codelet = &cl_zlaswpc_ontile;
+    void (*callback)(void*) = options->profiling ? cl_zlaswpc_ontile_callback : NULL;
+    struct starpu_data_descr data[descA->mt+1];
+
+    data[0].handle = *(starpu_data_handle_t*)(ipiv->data_handle);
+    data[0].mode   = STARPU_R;
+    for (i=0 ; i< descA->mt ; i++) {
+        data[i+1].handle = RTBLKADDR(descA, CHAMELON_Complex64_t, i, 0);
+        data[i+1].mode   = STARPU_RW;
+    }
+
+    starpu_insert_task(
+        codelet,
+        STARPU_VALUE,     &descA,              sizeof(CHAMELON_desc_t),
+        STARPU_VALUE,        &i1,                      sizeof(int),
+        STARPU_VALUE,        &i2,                      sizeof(int),
+        STARPU_VALUE,       &inc,                      sizeof(int),
+        STARPU_DATA_MODE_ARRAY, &data, descA->mt+1,
+        STARPU_PRIORITY,    options->priority,
+        STARPU_CALLBACK,    callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zlaswp_ontile",
+#endif
+        0);
+}
+
+
+static void cl_zlaswpc_ontile_cpu_func(void *descr[], void *cl_arg)
+{
+    int i1, i2, inc;
+    int *ipiv;
+    CHAMELON_Complex64_t *A;
+    CHAMELON_desc_t *descA;
+
+    starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &ipiv, &inc);
+    CORE_zlaswpc_ontile(descA, i1, i2, ipiv, inc);
+}
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zlaswp, 2, cl_zlaswp_cpu_func)
+
+CODELETS_CPU(zlaswp_ontile, STARPU_VARIABLE_NBUFFERS, cl_zlaswp_ontile_cpu_func)
+
+CODELETS_CPU(zlaswpc_ontile, STARPU_VARIABLE_NBUFFERS, cl_zlaswpc_ontile_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c
index df95bf5d2189bf8d481f4178ef3444d4f2cfc8f5..d62b8ee6ef34458537228a5e7b9feb9da9e2f52b 100644
--- a/runtime/starpu/codelets/codelet_zplrnt.c
+++ b/runtime/starpu/codelets/codelet_zplrnt.c
@@ -35,6 +35,7 @@ struct cl_zplrnt_args_s {
     int m0;
     int n0;
     unsigned long long int seed;
+    int dsdmatrix;
 };
 
 #if !defined(CHAMELEON_SIMULATION)
@@ -47,7 +48,7 @@ cl_zplrnt_cpu_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
 
     TCORE_zplrnt( clargs->m, clargs->n, tileA,
-                  clargs->bigM, clargs->m0, clargs->n0, clargs->seed );
+                  clargs->bigM, clargs->m0, clargs->n0, clargs->seed, clargs->dsdmatrix );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -58,7 +59,7 @@ CODELETS_CPU( zplrnt, cl_zplrnt_cpu_func )
 
 void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                          int m, int n, const CHAM_desc_t *A, int Am, int An,
-                         int bigM, int m0, int n0, unsigned long long int seed )
+                         int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix )
 {
     struct cl_zplrnt_args_s *clargs = NULL;
     void (*callback)(void*);
@@ -80,6 +81,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
         clargs->m0    = m0;
         clargs->n0    = n0;
         clargs->seed  = seed;
+        clargs->dsdmatrix = dsdmatrix,
     }
 
     /* Callback fro profiling information */
diff --git a/runtime/starpu/codelets/codelet_zswp.c b/runtime/starpu/codelets/codelet_zswp.c
new file mode 100644
index 0000000000000000000000000000000000000000..97bb72cb4b432b6f01487bc98eed513c8d6bd44a
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zswp.c
@@ -0,0 +1,141 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 IPB. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_zlaswp.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 2.6.0
+ * @author Mathieu Faverge
+ * @author Terry Cojean
+ * @date 2010-11-15
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+void CHAMELON_TASK_zswpp(CHAMELON_option_t *options,
+                      CHAMELON_desc_t *A, CHAMELON_desc_t *Aij, int Aijm, int Aijn,
+                      CHAMELON_desc_t *B, int Bm, int Bn, int i1,  int i2,
+                      CHAMELON_data_handle_t *ipiv_handle, int inc)
+{
+    if ( chameleon_desc_islocal( Aij, Aijm, Aijn ) ||  chameleon_desc_islocal( B, Bm, Bn )) {
+        struct starpu_codelet *codelet = &cl_zswpp;
+        void (*callback)(void*) = options->profiling ? cl_zswpp_callback : NULL;
+
+        starpu_task_insert(codelet,
+                           STARPU_VALUE,      &A,                   sizeof(CHAMELON_desc_t*),
+                           STARPU_VALUE,      &i1,                  sizeof(int),
+                           STARPU_VALUE,      &i2,                  sizeof(int),
+                           STARPU_VALUE,      &inc,                 sizeof(int),
+                           STARPU_VALUE,      &Aijm,                sizeof(int),
+                           STARPU_VALUE,      &Aijn,                sizeof(int),
+                           STARPU_VALUE,      &Bm,                  sizeof(int),
+                           STARPU_VALUE,      &Bn,                  sizeof(int),
+                           STARPU_RW,         RTBLKADDR(Aij, CHAMELON_Complex64_t, Aijm, Aijn),
+                           STARPU_R,      *(starpu_data_handle_t *)ipiv_handle->data_handle,
+                           STARPU_RW,         RTBLKADDR(B, CHAMELON_Complex64_t, Bm, Bn),
+                           STARPU_PRIORITY,   options->priority,
+                           STARPU_CALLBACK,   callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                           STARPU_NAME, "zswpp",
+#endif
+                           0);
+    }
+
+}
+
+static void cl_zswpp_cpu_func(void *descr[], void *cl_arg)
+{
+    int i1, i2, inc;
+    int *ipiv;
+    int Am, An, Bm, Bn;
+    CHAMELON_Complex64_t *A, *B;
+    CHAMELON_desc_t *descA;
+
+    starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &inc, &Am, &An, &Bm, &Bn);
+
+    A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]);
+    B = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
+
+    //Put Bm relative to the submatrix
+    Bm -= descA->i/descA->mb;
+    CORE_zswpp(descA, Bm, i1, i2, ipiv, inc);
+}
+
+void CHAMELON_TASK_zswptr_ontile(const CHAMELON_option_t *options,
+                              const CHAMELON_desc_t *descA,
+                              const CHAMELON_desc_t *Aij, int Aijm, int Aijn,
+                              int i1,  int i2,
+                              CHAMELON_data_handle_t *ipiv, int inc,
+                              const CHAMELON_desc_t *Akk, int Akkm, int Akkn, int ldak)
+{
+    int i;
+    struct starpu_codelet *codelet = &cl_zswptr_ontile;
+    void (*callback)(void*) = options->profiling ? cl_zswptr_ontile_callback : NULL;
+
+    if ( chameleon_desc_islocal( descA, 0, 0 )) {
+        struct starpu_data_descr data[descA->mt+3];
+
+        data[0].handle = RTBLKADDR(Aij, CHAMELON_Complex64_t, Aijm, Aijn);
+        data[0].mode   = STARPU_RW;
+        data[1].handle = *(starpu_data_handle_t*)(ipiv->data_handle);
+        data[1].mode   = STARPU_R;
+        data[2].handle = RTBLKADDR(Akk, CHAMELON_Complex64_t, Akkm, Akkn);
+        data[2].mode   = STARPU_RW;
+        for (i=0 ; i< descA->mt ; i++) {
+            data[i+3].handle = RTBLKADDR(descA, CHAMELON_Complex64_t, i, 0);
+            data[i+3].mode   = STARPU_RW;
+        }
+
+        starpu_insert_task(codelet,
+                           STARPU_VALUE, &descA,            sizeof(CHAMELON_desc_t*),
+                           STARPU_VALUE,    &i1,                      sizeof(int),
+                           STARPU_VALUE,    &i2,                      sizeof(int),
+                           STARPU_VALUE,   &inc,                      sizeof(int),
+                           STARPU_VALUE,  &ldak,                      sizeof(int),
+                           STARPU_VALUE,  &Aijm,                      sizeof(int),
+                           STARPU_VALUE,  &Aijn,                      sizeof(int),
+                           STARPU_DATA_MODE_ARRAY, &data, descA->mt + 3,
+                           STARPU_PRIORITY, options->priority,
+                           STARPU_CALLBACK, callback, NULL,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                           STARPU_NAME, "zswptr_ontile",
+#endif
+                           0);
+    }
+}
+
+
+static void cl_zswptr_ontile_cpu_func(void *descr[], void *cl_arg)
+{
+    int i1, i2, inc, ldak;
+    int *ipiv;
+    int Am, An;
+    CHAMELON_Complex64_t *A, *Akk;
+    CHAMELON_desc_t *descA;
+
+    A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]);
+    Akk = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &inc, &ldak, &Am, &An);
+
+    CORE_zswptr_ontile(descA, i1, i2, ipiv, inc, Akk, ldak);
+}
+
+CODELETS_CPU(zswptr_ontile, STARPU_VARIABLE_NBUFFERS, cl_zswptr_ontile_cpu_func)
+
+CODELETS_CPU(zswpp, 3, cl_zswpp_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zswp_extract_rows.c b/runtime/starpu/codelets/codelet_zswp_extract_rows.c
new file mode 100644
index 0000000000000000000000000000000000000000..897bf11f2df18123b49fdfa7d3100dd0687826db
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zswp_extract_rows.c
@@ -0,0 +1,190 @@
+/**
+ *
+ *  @file codelet_zswap_insert_rows.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @version 2.3.1
+ *  @author Omar Zenati
+ *  @author Terry Cojean
+ *  @date 2011-06-01
+ *  @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+void CHAMELON_TASK_zswp_extract_rows(CHAMELON_option_t *options,
+                                  int nb_pivots, int global_MB,
+                                  int K, int SRC_M, int DST_M,
+                                  int NB, int MB,
+                                  CHAMELON_data_handle_t *perm,
+                                  CHAMELON_desc_t *A, int Am, int An,
+                                  CHAMELON_data_handle_t *rows_handle)
+{
+    void (*callback)(void*) = options->profiling ? cl_zswp_extract_rows_callback : NULL;
+    struct starpu_codelet *codelet = &cl_zswp_extract_rows;
+    int LDA = BLKLDD(A, Am);
+
+    if (nb_pivots > 0) {
+            starpu_task_insert(codelet,
+                               STARPU_VALUE,    &nb_pivots, sizeof(int),
+                               STARPU_VALUE,    &global_MB,    sizeof(int),
+                               STARPU_VALUE,    &K,         sizeof(int),
+                               STARPU_VALUE,    &SRC_M,     sizeof(int),
+                               STARPU_VALUE,    &DST_M,     sizeof(int),
+                               STARPU_VALUE,    &NB,        sizeof(int),
+                               STARPU_VALUE,    &MB,        sizeof(int),
+                               STARPU_VALUE,    &LDA,       sizeof(int),
+                               STARPU_R,    *(starpu_data_handle_t *)perm->data_handle,
+                               STARPU_R|STARPU_LOCALITY,        RTBLKADDR(A, CHAMELON_Complex64_t, Am, An),
+                               STARPU_W,   *(starpu_data_handle_t *)rows_handle->data_handle,
+                               STARPU_PRIORITY, options->priority,
+                               STARPU_CALLBACK, callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                               STARPU_NAME, "zswp_extract_rows",
+#endif
+                               0);
+    }
+}
+
+static void cl_zswp_extract_rows_cpu_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, nb_pivots, global_MB;
+    CHAMELON_Complex64_t *A;
+    CHAMELON_Complex64_t *rows;
+
+    perm        =               (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A           = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows        = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    CORE_zswp_extract_rows(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, NB, MB, LDA);
+ }
+
+#ifdef CHAMELEON_USE_CUDA
+static void cl_zswp_extract_rows_cuda_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, nb_pivots, global_MB;
+    cuDoubleComplex *A;
+    cuDoubleComplex *rows;
+    CUstream stream;
+
+    perm        =               (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A           = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows        = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]);
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    stream = starpu_cuda_get_local_stream();
+
+    CUDA_zswp_extract_rows(nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA, stream);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS(zswp_extract_rows, 3, cl_zswp_extract_rows_cpu_func, cl_zswp_extract_rows_cuda_func, STARPU_CUDA_ASYNC)
+
+
+void CHAMELON_TASK_zswp_extract_rows2(CHAMELON_option_t *options,
+                                   int nb_pivots, int global_MB,
+                                   int K, int SRC_M, int DST_M,
+                                   int NB, int MB,
+                                   CHAMELON_data_handle_t *ipiv,
+                                   CHAMELON_data_handle_t *perm,
+                                   CHAMELON_desc_t *A, int Am, int An,
+                                   CHAMELON_data_handle_t *rows_handle,
+                                   CHAMELON_data_handle_t *idxs)
+{
+    void (*callback)(void*) = options->profiling ? cl_zswp_extract_rows_callback : NULL;
+    struct starpu_codelet *codelet = &cl_zswp_extract_rows2;
+    int LDA = BLKLDD(A, Am);
+
+    if (nb_pivots > 0) {
+            starpu_task_insert(codelet,
+                               STARPU_VALUE,    &nb_pivots, sizeof(int),
+                               STARPU_VALUE,    &global_MB,    sizeof(int),
+                               STARPU_VALUE,    &K,         sizeof(int),
+                               STARPU_VALUE,    &SRC_M,     sizeof(int),
+                               STARPU_VALUE,    &DST_M,     sizeof(int),
+                               STARPU_VALUE,    &NB,        sizeof(int),
+                               STARPU_VALUE,    &MB,        sizeof(int),
+                               STARPU_VALUE,    &LDA,       sizeof(int),
+                               STARPU_R,    *(starpu_data_handle_t *)perm->data_handle,
+                               STARPU_R|STARPU_LOCALITY, RTBLKADDR(A, CHAMELON_Complex64_t, Am, An),
+                               STARPU_W,   *(starpu_data_handle_t *)rows_handle->data_handle,
+                               STARPU_R,   *(starpu_data_handle_t *)idxs->data_handle,
+                               STARPU_R,   *(starpu_data_handle_t *)ipiv->data_handle, /* dummy for proper synchronization */
+                               STARPU_PRIORITY, options->priority,
+                               STARPU_CALLBACK, callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                               STARPU_NAME, "zswp_extract_rows",
+#endif
+                               0);
+    }
+}
+
+static void cl_zswp_extract_rows2_cpu_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, *idxs, *ipiv, nb_pivots, global_MB;
+    CHAMELON_Complex64_t *A;
+    CHAMELON_Complex64_t *rows;
+
+    perm        =               (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A           = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows        = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]);
+    idxs        =               (int *) STARPU_VECTOR_GET_PTR(descr[3]);
+    ipiv        =               (int *) STARPU_VECTOR_GET_PTR(descr[4]);
+
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    CORE_zswp_extract_rows2(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, idxs, NB, MB, LDA);
+ }
+
+#ifdef CHAMELEON_USE_CUDA
+static void cl_zswp_extract_rows2_cuda_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, *idxs, *ipiv, nb_pivots, global_MB;
+    cuDoubleComplex *A;
+    cuDoubleComplex *rows;
+    CUstream stream;
+
+    perm        =             (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A           = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows        = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]);
+    idxs        =             (int *) STARPU_VECTOR_GET_PTR(descr[3]);
+    ipiv        =             (int *) STARPU_VECTOR_GET_PTR(descr[4]);
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    stream = starpu_cuda_get_local_stream();
+
+    CUDA_zswp_extract_rows2(nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA, stream);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS(zswp_extract_rows2, 5, cl_zswp_extract_rows2_cpu_func, cl_zswp_extract_rows2_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/codelets/codelet_zswp_insert_rows.c b/runtime/starpu/codelets/codelet_zswp_insert_rows.c
new file mode 100644
index 0000000000000000000000000000000000000000..1710e74c4f6bd684754dc8294666cfadb2dc19cb
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zswp_insert_rows.c
@@ -0,0 +1,195 @@
+/**
+ *
+ *  @file codelet_zswap_insert_rows.c
+ *
+ *  CHAMELON codelets kernel
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ *  @version 2.3.1
+ *  @author Omar Zenati
+ *  @author Terry Cojean
+ *  @date 2011-06-01
+ *  @precisions normal z -> c d s
+ *
+ **/
+
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+void CHAMELON_TASK_zswp_insert_rows(CHAMELON_option_t *options,
+                                 int nb_pivots, int global_MB,
+                                 int K, int SRC_M, int DST_M,
+                                 int NB, int MB,
+                                 CHAMELON_data_handle_t *perm,
+                                 CHAMELON_desc_t *A, int Am, int An,
+                                 CHAMELON_data_handle_t *rows_handle)
+{
+    void (*callback)(void*) = options->profiling ? cl_zswp_insert_rows_callback : NULL;
+    struct starpu_codelet *codelet = &cl_zswp_insert_rows;
+    int LDA = BLKLDD(A, Am);
+
+    if(nb_pivots > 0) {
+            starpu_insert_task(
+                codelet,
+                STARPU_VALUE,    &nb_pivots, sizeof(int),
+                STARPU_VALUE,    &global_MB, sizeof(int),
+                STARPU_VALUE,    &K,         sizeof(int),
+                STARPU_VALUE,    &SRC_M,     sizeof(int),
+                STARPU_VALUE,    &DST_M,     sizeof(int),
+                STARPU_VALUE,    &NB,        sizeof(int),
+                STARPU_VALUE,    &MB,        sizeof(int),
+                STARPU_VALUE,    &LDA,       sizeof(int),
+                STARPU_R,   *(starpu_data_handle_t *)perm->data_handle,
+                STARPU_RW|STARPU_LOCALITY,  RTBLKADDR(A, CHAMELON_Complex64_t, Am, An),
+                STARPU_R,   *(starpu_data_handle_t *)rows_handle->data_handle,
+                STARPU_PRIORITY, options->priority,
+                STARPU_CALLBACK, callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                STARPU_NAME, "zswp_insert_rows",
+#endif
+                0);
+    }
+}
+
+static void cl_zswp_insert_rows_cpu_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, nb_pivots, global_MB;
+    CHAMELON_Complex64_t *A;
+    CHAMELON_Complex64_t *rows;
+
+    perm =               (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A    = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    CORE_zswp_insert_rows(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, NB, MB, LDA);
+
+    return;
+}
+
+#ifdef CHAMELEON_USE_CUDA
+static void cl_zswp_insert_rows_cuda_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, nb_pivots, global_MB;
+    cuDoubleComplex *A;
+    cuDoubleComplex *rows;
+    CUstream stream;
+
+    perm =             (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A    = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]);
+
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    stream = starpu_cuda_get_local_stream();
+
+    CUDA_zswp_insert_rows(nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA, stream);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS(zswp_insert_rows, 3, cl_zswp_insert_rows_cpu_func, cl_zswp_insert_rows_cuda_func, STARPU_CUDA_ASYNC)
+
+
+
+void CHAMELON_TASK_zswp_insert_rows2(CHAMELON_option_t *options,
+                                 int nb_pivots, int global_MB,
+                                 int K, int SRC_M, int DST_M,
+                                 int NB, int MB,
+                                 CHAMELON_data_handle_t *perm,
+                                 CHAMELON_desc_t *A, int Am, int An,
+                                  CHAMELON_data_handle_t *rows_handle,
+                                  CHAMELON_data_handle_t *idxs)
+{
+    void (*callback)(void*) = options->profiling ? cl_zswp_insert_rows_callback : NULL;
+    struct starpu_codelet *codelet = &cl_zswp_insert_rows2;
+    int LDA = BLKLDD(A, Am);
+
+    if(nb_pivots > 0) {
+            starpu_insert_task(
+                codelet,
+                STARPU_VALUE,    &nb_pivots, sizeof(int),
+                STARPU_VALUE,    &global_MB, sizeof(int),
+                STARPU_VALUE,    &K,         sizeof(int),
+                STARPU_VALUE,    &SRC_M,     sizeof(int),
+                STARPU_VALUE,    &DST_M,     sizeof(int),
+                STARPU_VALUE,    &NB,        sizeof(int),
+                STARPU_VALUE,    &MB,        sizeof(int),
+                STARPU_VALUE,    &LDA,       sizeof(int),
+                STARPU_R,   *(starpu_data_handle_t *)perm->data_handle,
+                STARPU_RW|STARPU_LOCALITY,  RTBLKADDR(A, CHAMELON_Complex64_t, Am, An),
+                STARPU_R,   *(starpu_data_handle_t *)rows_handle->data_handle,
+                STARPU_R,   *(starpu_data_handle_t *)idxs->data_handle,
+                STARPU_PRIORITY, options->priority,
+                STARPU_CALLBACK, callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+                STARPU_NAME, "zswp_insert_rows",
+#endif
+                0);
+    }
+}
+
+static void cl_zswp_insert_rows2_cpu_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, *idxs, nb_pivots, global_MB;
+    CHAMELON_Complex64_t *A;
+    CHAMELON_Complex64_t *rows;
+
+    perm =               (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A    = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]);
+    idxs =               (int *) STARPU_VECTOR_GET_PTR(descr[3]);
+
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    CORE_zswp_insert_rows2(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, idxs, NB, MB, LDA);
+
+    return;
+}
+
+#ifdef CHAMELEON_USE_CUDA
+static void cl_zswp_insert_rows2_cuda_func(void *descr[], void *cl_arg)
+{
+    int K, SRC_M, DST_M, NB, MB, LDA;
+    int *perm, *idxs, nb_pivots, global_MB;
+    cuDoubleComplex *A;
+    cuDoubleComplex *rows;
+    CUstream stream;
+
+    perm =             (int *) STARPU_VECTOR_GET_PTR(descr[0]);
+    A    = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]);
+    rows = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]);
+    idxs =             (int *) STARPU_VECTOR_GET_PTR(descr[3]);
+
+    starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA);
+
+    stream = starpu_cuda_get_local_stream();
+
+    CUDA_zswp_insert_rows2(nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA, stream);
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+
+    return;
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS(zswp_insert_rows2, 4, cl_zswp_insert_rows2_cpu_func, cl_zswp_insert_rows2_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/control/runtime_cuda.c b/runtime/starpu/control/runtime_cuda.c
new file mode 100644
index 0000000000000000000000000000000000000000..e35850ed2e33677c55fad61a444f2753a7da2ebd
--- /dev/null
+++ b/runtime/starpu/control/runtime_cuda.c
@@ -0,0 +1,31 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_cuda.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Terry Cojean
+ * @date 2017-02-10
+ *
+ **/
+#include <stdio.h>
+#include <stdlib.h>
+#include "runtime/starpu/include/chameleon_starpu.h"
+
+cudaStream_t RUNTIME_cuda_get_local_stream()
+{
+  return starpu_cuda_get_local_stream();
+}
diff --git a/runtime/starpu/control/runtime_data_handle.c b/runtime/starpu/control/runtime_data_handle.c
new file mode 100644
index 0000000000000000000000000000000000000000..97f949afc3d3ff738c8c64a6a25527a3dd983b36
--- /dev/null
+++ b/runtime/starpu/control/runtime_data_handle.c
@@ -0,0 +1,139 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Inria. All rights reserved.
+ * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_data_handle.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @author Terry Cojean
+ * @date 2017-02-11
+ *
+ **/
+#include "runtime/starpu/include/chameleon_starpu.h"
+
+void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle)
+{
+    data_handle->data_handle = (starpu_data_handle_t*)malloc(sizeof(starpu_data_handle_t));
+    return;
+}
+
+void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle)
+{
+    free(data_handle->data_handle);
+    return;
+}
+
+void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t nx, size_t elemsize)
+{
+    starpu_vector_data_register((starpu_data_handle_t*)data_handle->data_handle, home_node, ptr, nx, elemsize);
+
+    data_handle->ptr = (void*)ptr;
+    return;
+}
+
+void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node,
+                                  uintptr_t ptr, uint32_t ld, uint32_t nx,
+                                  uint32_t ny, size_t elemsize)
+{
+    starpu_matrix_data_register((starpu_data_handle_t*)data_handle->data_handle, home_node, ptr, ld, nx, ny, elemsize);
+    data_handle->ptr = (void*)ptr;
+    return;
+}
+
+void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle)
+{
+    starpu_data_acquire(*(starpu_data_handle_t*)data_handle->data_handle, STARPU_R);
+}
+
+void RUNTIME_data_acquire_write(CHAMELON_data_handle_t *data_handle)
+{
+	starpu_data_acquire(*(starpu_data_handle_t*)data_handle->data_handle, STARPU_W);
+}
+
+void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle)
+{
+    starpu_data_release(*(starpu_data_handle_t*)data_handle->data_handle);
+}
+
+void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle)
+{
+    starpu_data_handle_t *handle = (starpu_data_handle_t*)data_handle->data_handle;
+    starpu_data_unregister(*handle);
+    return;
+}
+
+void RUNTIME_data_unregister_async(CHAMELON_data_handle_t* data_handle)
+{
+	starpu_data_handle_t *handle = (starpu_data_handle_t*)data_handle->data_handle;
+	if (handle != NULL)
+		starpu_data_unregister_submit(*handle);
+	return;
+}
+
+struct callback_args {
+	starpu_data_handle_t* handle;
+	void *ptr;
+};
+
+static
+void callback_free(void * arg)
+{
+	struct callback_args *data = (struct callback_args*)arg;
+	free(data->ptr);
+	starpu_data_release(*data->handle);
+	free(data);
+}
+
+void RUNTIME_data_unregister_free_async(CHAMELON_data_handle_t* data_handle)
+{
+	starpu_data_handle_t *handle = (starpu_data_handle_t*)data_handle->data_handle;
+	if (handle != NULL)
+	{
+		struct callback_args *args = malloc(sizeof(struct callback_args));
+		args->handle = handle;
+		args->ptr = data_handle->ptr;
+		starpu_data_acquire_cb(*handle, STARPU_W, callback_free, args);
+		starpu_data_unregister_submit(*handle);
+	}
+	return;
+}
+
+static
+void callback(void *arg)
+{
+    starpu_data_release(*((starpu_data_handle_t*)arg));
+}
+
+
+void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode)
+{
+    starpu_data_handle_t* handle = desc->schedopt;
+    int lmt = desc->lmt;
+    int lnt = desc->lnt;
+    int m, n;
+
+    for (n = 0; n < lnt; n++)
+        for (m = 0; m < lmt; m++)
+        {
+            if ( (*handle == NULL) ||
+                 !chameleon_desc_islocal( desc, m, n ) )
+            {
+                handle++;
+                continue;
+            }
+            starpu_data_acquire_cb(*handle, mode, callback, handle);
+            handle++;
+        }
+}
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index 385adc0df81a1a87de81971fe3590853b5214287..4a76381127d4039bf3be8ff6a347a9587f7a5d9b 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -403,6 +403,31 @@ void RUNTIME_desc_flush( const CHAM_desc_t        *desc,
     }
 }
 
+void RUNTIME_data_flush_handle(const CHAMELON_option_t *options,
+                               const CHAMELON_data_handle_t *handle)
+{
+	(void)options;
+	{
+		starpu_data_handle_t *ptr = (starpu_data_handle_t*)(handle->data_handle);
+
+		if (ptr != NULL && *ptr != NULL)
+		{
+#if defined(CHAMELEON_USE_MPI)
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, *ptr);
+#endif
+
+			/* Push data to main memory when we have time to */
+#ifdef HAVE_STARPU_DATA_WONT_USE
+			starpu_data_wont_use(*ptr);
+#elif defined HAVE_STARPU_IDLE_PREFETCH
+			starpu_data_acquire_on_node_cb(*ptr, -1, STARPU_R, data_flush, *ptr);
+#else
+			starpu_data_acquire_cb(*ptr, STARPU_R, data_release, *ptr);
+#endif
+		}
+	}
+}
+
 void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence,
                          const CHAM_desc_t *A, int m, int n )
 {
diff --git a/runtime/starpu/control/runtime_multithreading.c b/runtime/starpu/control/runtime_multithreading.c
new file mode 100644
index 0000000000000000000000000000000000000000..ddf159cf83b228a44cd85aa9d825f21dcd41534f
--- /dev/null
+++ b/runtime/starpu/control/runtime_multithreading.c
@@ -0,0 +1,130 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file runtime_options.c
+ *
+ *  CHAMELON auxiliary routines
+ *  CHAMELON is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @version 0.9.0
+ * @author Terry Cojean
+ * @date 2017-02-13
+ *
+ **/
+#include "runtime/starpu/include/chameleon_starpu.h"
+#include <starpu_sched_ctx.h>
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+void find_biggest_available_numa_node(int *ncores, int **cores)
+{
+    hwloc_topology_t topology;
+
+    int nNUMAnodes;
+    int *navail_cores;
+    int biggest_avail_numa = 0;
+    int biggest_avail_ncores = 0;
+    int biggest_avail_stride = 0;
+    int i, j, k, w, workers_offset=0;
+
+    for (w=STARPU_CUDA_WORKER ; w<STARPU_ANY_WORKER ; w++)
+        workers_offset += starpu_worker_get_count_by_type(w);
+
+    hwloc_topology_init(&topology);
+    hwloc_topology_load(topology);
+
+    nNUMAnodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_MACHINE);
+    navail_cores=(int*)malloc(sizeof(int)*nNUMAnodes);
+
+    for (i=0 ; i<nNUMAnodes ; i++) {
+        //printf("Numa node %d = ", i);
+        navail_cores[i]=0;
+        hwloc_obj_t NUMAnode = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, i);
+
+        int nPUs = hwloc_get_nbobjs_inside_cpuset_by_type(topology, NUMAnode->cpuset, HWLOC_OBJ_PU);
+
+        for (k=0 ; k<nPUs ; k++ ) {
+            hwloc_obj_t child = hwloc_get_obj_inside_cpuset_by_type(topology, NUMAnode->cpuset, HWLOC_OBJ_PU, k);
+
+            int found = 0;
+            for (w=STARPU_CUDA_WORKER ; w<STARPU_ANY_WORKER && !found; w++) {
+                int nworkers = starpu_worker_get_count_by_type(w);
+                int *workers = (int*) malloc(sizeof(int) * nworkers);
+
+                starpu_worker_get_ids_by_type(w, workers, nworkers);
+
+                for(j=0 ; j<nworkers && !found ; j++) {
+                    if (child->logical_index == starpu_worker_get_bindid(workers[j])){
+                        found = 1;
+                    }
+                }
+
+                free(workers);
+            }
+            if (!found)
+                navail_cores[i]++;
+        }
+
+        //I prefere taking the last numa node if the same number of PUs are available
+        if (navail_cores[i] >= biggest_avail_ncores) {
+            biggest_avail_numa = i;
+            biggest_avail_ncores = navail_cores[i];
+        }
+    }
+
+    hwloc_topology_destroy(topology);
+
+    //Compute stride for later
+    for (i=0 ; i<nNUMAnodes && i < biggest_avail_numa; i++)
+        biggest_avail_stride += navail_cores[i];
+
+    *ncores = MIN(biggest_avail_ncores, starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
+    (*cores) = (int*)malloc(biggest_avail_ncores*sizeof(int));
+
+    for (i=0 ; i<biggest_avail_ncores ; i++){
+        (*cores)[i] = biggest_avail_stride + workers_offset + i;
+        //printf("Workers_offset... %d ;; Parallel panel : worker[%d]=%d\n", workers_offset, i, (*cores)[i]);
+    }
+
+    free(navail_cores);
+}
+#else
+void find_biggest_available_numa_node(int *ncores, int **cores)
+{
+    return;
+}
+#endif
+
+
+void RUNTIME_unalloc_threads(CHAMELON_option_t *options)
+{
+#if (STARPU_MAJOR_VERSION == 1 && STARPU_MINOR_VERSION >= 2) || STARPU_MAJOR_VERSION > 1
+    starpu_sched_ctx_delete(options->panel_threads);
+    options->panel_threads = -1;
+#endif
+}
+
+void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores)
+{
+#if (STARPU_MAJOR_VERSION == 1 && STARPU_MINOR_VERSION >= 2) || STARPU_MAJOR_VERSION > 1
+    if (options->panel_threads < 0)
+    {
+        if (cores == NULL)
+            find_biggest_available_numa_node(&ncores, &cores);
+        options->panel_threads = starpu_sched_ctx_create(cores, ncores, "ctx_panel", STARPU_SCHED_CTX_AWAKE_WORKERS,  0);
+    }
+#endif
+}
+
diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c
index 6ade7b962cf17ec170c6e35465086b57583e7bdd..4f66c5ace3ea236ca771d005f6e75b70003057ae 100644
--- a/runtime/starpu/control/runtime_options.c
+++ b/runtime/starpu/control/runtime_options.c
@@ -36,6 +36,10 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->ws_hsize  = 0;
     options->ws_worker = NULL;
     options->ws_host   = NULL;
+
+    options->swap_grain = CHAMELON_SWAP_TYPE;
+    options->dsd_matrix = CHAMELON_DSDMATRIX == CHAMELON_TRUE;
+    options->panel_threads = -1;
     return;
 }
 
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 673d8c5d7a6ea287ed2fa4e948c6d8820f35a00a..c2ec382d601bd693b459580002642ff14420d7ff 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -76,6 +76,8 @@ CODELETS_HEADER(zgessq);
 CODELETS_HEADER(zgetrf);
 CODELETS_HEADER(zgetrf_incpiv);
 CODELETS_HEADER(zgetrf_nopiv);
+ZCODELETS_HEADER(getrf_rectil);
+ZCODELETS_HEADER(getrf_sp);
 CODELETS_HEADER(zherfb);
 CODELETS_HEADER(zlauum);
 CODELETS_HEADER(zpotrf);
@@ -113,6 +115,17 @@ CODELETS_HEADER(zlatro);
 CODELETS_HEADER(zplssq);
 CODELETS_HEADER(zplssq2);
 
+ZCODELETS_HEADER(getrf_compare_workspace);
+ZCODELETS_HEADER(getrf_copy_workspace);
+ZCODELETS_HEADER(getrf_extract_max);
+ZCODELETS_HEADER(getrf_save_pivots);
+ZCODELETS_HEADER(swp_extract_rows);
+ZCODELETS_HEADER(swp_insert_rows);
+ZCODELETS_HEADER(swp_extract_rows2);
+ZCODELETS_HEADER(swp_insert_rows2);
+ZCODELETS_HEADER(swpp);
+ZCODELETS_HEADER(swptr_ontile);
+
 /*
  * DZ functions
  */
diff --git a/timing/time_zgetrf_pp_tile.c b/timing/time_zgetrf_pp_tile.c
new file mode 100644
index 0000000000000000000000000000000000000000..978ab145b2a38b987532e94dc920faaa524e9dd2
--- /dev/null
+++ b/timing/time_zgetrf_pp_tile.c
@@ -0,0 +1,96 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#define _TYPE  CHAMELON_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+#define _NAME  "CHAMELON_zgetrf_pp_Tile"
+/* See Lawn 41 page 120 */
+#define _FMULS FMULS_GETRF(M, N)
+#define _FADDS FADDS_GETRF(M, N)
+
+#include "./timing.c"
+
+static int
+RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
+{
+    PASTE_CODE_IPARAM_LOCALS( iparam );
+
+    if ( M != N && check ) {
+        fprintf(stderr, "Check cannot be perfomed with M != N\n");
+        check = 0;
+    }
+
+    /* Allocate Data */
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELON_Complex64_t, ChamComplexDouble, LDA, M, N );
+    PASTE_CODE_ALLOCATE_MATRIX( piv, 1, int, chameleon_max(M, N), 1 );
+
+    CHAMELON_zplrnt_Tile(descA, 3456);
+
+    /* Save A in lapack layout for check */
+    PASTE_TILE_TO_LAPACK( descA, A2, check, CHAMELON_Complex64_t, LDA, N );
+    PASTE_CODE_ALLOCATE_MATRIX( ipiv2, check, int, chameleon_max(M, N), 1 );
+
+    /* Save AT in lapack layout for check */
+    if ( check ) {
+        LAPACKE_zgetrf_work(LAPACK_COL_MAJOR, M, N, A2, LDA, ipiv2 );
+    }
+
+    START_TIMING();
+    CHAMELON_zgetrf_pp_Tile( descA, piv);
+    STOP_TIMING();
+
+    if ( check )
+    {
+        int64_t i;
+        double *work = (double *)malloc(chameleon_max(M, N)*sizeof(double));
+        PASTE_TILE_TO_LAPACK( descA, A, 1, CHAMELON_Complex64_t, LDA, chameleon_min(M,N) );
+
+        /* Check ipiv */
+        for(i=0; i<chameleon_min(M,N); i++)
+        {
+            if( piv[i] != ipiv2[i] ) {
+                fprintf(stdout, "\nCHAMELEON (piv[%ld] = %d, A[%ld] = %e) / LAPACK (piv[%ld] = %d, A[%ld] = [%e])\n",
+                        i, piv[i],  i, creal(A[  i * LDA + i ]),
+                        i, ipiv2[i], i, creal(A2[ i * LDA + i ]));
+                break;
+            }
+        }
+
+        dparam[IPARAM_ANORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR,
+                                                   chameleon_lapack_const(ChamMaxNorm),
+                                                   M, N, A, LDA, work);
+        dparam[IPARAM_XNORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR,
+                                                   chameleon_lapack_const(ChamMaxNorm),
+                                                   M, N, A2, LDA, work);
+        dparam[IPARAM_BNORM] = 0.0;
+
+        CORE_zgeadd( ChamNoTrans, M, N, -1.0, A, LDA, 1.0, A2, LDA);
+
+        dparam[IPARAM_RES] = LAPACKE_zlange_work(LAPACK_COL_MAJOR,
+                                                 chameleon_lapack_const(ChamMaxNorm),
+                                                 M, N, A2, LDA, work);
+
+        free( A );
+        free( A2 );
+        free( ipiv2 );
+        free( work );
+    }
+
+    PASTE_CODE_FREE_MATRIX(descA);
+    free(piv);
+    return 0;
+}
diff --git a/timing/time_zgetrf_sp_tile.c b/timing/time_zgetrf_sp_tile.c
new file mode 100644
index 0000000000000000000000000000000000000000..0ffde6adc514af039dd608dc49b9284e81950bdc
--- /dev/null
+++ b/timing/time_zgetrf_sp_tile.c
@@ -0,0 +1,84 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Inria. All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#define _TYPE  CHAMELON_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+#define _NAME  "CHAMELON_zgetrf_Tile"
+/* See Lawn 41 page 120 */
+#define _FMULS FMULS_GETRF(M, N)
+#define _FADDS FADDS_GETRF(M, N)
+
+#include "./timing.c"
+
+static int
+RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
+{
+    double criteria;
+
+    PASTE_CODE_IPARAM_LOCALS( iparam );
+
+    if ( M != N && check ) {
+        fprintf(stderr, "Check cannot be perfomed with M != N\n");
+        check = 0;
+    }
+
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELON_Complex64_t, ChamComplexDouble, LDA, M, N );
+    CHAMELON_zplrnt_Tile(descA, 3456);
+
+    /* Allocate Data */
+    double eps = LAPACKE_dlamch_work('e');
+    double anorm = CHAMELON_zlange_Tile(ChamInfNorm, descA);
+    criteria = eps*anorm;
+
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descX,  check, CHAMELON_Complex64_t, ChamComplexDouble, LDB, M, NRHS );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, check, CHAMELON_Complex64_t, ChamComplexDouble, LDA, M, N    );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descB,  check, CHAMELON_Complex64_t, ChamComplexDouble, LDB, M, NRHS );
+
+
+    /* Save A for check */
+    if (check == 1){
+        CHAMELON_zlacpy_Tile(ChamUpperLower, descA, descAC);
+    }
+
+    START_TIMING();
+    CHAMELON_zgetrf_sp_Tile( descA, criteria );
+    STOP_TIMING();
+
+    /* Check the solution */
+    if ( check )
+    {
+        /* TODO: correct the check ? */
+        CHAMELON_zplrnt_Tile( descX, 7732 );
+        CHAMELON_zlacpy_Tile(ChamUpperLower, descX, descB);
+
+        CHAMELON_zgetrs_nopiv_Tile( descA, descX );
+
+        dparam[IPARAM_ANORM] = CHAMELON_zlange_Tile(ChamInfNorm, descAC);
+        dparam[IPARAM_BNORM] = CHAMELON_zlange_Tile(ChamInfNorm, descB);
+        dparam[IPARAM_XNORM] = CHAMELON_zlange_Tile(ChamInfNorm, descX);
+        CHAMELON_zgemm_Tile( ChamNoTrans, ChamNoTrans, 1.0, descAC, descX, -1.0, descB );
+        dparam[IPARAM_RES] = CHAMELON_zlange_Tile(ChamInfNorm, descB);
+        PASTE_CODE_FREE_MATRIX( descX  );
+        PASTE_CODE_FREE_MATRIX( descAC );
+        PASTE_CODE_FREE_MATRIX( descB  );
+    }
+
+    PASTE_CODE_FREE_MATRIX( descA );
+
+
+    return 0;
+}
diff --git a/timing/timing.c b/timing/timing.c
new file mode 100644
index 0000000000000000000000000000000000000000..3ac4ecf95997029c903ec3f04284bf98c664b1b2
--- /dev/null
+++ b/timing/timing.c
@@ -0,0 +1,773 @@
+/**
+ *
+ * @file timing.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon auxiliary routines
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @author Raphael Boucherie
+ * @author Dulceneia Becker
+ * @author Cedric Castagnede
+ * @date 2010-11-15
+ *
+ */
+#if defined( _WIN32 ) || defined( _WIN64 )
+#define int64_t __int64
+#endif
+
+/* Define these so that the Microsoft VC compiler stops complaining
+   about scanf and friends */
+#define _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_WARNINGS
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if defined( _WIN32 ) || defined( _WIN64 )
+#include <windows.h>
+#else  /* Non-Windows */
+#include <unistd.h>
+#include <sys/resource.h>
+#endif
+
+#include <chameleon.h>
+#if !defined(CHAMELEON_SIMULATION)
+#include <coreblas/lapacke.h>
+#include <coreblas.h>
+#endif
+#include "flops.h"
+#include "timing.h"
+#include "control/auxiliary.h"
+
+#if defined(CHAMELEON_USE_MPI)
+#include <mpi.h>
+#endif /* defined(CHAMELEON_USE_MPI */
+
+#if defined(CHAMELEON_SCHED_STARPU)
+#include <starpu.h>
+#endif /* defined(CHAMELEON_SCHED_STARPU) */
+
+
+#if defined(CHAMELEON_HAVE_GETOPT_H)
+#include <getopt.h>
+#endif /* defined(CHAMELEON_HAVE_GETOPT_H) */
+
+static int RunTest(int *iparam, _PREC *dparam, double *t_);
+static inline void* chameleon_getaddr_null(const CHAMELON_desc_t *A, int m, int n)
+{
+    (void)A;(void)m;(void)n;
+    return (void*)( NULL );
+}
+
+int ISEED[4] = {0,0,0,1};   /* initial seed for zlarnv() */
+
+static int
+Test(int64_t n, int *iparam) {
+    int      i, j, iter;
+    int      thrdnbr, niter;
+    int64_t  M, N, K, NRHS;
+    double  *t;
+#if defined(CHAMELEON_SIMULATION)
+    _PREC    eps = 0.;
+#else
+    _PREC    eps = _LAMCH( 'e' );
+#endif
+    _PREC    dparam[IPARAM_DNBPARAM];
+    double   fmuls, fadds, fp_per_mul, fp_per_add;
+    double   sumgf, sumgf2, sumt, sd, flops, gflops;
+    char    *s;
+    char    *env[] = {
+        "OMP_NUM_THREADS",
+        "MKL_NUM_THREADS",
+        "GOTO_NUM_THREADS",
+        "ACML_NUM_THREADS",
+        "ATLAS_NUM_THREADS",
+        "BLAS_NUM_THREADS", ""
+    };
+    int gnuplot = 0;
+
+/*
+ * if hres = 0 then the test succeed
+ * if hres = n then the test failed n times
+ */
+    int hres = 0;
+
+    memset( &dparam, 0, IPARAM_DNBPARAM * sizeof(_PREC) );
+    dparam[IPARAM_THRESHOLD_CHECK] = 100.0;
+
+    thrdnbr = iparam[IPARAM_THRDNBR];
+    niter   = iparam[IPARAM_NITER];
+
+    M    = iparam[IPARAM_M];
+    N    = iparam[IPARAM_N];
+    K    = iparam[IPARAM_K];
+    NRHS = K;
+    (void)M;(void)N;(void)K;(void)NRHS;
+
+    if ( (n < 0) || (thrdnbr < 0 ) ) {
+        if (gnuplot && (CHAMELON_My_Mpi_Rank() == 0) ) {
+            printf( "set title '%d_NUM_THREADS: ", thrdnbr );
+            for (i = 0; env[i][0]; ++i) {
+                s = getenv( env[i] );
+
+                if (i) printf( " " ); /* separating space */
+
+                for (j = 0; j < 5 && env[i][j] && env[i][j] != '_'; ++j)
+                    printf( "%c", env[i][j] );
+
+                if (s)
+                    printf( "=%s", s );
+                else
+                    printf( "->%s", "?" );
+            }
+            printf( "'\n" );
+            printf( "%s\n%s\n%s\n%s\n%s%s%s\n",
+                    "set xlabel 'Matrix size'",
+                    "set ylabel 'Gflop/s'",
+                    "set key bottom",
+                    gnuplot > 1 ? "set terminal png giant\nset output 'timeplot.png'" : "",
+                    "plot '-' using 1:5 title '", _NAME, "' with linespoints" );
+        }
+        return 0;
+    }
+
+    if ( CHAMELON_My_Mpi_Rank() == 0)
+        printf( "%7d %7d %7d ", iparam[IPARAM_M], iparam[IPARAM_N], iparam[IPARAM_K] );
+    fflush( stdout );
+
+    t = (double*)malloc(niter*sizeof(double));
+    memset(t, 0, niter*sizeof(double));
+
+    if (sizeof(_TYPE) == sizeof(_PREC)) {
+        fp_per_mul = 1;
+        fp_per_add = 1;
+    } else {
+        fp_per_mul = 6;
+        fp_per_add = 2;
+    }
+
+    fadds = (double)(_FADDS);
+    fmuls = (double)(_FMULS);
+    flops = 1e-9 * (fmuls * fp_per_mul + fadds * fp_per_add);
+
+    if ( iparam[IPARAM_WARMUP] ) {
+      int status = RunTest( iparam, dparam, &(t[0]));
+      if (status != CHAMELON_SUCCESS) {
+          free(t);
+          return status;
+      }
+    }
+
+    sumgf  = 0.0;
+    double sumgf_upper  = 0.0;
+    sumgf2 = 0.0;
+    sumt   = 0.0;
+
+    for (iter = 0; iter < niter; iter++)
+    {
+        if( iter == 0 ) {
+          if ( iparam[IPARAM_TRACE] )
+            iparam[IPARAM_TRACE] = 2;
+          if ( iparam[IPARAM_DAG] )
+            iparam[IPARAM_DAG] = 2;
+          if ( iparam[IPARAM_PROFILE] )
+            iparam[IPARAM_PROFILE] = 2;
+
+          int status = RunTest( iparam, dparam, &(t[iter]));
+          if (status != CHAMELON_SUCCESS) return status;
+
+          iparam[IPARAM_TRACE] = 0;
+          iparam[IPARAM_DAG] = 0;
+          iparam[IPARAM_PROFILE] = 0;
+        }
+        else {
+          int status = RunTest( iparam, dparam, &(t[iter]));
+          if (status != CHAMELON_SUCCESS) return status;
+        }
+        gflops = flops / t[iter];
+
+#if defined (CHAMELEON_SCHED_STARPU)
+        /* TODO: create chameleon interface encapsulating this instead */
+        if (iparam[IPARAM_BOUND])
+        {
+            double upper_gflops = 0.0;
+            double tmin = 0.0;
+            double integer_tmin = 0.0;
+            starpu_bound_compute(&tmin, &integer_tmin, 0);
+            upper_gflops  = (flops / (tmin / 1000.0));
+            sumgf_upper += upper_gflops;
+        }
+#endif
+        sumt   += t[iter];
+        sumgf  += gflops;
+        sumgf2 += gflops*gflops;
+    }
+
+    gflops = sumgf / niter;
+    sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter);
+
+    if ( CHAMELON_My_Mpi_Rank() == 0) {
+        printf( "%9.3f %9.2f +-%7.2f  ", sumt/niter, gflops, sd);
+
+        if (iparam[IPARAM_BOUND]) {
+            printf(" %9.2f",  sumgf_upper/niter);
+        }
+
+        if ( iparam[IPARAM_CHECK] ){
+            hres = ( dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ) > dparam[IPARAM_THRESHOLD_CHECK] );
+
+            if (hres) {
+                printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e FAILURE",
+                    dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM],
+                    dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ));
+            }
+            else {
+                printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e SUCCESS",
+                    dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM],
+                    dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ));
+            }
+        }
+
+        if ( iparam[IPARAM_INVERSE] ) {
+            printf( " %8.5e %8.5e %8.5e     %8.5e",
+                    dparam[IPARAM_RNORM], dparam[IPARAM_ANORM], dparam[IPARAM_AinvNORM],
+                    dparam[IPARAM_RNORM] /((dparam[IPARAM_ANORM] * dparam[IPARAM_AinvNORM])*n*eps));
+        }
+
+        printf("\n");
+
+        fflush( stdout );
+    }
+    free(t);
+
+    return hres;
+}
+
+static inline int
+startswith(const char *s, const char *prefix) {
+    size_t n = strlen( prefix );
+    if (strncmp( s, prefix, n ))
+        return 0;
+    return 1;
+}
+
+static int
+get_range(char *range, int *start_p, int *stop_p, int *step_p) {
+    char *s, *s1, buf[21];
+    int colon_count, copy_len, nbuf=20, n;
+    int start=1000, stop=10000, step=1000;
+
+    colon_count = 0;
+    for (s = strchr( range, ':'); s; s = strchr( s+1, ':'))
+        colon_count++;
+
+    if (colon_count == 0) { /* No colon in range. */
+        if (sscanf( range, "%d", &start ) < 1 || start < 1)
+            return -1;
+        step = start / 10;
+        if (step < 1) step = 1;
+        stop = start + 10 * step;
+
+    } else if (colon_count == 1) { /* One colon in range.*/
+        /* First, get the second number (after colon): the stop value. */
+        s = strchr( range, ':' );
+        if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1)
+            return -1;
+
+        /* Next, get the first number (before colon): the start value. */
+        n = s - range;
+        copy_len = n > nbuf ? nbuf : n;
+        strncpy( buf, range, copy_len );
+        buf[copy_len] = 0;
+        if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1)
+            return -1;
+
+        /* Let's have 10 steps or less. */
+        step = (stop - start) / 10;
+        if (step < 1)
+            step = 1;
+    } else if (colon_count == 2) { /* Two colons in range. */
+        /* First, get the first number (before the first colon): the start value. */
+        s = strchr( range, ':' );
+        n = s - range;
+        copy_len = n > nbuf ? nbuf : n;
+        strncpy( buf, range, copy_len );
+        buf[copy_len] = 0;
+        if (sscanf( buf, "%d", &start ) < 1 || start < 1)
+            return -1;
+
+        /* Next, get the second number (after the first colon): the stop value. */
+        s1 = strchr( s+1, ':' );
+        n = s1 - (s + 1);
+        copy_len = n > nbuf ? nbuf : n;
+        strncpy( buf, s+1, copy_len );
+        buf[copy_len] = 0;
+        if (sscanf( buf, "%d", &stop ) < 1 || stop < start)
+            return -1;
+
+        /* Finally, get the third number (after the second colon): the step value. */
+        if (sscanf( s1+1, "%d", &step ) < 1 || step < 1)
+            return -1;
+    } else
+
+        return -1;
+
+    *start_p = start;
+    *stop_p = stop;
+    *step_p = step;
+
+    return 0;
+}
+
+static void
+show_help(char *prog_name) {
+    printf( "Usage:\n%s [options]\n\n", prog_name );
+    printf( "Options are:\n"
+            "  -h  --help               Show this help\n"
+            "\n"
+            "  Machine parameters:\n"
+            "    -t, --threads=x        Number of CPU workers (default: automatic detection through runtime)\n"
+            "    -g, --gpus=x           Number of GPU workers (default: 0)\n"
+            "    -P, --P=x              Rows (P) in the PxQ process grid (deafult: 1)\n"
+            "        --nocpu            All GPU kernels are exclusively executed on GPUs (default: 0)\n"
+            "\n"
+            "  Matrix parameters:\n"
+            "    -m, --m, --M=x         Dimension (M) of the matrices (default: N)\n"
+            "    -n, --n, --N=x         Dimension (N) of the matrices\n"
+            "    -N, --n_range=R        Range of N values\n"
+            "                           with R=Start:Stop:Step (default: 500:5000:500)\n"
+            "    -k, --k, --K, --nrhs=x Dimension (K) of the matrices or number of right-hand size (default: 1)\n"
+            "    -b, --nb=x             NB size. (default: 320)\n"
+            "    -i, --ib=x             IB size. (default: 32)\n"
+            //"    -x, --mx=x             ?\n" todo
+            //"    -X, --nx=x             ?\n" todo
+            "\n"
+            "  Check/prints:\n"
+            "        --niter=x          Number of iterations performed for each test (default: 1)\n"
+            "    -W, --nowarnings       Do not show warnings\n"
+            "    -w, --nowarmup         Cancel the warmup run to pre-load libraries\n"
+            "    -c, --check            Check result\n"
+            "    -C, --inv              Check on inverse\n"
+            "        --mode=x           Change xLATMS matrix mode generation for SVD/EVD (default: 4)\n"
+            "                           Must be between 0 and 20 included\n"
+            "\n"
+            "  Profiling:\n"
+            "    -T, --trace            Enable trace generation\n"
+            "        --progress         Display progress indicator\n"
+            "    -d, --dag              Enable DAG generation\n"
+            "                           Generates a dot_dag_file.dot.\n"
+            "    -p, --profile          Print profiling informations\n"
+            "\n"
+            "  HQR options:\n"
+            "    -a, --qr_a, --rhblk=N  If N > 0, enable Householder mode for QR and LQ factorization\n"
+            "                           N is the size of each subdomain (default: -1)\n"
+            "    -l, --llvl=x           Tree used for low level reduction inside nodes (default: -1)\n"
+            "    -L, --hlvl=x           Tree used for high level reduction between nodes, only if P > 1 (default: -1).\n"
+            "                           (-1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy)\n"
+            "    -D, --domino           Enable the domino between upper and lower trees.\n"
+            "\n"
+            "  Advanced options\n"
+            "        --nobigmat         Disable single large matrix allocation for multiple tiled allocations\n"
+            "    -s, --sync             Enable synchronous calls in wrapper function such as POTRI\n"
+            "    -o, --ooc              Enable out-of-core (available only with StarPU)\n"
+            "    -G, --gemm3m           Use gemm3m complex method\n"
+            "        --bound            Compare result to area bound\n"
+            "        --dsdmatrix      Makes use of a dsdmatrix notably for GETRF\n"
+            "        --[no|line|tile|column]swaps Change the type of swap which are used\n"
+            "        --rectil_panel   Makes use of the rectil panel for GETRF_PP\n"
+            "        --nbpanel_threads=N Number of threads to use for the panel of GETRF_PP (default is one NUMA NODE)\n"
+            "\n");
+}
+
+
+static void
+print_header(char *prog_name, int * iparam) {
+    const char *bound_header   = iparam[IPARAM_BOUND]   ? "   thGflop/s" : "";
+    const char *check_header   = iparam[IPARAM_CHECK]   ? "     ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN" : "";
+    const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv||       ||A||    ||Ainv||       ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : "";
+#if defined(CHAMELEON_SIMULATION)
+    _PREC    eps = 0.;
+#else
+    _PREC    eps = _LAMCH( 'e' );
+#endif
+
+    printf( "#\n"
+            "# CHAMELEON %d.%d.%d, %s\n"
+            "# Nb threads: %d\n"
+            "# Nb GPUs:    %d\n"
+#if defined(CHAMELEON_USE_MPI)
+            "# Nb mpi:     %d\n"
+            "# PxQ:        %dx%d\n"
+#endif
+            "# NB:         %d\n"
+            "# IB:         %d\n"
+            "# eps:        %e\n"
+            "#\n",
+            CHAMELEON_VERSION_MAJOR,
+            CHAMELEON_VERSION_MINOR,
+            CHAMELEON_VERSION_MICRO,
+            prog_name,
+            iparam[IPARAM_THRDNBR],
+            iparam[IPARAM_NCUDAS],
+#if defined(CHAMELEON_USE_MPI)
+            iparam[IPARAM_NMPI],
+            iparam[IPARAM_P], iparam[IPARAM_Q],
+#endif
+            iparam[IPARAM_NB],
+            iparam[IPARAM_IB],
+            eps );
+
+    printf( "#     M       N  K/NRHS   seconds   Gflop/s Deviation%s%s\n",
+            bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
+    return;
+}
+
+#define GETOPT_STRING "ht:g:P:8M:m:N:n:K:k:b:i:x:X:1:WwcCT2dpa:l:L:D9:3soG45"
+#if defined(CHAMELEON_HAVE_GETOPT_LONG)
+static struct option long_options[] =
+{
+    {"help",            no_argument,       0,      'h'},
+    // Configuration
+    {"threads",         required_argument, 0,      't'},
+    {"gpus",            required_argument, 0,      'g'},
+    {"P",               required_argument, 0,      'P'},
+    {"nocpu",           no_argument,       0,      '8'},
+    // Matrix parameters
+    {"M",               required_argument, 0,      'm'},
+    {"m",               required_argument, 0,      'm'},
+    {"N",               required_argument, 0,      'n'},
+    {"n",               required_argument, 0,      'n'},
+    {"n_range",         required_argument, 0,      'N'},
+    {"K",               required_argument, 0,      'K'},
+    {"k",               required_argument, 0,      'k'},
+    {"nrhs",            required_argument, 0,      'k'},
+    {"nb",              required_argument, 0,      'b'},
+    {"ib",              required_argument, 0,      'i'},
+    {"mx",              required_argument, 0,      'x'},
+    {"nx",              required_argument, 0,      'X'},
+    // Check/prints
+    {"niter",           required_argument, 0,      '1'},
+    {"nowarnings",      no_argument,       0,      'W'},
+    {"nowarmup",        no_argument,       0,      'w'},
+    {"check",           no_argument,       0,      'c'},
+    {"inv",             no_argument,       0,      'C'},
+    // Profiling
+    {"trace",           no_argument,       0,      'T'},
+    {"progress",        no_argument,       0,      '2'},
+    {"dag",             no_argument,       0,      'd'},
+    {"profile",         no_argument,       0,      'p'},
+    // HQR options
+    {"rhblk",           required_argument, 0,      'a'},
+    {"qr_a",            required_argument, 0,      'a'},
+    {"llvl",            required_argument, 0,      'l'},
+    {"hlvl",            required_argument, 0,      'L'},
+    {"domino",          no_argument,       0,      'D'},
+    // Other
+    {"mode",            required_argument, 0,      '9'},
+    {"nobigmat",        no_argument,       0,      '3'},
+    {"sync",            no_argument,       0,      's'},
+    {"ooc",             no_argument,       0,      'o'},
+    {"gemm3m",          no_argument,       0,      'G'},
+    {"bound",           no_argument,       0,      '5'},
+    {"dsdmatrix",       no_argument,       0,      'S'},
+    {"noswaps",         no_argument,       0,      '0'},
+    {"columnswaps",     no_argument,       0,      '4'},
+    {"tileswaps",       no_argument,       0,      '6'},
+    {"lineswaps",       no_argument,       0,      '7'},
+    {"rectil_panel",    no_argument,       0,      'z'},
+    {"nbpanel_threads", required_argument, 0,      'Z'},
+    {0, 0, 0, 0}
+};
+#endif  /* defined(CHAMELEON_HAVE_GETOPT_LONG) */
+
+static void
+set_iparam_default(int *iparam){
+
+    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
+
+    iparam[IPARAM_THRDNBR          ] = -1;
+    iparam[IPARAM_THRDNBR_SUBGRP   ] = 1;
+    iparam[IPARAM_M                ] = -1;
+    iparam[IPARAM_N                ] = -1;
+    iparam[IPARAM_K                ] = 1;
+    iparam[IPARAM_LDA              ] = -1;
+    iparam[IPARAM_LDB              ] = -1;
+    iparam[IPARAM_LDC              ] = -1;
+    iparam[IPARAM_MB               ] = 320;
+    iparam[IPARAM_NB               ] = 320;
+    iparam[IPARAM_IB               ] = 32;
+    iparam[IPARAM_NITER            ] = 1;
+    iparam[IPARAM_WARMUP           ] = 1;
+    iparam[IPARAM_BIGMAT           ] = 1;
+    iparam[IPARAM_ASYNC            ] = 1;
+    iparam[IPARAM_MX               ] = -1;
+    iparam[IPARAM_NX               ] = -1;
+    iparam[IPARAM_MX               ] = -1;
+    iparam[IPARAM_NX               ] = -1;
+    iparam[IPARAM_INPLACE          ] = CHAMELON_OUTOFPLACE;
+    iparam[IPARAM_NMPI             ] = 1;
+    iparam[IPARAM_P                ] = 1;
+    iparam[IPARAM_Q                ] = 1;
+    iparam[IPARAM_PRINT_WARNINGS   ] = 1;
+    iparam[IPARAM_LOWLVL_TREE      ] = -1;
+    iparam[IPARAM_HIGHLVL_TREE     ] = -1;
+    iparam[IPARAM_QR_TS_SZE        ] = -1;
+    iparam[IPARAM_QR_HLVL_SZE      ] = -1;
+    iparam[IPARAM_QR_DOMINO        ] = -1;
+    iparam[IPARAM_SWAP_TYPE        ] = CHAMELON_NO_SWAPS;
+    iparam[IPARAM_DSDMATRIX_MODE   ] = 0;
+    iparam[IPARAM_RECTIL_PANEL_MODE] = 0;
+    iparam[IPARAM_NBPANEL_THREADS  ] = -1;
+}
+
+static inline int
+read_integer_from_options(int long_index, int opt_char)
+{
+    char *endptr;
+    long int value;
+    (void) long_index;
+
+    value = strtol(optarg, &endptr, 10);
+    if ( *optarg == '\0' || *endptr != '\0' ) {
+#ifdef CHAMELEON_HAVE_GETOPT_LONG
+        if ( long_index < 0 ) {
+#endif
+            fprintf(stderr, "Invalid numeric value '%s' for '-%c' parameter\n", optarg, opt_char);
+#ifdef CHAMELEON_HAVE_GETOPT_LONG
+        } else {
+            fprintf(stderr, "Invalid numeric value '%s' for '--%s' parameter\n", optarg, long_options[long_index].name);
+        }
+#endif
+        exit(EXIT_FAILURE);
+    }
+    if ( value > INT_MAX || value < INT_MIN ) {
+        fprintf(stderr, "Out of range integer '%ld'\n", value);
+        exit(EXIT_FAILURE);
+    }
+    return (int)value;
+}
+
+void
+parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, int*step)
+{
+    int opt = -1;
+    int c;
+    int argc = *_argc;
+    char **argv = *_argv;
+
+    optind = 1;
+    do {
+#if defined(CHAMELEON_HAVE_GETOPT_LONG)
+        opt = -1;
+        c = getopt_long(argc, argv, GETOPT_STRING,
+                             long_options, &opt);
+#else
+        c = getopt(argc, argv, GETOPT_STRING);
+        (void) opt;
+#endif  /* defined(CHAMELEON_HAVE_GETOPT_LONG) */
+
+         switch(c)
+        {
+        // Configuration
+        case 't' : iparam[IPARAM_THRDNBR          ] = read_integer_from_options(opt, c); break;
+        case 'g' : iparam[IPARAM_NCUDAS           ] = read_integer_from_options(opt, c); break;
+        case 'P' : iparam[IPARAM_P                ] = read_integer_from_options(opt, c); break;
+        case '8' : iparam[IPARAM_NO_CPU           ] = 1; break;
+        // Matrix parameters
+        case 'M' :
+        case 'm' : iparam[IPARAM_M                ] = read_integer_from_options(opt, c); break;
+        case 'n' : iparam[IPARAM_N                ] = read_integer_from_options(opt, c); break;
+        case 'N' : get_range(optarg, start, stop, step); break;
+        case 'K' :
+        case 'k' : iparam[IPARAM_K                ] = read_integer_from_options(opt, c); break;
+        case 'b' : iparam[IPARAM_NB               ] = read_integer_from_options(opt, c);
+                   iparam[IPARAM_MB               ] = read_integer_from_options(opt, c); break;
+        case 'i' : iparam[IPARAM_IB               ] = read_integer_from_options(opt, c); break;
+        case 'x' : iparam[IPARAM_MX               ] = read_integer_from_options(opt, c); break;
+        case 'X' : iparam[IPARAM_NX               ] = read_integer_from_options(opt, c); break;
+        // Check/prints
+        case '1' : iparam[IPARAM_NITER            ] = read_integer_from_options(opt, c); break;
+        case 'W' : iparam[IPARAM_PRINT_WARNINGS   ] = 0; break;
+        case 'w' : iparam[IPARAM_WARMUP           ] = 0; break;
+        case 'c' : iparam[IPARAM_CHECK            ] = 1; break;
+        case 'C' : iparam[IPARAM_INVERSE          ] = 1; break;
+        // Profiling
+        case 'T' : iparam[IPARAM_TRACE            ] = 1; break;
+        case '2' : iparam[IPARAM_PROGRESS         ] = 1; break;
+        case 'd' : iparam[IPARAM_DAG              ] = 1; break;
+        case 'p' : iparam[IPARAM_PROFILE          ] = 1; break;
+        // HQR options
+        case 'a' : iparam[IPARAM_RHBLK            ] = read_integer_from_options(opt, c); break;
+        case 'l' : iparam[IPARAM_LOWLVL_TREE      ] = read_integer_from_options(opt, c); break;
+        case 'L' : iparam[IPARAM_HIGHLVL_TREE     ] = read_integer_from_options(opt, c); break;
+        case 'D' : iparam[IPARAM_QR_DOMINO        ] = 1; break;
+        //Other
+        case '9' : iparam[IPARAM_MODE             ] = read_integer_from_options(opt, c); break;
+        case '3' : iparam[IPARAM_BIGMAT           ] = 0; break;
+        case 's' : iparam[IPARAM_ASYNC            ] = 0; break;
+        case 'o' : iparam[IPARAM_OOC              ] = 1; break;
+        case 'G' : iparam[IPARAM_GEMM3M           ] = 1; break;
+        case '5' : iparam[IPARAM_BOUND            ] = 1; break;
+        case 'S' : iparam[IPARAM_DSDMATRIX_MODE   ] = 1; break;
+        case '0' : iparam[IPARAM_SWAP_TYPE        ] = CHAMELON_NO_SWAPS; break;
+        case '4' : iparam[IPARAM_SWAP_TYPE        ] = CHAMELON_COLUMN_SWAPS; break;
+        case '6' : iparam[IPARAM_SWAP_TYPE        ] = CHAMELON_TILE_SWAPS; break;
+        case '7' : iparam[IPARAM_SWAP_TYPE        ] = CHAMELON_LINE_SWAPS; break;
+        case 'z' : iparam[IPARAM_RECTIL_PANEL_MODE] = 1; break;
+        case 'Z' : iparam[IPARAM_NBPANEL_THREADS  ] = read_integer_from_options(opt, c); break;
+        case 'h' :
+        case '?' :
+            show_help(argv[0]); exit(EXIT_FAILURE);
+        default:
+            break;
+        }
+    } while(-1 != c);
+}
+
+int
+main(int argc, char *argv[]) {
+    int i, m, n, mx, nx;
+    int status;
+    int nbnode = 1;
+    int start =  500;
+    int stop  = 5000;
+    int step  =  500;
+    int iparam[IPARAM_SIZEOF];
+    int success = 0;
+
+    set_iparam_default(iparam);
+
+    parse_arguments(&argc, &argv, iparam, &start, &stop, &step);
+
+#if !defined(CHAMELEON_USE_CUDA)
+    if (iparam[IPARAM_NCUDAS] != 0){
+    	fprintf(stderr, "ERROR: CHAMELEON_USE_CUDA is not defined. "
+    			"The number of CUDA devices must be set to 0 (--gpus=0).\n");
+    	return EXIT_FAILURE;
+    }
+#endif
+
+    n  = iparam[IPARAM_N];
+    m  = iparam[IPARAM_M];
+    mx = iparam[IPARAM_MX];
+    nx = iparam[IPARAM_NX];
+
+    /* Initialize CHAMELON */
+    CHAMELON_Init( iparam[IPARAM_THRDNBR],
+                iparam[IPARAM_NCUDAS] );
+
+    /* Get the number of threads set by the runtime */
+    iparam[IPARAM_THRDNBR] = CHAMELON_GetThreadNbr();
+
+    /* Stops profiling here to avoid profiling uninteresting routines.
+       It will be reactivated in the time_*.c routines with the macro START_TIMING() */
+    RUNTIME_stop_profiling();
+
+    CHAMELON_Disable(CHAMELON_AUTOTUNING);
+    CHAMELON_Set(CHAMELON_TILE_SIZE,        iparam[IPARAM_NB] );
+    CHAMELON_Set(CHAMELON_INNER_BLOCK_SIZE, iparam[IPARAM_IB] );
+    CHAMELON_Set(CHAMELON_SWAP_TYPE_SET,    iparam[IPARAM_SWAP_TYPE]);
+    CHAMELON_Set(CHAMELON_NBPANEL_THREADS_SET, iparam[IPARAM_NBPANEL_THREADS]);
+
+    /* Householder mode */
+    if (iparam[IPARAM_RHBLK] < 1) {
+        CHAMELON_Set(CHAMELON_HOUSEHOLDER_MODE, CHAMELON_FLAT_HOUSEHOLDER);
+    } else {
+        CHAMELON_Set(CHAMELON_HOUSEHOLDER_MODE, CHAMELON_TREE_HOUSEHOLDER);
+        CHAMELON_Set(CHAMELON_HOUSEHOLDER_SIZE, iparam[IPARAM_RHBLK]);
+    }
+
+    if (iparam[IPARAM_PROFILE] == 1) {
+        CHAMELON_Enable(CHAMELON_PROFILING_MODE);
+    }
+
+    if (iparam[IPARAM_PROGRESS] == 1) {
+        CHAMELON_Enable(CHAMELON_PROGRESS);
+    }
+
+    if (iparam[IPARAM_PRINT_WARNINGS] == 0) {
+        CHAMELON_Disable(CHAMELON_WARNINGS);
+    }
+
+    if (iparam[IPARAM_GEMM3M] == 1) {
+        CHAMELON_Enable(CHAMELON_GEMM3M);
+    }
+
+    if (iparam[IPARAM_DSDMATRIX_MODE] == 1)
+        CHAMELON_Enable(CHAMELON_DSDMATRIX_MODE);
+
+    if (iparam[IPARAM_RECTIL_PANEL_MODE] == 1)
+        CHAMELON_Enable(CHAMELON_RECTIL_PANEL_MODE);
+
+#if defined(CHAMELEON_USE_MPI)
+    nbnode = CHAMELON_Comm_size();
+    iparam[IPARAM_NMPI] = nbnode;
+    /* Check P */
+    if ( (iparam[IPARAM_P] > 1) &&
+         (nbnode % iparam[IPARAM_P] != 0) ) {
+      fprintf(stderr, "ERROR: %d doesn't divide the number of node %d\n",
+              iparam[IPARAM_P], nbnode );
+      return EXIT_FAILURE;
+    }
+#endif
+    iparam[IPARAM_Q] = nbnode / iparam[IPARAM_P];
+
+    /* Layout conversion */
+    CHAMELON_Set(CHAMELON_TRANSLATION_MODE, iparam[IPARAM_INPLACE]);
+
+    if ( CHAMELON_My_Mpi_Rank() == 0 ) {
+        print_header( argv[0], iparam);
+    }
+
+    if (step < 1) step = 1;
+
+    status = Test( -1, iparam ); /* print header */
+    if (status != CHAMELON_SUCCESS) return status;
+    if ( n == -1 ){
+        for (i = start; i <= stop; i += step)
+        {
+            if ( nx > 0 ) {
+                iparam[IPARAM_M] = i;
+                iparam[IPARAM_N] = chameleon_max(1, i/nx);
+            }
+            else if ( mx > 0 ) {
+                iparam[IPARAM_M] = chameleon_max(1, i/mx);
+                iparam[IPARAM_N] = i;
+            }
+            else {
+                if ( m == -1 ) {
+                    iparam[IPARAM_M] = i;
+                }
+                iparam[IPARAM_N] = i;
+            }
+            status = Test( iparam[IPARAM_N], iparam );
+            if (status != CHAMELON_SUCCESS) {
+                return status;
+            }
+            success += status;
+        }
+    }
+    else {
+        if ( m == -1 ) {
+            iparam[IPARAM_M] = n;
+        }
+        iparam[IPARAM_N] = n;
+        status = Test( iparam[IPARAM_N], iparam );
+        if (status != CHAMELON_SUCCESS) return status;
+        success += status;
+    }
+    CHAMELON_Finalize();
+    return success;
+}
+
diff --git a/timing/timing.h b/timing/timing.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bcc43119a07240b704a4cba41cebe8a08b10ce4
--- /dev/null
+++ b/timing/timing.h
@@ -0,0 +1,247 @@
+/**
+ *
+ * @file timing.h
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2015 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ * @version 1.0.0
+ *
+ */
+#ifndef TIMING_H
+#define TIMING_H
+
+typedef double chameleon_time_t;
+
+enum iparam_timing {
+    IPARAM_THRDNBR,        /* Number of cores                            */
+    IPARAM_THRDNBR_SUBGRP, /* Number of cores in a subgroup (NUMA node)  */
+    IPARAM_SCHEDULER,      /* What scheduler do we choose (dyn, stat)    */
+    IPARAM_M,              /* Number of rows of the matrix               */
+    IPARAM_N,              /* Number of columns of the matrix            */
+    IPARAM_K,              /* RHS or K                                   */
+    IPARAM_LDA,            /* Leading dimension of A                     */
+    IPARAM_LDB,            /* Leading dimension of B                     */
+    IPARAM_LDC,            /* Leading dimension of C                     */
+    IPARAM_IB,             /* Inner-blocking size                        */
+    IPARAM_NB,             /* Number of columns in a tile                */
+    IPARAM_MB,             /* Number of rows in a tile                   */
+    IPARAM_NITER,          /* Number of iteration of each test           */
+    IPARAM_WARMUP,         /* Run one test to load dynamic libraries     */
+    IPARAM_BIGMAT,         /* Allocating one big mat or plenty of small  */
+    IPARAM_CHECK,          /* Checking activated or not                  */
+    IPARAM_VERBOSE,        /* How much noise do we want?                 */
+    IPARAM_AUTOTUNING,     /* Disable/enable autotuning                  */
+    IPARAM_INPUTFMT,       /* Input format (Use only for getmi/gecfi)    */
+    IPARAM_OUTPUTFMT,      /* Output format (Use only for getmi/gecfi)   */
+    IPARAM_TRACE,          /* Generate trace on the first non warmup run */
+    IPARAM_DAG,            /* Do we require to output the DOT file?      */
+    IPARAM_ASYNC,          /* Asynchronous calls                         */
+    IPARAM_OOC,            /* Out of Core                                */
+    IPARAM_MX,             /* */
+    IPARAM_NX,             /* */
+    IPARAM_RHBLK,          /* Householder reduction parameter for QR/LQ  */
+    IPARAM_INPLACE,        /* InPlace/OutOfPlace translation mode        */
+    IPARAM_MODE,           /* Eigenvalue generation mode                 */
+
+    IPARAM_INVERSE,
+    IPARAM_NCUDAS,
+    IPARAM_NMPI,
+    IPARAM_P,              /* Parameter for 2D cyclic distribution       */
+    IPARAM_Q,              /* Parameter for 2D cyclic distribution       */
+
+    IPARAM_PROGRESS,       /* Use a progress indicator during computations */
+    IPARAM_GEMM3M,         /* Use GEMM3M for complex matrix vector products */
+    /* Added for StarPU version */
+    IPARAM_PROFILE,
+    IPARAM_PRINT_WARNINGS,
+    IPARAM_PARALLEL_TASKS,
+    IPARAM_NO_CPU,
+    IPARAM_BOUND,
+    /* End */
+    /* Added for libhqr version */
+    IPARAM_LOWLVL_TREE,  /* Tree used for reduction inside nodes        */
+    IPARAM_HIGHLVL_TREE, /* Tree used for reduction between nodes       */
+    IPARAM_QR_TS_SZE,    /* Size of TS domain                           */
+    IPARAM_QR_HLVL_SZE,  /* Size of the high level tree                 */
+    IPARAM_QR_DOMINO,    /* Enable/disable the domino tree              */
+    IPARAM_QR_TSRR,      /* Enable/disable the round-robin on TS domain */
+    IPARAM_SWAP_TYPE,
+    IPARAM_DSDMATRIX_MODE,
+    IPARAM_RECTIL_PANEL_MODE,
+    IPARAM_NBPANEL_THREADS,
+    /* End */
+    IPARAM_SIZEOF
+};
+
+enum dparam_timing {
+  IPARAM_TIME,
+  IPARAM_ANORM,
+  IPARAM_BNORM,
+  IPARAM_XNORM,
+  IPARAM_RNORM,
+  IPARAM_AinvNORM,
+  IPARAM_RES,
+  /* Begin section for hydra integration tool */
+  IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */
+  /* End section for hydra integration tool  */
+  IPARAM_DNBPARAM
+};
+
+#define PASTE_CODE_IPARAM_LOCALS(iparam)           \
+    double  t;                                     \
+    int64_t M     = iparam[IPARAM_M];              \
+    int64_t N     = iparam[IPARAM_N];              \
+    int64_t K     = iparam[IPARAM_K];              \
+    int64_t NRHS  = K;                             \
+    int64_t LDA   = chameleon_max(M, iparam[IPARAM_LDA]);    \
+    int64_t LDB   = chameleon_max(N, iparam[IPARAM_LDB]);    \
+    int64_t LDC   = chameleon_max(K, iparam[IPARAM_LDC]);    \
+    int64_t IB    = iparam[IPARAM_IB];             \
+    int64_t MB    = iparam[IPARAM_MB];             \
+    int64_t NB    = iparam[IPARAM_NB];             \
+    int64_t P     = iparam[IPARAM_P];              \
+    int64_t Q     = iparam[IPARAM_Q];              \
+    int64_t MT    = (M%MB==0) ? (M/MB) : (M/MB+1); \
+    int64_t NT    = (N%NB==0) ? (N/NB) : (N/NB+1); \
+    int bigmat     = iparam[IPARAM_BIGMAT];         \
+    int ooc       = iparam[IPARAM_OOC];            \
+    int check     = iparam[IPARAM_CHECK];          \
+    int loud      = iparam[IPARAM_VERBOSE];        \
+    (void)M;(void)N;(void)K;(void)NRHS;            \
+    (void)LDA;(void)LDB;(void)LDC;                 \
+    (void)IB;(void)MB;(void)NB;(void)P;(void)Q;    \
+    (void)MT;(void)NT;(void)check;                 \
+    (void)loud;(void)bigmat;(void)ooc;
+
+/* Paste code to allocate a matrix in desc if cond_init is true */
+#define PASTE_CODE_ALLOCATE_MATRIX_TILE(_desc_, _cond_, _type_, _type2_, _lda_, _m_, _n_) \
+    CHAMELON_desc_t *_desc_ = NULL;                                        \
+    int status ## _desc_ ;                                              \
+    if( _cond_ ) {                                                      \
+        if (ooc)                                                        \
+            status ## _desc_ = CHAMELON_Desc_Create_OOC(&(_desc_), _type2_, MB, NB, MB*NB, _lda_, _n_, 0, 0, _m_, _n_, \
+                                                     P, Q);             \
+        else if (!bigmat)                                               \
+            status ## _desc_ = CHAMELON_Desc_Create_User(&(_desc_), NULL, _type2_, MB, NB, MB*NB, _lda_, _n_, 0, 0, _m_, _n_, \
+                                                      P, Q, chameleon_getaddr_null, NULL, NULL); \
+        else                                                            \
+            status ## _desc_ = CHAMELON_Desc_Create(&(_desc_), NULL, _type2_, MB, NB, MB*NB, _lda_, _n_, 0, 0, _m_, _n_, \
+                                                 P, Q);                 \
+        if (status ## _desc_ != CHAMELON_SUCCESS) return (status ## _desc_); \
+    }
+
+#define PASTE_CODE_FREE_MATRIX(_desc_)                                  \
+    CHAMELON_Desc_Destroy( &_desc_ );
+
+#define PASTE_TILE_TO_LAPACK(_desc_, _name_, _cond_, _type_, _lda_, _n_) \
+    _type_ *_name_ = NULL;                                               \
+    if ( _cond_ ) {                                                      \
+        _name_ = (_type_*)malloc( (_lda_) * (_n_) * sizeof(_type_));     \
+        if ( ! _name_ ) {                                                \
+            fprintf(stderr, "Out of Memory for %s\n", #_name_);          \
+            return -1;                                                   \
+        }                                                                \
+        CHAMELON_Tile_to_Lapack(_desc_, (void*)_name_, _lda_);              \
+    }
+
+#define PASTE_CODE_ALLOCATE_MATRIX(_name_, _cond_, _type_, _lda_, _n_)  \
+    _type_ *_name_ = NULL;                                              \
+    if( _cond_ ) {                                                      \
+        _name_ = (_type_*)malloc( (_lda_) * (_n_) * sizeof(_type_) );   \
+        if ( ! _name_ ) {                                               \
+            fprintf(stderr, "Out of Memory for %s\n", #_name_);         \
+            return -1;                                                  \
+        }                                                               \
+    }
+
+#define PASTE_CODE_ALLOCATE_COPY(_name_, _cond_, _type_, _orig_, _lda_, _n_) \
+    _type_ *_name_ = NULL;                                                   \
+    if( _cond_ ) {                                                           \
+        _name_ = (_type_*)malloc( (_lda_) * (_n_) * sizeof(_type_) );        \
+        if ( ! _name_ ) {                                                    \
+            fprintf(stderr, "Out of Memory for %s\n", #_name_);              \
+            return -1;                                                       \
+        }                                                                    \
+        memcpy(_name_, _orig_, (_lda_) * (_n_) * sizeof(_type_) );           \
+    }
+
+/**
+ *
+ * Macro for trace generation
+ *
+ */
+#define START_TRACING()                        \
+    RUNTIME_start_stats();                     \
+    if(iparam[IPARAM_TRACE] == 2) {            \
+    	RUNTIME_start_profiling();             \
+    }                                          \
+    if(iparam[IPARAM_BOUND]) {                 \
+        CHAMELON_Enable(CHAMELON_BOUND);             \
+    }
+
+#define STOP_TRACING()                         \
+    RUNTIME_stop_stats();                      \
+    if(iparam[IPARAM_TRACE] == 2) {            \
+    	RUNTIME_stop_profiling();              \
+    }                                          \
+    if(iparam[IPARAM_BOUND]) {                 \
+        CHAMELON_Disable(CHAMELON_BOUND);            \
+    }
+
+/**
+ *
+ * Macro for DAG generation
+ *
+ */
+#if 0
+#define START_DAG()                   \
+    if ( iparam[IPARAM_DAG] == 2 )    \
+        CHAMELON_Enable(CHAMELON_DAG);
+
+#define STOP_DAG()                    \
+    if ( iparam[IPARAM_DAG] == 2 )    \
+        CHAMELON_Disable(CHAMELON_DAG);
+#else
+#define START_DAG()  do {} while(0);
+#define STOP_DAG()   do {} while(0);
+#endif
+
+/**
+ *
+ * Synchro for distributed computations
+ *
+ */
+#if defined(CHAMELEON_USE_MPI)
+#define START_DISTRIBUTED()  CHAMELON_Distributed_start();
+#define STOP_DISTRIBUTED()   CHAMELON_Distributed_stop();
+#else
+#define START_DISTRIBUTED()  do {} while(0);
+#define STOP_DISTRIBUTED()   do {} while(0);
+#endif
+
+/**
+ *
+ * General Macros for timing
+ *
+ */
+#define START_TIMING()                          \
+    START_DAG();                                \
+    START_TRACING();                            \
+    START_DISTRIBUTED();                        \
+    t = -RUNTIME_get_time();
+
+#define STOP_TIMING()                           \
+    STOP_DISTRIBUTED();                         \
+    t += RUNTIME_get_time();                    \
+    STOP_TRACING();                             \
+    STOP_DAG();                                 \
+    if (iparam[IPARAM_PROFILE] == 2) {          \
+        RUNTIME_kernelprofile_display();        \
+        RUNTIME_schedprofile_display();         \
+    }                                           \
+    *t_ = t;
+
+#endif /* TIMING_H */