From 4819e83d68f79d2bfb87affd52ea2a18ad58c43f Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 2 May 2017 18:02:25 +0200
Subject: [PATCH] Remove magma codelets for StarPU

---
 runtime/starpu/codelets/codelet_zgelqt.c      | 48 ----------
 runtime/starpu/codelets/codelet_zgeqrt.c      | 47 ----------
 runtime/starpu/codelets/codelet_zgessm.c      | 33 -------
 .../starpu/codelets/codelet_zgetrf_incpiv.c   | 89 ------------------
 .../starpu/codelets/codelet_zgetrf_nopiv.c    | 26 ------
 runtime/starpu/codelets/codelet_zlauum.c      | 21 -----
 runtime/starpu/codelets/codelet_zpotrf.c      | 40 +-------
 runtime/starpu/codelets/codelet_zssssm.c      | 46 ----------
 runtime/starpu/codelets/codelet_ztrtri.c      | 23 -----
 runtime/starpu/codelets/codelet_ztslqt.c      | 45 ---------
 runtime/starpu/codelets/codelet_ztsqrt.c      | 44 ---------
 runtime/starpu/codelets/codelet_ztstrf.c      | 91 -------------------
 12 files changed, 1 insertion(+), 552 deletions(-)

diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c
index 223559778..e215ce20f 100644
--- a/runtime/starpu/codelets/codelet_zgelqt.c
+++ b/runtime/starpu/codelets/codelet_zgelqt.c
@@ -155,55 +155,7 @@ static void cl_zgelqt_cpu_func(void *descr[], void *cl_arg)
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
-#if defined(CHAMELEON_USE_MAGMA)
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
-    cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
-    int lda, ldt;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
-
-    /* Gather pointer to data on device */
-    d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* m*ib + ib*ib*/
-    d_D = d_W + m*ib;
-
-    /* scratch data on host */
-    /* ib*n + ib*ib + max(m,n) + ib*ib + ib*ib */
-    h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
-
-    /* Gather pointer to scratch data on host */
-    h_T   = h_A   + ib*n;
-    h_TAU = h_T   + ib*ib;
-    h_W   = h_TAU + chameleon_max(m,n);
-    h_D   = h_W   + ib*ib;
-
-    RUNTIME_getStream(stream);
-
-    CUDA_zgelqt(
-            m, n, ib,
-            d_A, lda, h_A, ib,
-            d_T, ldt, h_T, ib,
-            d_D, h_D, ib, h_TAU,
-            h_W, d_W, stream );
-
-    cudaThreadSynchronize();
-}
-#endif /* defined(CHAMELEON_USE_MAGMA) */
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
-CODELETS(zgelqt, 3, cl_zgelqt_cpu_func, cl_zgelqt_cuda_func, 0)
-#else
 CODELETS_CPU(zgelqt, 3, cl_zgelqt_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index 868c10c16..595bafa1b 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -154,56 +154,9 @@ static void cl_zgeqrt_cpu_func(void *descr[], void *cl_arg)
     WORK = TAU + chameleon_max( m, n );
     CORE_zgeqrt(m, n, ib, A, lda, T, ldt, TAU, WORK);
 }
-
-
-#if defined(CHAMELEON_USE_MAGMA)
-static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
-    cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
-    int lda, ldt;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
-
-    /* Gather pointer to data on device */
-    d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    d_W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); /* ib*n + ib * ib*/
-    d_D = d_W + ib*n;
-
-    /* scratch data on host */
-    /* m*ib + ib*ib + max(m,n) + ib*ib + ib*ib */
-    h_A = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
-
-    /* Gather pointer to scratch data on host */
-    h_T   = h_A   + m*ib;
-    h_TAU = h_T   + ib*ib;
-    h_W   = h_TAU + chameleon_max(m,n);
-    h_D   = h_W   + ib*ib;
-
-    RUNTIME_getStream(stream);
-
-    CUDA_zgeqrt(
-            m, n, ib,
-            d_A, lda, h_A, m,
-            d_T, ldt, h_T, ib,
-            d_D, h_D, ib, h_TAU,
-            h_W, d_W, stream);
-
-    cudaThreadSynchronize();
-}
-#endif /* defined(CHAMELEON_USE_MAGMA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
-CODELETS(zgeqrt, 3, cl_zgeqrt_cpu_func, cl_zgeqrt_cuda_func, 0)
-#else
 CODELETS_CPU(zgeqrt, 3, cl_zgeqrt_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c
index 547111784..1b5a72908 100644
--- a/runtime/starpu/codelets/codelet_zgessm.c
+++ b/runtime/starpu/codelets/codelet_zgessm.c
@@ -137,42 +137,9 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &ldl, &ldd, &lda);
     CORE_zgessm(m, n, k, ib, IPIV, D, ldd, A, lda);
 }
-
-#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
-static void cl_zgessm_cuda_func(void *descr[], void *cl_arg)
-{
-    int m;
-    int n;
-    int k;
-    int ib;
-    int *IPIV;
-    cuDoubleComplex *dL, *dD, *dA;
-    int lddl, lddd, ldda;
-    int info = 0;
-    /*
-     *  hwork => nb*nb
-     */
-    dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    dD = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV, &lddl, &lddd, &ldda);
-
-    CUDA_zgessm(
-            MagmaColMajor, m, n, k, ib,
-            IPIV, dL, lddl, dD, lddd, dA, ldda, &info );
-
-    cudaThreadSynchronize();
-
-    return;
-}
-#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU))
-CODELETS(zgessm, 3, cl_zgessm_cpu_func, cl_zgessm_cuda_func, 0)
-#else
 CODELETS_CPU(zgessm, 3, cl_zgessm_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
index ad3475cb7..bc91e972a 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c
@@ -147,99 +147,10 @@ static void cl_zgetrf_incpiv_cpu_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work);
     CORE_zgetrf_incpiv(m, n, ib, A, lda, IPIV, &info);
-
-#if defined(CHAMELEON_USE_MAGMA)
-    {
-        MORSE_Complex64_t *L = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-        /*
-         * L stores:
-         *      L1     L2    L3     ...
-         *      L1^-1  L2^-1 L3^-1  ...
-         */
-        /* Compute L-1 in lower rectangle of L */
-        if ( ldl >= 2*ib )
-        {
-            int i, sb;
-
-            L += ib;
-            for (i=0; i<n; i+=ib) {
-                sb = chameleon_min( ib, n-i );
-                CORE_zlacpy(MorseUpperLower, sb, sb, A+(i*lda+i), lda, L+(i*ldl), ldl );
-
-                CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl), ldl, &info );
-                if (info != 0 ) {
-                    fprintf(stderr, "ERROR, trtri returned with info = %d\n", info);
-                }
-            }
-        }
-    }
-#endif
-}
-
-
-/*
- * Codelet GPU
- */
-#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
-static void cl_zgetrf_incpiv_cuda_func(void *descr[], void *cl_arg)
-{
-    int m;
-    int n;
-    int ib;
-    cuDoubleComplex *hA, *dA;
-    cuDoubleComplex *hL, *dL;
-    cuDoubleComplex *dwork;
-    MORSE_starpu_ws_t *h_work;
-    int lda, ldl;
-    int *IPIV;
-    MORSE_bool check_info;
-    int iinfo;
-    int info;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldl, &IPIV, &check_info, &iinfo, &h_work);
-
-    dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    /*
-     * hwork => at least (IB+NB)*IB contains all hA and hL
-     * dwork => at least IB*NB
-     */
-    hA    = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
-    dwork = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-
-    hL = hA + lda*ib;
-
-    /* Initialize L to 0 */
-    memset(hL, 0, ib*ib*sizeof(cuDoubleComplex));
-
-    if ( ldl >= 2*ib ) {
-      /* Let's compute the inverses in the bottom part of L */
-      dL += ib;
-    } else {
-      /* We prefer to stick with TRSM */
-      dL = NULL;
-      hL = NULL;
-    }
-
-    CUDA_zgetrf_incpiv(
-            MagmaColMajor, m, n, ib,
-            hA, lda, dA, lda,
-            hL, ib,  dL, ldl,
-            IPIV,
-            dwork, lda,
-            &info );
-
-    cudaThreadSynchronize();
 }
-#endif /* defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
-
 /*
  * Codelet definition
  */
-#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU) && ( defined(CHAMELEON_USE_MAGMA) )
-CODELETS(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func, cl_zgetrf_incpiv_cuda_func, 0)
-#else
 CODELETS_CPU(zgetrf_incpiv, 3, cl_zgetrf_incpiv_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
index ab9d87ace..8ca85664b 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
@@ -123,35 +123,9 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
     CORE_zgetrf_nopiv(m, n, ib, A, lda, &info);
 }
-
-/*
- * Codelet GPU
- */
-#if defined(CHAMELEON_USE_MAGMA)
-static void cl_zgetrf_nopiv_cuda_func(void *descr[], void *cl_arg)
-{
-    int m;
-    int n;
-    int ib;
-    cuDoubleComplex *dA;
-    int lda;
-    int iinfo;
-
-    int info = 0;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &iinfo);
-    dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    CUDA_zgetrf_nopiv( m, n, dA, lda, &info );
-    cudaThreadSynchronize();
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
-CODELETS(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func, cl_zgetrf_nopiv_cuda_func, 0)
-#else
 CODELETS_CPU(zgetrf_nopiv, 1, cl_zgetrf_nopiv_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c
index f578695ff..7f23c5682 100644
--- a/runtime/starpu/codelets/codelet_zlauum.c
+++ b/runtime/starpu/codelets/codelet_zlauum.c
@@ -77,30 +77,9 @@ static void cl_zlauum_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
     CORE_zlauum(uplo, N, A, LDA);
 }
-
-#if defined(CHAMELEON_USE_MAGMA)
-static void cl_zlauum_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_enum uplo;
-    int info = 0;
-    int N;
-    cuDoubleComplex *A;
-    int LDA;
-
-    A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    starpu_codelet_unpack_args(cl_arg, &uplo, &N, &LDA);
-    CUDA_zlauum( uplo, N, A, LDA, &info);
-    cudaThreadSynchronize();
-    return;
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
-CODELETS(zlauum, 1, cl_zlauum_cpu_func, cl_zlauum_cuda_func, 0)
-#else
 CODELETS_CPU(zlauum, 1, cl_zlauum_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 857cf3356..686814274 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -83,48 +83,10 @@ static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
     CORE_zpotrf(uplo, n, A, lda, &info);
 }
-
-#ifdef CHAMELEON_USE_MAGMA
-static void cl_zpotrf_cuda_func(void *descr[], void *cl_arg)
-{
-    cudaStream_t stream[2], currentt_stream;
-    MORSE_enum uplo;
-    int n;
-    cuDoubleComplex *A;
-    /* cuDoubleComplex *hA; */
-    int lda;
-    int iinfo;
-    int info = 0;
-
-    A  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda, &iinfo);
-
-    /* /\* */
-    /*  *  hwork => nb*nb */
-    /*  *\/ */
-    /* hA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); */
-
-/*      stream[0] = starpu_cuda_get_local_stream(); */
-/*      if ( cudaStreamCreate( stream+1 ) != CUDA_SUCCESS ){ */
-/*          fprintf(stderr, "Error while creating stream in codelet_zpotrf\n"); */
-/*          exit(-1); */
-/*      } */
-
-    CUDA_zpotrf( uplo, n, A, lda, &info);
-
-    cudaThreadSynchronize();
-/*      cudaStreamDestroy( stream[1] ); */
-
-    return;
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined CHAMELEON_USE_MAGMA
-CODELETS(zpotrf, 1, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, 0)
-#else
 CODELETS_CPU(zpotrf, 1, cl_zpotrf_cpu_func)
-#endif
+
diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c
index a1fae08bb..6d1c3ee08 100644
--- a/runtime/starpu/codelets/codelet_zssssm.c
+++ b/runtime/starpu/codelets/codelet_zssssm.c
@@ -174,56 +174,10 @@ static void cl_zssssm_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
     CORE_zssssm(m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, L1, ldl1, L2, ldl2, IPIV);
 }
-
-#if defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
-static void cl_zssssm_cuda_func(void *descr[], void *cl_arg)
-{
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    cuDoubleComplex *dA1;
-    int lda1;
-    cuDoubleComplex *dA2;
-    int lda2;
-    cuDoubleComplex *dL1;
-    int ldl1;
-    cuDoubleComplex *dL2;
-    int ldl2;
-    int *IPIV;
-    int info;
-
-    starpu_codelet_unpack_args(cl_arg, &m1, &n1, &m2, &n2, &k, &ib, &lda1, &lda2, &ldl1, &ldl2, &IPIV);
-
-    dA1  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    dA2  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    dL1  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    dL2  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
-
-    if ( ldl1 >= 2*ib ) {
-        /* dL1 stores L and invL and the kernel is just using the inverted part */
-        dL1 += ib;
-    }
-
-    CUDA_zssssm(
-        MagmaColMajor, m1, n1, m2, n2, k, ib,
-        dA1, lda1, dA2, lda2,
-        dL1, ldl1, dL2, ldl2,
-        IPIV, &info);
-
-    cudaThreadSynchronize();
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if (defined(CHAMELEON_USE_MAGMA) && defined(HAVE_MAGMA_GETRF_INCPIV_GPU))
-CODELETS(zssssm, 4, cl_zssssm_cpu_func, cl_zssssm_cuda_func, 0)
-#else
 CODELETS_CPU(zssssm, 4, cl_zssssm_cpu_func)
-#endif
 
diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c
index 1f619cd09..73d1a439e 100644
--- a/runtime/starpu/codelets/codelet_ztrtri.c
+++ b/runtime/starpu/codelets/codelet_ztrtri.c
@@ -85,32 +85,9 @@ static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
     CORE_ztrtri(uplo, diag, N, A, LDA, &info);
 }
-
-#if defined(CHAMELEON_USE_MAGMA)
-static void cl_ztrtri_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_enum uplo;
-    MORSE_enum diag;
-    int N;
-    cuDoubleComplex *A;
-    int LDA;
-    int iinfo;
-    int info = 0;
-
-    A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    starpu_codelet_unpack_args(cl_arg, &uplo, &diag, &N, &LDA, &iinfo);
-    CUDA_ztrtri( uplo, diag, N, A, LDA, &info);
-    cudaThreadSynchronize();
-    return;
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
-CODELETS(ztrtri, 1, cl_ztrtri_cpu_func, cl_ztrtri_cuda_func, 0)
-#else
 CODELETS_CPU(ztrtri, 1, cl_ztrtri_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c
index 101feebfe..56c278094 100644
--- a/runtime/starpu/codelets/codelet_ztslqt.c
+++ b/runtime/starpu/codelets/codelet_ztslqt.c
@@ -174,54 +174,9 @@ static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
     WORK = TAU + chameleon_max( m, n );
     CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
 }
-
-#if defined(CHAMELEON_USE_MAGMA) && 0
-static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
-    cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
-    int lda1, lda2, ldt;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
-
-    /* Gather pointer to data on device */
-    d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    d_T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    d_W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*m + ib*ib */
-    d_D  = d_W + 2*ib*m;
-
-    /* scratch data on host */
-    /* ib*n + ib*n + max(m,n) + ib*m + ib*ib */
-    h_A2  = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
-    h_T   = h_A2  + ib*n;
-    h_TAU = h_T   + ib*n;
-    h_W   = h_TAU + chameleon_max(m,n);
-    h_D   = h_W   + ib*m;
-
-    RUNTIME_getStream(stream);
-    CUDA_ztslqt(
-            m, n, ib,
-            d_A1, lda1, d_A2, lda2,
-            h_A2, ib,
-            d_T, ldt, h_T, ib,
-            d_D, h_D, ib, h_TAU,
-            h_W, d_W, stream);
-
-    cudaThreadSynchronize();
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if (defined(CHAMELEON_USE_MAGMA)) && 0
-CODELETS(ztslqt, 4, cl_ztslqt_cpu_func, cl_ztslqt_cuda_func, 0)
-#else
 CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c
index 4c5d03fe2..969da0611 100644
--- a/runtime/starpu/codelets/codelet_ztsqrt.c
+++ b/runtime/starpu/codelets/codelet_ztsqrt.c
@@ -165,53 +165,9 @@ static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg)
     WORK = TAU + chameleon_max( m, n );
     CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
 }
-
-#if defined(CHAMELEON_USE_MAGMA)
-static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
-    cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
-    int lda1, lda2, ldt;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
-
-    /* Gather pointer to data on device */
-    d_A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    d_A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    d_T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    d_W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* 2*ib*n + ib*ib */
-    d_D  = d_W + 2*ib*n;
-
-    /* scratch data on host */
-    /* m*ib + ib*ib + max(m,n) + ib*n + ib*ib */
-    h_A2  = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(h_work);
-    h_T   = h_A2  + m*ib;
-    h_TAU = h_T   + ib*ib;
-    h_W   = h_TAU + chameleon_max(m,n);
-    h_D   = h_W   + ib*n;
-
-    RUNTIME_getStream(stream);
-    CUDA_ztsqrt(
-            m, n, ib,
-            d_A1, lda1, d_A2, lda2,
-            h_A2, lda2,
-            d_T, ldt, h_T, ib,
-            d_D, h_D, ib, h_TAU,
-            h_W, d_W, stream);
-    cudaThreadSynchronize();
-}
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
-CODELETS(ztsqrt, 4, cl_ztsqrt_cpu_func, cl_ztsqrt_cuda_func, 0)
-#else
 CODELETS_CPU(ztsqrt, 4, cl_ztsqrt_cpu_func)
-#endif
diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c
index eae5108ee..59f742807 100644
--- a/runtime/starpu/codelets/codelet_ztstrf.c
+++ b/runtime/starpu/codelets/codelet_ztstrf.c
@@ -176,102 +176,11 @@ static void cl_ztstrf_cpu_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &IPIV, &d_work, &ldwork, &check_info, &iinfo);
 
     CORE_ztstrf(m, n, ib, nb, U, ldu, A, lda, L, ldl, IPIV, WORK, ldwork, &info);
-
-#if defined(CHAMELEON_USE_MAGMA)
-    /*
-     * L stores the following if enough place:
-     *      L1     L2    L3     ...
-     *      L1^-1  L2^-1 L3^-1  ...
-     */
-    /* Compute L-1 in lower rectangle of L */
-    if ( ldl >= 2*ib )
-    {
-        int i, sb;
-        for (i=0; i<n; i+=ib) {
-            sb = chameleon_min( ib, n-i );
-            CORE_zlacpy(MorseUpperLower, sb, sb, L+(i*ldl), ldl, L+(i*ldl)+ib, ldl );
-
-            CORE_ztrtri( MorseLower, MorseUnit, sb, L+(i*ldl)+ib, ldl, &info );
-            if (info != 0 ) {
-                fprintf(stderr, "ERROR, trtri returned with info = %d\n", info);
-            }
-        }
-    }
-#endif
-}
-
-
-/*
- * Codelet GPU
- */
-/* TODO/WARNING: tstrf is not working on GPU for now */
-#if defined(CHAMELEON_USE_MAGMA) && 0
-static void cl_ztstrf_cuda_func(void *descr[], void *cl_arg)
-{
-    MORSE_starpu_ws_t *d_work;
-    int m;
-    int n;
-    int ib;
-    int nb;
-    cuDoubleComplex *hU, *dU;
-    int ldu;
-    cuDoubleComplex *hA, *dA;
-    int lda;
-    cuDoubleComplex *hL, *dL;
-    int ldl;
-    int *ipiv;
-    cuDoubleComplex *hw2, *hw, *dw;
-    int ldwork;
-    MORSE_bool check_info;
-    int iinfo;
-    int info;
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &nb, &ldu, &lda, &ldl, &ipiv,
-			       &d_work, &ldwork, &check_info, &iinfo);
-
-    dU = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    dA = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    dL = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-
-    /*
-     *  hwork => 2*nb*(2*ib+2nb)
-     *  dwork => 2*ib*nb
-     */
-    hw2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
-    dw  = (cuDoubleComplex*)RUNTIME_starpu_ws_getlocal(d_work);
-
-    hU = hw2;
-    hA = hU + ldu * nb;
-    hL = hA + lda * nb;
-    hw = hL + ldl * nb;
-
-    /* Download first panel from A and U */
-    cublasGetMatrix( nb, n,  sizeof(cuDoubleComplex), dU, ldu, hU, ldu );
-    cublasGetMatrix( m,  ib, sizeof(cuDoubleComplex), dA, lda, hA, lda );
-
-    /* Initialize L to 0 */
-    memset(hL, 0, ldl*nb*sizeof(cuDoubleComplex));
-
-    CUDA_ztstrf(
-            MagmaColMajor, m, n, ib, nb,
-            hU, ldu, dU, ldu,
-            hA, lda, dA, lda,
-            hL, ldl, dL, ldl,
-            ipiv,
-            hw, ldwork, dw, lda,
-            &info );
-
-    cudaThreadSynchronize();
 }
-#endif
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if (defined(CHAMELEON_USE_MAGMA) && 0)
-CODELETS(ztstrf, 4, cl_ztstrf_cpu_func, cl_ztstrf_cuda_func, 0)
-#else
 CODELETS_CPU(ztstrf, 4, cl_ztstrf_cpu_func)
-#endif
 
-- 
GitLab