From 2740ea28f12be3a9717f75a7a29da6e5564ca302 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 31 Jan 2018 18:51:37 +0100
Subject: [PATCH] Add new tpmlqt and tplqt tasks to starpu

---
 runtime/starpu/codelets/codelet_zcallback.c |   2 +
 runtime/starpu/codelets/codelet_ztplqt.c    | 102 +++++++++++++
 runtime/starpu/codelets/codelet_ztpmlqt.c   | 154 ++++++++++++++++++++
 runtime/starpu/include/runtime_codelet_z.h  |   2 +
 4 files changed, 260 insertions(+)
 create mode 100644 runtime/starpu/codelets/codelet_ztplqt.c
 create mode 100644 runtime/starpu/codelets/codelet_ztpmlqt.c

diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index 905bc8a9a..b32105832 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -67,7 +67,9 @@ CHAMELEON_CL_CB(zssssm,        starpu_matrix_get_nx(task->handles[0]), starpu_ma
 CHAMELEON_CL_CB(zsymm,         starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0,                                           2.*M*M *N)
 CHAMELEON_CL_CB(zsyr2k,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      ( 1.+2.*M*N)*M)
 CHAMELEON_CL_CB(zsyrk,         starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      ( 1.+   M)*M*N)
+CHAMELEON_CL_CB(ztplqt,        starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), starpu_matrix_get_nx(task->handles[0]),       2.*M*N*K)
 CHAMELEON_CL_CB(ztpqrt,        starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), starpu_matrix_get_nx(task->handles[0]),       2.*M*N*K)
+CHAMELEON_CL_CB(ztpmlqt,       starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), starpu_matrix_get_nx(task->handles[2]),       4.*M*N*K)
 CHAMELEON_CL_CB(ztpmqrt,       starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), starpu_matrix_get_nx(task->handles[2]),       4.*M*N*K)
 CHAMELEON_CL_CB(ztrasm,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                         0.5*M*(M+1))
 CHAMELEON_CL_CB(ztrmm,         starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0,                                               M*M*N)
diff --git a/runtime/starpu/codelets/codelet_ztplqt.c b/runtime/starpu/codelets/codelet_ztplqt.c
new file mode 100644
index 000000000..17132351f
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_ztplqt.c
@@ -0,0 +1,102 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztplqt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_ztplqt_cpu_func(void *descr[], void *cl_arg)
+{
+    int M;
+    int N;
+    int L;
+    int ib;
+    MORSE_Complex64_t *A;
+    int lda;
+    MORSE_Complex64_t *B;
+    int ldb;
+    MORSE_Complex64_t *T;
+    int ldt;
+    MORSE_Complex64_t *WORK;
+
+    A    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    T    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
+    WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
+
+    starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib,
+                                &lda, &ldb, &ldt );
+
+    CORE_ztplqt( M, N, L, ib,
+                 A, lda, B, ldb, T, ldt, WORK );
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(ztplqt, 4, cl_ztplqt_cpu_func)
+
+void
+MORSE_TASK_ztplqt( const MORSE_option_t *options,
+                   int M, int N, int L, int ib, int nb,
+                   const MORSE_desc_t *A, int Am, int An, int lda,
+                   const MORSE_desc_t *B, int Bm, int Bn, int ldb,
+                   const MORSE_desc_t *T, int Tm, int Tn, int ldt )
+{
+    struct starpu_codelet *codelet = &cl_ztplqt;
+    void (*callback)(void*) = options->profiling ? cl_ztplqt_callback : NULL;
+
+    MORSE_BEGIN_ACCESS_DECLARATION;
+    MORSE_ACCESS_RW(A, Am, An);
+    MORSE_ACCESS_RW(B, Bm, Bn);
+    MORSE_ACCESS_W(T, Tm, Tn);
+    MORSE_END_ACCESS_DECLARATION;
+
+    starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_VALUE, &M,     sizeof(int),
+        STARPU_VALUE, &N,     sizeof(int),
+        STARPU_VALUE, &L,     sizeof(int),
+        STARPU_VALUE, &ib,    sizeof(int),
+        STARPU_RW,     RTBLKADDR(A, MORSE_Complex64_t, Am, An),
+        STARPU_VALUE, &lda,   sizeof(int),
+        STARPU_RW,     RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn),
+        STARPU_VALUE, &ldb,   sizeof(int),
+        STARPU_W,      RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn),
+        STARPU_VALUE, &ldt,   sizeof(int),
+        /* Other options */
+        STARPU_SCRATCH,   options->ws_worker,
+        STARPU_PRIORITY,  options->priority,
+        STARPU_CALLBACK,  callback,
+#if defined(CHAMELEON_USE_MPI)
+        STARPU_EXECUTE_ON_NODE, B->get_rankof(B, Bm, Bn),
+#endif
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, (L == 0) ? "ztplqs" : "ztplqt",
+#endif
+        0);
+
+    (void)ib; (void)nb;
+}
diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c
new file mode 100644
index 000000000..2363d67c0
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_ztpmlqt.c
@@ -0,0 +1,154 @@
+/**
+ *
+ * @copyright 2009-2016 The University of Tennessee and The University
+ *                      of Tennessee Research Foundation.
+ *                      All rights reserved.
+ * @copyright 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ * @file codelet_ztpmlqt.c
+ *
+ * MORSE codelets kernel
+ * MORSE is a software package provided by Univ. of Tennessee,
+ * Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum trans;
+    int M;
+    int N;
+    int K;
+    int L;
+    int ib;
+    const MORSE_Complex64_t *V;
+    int ldv;
+    const MORSE_Complex64_t *T;
+    int ldt;
+    MORSE_Complex64_t *A;
+    int lda;
+    MORSE_Complex64_t *B;
+    int ldb;
+    MORSE_Complex64_t *WORK;
+
+    V    = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    T    = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    A    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
+    B    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
+    WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
+
+    starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
+                                &ldv, &ldt, &lda, &ldb );
+
+    CORE_ztpmlqt( side, trans, M, N, K, L, ib,
+                  V, ldv, T, ldt, A, lda, B, ldb, WORK );
+}
+
+#if defined(CHAMELEON_USE_CUDA)
+static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum trans;
+    int M;
+    int N;
+    int K;
+    int L;
+    int ib;
+    const cuDoubleComplex *V;
+    int ldv;
+    const cuDoubleComplex *T;
+    int ldt;
+    cuDoubleComplex *A;
+    int lda;
+    cuDoubleComplex *B;
+    int ldb;
+    cuDoubleComplex *W;
+
+    V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
+    W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
+
+    starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
+                                &ldv, &ldt, &lda, &ldb );
+
+    RUNTIME_getStream(stream);
+
+    CUDA_ztpmlqt(
+            side, trans, M, N, K, L, ib,
+            V, ldv, T, ldt, A, lda, B, ldb,
+            W, stream );
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC)
+
+void
+MORSE_TASK_ztpmlqt( const MORSE_option_t *options,
+                    MORSE_enum side, MORSE_enum trans,
+                    int M, int N, int K, int L, int ib, int nb,
+                    const MORSE_desc_t *V, int Vm, int Vn, int ldv,
+                    const MORSE_desc_t *T, int Tm, int Tn, int ldt,
+                    const MORSE_desc_t *A, int Am, int An, int lda,
+                    const MORSE_desc_t *B, int Bm, int Bn, int ldb )
+{
+    struct starpu_codelet *codelet = &cl_ztpmlqt;
+    void (*callback)(void*) = options->profiling ? cl_ztpmlqt_callback : NULL;
+
+    MORSE_BEGIN_ACCESS_DECLARATION;
+    MORSE_ACCESS_R(V, Vm, Vn);
+    MORSE_ACCESS_R(T, Tm, Tn);
+    MORSE_ACCESS_RW(A, Am, An);
+    MORSE_ACCESS_RW(B, Bm, Bn);
+    MORSE_END_ACCESS_DECLARATION;
+
+    starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_VALUE, &side,  sizeof(MORSE_enum),
+        STARPU_VALUE, &trans, sizeof(MORSE_enum),
+        STARPU_VALUE, &M,     sizeof(int),
+        STARPU_VALUE, &N,     sizeof(int),
+        STARPU_VALUE, &K,     sizeof(int),
+        STARPU_VALUE, &L,     sizeof(int),
+        STARPU_VALUE, &ib,     sizeof(int),
+        STARPU_R,      RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn),
+        STARPU_VALUE, &ldv,   sizeof(int),
+        STARPU_R,      RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn),
+        STARPU_VALUE, &ldt,   sizeof(int),
+        STARPU_RW,     RTBLKADDR(A, MORSE_Complex64_t, Am, An),
+        STARPU_VALUE, &lda,   sizeof(int),
+        STARPU_RW,     RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn),
+        STARPU_VALUE, &ldb,   sizeof(int),
+        /* Other options */
+        STARPU_SCRATCH,   options->ws_worker,
+        STARPU_PRIORITY,  options->priority,
+        STARPU_CALLBACK,  callback,
+#if defined(CHAMELEON_USE_MPI)
+        STARPU_EXECUTE_ON_NODE, B->get_rankof(B, Bm, Bn),
+#endif
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, (( L == 0 ) ? "ztsmlq" : "ztpmlqt"),
+#endif
+        0);
+
+    (void)ib; (void)nb;
+}
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 7b973d116..6d3187da1 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -81,7 +81,9 @@ ZCODELETS_HEADER(syssq)
 ZCODELETS_HEADER(trasm)
 ZCODELETS_HEADER(trssq)
 ZCODELETS_HEADER(trtri)
+ZCODELETS_HEADER(tplqt)
 ZCODELETS_HEADER(tpqrt)
+ZCODELETS_HEADER(tpmlqt)
 ZCODELETS_HEADER(tpmqrt)
 ZCODELETS_HEADER(tslqt)
 ZCODELETS_HEADER(tsmlq)
-- 
GitLab