From d1dc8c2531eb3a8c45be24de0515251432d7dfac Mon Sep 17 00:00:00 2001
From: Ana Hourcau <ahourcau@sirocco15.plafrim.cluster>
Date: Mon, 19 Aug 2024 10:52:17 +0200
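Subject: [PATCH] Adapt the CUDA parts of the gered and gerst codelets

zgered:
  - In the half-precision path, register the output tile and submit the
    dlag2h conversion only when A->myrank == tileA->rank; otherwise only
    update the tile type and unregister the existing handle, if any.
  - Get the single-precision epsilon from CHAMELEON_slamch() instead of
    the LAPACKE_slamch_work('e') / hard-coded 1e-8 preprocessor switch.

zgerst:
  - Fix the debug message of the ChamComplexFloat case, which reported
    "half precision" instead of "single precision".

Both files: whitespace and alignment cleanup.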

---
 runtime/starpu/codelets/codelet_zgered.c | 81 ++++++++++++------------
 runtime/starpu/codelets/codelet_zgerst.c | 44 +++++++------
 2 files changed, 62 insertions(+), 63 deletions(-)

diff --git a/runtime/starpu/codelets/codelet_zgered.c b/runtime/starpu/codelets/codelet_zgered.c
index d7a132200..430db00ff 100644
--- a/runtime/starpu/codelets/codelet_zgered.c
+++ b/runtime/starpu/codelets/codelet_zgered.c
@@ -71,8 +71,7 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
 
 #if defined(CHAMELEON_USE_MPI)
     /* Backup the MPI tag */
-    if (A->myrank == tileA->rank)
-    {
+    if ( A->myrank == tileA->rank ) {
         tag = starpu_mpi_data_get_tag( *handleAin );
     }
 #endif /* defined(CHAMELEON_USE_MPI) */
@@ -89,39 +88,41 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
 #if defined(CHAMELEON_DEBUG_GERED)
             fprintf( stderr,
                      "[%2d] Convert the tile ( %d, %d ) to half precision\n",
-                    A->myrank, Am, An);
+                     A->myrank, Am, An);
 #endif
-            starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf );
+            if ( A->myrank == tileA->rank )
+            {
+                starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf );
 
-            rt_shm_starpu_insert_task(
-                &cl_dlag2h,
-                STARPU_VALUE,    &m,                 sizeof(int),
-                STARPU_VALUE,    &n,                 sizeof(int),
-                STARPU_R,        *handleAin,
-                STARPU_W,         handleAout,
-                STARPU_PRIORITY,  options->priority,
-                STARPU_EXECUTE_ON_WORKER, options->workerid,
+                rt_shm_starpu_insert_task(
+                    &cl_dlag2h,
+                    STARPU_VALUE,            &m, sizeof(int),
+                    STARPU_VALUE,            &n, sizeof(int),
+                    STARPU_R,                *handleAin,
+                    STARPU_W,                 handleAout,
+                    STARPU_PRIORITY,          options->priority,
+                    STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-                STARPU_NAME, "dlag2h",
+                    STARPU_NAME,              "dlag2h",
 #endif
-                0);
+                    0);
 
-            starpu_data_unregister_no_coherency( *handleAin );
-            *handleAin = handleAout;
-            tileA->flttype = ChamComplexHalf;
-            starpu_mpi_data_register( handleAout, tag, tileA->rank );
-        }
-        else
-        {
-            tileA->flttype = ChamComplexHalf;
-            if (*handleAin != NULL)
+                starpu_data_unregister_no_coherency( *handleAin );
+                *handleAin     = handleAout;
+                tileA->flttype = ChamComplexHalf;
+                starpu_mpi_data_register( handleAout, tag, tileA->rank );
+            }
+            else
             {
-                starpu_data_unregister_no_coherency(*handleAin);
-                *handleAin = NULL;
+                tileA->flttype = ChamComplexHalf;
+                if ( *handleAin != NULL )
+                {
+                    starpu_data_unregister_no_coherency( *handleAin );
+                    *handleAin = NULL;
+                }
             }
+            return;
         }
-        return;
-
     }
 #endif
 #endif
@@ -129,11 +130,7 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
     /*
      * Check for single precision
      */
-#if !defined(CHAMELEON_SIMULATION)
-    u_low = LAPACKE_slamch_work('e');
-#else
-    u_low = 1e-8;
-#endif
+    u_low = CHAMELEON_slamch();
     if ( lnorm < (threshold / u_low) )
     {
 #if defined(CHAMELEON_DEBUG_GERED)
@@ -141,34 +138,34 @@ void INSERT_TASK_zgered( const RUNTIME_option_t *options,
                  "[%2d] Convert the tile ( %d, %d ) to single precision\n",
                  A->myrank, Am, An );
 #endif
-        if (A->myrank == tileA->rank)
+        if ( A->myrank == tileA->rank )
         {
             starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat );
 
             rt_shm_starpu_insert_task(
                 &cl_zlag2c,
-                STARPU_VALUE,    &m,                 sizeof(int),
-                STARPU_VALUE,    &n,                 sizeof(int),
-                STARPU_R,        *handleAin,
-                STARPU_W,         handleAout,
-                STARPU_PRIORITY,  options->priority,
+                STARPU_VALUE,            &m, sizeof(int),
+                STARPU_VALUE,            &n, sizeof(int),
+                STARPU_R,                *handleAin,
+                STARPU_W,                 handleAout,
+                STARPU_PRIORITY,          options->priority,
                 STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-                STARPU_NAME, "zlag2c",
+                STARPU_NAME,              "zlag2c",
 #endif
                 0);
 
             starpu_data_unregister_no_coherency( *handleAin );
-            *handleAin = handleAout;
+            *handleAin     = handleAout;
             tileA->flttype = ChamComplexFloat;
             starpu_mpi_data_register( *handleAin, tag, tileA->rank );
         }
         else
         {
             tileA->flttype = ChamComplexFloat;
-            if (*handleAin != NULL)
+            if ( *handleAin != NULL )
             {
-                starpu_data_unregister_no_coherency(*handleAin);
+                starpu_data_unregister_no_coherency( *handleAin );
                 *handleAin = NULL;
             }
         }
diff --git a/runtime/starpu/codelets/codelet_zgerst.c b/runtime/starpu/codelets/codelet_zgerst.c
index f0fbdc1a4..c2faaf939 100644
--- a/runtime/starpu/codelets/codelet_zgerst.c
+++ b/runtime/starpu/codelets/codelet_zgerst.c
@@ -39,7 +39,8 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
     handleAin = A->schedopt;
     handleAin += ((int64_t)A->lmt) * nn + mm;
 
-    if ( tileA->flttype == ChamComplexDouble ) {
+    if ( tileA->flttype == ChamComplexDouble )
+    {
         starpu_data_handle_t *copy = handleAin;
 
         /* Remove first copy */
@@ -59,12 +60,12 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
         return;
     }
 
-    if (A->myrank != tileA->rank)
+    if ( A->myrank != tileA->rank )
     {
         tileA->flttype = ChamComplexDouble;
-        if (*handleAin != NULL)
+        if ( *handleAin != NULL )
         {
-            starpu_data_unregister_no_coherency(*handleAin);
+            starpu_data_unregister_no_coherency( *handleAin );
             *handleAin = NULL;
         }
         return;
@@ -79,9 +80,9 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
     switch( tileA->flttype ) {
 #if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500)
 #if defined(PRECISION_d)
-    /*
-     * Restore from half precision
-     */
+        /*
+         * Restore from half precision
+         */
     case ChamComplexHalf:
         assert( options->withcuda );
 #if defined(CHAMELEON_DEBUG_GERED)
@@ -91,14 +92,14 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
 #endif
         rt_shm_starpu_insert_task(
             &cl_hlag2d,
-            STARPU_VALUE,    &m,                 sizeof(int),
-            STARPU_VALUE,    &n,                 sizeof(int),
-            STARPU_R,        *handleAin,
-            STARPU_W,         handleAout,
-            STARPU_PRIORITY,  options->priority,
+            STARPU_VALUE,            &m,                 sizeof(int),
+            STARPU_VALUE,            &n,                 sizeof(int),
+            STARPU_R,                *handleAin,
+            STARPU_W,                 handleAout,
+            STARPU_PRIORITY,          options->priority,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-            STARPU_NAME, "hlag2d",
+            STARPU_NAME,              "hlag2d",
 #endif
             0);
         break;
@@ -108,19 +109,20 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
     case ChamComplexFloat:
 #if defined(CHAMELEON_DEBUG_GERED)
         fprintf( stderr,
-                 "[%2d] Convert back the tile ( %d, %d ) from half precision\n",
+                 "[%2d] Convert back the tile ( %d, %d ) from single precision\n",
                  A->myrank, Am, An );
 #endif
+
         rt_shm_starpu_insert_task(
             &cl_clag2z,
-            STARPU_VALUE,    &m,                 sizeof(int),
-            STARPU_VALUE,    &n,                 sizeof(int),
-            STARPU_R,        *handleAin,
-            STARPU_W,         handleAout,
-            STARPU_PRIORITY,  options->priority,
+            STARPU_VALUE,            &m,                 sizeof(int),
+            STARPU_VALUE,            &n,                 sizeof(int),
+            STARPU_R,                *handleAin,
+            STARPU_W,                 handleAout,
+            STARPU_PRIORITY,          options->priority,
             STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-            STARPU_NAME, "clag2z",
+            STARPU_NAME,              "clag2z",
 #endif
             0);
         break;
@@ -130,7 +132,7 @@ void INSERT_TASK_zgerst( const RUNTIME_option_t *options,
     }
 
     starpu_data_unregister_no_coherency( *handleAin );
-    *handleAin = handleAout;
+    *handleAin     = handleAout;
     tileA->flttype = ChamComplexDouble;
     starpu_mpi_data_register( handleAout, tag, tileA->rank );
 }
-- 
GitLab