From 764be57d352afb5496c29afc0042a120bc1b4fc7 Mon Sep 17 00:00:00 2001
From: Florent Pruvost <florent.pruvost@inria.fr>
Date: Thu, 25 Aug 2016 15:45:42 +0000
Subject: [PATCH] chameleon: add an out_of_core example; this requires handling
 the case where StarPU allocates the tiles itself -> we return a NULL pointer
 from the get_blkaddr function when registering them

---
 example/CMakeLists.txt                      |  11 +-
 example/lapack_to_morse/CTestLists.cmake    |   2 +-
 example/lapack_to_morse/step3.c             |   2 +-
 example/out_of_core/CMakeLists.txt          | 107 ++++++++
 example/out_of_core/CTestLists.cmake        |  11 +
 example/out_of_core/out_of_core.c           | 193 ++++++++++++++
 example/out_of_core/out_of_core.h           | 272 ++++++++++++++++++++
 include/runtime.h                           |  20 +-
 runtime/starpu/control/runtime_descriptor.c |  17 +-
 9 files changed, 614 insertions(+), 21 deletions(-)
 create mode 100644 example/out_of_core/CMakeLists.txt
 create mode 100644 example/out_of_core/CTestLists.cmake
 create mode 100644 example/out_of_core/out_of_core.c
 create mode 100644 example/out_of_core/out_of_core.h

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 6983ac335..430392799 100755
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -28,11 +28,14 @@ add_subdirectory(basic_zposv)
 
 if (CHAMELEON_PREC_D)
     add_subdirectory(lapack_to_morse)
+    if (CHAMELEON_SCHED_STARPU)
+        add_subdirectory(out_of_core)
+    endif()
 else()
-    message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_morse"
-    "tutorial cannot be built (use only double arithmetic precision).\n"
-    "Please set CHAMELEON_PREC_D to ON if you want to build executables of"
-    "this tutorial.")
+    message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_morse "
+    "and out_of_core tutorials cannot be built (use only double arithmetic "
+    "precision).\nPlease set CHAMELEON_PREC_D to ON if you want to build "
+    "executables of these tutorials.")
 endif()
 
 ###
diff --git a/example/lapack_to_morse/CTestLists.cmake b/example/lapack_to_morse/CTestLists.cmake
index d6baba725..202140571 100644
--- a/example/lapack_to_morse/CTestLists.cmake
+++ b/example/lapack_to_morse/CTestLists.cmake
@@ -1,5 +1,5 @@
 #
-# Check Example basic_zposv
+# Check Example lapack_to_morse
 #
 
 set(TESTLIST 
diff --git a/example/lapack_to_morse/step3.c b/example/lapack_to_morse/step3.c
index 8f77be1a5..4677bd55f 100644
--- a/example/lapack_to_morse/step3.c
+++ b/example/lapack_to_morse/step3.c
@@ -34,7 +34,7 @@
  */
 int main(int argc, char *argv[]) {
 
-    size_t N;    // matrix order
+    size_t N; // matrix order
     int NB;   // number of rows and columns in tiles
     int NRHS; // number of RHS vectors
     int NCPU; // number of cores to use
diff --git a/example/out_of_core/CMakeLists.txt b/example/out_of_core/CMakeLists.txt
new file mode 100644
index 000000000..1b6c61fc8
--- /dev/null
+++ b/example/out_of_core/CMakeLists.txt
@@ -0,0 +1,107 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2016 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+# @file CMakeLists.txt
+#
+#  MORSE example routines
+#  MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+#  University of Bordeaux, Bordeaux INP
+#
+#  @version 1.0.0
+#  @author Florent Pruvost
+#  @date 2016-08-23
+#
+###
+
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# list of sources
+set(OOC_SOURCES
+    out_of_core.c
+   )
+
+# Define what libraries we have to link with
+# ------------------------------------------
+unset(libs_for_ooc)
+list(APPEND libs_for_ooc
+     chameleon
+     chameleon_starpu
+     ${STARPU_LIBRARIES_DEP}
+)
+link_directories(${STARPU_LIBRARY_DIRS})
+
+
+if(NOT CHAMELEON_SIMULATION)
+
+    if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
+        list(APPEND libs_for_ooc
+        cudablas)
+    endif()
+    if(CHAMELEON_USE_CUDA)
+        list(APPEND libs_for_ooc
+             ${CUDA_LIBRARIES}
+        )
+        link_directories(${CUDA_LIBRARY_DIRS})
+    endif()
+    if(CHAMELEON_USE_MAGMA)
+        list(APPEND libs_for_ooc
+             ${MAGMA_LIBRARIES}
+        )
+        link_directories(${MAGMA_LIBRARY_DIRS})
+    endif()
+
+    list(APPEND libs_for_ooc
+         coreblas
+         ${LAPACKE_LIBRARIES}
+         ${CBLAS_LIBRARIES}
+         ${LAPACK_SEQ_LIBRARIES}
+         ${BLAS_SEQ_LIBRARIES}
+         ${HWLOC_LIBRARIES}
+         ${EXTRA_LIBRARIES}
+    )
+
+    link_directories(${LAPACKE_LIBRARY_DIRS})
+    link_directories(${LAPACK_LIBRARY_DIRS})
+    link_directories(${CBLAS_LIBRARY_DIRS})
+    link_directories(${BLAS_LIBRARY_DIRS})
+
+else()
+
+    list(APPEND libs_for_ooc
+         coreblas
+         simulapacke
+         simucblas
+         ${HWLOC_LIBRARIES}
+         ${EXTRA_LIBRARIES}
+    )
+
+endif()
+
+link_directories(${HWLOC_LIBRARY_DIRS})
+
+
+# Build one executable per file of OOC_SOURCES, link it with libs_for_ooc, install it
+foreach(_ooc ${OOC_SOURCES})
+    get_filename_component(_name_exe ${_ooc} NAME_WE)
+    add_executable(${_name_exe} ${_ooc})
+    set_property(TARGET ${_name_exe} PROPERTY LINKER_LANGUAGE Fortran)
+    target_link_libraries(${_name_exe} ${libs_for_ooc})
+    install(TARGETS ${_name_exe}
+            DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/chameleon/example/out_of_core)
+endforeach()
+
+#-------- Tests ---------
+include(CTestLists.cmake)
+
+###
+### END CMakeLists.txt
+###
diff --git a/example/out_of_core/CTestLists.cmake b/example/out_of_core/CTestLists.cmake
new file mode 100644
index 000000000..1d02adac6
--- /dev/null
+++ b/example/out_of_core/CTestLists.cmake
@@ -0,0 +1,11 @@
+#
+# Check Example out_of_core
+#
+
+set(TESTLIST
+    out_of_core
+    )
+
+foreach(test ${TESTLIST})
+    add_test(example_ooc_${test} ./${test})
+endforeach()
diff --git a/example/out_of_core/out_of_core.c b/example/out_of_core/out_of_core.c
new file mode 100644
index 000000000..254e79410
--- /dev/null
+++ b/example/out_of_core/out_of_core.c
@@ -0,0 +1,193 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2014 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file out_of_core.c
+ *
+ *  MORSE example routines
+ *  MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 1.0.0
+ * @author Florent Pruvost
+ * @date 2014-10-29
+ *
+ **/
+
+#include "out_of_core.h"
+
+/*
+ * @brief Driver routine of the example testing the out-of-core feature with StarPU
+ * @details TODO: write some details
+ */
+int main(int argc, char *argv[]) {
+
+    size_t N; // matrix order
+    int NB;   // number of rows and columns in tiles
+    int NRHS; // number of RHS vectors
+    int NCPU; // number of cores to use
+    int NGPU; // number of gpus (cuda devices) to use
+    int UPLO = MorseUpper; // triangle of A passed to the factorization and solve
+
+    /* descriptors necessary for calling MORSE tile interface  */
+    MORSE_desc_t *descA = NULL, *descAC = NULL, *descB = NULL, *descX = NULL;
+
+    /* declarations to time the program and evaluate performances */
+    double fmuls, fadds, flops, gflops, cpu_time;
+
+    /* variable to check the numerical results */
+    double anorm, bnorm, xnorm, eps, res;
+    int hres;
+
+    /* initialize some parameters with default values */
+    int iparam[IPARAM_SIZEOF];
+    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
+    init_iparam(iparam);
+
+    /* read arguments */
+    read_args(argc, argv, iparam);
+    N    = iparam[IPARAM_N];
+    NB   = iparam[IPARAM_NB];
+    NRHS = iparam[IPARAM_NRHS];
+
+    /* compute the algorithm complexity to evaluate performances */
+    fadds = (double)( FADDS_POTRF(N) + 2 * FADDS_TRSM(N,NRHS) );
+    fmuls = (double)( FMULS_POTRF(N) + 2 * FMULS_TRSM(N,NRHS) );
+    flops = 1e-9 * (fmuls + fadds);
+    gflops = 0.0;
+    cpu_time = 0.0;
+
+    /* initialize the number of threads if not given by the user in argv */
+    if ( iparam[IPARAM_THRDNBR] == -1 ) {
+        get_thread_count( &(iparam[IPARAM_THRDNBR]) );
+    }
+    NCPU = iparam[IPARAM_THRDNBR];
+    NGPU = 0;
+
+    /* print information to the user */
+    print_header( argv[0], iparam);
+
+    /* check that the o_direct disk backend can be used with this tile size */
+    if (iparam[IPARAM_OUTOFCORE] > 0) {
+        if (! will_o_direct_work(NB)) {
+            print_o_direct_wont_work();
+            return EXIT_FAILURE;
+        }
+        /* export the main memory limit given by the user, before MORSE_Init is called */
+        char maxMemoryAllowed[32];
+        snprintf (maxMemoryAllowed, sizeof(maxMemoryAllowed), "%d", iparam[IPARAM_OUTOFCORE]);
+        setenv ("STARPU_LIMIT_CPU_MEM", maxMemoryAllowed, 1);
+    }
+
+    /* Initialize MORSE with main parameters */
+    if ( MORSE_Init( NCPU, NGPU ) != MORSE_SUCCESS ) {
+        fprintf(stderr, "Error initializing MORSE library\n");
+        return EXIT_FAILURE;
+    }
+    MORSE_Set(MORSE_TILE_SIZE, NB);
+
+    /* register the disk on which StarPU can store the tiles out of core */
+    if (iparam[IPARAM_OUTOFCORE] > 0) {
+        int new_dd = starpu_disk_register (&starpu_disk_unistd_o_direct_ops,
+                                           (void*) "/tmp/starpu_ooc/", 1024*1024*10);
+        if (new_dd < 0) { /* negative value: error code, warn and keep running */
+            fprintf(stderr, "Warning: cannot register disk /tmp/starpu_ooc/ (error %d)\n", new_dd);
+        }
+    }
+
+    MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
+                           NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1,
+                           morse_getaddr_null,
+                           morse_getblkldd_ccrb,
+                           morse_getrankof_2d);
+    MORSE_Desc_Create(&descB,  NULL, MorseRealDouble,
+                      NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+    MORSE_Desc_Create(&descX,  NULL, MorseRealDouble,
+                      NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+    MORSE_Desc_Create(&descAC, NULL, MorseRealDouble,
+                      NB, NB,  NB*NB, N, N, 0, 0, N, N, 1, 1);
+
+    /* generate A matrix with random values such that it is spd */
+    MORSE_dplgsy_Tile( (double)N, descA, 51 );
+
+    /* generate RHS */
+    MORSE_dplrnt_Tile( descB, 5673 );
+
+    /* copy A before facto. in order to check the result */
+    MORSE_dlacpy_Tile(MorseUpperLower, descA, descAC);
+
+    /* copy B in X before solving
+     * same sense as memcpy(X, B, N*NRHS*sizeof(double)) but for descriptors */
+    MORSE_dlacpy_Tile(MorseUpperLower, descB, descX);
+
+    /************************************************************/
+    /* solve the system AX = B using the Cholesky factorization */
+    /************************************************************/
+
+    cpu_time = -cWtime();
+
+    /* Cholesky factorization:
+     * A is replaced by its factorization L or L^T depending on uplo */
+    MORSE_dpotrf_Tile( UPLO, descA );
+
+    /* Solve:
+     * B is stored in X on entry, X contains the result on exit.
+     * Forward and back substitutions
+     */
+    MORSE_dpotrs_Tile( UPLO, descA, descX );
+
+    cpu_time += cWtime();
+
+    /* print information to the user */
+    gflops = flops / cpu_time;
+    printf( "%9.3f %9.2f\n", cpu_time, gflops);
+    fflush( stdout );
+
+    /************************************************************/
+    /* check if solve is correct i.e. AX-B = 0                  */
+    /************************************************************/
+
+    /* compute norms to check the result */
+    anorm = MORSE_dlange_Tile( MorseInfNorm, descAC);
+    bnorm = MORSE_dlange_Tile( MorseInfNorm, descB);
+    xnorm = MORSE_dlange_Tile( MorseInfNorm, descX);
+
+    /* compute A*X-B, store the result in B */
+    MORSE_dgemm_Tile( MorseNoTrans, MorseNoTrans,
+                      1.0, descAC, descX, -1.0, descB );
+    res = MORSE_dlange_Tile( MorseInfNorm, descB );
+
+    /* check residual and print a message */
+    eps = LAPACKE_dlamch_work( 'e' );
+
+    /*
+     * if hres = 0 then the test succeeded
+     * else the test failed
+     */
+    hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 );
+    printf( "   ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN\n");
+    printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e %s \n",
+        res, anorm, xnorm, bnorm,
+        res / N / eps / (anorm * xnorm + bnorm ),
+        hres ? "FAILURE" : "SUCCESS");
+
+    /* free descriptors descA, descB, descX, descAC */
+    MORSE_Desc_Destroy( &descA );
+    MORSE_Desc_Destroy( &descB );
+    MORSE_Desc_Destroy( &descX );
+    MORSE_Desc_Destroy( &descAC );
+
+    /* Finalize MORSE */
+    MORSE_Finalize();
+
+    /* report the outcome of the numerical check through the exit status */
+    return hres ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/example/out_of_core/out_of_core.h b/example/out_of_core/out_of_core.h
new file mode 100644
index 000000000..dcb9e9702
--- /dev/null
+++ b/example/out_of_core/out_of_core.h
@@ -0,0 +1,272 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file out_of_core.h
+ *
+ *  MORSE example routines
+ *  MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 1.0.0
+ * @author Florent Pruvost
+ * @date 2016-08-23
+ *
+ **/
+
+#ifndef OOC_H
+#define OOC_H
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#if defined( _WIN32 ) || defined( _WIN64 )
+#define int64_t __int64
+#endif
+
+/* Define these so that the Microsoft VC compiler stops complaining
+   about scanf and friends */
+#define _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_WARNINGS
+
+#if defined( _WIN32 ) || defined( _WIN64 )
+#include <windows.h>
+#else  /* Non-Windows */
+#include <unistd.h>
+#include <sys/resource.h>
+#endif
+
+#include <starpu.h>
+#include "coreblas/include/lapacke.h"
+#include "morse.h"
+#include "control/common.h"
+
+/* Common functions for all steps of the tutorial */
+
+static void get_thread_count(int *thrdnbr) { /* out: number of processors to use */
+#if defined( _WIN32 ) || defined( _WIN64 ) /* same platform test as the includes above */
+    sscanf( getenv( "NUMBER_OF_PROCESSORS" ), "%d", thrdnbr );
+#else
+    *thrdnbr = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+}
+
+static int startswith(const char *s, const char *prefix) {
+    /* 1 when the first strlen(prefix) characters of s equal prefix, 0 otherwise */
+    const size_t prefix_len = strlen( prefix );
+    const int differs = strncmp( s, prefix, prefix_len );
+    return differs == 0;
+}
+
+
+/* define complexity of algorithms - see Lawn 41 page 120 */
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+#define FMULS_TRSM(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRSM(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+
+/* define some tools to time the program */
+#if defined( _WIN32 ) || defined( _WIN64 )
+#include <windows.h>
+#include <time.h>
+#include <sys/timeb.h>
+#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
+#define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
+#else
+#define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
+#endif
+
+struct timezone
+{
+    int  tz_minuteswest; /* minutes W of Greenwich */
+    int  tz_dsttime;     /* type of dst correction */
+};
+
+int gettimeofday(struct timeval* tv, struct timezone* tz) /* fills *tv and/or *tz when non-NULL, returns 0 */
+{
+    FILETIME         ft;
+    unsigned __int64 tmpres = 0;
+    static int       tzflag; /* non-zero once _tzset() has been called */
+
+    if (NULL != tv)
+        {
+            GetSystemTimeAsFileTime(&ft);
+            tmpres |=  ft.dwHighDateTime; /* assemble the 64-bit value from its two halves */
+            tmpres <<= 32;
+            tmpres |=  ft.dwLowDateTime;
+
+            /* convert the file time to the Unix epoch */
+            tmpres /= 10;  /* convert into microseconds */
+            tmpres -= DELTA_EPOCH_IN_MICROSECS;
+
+            tv->tv_sec  = (long)(tmpres / 1000000UL);
+            tv->tv_usec = (long)(tmpres % 1000000UL);
+        }
+    if (NULL != tz)
+        {
+            if (!tzflag)
+                {
+                    _tzset();
+                    tzflag++;
+                }
+            tz->tz_minuteswest = _timezone / 60; /* _timezone is divided by 60 to get minutes */
+            tz->tz_dsttime     = _daylight;
+        }
+    return 0;
+}
+
+#else  /* Non-Windows */
+#include <sys/time.h>
+#endif
+
+/*
+ * Wall-clock time in seconds read with gettimeofday(); static because it is defined in a header
+ */
+static double cWtime(void)
+{
+    struct timeval tp;
+    gettimeofday( &tp, NULL );
+    return (double)tp.tv_sec + 1e-6 * (double)tp.tv_usec;
+}
+
+/* Integer parameters: indices into the iparam array of the example */
+enum iparam_ooc {
+    IPARAM_THRDNBR,        /* Number of cores (-1: detected at run time) */
+    IPARAM_N,              /* Order of the matrix (rows and columns)     */
+    IPARAM_NB,             /* Number of rows and columns in a tile       */
+    IPARAM_NRHS,           /* Number of RHS                              */
+    IPARAM_OUTOFCORE,      /* if > 0 --> MiB of main memory allowed      */
+                           /* else --> do not use ooc.                   */
+    /* End: number of parameters, must stay last */
+    IPARAM_SIZEOF
+};
+
+/* Specific routines */
+
+/******************************************************************************
+ * Fill iparam with the default value of every integer parameter
+ */
+static void init_iparam(int iparam[IPARAM_SIZEOF]){
+    static const int defaults[IPARAM_SIZEOF] = {
+        [IPARAM_THRDNBR]   = -1,   [IPARAM_N]    = 500,
+        [IPARAM_NB]        = 128,  [IPARAM_NRHS] = 1,
+        [IPARAM_OUTOFCORE] = 2000 };
+    memcpy( iparam, defaults, sizeof(defaults) );
+}
+
+/******************************************************************************
+ * Print how to use the program (options and their default values) on stdout
+ */
+static void show_help(char *prog_name) {
+    printf( "Usage:\n%s [options]\n\n", prog_name );
+    printf( "Options are:\n"
+            "  --help           Show this help\n"
+            "\n"
+            "  --n=X            dimension (N). (default: 500)\n"
+            "  --nb=X           NB size. (default: 128)\n"
+            "  --nrhs=X         number of RHS. (default: 1)\n"
+            "\n"
+            "  --threads=X      Number of CPU workers (default: _SC_NPROCESSORS_ONLN)\n"
+            "  --ooc=N          Allow to store N MiB in main memory, N <= 0 disables ooc. (default: 2000)\n"
+            "\n");
+}
+
+/******************************************************************************
+ * Read arguments following ooc program call (help flag or "--<name>=<integer>")
+ */
+static void read_args(int argc, char *argv[], int *iparam){
+    static const struct { const char *prefix; int index; } options[] = {
+        { "--n=",       IPARAM_N         },
+        { "--nb=",      IPARAM_NB        },
+        { "--nrhs=",    IPARAM_NRHS      },
+        { "--threads=", IPARAM_THRDNBR   },
+        { "--ooc=",     IPARAM_OUTOFCORE } };
+    const int nopts = (int)(sizeof(options) / sizeof(options[0]));
+    int i, k;
+    for (i = 1; i < argc && argv[i]; ++i) {
+        if ( startswith( argv[i], "--h") || startswith( argv[i], "-h") ) { /* also --help, -help */
+            show_help( argv[0] );
+            exit(0);
+        }
+        /* find the first known prefix matching this argument; k == nopts when none does */
+        for (k = 0; k < nopts && !startswith( argv[i], options[k].prefix ); ++k) {}
+        if (k == nopts)
+            fprintf( stderr, "Unknown option: %s\n", argv[i] );
+        else if (sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[options[k].index]) ) != 1)
+            fprintf( stderr, "Invalid value in option: %s\n", argv[i] ); /* entry left unchanged */
+    }
+}
+
+/******************************************************************************
+ * Print on stdout a summary of the main parameters and the start of the result line
+ */
+static void print_header(char *prog_name, int * iparam) {
+#if defined(CHAMELEON_SIMULATION)
+    double    eps = 0.;
+#else
+    double    eps = LAPACKE_dlamch_work( 'e' );
+#endif
+
+    printf( "#\n"
+            "# CHAMELEON %d.%d.%d, %s\n"
+            "# Nb threads: %d\n"
+            "# Nb gpus:    %d\n"
+            "# N:          %d\n"
+            "# NB:         %d\n"
+            "# IB:         %d\n"
+            "# eps:        %e\n"
+            "# ooc:        %d\n"
+            "#\n",
+            CHAMELEON_VERSION_MAJOR,
+            CHAMELEON_VERSION_MINOR,
+            CHAMELEON_VERSION_MICRO,
+            prog_name,
+            iparam[IPARAM_THRDNBR],
+            0, /* "Nb gpus" is a constant here, not read from iparam */
+            iparam[IPARAM_N],
+            iparam[IPARAM_NB],
+            32, /* "IB" is a constant here, not read from iparam */
+            eps,
+            iparam[IPARAM_OUTOFCORE]);
+
+    printf( "#      M       N  K/NRHS   seconds   Gflop/s\n");
+    /* N is printed for both M and N; no newline is written after these three values */
+    printf( "#%7d %7d %7d ", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_NRHS]);
+    fflush( stdout );
+    return;
+}
+
+// Check that a full nb x nb tile of doubles occupies a multiple of 4096 bytes
+static int
+will_o_direct_work(int nb) {
+    if (nb <= 0) // not a valid tile size
+        return 0;
+    return ((size_t)nb * (size_t)nb * sizeof(double)) % 4096 == 0; // size_t: no int overflow
+}
+
+/* Tell the user, on stderr, which block sizes are required when o_direct is used */
+static void print_o_direct_wont_work(void) {
+    fputs("\n[chameleon] Using out-of-core in o_direct force your blocks' size to be\n"
+          "multiples of 4096. Tip : chose 'n' and 'nb' as both multiples of 32.\n", stderr);
+}
+
+/******************************************************************************
+ *  Function returning the address of block (m,n) -> always NULL here because
+ *  the memory is directly handled by StarPU
+ **/
+static inline void* morse_getaddr_null(const MORSE_desc_t *A, int m, int n)
+{
+    return NULL;
+}
+
+#endif /* OOC_H */
diff --git a/include/runtime.h b/include/runtime.h
index 38b2ed213..6cd6c2a14 100644
--- a/include/runtime.h
+++ b/include/runtime.h
@@ -60,18 +60,18 @@ void  RUNTIME_comm_size          (int*);
 /*******************************************************************************
  * RUNTIME Descriptor
  **/
-void* RUNTIME_mat_alloc      (size_t);
-void  RUNTIME_mat_free       (void*, size_t);
-void  RUNTIME_desc_init      (MORSE_desc_t*);
-void  RUNTIME_desc_create    (MORSE_desc_t*);
-void  RUNTIME_desc_destroy   (MORSE_desc_t*);
-void  RUNTIME_desc_submatrix (MORSE_desc_t*);
-void* RUNTIME_desc_getaddr   (MORSE_desc_t*, int, int);
+void* RUNTIME_mat_alloc        (size_t);
+void  RUNTIME_mat_free         (void*, size_t);
+void  RUNTIME_desc_init        (MORSE_desc_t*);
+void  RUNTIME_desc_create      (MORSE_desc_t*);
+void  RUNTIME_desc_destroy     (MORSE_desc_t*);
+void  RUNTIME_desc_submatrix   (MORSE_desc_t*);
+void* RUNTIME_desc_getaddr     (MORSE_desc_t*, int, int);
 /* Acquire in main memory an up-to-date copy of the data described by the descriptor for read-write access. */
-int   RUNTIME_desc_acquire   (MORSE_desc_t*);
+int   RUNTIME_desc_acquire     (MORSE_desc_t*);
 /* Release the data described by the descriptor to be used by the StarPU tasks again. */
-int   RUNTIME_desc_release   (MORSE_desc_t*);
-int   RUNTIME_desc_getoncpu  (MORSE_desc_t*);
+int   RUNTIME_desc_release     (MORSE_desc_t*);
+int   RUNTIME_desc_getoncpu    (MORSE_desc_t*);
 
 /*******************************************************************************
  * RUNTIME Options
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index b7926f2ae..313ceff09 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -56,7 +56,7 @@ void *RUNTIME_mat_alloc( size_t size)
 #else
     void *mat;
 
-    if (starpu_malloc_flags(&mat, size, STARPU_MALLOC_PINNED|FOLDED) != 0)
+    if (starpu_malloc_flags(&mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0)
         return NULL;
     return mat;
 #endif
@@ -67,7 +67,7 @@ void RUNTIME_mat_free( void *mat, size_t size)
 #if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
     return (void*) 1;
 #else
-    starpu_free_flags(mat, size, STARPU_MALLOC_PINNED|FOLDED);
+    starpu_free_flags(mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT);
 #endif
 }
 
@@ -297,9 +297,16 @@ void *RUNTIME_desc_getaddr( MORSE_desc_t *desc, int m, int n )
         int tempnn = (n == desc->lnt-1) ? (desc->ln - n * desc->nb) : desc->nb;
 
         if ( myrank == owner ) {
-            starpu_matrix_data_register(ptrtile, 0,
-                                        (uintptr_t)desc->get_blkaddr(desc, m, n),
-                                        BLKLDD(desc, m), tempmm, tempnn, eltsze);
+            if ( desc->get_blkaddr(desc, m, n) == (void*)NULL ) {
+                starpu_matrix_data_register(ptrtile, -1,
+                                            (uintptr_t) NULL,
+                                            BLKLDD(desc, m), tempmm, tempnn, eltsze);
+            }
+            else {
+                starpu_matrix_data_register(ptrtile, 0,
+                                            (uintptr_t)desc->get_blkaddr(desc, m, n),
+                                            BLKLDD(desc, m), tempmm, tempnn, eltsze);
+            }
         }
         else {
             starpu_matrix_data_register(ptrtile, -1,
-- 
GitLab