Subject: [PATCH] chameleon: add an out_of_core example; this requires handling
 the case where StarPU itself allocates the tiles -> we give a NULL
 pointer through the get_blkaddr function for registering

 create mode 100644 example/out_of_core/out_of_core.h

+        add_subdirectory(out_of_core)
+    endif()
+    endif()
-    message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_morse"
-    "tutorial cannot be built (use only double arithmetic precision).\n"
-    "Please set CHAMELEON_PREC_D to ON if you want to build executables of"
-    "this tutorial.")
+    message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_morse "
+    "and out_core tutorials cannot be built (use only double arithmetic "
+    "precision).\n Please set CHAMELEON_PREC_D to ON if you want to build "
+    "executables of this tutorial.")
-# Check Example basic_zposv
+# Check Example lapack_to_morse
 int main(int argc, char *argv[]) {
-    size_t N;    // matrix order
+    size_t N; // matrix order
     int NB;   // number of rows and columns in tiles
     int NRHS; // number of RHS vectors
     int NCPU; // number of cores to use
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2016 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+# @file CMakeLists.txt
+#  MORSE example routines
+#  MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+#  University of Bordeaux, Bordeaux INP
+#  @version 1.0.0
+#  @author Florent Pruvost
+#  @date 2016-08-23
+# list of sources
+    out_of_core.c
+   )
+# Define what libraries we have to link with
+# ------------------------------------------
+list(APPEND libs_for_ooc
+     chameleon
+     chameleon_starpu
+        list(APPEND libs_for_ooc
+        cudablas)
+    endif()
+        list(APPEND libs_for_ooc
+             ${CUDA_LIBRARIES}
+        )
+        link_directories(${CUDA_LIBRARY_DIRS})
+    endif()
+        list(APPEND libs_for_ooc
+             ${MAGMA_LIBRARIES}
+        )
+        link_directories(${MAGMA_LIBRARY_DIRS})
+    endif()
+    list(APPEND libs_for_ooc
+         coreblas
+         ${CBLAS_LIBRARIES}
+         ${HWLOC_LIBRARIES}
+         ${EXTRA_LIBRARIES}
+    )
+    link_directories(${LAPACKE_LIBRARY_DIRS})
+    link_directories(${LAPACK_LIBRARY_DIRS})
+    link_directories(${CBLAS_LIBRARY_DIRS})
+    link_directories(${BLAS_LIBRARY_DIRS})
+    list(APPEND libs_for_ooc
+         coreblas
+         simulapacke
+         simucblas
+         ${HWLOC_LIBRARIES}
+         ${EXTRA_LIBRARIES}
+    )
+# message(STATUS "libs examples: ${libs_for_ooc}")
+foreach(_ooc ${OOC_SOURCES})
+    get_filename_component(_name_exe ${_ooc} NAME_WE)
+    add_executable(${_name_exe} ${_ooc})
+    set_property(TARGET ${_name_exe} PROPERTY LINKER_LANGUAGE Fortran)
+    target_link_libraries(${_name_exe} ${libs_for_ooc})
+    install(TARGETS ${_name_exe}
+            DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/chameleon/example/out_of_core)
+#-------- Tests ---------
+### END CMakeLists.txt
+# Check Example out_of_core
+    out_of_core
+    )
+foreach(test ${TESTLIST})
+    add_test(example_ooc_${test} ./${test})
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2014 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+ *
+ * @file out_of_core.c
+ *
+ *  MORSE example routines
+ *  MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 1.0.0
+ * @author Florent Pruvost
+ * @date 2014-10-29
+ *
+ **/
+#include "out_of_core.h"
+ * @brief ooc is a driver example routine that tests the out-of-core feature with StarPU
+ * @details TODO: write some details
+ */
+int main(int argc, char *argv[]) {
+    size_t N; // matrix order
+    int NB;   // number of rows and columns in tiles
+    int NRHS; // number of RHS vectors
+    int NCPU; // number of cores to use
+    int NGPU; // number of gpus (cuda devices) to use
+    int UPLO = MorseUpper; // where is stored L
+    /* descriptors necessary for calling MORSE tile interface  */
+    MORSE_desc_t *descA = NULL, *descAC = NULL, *descB = NULL, *descX = NULL;
+    /* declarations to time the program and evaluate performances */
+    double fmuls, fadds, flops, gflops, cpu_time;
+    /* variable to check the numerical results */
+    double anorm, bnorm, xnorm, eps, res;
+    int hres;
+    /* initialize some parameters with default values */
+    int iparam[IPARAM_SIZEOF];
+    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
+    init_iparam(iparam);
+    /* read arguments */
+    read_args(argc, argv, iparam);
+    N    = iparam[IPARAM_N];
+    NB   = iparam[IPARAM_NB];
+    NRHS = iparam[IPARAM_NRHS];
+    /* compute the algorithm complexity to evaluate performances */
+    fadds = (double)( FADDS_POTRF(N) + 2 * FADDS_TRSM(N,NRHS) );
+    fmuls = (double)( FMULS_POTRF(N) + 2 * FMULS_TRSM(N,NRHS) );
+    flops = 1e-9 * (fmuls + fadds);
+    gflops = 0.0;
+    cpu_time = 0.0;
+    /* initialize the number of threads if not given by the user in argv */
+    if ( iparam[IPARAM_THRDNBR] == -1 ) {
+        get_thread_count( &(iparam[IPARAM_THRDNBR]) );
+    }
+    NCPU = iparam[IPARAM_THRDNBR];
+    NGPU = 0;
+    /* print information to the user */
+    print_header( argv[0], iparam);
+    /* check that O_DIRECT will work with this tile size */
+    if (iparam[IPARAM_OUTOFCORE] > 0) {
+        if (! will_o_direct_work(NB)) {
+            print_o_direct_wont_work();
+            return EXIT_FAILURE;
+        }
+        char maxMemoryAllowed[32];
+        sprintf (maxMemoryAllowed, "%d", iparam[IPARAM_OUTOFCORE]);
+        setenv ("STARPU_LIMIT_CPU_MEM", maxMemoryAllowed, 1);
+    }
+     /* Initialize MORSE with main parameters */
+    if ( MORSE_Init( NCPU, NGPU ) != MORSE_SUCCESS ) {
+        fprintf(stderr, "Error initializing MORSE library\n");
+        return EXIT_FAILURE;
+    }
+    /* register the disk that backs the out-of-core storage */
+    if (iparam[IPARAM_OUTOFCORE] > 0) {
+        int new_dd = starpu_disk_register (&starpu_disk_unistd_o_direct_ops,
+                                           (void*) "/tmp/starpu_ooc/", 1024*1024*10);
+    }
+    MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
+                           NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1,
+                           morse_getaddr_null,
+                           morse_getblkldd_ccrb,
+                           morse_getrankof_2d);
+    MORSE_Desc_Create(&descB,  NULL, MorseRealDouble,
+                      NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+    MORSE_Desc_Create(&descX,  NULL, MorseRealDouble,
+                      NB, NB,  NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
+    MORSE_Desc_Create(&descAC, NULL, MorseRealDouble,
+                      NB, NB,  NB*NB, N, N, 0, 0, N, N, 1, 1);
+    /* generate A matrix with random values such that it is spd */
+    MORSE_dplgsy_Tile( (double)N, descA, 51 );
+    /* generate RHS */
+    MORSE_dplrnt_Tile( descB, 5673 );
+    /* copy A before facto. in order to check the result */
+    MORSE_dlacpy_Tile(MorseUpperLower, descA, descAC);
+    /* copy B in X before solving
+     * same sense as memcpy(X, B, N*NRHS*sizeof(double)) but for descriptors */
+    MORSE_dlacpy_Tile(MorseUpperLower, descB, descX);
+    /************************************************************/
+    /* solve the system AX = B using the Cholesky factorization */
+    /************************************************************/
+    cpu_time = -cWtime();
+    /* Cholesky factorization:
+     * A is replaced by its factorization L or L^T depending on uplo */
+    MORSE_dpotrf_Tile( UPLO, descA );
+    /* Solve:
+     * B is stored in X on entry, X contains the result on exit.
+     * Forward and back substitutions
+     */
+    MORSE_dpotrs_Tile( UPLO, descA, descX );
+    cpu_time += cWtime();
+    /* print information to the user */
+    gflops = flops / cpu_time;
+    printf( "%9.3f %9.2f\n", cpu_time, gflops);
+    fflush( stdout );
+    /************************************************************/
+    /* check if solve is correct i.e. AX-B = 0                  */
+    /************************************************************/
+    /* compute norms to check the result */
+    anorm = MORSE_dlange_Tile( MorseInfNorm, descAC);
+    bnorm = MORSE_dlange_Tile( MorseInfNorm, descB);
+    xnorm = MORSE_dlange_Tile( MorseInfNorm, descX);
+    /* compute A*X-B, store the result in B */
+    MORSE_dgemm_Tile( MorseNoTrans, MorseNoTrans,
+                      1.0, descAC, descX, -1.0, descB );
+    res = MORSE_dlange_Tile( MorseInfNorm, descB );
+    /* check residual and print a message */
+    eps = LAPACKE_dlamch_work( 'e' );
+    /*
+     * if hres = 0 then the test succeeded
+     * else the test failed
+     */
+    hres = 0;
+    hres = ( res / N / eps / (anorm * xnorm + bnorm ) > 100.0 );
+    printf( "   ||Ax-b||       ||A||       ||x||       ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||)  RETURN\n");
+    if (hres)
+        printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e FAILURE \n",
+            res, anorm, xnorm, bnorm,
+            res / N / eps / (anorm * xnorm + bnorm ));
+    else
+        printf( "%8.5e %8.5e %8.5e %8.5e                       %8.5e SUCCESS \n",
+            res, anorm, xnorm, bnorm,
+            res / N / eps / (anorm * xnorm + bnorm ));
+    /* free descriptors descA, descB, descX, descAC */
+    MORSE_Desc_Destroy( &descA );
+    MORSE_Desc_Destroy( &descB );
+    MORSE_Desc_Destroy( &descX );
+    MORSE_Desc_Destroy( &descAC );
+    /* Finalize MORSE */
+    MORSE_Finalize();
+    return EXIT_SUCCESS;
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+ *
+ * @file out_of_core.h
+ *
+ *  MORSE example routines
+ *  MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
+ *  University of Bordeaux, Bordeaux INP
+ *
+ * @version 1.0.0
+ * @author Florent Pruvost
+ * @date 2016-08-23
+ *
+ **/
+#ifndef OOC_H
+#define OOC_H
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#if defined( _WIN32 ) || defined( _WIN64 )
+#define int64_t __int64
+/* Define these so that the Microsoft VC compiler stops complaining
+   about scanf and friends */
+#if defined( _WIN32 ) || defined( _WIN64 )
+#include <windows.h>
+#else  /* Non-Windows */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <starpu.h>
+#include "coreblas/include/lapacke.h"
+#include "morse.h"
+#include "control/common.h"
+/* Common functions for all steps of the tutorial */
+static void get_thread_count(int *thrdnbr) {
+#if defined WIN32 || defined WIN64
+    sscanf( getenv( "NUMBER_OF_PROCESSORS" ), "%d", thrdnbr );
+    *thrdnbr = sysconf(_SC_NPROCESSORS_ONLN);
+static int startswith(const char *s, const char *prefix) {
+    size_t n = strlen( prefix );
+    if (strncmp( s, prefix, n ))
+        return 0;
+    return 1;
+/* define complexity of algorithms - see Lawn 41 page 120 */
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+#define FMULS_TRSM(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRSM(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+/* define some tools to time the program */
+#if defined( _WIN32 ) || defined( _WIN64 )
+#include <windows.h>
+#include <time.h>
+#include <sys/timeb.h>
+#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
+#define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
+#define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
+struct timezone
+    int  tz_minuteswest; /* minutes W of Greenwich */
+    int  tz_dsttime;     /* type of dst correction */
+int gettimeofday(struct timeval* tv, struct timezone* tz)
+    FILETIME         ft;
+    unsigned __int64 tmpres = 0;
+    static int       tzflag;
+    if (NULL != tv)
+        {
+            GetSystemTimeAsFileTime(&ft);
+            tmpres |=  ft.dwHighDateTime;
+            tmpres <<= 32;
+            tmpres |=  ft.dwLowDateTime;
+            /*converting file time to unix epoch*/
+            tmpres /= 10;  /*convert into microseconds*/
+            tmpres -= DELTA_EPOCH_IN_MICROSECS;
+            tv->tv_sec  = (long)(tmpres / 1000000UL);
+            tv->tv_usec = (long)(tmpres % 1000000UL);
+        }
+    if (NULL != tz)
+        {
+            if (!tzflag)
+                {
+                    _tzset();
+                    tzflag++;
+                }
+            tz->tz_minuteswest = _timezone / 60;
+            tz->tz_dsttime     = _daylight;
+        }
+    return 0;
+#else  /* Non-Windows */
+#include <sys/time.h>
+ * struct timeval {time_t tv_sec; suseconds_t tv_usec;};
+ */
+double cWtime(void)
+    struct timeval tp;
+    gettimeofday( &tp, NULL );
+    return tp.tv_sec + 1e-6 * tp.tv_usec;
+/* Integer parameters */
+enum iparam_ooc {
+    IPARAM_THRDNBR,        /* Number of cores                            */
+    IPARAM_N,              /* Number of columns of the matrix            */
+    IPARAM_NB,             /* Number of columns in a tile                */
+    IPARAM_NRHS,           /* Number of RHS                              */
+    IPARAM_OUTOFCORE,      /* if > 0 --> how much memory allowed in core */
+                           /* else --> do not use ooc.                   */
+    /* End */
+/* Specific routines */
+ * Initialize integer parameters
+ */
+static void init_iparam(int iparam[IPARAM_SIZEOF]){ /* store the default value of every integer parameter in iparam */
+    iparam[IPARAM_THRDNBR       ] = -1;   /* -1: main() then calls get_thread_count() to fill it in */
+    iparam[IPARAM_N             ] = 500;  /* matrix order */
+    iparam[IPARAM_NB            ] = 128;  /* number of rows and columns in a tile */
+    iparam[IPARAM_NRHS          ] = 1;    /* number of right-hand-side vectors */
+    iparam[IPARAM_OUTOFCORE     ] = 2000; /* > 0: out-of-core enabled; main() exports this value as STARPU_LIMIT_CPU_MEM */
+ }
+ * Print how to use the program
+ */
+static void show_help(char *prog_name) {
+    printf( "Usage:\n%s [options]\n\n", prog_name );
+    printf( "Options are:\n"
+            "  --help           Show this help\n"
+            "\n"
+            "  --n=X            dimension (N). (default: 500)\n"
+            "  --nb=X           NB size. (default: 128)\n"
+            "  --nrhs=X         number of RHS. (default: 1)\n"
+            "\n"
+            "  --threads=X      Number of CPU workers (default: _SC_NPROCESSORS_ONLN)\n"
+            "  --ooc=N          Allow to store N MiB in main memory. (default: )\n"
+            "\n");
+ * Read arguments following ooc program call
+ */
+static void read_args(int argc, char *argv[], int *iparam){
+    int i;
+    for (i = 1; i < argc && argv[i]; ++i) {
+        if ( startswith( argv[i], "--help") || startswith( argv[i], "-help") ||
+             startswith( argv[i], "--h") || startswith( argv[i], "-h") ) {
+            show_help( argv[0] );
+            exit(0);
+        } else if (startswith( argv[i], "--n=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_N]) );
+        } else if (startswith( argv[i], "--nb=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NB]) );
+        } else if (startswith( argv[i], "--nrhs=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_NRHS]) );
+        } else if (startswith( argv[i], "--threads=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_THRDNBR]) );
+        } else if (startswith( argv[i], "--ooc=" )) {
+            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[IPARAM_OUTOFCORE]) );
+        } else {
+            fprintf( stderr, "Unknown option: %s\n", argv[i] );
+        }
+    }
+ * Print a header message to summarize main parameters
+ */
+static void print_header(char *prog_name, int * iparam) {
+    double    eps = 0.;
+    double    eps = LAPACKE_dlamch_work( 'e' );
+    printf( "#\n"
+            "# CHAMELEON %d.%d.%d, %s\n"
+            "# Nb threads: %d\n"
+            "# Nb gpus:    %d\n"
+            "# N:          %d\n"
+            "# NB:         %d\n"
+            "# IB:         %d\n"
+            "# eps:        %e\n"
+            "# ooc:        %d\n"
+            "#\n",
+            prog_name,
+            iparam[IPARAM_THRDNBR],
+            0,
+            iparam[IPARAM_N],
+            iparam[IPARAM_NB],
+            32,
+            eps,
+            iparam[IPARAM_OUTOFCORE]);
+    printf( "#      M       N  K/NRHS   seconds   Gflop/s\n");
+    printf( "#%7d %7d %7d ", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_NRHS]);
+    fflush( stdout );
+    return;
+// Check that the size of a block (tile) is a multiple of 4096 bytes
+static int
+will_o_direct_work(int nb) {
+    if ((nb * nb * sizeof(float)) % 4096 != 0)
+        return 0;
+    return 1;
+static void
+print_o_direct_wont_work(void) {
+    fprintf(stderr, "\n[chameleon] Using out-of-core in o_direct force your blocks' size to be\n"
+                    "multiples of 4096. Tip : chose 'n' and 'nb' as both multiples of 32.\n");
+ *  Function to return the address of block (m,n) -> here NULL because memory
+ *  is handled directly by StarPU
+ **/
+inline static void* morse_getaddr_null(const MORSE_desc_t *A, int m, int n)
+    return (void*)( NULL );
+#endif /* OOC_H */
  * RUNTIME Descriptor
-void* RUNTIME_mat_alloc      (size_t);
-void  RUNTIME_mat_free       (void*, size_t);
-void  RUNTIME_desc_init      (MORSE_desc_t*);
-void  RUNTIME_desc_create    (MORSE_desc_t*);
-void  RUNTIME_desc_destroy   (MORSE_desc_t*);
-void  RUNTIME_desc_submatrix (MORSE_desc_t*);
-void* RUNTIME_desc_getaddr   (MORSE_desc_t*, int, int);
+void* RUNTIME_mat_alloc        (size_t);
+void  RUNTIME_mat_free         (void*, size_t);
+void  RUNTIME_desc_init        (MORSE_desc_t*);
+void  RUNTIME_desc_create      (MORSE_desc_t*);
+void  RUNTIME_desc_destroy     (MORSE_desc_t*);
+void  RUNTIME_desc_submatrix   (MORSE_desc_t*);
+void* RUNTIME_desc_getaddr     (MORSE_desc_t*, int, int);
 /* Acquire in main memory an up-to-date copy of the data described by the descriptor for read-write access. */
-int   RUNTIME_desc_acquire   (MORSE_desc_t*);
+int   RUNTIME_desc_acquire     (MORSE_desc_t*);
 /* Release the data described by the descriptor to be used by the StarPU tasks again. */
-int   RUNTIME_desc_release   (MORSE_desc_t*);
-int   RUNTIME_desc_getoncpu  (MORSE_desc_t*);
+int   RUNTIME_desc_release     (MORSE_desc_t*);
+int   RUNTIME_desc_getoncpu    (MORSE_desc_t*);
  * RUNTIME Options
     void *mat;
-    if (starpu_malloc_flags(&mat, size, STARPU_MALLOC_PINNED|FOLDED) != 0)
+    if (starpu_malloc_flags(&mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0)
         return NULL;
     return mat;
-    starpu_free_flags(mat, size, STARPU_MALLOC_PINNED|FOLDED);
+    starpu_free_flags(mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT);
@@ -297,9 +297,16 @@ void *RUNTIME_desc_getaddr( MORSE_desc_t *desc, int m, int n )
         int tempnn = (n == desc->lnt-1) ? (desc->ln - n * desc->nb) : desc->nb;
         if ( myrank == owner ) {
-            starpu_matrix_data_register(ptrtile, 0,
-                                        (uintptr_t)desc->get_blkaddr(desc, m, n),
-                                        BLKLDD(desc, m), tempmm, tempnn, eltsze);
+            if ( desc->get_blkaddr(desc, m, n) == (void*)NULL ) {
+                starpu_matrix_data_register(ptrtile, -1,
+                                            (uintptr_t) NULL,
+                                            BLKLDD(desc, m), tempmm, tempnn, eltsze);
+            }
+            else {
+                starpu_matrix_data_register(ptrtile, 0,
+                                            (uintptr_t)desc->get_blkaddr(desc, m, n),
+                                            BLKLDD(desc, m), tempmm, tempnn, eltsze);
+            }
         else {
             starpu_matrix_data_register(ptrtile, -1,