Commit 764be57d authored by PRUVOST Florent's avatar PRUVOST Florent

chameleon: add an out_of_core example, which requires handling the case where StarPU...

chameleon: add an out_of_core example. This requires handling the case where StarPU itself handles the allocation of tiles -> we give a NULL pointer through the get_blkaddr function when registering the tiles
parent 8aee6e96
......@@ -28,11 +28,14 @@ add_subdirectory(basic_zposv)
if (CHAMELEON_PREC_D)
add_subdirectory(lapack_to_morse)
if (CHAMELEON_SCHED_STARPU)
add_subdirectory(out_of_core)
endif()
else()
message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_morse"
"tutorial cannot be built (use only double arithmetic precision).\n"
"Please set CHAMELEON_PREC_D to ON if you want to build executables of"
"this tutorial.")
message(WARNING "CHAMELEON_PREC_D is set to OFF so that lapack_to_morse "
"and out_core tutorials cannot be built (use only double arithmetic "
"precision).\n Please set CHAMELEON_PREC_D to ON if you want to build "
"executables of this tutorial.")
endif()
###
......
#
# Check Example basic_zposv
# Check Example lapack_to_morse
#
set(TESTLIST
......
......@@ -34,7 +34,7 @@
*/
int main(int argc, char *argv[]) {
size_t N; // matrix order
size_t N; // matrix order
int NB; // number of rows and columns in tiles
int NRHS; // number of RHS vectors
int NCPU; // number of cores to use
......
###
#
# @copyright (c) 2009-2014 The University of Tennessee and The University
# of Tennessee Research Foundation.
# All rights reserved.
# @copyright (c) 2012-2016 Inria. All rights reserved.
# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
#
###
#
# @file CMakeLists.txt
#
# MORSE example routines
# MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
# University of Bordeaux, Bordeaux INP
#
# @version 1.0.0
# @author Florent Pruvost
# @date 2016-08-23
#
###
# Build script of the out_of_core example: it collects the libraries to link
# with, builds and installs one executable per entry of OOC_SOURCES, then
# includes the CTest list of this directory.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
# list of sources
set(OOC_SOURCES
out_of_core.c
)
# Define what libraries we have to link with
# ------------------------------------------
# Start from an empty list so that nothing is inherited from a variable of the
# same name set elsewhere.
unset(libs_for_ooc)
list(APPEND libs_for_ooc
chameleon
chameleon_starpu
${STARPU_LIBRARIES_DEP}
)
link_directories(${STARPU_LIBRARY_DIRS})
# Regular (non simulation) build: link with cudablas/CUDA/MAGMA when they are
# enabled, then with coreblas and the LAPACKE/CBLAS/LAPACK/BLAS libraries.
if(NOT CHAMELEON_SIMULATION)
if(CHAMELEON_USE_CUDA OR CHAMELEON_USE_MAGMA)
list(APPEND libs_for_ooc
cudablas)
endif()
if(CHAMELEON_USE_CUDA)
list(APPEND libs_for_ooc
${CUDA_LIBRARIES}
)
link_directories(${CUDA_LIBRARY_DIRS})
endif()
if(CHAMELEON_USE_MAGMA)
list(APPEND libs_for_ooc
${MAGMA_LIBRARIES}
)
link_directories(${MAGMA_LIBRARY_DIRS})
endif()
list(APPEND libs_for_ooc
coreblas
${LAPACKE_LIBRARIES}
${CBLAS_LIBRARIES}
${LAPACK_SEQ_LIBRARIES}
${BLAS_SEQ_LIBRARIES}
${HWLOC_LIBRARIES}
${EXTRA_LIBRARIES}
)
link_directories(${LAPACKE_LIBRARY_DIRS})
link_directories(${LAPACK_LIBRARY_DIRS})
link_directories(${CBLAS_LIBRARY_DIRS})
link_directories(${BLAS_LIBRARY_DIRS})
# Simulation build: simulapacke and simucblas are linked in place of the
# LAPACKE/CBLAS/LAPACK/BLAS libraries, and no CUDA/MAGMA library is added.
else()
list(APPEND libs_for_ooc
coreblas
simulapacke
simucblas
${HWLOC_LIBRARIES}
${EXTRA_LIBRARIES}
)
endif()
link_directories(${HWLOC_LIBRARY_DIRS})
# message(STATUS "libs examples: ${libs_for_ooc}")
# One executable per source file, named after the file without its directory
# and extension (NAME_WE).
foreach(_ooc ${OOC_SOURCES})
get_filename_component(_name_exe ${_ooc} NAME_WE)
add_executable(${_name_exe} ${_ooc})
# NOTE(review): the Fortran linker is presumably forced so that the Fortran
# runtime needed by the BLAS/LAPACK libraries gets linked - confirm.
set_property(TARGET ${_name_exe} PROPERTY LINKER_LANGUAGE Fortran)
target_link_libraries(${_name_exe} ${libs_for_ooc})
install(TARGETS ${_name_exe}
DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/chameleon/example/out_of_core)
endforeach()
#-------- Tests ---------
include(CTestLists.cmake)
###
### END CMakeLists.txt
###
#
# Check Example out_of_core
#
# Each entry of TESTLIST is registered as a CTest test named
# example_ooc_<entry> whose command is ./<entry>, i.e. the executable is
# looked up relatively to the directory the test is run from.
set(TESTLIST
out_of_core
)
foreach(test ${TESTLIST})
add_test(example_ooc_${test} ./${test})
endforeach()
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2014 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @file out_of_core.c
*
* MORSE example routines
* MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
* University of Bordeaux, Bordeaux INP
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2014-10-29
*
**/
#include "out_of_core.h"
/**
 * @brief Driver example testing the out-of-core feature with StarPU.
 *
 * @details Generates a symmetric positive definite matrix A and a right-hand
 * side B, solves A X = B with the tile Cholesky factorization
 * (MORSE_dpotrf_Tile followed by MORSE_dpotrs_Tile), prints the elapsed time
 * and the Gflop/s rate, then prints the scaled residual
 * ||Ax-b|| / (N * eps * (||A|| ||x|| + ||b||)) together with SUCCESS/FAILURE.
 *
 * When iparam[IPARAM_OUTOFCORE] > 0, that value is exported through the
 * STARPU_LIMIT_CPU_MEM environment variable before MORSE is initialized, and
 * a disk is registered with StarPU on "/tmp/starpu_ooc/" once it is. The
 * descriptor of A is created with morse_getaddr_null, so the application does
 * not provide any tile address for it.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments, parsed by read_args()
 * @return EXIT_FAILURE when the tile size is rejected by will_o_direct_work(),
 *         when the environment cannot be set, when MORSE cannot be
 *         initialized or when the StarPU disk cannot be registered;
 *         EXIT_SUCCESS otherwise. The outcome of the numerical check is only
 *         printed, it does not change the return value.
 */
int main(int argc, char *argv[]) {
    size_t N;    // matrix order
    int NB;      // number of rows and columns in tiles
    int NRHS;    // number of RHS vectors
    int NCPU;    // number of cores to use
    int NGPU;    // number of gpus (cuda devices) to use
    int UPLO = MorseUpper; // where is stored L

    /* descriptors necessary for calling MORSE tile interface */
    MORSE_desc_t *descA = NULL, *descAC = NULL, *descB = NULL, *descX = NULL;

    /* declarations to time the program and evaluate performances */
    double fmuls, fadds, flops, gflops, cpu_time;

    /* variables to check the numerical results */
    double anorm, bnorm, xnorm, eps, res, scaled_res;
    int hres;

    /* initialize some parameters with default values */
    int iparam[IPARAM_SIZEOF];
    memset(iparam, 0, IPARAM_SIZEOF*sizeof(int));
    init_iparam(iparam);

    /* read arguments */
    read_args(argc, argv, iparam);
    N    = iparam[IPARAM_N];
    NB   = iparam[IPARAM_NB];
    NRHS = iparam[IPARAM_NRHS];

    /* compute the algorithm complexity to evaluate performances */
    fadds = (double)( FADDS_POTRF(N) + 2 * FADDS_TRSM(N,NRHS) );
    fmuls = (double)( FMULS_POTRF(N) + 2 * FMULS_TRSM(N,NRHS) );
    flops = 1e-9 * (fmuls + fadds);
    gflops = 0.0;
    cpu_time = 0.0;

    /* initialize the number of thread if not given by the user in argv */
    if ( iparam[IPARAM_THRDNBR] == -1 ) {
        get_thread_count( &(iparam[IPARAM_THRDNBR]) );
    }
    NCPU = iparam[IPARAM_THRDNBR];
    NGPU = 0;

    /* print informations to user */
    print_header( argv[0], iparam);

    /* check that o_direct will work, then limit the ram memory: this has to
     * be done before MORSE_Init() since it goes through the environment */
    if (iparam[IPARAM_OUTOFCORE] > 0) {
        char maxMemoryAllowed[32];

        if (! will_o_direct_work(NB)) {
            print_o_direct_wont_work();
            return EXIT_FAILURE;
        }
        /* bounded formatting: never writes past maxMemoryAllowed */
        snprintf (maxMemoryAllowed, sizeof maxMemoryAllowed, "%d",
                  iparam[IPARAM_OUTOFCORE]);
        if (setenv ("STARPU_LIMIT_CPU_MEM", maxMemoryAllowed, 1) != 0) {
            perror("setenv(STARPU_LIMIT_CPU_MEM)");
            return EXIT_FAILURE;
        }
    }

    /* Initialize MORSE with main parameters */
    if ( MORSE_Init( NCPU, NGPU ) != MORSE_SUCCESS ) {
        fprintf(stderr, "Error initializing MORSE library\n");
        return EXIT_FAILURE;
    }
    MORSE_Set(MORSE_TILE_SIZE, NB);

    /* register the disk used by StarPU to store the tiles out of core */
    if (iparam[IPARAM_OUTOFCORE] > 0) {
        int new_dd = starpu_disk_register (&starpu_disk_unistd_o_direct_ops,
                                           (void*) "/tmp/starpu_ooc/", 1024*1024*10);
        /* a negative value is an error code, not a memory node */
        if (new_dd < 0) {
            fprintf(stderr, "Error registering the StarPU disk on /tmp/starpu_ooc/ (code %d)\n",
                    new_dd);
            MORSE_Finalize();
            return EXIT_FAILURE;
        }
    }

    /* A: no address is provided for the tiles (morse_getaddr_null) */
    MORSE_Desc_Create_User(&descA, NULL, MorseRealDouble,
                           NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1,
                           morse_getaddr_null,
                           morse_getblkldd_ccrb,
                           morse_getrankof_2d);
    MORSE_Desc_Create(&descB, NULL, MorseRealDouble,
                      NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
    MORSE_Desc_Create(&descX, NULL, MorseRealDouble,
                      NB, NB, NB*NB, N, NRHS, 0, 0, N, NRHS, 1, 1);
    MORSE_Desc_Create(&descAC, NULL, MorseRealDouble,
                      NB, NB, NB*NB, N, N, 0, 0, N, N, 1, 1);

    /* generate A matrix with random values such that it is spd */
    MORSE_dplgsy_Tile( (double)N, descA, 51 );

    /* generate RHS */
    MORSE_dplrnt_Tile( descB, 5673 );

    /* copy A before facto. in order to check the result */
    MORSE_dlacpy_Tile(MorseUpperLower, descA, descAC);

    /* copy B in X before solving
     * same sense as memcpy(X, B, N*NRHS*sizeof(double)) but for descriptors */
    MORSE_dlacpy_Tile(MorseUpperLower, descB, descX);

    /************************************************************/
    /* solve the system AX = B using the Cholesky factorization */
    /************************************************************/

    cpu_time = -cWtime();

    /* Cholesky factorization:
     * A is replaced by its factorization L or L^T depending on uplo */
    MORSE_dpotrf_Tile( UPLO, descA );

    /* Solve:
     * B is stored in X on entry, X contains the result on exit.
     * Forward and back substitutions
     */
    MORSE_dpotrs_Tile( UPLO, descA, descX );

    cpu_time += cWtime();

    /* print informations to user */
    gflops = flops / cpu_time;
    printf( "%9.3f %9.2f\n", cpu_time, gflops);
    fflush( stdout );

    /************************************************************/
    /* check if solve is correct i.e. AX-B = 0                  */
    /************************************************************/

    /* compute norms to check the result */
    anorm = MORSE_dlange_Tile( MorseInfNorm, descAC);
    bnorm = MORSE_dlange_Tile( MorseInfNorm, descB);
    xnorm = MORSE_dlange_Tile( MorseInfNorm, descX);

    /* compute A*X-B, store the result in B */
    MORSE_dgemm_Tile( MorseNoTrans, MorseNoTrans,
                      1.0, descAC, descX, -1.0, descB );
    res = MORSE_dlange_Tile( MorseInfNorm, descB );

    /* check residual and print a message */
    eps = LAPACKE_dlamch_work( 'e' );

    /*
     * if hres = 0 then the test succeed
     * else the test failed
     */
    scaled_res = res / N / eps / (anorm * xnorm + bnorm );
    hres = ( scaled_res > 100.0 );
    printf( " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN\n");
    if (hres)
        printf( "%8.5e %8.5e %8.5e %8.5e %8.5e FAILURE \n",
                res, anorm, xnorm, bnorm, scaled_res );
    else
        printf( "%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS \n",
                res, anorm, xnorm, bnorm, scaled_res );

    /* free descriptors descA, descB, descX, descAC */
    MORSE_Desc_Destroy( &descA );
    MORSE_Desc_Destroy( &descB );
    MORSE_Desc_Destroy( &descX );
    MORSE_Desc_Destroy( &descAC );

    /* Finalize MORSE */
    MORSE_Finalize();

    return EXIT_SUCCESS;
}
/**
*
* @copyright (c) 2009-2014 The University of Tennessee and The University
* of Tennessee Research Foundation.
* All rights reserved.
* @copyright (c) 2012-2016 Inria. All rights reserved.
* @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
*
**/
/**
*
* @file out_of_core.h
*
* MORSE example routines
* MORSE is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI,
* University of Bordeaux, Bordeaux INP
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2016-08-23
*
**/
#ifndef OOC_H
#define OOC_H
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#if defined( _WIN32 ) || defined( _WIN64 )
#define int64_t __int64
#endif
/* Define these so that the Microsoft VC compiler stops complaining
about scanf and friends */
#define _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_WARNINGS
#if defined( _WIN32 ) || defined( _WIN64 )
#include <windows.h>
#else /* Non-Windows */
#include <unistd.h>
#include <sys/resource.h>
#endif
#include <starpu.h>
#include "coreblas/include/lapacke.h"
#include "morse.h"
#include "control/common.h"
/* Common functions for all steps of the tutorial */
/**
 * Store in *thrdnbr the number of processors to use by default.
 *
 * On Windows the value is parsed from the NUMBER_OF_PROCESSORS environment
 * variable, and 1 is used when that variable is missing or is not a number.
 * Elsewhere it is the value returned by sysconf(_SC_NPROCESSORS_ONLN).
 *
 * @param thrdnbr output, must not be NULL
 */
static void get_thread_count(int *thrdnbr) {
#if defined( _WIN32 ) || defined( _WIN64 )
    const char *nproc = getenv( "NUMBER_OF_PROCESSORS" );
    /* getenv() returns NULL when the variable is unset: never hand that
     * over to sscanf() */
    if ( nproc == NULL || sscanf( nproc, "%d", thrdnbr ) != 1 ) {
        *thrdnbr = 1;
    }
#else
    *thrdnbr = (int)sysconf(_SC_NPROCESSORS_ONLN);
#endif
}
/**
 * Tell whether the string s begins with prefix.
 *
 * @param s      string to inspect
 * @param prefix expected beginning of s (an empty prefix always matches)
 * @return 1 when s starts with prefix, 0 otherwise
 */
static int startswith(const char *s, const char *prefix) {
    return strncmp( s, prefix, strlen( prefix ) ) == 0;
}
/*
 * Number of floating-point multiplications (FMULS_*) and additions (FADDS_*)
 * of the Cholesky factorization (POTRF) of a matrix of order n_ and of the
 * triangular solve (TRSM) of order m_ with n_ right-hand sides
 * - see Lawn 41 page 120.
 * Every argument is converted to double before being used.
 */
#define FMULS_POTRF(n_) ((double)(n_) * (((1. / 6.) * (double)(n_) + 0.5) * (double)(n_) + (1. / 3.)))
#define FADDS_POTRF(n_) ((double)(n_) * (((1. / 6.) * (double)(n_) ) * (double)(n_) - (1. / 6.)))
#define FMULS_TRSM(m_, n_) (0.5 * (double)(n_) * (double)(m_) * ((double)(m_)+1.))
#define FADDS_TRSM(m_, n_) (0.5 * (double)(n_) * (double)(m_) * ((double)(m_)-1.))
/* define some tools to time the program */
/*
 * On Windows a gettimeofday() replacement is defined below, on top of
 * GetSystemTimeAsFileTime(); on the other systems <sys/time.h> is included
 * instead.
 */
#if defined( _WIN32 ) || defined( _WIN64 )
#include <windows.h>
#include <time.h>
#include <sys/timeb.h>
/* Value, in microseconds, subtracted from the file time to obtain a time
 * relative to the unix epoch; only the suffix of the 64-bit literal depends
 * on the compiler. */
#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
#else
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
/* Time zone information filled by gettimeofday() when its tz argument is not
 * NULL. */
struct timezone
{
int tz_minuteswest; /* minutes W of Greenwich */
int tz_dsttime; /* type of dst correction */
};
/*
 * Windows replacement of gettimeofday().
 *
 * tv: if not NULL, receives the current time as seconds and microseconds
 *     since the unix epoch.
 * tz: if not NULL, receives _timezone / 60 and _daylight; _tzset() is called
 *     the first time only (guarded by the static tzflag).
 * Always returns 0.
 */
int gettimeofday(struct timeval* tv, struct timezone* tz)
{
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
/* assemble the two 32-bit halves of the file time into one 64-bit value */
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
/* split the microsecond count into whole seconds and remainder */
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
if (NULL != tz)
{
if (!tzflag)
{
_tzset();
tzflag++;
}
tz->tz_minuteswest = _timezone / 60;
tz->tz_dsttime = _daylight;
}
return 0;
}
#else /* Non-Windows */
#include <sys/time.h>
#endif
/*
 * Return the current time given by gettimeofday(), in seconds, as a double
 * (the microseconds make the fractional part).
 * Reminder: struct timeval {time_t tv_sec; suseconds_t tv_usec;};
 */
double cWtime(void)
{
    struct timeval now;

    gettimeofday( &now, NULL );
    return (double)now.tv_sec + 1e-6 * (double)now.tv_usec;
}
/* Indices of the integer parameters stored in the iparam array */
enum iparam_ooc {
    IPARAM_THRDNBR   = 0, /* Number of cores                              */
    IPARAM_N         = 1, /* Number of columns of the matrix              */
    IPARAM_NB        = 2, /* Number of columns in a tile                  */
    IPARAM_NRHS      = 3, /* Number of RHS                                */
    IPARAM_OUTOFCORE = 4, /* if > 0 --> how many memory accepted incore   */
                          /* else   --> do not use ooc.                   */
    /* End: number of parameters, i.e. size of the iparam array */
    IPARAM_SIZEOF
};
/* Specific routines */
/******************************************************************************
 * Fill the integer parameters with their default values:
 * threads = -1, N = 500, NB = 128, NRHS = 1, out-of-core = 2000
 */
static void init_iparam(int iparam[IPARAM_SIZEOF]){
    static const struct {
        int index;
        int value;
    } defaults[] = {
        { IPARAM_THRDNBR,   -1   },
        { IPARAM_N,         500  },
        { IPARAM_NB,        128  },
        { IPARAM_NRHS,      1    },
        { IPARAM_OUTOFCORE, 2000 },
    };
    size_t k;

    for (k = 0; k < sizeof defaults / sizeof defaults[0]; ++k) {
        iparam[defaults[k].index] = defaults[k].value;
    }
}
/******************************************************************************
 * Print how to use the program on the standard output.
 * prog_name is the name shown on the "Usage" line. The defaults listed here
 * are the values set by init_iparam().
 */
static void show_help(char *prog_name) {
    printf( "Usage:\n%s [options]\n\n", prog_name );
    /* the --ooc default used to be left blank: it is 2000, as for the
     * IPARAM_OUTOFCORE default */
    printf( "Options are:\n"
            " --help Show this help\n"
            "\n"
            " --n=X dimension (N). (default: 500)\n"
            " --nb=X NB size. (default: 128)\n"
            " --nrhs=X number of RHS. (default: 1)\n"
            "\n"
            " --threads=X Number of CPU workers (default: _SC_NPROCESSORS_ONLN)\n"
            " --ooc=N Allow to store N MiB in main memory. (default: 2000)\n"
            "\n");
}
/******************************************************************************
 * Read arguments following ooc program call
 *
 * Every argument is matched by prefix:
 *  - anything starting with "--h" or "-h" (hence "--help" and "-help" too)
 *    prints the help and exits with status 0;
 *  - "--n=", "--nb=", "--nrhs=", "--threads=" and "--ooc=" store the integer
 *    following the '=' in the matching entry of iparam;
 *  - anything else is reported on stderr and ignored.
 * A value that is not an integer is reported on stderr as well and leaves the
 * parameter untouched.
 *
 * argc, argv: command line as received by main()
 * iparam:     integer parameters, updated in place
 */
static void read_args(int argc, char *argv[], int *iparam){
    /* "--name=" prefix of each integer option and the parameter it sets */
    static const struct {
        const char *prefix;
        int         slot;
    } int_options[] = {
        { "--n=",       IPARAM_N         },
        { "--nb=",      IPARAM_NB        },
        { "--nrhs=",    IPARAM_NRHS      },
        { "--threads=", IPARAM_THRDNBR   },
        { "--ooc=",     IPARAM_OUTOFCORE },
    };
    const size_t nb_options = sizeof int_options / sizeof int_options[0];
    int i;

    for (i = 1; i < argc && argv[i]; ++i) {
        size_t k;

        if ( startswith( argv[i], "--h" ) || startswith( argv[i], "-h" ) ) {
            show_help( argv[0] );
            exit(0);
        }

        for (k = 0; k < nb_options; ++k) {
            if ( startswith( argv[i], int_options[k].prefix ) ) {
                break;
            }
        }
        if ( k == nb_options ) {
            fprintf( stderr, "Unknown option: %s\n", argv[i] );
            continue;
        }

        /* the prefix ends with '=': the value starts right after it */
        if ( sscanf( argv[i] + strlen( int_options[k].prefix ), "%d",
                     &(iparam[int_options[k].slot]) ) != 1 ) {
            fprintf( stderr, "Invalid value for option: %s\n", argv[i] );
        }
    }
}
/******************************************************************************
 * Print a header summarizing the main parameters of the run, followed by the
 * title of the result columns and the beginning of the result line
 * (N, N and NRHS). The number of gpus (0) and IB (32) are printed as fixed
 * values; eps is 0 when CHAMELEON_SIMULATION is defined, the value returned
 * by LAPACKE_dlamch_work('e') otherwise.
 */
static void print_header(char *prog_name, int * iparam) {
    const int nb_gpus   = 0;
    const int inner_blk = 32;
    const int order     = iparam[IPARAM_N];
    double machine_eps;

#if defined(CHAMELEON_SIMULATION)
    machine_eps = 0.;
#else
    machine_eps = LAPACKE_dlamch_work( 'e' );
#endif

    printf( "#\n"
            "# CHAMELEON %d.%d.%d, %s\n"
            "# Nb threads: %d\n"
            "# Nb gpus: %d\n"
            "# N: %d\n"
            "# NB: %d\n"
            "# IB: %d\n"
            "# eps: %e\n"
            "# ooc: %d\n"
            "#\n",
            CHAMELEON_VERSION_MAJOR,
            CHAMELEON_VERSION_MINOR,
            CHAMELEON_VERSION_MICRO,
            prog_name,
            iparam[IPARAM_THRDNBR],
            nb_gpus,
            order,
            iparam[IPARAM_NB],
            inner_blk,
            machine_eps,
            iparam[IPARAM_OUTOFCORE]);

    printf( "# M N K/NRHS seconds Gflop/s\n");
    printf( "#%7d %7d %7d ", order, order, iparam[IPARAM_NRHS]);
    fflush( stdout );
}
/*
 * Check that the size in bytes of a nb x nb tile is a multiple of 4096.
 *
 * The size is computed as nb * nb * sizeof(float), in size_t arithmetic so
 * that a large nb cannot overflow an int.
 * NOTE(review): the element size used here is sizeof(float) although the
 * example creates its descriptors with MorseRealDouble - confirm it is
 * intended.
 *
 * nb: number of rows and columns of a tile
 * Returns 1 when the size is a multiple of 4096 bytes, 0 otherwise.
 */
static int
will_o_direct_work(int nb) {
    const size_t tile_bytes = (size_t)nb * (size_t)nb * sizeof(float);

    return (tile_bytes % 4096 == 0) ? 1 : 0;
}
/*
 * Print on stderr the message explaining that the tile size is not
 * compatible with the out-of-core o_direct mode.
 */
static void
print_o_direct_wont_work(void) {
    static const char message[] =
        "\n[chameleon] Using out-of-core in o_direct force your blocks' size to be\n"
        "multiples of 4096. Tip : chose 'n' and 'nb' as both multiples of 32.\n";

    fputs(message, stderr);
}
/******************************************************************************
 * Function to return the address of block (m,n) -> here always NULL because
 * the memory is directly handled by StarPU. The arguments are not used.
 **/
inline static void* morse_getaddr_null(const MORSE_desc_t *A, int m, int n)
{
    (void)A;
    (void)m;
    (void)n;
    return NULL;
}
#endif /* OOC_H */
......@@ -60,18 +60,18 @@ void RUNTIME_comm_size (int*);
/*******************************************************************************
* RUNTIME Descriptor
**/
void* RUNTIME_mat_alloc (size_t);
void RUNTIME_mat_free (void*, size_t);
void RUNTIME_desc_init (MORSE_desc_t*);
void RUNTIME_desc_create (MORSE_desc_t*);
void RUNTIME_desc_destroy (MORSE_desc_t*);
void RUNTIME_desc_submatrix (MORSE_desc_t*);
void* RUNTIME_desc_getaddr (MORSE_desc_t*, int, int);
void* RUNTIME_mat_alloc (size_t);
void RUNTIME_mat_free (void*, size_t);
void RUNTIME_desc_init (MORSE_desc_t*);
void RUNTIME_desc_create (MORSE_desc_t*);
void RUNTIME_desc_destroy (MORSE_desc_t*);
void RUNTIME_desc_submatrix (MORSE_desc_t*);
void* RUNTIME_desc_getaddr (MORSE_desc_t*, int, int);
/* Acquire in main memory an up-to-date copy of the data described by the descriptor for read-write access. */
int RUNTIME_desc_acquire (MORSE_desc_t*);