diff --git a/CMakeLists.txt b/CMakeLists.txt index 525c916133da05c4f5fdc79f32f55ac32662c821..f1f938142e16fb9804510842f0bd7397c255433a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,6 +197,9 @@ if (CHAMELEON_ENABLE_CUDA AND NOT CHAMELEON_USE_CUDA) message("-- ${BoldGreen}CHAMELEON_USE_CUDA is set to OFF, turn it ON to use CUDA (unsupported by Quark)${ColourReset}") endif() +# Enable Hmat-OSS kernels +option(CHAMELEON_USE_HMAT "Enable hmat kernels" OFF) + option(CHAMELEON_RUNTIME_SYNC "Enable synchronous task submission when available to debug the code without parallelism" OFF) if (CHAMELEON_RUNTIME_SYNC) message("-- ${BoldGreen}CHAMELEON_RUNTIME_SYNC is set to ON, turn it OFF to avoid synchronisation in the tasks submission${ColourReset}") @@ -262,6 +265,28 @@ mark_as_advanced(CHAMELEON_COPY_DIAG) ################################ add_subdirectory(hqr) +############################################################################### +# Build dependency HMAT-OSS library # +##################################### +if ( CHAMELEON_USE_HMAT ) + find_package(HMAT REQUIRED) + if(HMAT_FOUND) + # Create the imported target in case the find module only exported HMAT_* variables + if(NOT TARGET HMAT::hmat) + add_library(HMAT::hmat INTERFACE IMPORTED) + endif() + set_target_properties(HMAT::hmat PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${HMAT_INCLUDE_DIRS}" + INTERFACE_COMPILE_DEFINITIONS "${HMAT_DEFINITIONS}" + INTERFACE_LINK_DIRECTORIES "${HMAT_LIBRARY_DIRS}" + INTERFACE_LINK_LIBRARIES "hmat" + ) + message( STATUS "HMAT: Found" ) + else() + message( FATAL_ERROR "HMAT: Not found" ) + endif() +endif() + ############################################################################### # Look for dependencies # ######################### diff --git a/cmake_modules/PrintOpts.cmake b/cmake_modules/PrintOpts.cmake index 8aac5f8b4c6b6ed1325ed2a1e1d46bfa4e3a519c..c439626b1739980c5201303e9826f723f6b4d792 100644 --- a/cmake_modules/PrintOpts.cmake +++ b/cmake_modules/PrintOpts.cmake @@ -82,6 +82,7 @@ set(dep_message "${dep_message}" " Kernels specific\n" " BLAS ................: ${BLAS_VENDOR_FOUND}\n" " LAPACK...............: ${LAPACK_VENDOR_FOUND}\n" 
+" HMAT-OSS.............: ${CHAMELEON_USE_HMAT}\n" "\n" " Simulation mode .....: ${CHAMELEON_SIMULATION}\n" "\n" diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index c26498a1fd7d9761dd14d36f3f0e5ca6b61d2ae5..10a56bd0e1e3b0531944fbd926a71231cd887539 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -52,6 +52,7 @@ subs = { ('ChamPattern', 'ChamRealFloat', 'ChamRealDouble', 'ChamRealFloat', r'\bChamRealDouble' ), ('int', 'float', 'double', 'complex32', 'complex64' ), ('Int', 'Float', 'Double', 'Complex32', 'Complex64' ), + ('Int', 'HMAT_SIMPLE_PRECISION','HMAT_DOUBLE_PRECISION','HMAT_SIMPLE_COMPLEX', 'HMAT_DOUBLE_COMPLEX'), # ----- Additional BLAS ('', 'sTile', 'dTile', 'cTile', 'zTile' ), @@ -102,5 +103,7 @@ subs = { # ('', 'stesting', 'dtesting', 'ctesting', 'ztesting' ), # ('', 'SAUXILIARY', 'DAUXILIARY', 'CAUXILIARY', 'ZAUXILIARY' ), # ('', 'sbuild', 'dbuild', 'cbuild', 'zbuild' ), + # Hmat-OSS kernels + ('hmat_p', 'hmat_s', 'hmat_d', 'hmat_c', 'hmat_z' ), ] } diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 3e5cf7238a6b089f8c31c20d459b71cc1be162ed..081a231b89a1cb26b45edcc7fb1eb36ebdb70458 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -105,6 +105,10 @@ set(ZSRC core_zunmqr.c core_ztile.c ) +if( CHAMELEON_USE_HMAT ) + list( APPEND ZSRC + hmat_z.c ) +endif() precisions_rules_py(COREBLAS_SRCS_GENERATED "${ZSRC}" PRECISIONS "${CHAMELEON_PRECISION}") @@ -133,6 +137,10 @@ target_include_directories(coreblas PUBLIC $<INSTALL_INTERFACE:include>) set_property(TARGET coreblas PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib") +if( CHAMELEON_USE_HMAT ) + target_link_libraries(coreblas PUBLIC HMAT::hmat ) +endif() + target_link_libraries(coreblas PRIVATE MORSE::LAPACKE) target_link_libraries(coreblas PRIVATE MORSE::CBLAS) target_link_libraries(coreblas PUBLIC MORSE::M) diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c 
index 65f8818a5cd756477a1e32af93ea8339da2b8462..3eb9cffc00d6da2e2e156280be218449791a5883 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -18,6 +18,10 @@ #include "coreblas.h" #include "coreblas/coreblas_ztile.h" +#if defined( CHAMELEON_USE_HMAT ) +#include "coreblas/hmat.h" +#endif + #if defined( PRECISION_z ) || defined( PRECISION_c ) void TCORE_dlag2z( cham_uplo_t uplo, int M, int N, @@ -114,15 +118,41 @@ TCORE_zgemm( cham_trans_t transA, CHAMELEON_Complex64_t beta, CHAM_tile_t * C ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - assert( B->format & CHAMELEON_TILE_FULLRANK ); - assert( C->format & CHAMELEON_TILE_FULLRANK ); - CORE_zgemm( - transA, transB, M, N, K, alpha, - CHAM_tile_get_ptr( A ), A->ld, - CHAM_tile_get_ptr( B ), B->ld, - beta, - CHAM_tile_get_ptr( C ), C->ld ); + if ( ( A->format & CHAMELEON_TILE_FULLRANK ) && + ( B->format & CHAMELEON_TILE_FULLRANK ) && + ( C->format & CHAMELEON_TILE_FULLRANK ) ) + { + CORE_zgemm( transA, transB, M, N, K, alpha, + CHAM_tile_get_ptr( A ), A->ld, + CHAM_tile_get_ptr( B ), B->ld, + beta, + CHAM_tile_get_ptr( C ), C->ld ); + } +#if defined( CHAMELEON_USE_HMAT ) + else if ( ( A->format & CHAMELEON_TILE_HMAT ) && + ( B->format & CHAMELEON_TILE_HMAT ) && + ( C->format & CHAMELEON_TILE_HMAT ) ) + { + hmat_zgemm( chameleon_lapack_const( transA ), + chameleon_lapack_const( transB ), + &alpha, A->mat, B->mat, + &beta, C->mat ); + } + else if ( ( A->format & CHAMELEON_TILE_HMAT ) && + ( B->format & CHAMELEON_TILE_FULLRANK ) && + ( C->format & CHAMELEON_TILE_FULLRANK ) ) + { + assert( transB == ChamNoTrans ); + hmat_zgemv( chameleon_lapack_const( transA ), + &alpha, A->mat, + CHAM_tile_get_ptr( B ), + &beta, + CHAM_tile_get_ptr( C ), C->n ); + } +#endif + else { + assert( 0 ); + } } int @@ -158,22 +188,58 @@ TCORE_zgessq( cham_store_t storev, int M, int N, const CHAM_tile_t *A, CHAM_tile int TCORE_zgetrf( int M, int N, CHAM_tile_t *A, int *IPIV, int *INFO ) { - assert( A->format & 
CHAMELEON_TILE_FULLRANK ); - return CORE_zgetrf( M, N, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO ); + int rc = -1; + if ( A->format & CHAMELEON_TILE_FULLRANK ) { + rc = CORE_zgetrf( M, N, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO ); + } +#if defined( CHAMELEON_USE_HMAT ) + else if ( A->format & CHAMELEON_TILE_HMAT ) { + rc = hmat_zgetrf( A->mat ); + assert( rc == 0 ); + } +#endif + else { + assert( 0 ); + } + return rc; } int TCORE_zgetrf_incpiv( int M, int N, int IB, CHAM_tile_t *A, int *IPIV, int *INFO ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgetrf_incpiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO ); + if ( A->format & CHAMELEON_TILE_FULLRANK ) { + return CORE_zgetrf_incpiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, IPIV, INFO ); + } +#if defined( CHAMELEON_USE_HMAT ) + else if ( A->format & CHAMELEON_TILE_HMAT ) { + return hmat_zgetrf( A->mat ); + } +#endif + else { + assert( 0 ); + } + return -1; } int TCORE_zgetrf_nopiv( int M, int N, int IB, CHAM_tile_t *A, int *INFO ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - return CORE_zgetrf_nopiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, INFO ); + int rc = -1; + *INFO = 0; + + if ( A->format & CHAMELEON_TILE_FULLRANK ) { + rc = CORE_zgetrf_nopiv( M, N, IB, CHAM_tile_get_ptr( A ), A->ld, INFO ); + } +#if defined( CHAMELEON_USE_HMAT ) + else if ( A->format & CHAMELEON_TILE_HMAT ) { + rc = hmat_zgetrf( A->mat ); + assert( rc == 0 ); + } +#endif + else { + assert( 0 ); + } + return rc; } void @@ -423,8 +489,19 @@ TCORE_zplrnt( int m, void TCORE_zpotrf( cham_uplo_t uplo, int n, CHAM_tile_t *A, int *INFO ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - CORE_zpotrf( uplo, n, CHAM_tile_get_ptr( A ), A->ld, INFO ); + if ( A->format & CHAMELEON_TILE_FULLRANK ) { + CORE_zpotrf( uplo, n, CHAM_tile_get_ptr( A ), A->ld, INFO ); + } +#if defined( CHAMELEON_USE_HMAT ) + else if ( A->format & CHAMELEON_TILE_HMAT ) { + assert( uplo == ChamLower ); + *INFO = hmat_zpotrf( A->mat ); + } 
+#endif + else { + assert( 0 ); + } + return; } int @@ -693,9 +770,25 @@ TCORE_ztrsm( cham_side_t side, const CHAM_tile_t * A, CHAM_tile_t * B ) { - assert( A->format & CHAMELEON_TILE_FULLRANK ); - assert( B->format & CHAMELEON_TILE_FULLRANK ); - CORE_ztrsm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); + if ( ( A->format & CHAMELEON_TILE_FULLRANK ) && + ( B->format & CHAMELEON_TILE_FULLRANK ) ) + { + CORE_ztrsm( side, uplo, transA, diag, M, N, alpha, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); + } +#if defined( CHAMELEON_USE_HMAT ) + else if ( A->format & CHAMELEON_TILE_HMAT ) { + hmat_ztrsm( chameleon_lapack_const( side ), + chameleon_lapack_const( uplo ), + chameleon_lapack_const( transA ), + chameleon_lapack_const( diag ), + M, N, &alpha, A->mat, + ( B->format & CHAMELEON_TILE_HMAT ), + CHAM_tile_get_ptr( B ) ); + } +#endif + else { + assert( 0 ); + } } int diff --git a/coreblas/compute/hmat_z.c b/coreblas/compute/hmat_z.c new file mode 100644 index 0000000000000000000000000000000000000000..b232bf71bd580a3be395cae60ce3c7ca866cb21b --- /dev/null +++ b/coreblas/compute/hmat_z.c @@ -0,0 +1,89 @@ +/** + * + * @file hmat_z.c + * + * @copyright 2019-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2019-2019 Universidad Jaume I. All rights reserved. + * + * @brief Chameleon CPU kernels: C wrappers around the hmat-oss interface. 
+ * + * @version 1.0.0 + * @author Rocio Carratala-Saez + * @author Mathieu Faverge + * @date 2019-12-02 + * @precisions normal z -> c d s + * + */ +#include "coreblas/hmat.h" + +/** + * @brief The hmat interface to the C++ functions + */ +hmat_interface_t hmat_zinterface; + +void __hmat_zinit() __attribute__(( constructor )); +void __hmat_zinit() { + hmat_init_default_interface( &hmat_zinterface, HMAT_DOUBLE_COMPLEX ); +} + +void __hmat_zfini() __attribute__(( destructor )); +void __hmat_zfini() { + hmat_zinterface.finalize(); +} + +int hmat_zpotrf( hmat_matrix_t *A ) { + hmat_factorization_context_t ctx_facto; + hmat_factorization_context_init( &ctx_facto ); + ctx_facto.factorization = hmat_factorization_llt; + /* Propagate the hmat-oss status instead of always reporting success */ + return hmat_zinterface.factorize_generic( A, &ctx_facto ); +} + +int hmat_zgetrf( hmat_matrix_t *A ) { + hmat_factorization_context_t ctx_facto; + hmat_factorization_context_init( &ctx_facto ); + ctx_facto.factorization = hmat_factorization_lu; + /* Propagate the hmat-oss status instead of always reporting success */ + return hmat_zinterface.factorize_generic( A, &ctx_facto ); +} + +int hmat_zgemm( char transA, char transB, void* alpha, hmat_matrix_t* A, + hmat_matrix_t* B, void* beta, hmat_matrix_t* C ) { + return hmat_zinterface.gemm( transA, transB, alpha, A, B, beta, C ); +} + +int hmat_zgemv( char transA, void* alpha, hmat_matrix_t* A, + void* B, void* beta, void* C, int nrhs ) { + return hmat_zinterface.gemm_scalar( transA, alpha, A, B, beta, C, nrhs ); +} + +int hmat_ztrsm( char side, char uplo, char trans, char diag, int m, int n, + void* alpha, hmat_matrix_t* A, int is_b_hmat, void* B ) { + return hmat_zinterface.trsm( side, uplo, trans, diag, m, n, alpha, A, is_b_hmat, B ); +} + +hmat_matrix_t *hmat_zread( void *buffer ) { + hmat_buffer_comm_t buffer_struct = { 0, (char*)buffer }; + hmat_matrix_t *hmat = hmat_zinterface.read_struct( (hmat_iostream)(&buffer_comm_read), + (void *)(&buffer_struct) ); + hmat_zinterface.read_data( hmat, (hmat_iostream)(&buffer_comm_read), (void *)(&buffer_struct) ); + return hmat; 
+} + +size_t hmat_zsize( hmat_matrix_t *hmat ) { + size_t size = 0; + hmat_zinterface.write_struct( hmat, (hmat_iostream)(&buffer_comm_size), (void *)(&size) ); + hmat_zinterface.write_data( hmat, (hmat_iostream)(&buffer_comm_size), (void *)(&size) ); + return size; +} + +void hmat_zwrite( hmat_matrix_t *hmat, char *ptr ) { + hmat_buffer_comm_t buffer_struct = { 0, ptr }; + hmat_zinterface.write_struct( hmat, (hmat_iostream)(&buffer_comm_write), (void *)(&buffer_struct) ); + hmat_zinterface.write_data( hmat, (hmat_iostream)(&buffer_comm_write), (void *)(&buffer_struct) ); +} + +void hmat_zdestroy( hmat_matrix_t *hmat ) { + hmat_zinterface.destroy( hmat ); +} diff --git a/coreblas/include/CMakeLists.txt b/coreblas/include/CMakeLists.txt index 56b43bcfedbe6cbf798388fd9251b9ba1a9de83a..30f7ad411e5212aff837151d20d072db44a08033 100644 --- a/coreblas/include/CMakeLists.txt +++ b/coreblas/include/CMakeLists.txt @@ -34,6 +34,11 @@ set(ZHDR coreblas/coreblas_zc.h coreblas/coreblas_ztile.h ) +if( CHAMELEON_USE_HMAT ) + list( APPEND ZHDR + coreblas/hmat_z.h ) +endif() + precisions_rules_py( COREBLAS_HDRS_GENERATED "${ZHDR}" TARGETDIR coreblas @@ -49,6 +54,10 @@ set(COREBLAS_HDRS coreblas/lapacke_mangling.h coreblas/random.h ) +if( CHAMELEON_USE_HMAT ) + list( APPEND COREBLAS_HDRS + coreblas/hmat.h ) +endif() # Add generated headers # --------------------- diff --git a/coreblas/include/coreblas/hmat.h b/coreblas/include/coreblas/hmat.h new file mode 100644 index 0000000000000000000000000000000000000000..dabcea3b0e7a185aae5b69242a644b450cdddd8a --- /dev/null +++ b/coreblas/include/coreblas/hmat.h @@ -0,0 +1,75 @@ +/** + * + * @file hmat.h + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon CPU hmat-oss function declaration + * + * @version 0.9.2 + * @author Rocio Carratala-Saez + * @author Mathieu Faverge + * @date 2019-12-02 + * + */ +#ifndef _coreblas_hmat_h_ +#define _coreblas_hmat_h_ + +#include "coreblas.h" + +#if !defined( CHAMELEON_USE_HMAT ) +#error "This file should not be included by itself" +#endif + +#include <hmat/hmat.h> + +BEGIN_C_DECLS + +/** + * Functions to get linearize/unlinearize hmat into/from a buffer + */ +typedef struct hmat_buffer_comm_s { + size_t offset; + char *data; +} hmat_buffer_comm_t; + +// Count the size of the structure to write +static inline void +buffer_comm_size( void * buffer, size_t n, void *user_data ) +{ + size_t *size = (size_t *)user_data; + *size += n; + (void) buffer; +} + +static inline void +buffer_comm_read(void * buffer, size_t n, void *user_data) +{ + hmat_buffer_comm_t *buffer_struct = (hmat_buffer_comm_t *) user_data; + char *buffer_read = buffer_struct->data + buffer_struct->offset; + memcpy(buffer, buffer_read, n); + buffer_struct->offset += n; +} + +static inline void +buffer_comm_write(void * buffer, size_t n, void *user_data) +{ + hmat_buffer_comm_t *buffer_struct = (hmat_buffer_comm_t *) user_data; + char *buffer_write = buffer_struct->data + buffer_struct->offset; + memcpy(buffer_write, buffer, n); + buffer_struct->offset += n; +} + +#include "coreblas/hmat_z.h" +#include "coreblas/hmat_c.h" +#include "coreblas/hmat_d.h" +#include "coreblas/hmat_s.h" + +END_C_DECLS + +#endif /* _coreblas_hmat_h_ */ diff --git a/coreblas/include/coreblas/hmat_z.h b/coreblas/include/coreblas/hmat_z.h new file mode 100644 index 0000000000000000000000000000000000000000..b8d38928d6bf6c9596f82c7e8175ad01c01873b6 --- /dev/null +++ b/coreblas/include/coreblas/hmat_z.h @@ -0,0 +1,34 @@ +/** + * + * @file hmat_z.h + * + * @copyright 2019-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * @copyright 2019-2019 Universidad Jaume I. 
All rights reserved. + * + * @brief Chameleon CPU kernel interface from CHAM_tile_t layout to the real one. + * + * @version 1.0.0 + * @author Rocio Carratala-Saez + * @author Mathieu Faverge + * @date 2019-12-02 + * @precisions normal z -> c d s + * + */ +#ifndef _hmat_z_h_ +#define _hmat_z_h_ + +HMAT_API int hmat_zgetrf( hmat_matrix_t *A ); +HMAT_API int hmat_zpotrf( hmat_matrix_t *A ); +HMAT_API int hmat_zgemm( char transA, char transB, void* alpha, hmat_matrix_t* A, hmat_matrix_t* B, void* beta, hmat_matrix_t* C ); +HMAT_API int hmat_ztrsm( char side, char uplo, char trans, char diag, int m, int n, + void* alpha, hmat_matrix_t* A, int is_b_hmat, void* B ); +HMAT_API int hmat_zgemv( char transA, void* alpha, hmat_matrix_t* A, + void* B, void* beta, void* C, int nrhs ); +HMAT_API hmat_matrix_t *hmat_zread( void *buffer ); +HMAT_API size_t hmat_zsize( hmat_matrix_t *hmat ); +HMAT_API void hmat_zwrite( hmat_matrix_t *hmat, char *ptr ); + +HMAT_API void hmat_zdestroy( hmat_matrix_t *hmat ); + +#endif /* _hmat_z_h_ */ diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in index 58b066488d52e36b333eae3c92fc159b8ba6bb1a..15073d1c4604947b704f0f3cf5ff55f15074012a 100644 --- a/include/chameleon/config.h.in +++ b/include/chameleon/config.h.in @@ -51,6 +51,9 @@ #cmakedefine CHAMELEON_USE_CUBLAS #cmakedefine CHAMELEON_USE_CUBLAS_V2 +/* Hmat-oss */ +#cmakedefine CHAMELEON_USE_HMAT + /* Simulation */ #cmakedefine CHAMELEON_SIMULATION #cmakedefine CHAMELEON_SIMULATION_EXTENDED diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 965f47cc4008c633d1ba6e55b0e42aa6337617bc..2ed7183a7b627f2b3e78a5cc6bf641991bd57d80 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -454,7 +454,8 @@ void *RUNTIME_data_getaddr( const CHAM_desc_t *A, int m, int n ) CHAM_tile_t *tile = A->get_blktile( A, m, n ); if ( myrank == owner ) { - if ( tile->mat != NULL ) + if ( 
(tile->format & CHAMELEON_TILE_HMAT) || + (tile->mat != NULL) ) { home_node = STARPU_MAIN_RAM; } diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c index bfb528561b99fa86439f2d31683776d676dc2e47..a15e0b78117814f8e2a8ee886f3c40668e914bb3 100644 --- a/runtime/starpu/interface/cham_tile_interface.c +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -17,6 +17,61 @@ * */ #include "chameleon_starpu.h" +#if defined(CHAMELEON_USE_HMAT) +#include "coreblas/hmat.h" + +static inline void +cti_hmat_destroy( starpu_cham_tile_interface_t *cham_tile_interface ) +{ + switch( cham_tile_interface->flttype ) { + case ChamComplexDouble: + hmat_zdestroy( cham_tile_interface->tile.mat ); + break; + case ChamComplexFloat: + hmat_cdestroy( cham_tile_interface->tile.mat ); + break; + case ChamRealDouble: + hmat_ddestroy( cham_tile_interface->tile.mat ); + break; + case ChamRealFloat: + hmat_sdestroy( cham_tile_interface->tile.mat ); + break; + default: + STARPU_ASSERT_MSG( 0, "cti_hmat_destroy(): unknown flttype\n" ); + } + cham_tile_interface->tile.mat = NULL; +} +#endif /* defined(CHAMELEON_USE_HMAT): hmat_[zcds]destroy are only declared by coreblas/hmat.h */ + +static inline size_t +cti_get_hmat_required_size( starpu_cham_tile_interface_t *cham_tile_interface ) +{ + size_t size = 0; +#if defined(CHAMELEON_USE_HMAT) + if ( (cham_tile_interface->tile.format & CHAMELEON_TILE_HMAT) && + (cham_tile_interface->tile.mat != NULL ) ) + { + switch( cham_tile_interface->flttype ) { + case ChamComplexDouble: + size = hmat_zsize( cham_tile_interface->tile.mat ); + break; + case ChamComplexFloat: + size = hmat_csize( cham_tile_interface->tile.mat ); + break; + case ChamRealDouble: + size = hmat_dsize( cham_tile_interface->tile.mat ); + break; + case ChamRealFloat: + size = hmat_ssize( cham_tile_interface->tile.mat ); + break; + default: + STARPU_ASSERT_MSG( 0, "cti_get_hmat_required_size(cham_tile_interface): unknown flttype\n" ); + } + } +#endif + (void)cham_tile_interface; + return size; +} static inline CHAM_tile_t * 
cti_handle_get( starpu_data_handle_t handle ) @@ -118,6 +173,18 @@ cti_free_data_on_node( void *data_interface, unsigned node ) starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) data_interface; +#if defined(CHAMELEON_USE_HMAT) + if ( (cham_tile_interface->tile.format & CHAMELEON_TILE_HMAT) && + (cham_tile_interface->tile.mat != NULL ) ) + { + cti_hmat_destroy( cham_tile_interface ); + } + else +#endif + { + assert( (intptr_t)(cham_tile_interface->tile.mat) == cham_tile_interface->dev_handle ); + } + starpu_free_on_node( node, cham_tile_interface->dev_handle, cham_tile_interface->allocsize ); cham_tile_interface->tile.mat = NULL; cham_tile_interface->dev_handle = 0; @@ -250,6 +317,35 @@ cti_pack_data_fullrank( starpu_cham_tile_interface_t *cham_tile_interface, return 0; } +static int +cti_pack_data_hmat( starpu_cham_tile_interface_t *cham_tile_interface, + void *ptr ) +{ +#if !defined(CHAMELEON_USE_HMAT) + assert( 0 ); +#else + hmat_matrix_t *mat = cham_tile_interface->tile.mat; + STARPU_ASSERT_MSG( mat != NULL, "cti_pack_data_hmat: Try to pack a NULL pointer\n" ); + switch( cham_tile_interface->flttype ) { + case ChamComplexDouble: + hmat_zwrite( mat, ptr ); + break; + case ChamComplexFloat: + hmat_cwrite( mat, ptr ); + break; + case ChamRealDouble: + hmat_dwrite( mat, ptr ); + break; + case ChamRealFloat: + hmat_swrite( mat, ptr ); + break; + default: + STARPU_ASSERT_MSG( 0, "cti_pack_data_hmat: unknown flttype\n" ); + } +#endif + return 0; +} + static int cti_pack_data( starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count ) { @@ -257,9 +353,11 @@ cti_pack_data( starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ss starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) starpu_data_get_interface_on_node(handle, node); + size_t size; - *count = (starpu_ssize_t)(cham_tile_interface->allocsize); - *count += sizeof(size_t) + sizeof(CHAM_tile_t); + size = 
(starpu_ssize_t)(cham_tile_interface->allocsize); + size += cti_get_hmat_required_size( cham_tile_interface ); + *count = size + sizeof(size_t) + sizeof(CHAM_tile_t); if ( ptr != NULL ) { @@ -268,7 +366,7 @@ cti_pack_data( starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ss tmp = (char*)(*ptr); /* Start by the size to allocate on reception */ - memcpy( tmp, &(cham_tile_interface->allocsize), sizeof(size_t) ); + memcpy( tmp, &size, sizeof(size_t) ); tmp += sizeof(size_t); /* Copy the tile metadata */ @@ -279,6 +377,9 @@ cti_pack_data( starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ss if ( cham_tile_interface->tile.format & CHAMELEON_TILE_FULLRANK ) { cti_pack_data_fullrank( cham_tile_interface, tmp ); } + else if ( cham_tile_interface->tile.format & CHAMELEON_TILE_HMAT ) { + cti_pack_data_hmat( cham_tile_interface, tmp ); + } else { - STARPU_ASSERT_MSG( 1, "Unsupported format for pack." ); + STARPU_ASSERT_MSG( 0, "Unsupported format for pack." ); /* 0 so the assertion actually fires */ } @@ -292,6 +393,8 @@ cti_unpack_data_fullrank( starpu_cham_tile_interface_t *cham_tile_interface, void *ptr ) { char *matrix = (void *)cham_tile_interface->tile.mat; + assert( cham_tile_interface->tile.format & CHAMELEON_TILE_FULLRANK ); + assert( matrix != NULL ); if ( cham_tile_interface->tile.m == cham_tile_interface->tile.ld ) { memcpy( matrix, ptr, cham_tile_interface->allocsize ); @@ -312,6 +415,36 @@ cti_unpack_data_fullrank( starpu_cham_tile_interface_t *cham_tile_interface, return 0; } +static int +cti_unpack_data_hmat( starpu_cham_tile_interface_t *cham_tile_interface, + void *ptr ) +{ + assert( cham_tile_interface->tile.format & CHAMELEON_TILE_HMAT ); +#if !defined(CHAMELEON_USE_HMAT) + assert( 0 ); +#else + hmat_matrix_t *mat = NULL; + switch( cham_tile_interface->flttype ) { + case ChamComplexDouble: + mat = hmat_zread( ptr ); + break; + case ChamComplexFloat: + mat = hmat_cread( ptr ); + break; + case ChamRealDouble: + mat = hmat_dread( ptr ); + break; + case ChamRealFloat: + mat = hmat_sread( ptr ); + break; + default: + 
STARPU_ASSERT_MSG( 0, "cti_unpack_data_hmat: unknown flttype\n" ); + } + cham_tile_interface->tile.mat = mat; +#endif + return 0; +} + static int cti_peek_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t count ) { @@ -321,6 +454,7 @@ cti_peek_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t cou starpu_data_get_interface_on_node(handle, node); char *tmp = ptr; + size_t size; #if defined(CHAMELEON_USE_MPI_DATATYPES) /* @@ -335,13 +469,16 @@ cti_peek_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t cou CHAM_tile_t dsttile; /* Extract the size of the information to unpack */ - memcpy( &(cham_tile_interface->allocsize), tmp, sizeof(size_t) ); + memcpy( &size, tmp, sizeof(size_t) ); tmp += sizeof(size_t); /* Extract the tile metadata of the remote tile */ memcpy( &dsttile, tmp, sizeof(CHAM_tile_t) ); tmp += sizeof(CHAM_tile_t); + assert( ( (dsttile.format & CHAMELEON_TILE_HMAT) && (cham_tile_interface->allocsize == 0 )) || + (!(dsttile.format & CHAMELEON_TILE_HMAT) && (cham_tile_interface->allocsize == size)) ); + /* * Update with the local information. Data is packed now, and do not * need leading dimension anymore @@ -359,6 +496,9 @@ cti_peek_data( starpu_data_handle_t handle, unsigned node, void *ptr, size_t cou if ( cham_tile_interface->tile.format & CHAMELEON_TILE_FULLRANK ) { cti_unpack_data_fullrank( cham_tile_interface, tmp ); } + else if ( cham_tile_interface->tile.format & CHAMELEON_TILE_HMAT ) { + cti_unpack_data_hmat( cham_tile_interface, tmp ); + } else { - STARPU_ASSERT_MSG( 1, "Unsupported format for unpack." ); + STARPU_ASSERT_MSG( 0, "Unsupported format for unpack." 
); } @@ -488,11 +628,16 @@ starpu_cham_tile_register( starpu_data_handle_t *handleptr, .id = STARPU_CHAM_TILE_INTERFACE_ID, .flttype = flttype, .dev_handle = (intptr_t)(tile->mat), .allocsize = tile->m * tile->n * elemsize, /* We compute with m even if it's allocated with ld */ .tilesize = tile->m * tile->n * elemsize, }; memcpy( &(cham_tile_interface.tile), tile, sizeof( CHAM_tile_t ) ); + if ( tile->format & CHAMELEON_TILE_HMAT ) { + /* For hmat, allocated data will be handled by hmat library. StarPU cannot allocate it for the library */ + cham_tile_interface.allocsize = 0; + } + starpu_data_register( handleptr, home_node, &cham_tile_interface, &starpu_interface_cham_tile_ops ); }