diff --git a/.clang_complete b/.clang_complete new file mode 100644 index 0000000000000000000000000000000000000000..fcc47adf3deff54f683d088e2c498eac0c122b6b --- /dev/null +++ b/.clang_complete @@ -0,0 +1,32 @@ +-DCBLAS_HAS_CGEMM3M +-DCBLAS_HAS_ZGEMM3M +-DCHAMELEON_COPY_DIAG +-DPRECISION_c +-DPRECISION_d +-DPRECISION_s +-DPRECISION_z +-I. +-I./runtime/starpu/include +-I./example/basic_zposv +-I./example/lapack_to_chameleon +-I./example/out_of_core +-I./include +-I./testing +-I./timing +-I./control +-I./coreblas/include +-I./cudablas/include +-I./build/runtime/starpu/include +-I./build/example/basic_zposv +-I./build/example/lapack_to_chameleon +-I./build/example/out_of_core +-I./build/include +-I./build/testing +-I./build/timing +-I./build/control +-I./build/coreblas/include +-I./build/cudablas/include +-I/opt/intel/compilers_and_libraries/linux/mkl/include +-I/usr/include/libxml2 +-I/usr/local/include +-I/usr/local/include/starpu/1.3 diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index bed4df6f140c9449d7c7201bf4249373c14d6146..b1c733b9826bde9225d13343da6ea4f1a007e366 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -101,6 +101,8 @@ set(ZSRC pzgetrf_incpiv.c pzgetrf_nopiv.c pzgetrf.c + pzgetrf_sp.c + pzgetrf_pp.c pzlacpy.c pzlange.c pzlansy.c @@ -152,6 +154,8 @@ set(ZSRC zgetrf_incpiv.c zgetrf_nopiv.c zgetrf.c + zgetrf_sp.c + zgetrf_pp.c zgetrs_incpiv.c zgetrs_nopiv.c zlacpy.c diff --git a/compute/pzgetrf_pp.c b/compute/pzgetrf_pp.c new file mode 100644 index 0000000000000000000000000000000000000000..44f6f84bbb10665962877d4313c6ad3b287023b8 --- /dev/null +++ b/compute/pzgetrf_pp.c @@ -0,0 +1,667 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file pzgetrf.c + * + * CHAMELON compute + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-07-06 + * @precisions normal z -> c d s + * + **/ +#include "control/common.h" + +#define A(m, n) A, m, n +#define Acopy(m, n) Acopy, m, n +#define IPIV_h(it) IPIV+it +#define perm_to_upper_h(it) perm_to_upper+it +#define perm_from_upper_h(it) perm_from_upper+it +#define IPIV_p(it, index) (((int*)(IPIV+it)->ptr)+index) +#define perm_to_upper_p(it, index) (((int*)(perm_to_upper+it)->ptr)+index) +#define perm_from_upper_p(it, index) (((int*)(perm_from_upper+it)->ptr)+index) + +#define ALLOCATE_DYNAMIC_HANDLE(_handle_, _type_, _size_) \ + CHAMELON_data_handle_t *_handle_; \ + if (_size_ > 0) { \ + _handle_ = malloc(sizeof(CHAMELON_data_handle_t)); \ + RUNTIME_data_alloc(_handle_); \ + RUNTIME_vector_data_register(_handle_, -1, 0, _size_, sizeof(_type_));} + + +#define ALLOCATE_DYNAMIC_HANDLE_PTR(_handle_, _type_, _size_) \ + CHAMELON_data_handle_t *_handle_; \ + if (_size_ > 0) { \ + _handle_ = malloc(sizeof(CHAMELON_data_handle_t)); \ + _handle_->ptr = malloc(sizeof(_type_) * _size_); \ + RUNTIME_data_alloc(_handle_); \ + RUNTIME_vector_data_register(_handle_, 0, (uintptr_t)_handle_->ptr, _size_, sizeof(_type_));} + +static inline void pivot_to_permutation(int diag_size, int height, + int *IPIV, int *source, int *dest) +{ + int i; + int pivoted, inverse; + + for(i=0; i < diag_size; i++) + source[i] = dest[i] = i + height; + + for(i = 0; i < diag_size; i++) { + int ip = IPIV[i]-1; + assert( ip - height >= i ); + if ( ip-height > i ) { + pivoted = source[i]; + + if (ip-height < diag_size) { + inverse = source[ip-height]; + source[ip-height] = pivoted; + } else { + inverse = ip; + int j; + for(j=0; j < diag_size; j++) { + if( dest[j] == ip ) { + inverse = j + height; + break; + } + } + } + + source[i] = inverse; + + pivoted -= height; + inverse -= height; + + if (pivoted < diag_size) dest[pivoted] = ip; + if (inverse < diag_size) dest[inverse] = i+height; + } + } +} + +static void submit_swaps(CHAMELON_desc_t *A, int k, int n, + CHAMELON_data_handle_t *IPIV_handle, + CHAMELON_data_handle_t *source_handle, CHAMELON_data_handle_t *dest_handle, + int *TILES_SRC, int *TILES_DST, + CHAMELON_data_handle_t* idxs_from_upper_h, + CHAMELON_data_handle_t* idxs_to_upper_h, CHAMELON_option_t *options) +{ + int tempk = k * A->mb; + int tempkmt = k * A->mt; + int tempm = A->m - tempk; + int tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + int tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + int tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + CHAMELON_desc_t *updatePanel = chameleon_desc_submatrix(A, tempk, n*A->nb, tempm, tempnn); + + if (options->swap_grain == CHAMELON_LINE_SWAPS) { + int m; + + int nb_source = TILES_SRC[tempkmt]; + ALLOCATE_DYNAMIC_HANDLE_PTR(handle_upper_to_itself, CHAMELON_Complex64_t, nb_source * tempnn); + /* ALLOCATE_DYNAMIC_HANDLE(handle_idxs_upper_to_itself, int, nb_source); */ + if (nb_source > 0) { + m = 0; + int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + CHAMELON_TASK_zswp_extract_rows2(options, nb_source, A->mb, k, k, k, tempmm, tempnn, IPIV_handle, source_handle, A(k,n), handle_upper_to_itself, &idxs_from_upper_h[tempkmt]); + } + + for (m = 1 ; m < updatePanel->mt ; m++) + { + int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + nb_source = TILES_SRC[tempkmt+m]; + int nb_dest = TILES_DST[tempkmt+m]; + if (nb_source > 0 || nb_dest > 0) { + /* ALLOC */ + ALLOCATE_DYNAMIC_HANDLE_PTR(handle_to_upper_tile, CHAMELON_Complex64_t, nb_source * tempnn); + ALLOCATE_DYNAMIC_HANDLE_PTR(handle_from_upper_tile, CHAMELON_Complex64_t, nb_dest * tempnn); + /* ALLOCATE_DYNAMIC_HANDLE(handle_idxs_to_upper_tile, int, nb_source); */ + /* ALLOCATE_DYNAMIC_HANDLE(handle_idxs_from_upper_tile, int, nb_dest); */ + + CHAMELON_TASK_zswp_extract_rows2(options, nb_dest, A->mb, k, k, k+m, tempmm, tempnn, IPIV_handle, dest_handle, A(k,n), handle_from_upper_tile, &idxs_to_upper_h[tempkmt+m]); + CHAMELON_TASK_zswp_extract_rows2(options, nb_source, A->mb, k, k+m, k, tempmm, tempnn, IPIV_handle, source_handle, A(k+m,n), handle_to_upper_tile, &idxs_from_upper_h[tempkmt+m]); + + //Pour réinjecter ce que l'on a récup dans chacune + CHAMELON_TASK_zswp_insert_rows2(options, nb_source, A->mb, k, k+m, k, tempmm, tempnn, source_handle, A(k,n), handle_to_upper_tile, &idxs_from_upper_h[tempkmt+m]); + CHAMELON_TASK_zswp_insert_rows2(options, nb_dest, A->mb, k, k, k+m, tempmm, tempnn, dest_handle, A(k+m,n), handle_from_upper_tile, &idxs_to_upper_h[tempkmt+m]); + + /* TODO: make these evolve to free the handle and its data */ + /* RUNTIME_data_flush_handle(options, handle_to_upper_tile); */ + /* RUNTIME_data_flush_handle(options, handle_from_upper_tile); */ + RUNTIME_data_unregister_free_async(handle_to_upper_tile); + RUNTIME_data_unregister_free_async(handle_from_upper_tile); + /* RUNTIME_data_flush_handle(options, handle_idxs_to_upper_tile); */ + /* RUNTIME_data_flush_handle(options, handle_idxs_from_upper_tile); */ + } + } + + nb_source = TILES_SRC[tempkmt]; + if (nb_source > 0) { + m = 0; + int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + CHAMELON_TASK_zswp_insert_rows2(options, nb_source, A->mb, k, k, k, tempmm, tempnn, source_handle, A(k,n), handle_upper_to_itself, &idxs_from_upper_h[tempkmt]); + + /* RUNTIME_data_flush_handle(options, handle_upper_to_itself); */ + RUNTIME_data_unregister_free_async(handle_upper_to_itself); + /* RUNTIME_data_flush_handle(options, handle_idxs_upper_to_itself); */ + } + } else if (options->swap_grain == CHAMELON_TILE_SWAPS) { + int m; + + for (m = 0 ; m < updatePanel->mt ; m++) + { + if (TILES_SRC[tempkmt+m] >= 1) { + CHAMELON_TASK_zswpp(options, + updatePanel, A(k, n), A(k+m, n), + 1, tempkm, + IPIV_handle, 1); + } + } + } else if (options->swap_grain == CHAMELON_COLUMN_SWAPS) { + CHAMELON_TASK_zlaswp_ontile(options, + updatePanel, A(k, n), 1, + chameleon_min(tempkm, tempkn), IPIV_handle, 1); + } else { + /* + * Using something other than a DSD matrix without SWAPs produces + * unstable behaviour. Remove this assert with caution. + */ + assert(options->dsd_matrix == 1); + } + /* TODO: free updatePanel asynchronously */ +} + +static void compute_swap_amount(CHAMELON_option_t *options, int k, + CHAMELON_desc_t *A, CHAMELON_data_handle_t *IPIV, + CHAMELON_data_handle_t *perm_from_upper, + CHAMELON_data_handle_t *perm_to_upper, + int* nbswaps_from_upper, int* nbswaps_to_upper, + int* idxs_from_upper[], int* idxs_to_upper[], + CHAMELON_data_handle_t* idxs_from_upper_h, + CHAMELON_data_handle_t* idxs_to_upper_h) +{ + int tempk = k * A->mb; + int tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + int tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + int minkmn = ((tempkm < tempkn)?tempkm:tempkn); + + if (options->swap_grain == CHAMELON_LINE_SWAPS) { + int i; + pivot_to_permutation(tempkm, k*A->mb, IPIV_p(k,0), + perm_from_upper_p(k,0), perm_to_upper_p(k,0)); + + for (i=0 ; i<tempkm ; i++) { + int index = (*perm_from_upper_p(k,i)-tempk)/((int)A->mb); + if (*perm_from_upper_p(k,i) != tempk+i) { + nbswaps_from_upper[A->mt*k+index] += 1; + } + index = (*perm_to_upper_p(k,i)-tempk)/((int)A->mb); + if (*perm_to_upper_p(k,i) != tempk+i) { + nbswaps_to_upper[A->mt*k+index] += 1; + } + } + int count_from_upper[A->mt], count_to_upper[A->mt]; + for (i=0; i < A->mt; i++) { + if (nbswaps_from_upper[A->nt*k+i] || nbswaps_to_upper[A->nt*k+i]) + { + idxs_from_upper[A->nt*k+i] = (int*)malloc(nbswaps_from_upper[A->nt*k+i]*sizeof(int)); + /* RUNTIME_data_alloc(&idxs_from_upper_h[A->nt*k+i]); */ + /* RUNTIME_vector_data_register(&idxs_from_upper_h[A->nt*k+i], 0, */ + /* (uintptr_t)idxs_from_upper[A->nt*k+i], */ + /* nbswaps_from_upper[A->nt*k+i], sizeof(int)); */ + + idxs_to_upper[A->nt*k+i] = (int*) malloc(nbswaps_to_upper[A->nt*k+i]*sizeof(int)); + /* RUNTIME_data_alloc(&idxs_to_upper_h[A->nt*k+i]); */ + /* RUNTIME_vector_data_register(&idxs_to_upper_h[A->nt*k+i], 0, */ + /* (uintptr_t)idxs_to_upper[A->nt*k+i], */ + /* nbswaps_to_upper[A->nt*k+i], sizeof(int)); */ + } + count_from_upper[i] = 0; + count_to_upper[i] = 0; + } + + for (i=0 ; i < tempkm; i++) + { + int index = (*perm_from_upper_p(k,i)-tempk)/((int)A->mb); + if (*perm_from_upper_p(k,i) != tempk+i) + idxs_from_upper[A->nt*k+index][count_from_upper[index]++] = i; + + index = (*perm_to_upper_p(k,i)-tempk)/((int)A->mb); + if (*perm_to_upper_p(k,i) != tempk+i) + idxs_to_upper[A->nt*k+index][count_to_upper[index]++] = i; + } + + for (i=0; i < A->mt; i++) { + idxs_from_upper_h[A->nt*k+i].data_handle = NULL; + idxs_to_upper_h[A->nt*k+i].data_handle = NULL; + if (nbswaps_from_upper[A->nt*k+i] || nbswaps_to_upper[A->nt*k+i]) + { + RUNTIME_data_alloc(&idxs_from_upper_h[A->nt*k+i]); + RUNTIME_vector_data_register(&idxs_from_upper_h[A->nt*k+i], 0, + (uintptr_t)idxs_from_upper[A->nt*k+i], + nbswaps_from_upper[A->nt*k+i], sizeof(int)); + + RUNTIME_data_alloc(&idxs_to_upper_h[A->nt*k+i]); + RUNTIME_vector_data_register(&idxs_to_upper_h[A->nt*k+i], 0, + (uintptr_t)idxs_to_upper[A->nt*k+i], + nbswaps_to_upper[A->nt*k+i], sizeof(int)); + } + } + } else if (options->swap_grain == CHAMELON_TILE_SWAPS) { + int i; + for (i=0 ; i<tempkm ; i++) { + int index = (*IPIV_p(k,i)-tempk-1)/((int)A->mb); + if (*IPIV_p(k,i) != tempk+i+1) + nbswaps_from_upper[A->mt*k+index] += 1; + } + } +} + +static void factorize_panel(int rectil, CHAMELON_option_t *options, + int k, CHAMELON_desc_t *A, CHAMELON_data_handle_t *IPIV, + CHAMELON_data_handle_t *workspace) + { + int tempk = k * A->mb; + int tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + int tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + int minkmn = ((tempkm < tempkn)?tempkm:tempkn); + + if (rectil) + { + CHAMELON_TASK_zgetrf_rectil( + options, + chameleon_desc_submatrix(A, tempk, k*A->nb, A->m - tempk, tempkn), + IPIV); + } + else + { + int h, m, d, r; + int height, dmax, pui; + CHAMELON_data_handle_t *top; + CHAMELON_data_handle_t *bottom; + for(h = 0; h <= minkmn; h++) { + /* Apply last update and extract local max */ + for (m = 0; m < A->mt; m++) { + int tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + CHAMELON_TASK_zgetrf_extract_max(options, + k,tempmm,tempkn,h,A->nb, + A(m,k), + workspace + m); + } + if(h != minkmn) { + /* Reduce max */ + height = A->mt-k; + dmax = (int)ceil( log ( height ) / log(2.) ); + /* if(h==0) */ + /* printf("k=%d, height=%d, dmax=%d\n",k,height,dmax); */ + for(d = 0; d < dmax; d++){ + pui = (1 << d); + for(r = 0; (r+pui) < height; r+=pui*2) { + /* if(h == 0) */ + /* printf("reduction entre %d et %d a %d\n",r+k,r+k+pui,k); */ + top = workspace + r + k; + bottom = workspace + r + k + pui; + CHAMELON_TASK_zgetrf_compare_workspace(options, + tempkn, h, A->nb, + top, bottom); + } + } + /* Broadcast global max */ + for (m = k+1; m < A->mt; m++) { + top = workspace + m; + bottom = workspace + k; + CHAMELON_TASK_zgetrf_copy_workspace(options, + tempkn, h, A->nb, + top, bottom); + + } + /* Save pivot */ + CHAMELON_TASK_zgetrf_save_pivots(options, h, IPIV, workspace + k); + } + } + + } + + } + +void chameleon_pzgetrf_pp(CHAMELON_desc_t *A, + CHAMELON_data_handle_t *IPIV, + CHAMELON_data_handle_t *perm_from_upper, + CHAMELON_data_handle_t *perm_to_upper, + int *nbswaps_from_upper, int *nbswaps_to_upper, + CHAMELON_data_handle_t *workspace, + CHAMELON_sequence_t *sequence, + CHAMELON_request_t *request) +{ + CHAMELON_context_t *chameleon; + CHAMELON_option_t options; + int k, m, n; + int tempkm, tempkn, tempmm, tempnn, minmnt; + int ldak, ldam; + int rectil = 0, ncores = -1; + int *cores = NULL; + int* idxs_from_upper[A->nt*A->mt]; + int* idxs_to_upper[A->nt*A->mt]; + CHAMELON_data_handle_t idxs_from_upper_h[A->nt*A->mt]; + CHAMELON_data_handle_t idxs_to_upper_h[A->nt*A->mt]; + + CHAMELON_Complex64_t zone = (CHAMELON_Complex64_t) 1.0; + CHAMELON_Complex64_t mzone = (CHAMELON_Complex64_t)-1.0; + + chameleon = chameleon_context_self(); + if (sequence->status != CHAMELON_SUCCESS) + return; + + RUNTIME_options_init( &options, chameleon, sequence, request ); + + minmnt = (A->mt < A->nt)? A->mt : A->nt; + + /* Treat the related chameleon context options */ + if (CHAMELON_RECTIL_PANEL) + { + rectil = 1; + /* TODO: Fix it. This doesn't work with GPUs, it's too simple: + * we need to fetch a CPU list from the Runtime, and put ids from this list + * To do this, the best thing would be to push this into the runtime alloc function + */ + if (CHAMELON_PANEL_THREADS > 0) + { + ncores = CHAMELON_PANEL_THREADS; + cores = (int*)malloc(ncores * sizeof(int)); + for (k=0; k<ncores; k++) + cores[k] = k; + } + RUNTIME_alloc_threads(&options, ncores, cores); + free(cores); + CORE_zgetrf_rectil_init(); + } + + if (options.swap_grain == CHAMELON_LINE_SWAPS) { + assert(A->m%A->mb == 0); + assert(A->n%A->nb == 0); + } + + /***********************/ + /* Panel factorization */ + /***********************/ + options.priority = 2*A->nt; + factorize_panel(rectil, &options, 0, A, + IPIV_h(0), workspace); + + for (k = 0; k < minmnt; k++) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + ldak = BLKLDD(A, k); + options.priority = 2*A->nt - k; + + /***************************/ + /* Compute amount of swaps */ + /***************************/ + RUNTIME_data_acquire(IPIV_h(k)); + /* RUNTIME_data_acquire_write(perm_from_upper_h(k)); */ + /* RUNTIME_data_acquire_write(perm_to_upper_h(k)); */ + compute_swap_amount(&options, k, A, IPIV, perm_from_upper, perm_to_upper, + nbswaps_from_upper, nbswaps_to_upper, + idxs_from_upper, idxs_to_upper, + idxs_from_upper_h, idxs_to_upper_h); + /* RUNTIME_data_release(perm_to_upper_h(k)); */ + /* RUNTIME_data_release(perm_from_upper_h(k)); */ + RUNTIME_data_release(IPIV_h(k)); + + /* TRSM_U */ + for (n = k+1; n < A->nt; n++) { + tempnn = ((n == A->nt-1)? A->n-n*A->nb : A->nb); + options.priority = 2*A->nt-n-k; + + /* options.priority = 2*A->nt - 2*k - n; */ + unsigned old_swap = options.swap_grain; + if (n > k + 10) + /* if (n == k+1 || k == 0) */ + /* if (k == 0) */ + /* options.swap_grain = CHAMELON_COLUMN_SWAPS; */ + submit_swaps(A, k, n, IPIV_h(k), perm_from_upper_h(k), perm_to_upper_h(k), nbswaps_from_upper, nbswaps_to_upper, idxs_from_upper_h, idxs_to_upper_h, &options); + options.swap_grain = old_swap; + + CHAMELON_TASK_ztrsm( + &options, + ChamLeft, ChamLower, ChamNoTrans, ChamUnit, + tempkm, tempnn, tempnn, + zone, A(k, k), ldak, + A(k, n), ldak); + + /* GEMM */ + for (m = k+1; m < A->mt; m++) { + /* options.priority = A->mt*A->nt - 2*k - n - m; */ + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + + CHAMELON_TASK_zgemm( + &options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkm, tempkn, + mzone, A(m, k), ldam, + A(k, n), ldak, + zone, A(m, n), ldam); + } + + + if (n==k+1) + { + /* options.priority = 2*A->nt - 2*n; */ + factorize_panel(rectil, &options, n, A, + IPIV_h(n), workspace); + } + } + + /* Backward swaps */ + unsigned old_swap = options.swap_grain; + options.swap_grain = CHAMELON_COLUMN_SWAPS; + /* options.priority = 0; */ + for (n = 0 ; n < k ; n++) { + /* options.priority = 2*A->nt - 2*k - n; */ + options.priority = 2*A->nt - k - n; + submit_swaps(A, k, n, IPIV_h(k), perm_from_upper_h(k), perm_to_upper_h(k), + nbswaps_from_upper, nbswaps_to_upper, + idxs_from_upper_h, idxs_to_upper_h, &options); + } + options.swap_grain = old_swap; + + /*******************************************/ + /* Let's properly cleanup to help StarPU ! */ + /*******************************************/ + + /* for (n = 0; n < A->nt; n++) */ + /* RUNTIME_data_flush( &options, A(k, n) ); */ + + RUNTIME_data_flush_handle(&options, IPIV_h(k)); + if (options.swap_grain == CHAMELON_LINE_SWAPS) + for (m = 0 ; m < A->mt ; m++) { + RUNTIME_data_flush_handle(&options, perm_to_upper_h(k)); + RUNTIME_data_flush_handle(&options, perm_from_upper_h(k)); + RUNTIME_data_flush_handle(&options, &idxs_from_upper_h[k*A->nt+m]); + RUNTIME_data_flush_handle(&options, &idxs_to_upper_h[k*A->nt+m]); + RUNTIME_data_unregister_free_async(&idxs_from_upper_h[k*A->nt+m]); + RUNTIME_data_unregister_free_async(&idxs_to_upper_h[k*A->nt+m]); + } + } + + if (CHAMELON_RECTIL_PANEL) + { + RUNTIME_unalloc_threads(&options); + } + RUNTIME_options_finalize( &options, chameleon ); + RUNTIME_flush(); +} + + +/*********************************************************** + * I do not know what the following version is. + * Omar also had this version so I will just leave it here + **********************************************************/ + +/* void chameleon_pzgetrf_pp_v2(CHAMELON_desc_t *A, CHAMELON_desc_t *Acopy, */ +/* CHAMELON_data_handle_t *IPIV, */ +/* CHAMELON_data_handle_t *perm_to_upper, */ +/* CHAMELON_data_handle_t *perm_from_upper, */ +/* CHAMELON_data_handle_t *workspace, */ +/* CHAMELON_sequence_t *sequence, */ +/* CHAMELON_request_t *request) */ +/* { */ +/* CHAMELON_context_t *chameleon; */ +/* CHAMELON_option_t options; */ +/* int k, m, n, h, d, r; */ +/* int height, pui, dmax; */ +/* int tempkm, tempkn, tempmm,tempnn; */ +/* int minkmn, minmnt; */ +/* int *pivots; */ +/* int ldak, ldam; */ +/* CHAMELON_data_handle_t *top, *bottom; */ +/* int nb_pivots; */ + +/* CHAMELON_Complex64_t zone = (CHAMELON_Complex64_t) 1.0; */ +/* CHAMELON_Complex64_t mzone = (CHAMELON_Complex64_t)-1.0; */ + +/* chameleon = chameleon_context_self(); */ +/* if (sequence->status != CHAMELON_SUCCESS) */ +/* return; */ + +/* RUNTIME_options_init( &options, chameleon, sequence, request ); */ + +/* for (n = 0; n < A->nt; n++) { */ + +/* tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; */ +/* minmnt = (A->mt < n)? A->mt : n; */ + +/* /\***********************\/ */ +/* /\* Update *\/ */ +/* /\***********************\/ */ +/* for (k = 0; k < minmnt; k++) { */ +/* tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; */ +/* tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; */ +/* minkmn = ((tempkm < A->nb) ? tempkm : A->nb); */ +/* ldak = BLKLDD(A, k); */ + +/* /\* SWAP *\/ */ +/* CHAMELON_TASK_zlacpy( &options, ChamUpperLower, */ +/* tempkm, tempnn, tempkn, */ +/* A(k,n), ldak, */ +/* Acopy(0,n), ldak); */ + +/* pivots = perm_to_upper_p(k); */ +/* for (m = k+1; m < A->mt; m++) { */ +/* tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; */ +/* ldam = BLKLDD(A, m); */ + +/* nb_pivots = how_much_swap_to(tempkm, A->mb, m, pivots); */ +/* /\* printf("nb_pivots=%d between A(%d,%d) and A(%d,%d)\n",nb_pivots,k,n,m,n); *\/ */ +/* if(nb_pivots > 0) { */ +/* ALLOCATE_DYNAMIC_HANDLE(handle_to_upper_tile, CHAMELON_Complex64_t, nb_pivots * A->nb); */ +/* ALLOCATE_DYNAMIC_HANDLE(handle_from_upper_tile, CHAMELON_Complex64_t, nb_pivots * A->nb); */ + +/* CHAMELON_TASK_zswp_extract_rows(&options, k, k, m, tempkm, tempmm, tempnn, A->mb, perm_from_upper_h(k), Acopy(0,n), handle_from_upper_tile); */ +/* CHAMELON_TASK_zswp_extract_rows(&options, k, m, k, tempkm, tempmm, tempnn, A->mb, perm_to_upper_h(k), A(m,n), handle_to_upper_tile); */ + +/* CHAMELON_TASK_zswp_insert_rows(&options, k, m, k, tempkm, tempmm, tempnn, A->mb, perm_to_upper_h(k), A(k,n), handle_to_upper_tile); */ +/* CHAMELON_TASK_zswp_insert_rows(&options, k, k, m, tempkm, tempmm, tempnn, A->mb, perm_from_upper_h(k), A(m,n), handle_from_upper_tile); */ +/* } */ +/* } */ + +/* nb_pivots = how_much_swap_to(tempkm, A->mb, k, pivots); */ + +/* /\* printf("nb_pivots=%d in A(%d,%d)\n",nb_pivots,k,n); *\/ */ +/* if(nb_pivots > 0) { */ +/* ALLOCATE_DYNAMIC_HANDLE(handle_to_upper_tile, CHAMELON_Complex64_t, nb_pivots * A->nb); */ +/* CHAMELON_TASK_zswp_extract_rows(&options, k, k, k, tempkm, tempkm, tempnn, A->mb, perm_to_upper_h(k), Acopy(0,n), handle_to_upper_tile); */ +/* CHAMELON_TASK_zswp_insert_rows(&options, k, k, k, tempkm, tempkm, tempnn, A->mb, perm_to_upper_h(k), A(k,n), handle_to_upper_tile); */ +/* } */ + +/* /\* TRSM_U *\/ */ +/* CHAMELON_TASK_ztrsm( */ +/* &options, */ +/* ChamLeft, ChamLower, ChamNoTrans, ChamUnit, */ +/* tempkm, tempnn, tempkm, */ +/* zone, A(k, k), ldak, */ +/* A(k, n), ldak); */ + +/* /\* GEMM *\/ */ +/* for (m = k+1; m < A->mt; m++) { */ +/* tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; */ +/* CHAMELON_TASK_zgemm( */ +/* &options, */ +/* ChamNoTrans, ChamNoTrans, */ +/* tempmm, tempnn, tempkm, tempkn, */ +/* mzone, A(m, k), ldam, */ +/* A(k, n), ldak, */ +/* zone, A(m, n), ldam); */ +/* } */ +/* } */ + +/* if (n < A->mt) { */ +/* /\***********************\/ */ +/* /\* Panel factorization *\/ */ +/* /\***********************\/ */ + +/* tempkm = n == A->mt-1 ? A->m-n*A->mb : A->mb; */ +/* tempkn = n == A->nt-1 ? A->n-n*A->nb : A->nb; */ +/* minkmn = ((tempkm < tempkn)?tempkm:tempkn); */ + +/* for(h = 0; h <= minkmn; h++) { */ +/* /\* Apply last update and extract local max *\/ */ +/* for (m = n; m < A->mt; m++) { */ +/* tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; */ +/* CHAMELON_TASK_zgetrf_extract_max(&options, */ +/* n,tempmm,tempkn,h,A->nb, */ +/* A(m,n), */ +/* workspace + m); */ +/* } */ +/* if(h != minkmn) { */ +/* /\* Reduce max *\/ */ +/* height = A->mt-n; */ +/* dmax = (int)ceil( log ( height ) / log(2.) ); */ +/* /\* if(h==0) *\/ */ +/* /\* printf("k=%d, height=%d, dmax=%d\n",k,height,dmax); *\/ */ +/* for(d = 0; d < dmax; d++){ */ +/* pui = (1 << d); */ +/* for(r = 0; (r+pui) < height; r+=pui*2) { */ +/* /\* if(h == 0) *\/ */ +/* /\* printf("reduction entre %d et %d a %d\n",r+k,r+k+pui,k); *\/ */ +/* top = workspace + r + n; */ +/* bottom = workspace + r + n + pui; */ +/* CHAMELON_TASK_zgetrf_compare_workspace(&options, */ +/* tempkn, h, A->nb, */ +/* top, bottom); */ +/* } */ +/* } */ +/* /\* Broadcast global max *\/ */ +/* for (m = n+1; m < A->mt; m++) { */ +/* top = workspace + m; */ +/* bottom = workspace + n; */ +/* CHAMELON_TASK_zgetrf_copy_workspace(&options, */ +/* tempkn, h, A->nb, */ +/* top, bottom); */ + +/* } */ +/* /\* Save pivot *\/ */ +/* CHAMELON_TASK_zgetrf_save_pivots(&options, h, IPIV_h(n), workspace + n); */ +/* } */ +/* } */ + +/* RUNTIME_data_acquire(IPIV_h(k)); */ +/* pivot_to_permutation(minkmn, k*A->mb, IPIV_p(k), perm_to_upper_p(k), perm_from_upper_p(k)); */ +/* RUNTIME_data_release(IPIV_h(k)); */ +/* } */ +/* } */ + +/* RUNTIME_options_finalize(&options, chameleon); */ +/* } */ diff --git a/compute/pzgetrf_sp.c b/compute/pzgetrf_sp.c new file mode 100644 index 0000000000000000000000000000000000000000..76555adae0a147c61775e359fc9bf00be185dbf0 --- /dev/null +++ b/compute/pzgetrf_sp.c @@ -0,0 +1,117 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file pzgetrf_sp.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-07-06 + * @precisions normal z -> s d c + * + **/ +#include "control/common.h" + +#define A(m,n) A, m, n + +/***************************************************************************//** + * Parallel tile LU decomposition - dynamic scheduling + **/ +void chameleon_pzgetrf_sp(CHAMELON_desc_t *A, + double criteria, CHAMELON_data_handle_t *nb_pivot_handle, + CHAMELON_sequence_t *sequence, CHAMELON_request_t *request) +{ + CHAMELON_context_t *chameleon; + CHAMELON_option_t options; + + int k, m, n; + int tempkm, tempkn, tempmm,tempnn; + int minmnt; + int ldak, ldam; + size_t ws_host = 0; + + CHAMELON_Complex64_t zone = (CHAMELON_Complex64_t) 1.0; + CHAMELON_Complex64_t mzone = (CHAMELON_Complex64_t)-1.0; + + chameleon = chameleon_context_self(); + if (sequence->status != CHAMELON_SUCCESS) + return; + RUNTIME_options_init(&options, chameleon, sequence, request); + +#ifdef CHAMELEON_USE_MAGMA + if (0) /* Disable the workspace as long as it is is not used (See StarPU codelet) */ + { + int nb = CHAMELON_IB; /* Approximate nb for simulation */ +#if !defined(CHAMELEON_SIMULATION) + nb = magma_get_zpotrf_nb(A->nb); +#endif + ws_host = sizeof(CHAMELON_Complex64_t)*nb*nb; + } +#endif + RUNTIME_options_ws_alloc( &options, 0, ws_host ); + + minmnt = (A->mt < A->nt)? A->mt : A->nt; + for (k = 0; k < minmnt; k++) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + ldak = BLKLDD(A, k); + + CHAMELON_TASK_zgetrf_sp( + &options, + tempkm, tempkn, + A(k, k), criteria, nb_pivot_handle); + /* TRSM_L */ + for (m = k+1; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + CHAMELON_TASK_ztrsm( + &options, + ChamRight, ChamUpper, ChamNoTrans, ChamNonUnit, + tempkm, tempmm, A->mb, + zone, A(k, k), ldak, + A(m, k), ldam); + } + + /* TRSM_U */ + for (n = k+1; n < A->nt; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + CHAMELON_TASK_ztrsm( + &options, + ChamLeft, ChamLower, ChamNoTrans, ChamUnit, + tempkm, tempnn, A->nb, + zone, A(k, k), ldak, + A(k, n), ldak); + } + + /* GEMM */ + for (m = k+1; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + for (n = k+1; n < A->nt; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + CHAMELON_TASK_zgemm( + &options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, A->mb, A->mb, + mzone, A(m, k), ldam, + A(k, n), ldak, + zone, A(m, n), ldam); + } + } + } + + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chameleon); +} diff --git a/compute/pzplrnt.c b/compute/pzplrnt.c index b6fba32067df9ea3a9487302f90569238003efd2..fddb3b6c8e321d91e90b44164011c8e3ebb0f1e3 100644 --- a/compute/pzplrnt.c +++ b/compute/pzplrnt.c @@ -53,7 +53,7 @@ void chameleon_pzplrnt( CHAM_desc_t *A, INSERT_TASK_zplrnt( &options, tempmm, tempnn, A(m, n), - bigM, m*A->mb + m0, n*A->nb + n0, seed ); + bigM, m*A->mb + m0, n*A->nb + n0, seed, options.dsd_matrix ); } } RUNTIME_options_finalize(&options, chamctxt); diff --git a/compute/zgetrf_pp.c b/compute/zgetrf_pp.c new file mode 100644 index 0000000000000000000000000000000000000000..cd09d6ca1c8e9a18385b5b77550455db680b8b17 --- /dev/null +++ b/compute/zgetrf_pp.c @@ -0,0 +1,392 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file zgetrf_pp.c + * + * CHAMELON computational routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.6.0 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-07-06 + * + * @precisions normal z -> s d c + * + **/ +#include "control/common.h" + +/***************************************************************************//** + * + * @ingroup CHAMELON_Complex64_t + * + * CHAMELON_zgetrf_pp - Computes an LU factorization of a general M-by-N matrix A + * using the tile LU algorithm with partial pivoting with row interchanges. + * + ******************************************************************************* + * + * @param[in] M + * The number of rows of the matrix A. M >= 0. + * + * @param[in] N + * The number of columns of the matrix A. N >= 0. + * + * @param[in,out] A + * On entry, the M-by-N matrix to be factored. + * On exit, the tile factors L and U from the factorization. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * + * @param[out] IPIV + * The pivot indices that define the permutations (not equivalent to LAPACK). + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * \retval >0 if i, U(i,i) is exactly zero. The factorization has been completed, + * but the factor U is exactly singular, and division by zero will occur + * if it is used to solve a system of equations. + * + ******************************************************************************* + * + * @sa CHAMELON_zgetrf_Tile + * @sa CHAMELON_zgetrf_Tile_Async + * @sa CHAMELON_cgetrf + * @sa CHAMELON_dgetrf + * @sa CHAMELON_sgetrf + * @sa CHAMELON_zgetrs + * + ******************************************************************************/ +/* TODO: Fix the untiled version !! */ +int CHAMELON_zgetrf_pp(int M, int N, + CHAMELON_Complex64_t *A, int LDA, int *IPIV) +{ + int NB; + int status; + CHAMELON_desc_t descAl, descAt; + CHAMELON_context_t *chameleon; + CHAMELON_sequence_t *sequence = NULL; + CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER; + + chameleon = chameleon_context_self(); + if (chameleon == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_pp", "CHAMELON not initialized"); + return CHAMELON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if (M < 0) { + chameleon_error("CHAMELON_zgetrf_pp", "illegal value of M"); + return -1; + } + if (N < 0) { + chameleon_error("CHAMELON_zgetrf_pp", "illegal value of N"); + return -2; + } + if (LDA < chameleon_max(1, M)) { + chameleon_error("CHAMELON_zgetrf_pp", "illegal value of LDA"); + return -4; + } + /* Quick return */ + if (chameleon_min(M, N) == 0) + return CHAMELON_SUCCESS; + + chameleon_sequence_create(chameleon, &sequence); + + /* Set NT & NTRHS */ + NB = CHAMELON_NB; + + /* Submit the matrix conversion */ + chameleon_zlap2tile( chameleon, &descAl, &descAt, ChamDescInout, ChamUpperLower, + A, NB, NB, LDA, N, N, N, sequence, &request); + + int minmn = (M < N) ? M : N; + int nb_pivots = minmn / NB + ((minmn % NB) ? 1 : 0); + int minmnt = nb_pivots * NB; + int *nbswaps_from_upper = (int*)calloc(minmn, sizeof(int)); + int *nbswaps_to_upper = (int*)calloc(minmn, sizeof(int)); + int *swaps_from_upper = (int*)calloc(minmnt * descAt.nb, sizeof(int)); + int *swaps_to_upper = (int*)calloc(minmnt * descAt.nb, sizeof(int)); + + CHAMELON_data_handle_t IPIV_h[nb_pivots]; + CHAMELON_data_handle_t swaps_from_upper_h[nb_pivots]; + CHAMELON_data_handle_t swaps_to_upper_h[nb_pivots]; + + int i; + for(i = 0; i < nb_pivots; i++) { + RUNTIME_data_alloc(IPIV_h + i); + RUNTIME_vector_data_register(IPIV_h + i, 0, (uintptr_t)(IPIV + i * descAt.nb), descAt.nb, sizeof(int)); + + RUNTIME_data_alloc(swaps_from_upper_h + i); + RUNTIME_vector_data_register(swaps_from_upper_h + i, 0, + (uintptr_t)(swaps_from_upper + i * descAt.nb), + descAt.nb, sizeof(int)); + + RUNTIME_data_alloc(swaps_to_upper_h + i); + RUNTIME_vector_data_register(swaps_to_upper_h + i, 0, + (uintptr_t)(swaps_to_upper+ i * descAt.nb), + descAt.nb, sizeof(int)); + } + + int nb_workspace = descAt.mt; + int workspace_size = NB * 2 + 1; + CHAMELON_data_handle_t *max_workspace = malloc(sizeof(CHAMELON_data_handle_t) * nb_workspace); + CHAMELON_Complex64_t **workspaces = malloc(sizeof(CHAMELON_Complex64_t *) * nb_workspace); + + for(i = 0; i < nb_workspace; i++) { + workspaces[i] = malloc(sizeof(CHAMELON_Complex64_t) * workspace_size); + RUNTIME_vector_data_register(max_workspace + i, 0, (uintptr_t)(workspaces[i]), + workspace_size, sizeof(CHAMELON_Complex64_t)); + } + + /* Call the tile interface */ + CHAMELON_zgetrf_pp_Tile_Async(&descAt, IPIV_h, swaps_from_upper_h, swaps_to_upper_h, + nbswaps_from_upper, nbswaps_to_upper, max_workspace, + sequence, &request); + + + chameleon_ztile2lap( chameleon, &descAl, &descAt, + ChamDescInout, ChamUpperLower, sequence, &request ); + + chameleon_sequence_wait( chameleon, sequence ); + + /* Cleanup the temporary data */ + chameleon_ztile2lap_cleanup( chameleon, &descAl, &descAt ); + + for (i = 0 ; i < nb_pivots ; i++) { + RUNTIME_data_unregister(IPIV_h+i); + RUNTIME_data_unregister(swaps_from_upper_h+i); + RUNTIME_data_unregister(swaps_to_upper_h+i); + } + + free(nbswaps_from_upper); + free(nbswaps_to_upper); + free(swaps_from_upper); + free(swaps_to_upper); + free(max_workspace); + for(i = 0; i < nb_workspace; i++) { + free(workspaces[i]); + } + free(workspaces); + + status = sequence->status; + chameleon_sequence_destroy( chameleon, sequence ); + + return status; +} + +/***************************************************************************//** + * + * @ingroup CHAMELON_Complex64_t_Tile + * + * CHAMELON_zgetrf_pp_Tile - Computes the tile LU factorization of a matrix. + * Tile equivalent of CHAMELON_zgetrf(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * @param[in,out] A + * On entry, the M-by-N matrix to be factored. + * On exit, the tile factors L and U from the factorization. + * + * @param[out] IPIV + * The pivot indices that define the permutations (not equivalent to LAPACK). + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval >0 if i, U(i,i) is exactly zero. The factorization has been completed, + * but the factor U is exactly singular, and division by zero will occur + * if it is used to solve a system of equations. + * + ******************************************************************************* + * + * @sa CHAMELON_zgetrf + * @sa CHAMELON_zgetrf_Tile_Async + * @sa CHAMELON_cgetrf_Tile + * @sa CHAMELON_dgetrf_Tile + * @sa CHAMELON_sgetrf_Tile + * @sa CHAMELON_zgetrs_Tile + * + ******************************************************************************/ +int CHAMELON_zgetrf_pp_Tile(CHAMELON_desc_t *A, int *IPIV) +{ + /* FIXME: Actually use correct nb/mb/min(...) scheme for computing pivot amount */ + int minmn = chameleon_min(A->m, A->n); + int minmnt = (A->mt < A->nt) ? A->mt : A->nt; + int nb_pivots = minmn / A->nb + ((minmn % A->nb) ? 1 : 0); + int status; + int i; + CHAMELON_context_t *chameleon = chameleon_context_self(); + CHAMELON_sequence_t *sequence = NULL; + CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER; + + int *swaps_from_upper = (int*)calloc(chameleon_max(A->m, A->n), sizeof(int)); + int *swaps_to_upper = (int*)calloc(chameleon_max(A->m, A->n), sizeof(int)); + CHAMELON_data_handle_t IPIV_h[nb_pivots]; + CHAMELON_data_handle_t swaps_from_upper_h[nb_pivots]; + CHAMELON_data_handle_t swaps_to_upper_h[nb_pivots]; + + int *nbswaps_from_upper = (int*)calloc(nb_pivots * A->mt, sizeof(int)); + int *nbswaps_to_upper = (int*)calloc(nb_pivots * A->mt, sizeof(int)); + + int nb_workspace = A->mt; + int workspace_size = A->nb * 2 + 1; + CHAMELON_data_handle_t *max_workspace = malloc(sizeof(CHAMELON_data_handle_t) * nb_workspace); + CHAMELON_Complex64_t *workspaces = malloc(sizeof(CHAMELON_Complex64_t) * nb_workspace * workspace_size); + + for(i = 0; i < nb_workspace; i++) { + RUNTIME_data_alloc(max_workspace + i); + RUNTIME_vector_data_register(max_workspace + i, 0, + (uintptr_t)(workspaces + i * workspace_size), + workspace_size, sizeof(CHAMELON_Complex64_t)); + } + + for(i = 0; i < nb_pivots; i++) { + int tempin = i == A->nt-1 ? A->n-i*A->nb : A->nb; + RUNTIME_data_alloc(IPIV_h + i); + RUNTIME_vector_data_register(IPIV_h + i, 0, + (uintptr_t)(IPIV + i * A->nb), + tempin, sizeof(int)); + + RUNTIME_data_alloc(swaps_from_upper_h + i); + RUNTIME_vector_data_register(swaps_from_upper_h + i, 0, + (uintptr_t)(swaps_from_upper + i * A->nb), + tempin, sizeof(int)); + + RUNTIME_data_alloc(swaps_to_upper_h + i); + RUNTIME_vector_data_register(swaps_to_upper_h + i, 0, + (uintptr_t)(swaps_to_upper+ i * A->nb), + tempin, sizeof(int)); + } + + + chameleon_sequence_create(chameleon, &sequence); + + CHAMELON_zgetrf_pp_Tile_Async(A, IPIV_h, swaps_from_upper_h, swaps_to_upper_h, + nbswaps_from_upper, nbswaps_to_upper, + max_workspace, sequence, &request); + + CHAMELON_Desc_Flush( A, sequence ); + chameleon_sequence_wait(chameleon, sequence); + + for (i = 0 ; i < nb_pivots ; i++) { + RUNTIME_data_unregister(IPIV_h+i); + RUNTIME_data_free(IPIV_h+i); + + RUNTIME_data_unregister(swaps_from_upper_h+i); + RUNTIME_data_free(swaps_from_upper_h+i); + + RUNTIME_data_unregister(swaps_to_upper_h+i); + RUNTIME_data_free(swaps_to_upper_h+i); + } + + free(nbswaps_from_upper); + free(nbswaps_to_upper); + free(swaps_from_upper); + free(swaps_to_upper); + + for(i = 0; i < nb_workspace; i++) { + RUNTIME_data_unregister(max_workspace+i); + RUNTIME_data_free(max_workspace+i); + } + free(max_workspace); + free(workspaces); + + status = sequence->status; + chameleon_sequence_destroy(chameleon, sequence); + return status; +} + +/***************************************************************************//** + * + * @ingroup CHAMELON_Complex64_t_Tile_Async + * + * CHAMELON_zgetrf_pp_Tile_Async - Computes the tile LU factorization of a matrix. + * Non-blocking equivalent of CHAMELON_zgetrf_Tile(). + * May return before the computation is finished. + * Allows for pipelining of operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELON_zgetrf + * @sa CHAMELON_zgetrf_Tile + * @sa CHAMELON_cgetrf_Tile_Async + * @sa CHAMELON_dgetrf_Tile_Async + * @sa CHAMELON_sgetrf_Tile_Async + * @sa CHAMELON_zgetrs_Tile_Async + * + ******************************************************************************/ +int CHAMELON_zgetrf_pp_Tile_Async(CHAMELON_desc_t *A, + CHAMELON_data_handle_t *IPIV_h, + CHAMELON_data_handle_t *swaps_from_upper_h, + CHAMELON_data_handle_t *swaps_to_upper_h, + int *nbswaps_from_upper, + int *nbswaps_to_upper, + CHAMELON_data_handle_t *workspace_h, + CHAMELON_sequence_t *sequence, + CHAMELON_request_t *request) +{ + CHAMELON_context_t *chameleon; + + chameleon = chameleon_context_self(); + if (chameleon == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_pp_Tile", "CHAMELON not initialized"); + return CHAMELON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_pp_Tile", "NULL sequence"); + return CHAMELON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_pp_Tile", "NULL request"); + return CHAMELON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELON_SUCCESS) + request->status = CHAMELON_SUCCESS; + else + return chameleon_request_fail(sequence, request, CHAMELON_ERR_SEQUENCE_FLUSHED); + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELON_SUCCESS) { + chameleon_error("CHAMELON_zgetrf_pp_Tile", "invalid first descriptor"); + return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE); + } + + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELON_zgetrf_pp_Tile", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE); + } + + chameleon_pzgetrf_pp(A, IPIV_h, swaps_from_upper_h, swaps_to_upper_h, + nbswaps_from_upper, nbswaps_to_upper, workspace_h, + sequence, request); + + return CHAMELON_SUCCESS; +} diff --git a/compute/zgetrf_sp.c b/compute/zgetrf_sp.c new file mode 100644 index 0000000000000000000000000000000000000000..df4ae6fdede7d972f3fbbb3dde2d0265ae336120 --- /dev/null +++ b/compute/zgetrf_sp.c @@ -0,0 +1,273 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file zgetrf_sp.c + * + * CHAMELON computational routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.6.0 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-07-06 + * + * @precisions normal z -> s d c + * + **/ +#include "control/common.h" + +/***************************************************************************//** + * + * @ingroup CHAMELON_Complex64_t + * + * CHAMELON_zgetrf_sp - Computes the LU decomposition of a matrix A. + * The factorization has the form + * + * \f[ A = \{_{L\times U} \f] + * + * where U is an upper triangular matrix and L is a lower triangular matrix. + * + ******************************************************************************* + * + * @param[in] M + * The order of the matrix A. M >= 0. + * + * @param[in] N + * The order of the matrix A. N >= 0. + * + * @param[in,out] A + * On entry, the matrix A. + * On exit, if return value = 0, the factor U or L from the LU decomposition + * A = LU. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,N). + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if -i, the i-th argument had an illegal value + * \retval >0 if i, the leading minor of order i of A is not positive definite, so the + * factorization could not be completed, and the solution has not been computed. + * + ******************************************************************************* + * + * @sa CHAMELON_zgetrf_sp_Tile + * @sa CHAMELON_zgetrf_sp_Tile_Async + * @sa CHAMELON_cgetrf_sp + * @sa CHAMELON_dgetrf_sp + * @sa CHAMELON_sgetrf_sp + * + ******************************************************************************/ +int CHAMELON_zgetrf_sp(int M, int N, CHAMELON_Complex64_t *A, int LDA, + double criteria, int *nb_pivot) +{ + int NB; + int status; + CHAMELON_context_t *chameleon; + CHAMELON_sequence_t *sequence = NULL; + CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER; + CHAMELON_desc_t descAl, descAt; + + chameleon = chameleon_context_self(); + if (chameleon == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_sp", "CHAMELON not initialized"); + return CHAMELON_ERR_NOT_INITIALIZED; + } + /* Check input arguments */ + if (M < 0) { + chameleon_error("CHAMELON_zgetrf_sp", "illegal value of M"); + return -2; + } + if (N < 0) { + chameleon_error("CHAMELON_zgetrf_sp", "illegal value of N"); + return -2; + } + if (LDA < chameleon_max(1, M)) { + chameleon_error("CHAMELON_zgetrf_sp", "illegal value of LDA"); + return -4; + } + /* Quick return */ + if ((chameleon_max(M, 0) == 0) || (chameleon_max(N, 0) == 0)) + return CHAMELON_SUCCESS; + + /* Set NT */ + NB = CHAMELON_NB; + + /* Registering nb_pivot */ + int nb = 0; + CHAMELON_data_handle_t nb_pivot_handle; + RUNTIME_vector_data_register(&nb_pivot_handle, 0, (uintptr_t)&nb, 1, sizeof(int)); + + chameleon_sequence_create(chameleon, &sequence); + + chameleon_zlap2tile( chameleon, &descAl, &descAt, ChamDescInout, ChamUpperLower, + A, NB, NB, LDA, N, N, N, sequence, &request); + + /* Call the tile interface */ + CHAMELON_zgetrf_sp_Tile_Async(&descAt, sequence, &request, criteria, &nb_pivot_handle); + + chameleon_ztile2lap( chameleon, &descAl, &descAt, + ChamDescInout, ChamUpperLower, sequence, &request ); + chameleon_sequence_wait(chameleon, sequence); + + /* Cleanup the temporary data */ + chameleon_ztile2lap_cleanup( chameleon, &descAl, &descAt ); + + status = sequence->status; + chameleon_sequence_destroy(chameleon, sequence); + + RUNTIME_data_unregister(&nb_pivot_handle); + + RUNTIME_data_acquire(&nb_pivot_handle); + RUNTIME_data_release(&nb_pivot_handle); + *nb_pivot = nb; + + return status; +} + +/***************************************************************************//** + * + * @ingroup CHAMELON_Complex64_t_Tile + * + * CHAMELON_zgetrf_sp_Tile - Computes the LU decomposition of matrix. + * Tile equivalent of CHAMELON_zgetrf_sp(). + * Operates on matrices stored by tiles. + * All matrices are passed through descriptors. + * All dimensions are taken from the descriptors. + * + ******************************************************************************* + * + * + * @param[in] A + * On entry, the matrix A. + * On exit, if return value = 0, the factor U or L from the LU decomposition + * A = LU. + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval >0 if i, the leading minor of order i of A is not positive definite, so the + * factorization could not be completed, and the solution has not been computed. + * + ******************************************************************************* + * + * @sa CHAMELON_zgetrf_sp + * @sa CHAMELON_zgetrf_sp_Tile_Async + * @sa CHAMELON_cgetrf_sp_Tile + * @sa CHAMELON_dgetrf_sp_Tile + * @sa CHAMELON_sgetrf_sp_Tile + * + ******************************************************************************/ +int CHAMELON_zgetrf_sp_Tile(CHAMELON_desc_t *A, double criteria) +{ + CHAMELON_context_t *chameleon; + CHAMELON_sequence_t *sequence = NULL; + CHAMELON_request_t request = CHAMELON_REQUEST_INITIALIZER; + int status; + + chameleon = chameleon_context_self(); + if (chameleon== NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "CHAMELON not initialized"); + return CHAMELON_ERR_NOT_INITIALIZED; + } + + int nb = 0; + CHAMELON_data_handle_t nb_pivot_handle; + RUNTIME_data_alloc(&nb_pivot_handle); + RUNTIME_vector_data_register(&nb_pivot_handle, 0, (uintptr_t)&nb, 1, sizeof(int)); + + chameleon_sequence_create(chameleon, &sequence); + CHAMELON_zgetrf_sp_Tile_Async(A, sequence, &request, criteria, &nb_pivot_handle); + CHAMELON_Desc_Flush( A, sequence ); + chameleon_sequence_wait(chameleon, sequence); + + status = sequence->status; + chameleon_sequence_destroy(chameleon, sequence); + + RUNTIME_data_unregister(&nb_pivot_handle); + RUNTIME_data_free(&nb_pivot_handle); + + return status; +} + +/***************************************************************************//** + * + * @ingroup CHAMELON_Complex64_t_Tile_Async + * + * CHAMELON_zgetrf_sp_Tile_Async - Computes the LU decomposition + * Non-blocking equivalent of CHAMELON_zgetrf_sp_Tile(). + * May return before the computation is finished. + * Allows for pipelining of operations ar runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELON_zgetrf_sp + * @sa CHAMELON_zgetrf_sp_Tile + * @sa CHAMELON_cgetrf_sp_Tile_Async + * @sa CHAMELON_dgetrf_sp_Tile_Async + * @sa CHAMELON_sgetrf_sp_Tile_Async + * @sa CHAMELON_zpotrs_Tile_Async + * + ******************************************************************************/ +int CHAMELON_zgetrf_sp_Tile_Async(CHAMELON_desc_t *A, + CHAMELON_sequence_t *sequence, CHAMELON_request_t *request, + double criteria, CHAMELON_data_handle_t *nb_pivot_handle) +{ + CHAMELON_context_t *chameleon; + + chameleon = chameleon_context_self(); + if (chameleon == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "CHAMELON not initialized"); + return CHAMELON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "NULL sequence"); + return CHAMELON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELON_zgetrf_sp_Tile", "NULL request"); + return CHAMELON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELON_SUCCESS) + request->status = CHAMELON_SUCCESS; + else + return chameleon_request_fail(sequence, request, CHAMELON_ERR_SEQUENCE_FLUSHED); + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELON_SUCCESS) { + chameleon_error("CHAMELON_zgetrf_nopiv_Tile", "invalid first descriptor"); + return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE); + } + + if (A->nb != A->mb) { + chameleon_error("CHAMELON_zgetrf_sp_Tile", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELON_ERR_ILLEGAL_VALUE); + } + + chameleon_pzgetrf_sp(A, criteria, nb_pivot_handle, sequence, request); + + return CHAMELON_SUCCESS; +} diff --git a/control/common.h b/control/common.h index f31ec1af3bf5f0fd5af475e2630d1cd9c575bcd2..f5a34c8f3095b055137af132300e47f0d3a45722 100644 --- a/control/common.h +++ b/control/common.h @@ -84,6 +84,10 @@ #define CHAMELEON_TRANSLATION chamctxt->translation #define CHAMELEON_PARALLEL chamctxt->parallel_enabled #define CHAMELEON_STATISTICS chamctxt->statistics_enabled +#define CHAMELEON_DSDMATRIX chamctxt->dsdmatrix_enabled +#define CHAMELEON_SWAP_TYPE chamctxt->swap_type +#define CHAMELEON_PANEL_THREADS chamctxt->nb_panel_threads +#define CHAMELEON_RECTIL_PANEL chamctxt->rectil_panel_enabled /** * IPT internal define diff --git a/control/context.c b/control/context.c index 66b844ccfacb5a86dda0ec95a0596f2a5a54b775..4f7804b80aeb7f7aff1ff95692c20ae43a709a4d 100644 --- a/control/context.c +++ b/control/context.c @@ -141,6 +141,11 @@ CHAM_context_t *chameleon_context_create() chamctxt->generic_enabled = chameleon_env_on_off( "CHAMELEON_GENERIC", CHAMELEON_FALSE ); chamctxt->autominmax_enabled = chameleon_env_on_off( "CHAMELEON_AUTOMINMAX", CHAMELEON_TRUE ); + chameleon->dsdmatrix_enabled = CHAMELON_FALSE; + chameleon->swap_type = CHAMELON_TILE_SWAPS; + chameleon->rectil_panel_enabled = CHAMELON_FALSE; + chameleon->nb_panel_threads = -1; + chamctxt->runtime_paused = CHAMELEON_FALSE; chamctxt->householder = chameleon_getenv_householder( "CHAMELEON_HOUSEHOLDER_MODE", ChamFlatHouseholder ); @@ -224,6 +229,12 @@ int CHAMELEON_Enable(int option) case CHAMELEON_PROGRESS: chamctxt->progress_enabled = CHAMELEON_TRUE; break; + case CHAMELON_DSDMATRIX_MODE: + chameleon->dsdmatrix_enabled = CHAMELON_TRUE; + break; + case CHAMELON_RECTIL_PANEL_MODE: + chameleon->rectil_panel_enabled = CHAMELON_TRUE; + break; case CHAMELEON_GEMM3M: #if defined(CBLAS_HAS_ZGEMM3M) && !defined(CHAMELEON_SIMULATION) set_coreblas_gemm3m_enabled(1); @@ -309,6 +320,12 @@ int CHAMELEON_Disable(int option) case CHAMELEON_GENERIC: chamctxt->generic_enabled = CHAMELEON_FALSE; break; + case CHAMELON_DSDMATRIX_MODE: + chameleon->dsdmatrix_enabled = CHAMELON_FALSE; + break; + case CHAMELON_RECTIL_PANEL_MODE: + chameleon->rectil_panel_enabled = CHAMELON_FALSE; + break; default: chameleon_error("CHAMELEON_Disable", "illegal parameter value"); return CHAMELEON_ERR_ILLEGAL_VALUE; @@ -413,6 +430,23 @@ int CHAMELEON_Set( int param, int value ) } chamctxt->lookahead = value; break; + case CHAMELON_SWAP_TYPE_SET: + if (value < CHAMELON_NO_SWAPS && value >= CHAMELON_INVALID_SWAPS) + { + chameleon_error("CHAMELON_Set", "illegal value of CHAMELON_NEW_SWAP_TYPE"); + return CHAMELON_ERR_ILLEGAL_VALUE; + } + chameleon->swap_type = value; + break; + case CHAMELON_NBPANEL_THREADS_SET: + if (value < 1) + { + chameleon_error("CHAMELON_Set", "illegal value of CHAMELON_NBPANEL_THREADS_SET"); + return CHAMELON_ERR_ILLEGAL_VALUE; + } + CHAMELON_Enable(CHAMELON_RECTIL_PANEL_MODE); + chameleon->nb_panel_threads = value; + break; default: chameleon_error("CHAMELEON_Set", "unknown parameter"); return CHAMELEON_ERR_ILLEGAL_VALUE; @@ -479,6 +513,12 @@ int CHAMELEON_Get( int param, int *value ) case CHAMELEON_RUNTIME: *value = chamctxt->scheduler; return CHAMELEON_SUCCESS; + case CHAMELON_SWAP_TYPE_SET: + *value = chameleon->swap_type; + return CHAMELON_SUCCESS; + case CHAMELON_NBPANEL_THREADS_SET: + *value = chameleon->nb_panel_threads; + return CHAMELON_SUCCESS; default: chameleon_error("CHAMELEON_Get", "unknown parameter"); return CHAMELEON_ERR_ILLEGAL_VALUE; diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt index 40baddc598df9da3ce04b58a54b7fe645435b564..00bb961f90ba4577c5456be235c645ea25af6c4b 100644 --- a/coreblas/compute/CMakeLists.txt +++ b/coreblas/compute/CMakeLists.txt @@ -51,6 +51,9 @@ set(ZSRC core_zgessq.c core_zgetf2_nopiv.c core_zgetrf.c + core_zgetrf_extract_max.c + core_zgetrf_rectil.c + core_zgetrf_sp.c core_zgetrf_incpiv.c core_zgetrf_nopiv.c core_zgram.c @@ -68,6 +71,7 @@ set(ZSRC core_zlantr.c core_zlaset2.c core_zlaset.c + core_zlaswp.c core_zlatro.c core_zlauum.c core_zpamm.c @@ -79,6 +83,7 @@ set(ZSRC core_zplssq.c core_zpotrf.c core_zssssm.c + core_zswp.c core_zsymm.c core_zsyr2k.c core_zsyrk.c diff --git a/coreblas/compute/core_zgetrf_extract_max.c b/coreblas/compute/core_zgetrf_extract_max.c new file mode 100644 index 0000000000000000000000000000000000000000..2119b666bf1417b3ca1253d230456f2ae69860a1 --- /dev/null +++ b/coreblas/compute/core_zgetrf_extract_max.c @@ -0,0 +1,94 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zgetrf_extract_max.c + * + * CHAMELON core_blas kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 1.0.0 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-11-11 + * @precisions normal z -> s d c + * + **/ +#include "coreblas/include/coreblas.h" + +#define get_pivot(_a_) (*((int *) (_a_))) +#define get_diagonal(_a_) (_a_ + 1) +#define get_maxrow(_a_) (_a_ + NB + 1) + +static CHAMELON_Complex64_t mzone = -1.; + +void CORE_zgetrf_extract_max(CHAMELON_Complex64_t *A, CHAMELON_Complex64_t *mw, + int K, int M, int N, int H, int NB, + int AM, int LDA) +{ + int p; + p = (AM == K)? H : 0; + M -= p; + + if(H != 0) { + CHAMELON_Complex64_t alpha; + CHAMELON_Complex64_t *rmax = get_maxrow(mw); + int pivot = get_pivot(mw); + int owner_last_index = pivot / NB; + int last_index = pivot % NB; + + /* Applying the previous swap */ + if( AM == K /* && (pivot != H-1) */ ) { + cblas_zcopy(N, get_maxrow(mw) , 1, + A + H-1 , LDA); + } + + /* Copy the diagonal in place */ + if( owner_last_index == AM /* && (pivot != H-1) */ ) { + cblas_zcopy(N, get_diagonal(mw) , 1, + A + last_index, LDA); + } + + /* Applying the update */ + alpha = 1. / rmax[H-1]; + + /* printf("Le pivot a h=%d est localement a %d et globalement a %d et sa valeur est %e\n",H-1,last_index,get_pivot(mw), rmax[H-1]); */ + + cblas_zscal(M, CBLAS_SADDR( alpha ), A + (LDA*(H-1) + p), 1 ); + + cblas_zgeru(CblasColMajor, M, N-H, + CBLAS_SADDR(mzone), + A + (LDA*(H-1) + p) , 1 , + get_maxrow(mw) + H , 1 , + A + (LDA*H + p) , LDA); + } + + + /* if(H != (((M+p)<N)?M+p:N)) { */ + int index_max = cblas_izamax( M, A + LDA*H + p, 1 ) + p; + + get_pivot(mw) = index_max + AM * NB; + + /* Copy the diagonal row */ + if(K == AM) + cblas_zcopy(N, A + H, LDA, + get_diagonal(mw) , 1); + /* Copy the maximum row */ + cblas_zcopy(N, A + index_max, LDA, + get_maxrow(mw) , 1); + /* CHAMELON_Complex64_t *rmax = get_maxrow(mw); */ + /* if(H == 0) */ + /* printf("m=%d, max in %d = %e, M=%d, p=%d\n",AM,get_pivot(mw),rmax[H],M,p); */ + /* } */ + +} diff --git a/coreblas/compute/core_zgetrf_rectil.c b/coreblas/compute/core_zgetrf_rectil.c new file mode 100644 index 0000000000000000000000000000000000000000..c58271e91f164ce6f3d3ffe101ee04b2d1437838 --- /dev/null +++ b/coreblas/compute/core_zgetrf_rectil.c @@ -0,0 +1,733 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zgetrf_rectil.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.6.0 + * @author Hatem Ltaief + * @author Mathieu Faverge + * @author Piotr Luszczek + * @date 2009-11-15 + * + * @precisions normal z -> c d s + * + **/ + +#include "coreblas/include/coreblas.h" +#include "control/descriptor.h" + +#define A(m, n) ((CHAMELON_Complex64_t *)chameleon_getaddr_ccrb(A, m, n)) + +void +CORE_zgetrf_rectil_init(void); + +static inline void +CORE_zgetrf_rectil_rec(const CHAMELON_desc_t *A, int *IPIV, int *info, + CHAMELON_Complex64_t *pivot, + int thidx, int thcnt, + int column, int width, + int ft, int lt); +static inline void +CORE_zgetrf_rectil_update(const CHAMELON_desc_t *A, int *IPIV, + int column, int n1, int n2, + int thidx, int thcnt, + int ft, int lt); + +/***************************************************************************** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zgetrf_rectil computes a LU factorization of a general M-by-N + * matrix A stored in CCRB layout using partial pivoting with row + * interchanges. + * + * The factorization has the form + * + * A = P * L * U + * + * where P is a permutation matrix, L is lower triangular with unit + * diagonal elements (lower trapezoidal if m > n), and U is upper + * triangular (upper trapezoidal if m < n). + * + * This is the recursive version of the algorithm applied on tile layout. + * + * WARNINGS: + * - The function CORE_zgetrf_rectil_init has to be called prior + * to any call to this function. + * - You cannot call this kernel on different matrices at the same + * time. + * - The matrix A cannot be more than one tile wide. + * - The number of threads calling this function has to be excatly + * the number defined by info[2] with each one of them a different + * index between 0 included and info[2] excluded. + * + ******************************************************************************* + * + * @param[in,out] A + * PLASMA descriptor of the matrix A to be factorized. + * On entry, the M-by-N matrix to be factorized. + * On exit, the factors L and U from the factorization + * A = P*L*U; the unit diagonal elements of L are not stored. + * + * @param[out] IPIV + * The pivot indices; for 0 <= i < min(M,N) stored in Fortran + * mode (starting at 1), row i of the matrix was interchanged + * with row IPIV(i). + * On exit, each value IPIV[i] for 0 <= i < min(M,N) is + * increased by A->i, which means A->i < IPIV[i] <= A->i+M. + * + * @param[in,out] info + * Array of 3 integers. + * - info[0], see returned value + * - info[1], is the thread index 0 <= info[0] < info[2] + * - info[2], on entry is the number of threads trying to + * participate to the factorization, + * on exit is the real number of threads used to + * perform the factorization. + * Info[2] threads, and exactly info[2], have to call this function + * to avoid dead lock. + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval -k, the k-th argument had an illegal value + * \retval k if U(k,k) is exactly zero. The factorization + * has been completed, but the factor U is exactly + * singular, and division by zero will occur if it is used + * to solve a system of equations. + * + */ + +int CORE_zgetrf_rectil(const CHAMELON_desc_t *A, int *IPIV, int *info) +{ + int ft, lt; + int thidx = info[1]; + int thcnt = chameleon_min( info[2], A->mt ); + int minMN = chameleon_min( A->m, A->n ); + CHAMELON_Complex64_t pivot; + + info[0] = 0; + info[2] = thcnt; + + assert( A->styp == ChamCCRB ); + + if ( A->nt > 1 ) { + coreblas_error(1, "Illegal value of A->nt"); + info[0] = -1; + return -1; + } + + if ( thidx >= thcnt ) + return 0; + + int q = A->mt / thcnt; + int r = A->mt % thcnt; + + if (thidx < r) { + q++; + ft = thidx * q; + lt = ft + q; + } else { + ft = r * (q + 1) + (thidx - r) * q; + lt = ft + q; + lt = chameleon_min( lt, A->mt ); + } + + if (0) + { + int i,j,k; + + for (k=ft ; k<lt ; k++) { + fprintf(stderr, "\n\t\tTile %d\n", k); + int ld = BLKLDD(A,0); + for (j=0 ; j < A->mb ; j++) { + for (i=0 ; i < A->nb ; i++) + fprintf(stderr, "\t%.2f", *(A(k, 0) + j + i*ld)); + fprintf(stderr, "\n"); + } + } + } + + CORE_zgetrf_rectil_rec( A, IPIV, info, &pivot, + thidx, thcnt, 0, minMN, ft, lt); + + if ( A->n > minMN ) { + CORE_zgetrf_rectil_update( A, IPIV, + 0, minMN, A->n-minMN, + thidx, thcnt, + ft, lt); + } + + return info[0]; +} + +/******************************************************************* + * Additional routines + */ +#define AMAX1BUF_SIZE (48 << 1) + +/* 48 threads should be enough for everybody */ +static volatile CHAMELON_Complex64_t CORE_zamax1buf[AMAX1BUF_SIZE]; +static double sfmin; + +void +CORE_zgetrf_rectil_init(void) +{ + int i; + for (i = 0; i < AMAX1BUF_SIZE; ++i) CORE_zamax1buf[i] = -1.0; + sfmin = LAPACKE_dlamch_work('S'); +} + +static void +CORE_zamax1_thread(CHAMELON_Complex64_t localamx, + int thidx, int thcnt, int *thwinner, + CHAMELON_Complex64_t *diagvalue, + CHAMELON_Complex64_t *globalamx, + int pividx, int *ipiv) +{ + if (thidx == 0) { + int i, j = 0; + CHAMELON_Complex64_t curval = localamx, tmp; + double curamx = cabs(localamx); + + /* make sure everybody filled in their value */ + for (i = 1; i < thcnt; ++i) { + while (CORE_zamax1buf[i << 1] == -1.0) { /* wait for thread i to store its value */ + } + } + + /* better not fuse the loop above and below to make sure data is sync'd */ + + for (i = 1; i < thcnt; ++i) { + tmp = CORE_zamax1buf[ (i << 1) + 1]; + if (cabs(tmp) > curamx) { + curamx = cabs(tmp); + curval = tmp; + j = i; + } + } + + if (0 == j) + ipiv[0] = pividx; + + /* make sure everybody knows the amax value */ + for (i = 1; i < thcnt; ++i) + CORE_zamax1buf[ (i << 1) + 1] = curval; + + CORE_zamax1buf[0] = -j - 2.0; /* set the index of the winning thread */ + CORE_zamax1buf[1] = *diagvalue; /* set the index of the winning thread */ + + *thwinner = j; + *globalamx = curval; + + for (i = 1; i < thcnt; ++i) + CORE_zamax1buf[i << 1] = -3.0; + + /* make sure everybody read the max value */ + for (i = 1; i < thcnt; ++i) { + while (CORE_zamax1buf[i << 1] != -1.0) { + } + } + + CORE_zamax1buf[0] = -1.0; + } else { + CORE_zamax1buf[(thidx << 1) + 1] = localamx; + CORE_zamax1buf[thidx << 1] = -2.0; /* announce to thread 0 that local amax was stored */ + while (CORE_zamax1buf[0] == -1.0) { /* wait for thread 0 to finish calculating the global amax */ + } + while (CORE_zamax1buf[thidx << 1] != -3.0) { /* wait for thread 0 to store amax */ + } + *thwinner = -CORE_zamax1buf[0] - 2.0; + *diagvalue = CORE_zamax1buf[1]; + *globalamx = CORE_zamax1buf[(thidx << 1) + 1]; /* read the amax from the location adjacent to the one in the above loop */ + CORE_zamax1buf[thidx << 1] = -1.0; /* signal thread 0 that this thread is done reading */ + + if (thidx == *thwinner) + ipiv[0] = pividx; + + while (CORE_zamax1buf[0] != -1.0) { /* wait for thread 0 to finish */ + } + } +} + +static void +CORE_zbarrier_thread(int thidx, + int thcnt) +{ + int idum1, idum2; + CHAMELON_Complex64_t ddum1 = 0.; + CHAMELON_Complex64_t ddum2 = 0.; + /* it's probably faster to implement a dedicated barrier */ + CORE_zamax1_thread( 1.0, thidx, thcnt, &idum1, &ddum1, &ddum2, 0, &idum2 ); +} + +static inline void +CORE_zgetrf_rectil_update(const CHAMELON_desc_t *A, int *IPIV, + int column, int n1, int n2, + int thidx, int thcnt, + int ft, int lt) +{ + int ld, lm, tmpM; + int ip, j, it, i, ldft; + CHAMELON_Complex64_t zone = 1.0; + CHAMELON_Complex64_t mzone = -1.0; + CHAMELON_Complex64_t *Atop, *Atop2, *U, *L; + int offset = A->i; + + ldft = BLKLDD(A, 0); + Atop = A(0, 0) + column * ldft; + Atop2 = Atop + n1 * ldft; + + if (thidx == 0) + { + /* Swap to the right */ + int *lipiv = IPIV+column; + int idxMax = column+n1; + for (j = column; j < idxMax; ++j, ++lipiv) { + ip = (*lipiv) - offset - 1; + if ( ip != j ) + { + it = ip / A->mb; + i = ip % A->mb; + ld = BLKLDD(A, it); + cblas_zswap(n2, Atop2 + j, ldft, + A(it, 0) + (column+n1)*ld + i, ld ); + } + } + + /* Trsm on the upper part */ + U = Atop2 + column; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasUnit, + n1, n2, CBLAS_SADDR(zone), + Atop + column, ldft, + U, ldft ); + + /* Signal to other threads that they can start update */ + CORE_zbarrier_thread( thidx, thcnt ); + + /* First tile */ + L = Atop + column + n1; + tmpM = chameleon_min(ldft, A->m) - column - n1; + + /* Apply the GEMM */ + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, + tmpM, n2, n1, + CBLAS_SADDR(mzone), L, ldft, + U, ldft, + CBLAS_SADDR(zone), U + n1, ldft ); + + } + else + { + ld = BLKLDD( A, ft ); + L = A( ft, 0 ) + column * ld; + lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb; + + U = Atop2 + column; + + /* Wait for pivoting and triangular solve to be finished + * before to really start the update */ + CORE_zbarrier_thread( thidx, thcnt ); + + /* First tile */ + /* Apply the GEMM */ + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, + lm, n2, n1, + CBLAS_SADDR(mzone), L, ld, + U, ldft, + CBLAS_SADDR(zone), L + n1*ld, ld ); + } + + /* Update the other blocks */ + for( it = ft+1; it < lt; it++) + { + ld = BLKLDD( A, it ); + L = A( it, 0 ) + column * ld; + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + + /* Apply the GEMM */ + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, + lm, n2, n1, + CBLAS_SADDR(mzone), L, ld, + U, ldft, + CBLAS_SADDR(zone), L + n1*ld, ld ); + } +} + +static void +CORE_zgetrf_rectil_rec(const CHAMELON_desc_t *A, int *IPIV, int *info, + CHAMELON_Complex64_t *pivot, + int thidx, int thcnt, + int column, int width, + int ft, int lt) +{ + int ld, jp, n1, n2, lm, tmpM, piv_sf; + int ip, j, it, i, ldft; + int max_i, max_it, thwin; + CHAMELON_Complex64_t zone = 1.0; + CHAMELON_Complex64_t mzone = -1.0; + CHAMELON_Complex64_t tmp1; + CHAMELON_Complex64_t tmp2 = 0.; + CHAMELON_Complex64_t pivval; + CHAMELON_Complex64_t *Atop, *Atop2, *U, *L; + double abstmp1; + int offset = A->i; + + ldft = BLKLDD(A, 0); + Atop = A(0, 0) + column * ldft; + + if ( width > 1 ) { + /* Assumption: N = min( M, N ); */ + n1 = width / 2; + n2 = width - n1; + + Atop2 = Atop + n1 * ldft; + + CORE_zgetrf_rectil_rec( A, IPIV, info, pivot, + thidx, thcnt, column, n1, ft, lt ); + + if ( *info != 0 ) + return; + + if (thidx == 0) + { + /* Swap to the right */ + int *lipiv = IPIV+column; + int idxMax = column+n1; + for (j = column; j < idxMax; ++j, ++lipiv) { + ip = (*lipiv) - offset - 1; + if ( ip != j ) + { + it = ip / A->mb; + i = ip % A->mb; + ld = BLKLDD(A, it); + cblas_zswap(n2, Atop2 + j, ldft, + A(it, 0) + (column+n1)*ld + i, ld ); + } + } + /* Trsm on the upper part */ + U = Atop2 + column; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasUnit, + n1, n2, CBLAS_SADDR(zone), + Atop + column, ldft, + U, ldft ); + + /* SIgnal to other threads that they can start update */ + CORE_zbarrier_thread( thidx, thcnt ); + pivval = *pivot; + if ( pivval == 0.0 ) { + *info = column+n1; + return; + } else { + if ( cabs(pivval) >= sfmin ) { + piv_sf = 1; + pivval = 1.0 / pivval; + } else { + piv_sf = 0; + } + } + + /* First tile */ + { + L = Atop + column + n1; + tmpM = chameleon_min(ldft, A->m) - column - n1; + + /* Scale last column of L */ + if ( piv_sf ) { + cblas_zscal( tmpM, CBLAS_SADDR(pivval), L+(n1-1)*ldft, 1 ); + } else { + int i; + Atop2 = L+(n1-1)*ldft; + for( i=0; i < tmpM; i++, Atop2++) + *Atop2 = *Atop2 / pivval; + } + + /* Apply the GEMM */ + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, + tmpM, n2, n1, + CBLAS_SADDR(mzone), L, ldft, + U, ldft, + CBLAS_SADDR(zone), U + n1, ldft ); + + /* Search Max in first column of U+n1 */ + tmp2 = U[n1]; + max_it = ft; + max_i = cblas_izamax( tmpM, U+n1, 1 ) + n1; + tmp1 = U[max_i]; + abstmp1 = cabs(tmp1); + max_i += column; + } + } + else + { + pivval = *pivot; + if ( pivval == 0.0 ) { + CORE_zbarrier_thread( thidx, thcnt ); + *info = column+n1; + return; + } else { + if ( cabs(pivval) >= sfmin ) { + piv_sf = 1; + pivval = 1.0 / pivval; + } else { + piv_sf = 0; + } + } + + ld = BLKLDD( A, ft ); + L = A( ft, 0 ) + column * ld; + lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb; + + U = Atop2 + column; + + /* First tile */ + /* Scale last column of L */ + if ( piv_sf ) { + cblas_zscal( lm, CBLAS_SADDR(pivval), L+(n1-1)*ld, 1 ); + } else { + int i; + Atop2 = L+(n1-1)*ld; + for( i=0; i < lm; i++, Atop2++) + *Atop2 = *Atop2 / pivval; + } + + /* Wait for pivoting and triangular solve to be finished + * before to really start the update */ + CORE_zbarrier_thread( thidx, thcnt ); + + /* Apply the GEMM */ + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, + lm, n2, n1, + CBLAS_SADDR(mzone), L, ld, + U, ldft, + CBLAS_SADDR(zone), L + n1*ld, ld ); + + /* Search Max in first column of L+n1*ld */ + max_it = ft; + max_i = cblas_izamax( lm, L+n1*ld, 1 ); + tmp1 = L[n1*ld+max_i]; + abstmp1 = cabs(tmp1); + } + + /* Update the other blocks */ + for( it = ft+1; it < lt; it++) + { + ld = BLKLDD( A, it ); + L = A( it, 0 ) + column * ld; + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + + /* Scale last column of L */ + if ( piv_sf ) { + cblas_zscal( lm, CBLAS_SADDR(pivval), L+(n1-1)*ld, 1 ); + } else { + int i; + Atop2 = L+(n1-1)*ld; + for( i=0; i < lm; i++, Atop2++) + *Atop2 = *Atop2 / pivval; + } + + /* Apply the GEMM */ + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, + lm, n2, n1, + CBLAS_SADDR(mzone), L, ld, + U, ldft, + CBLAS_SADDR(zone), L + n1*ld, ld ); + + /* Search the max on the first column of L+n1*ld */ + jp = cblas_izamax( lm, L+n1*ld, 1 ); + if ( cabs( L[n1*ld+jp] ) > abstmp1 ) { + tmp1 = L[n1*ld+jp]; + abstmp1 = cabs(tmp1); + max_i = jp; + max_it = it; + } + } + + jp = offset + max_it*A->mb + max_i; + CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin, + &tmp2, pivot, jp + 1, IPIV + column + n1 ); + + if ( thidx == 0 ) { + U[n1] = *pivot; /* all threads have the pivot element: no need for synchronization */ + } + if (thwin == thidx) { /* the thread that owns the best pivot */ + if ( jp-offset != column+n1 ) /* if there is a need to exchange the pivot */ + { + ld = BLKLDD(A, max_it); + Atop2 = A( max_it, 0 ) + (column + n1 )* ld + max_i; + *Atop2 = tmp2; + } + } + + CORE_zgetrf_rectil_rec( A, IPIV, info, pivot, + thidx, thcnt, column+n1, n2, ft, lt ); + if ( *info != 0 ) + return; + + if ( thidx == 0 ) + { + /* Swap to the left */ + int *lipiv = IPIV+column+n1; + int idxMax = column+width; + for (j = column+n1; j < idxMax; ++j, ++lipiv) { + ip = (*lipiv) - offset - 1; + if ( ip != j ) + { + it = ip / A->mb; + i = ip % A->mb; + ld = BLKLDD(A, it); + cblas_zswap(n1, Atop + j, ldft, + A(it, 0) + column*ld + i, ld ); + } + } + } + + } else if ( width == 1 ) { + + /* Search maximum for column 0 */ + if ( column == 0 ) + { + if ( thidx == 0 ) + tmp2 = Atop[column]; + + /* First tmp1 */ + ld = BLKLDD(A, ft); + Atop2 = A( ft, 0 ); + lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb; + max_it = ft; + max_i = cblas_izamax( lm, Atop2, 1 ); + tmp1 = Atop2[max_i]; + abstmp1 = cabs(tmp1); + + /* Update */ + for( it = ft+1; it < lt; it++) + { + Atop2= A( it, 0 ); + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + jp = cblas_izamax( lm, Atop2, 1 ); + if ( cabs(Atop2[jp]) > abstmp1 ) { + tmp1 = Atop2[jp]; + abstmp1 = cabs(tmp1); + max_i = jp; + max_it = it; + } + } + + jp = offset + max_it*A->mb + max_i; + //fprintf(stderr, " column:%d ; jp:%d ; offset:%d ; max_it:%d ; max_i:%d ; thid:%d && tiles [%d,%d]\n", column, jp, offset, max_it, max_i, thidx, ft, lt); + CORE_zamax1_thread( tmp1, thidx, thcnt, &thwin, + &tmp2, pivot, jp + 1, IPIV + column ); + + if ( thidx == 0 ) { + Atop[0] = *pivot; /* all threads have the pivot element: no need for synchronization */ + } + if (thwin == thidx) { /* the thread that owns the best pivot */ + if ( jp-offset != 0 ) /* if there is a need to exchange the pivot */ + { + Atop2 = A( max_it, 0 ) + max_i; + *Atop2 = tmp2; + } + } + } + + CORE_zbarrier_thread( thidx, thcnt ); + + /* If it is the last column, we just scale */ + if ( column == (chameleon_min(A->m, A->n))-1 ) { + + pivval = *pivot; + if ( pivval != 0.0 ) { + if ( thidx == 0 ) { + if ( cabs(pivval) >= sfmin ) { + pivval = 1.0 / pivval; + + /* + * We guess than we never enter the function with m == A->mt-1 + * because it means that there is only one thread + */ + lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb; + cblas_zscal( lm - column - 1, CBLAS_SADDR(pivval), Atop+column+1, 1 ); + + for( it = ft+1; it < lt; it++) + { + ld = BLKLDD(A, it); + Atop2 = A( it, 0 ) + column * ld; + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + cblas_zscal( lm, CBLAS_SADDR(pivval), Atop2, 1 ); + } + } else { + /* + * We guess than we never enter the function with m == A->mt-1 + * because it means that there is only one thread + */ + int i; + Atop2 = Atop + column + 1; + lm = ft == A->mt-1 ? A->m - ft * A->mb : A->mb; + + for( i=0; i < lm-column-1; i++, Atop2++) + *Atop2 = *Atop2 / pivval; + + for( it = ft+1; it < lt; it++) + { + ld = BLKLDD(A, it); + Atop2 = A( it, 0 ) + column * ld; + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + + for( i=0; i < lm; i++, Atop2++) + *Atop2 = *Atop2 / pivval; + } + } + } + else + { + if ( cabs(pivval) >= sfmin ) { + pivval = 1.0 / pivval; + + for( it = ft; it < lt; it++) + { + ld = BLKLDD(A, it); + Atop2 = A( it, 0 ) + column * ld; + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + cblas_zscal( lm, CBLAS_SADDR(pivval), Atop2, 1 ); + } + } else { + /* + * We guess than we never enter the function with m == A->mt-1 + * because it means that there is only one thread + */ + int i; + for( it = ft; it < lt; it++) + { + ld = BLKLDD(A, it); + Atop2 = A( it, 0 ) + column * ld; + lm = it == A->mt-1 ? A->m - it * A->mb : A->mb; + + for( i=0; i < lm; i++, Atop2++) + *Atop2 = *Atop2 / pivval; + } + } + } + } + else { + *info = column + 1; + return; + } + } + } +} diff --git a/coreblas/compute/core_zgetrf_sp.c b/coreblas/compute/core_zgetrf_sp.c new file mode 100644 index 0000000000000000000000000000000000000000..8b9d7e18bca20cee4d6da50574631f7d6225f5ff --- /dev/null +++ b/coreblas/compute/core_zgetrf_sp.c @@ -0,0 +1,204 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zgetrf_sp.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 1.0.0 + * @author Mathieu Faverge + * @author Pierre Ramet + * @author Xavier Lacoste + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-11-11 + * @precisions normal z -> s d c + * + **/ +#include "coreblas/include/coreblas.h" + +/* + Constant: MAXSIZEOFBLOCKS + Maximum size of blocks given to blas in factorisation + */ + +#define MAXSIZEOFBLOCKS 32 /*64 in LAPACK*/ + +static CHAMELON_Complex64_t zone = 1.; +static CHAMELON_Complex64_t mzone = -1.; + +/* + Function: FactorizationLU + + LU Factorisation of one (diagonal) block + $A = LU$ + + For each column : + - Divide the column by the diagonal element. + - Substract the product of the subdiagonal part by + the line after the diagonal element from the + matrix under the diagonal element. + + Parameters: + A - Matrix to factorize + m - number of rows of the Matrix A + n - number of cols of the Matrix A + stride - Stide between 2 columns of the matrix + nbpivot - IN/OUT pivot number. + critere - Pivoting threshold. + */ +static void CORE_zgetf2_sp(int m, + int n, + CHAMELON_Complex64_t * A, + int stride, + double criteria, + int *nbpivot ) +{ + int k, minMN; + CHAMELON_Complex64_t *Akk, *Aik, alpha; + + minMN = chameleon_min( m, n ); + + Akk = A; + for (k=0; k<minMN; k++) { + Aik = Akk + 1; + + if ( cabs(*Akk) < criteria ) { + (*Akk) = (CHAMELON_Complex64_t)criteria; + (*nbpivot)++; + } + + /* A_ik = A_ik / A_kk, i = k+1 .. n */ + alpha = 1. / (*Akk); + cblas_zscal(m-k-1, CBLAS_SADDR(alpha), Aik, 1 ); + + if ( k+1 < minMN ) { + /* A_ij = A_ij - A_ik * A_kj, i,j = k+1..n */ + cblas_zgeru(CblasColMajor, m-k-1, n-k-1, + CBLAS_SADDR(mzone), Aik, 1, + Akk+stride, stride, + Aik+stride, stride); + } + + Akk += stride+1; + } +} + +/* + Function: FactorizationLU_block + + Block LU Factorisation of one (diagonal) big block + > A = LU + + Parameters: + A - Matrix to factorize + n - Size of A + stride - Stride between 2 columns of the matrix + nbpivot - IN/OUT pivot number. + critere - Pivoting threshold. + */ +void CORE_zgetrf_sp(int m, int n, + CHAMELON_Complex64_t *A, + int stride, + double criteria, + int *nbpivot) +{ + int k, blocknbr, blocksize, u_size, l_size, tempm, tempn; + CHAMELON_Complex64_t *Akk, *Lik, *Ukj, *Aij; + + blocknbr = (int) ceil( (double)chameleon_min(m,n)/(double)MAXSIZEOFBLOCKS ); + + Akk = A; /* Lk,k */ + + for (k=0; k<blocknbr; k++) { + tempm = m - k * MAXSIZEOFBLOCKS; + tempn = n - k * MAXSIZEOFBLOCKS; + + blocksize = chameleon_min(tempm, tempn); + blocksize = chameleon_min(MAXSIZEOFBLOCKS, blocksize); + + Lik = Akk + blocksize; + Ukj = Akk + blocksize*stride; + Aij = Ukj + blocksize; + + /* Factorize the diagonal block Akk */ + CORE_zgetf2_sp( blocksize, blocksize, Akk, stride, criteria, nbpivot ); + + u_size = tempn - blocksize; + l_size = tempm - blocksize; + if ( u_size > 0 && l_size > 0) { + + /* Compute the column Ukk+1 */ + CORE_ztrsm(ChamLeft, ChamLower, + ChamNoTrans, ChamUnit, + blocksize, u_size, + zone, Akk, stride, + Ukj, stride); + + /* Compute the column Lk+1k */ + CORE_ztrsm(ChamRight, ChamUpper, + ChamNoTrans, ChamNonUnit, + l_size, blocksize, + zone, Akk, stride, + Lik, stride); + + /* Update Ak+1,k+1 = Ak+1,k+1 - Lk+1,k*Uk,k+1 */ + CORE_zgemm(ChamNoTrans, ChamNoTrans, + l_size, u_size, blocksize, + mzone, Lik, stride, + Ukj, stride, + zone, Aij, stride); + + } + + Akk += blocksize * (stride+1); + } +} + +void CORE_zgetrf_sp_rec(int m, int n, + CHAMELON_Complex64_t *A, + int stride, + double criteria, + int *nbpivot) +{ + if(m > 0) { + if(n == 1) { + if ( cabs(*A) < criteria ) { + (*A) = (CHAMELON_Complex64_t)criteria; + (*nbpivot)++; + } + CHAMELON_Complex64_t alpha = 1. / (*A); + cblas_zscal(m-1, CBLAS_SADDR(alpha), A+1, 1); + } + else { + CORE_zgetrf_sp_rec(m, n/2, A, stride, criteria, nbpivot); + + CORE_ztrsm(ChamLeft, ChamLower, + ChamNoTrans, ChamUnit, + n/2, n/2+n%2, + zone, A, stride, + &A[(n/2)*stride], stride); + + CORE_zgemm(ChamNoTrans, ChamNoTrans, + m-n/2, n/2+n%2, n/2, + mzone, &A[n/2], stride, + &A[(n/2)*stride], stride, + zone, &A[(n/2)*stride+n/2], stride); + + CORE_zgetrf_sp_rec(m-n/2, n/2+n%2, + &A[((n/2)*stride)+(n/2)], + stride, criteria, nbpivot); + } + } +} diff --git a/coreblas/compute/core_zlaswp.c b/coreblas/compute/core_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..5cc7feb811f31e8699c9b23ee7928e40d3183757 --- /dev/null +++ b/coreblas/compute/core_zlaswp.c @@ -0,0 +1,286 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zlaswp.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI, + * University of Bordeaux, Bordeaux INP + * + * @version 2.6.0 + * @comment This file has been automatically generated + * from Plasma 2.6.0 for CHAMELON 1.0.0 + * @author Mathieu Faverge + * @author Terry Cojean + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "coreblas/include/coreblas/lapacke.h" +#include "coreblas/include/coreblas.h" +#include "control/descriptor.h" + +#define A(m, n) ((CHAMELON_Complex64_t *)chameleon_getaddr_ccrb(descA, m, n)) + +/***************************************************************************//** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zlaswp performs a series of row interchanges on the matrix A. + * One row interchange is initiated for each of rows I1 through I2 of A. + * + ******************************************************************************* + * + * @param[in] N + * The number of columns in the matrix A. N >= 0. + * + * @param[in,out] A + * On entry, the matrix of column dimension N to which the row + * interchanges will be applied. + * On exit, the permuted matrix. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,max(IPIV[I1..I2])). + * + * @param[in] I1 + * The first element of IPIV for which a row interchange will + * be done. + * + * @param[in] I2 + * The last element of IPIV for which a row interchange will + * be done. + * + * @param[in] IPIV + * The pivot indices; Only the element in position i1 to i2 + * are accessed. The pivot are offset by A.i. + * + * @param[in] INC + * The increment between successive values of IPIV. If IPIV + * is negative, the pivots are applied in reverse order. + * + ******************************************************************************* + */ + +void CORE_zlaswp(int N, CHAMELON_Complex64_t *A, int LDA, int I1, int I2, const int *IPIV, int INC) +{ + LAPACKE_zlaswp_work( LAPACK_COL_MAJOR, N, A, LDA, I1, I2, IPIV, INC ); +} + +/***************************************************************************//** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zlaswp_ontile apply the zlaswp function on a matrix stored in + * tile layout + * + ******************************************************************************* + * + * @param[in,out] descA + * The descriptor of the matrix A to permute. + * + * @param[in] i1 + * The first element of IPIV for which a row interchange will + * be done. + * + * @param[in] i2 + * The last element of IPIV for which a row interchange will + * be done. + * + * @param[in] ipiv + * The pivot indices; Only the element in position i1 to i2 + * are accessed. The pivot are offset by A.i. + * + * @param[in] inc + * The increment between successive values of IPIV. If IPIV + * is negative, the pivots are applied in reverse order. + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if INFO = -k, the k-th argument had an illegal value + * + ******************************************************************************* + */ + +int CORE_zlaswp_ontile(CHAMELON_desc_t *descA, int i1, int i2, const int *ipiv, int inc) +{ + int i, j, ip, it; + CHAMELON_Complex64_t *A1; + int lda1, lda2; + + /* Change i1 to C notation */ + i1--; + + /* Check parameters */ + if ( descA->nt > 1 ) { + coreblas_error(1, "Illegal value of (A|B)->(nt|mt)"); + return -1; + } + if ( i1 < 0 ) { + coreblas_error(2, "Illegal value of i1"); + return -2; + } + if ( (i2 <= i1) || (i2 > descA->m) ) { + coreblas_error(3, "Illegal value of i2"); + return -3; + } + if ( ! ( (i2 - i1 - i1%descA->mb -1) < descA->mb ) ) { + coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block."); + return -3; + } + + if (inc > 0) { + it = i1 / descA->mb; + A1 = A(it, 0); + lda1 = BLKLDD(descA, 0); + /* printf("(i1;i2) :: (%d;%d) && descA->i:%d && it=%d\n",i1, i2, descA->i, it); */ + for (j = i1; j < i2; ++j, ipiv+=inc) { + ip = (*ipiv) - descA->i - 1; + /* printf("(ipiv[%d],ip)=(%d,%d) ",j,*ipiv,ip); */ + if ( ip != j ) + { + it = ip / descA->mb; + i = ip % descA->mb; + lda2 = BLKLDD(descA, it); + cblas_zswap(descA->n, A1 + j, lda1, + A(it, 0) + i, lda2 ); + } + } + /* printf("\n"); */ + } + else + { + it = (i2-1) / descA->mb; + A1 = A(it, 0); + lda1 = BLKLDD(descA, it); + + i1--; + ipiv = &ipiv[(1-i2)*inc]; + for (j = i2-1; j > i1; --j, ipiv+=inc) { + ip = (*ipiv) - descA->i - 1; + if ( ip != j ) + { + it = ip / descA->mb; + i = ip % descA->mb; + lda2 = BLKLDD(descA, it); + cblas_zswap(descA->n, A1 + j, lda1, + A(it, 0) + i, lda2 ); + } + } + } + + return CHAMELON_SUCCESS; +} + +/***************************************************************************//** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zlaswpc_ontile apply the zlaswp function on a matrix stored in + * tile layout + * + ******************************************************************************* + * + * @param[in,out] descA + * The descriptor of the matrix A to permute. + * + * @param[in] i1 + * The first element of IPIV for which a column interchange will + * be done. + * + * @param[in] i2 + * The last element of IPIV for which a column interchange will + * be done. + * + * @param[in] ipiv + * The pivot indices; Only the element in position i1 to i2 + * are accessed. The pivot are offset by A.i. + * + * @param[in] inc + * The increment between successive values of IPIV. If IPIV + * is negative, the pivots are applied in reverse order. + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if INFO = -k, the k-th argument had an illegal value + * + ******************************************************************************* + */ + +int CORE_zlaswpc_ontile(CHAMELON_desc_t *descA, int i1, int i2, const int *ipiv, int inc) +{ + int i, j, ip, it; + CHAMELON_Complex64_t *A1; + int lda; + + /* Change i1 to C notation */ + i1--; + + /* Check parameters */ + if ( descA->mt > 1 ) { + coreblas_error(1, "Illegal value of descA->mt"); + return -1; + } + if ( i1 < 0 ) { + coreblas_error(2, "Illegal value of i1"); + return -2; + } + if ( (i2 <= i1) || (i2 > descA->n) ) { + coreblas_error(3, "Illegal value of i2"); + return -3; + } + if ( ! ( (i2 - i1 - i1%descA->nb -1) < descA->nb ) ) { + coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block."); + return -3; + } + + lda = BLKLDD(descA, 0); + + if (inc > 0) { + it = i1 / descA->nb; + A1 = A(0, it); + + for (j = i1-1; j < i2; ++j, ipiv+=inc) { + ip = (*ipiv) - descA->j - 1; + if ( ip != j ) + { + it = ip / descA->nb; + i = ip % descA->nb; + cblas_zswap(descA->m, A1 + j*lda, 1, + A(0, it) + i*lda, 1 ); + } + } + } + else + { + it = (i2-1) / descA->mb; + A1 = A(0, it); + + i1--; + ipiv = &ipiv[(1-i2)*inc]; + for (j = i2-1; j > i1; --j, ipiv+=inc) { + ip = (*ipiv) - descA->j - 1; + if ( ip != j ) + { + it = ip / descA->nb; + i = ip % descA->nb; + cblas_zswap(descA->m, A1 + j*lda, 1, + A(0, it) + i*lda, 1 ); + } + } + } + + return CHAMELON_SUCCESS; +} diff --git a/coreblas/compute/core_zplrnt.c b/coreblas/compute/core_zplrnt.c index e5c7f2914cb6101c8f05e125bf40a5555adb23f1..6dca2ae257b0fab0ed5724766cc7309bf7cb322c 100644 --- a/coreblas/compute/core_zplrnt.c +++ b/coreblas/compute/core_zplrnt.c @@ -34,7 +34,7 @@ #endif void CORE_zplrnt( int m, int n, CHAMELEON_Complex64_t *A, int lda, - int bigM, int m0, int n0, unsigned long long int seed ) + int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix ) { CHAMELEON_Complex64_t *tmp = A; int64_t i, j; @@ -50,5 +50,8 @@ void CORE_zplrnt( int m, int n, CHAMELEON_Complex64_t *A, int lda, } tmp += lda-i; jump += bigM; + if (dsdmatrix && m0 == n0) { + A[j+j*lda] += bigM; + } } } diff --git a/coreblas/compute/core_zswp.c b/coreblas/compute/core_zswp.c new file mode 100644 index 0000000000000000000000000000000000000000..5cc50b878039815659d454f5d3ae81b28a9683ac --- /dev/null +++ b/coreblas/compute/core_zswp.c @@ -0,0 +1,320 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file core_zlaswp.c + * + * PLASMA core_blas kernel + * PLASMA is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI, + * University of Bordeaux, Bordeaux INP + * + * @version 2.6.0 + * @comment This file has been automatically generated + * from Plasma 2.6.0 for CHAMELON 1.0.0 + * @author Mathieu Faverge + * @author Terry Cojean + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "coreblas/include/coreblas.h" +#include "control/descriptor.h" + +#define A(m, n) ((CHAMELON_Complex64_t *)chameleon_getaddr_ccrb(descA, m, n)) + +/***************************************************************************//** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zswpp apply the zlaswp function between a pair of tiles + * + ******************************************************************************* + * + * @param[in,out] descA + * The descriptor of the matrix A to permute, containing both tiles. + * + * @param[in] Bm + * The indice of the destination tile + * + * @param[in] i1 + * The first element of IPIV for which a row interchange will + * be done. + * + * @param[in] i2 + * The last element of IPIV for which a row interchange will + * be done. + * + * @param[in] ipiv + * The pivot indices; Only the element in position i1 to i2 + * are accessed. The pivot are offset by A.i. + * + * @param[in] inc + * The increment between successive values of IPIV. If IPIV + * is negative, the pivots are applied in reverse order. + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if INFO = -k, the k-th argument had an illegal value + * + ******************************************************************************* + */ +int CORE_zswpp(CHAMELON_desc_t* descA, int Bm, int i1, int i2, const int *ipiv, int inc) +{ + /* Change i1 to C notation */ + i1--; + + /* Check parameters */ + if ( descA->nt > 1 ) { + coreblas_error(1, "Illegal value of descA->nt"); + return -1; + } + if ( i1 < 0 ) { + coreblas_error(2, "Illegal value of i1"); + return -2; + } + if ( (i2 <= i1) || (i2 > descA->m) ) { + coreblas_error(3, "Illegal value of i2"); + return -3; + } + if ( ! ( (i2 - i1 - i1%descA->mb -1) < descA->mb ) ) { + coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block."); + return -3; + } + + if (inc > 0) { + int it, j, lda1; + CHAMELON_Complex64_t *A1; + it = i1 / descA->mb; + A1 = A(it, 0); + lda1 = BLKLDD(descA, 0); + + for (j = i1; j < i2; ++j, ipiv+=inc) { + int ip = (*ipiv) - descA->i - 1; + it = ip / descA->mb; + + if ( it == Bm && ip != j) + { + int i = ip % descA->mb; + int lda2 = BLKLDD(descA, it ); + cblas_zswap(descA->n, A1 + j, lda1, + A(it,0) + i, lda2 ); + } + } + } + + return CHAMELON_SUCCESS; +} + +/***************************************************************************//** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zswptr_ontile apply the zlaswp function on a matrix stored in + * tile layout, followed by a ztrsm on the first tile of the panel. + * + ******************************************************************************* + * + * @param[in,out] descA + * The descriptor of the matrix A to permute. + * + * @param[in] i1 + * The first element of IPIV for which a row interchange will + * be done. + * + * @param[in] i2 + * The last element of IPIV for which a row interchange will + * be done. + * + * @param[in] ipiv + * The pivot indices; Only the element in position i1 to i2 + * are accessed. The pivot are offset by A.i. + * + * @param[in] inc + * The increment between successive values of IPIV. If IPIV + * is negative, the pivots are applied in reverse order. + * + * @param[in] Akk + * The triangular matrix Akk. The leading descA->nb-by-descA->nb + * lower triangular part of the array Akk contains the lower triangular matrix, and the + * strictly upper triangular part of A is not referenced. The + * diagonal elements of A are also not referenced and are assumed to be 1. + * + * @param[in] ldak + * The leading dimension of the array Akk. ldak >= max(1,descA->nb). + * + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if INFO = -k, the k-th argument had an illegal value + * + ******************************************************************************* + */ + +int CORE_zswptr_ontile(CHAMELON_desc_t *descA, int i1, int i2, const int *ipiv, int inc, + const CHAMELON_Complex64_t *Akk, int ldak) +{ + CHAMELON_Complex64_t zone = 1.0; + int lda; + int m = descA->mt == 1 ? descA->m : descA->mb; + + if ( descA->nt > 1 ) { + coreblas_error(1, "Illegal value of descA->nt"); + return -1; + } + if ( i1 < 1 ) { + coreblas_error(2, "Illegal value of i1"); + return -2; + } + if ( (i2 < i1) || (i2 > m) ) { + coreblas_error(3, "Illegal value of i2"); + return -3; + } + + CORE_zlaswp_ontile(descA, i1, i2, ipiv, inc); + + lda = BLKLDD(descA, 0); + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasUnit, + m, descA->n, CBLAS_SADDR(zone), + Akk, ldak, + A(0, 0), lda ); + + return CHAMELON_SUCCESS; +} + +void CORE_zswp_insert_rows(int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + CHAMELON_Complex64_t *A, + int *perm, + CHAMELON_Complex64_t *rows, + int NB, int MB, int LDA) +{ + int i, offset=0; + + for(i = 0; i < MB && offset < nb_pivots * NB ; i++) { + int owner_tile = perm[i] / global_MB; + int row_index = perm[i] % global_MB; + + if (perm[i] != i + global_MB*K) { + if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { + int insertion_index; + //redudant but both use case depend on this + if (DST_M == K || (DST_M == SRC_M && DST_M == K)) + insertion_index = i; + else + insertion_index = row_index; + cblas_zcopy(NB, rows + offset, 1, + A + insertion_index, LDA); + offset += NB; + } + } + } +} + +void CORE_zswp_extract_rows(int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + CHAMELON_Complex64_t *A, + int *perm, + CHAMELON_Complex64_t *rows, + int NB, int MB, int LDA) +{ + int i, offset=0; + + for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) { + int owner_tile = perm[i] / global_MB; + int row_index = perm[i] % global_MB; + + /* Do we need to extract something ? */ + if (perm[i] != i + global_MB*K) { + /* Concerns us somehow */ + if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { + int extraction_index; + if (SRC_M == DST_M && SRC_M == K) + extraction_index = row_index; + else if (SRC_M == K) //tile superior + extraction_index = i; + else //tile inferior + extraction_index = row_index; + cblas_zcopy(NB, A + extraction_index, LDA, + rows + offset, 1); + offset += NB; + } + } + } +} + +void CORE_zswp_insert_rows2(int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + CHAMELON_Complex64_t *A, + int *perm, + CHAMELON_Complex64_t *rows, + int *idxs, int NB, int MB, int LDA) +{ + int i, offset=0; + + /* printf("%d, %d, %d -------------\n", K, SRC_M, nb_pivots); */ + for(i = 0; i < nb_pivots; i++) { + int index=idxs[i]; + /* printf("(k, index, perm) = (%d, %d, %d)\n", K, index, perm[index]); */ + /* int owner_tile = perm[index] / global_MB; */ + int row_index = perm[index] % global_MB; + int insertion_index; + + /* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */ + //redudant but both use case depend on this + if (DST_M == K || (DST_M == SRC_M && DST_M == K)) + insertion_index = index; + else + insertion_index = row_index; + cblas_zcopy(NB, rows + offset, 1, + A + insertion_index, LDA); + offset += NB; + /* } */ + } + /* printf("%d, %d, %d -------------\n", K, SRC_M, nb_pivots); */ +} + +void CORE_zswp_extract_rows2(int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + CHAMELON_Complex64_t *A, + int *perm, + CHAMELON_Complex64_t *rows, + int *idxs, int NB, int MB, int LDA) +{ + int i, offset=0, count=0, index; + + for (i = 0 ; i < nb_pivots ; i++) { + index = idxs[i]; + int owner_tile = perm[index] / global_MB; + int row_index = perm[index] % global_MB; + + /* Do we need to extract something ? */ + /* if (perm[i] != i + global_MB*K) { */ + /* Concerns us somehow */ + /* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */ + int extraction_index; + if (SRC_M == DST_M && SRC_M == K) + extraction_index = row_index; + else if (SRC_M == K) //tile superior + extraction_index = index; + else //tile inferior + extraction_index = row_index; + cblas_zcopy(NB, A + extraction_index, LDA, + rows + offset, 1); + /* idxs[count++] = i; */ + offset += NB; + /* } */ + /* } */ + } +} diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h index 050a7968769a5c37a138faee18ef0a577ee6be19..a4e55a9b930d07be3668920ce9a7508ec93e8023 100644 --- a/coreblas/include/coreblas/coreblas_z.h +++ b/coreblas/include/coreblas/coreblas_z.h @@ -80,6 +80,9 @@ int CORE_zgetf2_nopiv(int M, int N, int CORE_zgetrf(int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, int *INFO); +void CORE_zgetrf_extract_max(CHAMELON_Complex64_t *A, CHAMELON_Complex64_t *mw, + int K, int M, int N, int H, int NB, + int AM, int LDA); int CORE_zgetrf_incpiv(int M, int N, int IB, CHAMELEON_Complex64_t *A, int LDA, int *IPIV, int *INFO); @@ -206,7 +209,7 @@ void CORE_zplghe(double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, void CORE_zplgsy(CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, int bigM, int m0, int n0, unsigned long long int seed ); void CORE_zplrnt(int m, int n, CHAMELEON_Complex64_t *A, int lda, - int bigM, int m0, int n0, unsigned long long int seed ); + int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix ); int CORE_zplssq( cham_store_t storev, int M, int N, double *sclssqin, double *sclssqout ); int CORE_zplssq2( int N, double *sclssq ); diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index 5389d35fdcc459ebb8783635eed79dc840160e64..a051c3766f1838b73fdc7261441f5714f6c56e6b 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -116,6 +116,37 @@ install(TARGETS cudablas LIBRARY DESTINATION lib ) + +# Actual CUDA source files +# ------------ +set(CUDA_ZSRC + cuda_zswp.cu + ) + +precisions_rules_py(CUDABLAS_CUSRCS_GENERATED "${CUDA_ZSRC}" + PRECISIONS "${CHAMELEON_PRECISION}") + +unset(_tmp) +foreach(element ${CUDABLAS_CUSRCS_GENERATED}) + set(_tmp + ${_tmp} + ${CMAKE_CURRENT_BINARY_DIR}/${element} + ) +endforeach(element) +set(CUDABLAS_CUSRCS + ${_tmp} + ) + +set(CUDA_SERPARABLE_COMPILATION ON) +# CUDA_ADD_LIBRARY(cudablas_cuda ${CUDABLAS_CUSRCS} SHARED) +CUDA_ADD_LIBRARY(cudablas_cuda ${CUDABLAS_CUSRCS}) +add_dependencies(cudablas_cuda cudablas_include) +set_property(TARGET cudablas_cuda PROPERTY LINKER_LANGUAGE Fortran) + +target_link_libraries(cudablas_cuda coreblas ${CUDA_LIBRARIES}) +install(TARGETS cudablas_cuda + DESTINATION lib) + ### ### END CMakeLists.txt ### diff --git a/cudablas/compute/cuda_zgetrf_sp.c b/cudablas/compute/cuda_zgetrf_sp.c new file mode 100644 index 0000000000000000000000000000000000000000..033cc42fea7ff9c71f595d243a7fb73d6441c14d --- /dev/null +++ b/cudablas/compute/cuda_zgetrf_sp.c @@ -0,0 +1,42 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zgetrf_ps.c + * + * CHAMELON cudablas kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Terry Cojean + * @date 2017-02-08 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" +#include "cudablas/include/cudablas_z.h" + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zgetrf_sp(magma_int_t m, magma_int_t n, + magmaDoubleComplex *dA, magma_int_t ldda, + magma_double_t criteria, magma_int_t *nb_pivot, + magma_int_t *info) +{ + int ret; + ret = magma_zgetrf_sp_gpu(m, n, dA, ldda, criteria, nb_pivot, &info); + if (ret != MAGMA_SUCCESS) { + fprintf(stderr, "Error in MAGMA: %d\n", ret); + exit(-1); + } + return CHAMELON_SUCCESS; +} +#endif diff --git a/cudablas/compute/cuda_zpotrf.c b/cudablas/compute/cuda_zpotrf.c index 7c6c86c1456b4fc804fb31732c3e57db454bcc2e..f7efae99ef0286758fa860b738b07e7a3d41030f 100644 --- a/cudablas/compute/cuda_zpotrf.c +++ b/cudablas/compute/cuda_zpotrf.c @@ -22,15 +22,15 @@ #if defined(CHAMELEON_USE_MAGMA) int CUDA_zpotrf( - magma_uplo_t uplo, magma_int_t n, - magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) + magma_uplo_t uplo, magma_int_t n, + magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) { int ret; ret = magma_zpotrf_gpu( uplo, n, dA, ldda, info); -/* hA, stream );*/ - if (ret != MAGMA_SUCCESS) { + /* hA, stream );*/ + if (ret != MAGMA_SUCCESS) { fprintf(stderr, "Error in MAGMA: %d\n", ret); exit(-1); } diff --git a/cudablas/compute/cuda_zswp.cu b/cudablas/compute/cuda_zswp.cu new file mode 100644 index 0000000000000000000000000000000000000000..3020f5608333ec52c1e8c159c753564714b1c500 --- /dev/null +++ b/cudablas/compute/cuda_zswp.cu @@ -0,0 +1,305 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cuda_zswp.cu + * + * CHAMELON cudablas kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Terry Cojean + * @date 2017-02-10 + * @precisions normal z -> c d s + * + **/ +#include "cudablas/include/cudablas.h" +#include "cudablas/include/cudablas/cudablas_z.h" + + +__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS) + void CUDA_zswp_insert_rows_kernel(int nb_pivots, int global_MB, + const int *__restrict__ perm, + cuDoubleComplex *__restrict__ A, + const cuDoubleComplex *__restrict__ rows, + int K, int SRC_M, int DST_M, + int NB, int MB, int LDA) +{ + int n = blockIdx.x * blockDim.x + threadIdx.x; + int i, owner_tile, row_index, offset=0; + + if ( n < NB ) { + A = A + n*LDA; + if (DST_M == K) { + for (i = 0 ; i < MB && offset < nb_pivots * NB; i++) { + owner_tile = perm[i] / global_MB; + row_index = perm[i] % global_MB; + + if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { + A[i] = rows[offset+n]; + offset += NB; + } + } + } else { + for (i = 0 ; i < MB && offset < nb_pivots * NB; i++) { + owner_tile = perm[i] / global_MB; + row_index = perm[i] % global_MB; + + if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { + A[row_index] = rows[offset+n]; + offset += NB; + } + } + } + } +} + +void CUDA_zswp_insert_rows(int nb_pivots, int global_MB, const int *perm, + cuDoubleComplex *A, const cuDoubleComplex *rows, + int K, int SRC_M, int DST_M, int NB, int MB, int LDA, + cudaStream_t stream) +{ +#if !defined(CHAMELEON_USE_CUBLAS_V2) + cublasSetKernelStream( stream ); +#endif + + dim3 grid( NB / CHAMELEON_CUDA_NTHREADS ); + dim3 threads( CHAMELEON_CUDA_NTHREADS ); + CUDA_zswp_insert_rows_kernel<<<grid, threads, 0, stream>>> + (nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA); + + /* cudaError_t cures; */ + /* cures = cudaStreamSynchronize(stream); */ + /* if (cures) */ + /* { */ + /* const char *errormsg = cudaGetErrorString(cures); */ + /* printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */ + /* } */ +} + +__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS) + void CUDA_zswp_extract_rows_kernel(int nb_pivots, int global_MB, + const int *__restrict__ perm, + const cuDoubleComplex *__restrict__ A, + cuDoubleComplex *__restrict__ rows, + int K, int SRC_M, int DST_M, + int NB, int MB, int LDA) +{ + int n = blockIdx.x * blockDim.x + threadIdx.x; + int i, owner_tile, row_index, offset=0; + + if ( n < NB ) { + A = A + n * LDA; + if (SRC_M == DST_M && SRC_M == K) + { + for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) { + owner_tile = perm[i] / global_MB; + row_index = perm[i] % global_MB; + + if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M ==K ))) { + rows[n + offset] = A[row_index]; + offset += NB; + } + } + } else if (SRC_M == K) { + for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) { + owner_tile = perm[i] / global_MB; + row_index = perm[i] % global_MB; + + if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { + rows[n + offset] = A[i]; + offset += NB; + } + } + } else { + for (i = 0 ; i < MB && offset < nb_pivots * NB ; i++) { + owner_tile = perm[i] / global_MB; + row_index = perm[i] % global_MB; + + if (perm[i] != i + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { + rows[n + offset] = A[row_index]; + offset += NB; + } + } + } + } +} + +void CUDA_zswp_extract_rows(int nb_pivots, int global_MB, const int *perm, + const cuDoubleComplex *A, cuDoubleComplex *rows, + int K, int SRC_M, int DST_M, int NB, int MB, int LDA, + cudaStream_t stream) +{ +#if !defined(CHAMELEON_USE_CUBLAS_V2) + cublasSetKernelStream( stream ); +#endif + + dim3 grid( NB / CHAMELEON_CUDA_NTHREADS ); + dim3 threads( CHAMELEON_CUDA_NTHREADS ); + /* dim3 grid( NB ); */ + /* dim3 threads( 1 ); */ + + CUDA_zswp_extract_rows_kernel<<<grid, threads, 0, stream>>> + (nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA); + + /* cudaError_t cures; */ + /* cures = cudaStreamSynchronize(stream); */ + /* if (cures) */ + /* { */ + /* const char *errormsg = cudaGetErrorString(cures); */ + /* printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */ + /* } */ +} + +__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS) + void CUDA_zswp_insert_rows2_kernel(int nb_pivots, int global_MB, + const int *__restrict__ perm, + cuDoubleComplex *__restrict__ A, + const cuDoubleComplex *__restrict__ rows, + const int *__restrict__ idxs, + int K, int SRC_M, int DST_M, + int NB, int MB, int LDA) +{ + int n = blockIdx.y * blockDim.y + threadIdx.y; + int i = blockIdx.x, row_index, index; + + if ( n < NB ) { + A = A + n*LDA; + if (DST_M == K) { + /* for (i = 0 ; i < nb_pivots ; i++) { */ + index = idxs[i]; + /* owner_tile = perm[index] / global_MB; */ + row_index = perm[index] % global_MB; + + /* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */ + A[index] = rows[i*NB+n]; + /* offset += NB; */ + /* } */ + /* } */ + } else { + /* for (i = 0 ; i < nb_pivots ; i++) { */ + index = idxs[i]; + /* owner_tile = perm[index] / global_MB; */ + row_index = perm[index] % global_MB; + + /* if ((owner_tile == SRC_M && DST_M == K) || (SRC_M == K && DST_M == owner_tile)) { */ + A[row_index] = rows[i*NB+n]; + /* offset += NB; */ + /* } */ + /* } */ + } + } +} + +void CUDA_zswp_insert_rows2(int nb_pivots, int global_MB, const int *perm, + cuDoubleComplex *A, const cuDoubleComplex *rows, + const int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, + cudaStream_t stream) +{ +#if !defined(CHAMELEON_USE_CUBLAS_V2) + cublasSetKernelStream( stream ); +#endif + + dim3 grid( nb_pivots, NB / CHAMELEON_CUDA_NTHREADS); + dim3 threads( 1, CHAMELEON_CUDA_NTHREADS ); + CUDA_zswp_insert_rows2_kernel<<<grid, threads, 0, stream>>> + (nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA); + + /* cudaError_t cures; */ + /* cures = cudaStreamSynchronize(stream); */ + /* if (cures) */ + /* { */ + /* const char *errormsg = cudaGetErrorString(cures); */ + /* printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */ + /* } */ +} + +__global__ __launch_bounds__(CHAMELEON_CUDA_NTHREADS) + void CUDA_zswp_extract_rows2_kernel(int nb_pivots, int global_MB, + const int *__restrict__ perm, + const cuDoubleComplex *__restrict__ A, + cuDoubleComplex *__restrict__ rows, + int *idxs, + int K, int SRC_M, int DST_M, + int NB, int MB, int LDA) +{ + int n = blockIdx.y * blockDim.y + threadIdx.y; + int i = blockIdx.x, owner_tile, row_index, offset=0, index; + + if ( n < NB ) { + A = A + n * LDA; + if (SRC_M == DST_M && SRC_M == K) + { + /* for (i = 0 ; i < nb_pivots ; i++) { */ + index = idxs[i]; + owner_tile = perm[index] / global_MB; + row_index = perm[index] % global_MB; + + /* if (perm[index] != index + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M ==K ))) { */ + rows[n + NB*i] = A[row_index]; + /* offset += NB; */ + /* if (n == 0)idxs[count++] = i; */ + /* } */ + /* } */ + } else if (SRC_M == K) { + /* for (i = 0 ; i < nb_pivots ; i++) { */ + index = idxs[i]; + owner_tile = perm[index] / global_MB; + row_index = perm[index] % global_MB; + + /* if (perm[index] != index + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { */ + rows[n + NB*i] = A[index]; + /* offset += NB; */ + /* if (n == 0)idxs[count++] = i; */ + /* } */ + /* } */ + } else { + /* for (i = 0 ; i < nb_pivots ; i++) { */ + index = idxs[i]; + owner_tile = perm[index] / global_MB; + row_index = perm[index] % global_MB; + + /* if (perm[index] != index + global_MB*K && ((owner_tile == SRC_M && DST_M == K) || (owner_tile == DST_M && SRC_M == K))) { */ + rows[n + NB*i] = A[row_index]; + /* offset += NB; */ + /* if (n == 0)idxs[count++] = i; */ + /* } */ + /* } */ + } + } +} + +void CUDA_zswp_extract_rows2(int nb_pivots, int global_MB, const int *perm, + const cuDoubleComplex *A, cuDoubleComplex *rows, + int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, + cudaStream_t stream) +{ +#if !defined(CHAMELEON_USE_CUBLAS_V2) + cublasSetKernelStream( stream ); +#endif + + dim3 grid( nb_pivots, NB / CHAMELEON_CUDA_NTHREADS ); + dim3 threads( 1, CHAMELEON_CUDA_NTHREADS ); + /* dim3 grid( NB ); */ + /* dim3 threads( 1 ); */ + + CUDA_zswp_extract_rows2_kernel<<<grid, threads, 0, stream>>> + (nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA); + + /* cudaError_t cures; */ + /* cures = cudaStreamSynchronize(stream); */ + /* if (cures) */ + /* { */ + /* const char *errormsg = cudaGetErrorString(cures); */ + /* printf("Error in swp_extract_rows_gpu: %s\n", errormsg); */ + /* } */ +} diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h index fc6fe7a713e22de8a968678fb2311d69909f23fb..38b1f0d9a0df0f44bd1d016eccba1190466245d6 100644 --- a/cudablas/include/cudablas.h +++ b/cudablas/include/cudablas.h @@ -36,6 +36,8 @@ #include <cuda.h> #include <cuComplex.h> +#define CHAMELEON_CUDA_NTHREADS 64 + #include <cublas.h> #include <cublas_v2.h> diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h index d0fa9be789de00afe44ae5c53172ace18ccac4ce..777a62aa1d839130a4c5ac45aee883f60717f325 100644 --- a/cudablas/include/cudablas/cudablas_z.h +++ b/cudablas/include/cudablas/cudablas_z.h @@ -24,6 +24,11 @@ /** * Declarations of cuda kernels - alphabetical order */ +void CUDA_zswp_extract_rows(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +void CUDA_zswp_insert_rows(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +void CUDA_zswp_extract_rows2(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +void CUDA_zswp_insert_rows2(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, const int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); + int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h new file mode 100644 index 0000000000000000000000000000000000000000..fa946ea37978ca64fbcbd82c3a8154bef65f23a6 --- /dev/null +++ b/cudablas/include/cudablas_z.h @@ -0,0 +1,81 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Inria. All rights reserved. + * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file cudablas_z.h + * + * CHAMELON cudablas headers + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @author Florent Pruvost + * @date 2015-09-16 + * @precisions normal z -> c d s + * + **/ +#ifndef _CHAMELON_CUDA_ZBLAS_H_ +#define _CHAMELON_CUDA_ZBLAS_H_ + +#include "include/chameleon_config.h" +#include "chameleon_types.h" +#include "cudablas/include/cudablas.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** **************************************************************************** + * Declarations of cuda kernels - alphabetical order + **/ +int CUDA_zgemerge( CHAMELON_enum side, CHAMELON_enum diag, int M, int N, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM); +int CUDA_zgemm( CHAMELON_enum transa, CHAMELON_enum transb, int m, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zhemm( CHAMELON_enum side, CHAMELON_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zher2k( CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zherfb( CHAMELON_enum uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); +int CUDA_zherk( CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); +int CUDA_zlarfb(CHAMELON_enum side, CHAMELON_enum trans, CHAMELON_enum direct, CHAMELON_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zparfb(CHAMELON_enum side, CHAMELON_enum trans, CHAMELON_enum direct, CHAMELON_enum storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_zsymm( CHAMELON_enum side, CHAMELON_enum uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zsyr2k( CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +int CUDA_zsyrk( CHAMELON_enum uplo, CHAMELON_enum trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM); +void CUDA_zswp_extract_rows(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +void CUDA_zswp_insert_rows(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +void CUDA_zswp_extract_rows2(int nb_pivots, int global_MB, const int *perm, const cuDoubleComplex *A, cuDoubleComplex *rows, int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +void CUDA_zswp_insert_rows2(int nb_pivots, int global_MB, const int *perm, cuDoubleComplex *A, const cuDoubleComplex *rows, const int *idxs, int K, int SRC_M, int DST_M, int NB, int MB, int LDA, cudaStream_t stream); +int CUDA_ztpmqrt( CHAMELON_enum side, CHAMELON_enum trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM ); +int CUDA_ztrmm( CHAMELON_enum side, CHAMELON_enum uplo, CHAMELON_enum transa, CHAMELON_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); +int CUDA_ztrsm( CHAMELON_enum side, CHAMELON_enum uplo, CHAMELON_enum transa, CHAMELON_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM); +int CUDA_ztsmlq( CHAMELON_enum side, CHAMELON_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); +int CUDA_ztsmqr( CHAMELON_enum side, CHAMELON_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM); +int CUDA_zunmlqt(CHAMELON_enum side, CHAMELON_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zunmqrt(CHAMELON_enum side, CHAMELON_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); + +#if defined(CHAMELEON_USE_MAGMA) +int CUDA_zgelqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); +int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); +int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info); +int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info); +int CUDA_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info); +int CUDA_ztslqt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); +int CUDA_ztsqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da1, magma_int_t ldda1, magmaDoubleComplex *da2, magma_int_t ldda2, magmaDoubleComplex *a2, magma_int_t lda2, magmaDoubleComplex *dt, magma_int_t lddt, magmaDoubleComplex *t, magma_int_t ldt, magmaDoubleComplex *dd, magmaDoubleComplex *d, magma_int_t ldd, magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, CUBLAS_STREAM_PARAM); +int CUDA_ztstrf( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *hwork, magma_int_t ldhwork, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info); +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/example/lapack_to_chameleon/step0.c b/example/lapack_to_chameleon/step0.c index ad6c57a627072b00a74e94aa5ff92f96e9195de0..53b436563f89c06424a08b7a5751d09f17b77e0c 100644 --- a/example/lapack_to_chameleon/step0.c +++ b/example/lapack_to_chameleon/step0.c @@ -79,7 +79,7 @@ int main(int argc, char *argv[]) { CORE_dplgsy( (double)N, N, N, A, N, N, N, N, 51 ); /* generate RHS */ - CORE_dplrnt( N, NRHS, B, N, N, N, NRHS, 5673 ); + CORE_dplrnt( N, NRHS, B, N, N, N, NRHS, 5673, 0 ); /* copy A before facto. in order to check the result */ memcpy(Acpy, A, N * N * sizeof(double)); diff --git a/example/lapack_to_chameleon/step7.h b/example/lapack_to_chameleon/step7.h index 752e7fd957f0816f3d09d2291162e9a82393d288..f87adc2bd37349d5f912d7e28a55e499f3bfee01 100644 --- a/example/lapack_to_chameleon/step7.h +++ b/example/lapack_to_chameleon/step7.h @@ -86,7 +86,7 @@ static void Cham_build_callback_plgsy(int row_min, int row_max, int col_min, int static void Cham_build_callback_plrnt(int row_min, int row_max, int col_min, int col_max, void *buffer, int ld, void *user_data) { struct data_pl *data=(struct data_pl *)user_data; - CORE_dplrnt(row_max-row_min+1, col_max-col_min+1, buffer, ld, data->bigM, row_min, col_min, data->seed); + CORE_dplrnt(row_max-row_min+1, col_max-col_min+1, buffer, ld, data->bigM, row_min, col_min, data->seed, 0); } /** diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index ba252005b72f862af1896ca1cac14ab0742f62a3..6c457effb681c36c4d8b7d0399b5fad3cc8dab93 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -225,6 +225,8 @@ typedef enum chameleon_gemm_e { #define CHAMELEON_PROGRESS 9 #define CHAMELEON_GEMM3M 10 #define CHAMELEON_GENERIC 11 +#define CHAMELEON_DSDMATRIX_MODE 12 +#define CHAMELEON_RECTIL_PANEL_MODE 13 /** * CHAMELEON constants - configuration parameters @@ -259,6 +261,17 @@ typedef enum chameleon_translation_e { ChamOutOfPlace = 2, } cham_translation_t; +/** + * @brief Swap types + **/ +typedef enum chameleon_swap_e { + ChamSwapNo, + ChamSwapLine, + ChamSwapTile, + ChamSwapColumn, + ChamSwapInvalid +} cham_swap_t; + /** * @brief Constant to describe how to initialize the mat pointer in descriptors */ diff --git a/include/chameleon/runtime.h b/include/chameleon/runtime.h index 8d4570f3c24d3146cef2a206448be07b80ba504f..3ba9fe41e09a05c59c959cd9b19c91ce100799d9 100644 --- a/include/chameleon/runtime.h +++ b/include/chameleon/runtime.h @@ -522,6 +522,18 @@ void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence, const CHAM_desc_t *A, int Am, int An ); +/** + * @brief Flushes a CHAMELON handle. TODO: Make it work with descs and "tiles". + * + * @param[in] options + * CHAMELON options used for the call + * + * @param[in] handle + * The handle to flush + */ +void RUNTIME_data_flush_handle(const CHAMELON_option_t *options, + const CHAMELON_data_handle_t *handle); + /** * @brief Migrate a single piece of data. * @@ -666,6 +678,39 @@ RUNTIME_options_ws_free( RUNTIME_option_t *options ); void RUNTIME_set_minmax_submitted_tasks( int min, int max ); +/** + * @} + * + * @name RUNTIME Panel thread allocation management + * @{ + */ +void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores); +void RUNTIME_unalloc_threads(CHAMELON_option_t *options); + + +/** + * @} + * + * @name RUNTIME data_handle utilites. + * TODO: Remove it and make all calls to these use descs and "tiles" + * @{ + */ +void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle); +void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle); +void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t nx, size_t elemsize); +void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t ld, uint32_t nx, + uint32_t ny, size_t elemsize); +void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle); +void RUNTIME_data_acquire_write(CHAMELON_data_handle_t *data_handle); +void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle); +void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle); +void RUNTIME_data_unregister_async(CHAMELON_data_handle_t* data_handle); +void RUNTIME_data_unregister_free_async(CHAMELON_data_handle_t* data_handle); +void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode); + + /** * @} * diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index e5df8fa6d030c831598cce505de55981bea35cca..d0f0d39563bcfad687fde4b18ccd16c3eb4aba0d 100644 --- a/include/chameleon/struct.h +++ b/include/chameleon/struct.h @@ -143,6 +143,7 @@ typedef struct chameleon_context_s { cham_bool_t generic_enabled; cham_bool_t autominmax_enabled; cham_bool_t runtime_paused; + cham_bool_t dsdmatrix_enabled; cham_householder_t householder; // "domino" (flat) or tree-based (reduction) Householder cham_translation_t translation; // In place or Out of place layout conversion diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 125e16fd9fd65ceed38ad1e7271a64b43c01dfd1..058683d9ce5695206d8ca51e09b373c84a37208c 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -33,6 +33,16 @@ set(CODELETS_ZSRC codelets/codelet_zlag2c.c codelets/codelet_dlag2z.c codelets/codelet_dzasum.c + codelets/codelet_zgetrf_compare_workspace.c + codelets/codelet_zgetrf_copy_workspace.c + codelets/codelet_zgetrf_extract_max.c + codelets/codelet_zgetrf_rectil.c + codelets/codelet_zgetrf_save_pivots.c + codelets/codelet_zgetrf_sp.c + codelets/codelet_zlaswp.c + codelets/codelet_zswp.c + codelets/codelet_zswp_extract_rows.c + codelets/codelet_zswp_insert_rows.c ################## # BLAS 1 ################## diff --git a/runtime/parsec/CMakeLists.txt b/runtime/parsec/CMakeLists.txt index 2355aa2069ee1ebbe6ce5ad33276660cc260a382..d0b150910015fd21c0029353c04582337af41f5c 100644 --- a/runtime/parsec/CMakeLists.txt +++ b/runtime/parsec/CMakeLists.txt @@ -93,6 +93,12 @@ set(RUNTIME_COMMON ${RUNTIME_COMMON_GENERATED} ) + if (CHAMELEON_USE_CUDA) + set(RUNTIME_COMMON + control/runtime_cuda.c + ${RUNTIME_COMMON}) + endif() + # Generate the Chameleon sources for all possible precisions # ---------------------------------------------------------- set(RUNTIME_SRCS_GENERATED "") diff --git a/runtime/parsec/codelets/codelet_zplrnt.c b/runtime/parsec/codelets/codelet_zplrnt.c index f1fec1593123d7eabaf6343f435110eabdd4dfdf..ff24b73802052d5b80083df8a35f0c6d5a60556f 100644 --- a/runtime/parsec/codelets/codelet_zplrnt.c +++ b/runtime/parsec/codelets/codelet_zplrnt.c @@ -34,11 +34,12 @@ CORE_zplrnt_parsec( parsec_execution_stream_t *context, int m0; int n0; unsigned long long int seed; + unsigned dsdmatrix; parsec_dtd_unpack_args( - this_task, &m, &n, &A, &lda, &bigM, &m0, &n0, &seed ); + this_task, &m, &n, &A, &lda, &bigM, &m0, &n0, &seed, &dsdmatrix ); - CORE_zplrnt( m, n, A, lda, bigM, m0, n0, seed ); + CORE_zplrnt( m, n, A, lda, bigM, m0, n0, seed, dsdmatrix ); (void)context; return PARSEC_HOOK_RETURN_DONE; @@ -46,7 +47,7 @@ CORE_zplrnt_parsec( parsec_execution_stream_t *context, void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, - int bigM, int m0, int n0, unsigned long long int seed ) + int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); @@ -61,5 +62,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, sizeof(unsigned long long int), &seed, VALUE, - PARSEC_DTD_ARG_END ); + sizeof(int), &dsdmatrix, VALUE, + PARSEC_DTD_ARG_END); } diff --git a/runtime/parsec/control/runtime_cuda.c b/runtime/parsec/control/runtime_cuda.c new file mode 100644 index 0000000000000000000000000000000000000000..609454a50eadfaeea1d2160e927b2cad8917b2b3 --- /dev/null +++ b/runtime/parsec/control/runtime_cuda.c @@ -0,0 +1,31 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_cuda.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Terry Cojean + * @date 2017-02-10 + * + **/ +#include <stdio.h> +#include <stdlib.h> +#include "runtime/parsec/include/chameleon_parsec.h" + +cudaStream_t RUNTIME_cuda_get_local_stream() +{ + return NULL; +} diff --git a/runtime/parsec/control/runtime_data_handle.c b/runtime/parsec/control/runtime_data_handle.c new file mode 100644 index 0000000000000000000000000000000000000000..382f28f7a2f8bd2e5b0d89560ceb5a51b31af8ff --- /dev/null +++ b/runtime/parsec/control/runtime_data_handle.c @@ -0,0 +1,58 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Inria. All rights reserved. + * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_data_handle.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @author Terry Cojean + * @date 2017-02-11 + * + **/ +#include "runtime/parsec/include/chameleon_parsec.h" + +void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle) +{ +} + +void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle) +{ +} + +void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t nx, size_t elemsize) +{ +} + +void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t ld, uint32_t nx, + uint32_t ny, size_t elemsize) +{ +} + +void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle) +{ +} + +void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle) +{ +} + +void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle) +{ +} + +void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode) +{ +} diff --git a/runtime/parsec/control/runtime_multithreading.c b/runtime/parsec/control/runtime_multithreading.c new file mode 100644 index 0000000000000000000000000000000000000000..505c798cadaa95f6e3ede50f842ae6f2894ab906 --- /dev/null +++ b/runtime/parsec/control/runtime_multithreading.c @@ -0,0 +1,35 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_options.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 0.9.0 + * @author Terry Cojean + * @date 2017-02-13 + * + **/ +#include "runtime/parsec/include/chameleon_parsec.h" + +void RUNTIME_unalloc_threads(CHAMELON_option_t *options) +{ + return; +} + +void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores) +{ + return; +} diff --git a/runtime/parsec/control/runtime_options.c b/runtime/parsec/control/runtime_options.c index 34f18da839e8b58011fe3138b012444710c0121b..575dca6d5b9926cb87f7c14f28a02a0ebd255ab1 100644 --- a/runtime/parsec/control/runtime_options.c +++ b/runtime/parsec/control/runtime_options.c @@ -33,6 +33,9 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->ws_hsize = 0; options->ws_worker = NULL; options->ws_host = NULL; + options->swap_grain = CHAMELON_SWAP_TYPE; + options->dsd_matrix = CHAMELON_DSDMATRIX == CHAMELON_TRUE; + options->panel_threads = -1; return; } diff --git a/runtime/quark/codelets/codelet_zplrnt.c b/runtime/quark/codelets/codelet_zplrnt.c index 0e597b6a20fc93ba3194973771f6d225ed8c3929..26fc05d8626a6c38b2ee784bd9f827681c554114 100644 --- a/runtime/quark/codelets/codelet_zplrnt.c +++ b/runtime/quark/codelets/codelet_zplrnt.c @@ -35,14 +35,15 @@ void CORE_zplrnt_quark(Quark *quark) int m0; int n0; unsigned long long int seed; + int dsdmatrix; - quark_unpack_args_7( quark, m, n, tileA, bigM, m0, n0, seed ); - TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed ); + quark_unpack_args_8( quark, m, n, tileA, bigM, m0, n0, seed ); + TCORE_zplrnt( m, n, tileA, bigM, m0, n0, seed, dsdmatrix ); } void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, - int bigM, int m0, int n0, unsigned long long int seed ) + int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix ) { quark_option_t *opt = (quark_option_t*)(options->schedopt); DAG_CORE_PLRNT; @@ -54,5 +55,6 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, sizeof(int), &m0, VALUE, sizeof(int), &n0, VALUE, sizeof(unsigned long long int), &seed, VALUE, + sizeof(int), &dsdmatrix, VALUE, 0); } diff --git a/runtime/quark/control/runtime_cuda.c b/runtime/quark/control/runtime_cuda.c new file mode 100644 index 0000000000000000000000000000000000000000..666f298bbfe133fb1e992722a25d8d64442b564c --- /dev/null +++ b/runtime/quark/control/runtime_cuda.c @@ -0,0 +1,32 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_cuda.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Terry Cojean + * @date 2017-02-10 + * + **/ +#include <stdio.h> +#include <stdlib.h> +#include "runtime/quark/include/chameleon_quark.h" + +cudaStream_t RUNTIME_cuda_get_local_stream() +{ + #warning I do not think quark supports cuda + return 0xdeadbeef; +} diff --git a/runtime/quark/control/runtime_data_handle.c b/runtime/quark/control/runtime_data_handle.c new file mode 100644 index 0000000000000000000000000000000000000000..ea23951adb81aa61c6600de738ed7f43b3c078dc --- /dev/null +++ b/runtime/quark/control/runtime_data_handle.c @@ -0,0 +1,58 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Inria. All rights reserved. + * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_data_handle.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @author Terry Cojean + * @date 2017-02-11 + * + **/ +#include "runtime/quark/include/chameleon_quark.h" + +void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle) +{ +} + +void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle) +{ +} + +void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t nx, size_t elemsize) +{ +} + +void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t ld, uint32_t nx, + uint32_t ny, size_t elemsize) +{ +} + +void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle) +{ +} + +void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle) +{ +} + +void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle) +{ +} + +void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode) +{ +} diff --git a/runtime/quark/control/runtime_multithreading.c b/runtime/quark/control/runtime_multithreading.c new file mode 100644 index 0000000000000000000000000000000000000000..d6be38f0063c0afda797aa39433718cbace99554 --- /dev/null +++ b/runtime/quark/control/runtime_multithreading.c @@ -0,0 +1,35 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_options.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 0.9.0 + * @author Terry Cojean + * @date 2017-02-13 + * + **/ +#include "runtime/quark/include/chameleon_quark.h" + +void RUNTIME_unalloc_threads(CHAMELON_option_t *options) +{ + return; +} + +void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores) +{ + return; +} diff --git a/runtime/quark/control/runtime_options.c b/runtime/quark/control/runtime_options.c index 8d655d524f82d164acccee9d6d77c50a5a7d161b..b074f71d08924fc948a1051f5b598c9bc5c9ef91 100644 --- a/runtime/quark/control/runtime_options.c +++ b/runtime/quark/control/runtime_options.c @@ -47,6 +47,11 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->ws_worker = NULL; options->ws_host = NULL; + options->swap_grain = CHAMELON_COLUMN_SWAPS; + options->dsd_matrix = CHAMELON_DSDMATRIX == CHAMELON_TRUE; + /* Unused here because of their internal structure */ + options->panel_threads = -1; + /* quark in options */ qopt->quark = (Quark*)(chamctxt->schedopt); options->schedopt = qopt; diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 0968de66d35b777f2efbbc2d75ebfdbc9ccdb475..87e0560d16f34b31da66a6d11acbb07cd78e4afb 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -221,7 +221,9 @@ set(RUNTIME_COMMON control/runtime_async.c control/runtime_context.c control/runtime_control.c + control/runtime_data_handle.c control/runtime_descriptor.c + control/runtime_multithreading.c control/runtime_options.c control/runtime_profiling.c control/runtime_workspace.c @@ -229,6 +231,12 @@ set(RUNTIME_COMMON ${RUNTIME_COMMON_GENERATED} ) + if (CHAMELEON_USE_CUDA) + set(RUNTIME_COMMON + control/runtime_cuda.c + ${RUNTIME_COMMON}) + endif() + set(flags_to_add "") foreach(_prec ${CHAMELEON_PRECISION}) set(flags_to_add "${flags_to_add} -DPRECISION_${_prec}") diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 8f05509f70faf0d971c15ecb42aa064738611631..3e1b4b9b6f6e8215a692fcf4211430a05c0fc7a5 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -40,6 +40,12 @@ CHAMELEON_CL_CB(zgessq, cti_handle_get_m(task->handles[0]), cti_handle_ge CHAMELEON_CL_CB(zgetrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_incpiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgetrf_nopiv, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgetrf_compare_workspace, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), N) +CHAMELEON_CL_CB(zgetrf_copy_workspace, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2.*N) +CHAMELEON_CL_CB(zgetrf_extract_max, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 2.*M+4.*N) +CHAMELEON_CL_CB(zgetrf_rectil, starpu_matrix_get_nx(task->dyn_handles[1]), starpu_matrix_get_nx(task->dyn_handles[1]), starpu_matrix_get_nx(task->dyn_handles[1]), (2./3.)*M*N*K) +CHAMELEON_CL_CB(zgetrf_save_pivots, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), 1) +CHAMELEON_CL_CB(zgetrf_sp, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K) CHAMELEON_CL_CB(zgesum, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zcesca, cti_handle_get_m(task->handles[5]), cti_handle_get_n(task->handles[5]), 0, 8.*M*N) CHAMELEON_CL_CB(zgram, cti_handle_get_m(task->handles[3]), cti_handle_get_n(task->handles[3]), 0, 8.*M*N) diff --git a/runtime/starpu/codelets/codelet_zgetrf_compare_workspace.c b/runtime/starpu/codelets/codelet_zgetrf_compare_workspace.c new file mode 100644 index 0000000000000000000000000000000000000000..75b0df10f414e2ebaf39ac7277e87c61b5fda999 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_compare_workspace.c @@ -0,0 +1,90 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zgetrf_compare_workspace.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 2.3.1 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +#define get_pivot(_a_) (*((int *) (_a_))) +#define get_diagonal(_a_) (_a_ + 1) +#define get_maxrow(_a_) (_a_ + NB + 1) + +void CHAMELON_TASK_zgetrf_compare_workspace(CHAMELON_option_t *options, + int n, int h, /* int r, int p, */ int nb, + CHAMELON_data_handle_t *top, + CHAMELON_data_handle_t *bottom) +{ + struct starpu_codelet *codelet = &cl_zgetrf_compare_workspace; + void (*callback)(void*) = options->profiling ? cl_zgetrf_compare_workspace_callback : NULL; + + starpu_insert_task(codelet, + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &h, sizeof(int), + /* STARPU_VALUE, &r, sizeof(int), */ + /* STARPU_VALUE, &p, sizeof(int), */ + STARPU_VALUE, &nb, sizeof(int), + STARPU_RW, *(starpu_data_handle_t *)top->data_handle, + STARPU_R, *(starpu_data_handle_t *)bottom->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgetrf_compare_workspace", +#endif + 0); +} + +static void cl_zgetrf_compare_workspace_cpu_func(void *descr[], void *cl_arg) +{ + int N,H,/* R,P, */NB; + CHAMELON_Complex64_t *top, *bottom; + + top = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[0]); + bottom = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &N, &H, /* &R, &P, */ &NB); + + /* The received buffer contains the diagonal line */ + /* if( (P <= R) && (R < 2*P) ) { */ + /* cblas_zcopy(N, get_diagonal(bottom), 1, */ + /* get_diagonal(top) , 1); */ + /* } */ + + CHAMELON_Complex64_t *mT = get_maxrow( top ); + CHAMELON_Complex64_t *mB = get_maxrow( bottom ); + /* printf("compare %e and %e\n", cabs( mB[H] ) , cabs( mT[H] )); */ + if( cabs( mB[H] ) > cabs( mT[H] ) ) + { + get_pivot( top ) = get_pivot( bottom ); + cblas_zcopy(N, mB, 1, + mT, 1); + } +} + +/* + * Codelet definition + */ +CODELETS_CPU(zgetrf_compare_workspace, 2, cl_zgetrf_compare_workspace_cpu_func) + diff --git a/runtime/starpu/codelets/codelet_zgetrf_copy_workspace.c b/runtime/starpu/codelets/codelet_zgetrf_copy_workspace.c new file mode 100644 index 0000000000000000000000000000000000000000..beb5cf31b961630335f20dad32964adac026f666 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_copy_workspace.c @@ -0,0 +1,87 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zgetrf_copy_workspace.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 2.3.1 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +#define get_pivot(_a_) (*((int *) (_a_))) +#define get_diagonal(_a_) (_a_ + 1) +#define get_maxrow(_a_) (_a_ + NB + 1) + +void CHAMELON_TASK_zgetrf_copy_workspace(CHAMELON_option_t *options, + int n, int h, /* int r, int p, */ int nb, + CHAMELON_data_handle_t *top, + CHAMELON_data_handle_t *bottom) +{ + struct starpu_codelet *codelet = &cl_zgetrf_copy_workspace; + void (*callback)(void*) = options->profiling ? cl_zgetrf_copy_workspace_callback : NULL; + + starpu_insert_task(codelet, + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &h, sizeof(int), + /* STARPU_VALUE, &r, sizeof(int), */ + /* STARPU_VALUE, &p, sizeof(int), */ + STARPU_VALUE, &nb, sizeof(int), + STARPU_RW, *(starpu_data_handle_t *)top->data_handle, + STARPU_R, *(starpu_data_handle_t *)bottom->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgetrf_copy_workspace", +#endif + 0); + +} + +static void cl_zgetrf_copy_workspace_cpu_func(void *descr[], void *cl_arg) +{ + int N,H,/* R,P, */NB; + CHAMELON_Complex64_t *top, *bottom; + + top = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[0]); + bottom = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &N, &H, /* &R, &P, */ &NB); + + CHAMELON_Complex64_t *mT = get_maxrow( top ); + CHAMELON_Complex64_t *mB = get_maxrow( bottom ); + CHAMELON_Complex64_t *dT = get_diagonal( top ); + CHAMELON_Complex64_t *dB = get_diagonal( bottom ); + + + get_pivot( top ) = get_pivot( bottom ); + cblas_zcopy(N, mB, 1, + mT, 1); + cblas_zcopy(N, dB, 1, + dT, 1); +} + +/* + * Codelet definition + */ +CODELETS_CPU(zgetrf_copy_workspace, 2, cl_zgetrf_copy_workspace_cpu_func) + diff --git a/runtime/starpu/codelets/codelet_zgetrf_extract_max.c b/runtime/starpu/codelets/codelet_zgetrf_extract_max.c new file mode 100644 index 0000000000000000000000000000000000000000..5f19e3ac06665650ebafc2a0bf41b2ebdf9ac752 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_extract_max.c @@ -0,0 +1,80 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zgetrf_extract_max.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 2.3.1 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void CHAMELON_TASK_zgetrf_extract_max(const CHAMELON_option_t *options, + int k, int m, int n, int h, int nb, + CHAMELON_desc_t *A, int Am, int An, + CHAMELON_data_handle_t *max_workspace) +{ + struct starpu_codelet *codelet = &cl_zgetrf_extract_max; + void (*callback)(void*) = options->profiling ? cl_zgetrf_extract_max_callback : NULL; + int lda = BLKLDD( A, Am ); + + if (chameleon_desc_islocal(A, Am, An)) + { + starpu_insert_task(codelet, + STARPU_VALUE, &k, sizeof(int), + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_VALUE, &h, sizeof(int), + STARPU_VALUE, &nb, sizeof(int), + STARPU_VALUE, &Am, sizeof(int), + STARPU_RW, RTBLKADDR( A, CHAMELON_Complex64_t, Am, An ), + STARPU_VALUE, &lda, sizeof(int), + STARPU_RW, *(starpu_data_handle_t *)max_workspace->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgetrf_extract_max", +#endif + 0); + } +} + +static void cl_zgetrf_extract_max_cpu_func(void *descr[], void *cl_arg) +{ + int K,M,N,H,NB,AM; + CHAMELON_Complex64_t *A; + int LDA; + CHAMELON_Complex64_t *mw; + + A = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[0]); + mw = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &K, &M, &N, &H, &NB, &AM, &LDA); + + CORE_zgetrf_extract_max(A, mw, K, M, N, H, NB, AM, LDA); + +} + +/* + * Codelet definition + */ +CODELETS_CPU(zgetrf_extract_max, 2, cl_zgetrf_extract_max_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zgetrf_rectil.c b/runtime/starpu/codelets/codelet_zgetrf_rectil.c new file mode 100644 index 0000000000000000000000000000000000000000..95a4584aaaf92c040836b8715bce2997d98ec82c --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_rectil.c @@ -0,0 +1,122 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zgetrf_nopiv.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 2.6.0 + * @author Omar Zenati + * @author Terry Cojean + * @author Mathieu Faverge + * @author Emmanuel Agullo + * @author Cedric Castagnede + * @date 2013-02-01 + * @precisions normal z -> c d s + * + **/ +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + * CORE_zgetrf_rectil computes an LU factorization of a general diagonal + * dominant M-by-N matrix A with pivoting right-looking. + * + * The factorization has the form + * A = L * U + * where L is lower triangular with unit + * diagonal elements (lower trapezoidal if m > n), and U is upper + * triangular (upper trapezoidal if m < n). + * + * This is the right-looking Level 3 BLAS version of the algorithm. + * WARNING: Your matrix need to be diagonal dominant if you want to call this + * routine safely. + * + ******************************************************************************* + * + * @param[in,out] A + * On entry, the M-by-N matrix to be factored. + * On exit, the factors L and U from the factorization + * A = P*L*U; the unit diagonal elements of L are not stored. + * + * @param[out] IPIV + * The pivot indices; for 1 <= i <= min(M,N), row i of the + * tile was interchanged with row IPIV(i). + ******************************************************************************* + * + * @return + * \retval CHAMELON_SUCCESS successful exit + * \retval <0 if INFO = -k, the k-th argument had an illegal value + * \retval >0 if INFO = k, U(k,k) is exactly zero. The factorization + * has been completed, but the factor U is exactly + * singular, and division by zero will occur if it is used + * to solve a system of equations. + * + ******************************************************************************/ + +void CHAMELON_TASK_zgetrf_rectil(const CHAMELON_option_t *options, + const CHAMELON_desc_t *A, CHAMELON_data_handle_t *IPIV_h) +{ + int i; + struct starpu_codelet *codelet = &cl_zgetrf_rectil; + void (*callback)(void*) = options->profiling ? cl_zgetrf_rectil_callback : NULL; + struct starpu_data_descr data[A->mt+1]; + + data[0].handle = *(starpu_data_handle_t*)(IPIV_h->data_handle); + data[0].mode = STARPU_RW; + for (i=0 ; i< A->mt ; i++) { + data[i+1].handle = RTBLKADDR(A, CHAMELON_Complex64_t, i, 0); + data[i+1].mode = STARPU_RW; + } + + starpu_insert_task(codelet, + STARPU_VALUE, &A, sizeof(CHAMELON_desc_t*), + STARPU_DATA_MODE_ARRAY, &data, A->mt+1, + STARPU_PRIORITY, options->priority, + STARPU_SCHED_CTX, options->panel_threads, + STARPU_CALLBACK, callback, + STARPU_POSSIBLY_PARALLEL, 1, + 0); +} + +/* + * Codelet CPU + */ +static void cl_zgetrf_rectil_cpu_func(void *descr[], void *cl_arg) +{ + CHAMELON_desc_t *A = NULL; + int *IPIV = (int *)STARPU_VECTOR_GET_PTR(descr[0]); + unsigned sched_ctx = starpu_task_get_current()->sched_ctx; + int info[3]; + + starpu_codelet_unpack_args(cl_arg, &A); + + int rank = starpu_sched_ctx_get_worker_rank(sched_ctx); + info[1] = rank == -1 ? 0 : rank; + info[2] = rank == -1 ? 1 : starpu_sched_ctx_get_nworkers(sched_ctx); + + CORE_zgetrf_rectil(A, IPIV, info ); + if (*info!=0) { + fprintf(stderr, "Core_zgetrf_rectil failed at column/line %d.\n", *info); + exit(-1); + } +} + +/* + * Codelet definition + */ +CODELETS_CPU(zgetrf_rectil, STARPU_VARIABLE_NBUFFERS, cl_zgetrf_rectil_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zgetrf_save_pivots.c b/runtime/starpu/codelets/codelet_zgetrf_save_pivots.c new file mode 100644 index 0000000000000000000000000000000000000000..88314d1913980e689e10aa56b05597f748d42d8e --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_save_pivots.c @@ -0,0 +1,63 @@ +/** + * + * @file codelet_zgetrf_save_pivots.c + * + * MAGMA codelets kernel + * MAGMA is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 2.3.1 + * @author Omar Zenati + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +#define get_pivot(_a_) (*((int *) (_a_))) + +void CHAMELON_TASK_zgetrf_save_pivots(CHAMELON_option_t *options, + int h, + CHAMELON_data_handle_t *IPIV, + CHAMELON_data_handle_t *mw) +{ + struct starpu_codelet *codelet = &cl_zgetrf_save_pivots; + void (*callback)(void*) = options->profiling ? cl_zgetrf_save_pivots_callback : NULL; + + + starpu_insert_task(codelet, + STARPU_VALUE, &h, sizeof(int), + STARPU_RW, *(starpu_data_handle_t *)IPIV->data_handle, + STARPU_R, *(starpu_data_handle_t *)mw->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgetrf_save_pivots", +#endif + 0); + +} + +static void cl_zgetrf_save_pivots_cpu_func(void *descr[], void *cl_arg) +{ + int H; + int *pivots; + CHAMELON_Complex64_t *mw; + + pivots = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + mw = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &H); + + pivots[H] = get_pivot( mw ) + 1; + +} + +/* + * Codelet definition + */ +CODELETS_CPU(zgetrf_save_pivots, 2, cl_zgetrf_save_pivots_cpu_func) + diff --git a/runtime/starpu/codelets/codelet_zgetrf_sp.c b/runtime/starpu/codelets/codelet_zgetrf_sp.c new file mode 100644 index 0000000000000000000000000000000000000000..f52a1e5d8659c730fcde61f1e37fa8962e237b52 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgetrf_sp.c @@ -0,0 +1,109 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017, 2016, 2017, Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file codelet_zgetrf_sp.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI, + * University of Bordeaux, Bordeaux INP + * + * @version 1.0.0 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +/** + * + * @ingroup CORE_CHAMELON_Complex64_t + * + **/ +void CHAMELON_TASK_zgetrf_sp( CHAMELON_option_t *options, + int m, int n, + CHAMELON_desc_t *A, int Am, int An, + double criteria, CHAMELON_data_handle_t *nb_pivot) +{ + struct starpu_codelet *zgetrf_sp_codelet; + void (*callback)(void*) = options->profiling ? cl_zgetrf_sp_callback : NULL; + int lda = BLKLDD( A, Am ); + + zgetrf_sp_codelet = &cl_zgetrf_sp; + + if ( chameleon_desc_islocal( A, Am, An ) ) { + starpu_insert_task(zgetrf_sp_codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_RW, RTBLKADDR( A, CHAMELON_Complex64_t, Am, An ), + STARPU_VALUE, &lda, sizeof(int), + STARPU_VALUE, &criteria, sizeof(int), + STARPU_RW, *(starpu_data_handle_t *)nb_pivot->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zgetrf_sp", +#endif + 0); + } +} + +#if !defined(CHAMELEON_SIMULATION) +static void cl_zgetrf_sp_cpu_func(void *descr[], void *cl_arg) +{ + int M,N; + CHAMELON_Complex64_t *A; + int LDA; + double CRITERIA; + int *nb_pivot; + + A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + nb_pivot = (int *) STARPU_MATRIX_GET_PTR(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &M, &N, &LDA, &CRITERIA); + CORE_zgetrf_sp( M, N, A, LDA, CRITERIA, nb_pivot ); +} + +#ifdef CHAMELEON_USE_MAGMA +static void cl_zgetrf_sp_cuda_func(void *descr[], void *cl_arg) +{ + int M; + int N; + cuDoubleComplex *A; + int LDA; + double CRITERIA; + int *nb_pivot; + int info = 0; + + A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); + nb_pivot = (int *) STARPU_MATRIX_GET_PTR(descr[1]); + + starpu_codelet_unpack_args(cl_arg, &M, &N, &LDA, &CRITERIA); + + CUDA_zgetrf_sp(M, N, A, LDA, CRITERIA, nb_pivot, &info); + + cudaThreadSynchronize(); +} +#endif +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +#if defined CHAMELEON_USE_MAGMA +CODELETS(zgetrf_sp, 2, cl_zgetrf_sp_cpu_func, cl_zgetrf_sp_cuda_func, 0) +#else +CODELETS_CPU(zgetrf_sp, 2, cl_zgetrf_sp_cpu_func) +#endif diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c new file mode 100644 index 0000000000000000000000000000000000000000..9fda53877f07c414b6f12796895756a40f9ed37e --- /dev/null +++ b/runtime/starpu/codelets/codelet_zlaswp.c @@ -0,0 +1,168 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 IPB. All rights reserved. + * + **/ + +/** + * + * @file codelet_zlaswp.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI, + * University of Bordeaux, Bordeaux INP + * + * @version 2.6.0 + * @author Mathieu Faverge + * @author Terry Cojean + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void CHAMELON_TASK_zlaswp(const CHAMELON_option_t *options, + int n, const CHAMELON_desc_t *A, int Am, int An, int lda, + int i1, int i2, + CHAMELON_data_handle_t *ipiv, int inc) +{ + struct starpu_codelet *codelet = &cl_zlaswp; + void (*callback)(void*) = options->profiling ? cl_zlaswp_callback : NULL; + starpu_insert_task(codelet, + STARPU_VALUE, &n, sizeof(int), + STARPU_RW, RTBLKADDR(A, CHAMELON_Complex64_t, Am, An), + STARPU_VALUE, &lda, sizeof(int), + STARPU_VALUE, &i1, sizeof(int), + STARPU_VALUE, &i2, sizeof(int), + STARPU_R, *(starpu_data_handle_t*)(ipiv->data_handle), + STARPU_VALUE, &inc, sizeof(int), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zlaswp", +#endif + 0); +} + +static void cl_zlaswp_cpu_func(void *descr[], void *cl_arg) +{ + int n, lda, i1, i2, inc; + int *ipiv; + CHAMELON_Complex64_t *A; + + A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); + starpu_codelet_unpack_args(cl_arg, &n, &lda, &i1, &i2, &inc); + CORE_zlaswp(n, A, lda, i1, i2, ipiv, inc); +} + +/* descA is a submatrix representing a panel (with multiple subtiles) */ +void CHAMELON_TASK_zlaswp_ontile(const CHAMELON_option_t *options, + const CHAMELON_desc_t *descA, + const CHAMELON_desc_t *Aij, int Aijm, int Aijn, + int i1, int i2, + CHAMELON_data_handle_t *ipiv, int inc) +{ + int i; + struct starpu_codelet *codelet = &cl_zlaswp_ontile; + void (*callback)(void*) = options->profiling ? cl_zlaswp_ontile_callback : NULL; + struct starpu_data_descr data[descA->mt+1]; + + data[0].handle = *(starpu_data_handle_t*)(ipiv->data_handle); + data[0].mode = STARPU_R; + for (i=0 ; i< descA->mt ; i++) { + data[i+1].handle = RTBLKADDR(descA, CHAMELON_Complex64_t, i, 0); + data[i+1].mode = STARPU_RW; + } + + starpu_insert_task(codelet, + STARPU_VALUE, &descA, sizeof(CHAMELON_desc_t*), + STARPU_VALUE, &i1, sizeof(int), + STARPU_VALUE, &i2, sizeof(int), + STARPU_VALUE, &inc, sizeof(int), + STARPU_VALUE, &Aijm, sizeof(int), + STARPU_VALUE, &Aijn, sizeof(int), + STARPU_DATA_MODE_ARRAY, &data, descA->mt+1, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zlaswp_ontile", +#endif + 0); +} + + +static void cl_zlaswp_ontile_cpu_func(void *descr[], void *cl_arg) +{ + int i1, i2, inc; + int *ipiv; + int Am, An; + CHAMELON_Complex64_t *A; + CHAMELON_desc_t *descA; + + ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[0]); + A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]); + starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &inc, &Am, &An); + + CORE_zlaswp_ontile(descA, i1, i2, ipiv, inc); +} + +/* descA is a submatrix representing a panel (with multiple subtiles) */ +void CHAMELON_TASK_zlaswpc_ontile(const CHAMELON_option_t *options, + const CHAMELON_desc_t *descA, + const CHAMELON_desc_t *Aij, int Aijm, int Aijn, + int i1, int i2, + CHAMELON_data_handle_t *ipiv, int inc) +{ + int i; + struct starpu_codelet *codelet = &cl_zlaswpc_ontile; + void (*callback)(void*) = options->profiling ? cl_zlaswpc_ontile_callback : NULL; + struct starpu_data_descr data[descA->mt+1]; + + data[0].handle = *(starpu_data_handle_t*)(ipiv->data_handle); + data[0].mode = STARPU_R; + for (i=0 ; i< descA->mt ; i++) { + data[i+1].handle = RTBLKADDR(descA, CHAMELON_Complex64_t, i, 0); + data[i+1].mode = STARPU_RW; + } + + starpu_insert_task( + codelet, + STARPU_VALUE, &descA, sizeof(CHAMELON_desc_t), + STARPU_VALUE, &i1, sizeof(int), + STARPU_VALUE, &i2, sizeof(int), + STARPU_VALUE, &inc, sizeof(int), + STARPU_DATA_MODE_ARRAY, &data, descA->mt+1, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zlaswp_ontile", +#endif + 0); +} + + +static void cl_zlaswpc_ontile_cpu_func(void *descr[], void *cl_arg) +{ + int i1, i2, inc; + int *ipiv; + CHAMELON_Complex64_t *A; + CHAMELON_desc_t *descA; + + starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &ipiv, &inc); + CORE_zlaswpc_ontile(descA, i1, i2, ipiv, inc); +} + +/* + * Codelet definition + */ +CODELETS_CPU(zlaswp, 2, cl_zlaswp_cpu_func) + +CODELETS_CPU(zlaswp_ontile, STARPU_VARIABLE_NBUFFERS, cl_zlaswp_ontile_cpu_func) + +CODELETS_CPU(zlaswpc_ontile, STARPU_VARIABLE_NBUFFERS, cl_zlaswpc_ontile_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c index df95bf5d2189bf8d481f4178ef3444d4f2cfc8f5..d62b8ee6ef34458537228a5e7b9feb9da9e2f52b 100644 --- a/runtime/starpu/codelets/codelet_zplrnt.c +++ b/runtime/starpu/codelets/codelet_zplrnt.c @@ -35,6 +35,7 @@ struct cl_zplrnt_args_s { int m0; int n0; unsigned long long int seed; + int dsdmatrix; }; #if !defined(CHAMELEON_SIMULATION) @@ -47,7 +48,7 @@ cl_zplrnt_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); TCORE_zplrnt( clargs->m, clargs->n, tileA, - clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); + clargs->bigM, clargs->m0, clargs->n0, clargs->seed, clargs->dsdmatrix ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -58,7 +59,7 @@ CODELETS_CPU( zplrnt, cl_zplrnt_cpu_func ) void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, - int bigM, int m0, int n0, unsigned long long int seed ) + int bigM, int m0, int n0, unsigned long long int seed, int dsdmatrix ) { struct cl_zplrnt_args_s *clargs = NULL; void (*callback)(void*); @@ -80,6 +81,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, clargs->m0 = m0; clargs->n0 = n0; clargs->seed = seed; + clargs->dsdmatrix = dsdmatrix, } /* Callback fro profiling information */ diff --git a/runtime/starpu/codelets/codelet_zswp.c b/runtime/starpu/codelets/codelet_zswp.c new file mode 100644 index 0000000000000000000000000000000000000000..97bb72cb4b432b6f01487bc98eed513c8d6bd44a --- /dev/null +++ b/runtime/starpu/codelets/codelet_zswp.c @@ -0,0 +1,141 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 IPB. All rights reserved. + * + **/ + +/** + * + * @file codelet_zlaswp.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Inria Bordeaux - Sud-Ouest, LaBRI, + * University of Bordeaux, Bordeaux INP + * + * @version 2.6.0 + * @author Mathieu Faverge + * @author Terry Cojean + * @date 2010-11-15 + * @precisions normal z -> c d s + * + **/ +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void CHAMELON_TASK_zswpp(CHAMELON_option_t *options, + CHAMELON_desc_t *A, CHAMELON_desc_t *Aij, int Aijm, int Aijn, + CHAMELON_desc_t *B, int Bm, int Bn, int i1, int i2, + CHAMELON_data_handle_t *ipiv_handle, int inc) +{ + if ( chameleon_desc_islocal( Aij, Aijm, Aijn ) || chameleon_desc_islocal( B, Bm, Bn )) { + struct starpu_codelet *codelet = &cl_zswpp; + void (*callback)(void*) = options->profiling ? cl_zswpp_callback : NULL; + + starpu_task_insert(codelet, + STARPU_VALUE, &A, sizeof(CHAMELON_desc_t*), + STARPU_VALUE, &i1, sizeof(int), + STARPU_VALUE, &i2, sizeof(int), + STARPU_VALUE, &inc, sizeof(int), + STARPU_VALUE, &Aijm, sizeof(int), + STARPU_VALUE, &Aijn, sizeof(int), + STARPU_VALUE, &Bm, sizeof(int), + STARPU_VALUE, &Bn, sizeof(int), + STARPU_RW, RTBLKADDR(Aij, CHAMELON_Complex64_t, Aijm, Aijn), + STARPU_R, *(starpu_data_handle_t *)ipiv_handle->data_handle, + STARPU_RW, RTBLKADDR(B, CHAMELON_Complex64_t, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zswpp", +#endif + 0); + } + +} + +static void cl_zswpp_cpu_func(void *descr[], void *cl_arg) +{ + int i1, i2, inc; + int *ipiv; + int Am, An, Bm, Bn; + CHAMELON_Complex64_t *A, *B; + CHAMELON_desc_t *descA; + + starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &inc, &Am, &An, &Bm, &Bn); + + A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); + B = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + + //Put Bm relative to the submatrix + Bm -= descA->i/descA->mb; + CORE_zswpp(descA, Bm, i1, i2, ipiv, inc); +} + +void CHAMELON_TASK_zswptr_ontile(const CHAMELON_option_t *options, + const CHAMELON_desc_t *descA, + const CHAMELON_desc_t *Aij, int Aijm, int Aijn, + int i1, int i2, + CHAMELON_data_handle_t *ipiv, int inc, + const CHAMELON_desc_t *Akk, int Akkm, int Akkn, int ldak) +{ + int i; + struct starpu_codelet *codelet = &cl_zswptr_ontile; + void (*callback)(void*) = options->profiling ? cl_zswptr_ontile_callback : NULL; + + if ( chameleon_desc_islocal( descA, 0, 0 )) { + struct starpu_data_descr data[descA->mt+3]; + + data[0].handle = RTBLKADDR(Aij, CHAMELON_Complex64_t, Aijm, Aijn); + data[0].mode = STARPU_RW; + data[1].handle = *(starpu_data_handle_t*)(ipiv->data_handle); + data[1].mode = STARPU_R; + data[2].handle = RTBLKADDR(Akk, CHAMELON_Complex64_t, Akkm, Akkn); + data[2].mode = STARPU_RW; + for (i=0 ; i< descA->mt ; i++) { + data[i+3].handle = RTBLKADDR(descA, CHAMELON_Complex64_t, i, 0); + data[i+3].mode = STARPU_RW; + } + + starpu_insert_task(codelet, + STARPU_VALUE, &descA, sizeof(CHAMELON_desc_t*), + STARPU_VALUE, &i1, sizeof(int), + STARPU_VALUE, &i2, sizeof(int), + STARPU_VALUE, &inc, sizeof(int), + STARPU_VALUE, &ldak, sizeof(int), + STARPU_VALUE, &Aijm, sizeof(int), + STARPU_VALUE, &Aijn, sizeof(int), + STARPU_DATA_MODE_ARRAY, &data, descA->mt + 3, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, NULL, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zswptr_ontile", +#endif + 0); + } +} + + +static void cl_zswptr_ontile_cpu_func(void *descr[], void *cl_arg) +{ + int i1, i2, inc, ldak; + int *ipiv; + int Am, An; + CHAMELON_Complex64_t *A, *Akk; + CHAMELON_desc_t *descA; + + A = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]); + ipiv = (int *)STARPU_VECTOR_GET_PTR(descr[1]); + Akk = (CHAMELON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]); + starpu_codelet_unpack_args(cl_arg, &descA, &i1, &i2, &inc, &ldak, &Am, &An); + + CORE_zswptr_ontile(descA, i1, i2, ipiv, inc, Akk, ldak); +} + +CODELETS_CPU(zswptr_ontile, STARPU_VARIABLE_NBUFFERS, cl_zswptr_ontile_cpu_func) + +CODELETS_CPU(zswpp, 3, cl_zswpp_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zswp_extract_rows.c b/runtime/starpu/codelets/codelet_zswp_extract_rows.c new file mode 100644 index 0000000000000000000000000000000000000000..897bf11f2df18123b49fdfa7d3100dd0687826db --- /dev/null +++ b/runtime/starpu/codelets/codelet_zswp_extract_rows.c @@ -0,0 +1,190 @@ +/** + * + * @file codelet_zswap_insert_rows.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 2.3.1 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void CHAMELON_TASK_zswp_extract_rows(CHAMELON_option_t *options, + int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + int NB, int MB, + CHAMELON_data_handle_t *perm, + CHAMELON_desc_t *A, int Am, int An, + CHAMELON_data_handle_t *rows_handle) +{ + void (*callback)(void*) = options->profiling ? cl_zswp_extract_rows_callback : NULL; + struct starpu_codelet *codelet = &cl_zswp_extract_rows; + int LDA = BLKLDD(A, Am); + + if (nb_pivots > 0) { + starpu_task_insert(codelet, + STARPU_VALUE, &nb_pivots, sizeof(int), + STARPU_VALUE, &global_MB, sizeof(int), + STARPU_VALUE, &K, sizeof(int), + STARPU_VALUE, &SRC_M, sizeof(int), + STARPU_VALUE, &DST_M, sizeof(int), + STARPU_VALUE, &NB, sizeof(int), + STARPU_VALUE, &MB, sizeof(int), + STARPU_VALUE, &LDA, sizeof(int), + STARPU_R, *(starpu_data_handle_t *)perm->data_handle, + STARPU_R|STARPU_LOCALITY, RTBLKADDR(A, CHAMELON_Complex64_t, Am, An), + STARPU_W, *(starpu_data_handle_t *)rows_handle->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zswp_extract_rows", +#endif + 0); + } +} + +static void cl_zswp_extract_rows_cpu_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, nb_pivots, global_MB; + CHAMELON_Complex64_t *A; + CHAMELON_Complex64_t *rows; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + CORE_zswp_extract_rows(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, NB, MB, LDA); + } + +#ifdef CHAMELEON_USE_CUDA +static void cl_zswp_extract_rows_cuda_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, nb_pivots, global_MB; + cuDoubleComplex *A; + cuDoubleComplex *rows; + CUstream stream; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]); + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + stream = starpu_cuda_get_local_stream(); + + CUDA_zswp_extract_rows(nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA, stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif + +/* + * Codelet definition + */ +CODELETS(zswp_extract_rows, 3, cl_zswp_extract_rows_cpu_func, cl_zswp_extract_rows_cuda_func, STARPU_CUDA_ASYNC) + + +void CHAMELON_TASK_zswp_extract_rows2(CHAMELON_option_t *options, + int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + int NB, int MB, + CHAMELON_data_handle_t *ipiv, + CHAMELON_data_handle_t *perm, + CHAMELON_desc_t *A, int Am, int An, + CHAMELON_data_handle_t *rows_handle, + CHAMELON_data_handle_t *idxs) +{ + void (*callback)(void*) = options->profiling ? cl_zswp_extract_rows_callback : NULL; + struct starpu_codelet *codelet = &cl_zswp_extract_rows2; + int LDA = BLKLDD(A, Am); + + if (nb_pivots > 0) { + starpu_task_insert(codelet, + STARPU_VALUE, &nb_pivots, sizeof(int), + STARPU_VALUE, &global_MB, sizeof(int), + STARPU_VALUE, &K, sizeof(int), + STARPU_VALUE, &SRC_M, sizeof(int), + STARPU_VALUE, &DST_M, sizeof(int), + STARPU_VALUE, &NB, sizeof(int), + STARPU_VALUE, &MB, sizeof(int), + STARPU_VALUE, &LDA, sizeof(int), + STARPU_R, *(starpu_data_handle_t *)perm->data_handle, + STARPU_R|STARPU_LOCALITY, RTBLKADDR(A, CHAMELON_Complex64_t, Am, An), + STARPU_W, *(starpu_data_handle_t *)rows_handle->data_handle, + STARPU_R, *(starpu_data_handle_t *)idxs->data_handle, + STARPU_R, *(starpu_data_handle_t *)ipiv->data_handle, /* dummy for proper synchronization */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zswp_extract_rows", +#endif + 0); + } +} + +static void cl_zswp_extract_rows2_cpu_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, *idxs, *ipiv, nb_pivots, global_MB; + CHAMELON_Complex64_t *A; + CHAMELON_Complex64_t *rows; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]); + idxs = (int *) STARPU_VECTOR_GET_PTR(descr[3]); + ipiv = (int *) STARPU_VECTOR_GET_PTR(descr[4]); + + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + CORE_zswp_extract_rows2(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, idxs, NB, MB, LDA); + } + +#ifdef CHAMELEON_USE_CUDA +static void cl_zswp_extract_rows2_cuda_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, *idxs, *ipiv, nb_pivots, global_MB; + cuDoubleComplex *A; + cuDoubleComplex *rows; + CUstream stream; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]); + idxs = (int *) STARPU_VECTOR_GET_PTR(descr[3]); + ipiv = (int *) STARPU_VECTOR_GET_PTR(descr[4]); + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + stream = starpu_cuda_get_local_stream(); + + CUDA_zswp_extract_rows2(nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA, stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif + +/* + * Codelet definition + */ +CODELETS(zswp_extract_rows2, 5, cl_zswp_extract_rows2_cpu_func, cl_zswp_extract_rows2_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/codelets/codelet_zswp_insert_rows.c b/runtime/starpu/codelets/codelet_zswp_insert_rows.c new file mode 100644 index 0000000000000000000000000000000000000000..1710e74c4f6bd684754dc8294666cfadb2dc19cb --- /dev/null +++ b/runtime/starpu/codelets/codelet_zswp_insert_rows.c @@ -0,0 +1,195 @@ +/** + * + * @file codelet_zswap_insert_rows.c + * + * CHAMELON codelets kernel + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 2.3.1 + * @author Omar Zenati + * @author Terry Cojean + * @date 2011-06-01 + * @precisions normal z -> c d s + * + **/ + +#include "runtime/starpu/include/chameleon_starpu.h" +#include "runtime/starpu/include/runtime_codelet_z.h" + +void CHAMELON_TASK_zswp_insert_rows(CHAMELON_option_t *options, + int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + int NB, int MB, + CHAMELON_data_handle_t *perm, + CHAMELON_desc_t *A, int Am, int An, + CHAMELON_data_handle_t *rows_handle) +{ + void (*callback)(void*) = options->profiling ? cl_zswp_insert_rows_callback : NULL; + struct starpu_codelet *codelet = &cl_zswp_insert_rows; + int LDA = BLKLDD(A, Am); + + if(nb_pivots > 0) { + starpu_insert_task( + codelet, + STARPU_VALUE, &nb_pivots, sizeof(int), + STARPU_VALUE, &global_MB, sizeof(int), + STARPU_VALUE, &K, sizeof(int), + STARPU_VALUE, &SRC_M, sizeof(int), + STARPU_VALUE, &DST_M, sizeof(int), + STARPU_VALUE, &NB, sizeof(int), + STARPU_VALUE, &MB, sizeof(int), + STARPU_VALUE, &LDA, sizeof(int), + STARPU_R, *(starpu_data_handle_t *)perm->data_handle, + STARPU_RW|STARPU_LOCALITY, RTBLKADDR(A, CHAMELON_Complex64_t, Am, An), + STARPU_R, *(starpu_data_handle_t *)rows_handle->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zswp_insert_rows", +#endif + 0); + } +} + +static void cl_zswp_insert_rows_cpu_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, nb_pivots, global_MB; + CHAMELON_Complex64_t *A; + CHAMELON_Complex64_t *rows; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + CORE_zswp_insert_rows(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, NB, MB, LDA); + + return; +} + +#ifdef CHAMELEON_USE_CUDA +static void cl_zswp_insert_rows_cuda_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, nb_pivots, global_MB; + cuDoubleComplex *A; + cuDoubleComplex *rows; + CUstream stream; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + stream = starpu_cuda_get_local_stream(); + + CUDA_zswp_insert_rows(nb_pivots, global_MB, perm, A, rows, K, SRC_M, DST_M, NB, MB, LDA, stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif + +/* + * Codelet definition + */ +CODELETS(zswp_insert_rows, 3, cl_zswp_insert_rows_cpu_func, cl_zswp_insert_rows_cuda_func, STARPU_CUDA_ASYNC) + + + +void CHAMELON_TASK_zswp_insert_rows2(CHAMELON_option_t *options, + int nb_pivots, int global_MB, + int K, int SRC_M, int DST_M, + int NB, int MB, + CHAMELON_data_handle_t *perm, + CHAMELON_desc_t *A, int Am, int An, + CHAMELON_data_handle_t *rows_handle, + CHAMELON_data_handle_t *idxs) +{ + void (*callback)(void*) = options->profiling ? cl_zswp_insert_rows_callback : NULL; + struct starpu_codelet *codelet = &cl_zswp_insert_rows2; + int LDA = BLKLDD(A, Am); + + if(nb_pivots > 0) { + starpu_insert_task( + codelet, + STARPU_VALUE, &nb_pivots, sizeof(int), + STARPU_VALUE, &global_MB, sizeof(int), + STARPU_VALUE, &K, sizeof(int), + STARPU_VALUE, &SRC_M, sizeof(int), + STARPU_VALUE, &DST_M, sizeof(int), + STARPU_VALUE, &NB, sizeof(int), + STARPU_VALUE, &MB, sizeof(int), + STARPU_VALUE, &LDA, sizeof(int), + STARPU_R, *(starpu_data_handle_t *)perm->data_handle, + STARPU_RW|STARPU_LOCALITY, RTBLKADDR(A, CHAMELON_Complex64_t, Am, An), + STARPU_R, *(starpu_data_handle_t *)rows_handle->data_handle, + STARPU_R, *(starpu_data_handle_t *)idxs->data_handle, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zswp_insert_rows", +#endif + 0); + } +} + +static void cl_zswp_insert_rows2_cpu_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, *idxs, nb_pivots, global_MB; + CHAMELON_Complex64_t *A; + CHAMELON_Complex64_t *rows; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (CHAMELON_Complex64_t *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (CHAMELON_Complex64_t *) STARPU_VECTOR_GET_PTR(descr[2]); + idxs = (int *) STARPU_VECTOR_GET_PTR(descr[3]); + + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + CORE_zswp_insert_rows2(nb_pivots, global_MB, K, SRC_M, DST_M, A, perm, rows, idxs, NB, MB, LDA); + + return; +} + +#ifdef CHAMELEON_USE_CUDA +static void cl_zswp_insert_rows2_cuda_func(void *descr[], void *cl_arg) +{ + int K, SRC_M, DST_M, NB, MB, LDA; + int *perm, *idxs, nb_pivots, global_MB; + cuDoubleComplex *A; + cuDoubleComplex *rows; + CUstream stream; + + perm = (int *) STARPU_VECTOR_GET_PTR(descr[0]); + A = (cuDoubleComplex *) STARPU_MATRIX_GET_PTR(descr[1]); + rows = (cuDoubleComplex *) STARPU_VECTOR_GET_PTR(descr[2]); + idxs = (int *) STARPU_VECTOR_GET_PTR(descr[3]); + + starpu_codelet_unpack_args(cl_arg, &nb_pivots, &global_MB, &K, &SRC_M, &DST_M, &NB, &MB, &LDA); + + stream = starpu_cuda_get_local_stream(); + + CUDA_zswp_insert_rows2(nb_pivots, global_MB, perm, A, rows, idxs, K, SRC_M, DST_M, NB, MB, LDA, stream); + +#ifndef STARPU_CUDA_ASYNC + cudaStreamSynchronize( stream ); +#endif + + return; +} +#endif + +/* + * Codelet definition + */ +CODELETS(zswp_insert_rows2, 4, cl_zswp_insert_rows2_cpu_func, cl_zswp_insert_rows2_cuda_func, STARPU_CUDA_ASYNC) diff --git a/runtime/starpu/control/runtime_cuda.c b/runtime/starpu/control/runtime_cuda.c new file mode 100644 index 0000000000000000000000000000000000000000..e35850ed2e33677c55fad61a444f2753a7da2ebd --- /dev/null +++ b/runtime/starpu/control/runtime_cuda.c @@ -0,0 +1,31 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_cuda.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Terry Cojean + * @date 2017-02-10 + * + **/ +#include <stdio.h> +#include <stdlib.h> +#include "runtime/starpu/include/chameleon_starpu.h" + +cudaStream_t RUNTIME_cuda_get_local_stream() +{ + return starpu_cuda_get_local_stream(); +} diff --git a/runtime/starpu/control/runtime_data_handle.c b/runtime/starpu/control/runtime_data_handle.c new file mode 100644 index 0000000000000000000000000000000000000000..97f949afc3d3ff738c8c64a6a25527a3dd983b36 --- /dev/null +++ b/runtime/starpu/control/runtime_data_handle.c @@ -0,0 +1,139 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2016 Inria. All rights reserved. + * @copyright (c) 2012-2014, 2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_data_handle.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @author Terry Cojean + * @date 2017-02-11 + * + **/ +#include "runtime/starpu/include/chameleon_starpu.h" + +void RUNTIME_data_alloc(CHAMELON_data_handle_t* data_handle) +{ + data_handle->data_handle = (starpu_data_handle_t*)malloc(sizeof(starpu_data_handle_t)); + return; +} + +void RUNTIME_data_free(CHAMELON_data_handle_t* data_handle) +{ + free(data_handle->data_handle); + return; +} + +void RUNTIME_vector_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t nx, size_t elemsize) +{ + starpu_vector_data_register((starpu_data_handle_t*)data_handle->data_handle, home_node, ptr, nx, elemsize); + + data_handle->ptr = (void*)ptr; + return; +} + +void RUNTIME_matrix_data_register(CHAMELON_data_handle_t* data_handle, uint32_t home_node, + uintptr_t ptr, uint32_t ld, uint32_t nx, + uint32_t ny, size_t elemsize) +{ + starpu_matrix_data_register((starpu_data_handle_t*)data_handle->data_handle, home_node, ptr, ld, nx, ny, elemsize); + data_handle->ptr = (void*)ptr; + return; +} + +void RUNTIME_data_acquire(CHAMELON_data_handle_t *data_handle) +{ + starpu_data_acquire(*(starpu_data_handle_t*)data_handle->data_handle, STARPU_R); +} + +void RUNTIME_data_acquire_write(CHAMELON_data_handle_t *data_handle) +{ + starpu_data_acquire(*(starpu_data_handle_t*)data_handle->data_handle, STARPU_W); +} + +void RUNTIME_data_release(CHAMELON_data_handle_t *data_handle) +{ + starpu_data_release(*(starpu_data_handle_t*)data_handle->data_handle); +} + +void RUNTIME_data_unregister(CHAMELON_data_handle_t* data_handle) +{ + starpu_data_handle_t *handle = (starpu_data_handle_t*)data_handle->data_handle; + starpu_data_unregister(*handle); + return; +} + +void RUNTIME_data_unregister_async(CHAMELON_data_handle_t* data_handle) +{ + starpu_data_handle_t *handle = (starpu_data_handle_t*)data_handle->data_handle; + if (handle != NULL) + starpu_data_unregister_submit(*handle); + return; +} + +struct callback_args { + starpu_data_handle_t* handle; + void *ptr; +}; + +static +void callback_free(void * arg) +{ + struct callback_args *data = (struct callback_args*)arg; + free(data->ptr); + starpu_data_release(*data->handle); + free(data); +} + +void RUNTIME_data_unregister_free_async(CHAMELON_data_handle_t* data_handle) +{ + starpu_data_handle_t *handle = (starpu_data_handle_t*)data_handle->data_handle; + if (handle != NULL) + { + struct callback_args *args = malloc(sizeof(struct callback_args)); + args->handle = handle; + args->ptr = data_handle->ptr; + starpu_data_acquire_cb(*handle, STARPU_W, callback_free, args); + starpu_data_unregister_submit(*handle); + } + return; +} + +static +void callback(void *arg) +{ + starpu_data_release(*((starpu_data_handle_t*)arg)); +} + + +void RUNTIME_data_acquire_all(CHAMELON_desc_t *desc, int mode) +{ + starpu_data_handle_t* handle = desc->schedopt; + int lmt = desc->lmt; + int lnt = desc->lnt; + int m, n; + + for (n = 0; n < lnt; n++) + for (m = 0; m < lmt; m++) + { + if ( (*handle == NULL) || + !chameleon_desc_islocal( desc, m, n ) ) + { + handle++; + continue; + } + starpu_data_acquire_cb(*handle, mode, callback, handle); + handle++; + } +} diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 385adc0df81a1a87de81971fe3590853b5214287..4a76381127d4039bf3be8ff6a347a9587f7a5d9b 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -403,6 +403,31 @@ void RUNTIME_desc_flush( const CHAM_desc_t *desc, } } +void RUNTIME_data_flush_handle(const CHAMELON_option_t *options, + const CHAMELON_data_handle_t *handle) +{ + (void)options; + { + starpu_data_handle_t *ptr = (starpu_data_handle_t*)(handle->data_handle); + + if (ptr != NULL && *ptr != NULL) + { +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_cache_flush(MPI_COMM_WORLD, *ptr); +#endif + + /* Push data to main memory when we have time to */ +#ifdef HAVE_STARPU_DATA_WONT_USE + starpu_data_wont_use(*ptr); +#elif defined HAVE_STARPU_IDLE_PREFETCH + starpu_data_acquire_on_node_cb(*ptr, -1, STARPU_R, data_flush, *ptr); +#else + starpu_data_acquire_cb(*ptr, STARPU_R, data_release, *ptr); +#endif + } + } +} + void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence, const CHAM_desc_t *A, int m, int n ) { diff --git a/runtime/starpu/control/runtime_multithreading.c b/runtime/starpu/control/runtime_multithreading.c new file mode 100644 index 0000000000000000000000000000000000000000..ddf159cf83b228a44cd85aa9d825f21dcd41534f --- /dev/null +++ b/runtime/starpu/control/runtime_multithreading.c @@ -0,0 +1,130 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @file runtime_options.c + * + * CHAMELON auxiliary routines + * CHAMELON is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver, + * and INRIA Bordeaux Sud-Ouest + * + * @version 0.9.0 + * @author Terry Cojean + * @date 2017-02-13 + * + **/ +#include "runtime/starpu/include/chameleon_starpu.h" +#include <starpu_sched_ctx.h> +#ifdef STARPU_HAVE_HWLOC +#include <hwloc.h> +#define MIN(a,b) (((a)<(b))?(a):(b)) + +void find_biggest_available_numa_node(int *ncores, int **cores) +{ + hwloc_topology_t topology; + + int nNUMAnodes; + int *navail_cores; + int biggest_avail_numa = 0; + int biggest_avail_ncores = 0; + int biggest_avail_stride = 0; + int i, j, k, w, workers_offset=0; + + for (w=STARPU_CUDA_WORKER ; w<STARPU_ANY_WORKER ; w++) + workers_offset += starpu_worker_get_count_by_type(w); + + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + + nNUMAnodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_MACHINE); + navail_cores=(int*)malloc(sizeof(int)*nNUMAnodes); + + for (i=0 ; i<nNUMAnodes ; i++) { + //printf("Numa node %d = ", i); + navail_cores[i]=0; + hwloc_obj_t NUMAnode = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, i); + + int nPUs = hwloc_get_nbobjs_inside_cpuset_by_type(topology, NUMAnode->cpuset, HWLOC_OBJ_PU); + + for (k=0 ; k<nPUs ; k++ ) { + hwloc_obj_t child = hwloc_get_obj_inside_cpuset_by_type(topology, NUMAnode->cpuset, HWLOC_OBJ_PU, k); + + int found = 0; + for (w=STARPU_CUDA_WORKER ; w<STARPU_ANY_WORKER && !found; w++) { + int nworkers = starpu_worker_get_count_by_type(w); + int *workers = (int*) malloc(sizeof(int) * nworkers); + + starpu_worker_get_ids_by_type(w, workers, nworkers); + + for(j=0 ; j<nworkers && !found ; j++) { + if (child->logical_index == starpu_worker_get_bindid(workers[j])){ + found = 1; + } + } + + free(workers); + } + if (!found) + navail_cores[i]++; + } + + //I prefere taking the last numa node if the same number of PUs are available + if (navail_cores[i] >= biggest_avail_ncores) { + biggest_avail_numa = i; + biggest_avail_ncores = navail_cores[i]; + } + } + + hwloc_topology_destroy(topology); + + //Compute stride for later + for (i=0 ; i<nNUMAnodes && i < biggest_avail_numa; i++) + biggest_avail_stride += navail_cores[i]; + + *ncores = MIN(biggest_avail_ncores, starpu_worker_get_count_by_type(STARPU_CPU_WORKER)); + (*cores) = (int*)malloc(biggest_avail_ncores*sizeof(int)); + + for (i=0 ; i<biggest_avail_ncores ; i++){ + (*cores)[i] = biggest_avail_stride + workers_offset + i; + //printf("Workers_offset... %d ;; Parallel panel : worker[%d]=%d\n", workers_offset, i, (*cores)[i]); + } + + free(navail_cores); +} +#else +void find_biggest_available_numa_node(int *ncores, int **cores) +{ + return; +} +#endif + + +void RUNTIME_unalloc_threads(CHAMELON_option_t *options) +{ +#if (STARPU_MAJOR_VERSION == 1 && STARPU_MINOR_VERSION >= 2) || STARPU_MAJOR_VERSION > 1 + starpu_sched_ctx_delete(options->panel_threads); + options->panel_threads = -1; +#endif +} + +void RUNTIME_alloc_threads(CHAMELON_option_t *options, int ncores, int *cores) +{ +#if (STARPU_MAJOR_VERSION == 1 && STARPU_MINOR_VERSION >= 2) || STARPU_MAJOR_VERSION > 1 + if (options->panel_threads < 0) + { + if (cores == NULL) + find_biggest_available_numa_node(&ncores, &cores); + options->panel_threads = starpu_sched_ctx_create(cores, ncores, "ctx_panel", STARPU_SCHED_CTX_AWAKE_WORKERS, 0); + } +#endif +} + diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c index 6ade7b962cf17ec170c6e35465086b57583e7bdd..4f66c5ace3ea236ca771d005f6e75b70003057ae 100644 --- a/runtime/starpu/control/runtime_options.c +++ b/runtime/starpu/control/runtime_options.c @@ -36,6 +36,10 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->ws_hsize = 0; options->ws_worker = NULL; options->ws_host = NULL; + + options->swap_grain = CHAMELON_SWAP_TYPE; + options->dsd_matrix = CHAMELON_DSDMATRIX == CHAMELON_TRUE; + options->panel_threads = -1; return; } diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 673d8c5d7a6ea287ed2fa4e948c6d8820f35a00a..c2ec382d601bd693b459580002642ff14420d7ff 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -76,6 +76,8 @@ CODELETS_HEADER(zgessq); CODELETS_HEADER(zgetrf); CODELETS_HEADER(zgetrf_incpiv); CODELETS_HEADER(zgetrf_nopiv); +ZCODELETS_HEADER(getrf_rectil); +ZCODELETS_HEADER(getrf_sp); CODELETS_HEADER(zherfb); CODELETS_HEADER(zlauum); CODELETS_HEADER(zpotrf); @@ -113,6 +115,17 @@ CODELETS_HEADER(zlatro); CODELETS_HEADER(zplssq); CODELETS_HEADER(zplssq2); +ZCODELETS_HEADER(getrf_compare_workspace); +ZCODELETS_HEADER(getrf_copy_workspace); +ZCODELETS_HEADER(getrf_extract_max); +ZCODELETS_HEADER(getrf_save_pivots); +ZCODELETS_HEADER(swp_extract_rows); +ZCODELETS_HEADER(swp_insert_rows); +ZCODELETS_HEADER(swp_extract_rows2); +ZCODELETS_HEADER(swp_insert_rows2); +ZCODELETS_HEADER(swpp); +ZCODELETS_HEADER(swptr_ontile); + /* * DZ functions */ diff --git a/timing/time_zgetrf_pp_tile.c b/timing/time_zgetrf_pp_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..978ab145b2a38b987532e94dc920faaa524e9dd2 --- /dev/null +++ b/timing/time_zgetrf_pp_tile.c @@ -0,0 +1,96 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @precisions normal z -> c d s + * + **/ +#define _TYPE CHAMELON_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "CHAMELON_zgetrf_pp_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_GETRF(M, N) +#define _FADDS FADDS_GETRF(M, N) + +#include "./timing.c" + +static int +RunTest(int *iparam, double *dparam, chameleon_time_t *t_) +{ + PASTE_CODE_IPARAM_LOCALS( iparam ); + + if ( M != N && check ) { + fprintf(stderr, "Check cannot be perfomed with M != N\n"); + check = 0; + } + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELON_Complex64_t, ChamComplexDouble, LDA, M, N ); + PASTE_CODE_ALLOCATE_MATRIX( piv, 1, int, chameleon_max(M, N), 1 ); + + CHAMELON_zplrnt_Tile(descA, 3456); + + /* Save A in lapack layout for check */ + PASTE_TILE_TO_LAPACK( descA, A2, check, CHAMELON_Complex64_t, LDA, N ); + PASTE_CODE_ALLOCATE_MATRIX( ipiv2, check, int, chameleon_max(M, N), 1 ); + + /* Save AT in lapack layout for check */ + if ( check ) { + LAPACKE_zgetrf_work(LAPACK_COL_MAJOR, M, N, A2, LDA, ipiv2 ); + } + + START_TIMING(); + CHAMELON_zgetrf_pp_Tile( descA, piv); + STOP_TIMING(); + + if ( check ) + { + int64_t i; + double *work = (double *)malloc(chameleon_max(M, N)*sizeof(double)); + PASTE_TILE_TO_LAPACK( descA, A, 1, CHAMELON_Complex64_t, LDA, chameleon_min(M,N) ); + + /* Check ipiv */ + for(i=0; i<chameleon_min(M,N); i++) + { + if( piv[i] != ipiv2[i] ) { + fprintf(stdout, "\nCHAMELEON (piv[%ld] = %d, A[%ld] = %e) / LAPACK (piv[%ld] = %d, A[%ld] = [%e])\n", + i, piv[i], i, creal(A[ i * LDA + i ]), + i, ipiv2[i], i, creal(A2[ i * LDA + i ])); + break; + } + } + + dparam[IPARAM_ANORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, + chameleon_lapack_const(ChamMaxNorm), + M, N, A, LDA, work); + dparam[IPARAM_XNORM] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, + chameleon_lapack_const(ChamMaxNorm), + M, N, A2, LDA, work); + dparam[IPARAM_BNORM] = 0.0; + + CORE_zgeadd( ChamNoTrans, M, N, -1.0, A, LDA, 1.0, A2, LDA); + + dparam[IPARAM_RES] = LAPACKE_zlange_work(LAPACK_COL_MAJOR, + chameleon_lapack_const(ChamMaxNorm), + M, N, A2, LDA, work); + + free( A ); + free( A2 ); + free( ipiv2 ); + free( work ); + } + + PASTE_CODE_FREE_MATRIX(descA); + free(piv); + return 0; +} diff --git a/timing/time_zgetrf_sp_tile.c b/timing/time_zgetrf_sp_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..0ffde6adc514af039dd608dc49b9284e81950bdc --- /dev/null +++ b/timing/time_zgetrf_sp_tile.c @@ -0,0 +1,84 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. + * All rights reserved. + * @copyright (c) 2012-2017 Inria. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. + * + **/ + +/** + * + * @precisions normal z -> c d s + * + **/ +#define _TYPE CHAMELON_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "CHAMELON_zgetrf_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_GETRF(M, N) +#define _FADDS FADDS_GETRF(M, N) + +#include "./timing.c" + +static int +RunTest(int *iparam, double *dparam, chameleon_time_t *t_) +{ + double criteria; + + PASTE_CODE_IPARAM_LOCALS( iparam ); + + if ( M != N && check ) { + fprintf(stderr, "Check cannot be perfomed with M != N\n"); + check = 0; + } + + PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELON_Complex64_t, ChamComplexDouble, LDA, M, N ); + CHAMELON_zplrnt_Tile(descA, 3456); + + /* Allocate Data */ + double eps = LAPACKE_dlamch_work('e'); + double anorm = CHAMELON_zlange_Tile(ChamInfNorm, descA); + criteria = eps*anorm; + + PASTE_CODE_ALLOCATE_MATRIX_TILE( descX, check, CHAMELON_Complex64_t, ChamComplexDouble, LDB, M, NRHS ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descAC, check, CHAMELON_Complex64_t, ChamComplexDouble, LDA, M, N ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, check, CHAMELON_Complex64_t, ChamComplexDouble, LDB, M, NRHS ); + + + /* Save A for check */ + if (check == 1){ + CHAMELON_zlacpy_Tile(ChamUpperLower, descA, descAC); + } + + START_TIMING(); + CHAMELON_zgetrf_sp_Tile( descA, criteria ); + STOP_TIMING(); + + /* Check the solution */ + if ( check ) + { + /* TODO: correct the check ? */ + CHAMELON_zplrnt_Tile( descX, 7732 ); + CHAMELON_zlacpy_Tile(ChamUpperLower, descX, descB); + + CHAMELON_zgetrs_nopiv_Tile( descA, descX ); + + dparam[IPARAM_ANORM] = CHAMELON_zlange_Tile(ChamInfNorm, descAC); + dparam[IPARAM_BNORM] = CHAMELON_zlange_Tile(ChamInfNorm, descB); + dparam[IPARAM_XNORM] = CHAMELON_zlange_Tile(ChamInfNorm, descX); + CHAMELON_zgemm_Tile( ChamNoTrans, ChamNoTrans, 1.0, descAC, descX, -1.0, descB ); + dparam[IPARAM_RES] = CHAMELON_zlange_Tile(ChamInfNorm, descB); + PASTE_CODE_FREE_MATRIX( descX ); + PASTE_CODE_FREE_MATRIX( descAC ); + PASTE_CODE_FREE_MATRIX( descB ); + } + + PASTE_CODE_FREE_MATRIX( descA ); + + + return 0; +} diff --git a/timing/timing.c b/timing/timing.c new file mode 100644 index 0000000000000000000000000000000000000000..3ac4ecf95997029c903ec3f04284bf98c664b1b2 --- /dev/null +++ b/timing/timing.c @@ -0,0 +1,773 @@ +/** + * + * @file timing.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon auxiliary routines + * + * @version 1.0.0 + * @author Mathieu Faverge + * @author Raphael Boucherie + * @author Dulceneia Becker + * @author Cedric Castagnede + * @date 2010-11-15 + * + */ +#if defined( _WIN32 ) || defined( _WIN64 ) +#define int64_t __int64 +#endif + +/* Define these so that the Microsoft VC compiler stops complaining + about scanf and friends */ +#define _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_WARNINGS + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +#if defined( _WIN32 ) || defined( _WIN64 ) +#include <windows.h> +#else /* Non-Windows */ +#include <unistd.h> +#include <sys/resource.h> +#endif + +#include <chameleon.h> +#if !defined(CHAMELEON_SIMULATION) +#include <coreblas/lapacke.h> +#include <coreblas.h> +#endif +#include "flops.h" +#include "timing.h" +#include "control/auxiliary.h" + +#if defined(CHAMELEON_USE_MPI) +#include <mpi.h> +#endif /* defined(CHAMELEON_USE_MPI */ + +#if defined(CHAMELEON_SCHED_STARPU) +#include <starpu.h> +#endif /* defined(CHAMELEON_SCHED_STARPU) */ + + +#if defined(CHAMELEON_HAVE_GETOPT_H) +#include <getopt.h> +#endif /* defined(CHAMELEON_HAVE_GETOPT_H) */ + +static int RunTest(int *iparam, _PREC *dparam, double *t_); +static inline void* chameleon_getaddr_null(const CHAMELON_desc_t *A, int m, int n) +{ + (void)A;(void)m;(void)n; + return (void*)( NULL ); +} + +int ISEED[4] = {0,0,0,1}; /* initial seed for zlarnv() */ + +static int +Test(int64_t n, int *iparam) { + int i, j, iter; + int thrdnbr, niter; + int64_t M, N, K, NRHS; + double *t; +#if defined(CHAMELEON_SIMULATION) + _PREC eps = 0.; +#else + _PREC eps = _LAMCH( 'e' ); +#endif + _PREC dparam[IPARAM_DNBPARAM]; + double fmuls, fadds, fp_per_mul, fp_per_add; + double sumgf, sumgf2, sumt, sd, flops, gflops; + char *s; + char *env[] = { + "OMP_NUM_THREADS", + "MKL_NUM_THREADS", + "GOTO_NUM_THREADS", + "ACML_NUM_THREADS", + "ATLAS_NUM_THREADS", + "BLAS_NUM_THREADS", "" + }; + int gnuplot = 0; + +/* + * if hres = 0 then the test succeed + * if hres = n then the test failed n times + */ + int hres = 0; + + memset( &dparam, 0, IPARAM_DNBPARAM * sizeof(_PREC) ); + dparam[IPARAM_THRESHOLD_CHECK] = 100.0; + + thrdnbr = iparam[IPARAM_THRDNBR]; + niter = iparam[IPARAM_NITER]; + + M = iparam[IPARAM_M]; + N = iparam[IPARAM_N]; + K = iparam[IPARAM_K]; + NRHS = K; + (void)M;(void)N;(void)K;(void)NRHS; + + if ( (n < 0) || (thrdnbr < 0 ) ) { + if (gnuplot && (CHAMELON_My_Mpi_Rank() == 0) ) { + printf( "set title '%d_NUM_THREADS: ", thrdnbr ); + for (i = 0; env[i][0]; ++i) { + s = getenv( env[i] ); + + if (i) printf( " " ); /* separating space */ + + for (j = 0; j < 5 && env[i][j] && env[i][j] != '_'; ++j) + printf( "%c", env[i][j] ); + + if (s) + printf( "=%s", s ); + else + printf( "->%s", "?" ); + } + printf( "'\n" ); + printf( "%s\n%s\n%s\n%s\n%s%s%s\n", + "set xlabel 'Matrix size'", + "set ylabel 'Gflop/s'", + "set key bottom", + gnuplot > 1 ? "set terminal png giant\nset output 'timeplot.png'" : "", + "plot '-' using 1:5 title '", _NAME, "' with linespoints" ); + } + return 0; + } + + if ( CHAMELON_My_Mpi_Rank() == 0) + printf( "%7d %7d %7d ", iparam[IPARAM_M], iparam[IPARAM_N], iparam[IPARAM_K] ); + fflush( stdout ); + + t = (double*)malloc(niter*sizeof(double)); + memset(t, 0, niter*sizeof(double)); + + if (sizeof(_TYPE) == sizeof(_PREC)) { + fp_per_mul = 1; + fp_per_add = 1; + } else { + fp_per_mul = 6; + fp_per_add = 2; + } + + fadds = (double)(_FADDS); + fmuls = (double)(_FMULS); + flops = 1e-9 * (fmuls * fp_per_mul + fadds * fp_per_add); + + if ( iparam[IPARAM_WARMUP] ) { + int status = RunTest( iparam, dparam, &(t[0])); + if (status != CHAMELON_SUCCESS) { + free(t); + return status; + } + } + + sumgf = 0.0; + double sumgf_upper = 0.0; + sumgf2 = 0.0; + sumt = 0.0; + + for (iter = 0; iter < niter; iter++) + { + if( iter == 0 ) { + if ( iparam[IPARAM_TRACE] ) + iparam[IPARAM_TRACE] = 2; + if ( iparam[IPARAM_DAG] ) + iparam[IPARAM_DAG] = 2; + if ( iparam[IPARAM_PROFILE] ) + iparam[IPARAM_PROFILE] = 2; + + int status = RunTest( iparam, dparam, &(t[iter])); + if (status != CHAMELON_SUCCESS) return status; + + iparam[IPARAM_TRACE] = 0; + iparam[IPARAM_DAG] = 0; + iparam[IPARAM_PROFILE] = 0; + } + else { + int status = RunTest( iparam, dparam, &(t[iter])); + if (status != CHAMELON_SUCCESS) return status; + } + gflops = flops / t[iter]; + +#if defined (CHAMELEON_SCHED_STARPU) + /* TODO: create chameleon interface encapsulating this instead */ + if (iparam[IPARAM_BOUND]) + { + double upper_gflops = 0.0; + double tmin = 0.0; + double integer_tmin = 0.0; + starpu_bound_compute(&tmin, &integer_tmin, 0); + upper_gflops = (flops / (tmin / 1000.0)); + sumgf_upper += upper_gflops; + } +#endif + sumt += t[iter]; + sumgf += gflops; + sumgf2 += gflops*gflops; + } + + gflops = sumgf / niter; + sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter); + + if ( CHAMELON_My_Mpi_Rank() == 0) { + printf( "%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd); + + if (iparam[IPARAM_BOUND]) { + printf(" %9.2f", sumgf_upper/niter); + } + + if ( iparam[IPARAM_CHECK] ){ + hres = ( dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ) > dparam[IPARAM_THRESHOLD_CHECK] ); + + if (hres) { + printf( "%8.5e %8.5e %8.5e %8.5e %8.5e FAILURE", + dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM], + dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] )); + } + else { + printf( "%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS", + dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM], + dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] )); + } + } + + if ( iparam[IPARAM_INVERSE] ) { + printf( " %8.5e %8.5e %8.5e %8.5e", + dparam[IPARAM_RNORM], dparam[IPARAM_ANORM], dparam[IPARAM_AinvNORM], + dparam[IPARAM_RNORM] /((dparam[IPARAM_ANORM] * dparam[IPARAM_AinvNORM])*n*eps)); + } + + printf("\n"); + + fflush( stdout ); + } + free(t); + + return hres; +} + +static inline int +startswith(const char *s, const char *prefix) { + size_t n = strlen( prefix ); + if (strncmp( s, prefix, n )) + return 0; + return 1; +} + +static int +get_range(char *range, int *start_p, int *stop_p, int *step_p) { + char *s, *s1, buf[21]; + int colon_count, copy_len, nbuf=20, n; + int start=1000, stop=10000, step=1000; + + colon_count = 0; + for (s = strchr( range, ':'); s; s = strchr( s+1, ':')) + colon_count++; + + if (colon_count == 0) { /* No colon in range. */ + if (sscanf( range, "%d", &start ) < 1 || start < 1) + return -1; + step = start / 10; + if (step < 1) step = 1; + stop = start + 10 * step; + + } else if (colon_count == 1) { /* One colon in range.*/ + /* First, get the second number (after colon): the stop value. */ + s = strchr( range, ':' ); + if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1) + return -1; + + /* Next, get the first number (before colon): the start value. */ + n = s - range; + copy_len = n > nbuf ? nbuf : n; + strncpy( buf, range, copy_len ); + buf[copy_len] = 0; + if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1) + return -1; + + /* Let's have 10 steps or less. */ + step = (stop - start) / 10; + if (step < 1) + step = 1; + } else if (colon_count == 2) { /* Two colons in range. */ + /* First, get the first number (before the first colon): the start value. */ + s = strchr( range, ':' ); + n = s - range; + copy_len = n > nbuf ? nbuf : n; + strncpy( buf, range, copy_len ); + buf[copy_len] = 0; + if (sscanf( buf, "%d", &start ) < 1 || start < 1) + return -1; + + /* Next, get the second number (after the first colon): the stop value. */ + s1 = strchr( s+1, ':' ); + n = s1 - (s + 1); + copy_len = n > nbuf ? nbuf : n; + strncpy( buf, s+1, copy_len ); + buf[copy_len] = 0; + if (sscanf( buf, "%d", &stop ) < 1 || stop < start) + return -1; + + /* Finally, get the third number (after the second colon): the step value. */ + if (sscanf( s1+1, "%d", &step ) < 1 || step < 1) + return -1; + } else + + return -1; + + *start_p = start; + *stop_p = stop; + *step_p = step; + + return 0; +} + +static void +show_help(char *prog_name) { + printf( "Usage:\n%s [options]\n\n", prog_name ); + printf( "Options are:\n" + " -h --help Show this help\n" + "\n" + " Machine parameters:\n" + " -t, --threads=x Number of CPU workers (default: automatic detection through runtime)\n" + " -g, --gpus=x Number of GPU workers (default: 0)\n" + " -P, --P=x Rows (P) in the PxQ process grid (deafult: 1)\n" + " --nocpu All GPU kernels are exclusively executed on GPUs (default: 0)\n" + "\n" + " Matrix parameters:\n" + " -m, --m, --M=x Dimension (M) of the matrices (default: N)\n" + " -n, --n, --N=x Dimension (N) of the matrices\n" + " -N, --n_range=R Range of N values\n" + " with R=Start:Stop:Step (default: 500:5000:500)\n" + " -k, --k, --K, --nrhs=x Dimension (K) of the matrices or number of right-hand size (default: 1)\n" + " -b, --nb=x NB size. (default: 320)\n" + " -i, --ib=x IB size. (default: 32)\n" + //" -x, --mx=x ?\n" todo + //" -X, --nx=x ?\n" todo + "\n" + " Check/prints:\n" + " --niter=x Number of iterations performed for each test (default: 1)\n" + " -W, --nowarnings Do not show warnings\n" + " -w, --nowarmup Cancel the warmup run to pre-load libraries\n" + " -c, --check Check result\n" + " -C, --inv Check on inverse\n" + " --mode=x Change xLATMS matrix mode generation for SVD/EVD (default: 4)\n" + " Must be between 0 and 20 included\n" + "\n" + " Profiling:\n" + " -T, --trace Enable trace generation\n" + " --progress Display progress indicator\n" + " -d, --dag Enable DAG generation\n" + " Generates a dot_dag_file.dot.\n" + " -p, --profile Print profiling informations\n" + "\n" + " HQR options:\n" + " -a, --qr_a, --rhblk=N If N > 0, enable Householder mode for QR and LQ factorization\n" + " N is the size of each subdomain (default: -1)\n" + " -l, --llvl=x Tree used for low level reduction inside nodes (default: -1)\n" + " -L, --hlvl=x Tree used for high level reduction between nodes, only if P > 1 (default: -1).\n" + " (-1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy)\n" + " -D, --domino Enable the domino between upper and lower trees.\n" + "\n" + " Advanced options\n" + " --nobigmat Disable single large matrix allocation for multiple tiled allocations\n" + " -s, --sync Enable synchronous calls in wrapper function such as POTRI\n" + " -o, --ooc Enable out-of-core (available only with StarPU)\n" + " -G, --gemm3m Use gemm3m complex method\n" + " --bound Compare result to area bound\n" + " --dsdmatrix Makes use of a dsdmatrix notably for GETRF\n" + " --[no|line|tile|column]swaps Change the type of swap which are used\n" + " --rectil_panel Makes use of the rectil panel for GETRF_PP\n" + " --nbpanel_threads=N Number of threads to use for the panel of GETRF_PP (default is one NUMA NODE)\n" + "\n"); +} + + +static void +print_header(char *prog_name, int * iparam) { + const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : ""; + const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : ""; + const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : ""; +#if defined(CHAMELEON_SIMULATION) + _PREC eps = 0.; +#else + _PREC eps = _LAMCH( 'e' ); +#endif + + printf( "#\n" + "# CHAMELEON %d.%d.%d, %s\n" + "# Nb threads: %d\n" + "# Nb GPUs: %d\n" +#if defined(CHAMELEON_USE_MPI) + "# Nb mpi: %d\n" + "# PxQ: %dx%d\n" +#endif + "# NB: %d\n" + "# IB: %d\n" + "# eps: %e\n" + "#\n", + CHAMELEON_VERSION_MAJOR, + CHAMELEON_VERSION_MINOR, + CHAMELEON_VERSION_MICRO, + prog_name, + iparam[IPARAM_THRDNBR], + iparam[IPARAM_NCUDAS], +#if defined(CHAMELEON_USE_MPI) + iparam[IPARAM_NMPI], + iparam[IPARAM_P], iparam[IPARAM_Q], +#endif + iparam[IPARAM_NB], + iparam[IPARAM_IB], + eps ); + + printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n", + bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header); + return; +} + +#define GETOPT_STRING "ht:g:P:8M:m:N:n:K:k:b:i:x:X:1:WwcCT2dpa:l:L:D9:3soG45" +#if defined(CHAMELEON_HAVE_GETOPT_LONG) +static struct option long_options[] = +{ + {"help", no_argument, 0, 'h'}, + // Configuration + {"threads", required_argument, 0, 't'}, + {"gpus", required_argument, 0, 'g'}, + {"P", required_argument, 0, 'P'}, + {"nocpu", no_argument, 0, '8'}, + // Matrix parameters + {"M", required_argument, 0, 'm'}, + {"m", required_argument, 0, 'm'}, + {"N", required_argument, 0, 'n'}, + {"n", required_argument, 0, 'n'}, + {"n_range", required_argument, 0, 'N'}, + {"K", required_argument, 0, 'K'}, + {"k", required_argument, 0, 'k'}, + {"nrhs", required_argument, 0, 'k'}, + {"nb", required_argument, 0, 'b'}, + {"ib", required_argument, 0, 'i'}, + {"mx", required_argument, 0, 'x'}, + {"nx", required_argument, 0, 'X'}, + // Check/prints + {"niter", required_argument, 0, '1'}, + {"nowarnings", no_argument, 0, 'W'}, + {"nowarmup", no_argument, 0, 'w'}, + {"check", no_argument, 0, 'c'}, + {"inv", no_argument, 0, 'C'}, + // Profiling + {"trace", no_argument, 0, 'T'}, + {"progress", no_argument, 0, '2'}, + {"dag", no_argument, 0, 'd'}, + {"profile", no_argument, 0, 'p'}, + // HQR options + {"rhblk", required_argument, 0, 'a'}, + {"qr_a", required_argument, 0, 'a'}, + {"llvl", required_argument, 0, 'l'}, + {"hlvl", required_argument, 0, 'L'}, + {"domino", no_argument, 0, 'D'}, + // Other + {"mode", required_argument, 0, '9'}, + {"nobigmat", no_argument, 0, '3'}, + {"sync", no_argument, 0, 's'}, + {"ooc", no_argument, 0, 'o'}, + {"gemm3m", no_argument, 0, 'G'}, + {"bound", no_argument, 0, '5'}, + {"dsdmatrix", no_argument, 0, 'S'}, + {"noswaps", no_argument, 0, '0'}, + {"columnswaps", no_argument, 0, '4'}, + {"tileswaps", no_argument, 0, '6'}, + {"lineswaps", no_argument, 0, '7'}, + {"rectil_panel", no_argument, 0, 'z'}, + {"nbpanel_threads", required_argument, 0, 'Z'}, + {0, 0, 0, 0} +}; +#endif /* defined(CHAMELEON_HAVE_GETOPT_LONG) */ + +static void +set_iparam_default(int *iparam){ + + memset(iparam, 0, IPARAM_SIZEOF*sizeof(int)); + + iparam[IPARAM_THRDNBR ] = -1; + iparam[IPARAM_THRDNBR_SUBGRP ] = 1; + iparam[IPARAM_M ] = -1; + iparam[IPARAM_N ] = -1; + iparam[IPARAM_K ] = 1; + iparam[IPARAM_LDA ] = -1; + iparam[IPARAM_LDB ] = -1; + iparam[IPARAM_LDC ] = -1; + iparam[IPARAM_MB ] = 320; + iparam[IPARAM_NB ] = 320; + iparam[IPARAM_IB ] = 32; + iparam[IPARAM_NITER ] = 1; + iparam[IPARAM_WARMUP ] = 1; + iparam[IPARAM_BIGMAT ] = 1; + iparam[IPARAM_ASYNC ] = 1; + iparam[IPARAM_MX ] = -1; + iparam[IPARAM_NX ] = -1; + iparam[IPARAM_MX ] = -1; + iparam[IPARAM_NX ] = -1; + iparam[IPARAM_INPLACE ] = CHAMELON_OUTOFPLACE; + iparam[IPARAM_NMPI ] = 1; + iparam[IPARAM_P ] = 1; + iparam[IPARAM_Q ] = 1; + iparam[IPARAM_PRINT_WARNINGS ] = 1; + iparam[IPARAM_LOWLVL_TREE ] = -1; + iparam[IPARAM_HIGHLVL_TREE ] = -1; + iparam[IPARAM_QR_TS_SZE ] = -1; + iparam[IPARAM_QR_HLVL_SZE ] = -1; + iparam[IPARAM_QR_DOMINO ] = -1; + iparam[IPARAM_SWAP_TYPE ] = CHAMELON_NO_SWAPS; + iparam[IPARAM_DSDMATRIX_MODE ] = 0; + iparam[IPARAM_RECTIL_PANEL_MODE] = 0; + iparam[IPARAM_NBPANEL_THREADS ] = -1; +} + +static inline int +read_integer_from_options(int long_index, int opt_char) +{ + char *endptr; + long int value; + (void) long_index; + + value = strtol(optarg, &endptr, 10); + if ( *optarg == '\0' || *endptr != '\0' ) { +#ifdef CHAMELEON_HAVE_GETOPT_LONG + if ( long_index < 0 ) { +#endif + fprintf(stderr, "Invalid numeric value '%s' for '-%c' parameter\n", optarg, opt_char); +#ifdef CHAMELEON_HAVE_GETOPT_LONG + } else { + fprintf(stderr, "Invalid numeric value '%s' for '--%s' parameter\n", optarg, long_options[long_index].name); + } +#endif + exit(EXIT_FAILURE); + } + if ( value > INT_MAX || value < INT_MIN ) { + fprintf(stderr, "Out of range integer '%ld'\n", value); + exit(EXIT_FAILURE); + } + return (int)value; +} + +void +parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, int*step) +{ + int opt = -1; + int c; + int argc = *_argc; + char **argv = *_argv; + + optind = 1; + do { +#if defined(CHAMELEON_HAVE_GETOPT_LONG) + opt = -1; + c = getopt_long(argc, argv, GETOPT_STRING, + long_options, &opt); +#else + c = getopt(argc, argv, GETOPT_STRING); + (void) opt; +#endif /* defined(CHAMELEON_HAVE_GETOPT_LONG) */ + + switch(c) + { + // Configuration + case 't' : iparam[IPARAM_THRDNBR ] = read_integer_from_options(opt, c); break; + case 'g' : iparam[IPARAM_NCUDAS ] = read_integer_from_options(opt, c); break; + case 'P' : iparam[IPARAM_P ] = read_integer_from_options(opt, c); break; + case '8' : iparam[IPARAM_NO_CPU ] = 1; break; + // Matrix parameters + case 'M' : + case 'm' : iparam[IPARAM_M ] = read_integer_from_options(opt, c); break; + case 'n' : iparam[IPARAM_N ] = read_integer_from_options(opt, c); break; + case 'N' : get_range(optarg, start, stop, step); break; + case 'K' : + case 'k' : iparam[IPARAM_K ] = read_integer_from_options(opt, c); break; + case 'b' : iparam[IPARAM_NB ] = read_integer_from_options(opt, c); + iparam[IPARAM_MB ] = read_integer_from_options(opt, c); break; + case 'i' : iparam[IPARAM_IB ] = read_integer_from_options(opt, c); break; + case 'x' : iparam[IPARAM_MX ] = read_integer_from_options(opt, c); break; + case 'X' : iparam[IPARAM_NX ] = read_integer_from_options(opt, c); break; + // Check/prints + case '1' : iparam[IPARAM_NITER ] = read_integer_from_options(opt, c); break; + case 'W' : iparam[IPARAM_PRINT_WARNINGS ] = 0; break; + case 'w' : iparam[IPARAM_WARMUP ] = 0; break; + case 'c' : iparam[IPARAM_CHECK ] = 1; break; + case 'C' : iparam[IPARAM_INVERSE ] = 1; break; + // Profiling + case 'T' : iparam[IPARAM_TRACE ] = 1; break; + case '2' : iparam[IPARAM_PROGRESS ] = 1; break; + case 'd' : iparam[IPARAM_DAG ] = 1; break; + case 'p' : iparam[IPARAM_PROFILE ] = 1; break; + // HQR options + case 'a' : iparam[IPARAM_RHBLK ] = read_integer_from_options(opt, c); break; + case 'l' : iparam[IPARAM_LOWLVL_TREE ] = read_integer_from_options(opt, c); break; + case 'L' : iparam[IPARAM_HIGHLVL_TREE ] = read_integer_from_options(opt, c); break; + case 'D' : iparam[IPARAM_QR_DOMINO ] = 1; break; + //Other + case '9' : iparam[IPARAM_MODE ] = read_integer_from_options(opt, c); break; + case '3' : iparam[IPARAM_BIGMAT ] = 0; break; + case 's' : iparam[IPARAM_ASYNC ] = 0; break; + case 'o' : iparam[IPARAM_OOC ] = 1; break; + case 'G' : iparam[IPARAM_GEMM3M ] = 1; break; + case '5' : iparam[IPARAM_BOUND ] = 1; break; + case 'S' : iparam[IPARAM_DSDMATRIX_MODE ] = 1; break; + case '0' : iparam[IPARAM_SWAP_TYPE ] = CHAMELON_NO_SWAPS; break; + case '4' : iparam[IPARAM_SWAP_TYPE ] = CHAMELON_COLUMN_SWAPS; break; + case '6' : iparam[IPARAM_SWAP_TYPE ] = CHAMELON_TILE_SWAPS; break; + case '7' : iparam[IPARAM_SWAP_TYPE ] = CHAMELON_LINE_SWAPS; break; + case 'z' : iparam[IPARAM_RECTIL_PANEL_MODE] = 1; break; + case 'Z' : iparam[IPARAM_NBPANEL_THREADS ] = read_integer_from_options(opt, c); break; + case 'h' : + case '?' : + show_help(argv[0]); exit(EXIT_FAILURE); + default: + break; + } + } while(-1 != c); +} + +int +main(int argc, char *argv[]) { + int i, m, n, mx, nx; + int status; + int nbnode = 1; + int start = 500; + int stop = 5000; + int step = 500; + int iparam[IPARAM_SIZEOF]; + int success = 0; + + set_iparam_default(iparam); + + parse_arguments(&argc, &argv, iparam, &start, &stop, &step); + +#if !defined(CHAMELEON_USE_CUDA) + if (iparam[IPARAM_NCUDAS] != 0){ + fprintf(stderr, "ERROR: CHAMELEON_USE_CUDA is not defined. " + "The number of CUDA devices must be set to 0 (--gpus=0).\n"); + return EXIT_FAILURE; + } +#endif + + n = iparam[IPARAM_N]; + m = iparam[IPARAM_M]; + mx = iparam[IPARAM_MX]; + nx = iparam[IPARAM_NX]; + + /* Initialize CHAMELON */ + CHAMELON_Init( iparam[IPARAM_THRDNBR], + iparam[IPARAM_NCUDAS] ); + + /* Get the number of threads set by the runtime */ + iparam[IPARAM_THRDNBR] = CHAMELON_GetThreadNbr(); + + /* Stops profiling here to avoid profiling uninteresting routines. + It will be reactivated in the time_*.c routines with the macro START_TIMING() */ + RUNTIME_stop_profiling(); + + CHAMELON_Disable(CHAMELON_AUTOTUNING); + CHAMELON_Set(CHAMELON_TILE_SIZE, iparam[IPARAM_NB] ); + CHAMELON_Set(CHAMELON_INNER_BLOCK_SIZE, iparam[IPARAM_IB] ); + CHAMELON_Set(CHAMELON_SWAP_TYPE_SET, iparam[IPARAM_SWAP_TYPE]); + CHAMELON_Set(CHAMELON_NBPANEL_THREADS_SET, iparam[IPARAM_NBPANEL_THREADS]); + + /* Householder mode */ + if (iparam[IPARAM_RHBLK] < 1) { + CHAMELON_Set(CHAMELON_HOUSEHOLDER_MODE, CHAMELON_FLAT_HOUSEHOLDER); + } else { + CHAMELON_Set(CHAMELON_HOUSEHOLDER_MODE, CHAMELON_TREE_HOUSEHOLDER); + CHAMELON_Set(CHAMELON_HOUSEHOLDER_SIZE, iparam[IPARAM_RHBLK]); + } + + if (iparam[IPARAM_PROFILE] == 1) { + CHAMELON_Enable(CHAMELON_PROFILING_MODE); + } + + if (iparam[IPARAM_PROGRESS] == 1) { + CHAMELON_Enable(CHAMELON_PROGRESS); + } + + if (iparam[IPARAM_PRINT_WARNINGS] == 0) { + CHAMELON_Disable(CHAMELON_WARNINGS); + } + + if (iparam[IPARAM_GEMM3M] == 1) { + CHAMELON_Enable(CHAMELON_GEMM3M); + } + + if (iparam[IPARAM_DSDMATRIX_MODE] == 1) + CHAMELON_Enable(CHAMELON_DSDMATRIX_MODE); + + if (iparam[IPARAM_RECTIL_PANEL_MODE] == 1) + CHAMELON_Enable(CHAMELON_RECTIL_PANEL_MODE); + +#if defined(CHAMELEON_USE_MPI) + nbnode = CHAMELON_Comm_size(); + iparam[IPARAM_NMPI] = nbnode; + /* Check P */ + if ( (iparam[IPARAM_P] > 1) && + (nbnode % iparam[IPARAM_P] != 0) ) { + fprintf(stderr, "ERROR: %d doesn't divide the number of node %d\n", + iparam[IPARAM_P], nbnode ); + return EXIT_FAILURE; + } +#endif + iparam[IPARAM_Q] = nbnode / iparam[IPARAM_P]; + + /* Layout conversion */ + CHAMELON_Set(CHAMELON_TRANSLATION_MODE, iparam[IPARAM_INPLACE]); + + if ( CHAMELON_My_Mpi_Rank() == 0 ) { + print_header( argv[0], iparam); + } + + if (step < 1) step = 1; + + status = Test( -1, iparam ); /* print header */ + if (status != CHAMELON_SUCCESS) return status; + if ( n == -1 ){ + for (i = start; i <= stop; i += step) + { + if ( nx > 0 ) { + iparam[IPARAM_M] = i; + iparam[IPARAM_N] = chameleon_max(1, i/nx); + } + else if ( mx > 0 ) { + iparam[IPARAM_M] = chameleon_max(1, i/mx); + iparam[IPARAM_N] = i; + } + else { + if ( m == -1 ) { + iparam[IPARAM_M] = i; + } + iparam[IPARAM_N] = i; + } + status = Test( iparam[IPARAM_N], iparam ); + if (status != CHAMELON_SUCCESS) { + return status; + } + success += status; + } + } + else { + if ( m == -1 ) { + iparam[IPARAM_M] = n; + } + iparam[IPARAM_N] = n; + status = Test( iparam[IPARAM_N], iparam ); + if (status != CHAMELON_SUCCESS) return status; + success += status; + } + CHAMELON_Finalize(); + return success; +} + diff --git a/timing/timing.h b/timing/timing.h new file mode 100644 index 0000000000000000000000000000000000000000..8bcc43119a07240b704a4cba41cebe8a08b10ce4 --- /dev/null +++ b/timing/timing.h @@ -0,0 +1,247 @@ +/** + * + * @file timing.h + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2015 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + * @version 1.0.0 + * + */ +#ifndef TIMING_H +#define TIMING_H + +typedef double chameleon_time_t; + +enum iparam_timing { + IPARAM_THRDNBR, /* Number of cores */ + IPARAM_THRDNBR_SUBGRP, /* Number of cores in a subgroup (NUMA node) */ + IPARAM_SCHEDULER, /* What scheduler do we choose (dyn, stat) */ + IPARAM_M, /* Number of rows of the matrix */ + IPARAM_N, /* Number of columns of the matrix */ + IPARAM_K, /* RHS or K */ + IPARAM_LDA, /* Leading dimension of A */ + IPARAM_LDB, /* Leading dimension of B */ + IPARAM_LDC, /* Leading dimension of C */ + IPARAM_IB, /* Inner-blocking size */ + IPARAM_NB, /* Number of columns in a tile */ + IPARAM_MB, /* Number of rows in a tile */ + IPARAM_NITER, /* Number of iteration of each test */ + IPARAM_WARMUP, /* Run one test to load dynamic libraries */ + IPARAM_BIGMAT, /* Allocating one big mat or plenty of small */ + IPARAM_CHECK, /* Checking activated or not */ + IPARAM_VERBOSE, /* How much noise do we want? */ + IPARAM_AUTOTUNING, /* Disable/enable autotuning */ + IPARAM_INPUTFMT, /* Input format (Use only for getmi/gecfi) */ + IPARAM_OUTPUTFMT, /* Output format (Use only for getmi/gecfi) */ + IPARAM_TRACE, /* Generate trace on the first non warmup run */ + IPARAM_DAG, /* Do we require to output the DOT file? */ + IPARAM_ASYNC, /* Asynchronous calls */ + IPARAM_OOC, /* Out of Core */ + IPARAM_MX, /* */ + IPARAM_NX, /* */ + IPARAM_RHBLK, /* Householder reduction parameter for QR/LQ */ + IPARAM_INPLACE, /* InPlace/OutOfPlace translation mode */ + IPARAM_MODE, /* Eigenvalue generation mode */ + + IPARAM_INVERSE, + IPARAM_NCUDAS, + IPARAM_NMPI, + IPARAM_P, /* Parameter for 2D cyclic distribution */ + IPARAM_Q, /* Parameter for 2D cyclic distribution */ + + IPARAM_PROGRESS, /* Use a progress indicator during computations */ + IPARAM_GEMM3M, /* Use GEMM3M for complex matrix vector products */ + /* Added for StarPU version */ + IPARAM_PROFILE, + IPARAM_PRINT_WARNINGS, + IPARAM_PARALLEL_TASKS, + IPARAM_NO_CPU, + IPARAM_BOUND, + /* End */ + /* Added for libhqr version */ + IPARAM_LOWLVL_TREE, /* Tree used for reduction inside nodes */ + IPARAM_HIGHLVL_TREE, /* Tree used for reduction between nodes */ + IPARAM_QR_TS_SZE, /* Size of TS domain */ + IPARAM_QR_HLVL_SZE, /* Size of the high level tree */ + IPARAM_QR_DOMINO, /* Enable/disable the domino tree */ + IPARAM_QR_TSRR, /* Enable/disable the round-robin on TS domain */ + IPARAM_SWAP_TYPE, + IPARAM_DSDMATRIX_MODE, + IPARAM_RECTIL_PANEL_MODE, + IPARAM_NBPANEL_THREADS, + /* End */ + IPARAM_SIZEOF +}; + +enum dparam_timing { + IPARAM_TIME, + IPARAM_ANORM, + IPARAM_BNORM, + IPARAM_XNORM, + IPARAM_RNORM, + IPARAM_AinvNORM, + IPARAM_RES, + /* Begin section for hydra integration tool */ + IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */ + /* End section for hydra integration tool */ + IPARAM_DNBPARAM +}; + +#define PASTE_CODE_IPARAM_LOCALS(iparam) \ + double t; \ + int64_t M = iparam[IPARAM_M]; \ + int64_t N = iparam[IPARAM_N]; \ + int64_t K = iparam[IPARAM_K]; \ + int64_t NRHS = K; \ + int64_t LDA = chameleon_max(M, iparam[IPARAM_LDA]); \ + int64_t LDB = chameleon_max(N, iparam[IPARAM_LDB]); \ + int64_t LDC = chameleon_max(K, iparam[IPARAM_LDC]); \ + int64_t IB = iparam[IPARAM_IB]; \ + int64_t MB = iparam[IPARAM_MB]; \ + int64_t NB = iparam[IPARAM_NB]; \ + int64_t P = iparam[IPARAM_P]; \ + int64_t Q = iparam[IPARAM_Q]; \ + int64_t MT = (M%MB==0) ? (M/MB) : (M/MB+1); \ + int64_t NT = (N%NB==0) ? (N/NB) : (N/NB+1); \ + int bigmat = iparam[IPARAM_BIGMAT]; \ + int ooc = iparam[IPARAM_OOC]; \ + int check = iparam[IPARAM_CHECK]; \ + int loud = iparam[IPARAM_VERBOSE]; \ + (void)M;(void)N;(void)K;(void)NRHS; \ + (void)LDA;(void)LDB;(void)LDC; \ + (void)IB;(void)MB;(void)NB;(void)P;(void)Q; \ + (void)MT;(void)NT;(void)check; \ + (void)loud;(void)bigmat;(void)ooc; + +/* Paste code to allocate a matrix in desc if cond_init is true */ +#define PASTE_CODE_ALLOCATE_MATRIX_TILE(_desc_, _cond_, _type_, _type2_, _lda_, _m_, _n_) \ + CHAMELON_desc_t *_desc_ = NULL; \ + int status ## _desc_ ; \ + if( _cond_ ) { \ + if (ooc) \ + status ## _desc_ = CHAMELON_Desc_Create_OOC(&(_desc_), _type2_, MB, NB, MB*NB, _lda_, _n_, 0, 0, _m_, _n_, \ + P, Q); \ + else if (!bigmat) \ + status ## _desc_ = CHAMELON_Desc_Create_User(&(_desc_), NULL, _type2_, MB, NB, MB*NB, _lda_, _n_, 0, 0, _m_, _n_, \ + P, Q, chameleon_getaddr_null, NULL, NULL); \ + else \ + status ## _desc_ = CHAMELON_Desc_Create(&(_desc_), NULL, _type2_, MB, NB, MB*NB, _lda_, _n_, 0, 0, _m_, _n_, \ + P, Q); \ + if (status ## _desc_ != CHAMELON_SUCCESS) return (status ## _desc_); \ + } + +#define PASTE_CODE_FREE_MATRIX(_desc_) \ + CHAMELON_Desc_Destroy( &_desc_ ); + +#define PASTE_TILE_TO_LAPACK(_desc_, _name_, _cond_, _type_, _lda_, _n_) \ + _type_ *_name_ = NULL; \ + if ( _cond_ ) { \ + _name_ = (_type_*)malloc( (_lda_) * (_n_) * sizeof(_type_)); \ + if ( ! _name_ ) { \ + fprintf(stderr, "Out of Memory for %s\n", #_name_); \ + return -1; \ + } \ + CHAMELON_Tile_to_Lapack(_desc_, (void*)_name_, _lda_); \ + } + +#define PASTE_CODE_ALLOCATE_MATRIX(_name_, _cond_, _type_, _lda_, _n_) \ + _type_ *_name_ = NULL; \ + if( _cond_ ) { \ + _name_ = (_type_*)malloc( (_lda_) * (_n_) * sizeof(_type_) ); \ + if ( ! _name_ ) { \ + fprintf(stderr, "Out of Memory for %s\n", #_name_); \ + return -1; \ + } \ + } + +#define PASTE_CODE_ALLOCATE_COPY(_name_, _cond_, _type_, _orig_, _lda_, _n_) \ + _type_ *_name_ = NULL; \ + if( _cond_ ) { \ + _name_ = (_type_*)malloc( (_lda_) * (_n_) * sizeof(_type_) ); \ + if ( ! _name_ ) { \ + fprintf(stderr, "Out of Memory for %s\n", #_name_); \ + return -1; \ + } \ + memcpy(_name_, _orig_, (_lda_) * (_n_) * sizeof(_type_) ); \ + } + +/** + * + * Macro for trace generation + * + */ +#define START_TRACING() \ + RUNTIME_start_stats(); \ + if(iparam[IPARAM_TRACE] == 2) { \ + RUNTIME_start_profiling(); \ + } \ + if(iparam[IPARAM_BOUND]) { \ + CHAMELON_Enable(CHAMELON_BOUND); \ + } + +#define STOP_TRACING() \ + RUNTIME_stop_stats(); \ + if(iparam[IPARAM_TRACE] == 2) { \ + RUNTIME_stop_profiling(); \ + } \ + if(iparam[IPARAM_BOUND]) { \ + CHAMELON_Disable(CHAMELON_BOUND); \ + } + +/** + * + * Macro for DAG generation + * + */ +#if 0 +#define START_DAG() \ + if ( iparam[IPARAM_DAG] == 2 ) \ + CHAMELON_Enable(CHAMELON_DAG); + +#define STOP_DAG() \ + if ( iparam[IPARAM_DAG] == 2 ) \ + CHAMELON_Disable(CHAMELON_DAG); +#else +#define START_DAG() do {} while(0); +#define STOP_DAG() do {} while(0); +#endif + +/** + * + * Synchro for distributed computations + * + */ +#if defined(CHAMELEON_USE_MPI) +#define START_DISTRIBUTED() CHAMELON_Distributed_start(); +#define STOP_DISTRIBUTED() CHAMELON_Distributed_stop(); +#else +#define START_DISTRIBUTED() do {} while(0); +#define STOP_DISTRIBUTED() do {} while(0); +#endif + +/** + * + * General Macros for timing + * + */ +#define START_TIMING() \ + START_DAG(); \ + START_TRACING(); \ + START_DISTRIBUTED(); \ + t = -RUNTIME_get_time(); + +#define STOP_TIMING() \ + STOP_DISTRIBUTED(); \ + t += RUNTIME_get_time(); \ + STOP_TRACING(); \ + STOP_DAG(); \ + if (iparam[IPARAM_PROFILE] == 2) { \ + RUNTIME_kernelprofile_display(); \ + RUNTIME_schedprofile_display(); \ + } \ + *t_ = t; + +#endif /* TIMING_H */