diff --git a/doc/orgmode/chapters/using.org b/doc/orgmode/chapters/using.org index 7bf39140f57702191dee06f459a506e2754c254f..86e108b4fc76133408a616fa21c6cc3750b0507f 100644 --- a/doc/orgmode/chapters/using.org +++ b/doc/orgmode/chapters/using.org @@ -165,33 +165,58 @@ #+end_src List of main options that can be used in timing: - * ~--help~: show usage - * ~--threads~: Number of CPU workers (default: - ~_SC_NPROCESSORS_ONLN~) - * ~--gpus~: number of GPU workers (default: ~0~) - * ~--n_range=R~: range of N values, with ~R=Start:Stop:Step~ - (default: ~500:5000:500~) - * ~--m=X~: dimension (M) of the matrices (default: ~N~) - * ~--k=X~: dimension (K) of the matrices (default: ~1~), useful for - GEMM algorithm (k is the shared dimension and must be defined - >1 to consider matrices and not vectors) - * ~--nrhs=X~: number of right-hand size (default: ~1~) - * ~--nb=X~: block/tile size. (default: ~128~) - * ~--ib=X~: inner-blocking/IB size. (default: ~32~) - * ~--niter=X~: number of iterations performed for each test - (default: ~1~) - * ~--rhblk=X~: if X > 0, enable Householder mode for QR and LQ - factorization. 
X is the size of each subdomain (default: ~0~) - * ~--[no]check~: check result (default: ~nocheck~) - * ~--[no]profile~: print profiling informations (default: - ~noprofile~) - * ~--[no]trace~: enable/disable trace generation (default: ~notrace~) - * ~--[no]dag~: enable/disable DAG generation (default: ~nodag~) - * ~--[no]inv~: check on inverse (default: ~noinv~) - * ~--nocpu~: all GPU kernels are exclusively executed on GPUs - * ~--ooc~: Enable out-of-core (available only with StarPU) - * ~--bound~: Compare result to area bound (available only with - StarPU) (default: ~0~) + * ~--help~: Show usage + * Machine parameters + * ~-t x, --threads=x~: Number of CPU workers (default: automatic + detection through runtime) + * ~-g x, --gpus=x~: Number of GPU workers (default: ~0~) + * ~-P x, --P=x~: Rows (P) in the PxQ process grid (default: ~1~) + * ~--nocpu~: All GPU kernels are exclusively executed on GPUs + * Matrix parameters + * ~-m x, --m=x, --M=x~: Dimension (M) of the matrices (default: + ~N~) + * ~-n x, --n=x, --N=x~: Dimension (N) of the matrices + * ~-N R, --n_range=R~: Range of N values to time with + ~R=Start:Stop:Step~ (default: ~500:5000:500~) + * ~-k x, --k=x, --K=x, --nrhs=x~: Dimension (K) of the matrices + or number of right-hand sides (default: ~1~). This is useful for + GEMM algorithms (k is the shared dimension and must be defined + >1 to consider matrices and not vectors) + * ~-b x, --nb=x~: NB size. (default: ~320~) + * ~-i x, --ib=x~: IB size. (default: ~32~) + * Check/prints + * ~--niter=X~: Number of iterations performed for each test + (default: ~1~) + * ~-W, --nowarning~: Do not show warnings + * ~-w, --nowarmup~: Cancel the warmup run to pre-load libraries + * ~-c, --check~: Check result + * ~-C, --inv~: Check on inverse + * ~--mode=x~ : Change the xLATMS matrix mode generation for + SVD/EVD (default: ~4~). It must be between 0 and 20 included.
+ * Profiling parameters + * ~-T, --trace~: Enable trace generation + * ~--progress~: Display progress indicator + * ~-d, --dag~: Enable DAG generation. Generates a dot_dag_file.dot. + * ~-p, --profile~: Print profiling information + * HQR parameters + * ~-a x, --qr_a=x, --rhblk=x~: Define the size of the local TS + trees in Householder reduction trees for QR and LQ + factorization. N is the size of each subdomain (default: ~-1~) + * ~-l x, --llvl=x~: Tree used for low level reduction inside + nodes (default: ~-1~) + * ~-L x, --hlvl=x~: Tree used for high level reduction between + nodes, only if P > 1 (default: ~-1~). Possible values are -1: + Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: + Replicated greedy. + * ~-D, --domino~: Enable the domino between upper and lower trees + * Advanced options + * ~--nobigmat~: Disable single large matrix allocation for + multiple tiled allocations + * ~-s, --sync~: Enable synchronous calls in wrapper function such + as POTRI + * ~-o, --ooc~: Enable out-of-core (available only with StarPU) + * ~-G, --gemm3m~: Use gemm3m complex method + * ~--bound~: Compare result to area bound List of timing algorithms available: * LANGE: norms of matrices diff --git a/doc/texinfo/chapters/using.texi b/doc/texinfo/chapters/using.texi index 61963c4410973358ad5d0b268bbfeb593075128f..cf83f26e85cd2f7959a3b2ce81cfd96882469bc9 100644 --- a/doc/texinfo/chapters/using.texi +++ b/doc/texinfo/chapters/using.texi @@ -83,36 +83,53 @@ copy from LAPACK matrix layout to tile matrix layout are necessary.
List of main options that can be used in timing: @itemize @bullet @item @option{--help}: show usage - @item @option{--threads}: Number of CPU workers (default: -@option{_SC_NPROCESSORS_ONLN}) - @item @option{--gpus}: number of GPU workers (default: @option{0}) - @item @option{--n_range=R}: range of N values, with -@option{R=Start:Stop:Step} -(default: @option{500:5000:500}) - @item @option{--m=X}: dimension (M) of the matrices (default: @option{N}) - @item @option{--k=X}: dimension (K) of the matrices (default: @option{1}), -useful for GEMM algorithm (k is the shared dimension and must be defined >1 to -consider matrices and not vectors) - @item @option{--nrhs=X}: number of right-hand size (default: @option{1}) - @item @option{--nb=X}: block/tile size. (default: @option{128}) - @item @option{--ib=X}: inner-blocking/IB size. (default: @option{32}) - @item @option{--niter=X}: number of iterations performed for each test -(default: @option{1}) - @item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ -factorization. 
X is the size of each subdomain (default: @option{0}) - @item @option{--[no]check}: check result (default: @option{nocheck}) - @item @option{--[no]profile}: print profiling informations (default: -@option{noprofile}) - @item @option{--[no]trace}: enable/disable trace generation (default: -@option{notrace}) - @item @option{--[no]dag}: enable/disable DAG generation (default: -@option{nodag}) - @item @option{--[no]inv}: check on inverse (default: @option{noinv}) - @item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs - @item @option{--ooc}: Enable out-of-core (available only with StarPU) - @item @option{--bound}: Compare result to area bound (available only with StarPU) -(default: @option{0}) - @end itemize + @item Machine parameters + @itemize @bullet + @item @option{-t x, --threads=x}: Number of CPU workers (default: automatic detection through runtime) + @item @option{-g x, --gpus=x}: Number of GPU workers (default: @option{0}) + @item @option{-P x, --P=x}: Rows (P) in the PxQ process grid (default: @option{1}) + @item @option{--nocpu}: All GPU kernels are exclusively executed on GPUs (default: @option{0}) + @end itemize + @item Matrix parameters + @itemize @bullet + @item @option{-m x, --m=x, --M=x}: Dimension (M) of the matrices (default: @option{N}) + @item @option{-n x, --n=x, --N=x}: Dimension (N) of the matrices + @item @option{-N R, --n_range=R}: Range of N values to time with R=Start:Stop:Step (default: @option{500:5000:500}) + @item @option{-k x, --k=x, --K=x, --nrhs=x}: Dimension (K) of the matrices or number of right-hand sides (default: @option{1}). This is useful for GEMM like algorithms (k is the shared dimension and must be defined >1 to consider matrices and not vectors) + @item @option{-b x, --nb=x}: NB size. (default: @option{320}) + @item @option{-i x, --ib=x}: IB size.
(default: @option{32}) + @end itemize + @item Check/prints + @itemize @bullet + @item @option{--niter=x}: number of iterations performed for each test (default: @option{1}) + @item @option{-W, --nowarnings}: Do not show warnings + @item @option{-w, --nowarmup}: Cancel the warmup run to pre-load libraries + @item @option{-c, --check}: Check result + @item @option{-C, --inv}: Check on inverse + @item @option{--mode=x}: Change the xLATMS matrix mode generation for SVD/EVD (default: @option{4}). It must be between 0 and 20 included. + @end itemize + @item Profiling parameters + @itemize @bullet + @item @option{-T, --trace}: Enable trace generation + @item @option{--progress}: Display progress indicator + @item @option{-d, --dag}: Enable DAG generation. Generates a dot_dag_file.dot. + @item @option{-p, --profile}: Print profiling information + @end itemize + @item HQR parameters + @itemize @bullet + @item @option{-a x, --qr_a=x, --rhblk=x}: Define the size of the local TS trees in Householder reduction trees for QR and LQ factorization. N is the size of each subdomain (default: @option{-1}) + @item @option{-l x, --llvl=x}: Tree used for low level reduction inside nodes (default: @option{-1}) + @item @option{-L x, --hlvl=x}: Tree used for high level reduction between nodes, only if P > 1 (default: @option{-1}). Possible values are -1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy.
+ @item @option{-D, --domino}: Enable the domino between upper and lower trees + @end itemize + @item Advanced options + @itemize @bullet + @item @option{--nobigmat}: Disable single large matrix allocation for multiple tiled allocations + @item @option{-s, --sync}: Enable synchronous calls in wrapper function such as POTRI + @item @option{-o, --ooc}: Enable out-of-core (available only with StarPU) + @item @option{-G, --gemm3m}: Use gemm3m complex method + @item @option{--bound}: Compare result to area bound + @end itemize List of timing algorithms available: @itemize @bullet diff --git a/example/basic_zposv/basic_posv.h b/example/basic_zposv/basic_posv.h index 04748c34f6b768f9fa2ff4a6bc7aa9206c9b13b5..5f7aa0dc06ece32f9e856083439ab7cf035f7cb7 100644 --- a/example/basic_zposv/basic_posv.h +++ b/example/basic_zposv/basic_posv.h @@ -117,7 +117,6 @@ enum iparam_examples { /* Added for StarPU version */ IPARAM_PROFILE, IPARAM_PRINT_ERRORS, - IPARAM_PEAK, IPARAM_PARALLEL_TASKS, IPARAM_NO_CPU, IPARAM_BOUND, @@ -132,7 +131,6 @@ enum dparam_examples { IPARAM_XNORM, IPARAM_RNORM, IPARAM_AinvNORM, - IPARAM_ESTIMATED_PEAK, IPARAM_RES, /* Begin section for hydra integration tool */ IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */ @@ -179,7 +177,6 @@ static void init_iparam(int iparam[IPARAM_SIZEOF]){ iparam[IPARAM_Q ] = 1; iparam[IPARAM_PROFILE ] = 0; iparam[IPARAM_PRINT_ERRORS ] = 0; - iparam[IPARAM_PEAK ] = 0; iparam[IPARAM_PARALLEL_TASKS] = 0; iparam[IPARAM_NO_CPU ] = 0; iparam[IPARAM_BOUND ] = 0; @@ -210,7 +207,6 @@ static void print_header(char *prog_name, int * iparam) { const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : ""; const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : ""; const char *inverse_header = iparam[IPARAM_INVERSE] ? 
" ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : ""; - const char *peak_header = iparam[IPARAM_PEAK] ? " (% of peak) peak" : ""; #if defined(CHAMELEON_SIMULATION) double eps = 0.; #else @@ -235,8 +231,8 @@ static void print_header(char *prog_name, int * iparam) { iparam[IPARAM_IB], eps ); - printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n", - bound_header, peak_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header); + printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n", + bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header); printf( "# %5.0d %5.0d %5.0d\n", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_K]); return; } diff --git a/runtime/starpu/include/runtime_codelet_profile.h b/runtime/starpu/include/runtime_codelet_profile.h index 88fa98da4802e603e196ae80195fc57e76a327eb..c9cbb5099c15d0896f7e0097c593a25ca78b4072 100644 --- a/runtime/starpu/include/runtime_codelet_profile.h +++ b/runtime/starpu/include/runtime_codelet_profile.h @@ -67,53 +67,11 @@ } \ } \ } \ - void estimate_##name##_sustained_peak(double *res_peak) \ - { \ - /* We use a heuristic where we assume that all GPUs have some work, and \ - * that some CPUs may not have been involved. This may not be \ - * applicable to small problems where only a subset of the processing \ - * units are used. We assume that all CPUs are the same, so we multiply \ - * the best performance obtained on a CPU by the number of CPUs, and we \ - * add this to the sum of the performance obtained by the GPUs. 
*/ \ - double peak = 0.0; \ - double best_cpu = 0.0; \ - unsigned ncpus = 0; \ - \ - unsigned worker; \ - for (worker = 0; worker < starpu_worker_get_count(); worker++) \ - { \ - unsigned cpu_worker = (starpu_worker_get_type(worker) == STARPU_CPU_WORKER); \ - if (cpu_worker) \ - ncpus++; \ - \ - if (name##_perf[worker].n > 0) \ - { \ - long n = name##_perf[worker].n; \ - double sum = name##_perf[worker].sum; \ - double avg = sum / n; \ - \ - if (cpu_worker) \ - { \ - if (avg > best_cpu) \ - best_cpu = avg; \ - } \ - else \ - { \ - peak += avg; \ - } \ - } \ - } \ - \ - peak += ncpus * best_cpu; \ - \ - *res_peak = peak; \ - } \ #define CHAMELEON_CL_CB_HEADER(name) \ extern struct starpu_perfmodel*cl_##name##_save; \ extern struct starpu_perfmodel cl_##name##_fake; \ void cl_##name##_callback(); \ - void profiling_display_##name##_info(void); \ - void estimate_##name##_sustained_peak(double *res) + void profiling_display_##name##_info(void); #endif /* __CODELET_PROFILE_H__ */ diff --git a/timing/timing.c b/timing/timing.c index ca5f3d1b7e646f46f4f086963a1c32b5e8805950..093ade83cec701cb3880aaa4ee36e91fe161880f 100644 --- a/timing/timing.c +++ b/timing/timing.c @@ -221,34 +221,30 @@ Test(int64_t n, int *iparam) { if ( MORSE_My_Mpi_Rank() == 0) { printf( "%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd); - if (iparam[IPARAM_BOUND]) + if (iparam[IPARAM_BOUND]) { printf(" %9.2f", sumgf_upper/niter); - - if ( iparam[IPARAM_PEAK] ) - { - if (dparam[IPARAM_ESTIMATED_PEAK]<0.0f) - printf(" n/a n/a "); - else - printf(" %5.2f%% %9.2f ", 100.0f*(gflops/dparam[IPARAM_ESTIMATED_PEAK]), dparam[IPARAM_ESTIMATED_PEAK]); } if ( iparam[IPARAM_CHECK] ){ hres = ( dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ) > dparam[IPARAM_THRESHOLD_CHECK] ); - if (hres) + if (hres) { printf( "%8.5e %8.5e %8.5e %8.5e %8.5e FAILURE", dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM], dparam[IPARAM_RES] / n / eps / 
(dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] )); - else + } + else { printf( "%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS", dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM], dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] )); + } } - if ( iparam[IPARAM_INVERSE] ) + if ( iparam[IPARAM_INVERSE] ) { printf( " %8.5e %8.5e %8.5e %8.5e", dparam[IPARAM_RNORM], dparam[IPARAM_ANORM], dparam[IPARAM_AinvNORM], dparam[IPARAM_RNORM] /((dparam[IPARAM_ANORM] * dparam[IPARAM_AinvNORM])*n*eps)); + } printf("\n"); @@ -387,7 +383,6 @@ show_help(char *prog_name) { " -s, --sync Enable synchronous calls in wrapper function such as POTRI\n" " -o, --ooc Enable out-of-core (available only with StarPU)\n" " -G, --gemm3m Use gemm3m complex method\n" - //" --peak ?\n"todo " --bound Compare result to area bound\n" "\n"); } @@ -398,7 +393,6 @@ print_header(char *prog_name, int * iparam) { const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : ""; const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : ""; const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : ""; - const char *peak_header = iparam[IPARAM_PEAK] ? " (% of peak) peak" : ""; #if defined(CHAMELEON_SIMULATION) _PREC eps = 0.; #else @@ -431,8 +425,8 @@ print_header(char *prog_name, int * iparam) { iparam[IPARAM_IB], eps ); - printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n", - bound_header, peak_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header); + printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n", + bound_header, iparam[IPARAM_INVERSE] ? 
inverse_header : check_header); return; } @@ -482,7 +476,6 @@ static struct option long_options[] = {"sync", no_argument, 0, 's'}, {"ooc", no_argument, 0, 'o'}, {"gemm3m", no_argument, 0, 'G'}, - {"peak", no_argument, 0, '4'}, {"bound", no_argument, 0, '5'}, {0, 0, 0, 0} }; @@ -496,7 +489,7 @@ set_iparam_default(int *iparam){ iparam[IPARAM_THRDNBR ] = -1; iparam[IPARAM_THRDNBR_SUBGRP] = 1; iparam[IPARAM_M ] = -1; - iparam[IPARAM_N ] = 500; + iparam[IPARAM_N ] = -1; iparam[IPARAM_K ] = 1; iparam[IPARAM_LDA ] = -1; iparam[IPARAM_LDB ] = -1; @@ -611,7 +604,6 @@ parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, i case 's' : iparam[IPARAM_ASYNC ] = 0; break; case 'o' : iparam[IPARAM_OOC ] = 1; break; case 'G' : iparam[IPARAM_GEMM3M ] = 1; break; - case '4' : iparam[IPARAM_PEAK ] = 1; break; case '5' : iparam[IPARAM_BOUND ] = 1; break; case 'h' : case '?' : @@ -624,7 +616,8 @@ parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, i int main(int argc, char *argv[]) { - int i, m, mx, nx; + int i, m, n, mx, nx; + int status; int nbnode = 1; int start = 500; int stop = 5000; @@ -644,6 +637,7 @@ main(int argc, char *argv[]) { } #endif + n = iparam[IPARAM_N]; m = iparam[IPARAM_M]; mx = iparam[IPARAM_MX]; nx = iparam[IPARAM_NX]; @@ -709,26 +703,41 @@ main(int argc, char *argv[]) { if (step < 1) step = 1; - int status = Test( -1, iparam ); /* print header */ + status = Test( -1, iparam ); /* print header */ if (status != MORSE_SUCCESS) return status; - for (i = start; i <= stop; i += step) - { - if ( nx > 0 ) { - iparam[IPARAM_M] = i; - iparam[IPARAM_N] = chameleon_max(1, i/nx); - } else if ( mx > 0 ) { - iparam[IPARAM_M] = chameleon_max(1, i/mx); - iparam[IPARAM_N] = i; - } else { - if ( m == -1 ) + if ( n == -1 ){ + for (i = start; i <= stop; i += step) + { + if ( nx > 0 ) { iparam[IPARAM_M] = i; - iparam[IPARAM_N] = i; + iparam[IPARAM_N] = chameleon_max(1, i/nx); + } + else if ( mx > 0 ) { + iparam[IPARAM_M] = 
chameleon_max(1, i/mx); + iparam[IPARAM_N] = i; + } + else { + if ( m == -1 ) { + iparam[IPARAM_M] = i; + } + iparam[IPARAM_N] = i; + } + status = Test( iparam[IPARAM_N], iparam ); + if (status != MORSE_SUCCESS) { + return status; + } + success += status; + } + } + else { + if ( m == -1 ) { + iparam[IPARAM_M] = n; } - int status = Test( iparam[IPARAM_N], iparam ); + iparam[IPARAM_N] = n; + status = Test( iparam[IPARAM_N], iparam ); if (status != MORSE_SUCCESS) return status; success += status; } - MORSE_Finalize(); return success; } diff --git a/timing/timing.h b/timing/timing.h index 9eb12fdbcd6164f1ead0c590e83468287ebe0348..ef7357b3dfadfa8678fce12fab1110c29acd99fd 100644 --- a/timing/timing.h +++ b/timing/timing.h @@ -57,7 +57,6 @@ enum iparam_timing { /* Added for StarPU version */ IPARAM_PROFILE, IPARAM_PRINT_WARNINGS, - IPARAM_PEAK, IPARAM_PARALLEL_TASKS, IPARAM_NO_CPU, IPARAM_BOUND, @@ -80,7 +79,6 @@ enum dparam_timing { IPARAM_XNORM, IPARAM_RNORM, IPARAM_AinvNORM, - IPARAM_ESTIMATED_PEAK, IPARAM_RES, /* Begin section for hydra integration tool */ IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */