Commit bd9b8f15 authored by Mathieu Faverge's avatar Mathieu Faverge

Merge branch 'issue40' into 'master'

Cleanup the timing parameters and their documentation

Closes #40

See merge request !83
parents b7ab8c09 d1c68fb1
......@@ -165,33 +165,58 @@
#+end_src
List of main options that can be used in timing:
* ~--help~: show usage
* ~--threads~: Number of CPU workers (default:
~_SC_NPROCESSORS_ONLN~)
* ~--gpus~: number of GPU workers (default: ~0~)
* ~--n_range=R~: range of N values, with ~R=Start:Stop:Step~
(default: ~500:5000:500~)
* ~--m=X~: dimension (M) of the matrices (default: ~N~)
* ~--k=X~: dimension (K) of the matrices (default: ~1~), useful for
GEMM algorithm (k is the shared dimension and must be defined
>1 to consider matrices and not vectors)
* ~--nrhs=X~: number of right-hand size (default: ~1~)
* ~--nb=X~: block/tile size. (default: ~128~)
* ~--ib=X~: inner-blocking/IB size. (default: ~32~)
* ~--niter=X~: number of iterations performed for each test
(default: ~1~)
* ~--rhblk=X~: if X > 0, enable Householder mode for QR and LQ
factorization. X is the size of each subdomain (default: ~0~)
* ~--[no]check~: check result (default: ~nocheck~)
* ~--[no]profile~: print profiling informations (default:
~noprofile~)
* ~--[no]trace~: enable/disable trace generation (default: ~notrace~)
* ~--[no]dag~: enable/disable DAG generation (default: ~nodag~)
* ~--[no]inv~: check on inverse (default: ~noinv~)
* ~--nocpu~: all GPU kernels are exclusively executed on GPUs
* ~--ooc~: Enable out-of-core (available only with StarPU)
* ~--bound~: Compare result to area bound (available only with
StarPU) (default: ~0~)
* ~--help~: Show usage
* Machine parameters
* ~-t x, --threads=x~: Number of CPU workers (default: automatic
detection through runtime)
* ~-g x, --gpus=x~: Number of GPU workers (default: ~0~)
* ~-P x, --P=x~: Rows (P) in the PxQ process grid (default: ~1~)
* ~--nocpu~: All GPU kernels are exclusively executed on GPUs
* Matrix parameters
* ~-m x, --m=x, --M=x~: Dimension (M) of the matrices (default:
~N~)
* ~-n x, --n=x, --N=x~: Dimension (N) of the matrices
* ~-N R, --n_range=R~: Range of N values to time with
~R=Start:Stop:Step~ (default: ~500:5000:500~)
* ~-k x, --k=x, --K=x, --nrhs=x~: Dimension (K) of the matrices
or number of right-hand sides (default: ~1~). This is useful for
GEMM algorithms (k is the shared dimension and must be defined
>1 to consider matrices and not vectors)
* ~-b x, --nb=x~: NB size. (default: ~320~)
* ~-i x, --ib=x~: IB size. (default: ~32~)
* Check/prints
* ~--niter=x~: Number of iterations performed for each test
(default: ~1~)
* ~-W, --nowarning~: Do not show warnings
* ~-w, --nowarmup~: Cancel the warmup run to pre-load libraries
* ~-c, --check~: Check result
* ~-C, --inv~: Check on inverse
* ~--mode=x~ : Change the xLATMS matrix mode generation for
SVD/EVD (default: ~4~). It must be between 0 and 20 inclusive.
* Profiling parameters
* ~-T, --trace~: Enable trace generation
* ~--progress~: Display progress indicator
* ~-d, --dag~: Enable DAG generation. Generates a dot_dag_file.dot.
* ~-p, --profile~: Print profiling information
* HQR parameters
* ~-a x, --qr_a=x, --rhblk=x~: Define the size of the local TS
trees in Householder reduction trees for QR and LQ
factorization. x is the size of each subdomain (default: ~-1~)
* ~-l x, --llvl=x~: Tree used for low level reduction inside
nodes (default: ~-1~)
* ~-L x, --hlvl=x~: Tree used for high level reduction between
nodes, only if P > 1 (default: ~-1~). Possible values are -1:
Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4:
Replicated greedy.
* ~-D, --domino~: Enable the domino between upper and lower trees
* Advanced options
* ~--nobigmat~: Disable single large matrix allocation for
multiple tiled allocations
* ~-s, --sync~: Enable synchronous calls in wrapper function such
as POTRI
* ~-o, --ooc~: Enable out-of-core (available only with StarPU)
* ~-G, --gemm3m~: Use gemm3m complex method
* ~--bound~: Compare result to area bound
List of timing algorithms available:
* LANGE: norms of matrices
......
......@@ -83,36 +83,53 @@ copy from LAPACK matrix layout to tile matrix layout are necessary.
List of main options that can be used in timing:
@itemize @bullet
@item @option{--help}: show usage
@item @option{--threads}: Number of CPU workers (default:
@option{_SC_NPROCESSORS_ONLN})
@item @option{--gpus}: number of GPU workers (default: @option{0})
@item @option{--n_range=R}: range of N values, with
@option{R=Start:Stop:Step}
(default: @option{500:5000:500})
@item @option{--m=X}: dimension (M) of the matrices (default: @option{N})
@item @option{--k=X}: dimension (K) of the matrices (default: @option{1}),
useful for GEMM algorithm (k is the shared dimension and must be defined >1 to
consider matrices and not vectors)
@item @option{--nrhs=X}: number of right-hand size (default: @option{1})
@item @option{--nb=X}: block/tile size. (default: @option{128})
@item @option{--ib=X}: inner-blocking/IB size. (default: @option{32})
@item @option{--niter=X}: number of iterations performed for each test
(default: @option{1})
@item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ
factorization. X is the size of each subdomain (default: @option{0})
@item @option{--[no]check}: check result (default: @option{nocheck})
@item @option{--[no]profile}: print profiling informations (default:
@option{noprofile})
@item @option{--[no]trace}: enable/disable trace generation (default:
@option{notrace})
@item @option{--[no]dag}: enable/disable DAG generation (default:
@option{nodag})
@item @option{--[no]inv}: check on inverse (default: @option{noinv})
@item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs
@item @option{--ooc}: Enable out-of-core (available only with StarPU)
@item @option{--bound}: Compare result to area bound (available only with StarPU)
(default: @option{0})
@end itemize
@item Machine parameters
@itemize @bullet
@item @option{-t x, --threads=x}: Number of CPU workers (default: automatic detection through runtime)
@item @option{-g x, --gpus=x}: Number of GPU workers (default: @option{0})
@item @option{-P x, --P=x}: Rows (P) in the PxQ process grid (default: @option{1})
@item @option{--nocpu}: All GPU kernels are exclusively executed on GPUs (default: @option{0})
@end itemize
@item Matrix parameters
@itemize @bullet
@item @option{-m x, --m=x, --M=x}: Dimension (M) of the matrices (default: @option{N})
@item @option{-n x, --n=x, --N=x}: Dimension (N) of the matrices
@item @option{-N R, --n_range=R}: Range of N values to time with R=Start:Stop:Step (default: @option{500:5000:500})
@item @option{-k x, --k=x, --K=x, --nrhs=x}: Dimension (K) of the matrices or number of right-hand sides (default: @option{1}). This is useful for GEMM-like algorithms (k is the shared dimension and must be defined >1 to consider matrices and not vectors)
@item @option{-b x, --nb=x}: NB size. (default: @option{320})
@item @option{-i x, --ib=x}: IB size. (default: @option{32})
@end itemize
@item Check/prints
@itemize @bullet
@item @option{--niter=x}: Number of iterations performed for each test (default: @option{1})
@item @option{-W, --nowarnings}: Do not show warnings
@item @option{-w, --nowarmup}: Cancel the warmup run to pre-load libraries
@item @option{-c, --check}: Check result
@item @option{-C, --inv}: Check on inverse
@item @option{--mode=x}: Change the xLATMS matrix mode generation for SVD/EVD (default: @option{4}). It must be between 0 and 20 inclusive.
@end itemize
@item Profiling parameters
@itemize @bullet
@item @option{-T, --trace}: Enable trace generation
@item @option{--progress}: Display progress indicator
@item @option{-d, --dag}: Enable DAG generation. Generates a dot_dag_file.dot.
@item @option{-p, --profile}: Print profiling information
@end itemize
@item HQR parameters
@itemize @bullet
@item @option{-a x, --qr_a=x, --rhblk=x}: Define the size of the local TS trees in Householder reduction trees for QR and LQ factorization. x is the size of each subdomain (default: @option{-1})
@item @option{-l x, --llvl=x}: Tree used for low level reduction inside nodes (default: @option{-1})
@item @option{-L x, --hlvl=x}: Tree used for high level reduction between nodes, only if P > 1 (default: @option{-1}). Possible values are -1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy.
@item @option{-D, --domino}: Enable the domino between upper and lower trees
@end itemize
@item Advanced options
@itemize @bullet
@item @option{--nobigmat}: Disable single large matrix allocation for multiple tiled allocations
@item @option{-s, --sync}: Enable synchronous calls in wrapper function such as POTRI
@item @option{-o, --ooc}: Enable out-of-core (available only with StarPU)
@item @option{-G, --gemm3m}: Use gemm3m complex method
@item @option{--bound}: Compare result to area bound
@end itemize
List of timing algorithms available:
@itemize @bullet
......
......@@ -117,7 +117,6 @@ enum iparam_examples {
/* Added for StarPU version */
IPARAM_PROFILE,
IPARAM_PRINT_ERRORS,
IPARAM_PEAK,
IPARAM_PARALLEL_TASKS,
IPARAM_NO_CPU,
IPARAM_BOUND,
......@@ -132,7 +131,6 @@ enum dparam_examples {
IPARAM_XNORM,
IPARAM_RNORM,
IPARAM_AinvNORM,
IPARAM_ESTIMATED_PEAK,
IPARAM_RES,
/* Begin section for hydra integration tool */
IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */
......@@ -179,7 +177,6 @@ static void init_iparam(int iparam[IPARAM_SIZEOF]){
iparam[IPARAM_Q ] = 1;
iparam[IPARAM_PROFILE ] = 0;
iparam[IPARAM_PRINT_ERRORS ] = 0;
iparam[IPARAM_PEAK ] = 0;
iparam[IPARAM_PARALLEL_TASKS] = 0;
iparam[IPARAM_NO_CPU ] = 0;
iparam[IPARAM_BOUND ] = 0;
......@@ -210,7 +207,6 @@ static void print_header(char *prog_name, int * iparam) {
const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : "";
const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : "";
const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : "";
const char *peak_header = iparam[IPARAM_PEAK] ? " (% of peak) peak" : "";
#if defined(CHAMELEON_SIMULATION)
double eps = 0.;
#else
......@@ -235,8 +231,8 @@ static void print_header(char *prog_name, int * iparam) {
iparam[IPARAM_IB],
eps );
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n",
bound_header, peak_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n",
bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
printf( "# %5.0d %5.0d %5.0d\n", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_K]);
return;
}
......
......@@ -67,53 +67,11 @@
} \
} \
} \
void estimate_##name##_sustained_peak(double *res_peak) \
{ \
/* We use a heuristic where we assume that all GPUs have some work, and \
* that some CPUs may not have been involved. This may not be \
* applicable to small problems where only a subset of the processing \
* units are used. We assume that all CPUs are the same, so we multiply \
* the best performance obtained on a CPU by the number of CPUs, and we \
* add this to the sum of the performance obtained by the GPUs. */ \
double peak = 0.0; \
double best_cpu = 0.0; \
unsigned ncpus = 0; \
\
unsigned worker; \
for (worker = 0; worker < starpu_worker_get_count(); worker++) \
{ \
unsigned cpu_worker = (starpu_worker_get_type(worker) == STARPU_CPU_WORKER); \
if (cpu_worker) \
ncpus++; \
\
if (name##_perf[worker].n > 0) \
{ \
long n = name##_perf[worker].n; \
double sum = name##_perf[worker].sum; \
double avg = sum / n; \
\
if (cpu_worker) \
{ \
if (avg > best_cpu) \
best_cpu = avg; \
} \
else \
{ \
peak += avg; \
} \
} \
} \
\
peak += ncpus * best_cpu; \
\
*res_peak = peak; \
} \
#define CHAMELEON_CL_CB_HEADER(name) \
extern struct starpu_perfmodel*cl_##name##_save; \
extern struct starpu_perfmodel cl_##name##_fake; \
void cl_##name##_callback(); \
void profiling_display_##name##_info(void); \
void estimate_##name##_sustained_peak(double *res)
void profiling_display_##name##_info(void);
#endif /* __CODELET_PROFILE_H__ */
......@@ -221,34 +221,30 @@ Test(int64_t n, int *iparam) {
if ( MORSE_My_Mpi_Rank() == 0) {
printf( "%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd);
if (iparam[IPARAM_BOUND])
if (iparam[IPARAM_BOUND]) {
printf(" %9.2f", sumgf_upper/niter);
if ( iparam[IPARAM_PEAK] )
{
if (dparam[IPARAM_ESTIMATED_PEAK]<0.0f)
printf(" n/a n/a ");
else
printf(" %5.2f%% %9.2f ", 100.0f*(gflops/dparam[IPARAM_ESTIMATED_PEAK]), dparam[IPARAM_ESTIMATED_PEAK]);
}
if ( iparam[IPARAM_CHECK] ){
hres = ( dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ) > dparam[IPARAM_THRESHOLD_CHECK] );
if (hres)
if (hres) {
printf( "%8.5e %8.5e %8.5e %8.5e %8.5e FAILURE",
dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM],
dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ));
else
}
else {
printf( "%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS",
dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM],
dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ));
}
}
if ( iparam[IPARAM_INVERSE] )
if ( iparam[IPARAM_INVERSE] ) {
printf( " %8.5e %8.5e %8.5e %8.5e",
dparam[IPARAM_RNORM], dparam[IPARAM_ANORM], dparam[IPARAM_AinvNORM],
dparam[IPARAM_RNORM] /((dparam[IPARAM_ANORM] * dparam[IPARAM_AinvNORM])*n*eps));
}
printf("\n");
......@@ -387,7 +383,6 @@ show_help(char *prog_name) {
" -s, --sync Enable synchronous calls in wrapper function such as POTRI\n"
" -o, --ooc Enable out-of-core (available only with StarPU)\n"
" -G, --gemm3m Use gemm3m complex method\n"
//" --peak ?\n"todo
" --bound Compare result to area bound\n"
"\n");
}
......@@ -398,7 +393,6 @@ print_header(char *prog_name, int * iparam) {
const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : "";
const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : "";
const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : "";
const char *peak_header = iparam[IPARAM_PEAK] ? " (% of peak) peak" : "";
#if defined(CHAMELEON_SIMULATION)
_PREC eps = 0.;
#else
......@@ -431,8 +425,8 @@ print_header(char *prog_name, int * iparam) {
iparam[IPARAM_IB],
eps );
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n",
bound_header, peak_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n",
bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
return;
}
......@@ -482,7 +476,6 @@ static struct option long_options[] =
{"sync", no_argument, 0, 's'},
{"ooc", no_argument, 0, 'o'},
{"gemm3m", no_argument, 0, 'G'},
{"peak", no_argument, 0, '4'},
{"bound", no_argument, 0, '5'},
{0, 0, 0, 0}
};
......@@ -496,7 +489,7 @@ set_iparam_default(int *iparam){
iparam[IPARAM_THRDNBR ] = -1;
iparam[IPARAM_THRDNBR_SUBGRP] = 1;
iparam[IPARAM_M ] = -1;
iparam[IPARAM_N ] = 500;
iparam[IPARAM_N ] = -1;
iparam[IPARAM_K ] = 1;
iparam[IPARAM_LDA ] = -1;
iparam[IPARAM_LDB ] = -1;
......@@ -611,7 +604,6 @@ parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, i
case 's' : iparam[IPARAM_ASYNC ] = 0; break;
case 'o' : iparam[IPARAM_OOC ] = 1; break;
case 'G' : iparam[IPARAM_GEMM3M ] = 1; break;
case '4' : iparam[IPARAM_PEAK ] = 1; break;
case '5' : iparam[IPARAM_BOUND ] = 1; break;
case 'h' :
case '?' :
......@@ -624,7 +616,8 @@ parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, i
int
main(int argc, char *argv[]) {
int i, m, mx, nx;
int i, m, n, mx, nx;
int status;
int nbnode = 1;
int start = 500;
int stop = 5000;
......@@ -644,6 +637,7 @@ main(int argc, char *argv[]) {
}
#endif
n = iparam[IPARAM_N];
m = iparam[IPARAM_M];
mx = iparam[IPARAM_MX];
nx = iparam[IPARAM_NX];
......@@ -709,26 +703,41 @@ main(int argc, char *argv[]) {
if (step < 1) step = 1;
int status = Test( -1, iparam ); /* print header */
status = Test( -1, iparam ); /* print header */
if (status != MORSE_SUCCESS) return status;
for (i = start; i <= stop; i += step)
{
if ( nx > 0 ) {
iparam[IPARAM_M] = i;
iparam[IPARAM_N] = chameleon_max(1, i/nx);
} else if ( mx > 0 ) {
iparam[IPARAM_M] = chameleon_max(1, i/mx);
iparam[IPARAM_N] = i;
} else {
if ( m == -1 )
if ( n == -1 ){
for (i = start; i <= stop; i += step)
{
if ( nx > 0 ) {
iparam[IPARAM_M] = i;
iparam[IPARAM_N] = i;
iparam[IPARAM_N] = chameleon_max(1, i/nx);
}
else if ( mx > 0 ) {
iparam[IPARAM_M] = chameleon_max(1, i/mx);
iparam[IPARAM_N] = i;
}
else {
if ( m == -1 ) {
iparam[IPARAM_M] = i;
}
iparam[IPARAM_N] = i;
}
status = Test( iparam[IPARAM_N], iparam );
if (status != MORSE_SUCCESS) {
return status;
}
success += status;
}
}
else {
if ( m == -1 ) {
iparam[IPARAM_M] = n;
}
int status = Test( iparam[IPARAM_N], iparam );
iparam[IPARAM_N] = n;
status = Test( iparam[IPARAM_N], iparam );
if (status != MORSE_SUCCESS) return status;
success += status;
}
MORSE_Finalize();
return success;
}
......
......@@ -57,7 +57,6 @@ enum iparam_timing {
/* Added for StarPU version */
IPARAM_PROFILE,
IPARAM_PRINT_WARNINGS,
IPARAM_PEAK,
IPARAM_PARALLEL_TASKS,
IPARAM_NO_CPU,
IPARAM_BOUND,
......@@ -80,7 +79,6 @@ enum dparam_timing {
IPARAM_XNORM,
IPARAM_RNORM,
IPARAM_AinvNORM,
IPARAM_ESTIMATED_PEAK,
IPARAM_RES,
/* Begin section for hydra integration tool */
IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment