Mentions légales du service

Skip to content
Snippets Groups Projects
Commit bd9b8f15 authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'issue40' into 'master'

Cleanup the timing parameters and their documentation

Closes #40

See merge request !83
parents b7ab8c09 d1c68fb1
No related branches found
No related tags found
1 merge request!83Cleanup the timing parameters and their documentation
......@@ -165,33 +165,58 @@
#+end_src
List of main options that can be used in timing:
* ~--help~: show usage
* ~--threads~: Number of CPU workers (default:
~_SC_NPROCESSORS_ONLN~)
* ~--gpus~: number of GPU workers (default: ~0~)
* ~--n_range=R~: range of N values, with ~R=Start:Stop:Step~
(default: ~500:5000:500~)
* ~--m=X~: dimension (M) of the matrices (default: ~N~)
* ~--k=X~: dimension (K) of the matrices (default: ~1~), useful for
GEMM algorithm (k is the shared dimension and must be defined
>1 to consider matrices and not vectors)
* ~--nrhs=X~: number of right-hand size (default: ~1~)
* ~--nb=X~: block/tile size. (default: ~128~)
* ~--ib=X~: inner-blocking/IB size. (default: ~32~)
* ~--niter=X~: number of iterations performed for each test
(default: ~1~)
* ~--rhblk=X~: if X > 0, enable Householder mode for QR and LQ
factorization. X is the size of each subdomain (default: ~0~)
* ~--[no]check~: check result (default: ~nocheck~)
* ~--[no]profile~: print profiling informations (default:
~noprofile~)
* ~--[no]trace~: enable/disable trace generation (default: ~notrace~)
* ~--[no]dag~: enable/disable DAG generation (default: ~nodag~)
* ~--[no]inv~: check on inverse (default: ~noinv~)
* ~--nocpu~: all GPU kernels are exclusively executed on GPUs
* ~--ooc~: Enable out-of-core (available only with StarPU)
* ~--bound~: Compare result to area bound (available only with
StarPU) (default: ~0~)
* ~--help~: Show usage
* Machine parameters
* ~-t x, --threads=x~: Number of CPU workers (default: automatic
detection through runtime)
* ~-g x, --gpus=x~: Number of GPU workers (default: ~0~)
* ~-P x, --P=x~: Rows (P) in the PxQ process grid (default: ~1~)
* ~--nocpu~: All GPU kernels are exclusively executed on GPUs
* Matrix parameters
* ~-m x, --m=x, --M=x~: Dimension (M) of the matrices (default:
~N~)
* ~-n x, --n=x, --N=x~: Dimension (N) of the matrices
* ~-N R, --n_range=R~: Range of N values to time with
~R=Start:Stop:Step~ (default: ~500:5000:500~)
* ~-k x, --k=x, --K=x, --nrhs=x~: Dimension (K) of the matrices
or number of right-hand sides (default: ~1~). This is useful for
GEMM algorithms (k is the shared dimension and must be defined
>1 to consider matrices and not vectors)
* ~-b x, --nb=x~: NB size. (default: ~320~)
* ~-i x, --ib=x~: IB size. (default: ~32~)
* Check/prints
* ~--niter=X~: Number of iterations performed for each test
(default: ~1~)
* ~-W, --nowarning~: Do not show warnings
* ~-w, --nowarmup~: Cancel the warmup run to pre-load libraries
* ~-c, --check~: Check result
* ~-C, --inv~: Check on inverse
* ~--mode=x~ : Change the xLATMS matrix mode generation for
SVD/EVD (default: ~4~). It must be between 0 and 20 inclusive.
* Profiling parameters
* ~-T, --trace~: Enable trace generation
* ~--progress~: Display progress indicator
* ~-d, --dag~: Enable DAG generation. Generates a dot_dag_file.dot.
* ~-p, --profile~: Print profiling information
* HQR parameters
* ~-a x, --qr_a=x, --rhblk=x~: Define the size of the local TS
trees in Householder reduction trees for QR and LQ
factorization. x is the size of each subdomain (default: ~-1~)
* ~-l x, --llvl=x~: Tree used for low level reduction inside
nodes (default: ~-1~)
* ~-L x, --hlvl=x~: Tree used for high level reduction between
nodes, only if P > 1 (default: ~-1~). Possible values are -1:
Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4:
Replicated greedy.
* ~-D, --domino~: Enable the domino between upper and lower trees
* Advanced options
* ~--nobigmat~: Disable single large matrix allocation for
multiple tiled allocations
* ~-s, --sync~: Enable synchronous calls in wrapper function such
as POTRI
* ~-o, --ooc~: Enable out-of-core (available only with StarPU)
* ~-G, --gemm3m~: Use gemm3m complex method
* ~--bound~: Compare result to area bound
List of timing algorithms available:
* LANGE: norms of matrices
......
......@@ -83,36 +83,53 @@ copy from LAPACK matrix layout to tile matrix layout are necessary.
List of main options that can be used in timing:
@itemize @bullet
@item @option{--help}: show usage
@item @option{--threads}: Number of CPU workers (default:
@option{_SC_NPROCESSORS_ONLN})
@item @option{--gpus}: number of GPU workers (default: @option{0})
@item @option{--n_range=R}: range of N values, with
@option{R=Start:Stop:Step}
(default: @option{500:5000:500})
@item @option{--m=X}: dimension (M) of the matrices (default: @option{N})
@item @option{--k=X}: dimension (K) of the matrices (default: @option{1}),
useful for GEMM algorithm (k is the shared dimension and must be defined >1 to
consider matrices and not vectors)
@item @option{--nrhs=X}: number of right-hand size (default: @option{1})
@item @option{--nb=X}: block/tile size. (default: @option{128})
@item @option{--ib=X}: inner-blocking/IB size. (default: @option{32})
@item @option{--niter=X}: number of iterations performed for each test
(default: @option{1})
@item @option{--rhblk=X}: if X > 0, enable Householder mode for QR and LQ
factorization. X is the size of each subdomain (default: @option{0})
@item @option{--[no]check}: check result (default: @option{nocheck})
@item @option{--[no]profile}: print profiling informations (default:
@option{noprofile})
@item @option{--[no]trace}: enable/disable trace generation (default:
@option{notrace})
@item @option{--[no]dag}: enable/disable DAG generation (default:
@option{nodag})
@item @option{--[no]inv}: check on inverse (default: @option{noinv})
@item @option{--nocpu}: all GPU kernels are exclusively executed on GPUs
@item @option{--ooc}: Enable out-of-core (available only with StarPU)
@item @option{--bound}: Compare result to area bound (available only with StarPU)
(default: @option{0})
@end itemize
@item Machine parameters
@itemize @bullet
@item @option{-t x, --threads=x}: Number of CPU workers (default: automatic detection through runtime)
@item @option{-g x, --gpus=x}: Number of GPU workers (default: @option{0})
@item @option{-P x, --P=x}: Rows (P) in the PxQ process grid (default: @option{1})
@item @option{--nocpu}: All GPU kernels are exclusively executed on GPUs (default: @option{0})
@end itemize
@item Matrix parameters
@itemize @bullet
@item @option{-m x, --m=x, --M=x}: Dimension (M) of the matrices (default: @option{N})
@item @option{-n x, --n=x, --N=x}: Dimension (N) of the matrices
@item @option{-N R, --n_range=R}: Range of N values to time with R=Start:Stop:Step (default: @option{500:5000:500})
@item @option{-k x, --k=x, --K=x, --nrhs=x}: Dimension (K) of the matrices or number of right-hand sides (default: @option{1}). This is useful for GEMM-like algorithms (k is the shared dimension and must be defined >1 to consider matrices and not vectors)
@item @option{-b x, --nb=x}: NB size. (default: @option{320})
@item @option{-i x, --ib=x}: IB size. (default: @option{32})
@end itemize
@item Check/prints
@itemize @bullet
@item @option{--niter=x}: number of iterations performed for each test (default: @option{1})
@item @option{-W, --nowarnings}: Do not show warnings
@item @option{-w, --nowarmup}: Cancel the warmup run to pre-load libraries
@item @option{-c, --check}: Check result
@item @option{-C, --inv}: Check on inverse
@item @option{--mode=x}: Change the xLATMS matrix mode generation for SVD/EVD (default: @option{4}). It must be between 0 and 20 inclusive.
@end itemize
@item Profiling parameters
@itemize @bullet
@item @option{-T, --trace}: Enable trace generation
@item @option{--progress}: Display progress indicator
@item @option{-d, --dag}: Enable DAG generation. Generates a dot_dag_file.dot.
@item @option{-p, --profile}: Print profiling information
@end itemize
@item HQR parameters
@itemize @bullet
@item @option{-a x, --qr_a=x, --rhblk=x}: Define the size of the local TS trees in Householder reduction trees for QR and LQ factorization. x is the size of each subdomain (default: @option{-1})
@item @option{-l x, --llvl=x}: Tree used for low level reduction inside nodes (default: @option{-1})
@item @option{-L x, --hlvl=x}: Tree used for high level reduction between nodes, only if P > 1 (default: @option{-1}). Possible values are -1: Automatic, 0: Flat, 1: Greedy, 2: Fibonacci, 3: Binary, 4: Replicated greedy.
@item @option{-D, --domino}: Enable the domino between upper and lower trees
@end itemize
@item Advanced options
@itemize @bullet
@item @option{--nobigmat}: Disable single large matrix allocation for multiple tiled allocations
@item @option{-s, --sync}: Enable synchronous calls in wrapper function such as POTRI
@item @option{-o, --ooc}: Enable out-of-core (available only with StarPU)
@item @option{-G, --gemm3m}: Use gemm3m complex method
@item @option{--bound}: Compare result to area bound
@end itemize
List of timing algorithms available:
@itemize @bullet
......
......@@ -117,7 +117,6 @@ enum iparam_examples {
/* Added for StarPU version */
IPARAM_PROFILE,
IPARAM_PRINT_ERRORS,
IPARAM_PEAK,
IPARAM_PARALLEL_TASKS,
IPARAM_NO_CPU,
IPARAM_BOUND,
......@@ -132,7 +131,6 @@ enum dparam_examples {
IPARAM_XNORM,
IPARAM_RNORM,
IPARAM_AinvNORM,
IPARAM_ESTIMATED_PEAK,
IPARAM_RES,
/* Begin section for hydra integration tool */
IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */
......@@ -179,7 +177,6 @@ static void init_iparam(int iparam[IPARAM_SIZEOF]){
iparam[IPARAM_Q ] = 1;
iparam[IPARAM_PROFILE ] = 0;
iparam[IPARAM_PRINT_ERRORS ] = 0;
iparam[IPARAM_PEAK ] = 0;
iparam[IPARAM_PARALLEL_TASKS] = 0;
iparam[IPARAM_NO_CPU ] = 0;
iparam[IPARAM_BOUND ] = 0;
......@@ -210,7 +207,6 @@ static void print_header(char *prog_name, int * iparam) {
const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : "";
const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : "";
const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : "";
const char *peak_header = iparam[IPARAM_PEAK] ? " (% of peak) peak" : "";
#if defined(CHAMELEON_SIMULATION)
double eps = 0.;
#else
......@@ -235,8 +231,8 @@ static void print_header(char *prog_name, int * iparam) {
iparam[IPARAM_IB],
eps );
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n",
bound_header, peak_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n",
bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
printf( "# %5.0d %5.0d %5.0d\n", iparam[IPARAM_N], iparam[IPARAM_N], iparam[IPARAM_K]);
return;
}
......
......@@ -67,53 +67,11 @@
} \
} \
} \
void estimate_##name##_sustained_peak(double *res_peak) \
{ \
/* We use a heuristic where we assume that all GPUs have some work, and \
* that some CPUs may not have been involved. This may not be \
* applicable to small problems where only a subset of the processing \
* units are used. We assume that all CPUs are the same, so we multiply \
* the best performance obtained on a CPU by the number of CPUs, and we \
* add this to the sum of the performance obtained by the GPUs. */ \
double peak = 0.0; \
double best_cpu = 0.0; \
unsigned ncpus = 0; \
\
unsigned worker; \
for (worker = 0; worker < starpu_worker_get_count(); worker++) \
{ \
unsigned cpu_worker = (starpu_worker_get_type(worker) == STARPU_CPU_WORKER); \
if (cpu_worker) \
ncpus++; \
\
if (name##_perf[worker].n > 0) \
{ \
long n = name##_perf[worker].n; \
double sum = name##_perf[worker].sum; \
double avg = sum / n; \
\
if (cpu_worker) \
{ \
if (avg > best_cpu) \
best_cpu = avg; \
} \
else \
{ \
peak += avg; \
} \
} \
} \
\
peak += ncpus * best_cpu; \
\
*res_peak = peak; \
} \
#define CHAMELEON_CL_CB_HEADER(name) \
extern struct starpu_perfmodel*cl_##name##_save; \
extern struct starpu_perfmodel cl_##name##_fake; \
void cl_##name##_callback(); \
void profiling_display_##name##_info(void); \
void estimate_##name##_sustained_peak(double *res)
void profiling_display_##name##_info(void);
#endif /* __CODELET_PROFILE_H__ */
......@@ -221,34 +221,30 @@ Test(int64_t n, int *iparam) {
if ( MORSE_My_Mpi_Rank() == 0) {
printf( "%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd);
if (iparam[IPARAM_BOUND])
if (iparam[IPARAM_BOUND]) {
printf(" %9.2f", sumgf_upper/niter);
if ( iparam[IPARAM_PEAK] )
{
if (dparam[IPARAM_ESTIMATED_PEAK]<0.0f)
printf(" n/a n/a ");
else
printf(" %5.2f%% %9.2f ", 100.0f*(gflops/dparam[IPARAM_ESTIMATED_PEAK]), dparam[IPARAM_ESTIMATED_PEAK]);
}
if ( iparam[IPARAM_CHECK] ){
hres = ( dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ) > dparam[IPARAM_THRESHOLD_CHECK] );
if (hres)
if (hres) {
printf( "%8.5e %8.5e %8.5e %8.5e %8.5e FAILURE",
dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM],
dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ));
else
}
else {
printf( "%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS",
dparam[IPARAM_RES], dparam[IPARAM_ANORM], dparam[IPARAM_XNORM], dparam[IPARAM_BNORM],
dparam[IPARAM_RES] / n / eps / (dparam[IPARAM_ANORM] * dparam[IPARAM_XNORM] + dparam[IPARAM_BNORM] ));
}
}
if ( iparam[IPARAM_INVERSE] )
if ( iparam[IPARAM_INVERSE] ) {
printf( " %8.5e %8.5e %8.5e %8.5e",
dparam[IPARAM_RNORM], dparam[IPARAM_ANORM], dparam[IPARAM_AinvNORM],
dparam[IPARAM_RNORM] /((dparam[IPARAM_ANORM] * dparam[IPARAM_AinvNORM])*n*eps));
}
printf("\n");
......@@ -387,7 +383,6 @@ show_help(char *prog_name) {
" -s, --sync Enable synchronous calls in wrapper function such as POTRI\n"
" -o, --ooc Enable out-of-core (available only with StarPU)\n"
" -G, --gemm3m Use gemm3m complex method\n"
//" --peak ?\n"todo
" --bound Compare result to area bound\n"
"\n");
}
......@@ -398,7 +393,6 @@ print_header(char *prog_name, int * iparam) {
const char *bound_header = iparam[IPARAM_BOUND] ? " thGflop/s" : "";
const char *check_header = iparam[IPARAM_CHECK] ? " ||Ax-b|| ||A|| ||x|| ||b|| ||Ax-b||/N/eps/(||A||||x||+||b||) RETURN" : "";
const char *inverse_header = iparam[IPARAM_INVERSE] ? " ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" : "";
const char *peak_header = iparam[IPARAM_PEAK] ? " (% of peak) peak" : "";
#if defined(CHAMELEON_SIMULATION)
_PREC eps = 0.;
#else
......@@ -431,8 +425,8 @@ print_header(char *prog_name, int * iparam) {
iparam[IPARAM_IB],
eps );
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n",
bound_header, peak_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
printf( "# M N K/NRHS seconds Gflop/s Deviation%s%s\n",
bound_header, iparam[IPARAM_INVERSE] ? inverse_header : check_header);
return;
}
......@@ -482,7 +476,6 @@ static struct option long_options[] =
{"sync", no_argument, 0, 's'},
{"ooc", no_argument, 0, 'o'},
{"gemm3m", no_argument, 0, 'G'},
{"peak", no_argument, 0, '4'},
{"bound", no_argument, 0, '5'},
{0, 0, 0, 0}
};
......@@ -496,7 +489,7 @@ set_iparam_default(int *iparam){
iparam[IPARAM_THRDNBR ] = -1;
iparam[IPARAM_THRDNBR_SUBGRP] = 1;
iparam[IPARAM_M ] = -1;
iparam[IPARAM_N ] = 500;
iparam[IPARAM_N ] = -1;
iparam[IPARAM_K ] = 1;
iparam[IPARAM_LDA ] = -1;
iparam[IPARAM_LDB ] = -1;
......@@ -611,7 +604,6 @@ parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, i
case 's' : iparam[IPARAM_ASYNC ] = 0; break;
case 'o' : iparam[IPARAM_OOC ] = 1; break;
case 'G' : iparam[IPARAM_GEMM3M ] = 1; break;
case '4' : iparam[IPARAM_PEAK ] = 1; break;
case '5' : iparam[IPARAM_BOUND ] = 1; break;
case 'h' :
case '?' :
......@@ -624,7 +616,8 @@ parse_arguments(int *_argc, char ***_argv, int *iparam, int *start, int *stop, i
int
main(int argc, char *argv[]) {
int i, m, mx, nx;
int i, m, n, mx, nx;
int status;
int nbnode = 1;
int start = 500;
int stop = 5000;
......@@ -644,6 +637,7 @@ main(int argc, char *argv[]) {
}
#endif
n = iparam[IPARAM_N];
m = iparam[IPARAM_M];
mx = iparam[IPARAM_MX];
nx = iparam[IPARAM_NX];
......@@ -709,26 +703,41 @@ main(int argc, char *argv[]) {
if (step < 1) step = 1;
int status = Test( -1, iparam ); /* print header */
status = Test( -1, iparam ); /* print header */
if (status != MORSE_SUCCESS) return status;
for (i = start; i <= stop; i += step)
{
if ( nx > 0 ) {
iparam[IPARAM_M] = i;
iparam[IPARAM_N] = chameleon_max(1, i/nx);
} else if ( mx > 0 ) {
iparam[IPARAM_M] = chameleon_max(1, i/mx);
iparam[IPARAM_N] = i;
} else {
if ( m == -1 )
if ( n == -1 ){
for (i = start; i <= stop; i += step)
{
if ( nx > 0 ) {
iparam[IPARAM_M] = i;
iparam[IPARAM_N] = i;
iparam[IPARAM_N] = chameleon_max(1, i/nx);
}
else if ( mx > 0 ) {
iparam[IPARAM_M] = chameleon_max(1, i/mx);
iparam[IPARAM_N] = i;
}
else {
if ( m == -1 ) {
iparam[IPARAM_M] = i;
}
iparam[IPARAM_N] = i;
}
status = Test( iparam[IPARAM_N], iparam );
if (status != MORSE_SUCCESS) {
return status;
}
success += status;
}
}
else {
if ( m == -1 ) {
iparam[IPARAM_M] = n;
}
int status = Test( iparam[IPARAM_N], iparam );
iparam[IPARAM_N] = n;
status = Test( iparam[IPARAM_N], iparam );
if (status != MORSE_SUCCESS) return status;
success += status;
}
MORSE_Finalize();
return success;
}
......
......@@ -57,7 +57,6 @@ enum iparam_timing {
/* Added for StarPU version */
IPARAM_PROFILE,
IPARAM_PRINT_WARNINGS,
IPARAM_PEAK,
IPARAM_PARALLEL_TASKS,
IPARAM_NO_CPU,
IPARAM_BOUND,
......@@ -80,7 +79,6 @@ enum dparam_timing {
IPARAM_XNORM,
IPARAM_RNORM,
IPARAM_AinvNORM,
IPARAM_ESTIMATED_PEAK,
IPARAM_RES,
/* Begin section for hydra integration tool */
IPARAM_THRESHOLD_CHECK, /* Maximum value accepted for: |Ax-b||/N/eps/(||A||||x||+||b||) */
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment