Commit 8301ebf0 authored by Antoine Jego's avatar Antoine Jego
Browse files

clearer stop codes, added 'super pruning' to see how tight pruning can be...

Clearer stop codes; added 'super pruning' to see how tight a pruning StarPU can handle (C: not that much; Fortran: as much as we want (why?)); more printing to pinpoint a segfault.
parent 3f9886dc
......@@ -35,6 +35,7 @@ program fstarpu_example_dgemm
logical :: trace = .false.
logical :: lflush = .false.
logical :: prune = .false.
logical :: super_prune = .false.
logical :: prune_handles = .false.
integer(c_int) :: comm_size, comm_rank
integer(c_int), target :: comm_world
......@@ -46,6 +47,7 @@ program fstarpu_example_dgemm
character(len=20) :: str
type(dsmat_type),target :: A, B, C
logical :: A_local, B_local, C_local
real(kind=c_double), target :: alpha, beta, zbeta
type(c_ptr) :: cl_mm, cl_fill
integer(c_int) :: ncpu
......@@ -68,6 +70,7 @@ program fstarpu_example_dgemm
write(*,'("fstarpu_mpi_init status:",i4)') ret
stop 2
end if
write(*,*) "initialized starpu+MPI!"
! stop there if no CPU worker available
ncpu = fstarpu_cpu_worker_get_count()
......@@ -98,20 +101,24 @@ program fstarpu_example_dgemm
do j = 8, command_argument_count()
call get_command_argument(j, value=str, length=i)
select case(str)
case('t')
case('-t')
trace = .true.
case('f')
case('-f')
lflush = .true.
case('p')
case('-p')
prune = .true.
case('h')
super_prune = .false.
case('-s')
prune = .false.
super_prune = .true.
case('-h')
prune_handles = .true.
end select
end do
if (mod(m,bs).ne.0) stop 75
if (mod(n,bs).ne.0) stop 75
if (mod(k,bs).ne.0) stop 75
if (mod(m,bs).ne.0) stop 65
if (mod(n,bs).ne.0) stop 64
if (mod(k,bs).ne.0) stop 63
if (p*q.ne.comm_size) stop 74
mb = m/bs
nb = n/bs
......@@ -124,7 +131,8 @@ program fstarpu_example_dgemm
write(*,'("PxQ = ",i3,"x",i3)') p,q
if (trace) write(*,*) "(T)racing enabled"
if (lflush) write(*,*) "(F)lushing enabled"
if (prune) write(*,*) "(P)runing enabled"
if (super_prune) then; write(*,*) "(S)uper-pruning enabled"
else if (prune) then ; write(*,*) "(P)runing enabled"; endif
if (prune_handles) write(*,*) "(H)andles pruning enabled"
write(*,'("========================================")')
end if
......@@ -153,9 +161,11 @@ program fstarpu_example_dgemm
do i=1,mb
do j=1,nb
do l=1,kb
if (.not.prune.or.(A%blocks(i,l)%owner == comm_rank.or.&
B%blocks(l,j)%owner == comm_rank.or.&
C%blocks(i,j)%owner == comm_rank)) then
! Ownership flags for the three blocks involved in the update
! C(i,j) += A(i,l) * B(l,j), where i runs over mb, j over nb and l over kb.
! Each matrix is indexed with its own block coordinates: A by (i,l),
! B by (l,j) and C by (i,j). Indexing B with (i,l) or C with (l,j) tests the
! ownership of an unrelated block and may read outside the block array when
! mb, nb and kb differ.
A_local = A%blocks(i,l)%owner == comm_rank
B_local = B%blocks(l,j)%owner == comm_rank
C_local = C%blocks(i,j)%owner == comm_rank
if ( (.not.super_prune.or.(C_local.or.l<=q.or.l<=p)).and.&
(.not.prune.or.(A_local.or.B_local.or.C_local))) then
! if (comm_rank.eq.0) write(*,*) "GEMM", b_col,b_row,b_aisle
if (l.eq.1) then; zbeta = beta; else; zbeta = 1.0d0; end if
call fstarpu_mpi_task_insert((/ c_loc(comm_world), cl_mm, &
......
......@@ -35,8 +35,7 @@
#include "optional_tile_interface.h"
/* STARPU_EXAMPLE_DGEMM default values */
#define STARPU_EXAMPLE_DGEMM_TRACE 0
#define STARPU_EXAMPLE_DGEMM_OWNDATATYPE 0
// line 142 and below
/* From StarPU starpu/tests/helper.c */
#define STARPU_TEST_SKIPPED 77
......@@ -44,7 +43,7 @@
//const char *argp_program_version = "standalone 0.2";
//const char *argp_program_bug_address
static char doc[] = "Standalone DGEMM using StarPU-MPI";
static char args_doc[] = "-m [m] -n [n] -k [k] -b [b] -p [p] -q [q] --niter [l] [--check] [--trace] [--datatype] [--mpi-thread [t]] [--no-flush] [--prune] [--prune-handles] ";;
/* One-line synopsis of the accepted command-line options. */
static char args_doc[] = "-m [m] -n [n] -k [k] -b [b] -p [p] -q [q] --niter [l] [--check] [--trace] [--datatype] [--mpi-thread [t]] [--no-flush] [--prune] [--prune-handles] [--super-prune]";
static struct argp_option options[] = {
{"m", 'm', "int", 0, "Number of rows in A and C (deprecated)" },
{"n", 'n', "int", 0, "Dimension of A B and C" },
......@@ -59,7 +58,8 @@ static struct argp_option options[] = {
{"mpi-thread", 't', "int", 0, "MPI thread level support : -1 StarPU, 0 SINGLE, 1 FUNNELED, 2 SERIALIZED, 3 MULTIPLE"},
{"datatype", 'd', 0, 0, "Whether or not to use our own datatype implementation"},
{"no-flush", 's', 0, 0, "If handed out to the program, do not flush anything until computation has completed."},
{"prune", 'r', 0, 0, "If handed out to the program, prune the DAG as tightly as possible."},
{"prune", 'r', 0, 0, "If handed out to the program, prune the DAG tightly enough (relying on STARPU+mpi cache to avoid redundant data transfers)."},
{"super-prune", 'R', 0, 0, "If handed out to the program, prune the DAG as tightly as possible."},
{"prune-handles",'z', 0, 0, "If handed out to the program, prune the handle registration."},
{ 0 }
};
......@@ -72,7 +72,7 @@ struct arguments
int check,verbose,trace;
int niter;
int mpi_thread, datatype;
int no_flush, prune, prune_handles;
int no_flush, prune, super_prune, prune_handles;
};
......@@ -123,6 +123,11 @@ parse_opt (int key, char *arg, struct argp_state *state)
break;
case 'r':
arguments->prune = 1;
arguments->super_prune = 0;
break;
case 'R':
arguments->super_prune = 1;
arguments->prune = 0;
break;
case 'z':
arguments->prune_handles = 1;
......@@ -139,17 +144,17 @@ static int M = 1024; /* Matrix size */
static int N = 1024; /* Matrix size */
static int K = 1024; /* Matrix size */
static int BS = 512; /* Block size */
static int P = 2; /* height of the grid */
static int Q = 2; /* width of the grid */
static int P = 1; /* height of the grid */
static int Q = 1; /* width of the grid */
static int T = 1; /* number of runs */
static int check = 0;
static int trace = STARPU_EXAMPLE_DGEMM_TRACE; /* whether to trace */
static int datatype =
STARPU_EXAMPLE_DGEMM_OWNDATATYPE; /* whether to register our own datatype */
static int trace = 0; /* whether to trace */
static int datatype = 0; /* whether to register our own datatype */
static int mpi_thread = -1; /* MPI thread level support: -1 StarPU, 0 SINGLE, 1 FUNNELED, 2 SERIALIZED, 3 MULTIPLE */
static int verbose = 0;
static int flush = 1;
static int prune = 0;
static int super_prune = 0;
static int prune_handles = 0;
#define MB ((M)/(BS)) /* Number of blocks */
......@@ -203,7 +208,7 @@ static void register_matrices(int prune_handles)
register_matrix(A,A_h,&tag,MB,KB,datatype,prune_handles,P,Q,1,0,check);
register_matrix(B,B_h,&tag,KB,NB,datatype,prune_handles,P,Q,0,1,check);
//register_matrix(C,C_h,&tag,MB,NB,datatype,prune_handles,P,Q,0,0);
// FIXME :
// FIXME (starpu-side) :
// the previous call seems logical because we do not need to know
// about blocks of C we do not contribute to; however, StarPU seems to
// stay pending on task insertion if we do not know about the blocks on our row/column, even if we do not contribute to them
......@@ -463,6 +468,7 @@ int main(int argc, char *argv[])
arguments.datatype = datatype;
arguments.no_flush = 0;
arguments.prune = 0;
arguments.super_prune = 0;
arguments.prune_handles = 0;
argp_parse(&argp, argc, argv, 0, 0, &arguments);
......@@ -480,6 +486,7 @@ int main(int argc, char *argv[])
datatype = arguments.datatype;
flush = !arguments.no_flush;
prune = arguments.prune;
super_prune = arguments.super_prune;
prune_handles = arguments.prune_handles;
/* Initializes StarPU and the StarPU-MPI layer */
......@@ -542,7 +549,8 @@ int main(int argc, char *argv[])
if (datatype) printf("- MPI datatype enabled\n");
if (mpi_thread > -1) printf("- MPI thread support level : %d\n", provided_mpi_thread);
if (!flush) printf("- Flushing disabled\n");
if (prune) printf("- Pruning enabled\n");
if (super_prune) printf("- Super-Pruning enabled\n");
else if (prune) printf("- Pruning enabled\n");
if (prune_handles) printf("- Handle pruning enabled\n");
}
int barrier_ret, trial;
......@@ -570,6 +578,7 @@ int main(int argc, char *argv[])
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
start = starpu_timing_now();
int a_local, b_local, c_local;
int b_row,b_col,b_aisle;
for (b_row = 0; b_row < MB; b_row++)
{
......@@ -577,10 +586,16 @@ int main(int argc, char *argv[])
{
for (b_aisle=0;b_aisle<KB;b_aisle++)
{
// this just needs to be clarified
a_local = A->blocks[b_row*KB+b_aisle].owner == comm_rank;
b_local = B->blocks[b_aisle*NB+b_col].owner == comm_rank;
c_local = C->blocks[ b_row*NB+b_col].owner == comm_rank;
// when prune and/or prune_handles are allowed needs to be clarified
//if ((!prune && !prune_handles) || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
if (!prune || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
//printf("[%d] inserting C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
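// TODO : logic might be written more clearly (a/b/c_local may be redundant)
// NOTE(review): b_aisle is 0-based here whereas the Fortran loop index l is 1-based,
// so `b_aisle <= Q` keeps one more aisle than the Fortran test `l<=q` -- confirm which bound is intended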
// FIXME (starpu-side) : fortran allows this but not C
// (note : inserting tasks *is* different but it does not feel *that* different)
if ((!super_prune || (c_local || b_aisle <= Q || b_aisle <= P )) &&
(!prune || (a_local || b_local || c_local))) {
struct cl_zgemm_args_s *clargs = NULL;
if (C->blocks[b_row*NB+b_col].owner == comm_rank) {
clargs = malloc(sizeof( struct cl_zgemm_args_s ));
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment