Commit 0045debc authored by Jego Antoine's avatar Jego Antoine
Browse files

add outer code

parent b887214c
cmake_minimum_required (VERSION 3.3)
project(starpu_example_dgemm C Fortran)
add_compile_options(-Wall -Wextra -pedantic) # -Werror)
#add_compile_options(-Wall -Wextra -pedantic) # -Werror)
# Check that we do no try to configure/build inside the source directory
# ----------------------------------------------------------------------
if( ${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR} )
......
......@@ -42,6 +42,7 @@ program fstarpu_example_dgemm
logical :: provide_context = .false.
logical :: warmup = .true.
logical :: codelet_args = .false.
logical :: outer = .false.
integer(c_int) :: comm_size, comm_rank
integer(c_int), target :: comm_world
......@@ -131,6 +132,8 @@ program fstarpu_example_dgemm
warmup = .false.
case('-a')
codelet_args = .true.
case('-o')
outer = .true.
! keep -e as an empty argument for debug purpose
end select
end do
......@@ -157,6 +160,7 @@ program fstarpu_example_dgemm
if (provide_context) write(*,*) "(C)ontext provided at submission"
if (.not.warmup) write(*,*) "(W)armup disabled"
if (codelet_args) write(*,*) "(A)rguments NOT by values"
if (outer) write(*,*) "(O)uter product submission"
write(*,'("========================================")')
end if
ret = fstarpu_mpi_barrier(comm_world)
......@@ -189,9 +193,77 @@ program fstarpu_example_dgemm
call system_clock(ts)
! submit matrix multiplication
do i=1,mb
do j=1,nb
do l=1,kb
if (outer) then
do l=1,kb
do i=1,mb
do j=1,nb
call submit_gemm_task()
end do
end do
if (lflush) then
do i=1,mb
if (c_associated(A%blocks(i,l)%h)) &
call fstarpu_mpi_cache_flush(comm_world,A%blocks(i,l)%h)
end do
do j=1,nb
if (c_associated(B%blocks(l,j)%h)) &
call fstarpu_mpi_cache_flush(comm_world,B%blocks(l,j)%h)
end do
end if
end do
else
do i=1,mb
do j=1,nb
do l=1,kb
call submit_gemm_task()
end do
end do
if (lflush) then
do l=1,kb
if (c_associated(A%blocks(i,l)%h)) &
call fstarpu_mpi_cache_flush(comm_world,A%blocks(i,l)%h)
end do
end if
end do
end if
ret = fstarpu_mpi_wait_for_all(comm_world)
ret = fstarpu_mpi_barrier(comm_world)
call system_clock(te,tr)
tf = max(real(te-ts)/real(tr),1e-20)
gflops = 2.0*m*n*k/(tf*10**9)
if (comm_rank.eq.0.and.(.not.warmup.or.trial.gt.1)) &
write(*,'("RANK ",i3," -> took ",e15.8," s | ", e15.8," Gflop/s")') &
comm_rank, tf, gflops
call fstarpu_mpi_cache_flush_all_data(comm_world)
! unregister matrices
call unregister_matrix(A,mb,kb)
call unregister_matrix(B,kb,nb)
call unregister_matrix(C,mb,nb)
end do
if (provide_context) then
call fstarpu_sched_ctx_delete(ctx)
deallocate(procs)
endif
call fstarpu_codelet_free(cl_mm)
call fstarpu_codelet_free(cl_mm_args)
call fstarpu_codelet_free(cl_fill)
call fstarpu_shutdown()
ret = fstarpu_mpi_shutdown()
if (ret /= 0) then
write(*,'("fstarpu_mpi_shutdown status:",i4)') ret
stop 1
end if
contains
subroutine submit_gemm_task()
implicit none
A_local = A%blocks(i,l)%owner == comm_rank
B_local = B%blocks(l,j)%owner == comm_rank
C_local = C%blocks(i,j)%owner == comm_rank
......@@ -236,50 +308,7 @@ program fstarpu_example_dgemm
else
!could write something
end if
end do
end do
if (lflush) then
do l=1,kb
if (c_associated(A%blocks(i,l)%h)) &
call fstarpu_mpi_cache_flush(comm_world,A%blocks(i,l)%h)
end do
end if
end do
ret = fstarpu_mpi_wait_for_all(comm_world)
ret = fstarpu_mpi_barrier(comm_world)
call system_clock(te,tr)
tf = max(real(te-ts)/real(tr),1e-20)
gflops = 2.0*m*n*k/(tf*10**9)
if (comm_rank.eq.0.and.(.not.warmup.or.trial.gt.1)) &
write(*,'("RANK ",i3," -> took ",e15.8," s | ", e15.8," Gflop/s")') &
comm_rank, tf, gflops
call fstarpu_mpi_cache_flush_all_data(comm_world)
! unregister matrices
call unregister_matrix(A,mb,kb)
call unregister_matrix(B,kb,nb)
call unregister_matrix(C,mb,nb)
end do
if (provide_context) then
call fstarpu_sched_ctx_delete(ctx)
deallocate(procs)
endif
call fstarpu_codelet_free(cl_mm)
call fstarpu_codelet_free(cl_mm_args)
call fstarpu_codelet_free(cl_fill)
call fstarpu_shutdown()
ret = fstarpu_mpi_shutdown()
if (ret /= 0) then
write(*,'("fstarpu_mpi_shutdown status:",i4)') ret
stop 1
end if
contains
end subroutine submit_gemm_task
subroutine initialize_codelets()
implicit none
......
......@@ -65,6 +65,7 @@ static struct argp_option options[] = {
{"nowarmup", 'w', 0, 0, "If handed out to the program, register warmup run."},
{"byvalues", 'V', 0, 0, "If handed out to the program, codelet arguments are handed out through STARPU_VALUE instead of cl_gemm_args struct."},
{"delay", 'D', 0, 0, "If handed out to the program, delay handles registration."},
{"outer", 'O', 0, 0, "If handed out to the program, submit tasks outer products wise."},
{ 0 }
};
......@@ -81,6 +82,7 @@ struct arguments
int warmup;
int values;
int delay;
int outer;
};
......@@ -152,6 +154,9 @@ parse_opt (int key, char *arg, struct argp_state *state)
case 'V':
arguments->values = 1;
break;
case 'O':
arguments->outer = 1;
break;
default:
return ARGP_ERR_UNKNOWN;
}
......@@ -180,6 +185,7 @@ static int context = 0;
static int warmup = 1;
static int values = 1;
static int delay = 0;
static int outer = 0;
#define MB ((M)/(BS)) /* Number of blocks */
#define NB ((N)/(BS)) /* Number of blocks */
......@@ -480,6 +486,72 @@ static void init_matrices(void)
if (verbose) printf( "[%d] Initialized matrices\n", comm_rank);
}
void submit_gemm(int b_row, int b_col, int b_aisle, double alpha,double beta,unsigned ctx) {
double zbeta;
int a_local, b_local, c_local;
Block *Ail, *Blj, *Cij;
Ail = & A->blocks[b_row*KB + b_aisle];
Blj = & B->blocks[b_aisle*NB + b_col];
Cij = & C->blocks[b_row * NB + b_col];
a_local = Ail->owner == comm_rank;
b_local = Blj->owner == comm_rank;
c_local = Cij->owner == comm_rank;
// when prune and/or prune_handles are allowed needs to be clarified
//if ((!prune && !prune_handles) || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
// TODO : logic might be written more clearly (a/b/c_local may be redundant)
if ((!super_prune || (c_local || (a_local && b_col <= Q) || (b_local && b_row <= P) )) &&
(!prune || (a_local || b_local || c_local))) {
if (delay) {
// printf("[%d] late registration i,j,l %d,%d,%d\n",comm_rank,b_row,b_col,b_aisle);
if (!prune_handles || c_local) {
block_starpu_register(Ail,datatype);
block_starpu_register(Blj,datatype);
}
block_starpu_register(Cij, datatype);
}
struct cl_zgemm_args_s *clargs = NULL;
if (c_local) {
if (verbose) printf("[%d] exec. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)\n", comm_rank,
b_row,b_col, Cij->registered,Cij->hdl,Cij->owner,
b_row,b_aisle,Ail->registered,Ail->hdl,Ail->owner,
b_aisle,b_col,Blj->registered,Blj->hdl,Blj->owner);
if (!values) {
clargs = malloc(sizeof( struct cl_zgemm_args_s ));
clargs->alpha = alpha;
clargs->beta = b_aisle==0? beta : 1.0;
}
} else if (verbose){
printf("[%d] insert. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)\n", comm_rank,
b_row,b_col, Cij->registered,Cij->hdl,Cij->owner,
b_row,b_aisle,Ail->registered,Ail->hdl,Ail->owner,
b_aisle,b_col,Blj->registered,Blj->hdl,Blj->owner);
}
if (context) {
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
STARPU_SCHED_CTX, ctx,
STARPU_R, Ail->hdl,
STARPU_R, Blj->hdl,
STARPU_RW,Cij->hdl, 0);
} else if (values) {
zbeta = b_aisle == 0 ? beta : 1.0;
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_VALUE, &alpha, sizeof(double),
STARPU_VALUE, &zbeta, sizeof(double),
STARPU_R, Ail->hdl,
STARPU_R, Blj->hdl,
STARPU_RW,Cij->hdl, 0);
} else {
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
STARPU_R, Ail->hdl,
STARPU_R, Blj->hdl,
STARPU_RW,Cij->hdl, 0);
}
} else {
// printf("[%d] NOT inserted C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
}
}
int main(int argc, char *argv[])
{
......@@ -503,6 +575,7 @@ int main(int argc, char *argv[])
arguments.prune_handles = 0;
arguments.context = 0;
arguments.delay = 0;
arguments.outer = 0;
arguments.warmup = 1;
arguments.values = 0;
argp_parse(&argp, argc, argv, 0, 0, &arguments);
......@@ -525,6 +598,7 @@ int main(int argc, char *argv[])
prune_handles = arguments.prune_handles;
context = arguments.context;
delay = arguments.delay;
outer = arguments.outer;
warmup = arguments.warmup;
values = arguments.values;
......@@ -596,14 +670,15 @@ int main(int argc, char *argv[])
if (!warmup) printf("- Warmup disabled\n");
if (values) printf("- Passing by values (instead of cl_args)\n");
if (delay) printf("- Delayed handle registration enabled\n");
}
if (outer) printf("- Submit tasks by outer products");
}
int barrier_ret, trial;
double start, stop;
double alpha = 3.14, beta=0.42;
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
if (trace) starpu_fxt_start_profiling();
unsigned ctx;
int* procs;
unsigned ctx;
if (context) {
procs = (int*) malloc(ncpu*sizeof(int));
starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs, ncpu);
......@@ -630,88 +705,47 @@ int main(int argc, char *argv[])
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
start = starpu_timing_now();
int a_local, b_local, c_local;
int b_row,b_col,b_aisle;
double zbeta;
Block *Ail, *Blj, *Cij;
for (b_row = 0; b_row < MB; b_row++)
{
for (b_col = 0; b_col < NB; b_col++)
{
for (b_aisle=0;b_aisle<KB;b_aisle++)
{
Ail = & A->blocks[b_row*KB + b_aisle];
Blj = & B->blocks[b_aisle*NB + b_col];
Cij = & C->blocks[b_row * NB + b_col];
a_local = Ail->owner == comm_rank;
b_local = Blj->owner == comm_rank;
c_local = Cij->owner == comm_rank;
// when prune and/or prune_handles are allowed needs to be clarified
//if ((!prune && !prune_handles) || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
// TODO : logic might be written more clearly (a/b/c_local may be redundant)
if ((!super_prune || (c_local || (a_local && b_col <= Q) || (b_local && b_row <= P) )) &&
(!prune || (a_local || b_local || c_local))) {
if (delay) {
// printf("[%d] late registration i,j,l %d,%d,%d\n",comm_rank,b_row,b_col,b_aisle);
if (!prune_handles || c_local) {
block_starpu_register(Ail,datatype);
block_starpu_register(Blj,datatype);
}
block_starpu_register(Cij, datatype);
}
struct cl_zgemm_args_s *clargs = NULL;
if (c_local) {
if (verbose) printf("[%d] exec. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)\n", comm_rank,
b_row,b_col, Cij->registered,Cij->hdl,Cij->owner,
b_row,b_aisle,Ail->registered,Ail->hdl,Ail->owner,
b_aisle,b_col,Blj->registered,Blj->hdl,Blj->owner);
if (!values) {
clargs = malloc(sizeof( struct cl_zgemm_args_s ));
clargs->alpha = alpha;
clargs->beta = b_aisle==0? beta : 1.0;
}
} else if (verbose){
printf("[%d] insert. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)\n", comm_rank,
b_row,b_col, Cij->registered,Cij->hdl,Cij->owner,
b_row,b_aisle,Ail->registered,Ail->hdl,Ail->owner,
b_aisle,b_col,Blj->registered,Blj->hdl,Blj->owner);
}
if (context) {
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
STARPU_SCHED_CTX, ctx,
STARPU_R, Ail->hdl,
STARPU_R, Blj->hdl,
STARPU_RW,Cij->hdl, 0);
} else if (values) {
zbeta = b_aisle == 0 ? beta : 1.0;
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_VALUE, &alpha, sizeof(double),
STARPU_VALUE, &zbeta, sizeof(double),
STARPU_R, Ail->hdl,
STARPU_R, Blj->hdl,
STARPU_RW,Cij->hdl, 0);
} else {
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
STARPU_R, Ail->hdl,
STARPU_R, Blj->hdl,
STARPU_RW,Cij->hdl, 0);
}
} else {
// printf("[%d] NOT inserted C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
}
}
}
if (flush) {
for (b_aisle=0;b_aisle<KB;b_aisle++)
{
Ail = & A->blocks[b_row*KB + b_aisle];
if (Ail->registered)
starpu_mpi_cache_flush(MPI_COMM_WORLD, Ail->hdl);
}
}
}
Block *Ail, *Blj, *Cij;
if (outer) {
for (b_aisle=0;b_aisle<KB;b_aisle++)
{
for (b_row = 0; b_row < MB; b_row++)
{
for (b_col = 0; b_col < NB; b_col++)
{
submit_gemm(b_row,b_col,b_aisle,alpha,beta, ctx);
}
}
if (flush) {
for (b_row = 0; b_row < MB; b_row++) {
Ail = & A->blocks[b_row*KB + b_aisle];
if (Ail->registered) starpu_mpi_cache_flush(MPI_COMM_WORLD, Ail->hdl);
}
for (b_col = 0; b_col < NB; b_col++) {
Blj = & B->blocks[b_aisle*NB + b_col];
if (Blj->registered) starpu_mpi_cache_flush(MPI_COMM_WORLD, Blj->hdl);
}
}
}
} else {
for (b_row = 0; b_row < MB; b_row++)
{
for (b_col = 0; b_col < NB; b_col++)
{
for (b_aisle=0;b_aisle<KB;b_aisle++)
{
submit_gemm(b_row,b_col,b_aisle,alpha,beta,ctx);
}
}
if (flush) {
for (b_aisle=0;b_aisle<KB;b_aisle++) {
Ail = & A->blocks[b_row*KB + b_aisle];
if (Ail->registered) starpu_mpi_cache_flush(MPI_COMM_WORLD, Ail->hdl);
}
}
}
}
// printf("[%d] finished submission\n",comm_rank);
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment