Commit 34dbc460 authored by Antoine Jego

unified codes (F: +flush_all / C: +delay)
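
In short: the Fortran (F) and C (C) examples now share the same block bookkeeping. The Fortran side flushes the StarPU-MPI cache with fstarpu_mpi_cache_flush_all_data before unregistering its matrices. The C side gains a --delay option that postpones handle registration: each Block now carries its own handle pointer, MPI tag and registered flag, so the per-matrix handle arrays (A_h, B_h, C_h) and the externally threaded tag counter disappear from the call sites, and remote blocks can be registered on demand through block_starpu_register().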

parent 31fb7973
......@@ -6,7 +6,7 @@
int comm_rank; /* mpi rank of the process */
Matrix* alloc_matrix(int mb, int nb, int b, int p, int q)
Matrix* alloc_matrix(int mb, int nb, int b, int p, int q, starpu_mpi_tag_t* tag)
{
starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
Matrix* X;
......@@ -18,11 +18,15 @@ Matrix* alloc_matrix(int mb, int nb, int b, int p, int q)
for (j= 0; j<nb; j++)
{
X->blocks[i*nb+j].owner = (i%p)*q + (j%q);
X->blocks[i*nb+j].m = b;
X->blocks[i*nb+j].n = b;
X->blocks[i*nb+j].ld= b;
X->blocks[i*nb+j].m = b;
X->blocks[i*nb+j].n = b;
X->blocks[i*nb+j].ld = b;
X->blocks[i*nb+j].tag = *tag;
X->blocks[i*nb+j].hdl = malloc(sizeof(starpu_data_handle_t));
X->blocks[i*nb+j].registered = 0;
if (X->blocks[i*nb+j].owner == comm_rank)
X->blocks[i*nb+j].c = malloc(b*b*sizeof(double));
*tag = *tag + 1;
}
}
X->mb = mb;
......@@ -47,7 +51,7 @@ void free_matrix(Matrix* X)
free(X);
}
void register_matrix(Matrix* X, starpu_data_handle_t* X_h, starpu_mpi_tag_t *tag, int mb, int nb, int datatype, int prune_handles, int p, int q, int row, int col, int check)
void register_matrix(Matrix* X, int mb, int nb, int datatype, int prune_handles, int p, int q, int row, int col, int check, int delay)
{
starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
int proc_row, proc_col;
......@@ -57,43 +61,38 @@ void register_matrix(Matrix* X, starpu_data_handle_t* X_h, starpu_mpi_tag_t *tag
// comm_rank = proc_row * q + proc_col
proc_col = comm_rank % q;
proc_row = (comm_rank - proc_col)/q;
// printf("[%d] delayed ? %d\n", comm_rank, delay);
for (b_row = 0; b_row < mb; b_row++)
{
for (b_col = 0; b_col < nb; b_col++)
{
Xij = & X->blocks[b_row*nb+b_col];
// printf("[%d] X_%d,%d | tag:%d\n",comm_rank,b_row,b_col,*tag + b_row*nb + b_col);
// printf("[%d] X_%d,%d | tag:%d\n",comm_rank,b_row,b_col,Xij->tag);
if (Xij->owner == comm_rank)
{
if (datatype) {
starpu_tile_register( &X_h[b_row*nb+b_col], STARPU_MAIN_RAM, Xij );
starpu_tile_register( Xij->hdl, STARPU_MAIN_RAM, Xij );
} else {
starpu_matrix_data_register(&X_h[b_row*nb+b_col], STARPU_MAIN_RAM,
(uintptr_t) Xij->c, X->b, X->b, X->b,
starpu_matrix_data_register(Xij->hdl, STARPU_MAIN_RAM,
(uintptr_t) Xij->c, Xij->m, Xij->n, Xij->ld,
sizeof(double));
}
starpu_mpi_data_register(X_h[b_row*nb+b_col], (*tag + b_row*nb + b_col), Xij->owner);
} else if (!prune_handles || (row && proc_row == b_row % p) ||
// printf("[%d] X_%d,%d | mpi_data_register %p %p\n",comm_rank,b_row,b_col,*Xij->hdl,Xij->hdl);
starpu_mpi_data_register(*Xij->hdl, Xij->tag, Xij->owner);
Xij->registered = 1;
} else if (!delay && (!prune_handles || (row && proc_row == b_row % p) ||
(col && proc_col == b_col % q) ||
(check && Xij->owner == 0) ||
(check && comm_rank == 0) ) {
if (datatype) {
starpu_tile_register( &X_h[b_row*nb+b_col], -1, Xij );
} else {
starpu_matrix_data_register(&X_h[b_row*nb+b_col], -1,
(uintptr_t) NULL, X->b, X->b, X->b,
sizeof(double));
}
starpu_mpi_data_register(X_h[b_row*nb+b_col], (*tag + b_row*nb + b_col), Xij->owner);
(check && comm_rank == 0)) ) {
block_starpu_register(Xij, datatype);
} else {
// printf("[%d] pruned X_%d,%d\n",comm_rank,b_row,b_col);
}
}
}
*tag = *tag + mb*nb;
}
void unregister_matrix(Matrix* X, starpu_data_handle_t* X_h, int mb, int nb)
void unregister_matrix(Matrix* X, int mb, int nb)
{
starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
// printf("[%d]unregistering %dx%d matrix\n", comm_rank, mb, nb);
......@@ -105,11 +104,10 @@ void unregister_matrix(Matrix* X, starpu_data_handle_t* X_h, int mb, int nb)
// assuming we flush, we do not need to unregister everywhere
if (X->blocks[b_row*nb+b_col].owner == comm_rank) {
// printf("[%d] unregistering X_%d,%d\n", comm_rank, b_row, b_col);
starpu_data_unregister(X_h[b_row*nb+b_col]);
starpu_data_unregister(*X->blocks[b_row*nb+b_col].hdl);
}
}
}
free(X_h);
}
......@@ -132,3 +130,18 @@ void print_matrix(Matrix* X, char* name) {
}
}
}
void block_starpu_register(Block* Xij, int datatype) {
if (!Xij->registered) {
starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
if (datatype) {
starpu_tile_register( Xij->hdl, -1, Xij );
} else {
starpu_matrix_data_register(Xij->hdl, -1,
(uintptr_t) NULL, Xij->m, Xij->n, Xij->ld,
sizeof(double));
}
starpu_mpi_data_register(*Xij->hdl, Xij->tag, Xij->owner);
Xij->registered = 1;
}
}
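
As a minimal sketch (names taken from this commit; the helper function, loop indices and codelet wiring below are illustrative, not part of the diff), the delayed path is meant to be driven from the task-insertion loop like this:

#include <starpu_mpi.h>
#include "matrix.h" /* Block, Matrix, block_starpu_register (assumed header name) */

/* Illustrative helper: with --delay active, register the three operand
 * blocks on demand, then insert one GEMM task on their handles. */
static void insert_gemm_block(Matrix *A, Matrix *B, Matrix *C,
                              int i, int j, int k, int KB, int NB,
                              int delay, int datatype,
                              struct starpu_codelet *gemm_cl)
{
    if (delay) {
        /* handles were not created in register_matrix(); do it on first use */
        block_starpu_register(&A->blocks[i*KB+k], datatype);
        block_starpu_register(&B->blocks[k*NB+j], datatype);
        block_starpu_register(&C->blocks[i*NB+j], datatype);
    }
    starpu_mpi_task_insert(MPI_COMM_WORLD, gemm_cl,
                           STARPU_R,  *A->blocks[i*KB+k].hdl,
                           STARPU_R,  *B->blocks[k*NB+j].hdl,
                           STARPU_RW, *C->blocks[i*NB+j].hdl, 0);
}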
......@@ -6,6 +6,9 @@ typedef struct Blocks
double* c;
int m,n,ld;
int owner;
starpu_data_handle_t* hdl;
starpu_mpi_tag_t tag;
int registered;
} Block;
typedef struct Matrices
......@@ -14,9 +17,11 @@ typedef struct Matrices
Block* blocks;
} Matrix;
Matrix* alloc_matrix(int mb, int nb, int b, int p, int q);
Matrix* alloc_matrix(int mb, int nb, int b, int p, int q, starpu_mpi_tag_t * tag);
void free_matrix(Matrix* X);
void register_matrix(Matrix* X, starpu_data_handle_t* X_h, starpu_mpi_tag_t *tag, int mb, int nb, int datatype, int prune_handles, int p, int q, int row, int col, int check);
void unregister_matrix(Matrix* X, starpu_data_handle_t* X_h, int mb, int nb);
void register_matrix(Matrix* X, int mb, int nb, int datatype, int prune_handles, int p, int q, int row, int col, int check, int delay);
void unregister_matrix(Matrix* X, int mb, int nb);
void print_matrix(Matrix* X, char* name);
void block_starpu_register(Block* Xij, int datatype);
#endif
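
Putting the new interface together, a full life cycle looks roughly as follows (a sketch under the declarations above; the prune/row/col arguments are illustrative and error handling is omitted):

#include <starpu_mpi.h>

void matrix_lifecycle(int mb, int nb, int b, int p, int q,
                      int datatype, int delay)
{
    starpu_mpi_tag_t tag = 0;              /* advanced inside alloc_matrix */
    Matrix *X = alloc_matrix(mb, nb, b, p, q, &tag);

    /* With delay != 0 only locally owned blocks get handles here; remote
     * blocks are registered later, on demand, by block_starpu_register(). */
    register_matrix(X, mb, nb, datatype, /*prune_handles=*/1,
                    p, q, /*row=*/1, /*col=*/1, /*check=*/0, delay);

    /* ... insert tasks on *X->blocks[i*nb+j].hdl ... */

    starpu_mpi_wait_for_all(MPI_COMM_WORLD);
    starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
    unregister_matrix(X, mb, nb);   /* only owned blocks are unregistered */
    free_matrix(X);
}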
......@@ -249,6 +249,8 @@ program fstarpu_example_dgemm
write(*,'("RANK ",i3," -> took ",e15.8," s | ", e15.8," Gflop/s")') &
comm_rank, tf, gflops
call fstarpu_mpi_cache_flush_all_data(comm_world)
! unregister matrices
call unregister_matrix(A,mb,kb)
call unregister_matrix(B,kb,nb)
......
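
The Fortran call above is the binding of the C-side starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD). Flushing before unregistering matters here because unregister_matrix only unregisters locally owned blocks and relies on the flush to drop cached remote replicates (see the "assuming we flush, we do not need to unregister everywhere" comment in matrix.c above).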
......@@ -45,25 +45,26 @@
static char doc[] = "Standalone DGEMM using StarPU-MPI";
static char args_doc[] = "-m [m] -n [n] -k [k] -b [b] -p [p] -q [q] --niter [l] [--check] [--trace] [--datatype] [--mpi-thread [t]] [--no-flush] [--prune] [--prune-handles] [--super-prune]";
static struct argp_option options[] = {
{"m", 'm', "int", 0, "Number of rows in A and C (deprecated)" },
{"n", 'n', "int", 0, "Dimension of A B and C" },
{"k", 'k', "int", 0, "Shared dimension of A and B (deprecated)" },
{"blocking", 'b', "int", 0, "Size of the square block of A, B and C (must divide m,n and k" },
{"p", 'p', "int", 0, "Length of the logical grid"},
{"q", 'q', "int", 0, "Width of the logical grid"},
{"check", 'c', 0, 0, "If the program checks gemm results (NOT INTERPRETED)"},
{"verbose", 'v', 0, 0, "Extend program vebosity"},
{"trace", 'f', 0, 0, "Whether or not to use FxT tracing"},
{"niter", 'l', "int", 0, "Number of iterations (loops)"},
{"mpi-thread", 't', "int", 0, "MPI thread level support : -1 StarPU, 0 SINGLE, 1 FUNNELED, 2 SERIALIZED, 3 MULTIPLE"},
{"datatype", 'd', 0, 0, "Whether or not to use our own datatype implementation"},
{"no-flush", 's', 0, 0, "If handed out to the program, do not flush anything until computation has completed."},
{"prune", 'r', 0, 0, "If handed out to the program, prune the DAG tightly enough (relying on STARPU+mpi cache to avoid redundant data transfers)."},
{"super-prune", 'R', 0, 0, "If handed out to the program, prune the DAG as tightly as possible."},
{"prune-handles",'z', 0, 0, "If handed out to the program, prune the handle registration."},
{"own-context", 'o', 0, 0, "If handed out to the program, schedule tasks in a created context."},
{"nowarmup", 'w', 0, 0, "If handed out to the program, register warmup run."},
{ 0 }
{"m", 'm', "int", 0, "Number of rows in A and C (deprecated)" },
{"n", 'n', "int", 0, "Dimension of A B and C" },
{"k", 'k', "int", 0, "Shared dimension of A and B (deprecated)" },
{"blocking", 'b', "int", 0, "Size of the square block of A, B and C (must divide m,n and k" },
{"p", 'p', "int", 0, "Length of the logical grid"},
{"q", 'q', "int", 0, "Width of the logical grid"},
{"check", 'c', 0, 0, "If the program checks gemm results (NOT INTERPRETED)"},
{"verbose", 'v', 0, 0, "Extend program vebosity"},
{"trace", 'f', 0, 0, "Whether or not to use FxT tracing"},
{"niter", 'l', "int", 0, "Number of iterations (loops)"},
{"mpi-thread", 't', "int", 0, "MPI thread level support : -1 StarPU, 0 SINGLE, 1 FUNNELED, 2 SERIALIZED, 3 MULTIPLE"},
{"datatype", 'd', 0, 0, "Whether or not to use our own datatype implementation"},
{"no-flush", 's', 0, 0, "If handed out to the program, do not flush anything until computation has completed."},
{"prune", 'r', 0, 0, "If handed out to the program, prune the DAG tightly enough (relying on STARPU+mpi cache to avoid redundant data transfers)."},
{"super-prune", 'R', 0, 0, "If handed out to the program, prune the DAG as tightly as possible."},
{"prune-handles",'z', 0, 0, "If handed out to the program, prune the handle registration."},
{"own-context", 'o', 0, 0, "If handed out to the program, schedule tasks in a created context."},
{"nowarmup", 'w', 0, 0, "If handed out to the program, register warmup run."},
{"delay", 'D', 0, 0, "If handed out to the program, delay handles registration."},
{ 0 }
};
......@@ -77,6 +78,7 @@ struct arguments
int no_flush, prune, super_prune, prune_handles;
int context;
int warmup;
int delay;
};
......@@ -142,6 +144,9 @@ parse_opt (int key, char *arg, struct argp_state *state)
case 'w':
arguments->warmup = 0;
break;
case 'D':
arguments->delay = 1;
break;
default:
return ARGP_ERR_UNKNOWN;
}
......@@ -168,17 +173,13 @@ static int super_prune = 0;
static int prune_handles = 0;
static int context = 0;
static int warmup = 1;
static int delay = 0;
#define MB ((M)/(BS)) /* Number of block rows */
#define NB ((N)/(BS)) /* Number of block columns */
#define KB ((K)/(BS)) /* Number of blocks along the shared dimension */
/* Arrays of data handles for managing matrix blocks */
static starpu_data_handle_t *A_h;
static starpu_data_handle_t *B_h;
static starpu_data_handle_t *C_h;
static int comm_rank; /* mpi rank of the process */
static int comm_size; /* size of the mpi session */
......@@ -187,14 +188,15 @@ static Matrix *A = NULL; /* A will be partitioned as MB x KB blocks */
static Matrix *B = NULL; /* B will be partitioned as KB x NB blocks */
static Matrix *C = NULL; /* C will be partitioned as MB x NB blocks */
starpu_mpi_tag_t tag = 0;
static void alloc_matrices(void)
{
if (verbose) printf( "[%d] Allocating matrices\n", comm_rank);
A = alloc_matrix(MB,KB,BS,P,Q);
B = alloc_matrix(KB,NB,BS,P,Q);
C = alloc_matrix(MB,NB,BS,P,Q);
A = alloc_matrix(MB,KB,BS,P,Q,&tag);
B = alloc_matrix(KB,NB,BS,P,Q,&tag);
C = alloc_matrix(MB,NB,BS,P,Q,&tag);
}
static void free_matrices(void)
......@@ -205,36 +207,32 @@ static void free_matrices(void)
free_matrix(C);
}
starpu_mpi_tag_t tag = 0;
/* Register the matrix blocks to StarPU and to StarPU-MPI */
static void register_matrices(int prune_handles)
{
if (verbose) printf("[%d] Registering matrices\n", comm_rank);
A_h = calloc(MB*KB, sizeof(starpu_data_handle_t));
B_h = calloc(KB*NB, sizeof(starpu_data_handle_t));
C_h = calloc(MB*NB, sizeof(starpu_data_handle_t));
if (datatype) {
starpu_tile_interface_register();
}
register_matrix(A,A_h,&tag,MB,KB,datatype,prune_handles,P,Q,1,0,check);
register_matrix(B,B_h,&tag,KB,NB,datatype,prune_handles,P,Q,0,1,check);
register_matrix(A,MB,KB,datatype,prune_handles,P,Q,1,0,check,delay);
register_matrix(B,KB,NB,datatype,prune_handles,P,Q,0,1,check,delay);
//register_matrix(C,C_h,&tag,MB,NB,datatype,prune_handles,P,Q,0,0);
// FIXME (starpu-side):
// the previous call seems logical because we do not need to know
// about blocks of C we do not contribute to; however, StarPU seems to
// hang on task insertion if we do not know about blocks on our row/column
// (even if we do not contribute to them).
// - This could happen because we are seeing a Write on a NULL handle and StarPU is waiting (for what?)
register_matrix(C,C_h,&tag,MB,NB,datatype,prune_handles,P,Q,1,1,check);
register_matrix(C,MB,NB,datatype,prune_handles,P,Q,1,1,check,delay);
}
/* Unregister matrices from the StarPU management. */
static void unregister_matrices()
{
if (verbose) printf( "[%d] Unregistering matrices\n", comm_rank);
unregister_matrix(A,A_h,MB,KB);
unregister_matrix(B,B_h,KB,NB);
unregister_matrix(C,C_h,MB,NB);
unregister_matrix(A,MB,KB);
unregister_matrix(B,KB,NB);
unregister_matrix(C,MB,NB);
if (datatype) {
starpu_tile_interface_register();
}
......@@ -416,7 +414,7 @@ static struct starpu_codelet nrm_cl =
.name = "nrm2_comp" /* to display task name in traces */
};
static void init_matrix(Matrix* X, starpu_data_handle_t* X_h, int mb, int nb)
static void init_matrix(Matrix* X, int mb, int nb)
{
int row, col;
for (row = 0; row < mb; row++)
......@@ -425,16 +423,16 @@ static void init_matrix(Matrix* X, starpu_data_handle_t* X_h, int mb, int nb)
{
if (X->blocks[row*nb+col].owner == comm_rank)
{
//printf("[%d] fill X_%d,%d\n",comm_rank,row,col);
// printf("[%d] fill X_%d,%d %p\n",comm_rank,row,col, X->blocks[row*nb+col].hdl);
starpu_mpi_task_insert(MPI_COMM_WORLD, &fill_cl,
STARPU_W, X_h[row*nb+col], 0);
//printf("[%d] filled X_%d,%d\n",comm_rank,row,col);
STARPU_W, *X->blocks[row*nb+col].hdl, 0);
// printf("[%d] filled X_%d,%d\n",comm_rank,row,col);
}
}
}
}
static void copy_matrix(Matrix* A, starpu_data_handle_t* A_h, Matrix* B, starpu_data_handle_t* B_h)
static void copy_matrix(Matrix* A, Matrix* B)
{
int row, col;
for (row = 0; row < A->mb; row++)
......@@ -445,8 +443,8 @@ static void copy_matrix(Matrix* A, starpu_data_handle_t* A_h, Matrix* B, starpu_
|| B->blocks[row*A->nb+col].owner == comm_rank)
{
starpu_mpi_task_insert(MPI_COMM_WORLD, &copy_cl,
STARPU_W, A_h[row*A->nb+col],
STARPU_R, B_h[row*A->nb+col], 0);
STARPU_W, *A->blocks[row*A->nb+col].hdl,
STARPU_R, *B->blocks[row*A->nb+col].hdl, 0);
}
}
}
......@@ -457,11 +455,11 @@ static void init_matrices(void)
{
if (verbose) printf( "[%d] Initializing matrices\n", comm_rank);
// I own all the blocks
init_matrix(A,A_h,MB,KB);
init_matrix(A,MB,KB);
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
init_matrix(B,B_h,KB,NB);
init_matrix(B,KB,NB);
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
init_matrix(C,C_h,MB,NB);
init_matrix(C,MB,NB);
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
if (verbose) printf( "[%d] Initialized matrices\n", comm_rank);
}
......@@ -470,6 +468,7 @@ static void init_matrices(void)
int main(int argc, char *argv[])
{
struct arguments arguments;
/* Default values */
arguments.m = M;
arguments.n = N;
arguments.k = K;
......@@ -487,6 +486,7 @@ int main(int argc, char *argv[])
arguments.super_prune = 0;
arguments.prune_handles = 0;
arguments.context = 0;
arguments.delay = 0;
argp_parse(&argp, argc, argv, 0, 0, &arguments);
M = arguments.m;
......@@ -506,6 +506,7 @@ int main(int argc, char *argv[])
super_prune = arguments.super_prune;
prune_handles = arguments.prune_handles;
context = arguments.context;
delay = arguments.delay;
/* Initializes StarPU and the StarPU-MPI layer */
starpu_fxt_autostart_profiling(0);
......@@ -573,6 +574,7 @@ int main(int argc, char *argv[])
if (prune_handles) printf("- Handle pruning enabled\n");
if (context) printf("- Submitting own context\n");
if (!warmup) printf("- Warmup disabled\n");
if (delay) printf("- Delayed handle registration enabled\n");
}
int barrier_ret, trial;
double start, stop;
......@@ -594,12 +596,10 @@ int main(int argc, char *argv[])
init_matrices();
Matrix* Cwork;
starpu_data_handle_t *Cwh;
if (check) {
Cwh = malloc(MB*NB*sizeof(starpu_data_handle_t));
Cwork = alloc_matrix(MB,NB,BS,1,1);
register_matrix(Cwork,Cwh,&tag,MB,NB,datatype,0,1,1,1,1,1);
copy_matrix(Cwork,Cwh,C,C_h);
Cwork = alloc_matrix(MB,NB,BS,1,1,&tag);
register_matrix(Cwork,MB,NB,datatype,0,1,1,1,1,1,0);
copy_matrix(Cwork,C);
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
if (verbose) print_matrix(C,"Cinit");
if (verbose) print_matrix(Cwork,"Cwork");
......@@ -622,33 +622,37 @@ int main(int argc, char *argv[])
// when prune and/or prune_handles are allowed needs to be clarified
//if ((!prune && !prune_handles) || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
// TODO: this logic might be written more clearly (a/b/c_local may be redundant)
// FIXME (starpu-side): Fortran allows this but not C
// (note: inserting tasks *is* different, but it does not feel *that* different)
if ((!super_prune || (c_local || (a_local && b_col <= Q) || (b_local && b_row <= P) )) &&
(!prune || (a_local || b_local || c_local))) {
if (delay) {
if (!prune_handles || c_local) {
block_starpu_register(&A->blocks[b_row*KB+b_aisle],datatype);
block_starpu_register(&B->blocks[b_aisle*NB+b_col],datatype);
}
block_starpu_register(&C->blocks[b_row*NB+b_col], datatype);
}
struct cl_zgemm_args_s *clargs = NULL;
if (C->blocks[b_row*NB+b_col].owner == comm_rank) {
if (c_local) {
clargs = malloc(sizeof( struct cl_zgemm_args_s ));
clargs->alpha = alpha;
clargs->beta = b_aisle==0? beta : 1.0;
}
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
STARPU_R, A_h[b_row*KB+b_aisle],
STARPU_R, B_h[b_aisle*NB+b_col],
STARPU_RW, C_h[b_row*NB+b_col], 0);
//printf("[%d] inserted C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
STARPU_R, *A->blocks[b_row*KB+b_aisle].hdl,
STARPU_R, *B->blocks[b_aisle*NB+b_col].hdl,
STARPU_RW, *C->blocks[b_row*NB+b_col].hdl, 0);
// printf("[%d] inserted C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
}
}
}
if (flush) {
for (b_aisle=0;b_aisle<KB;b_aisle++)
{
if (A_h[b_row*KB+b_aisle] != NULL) starpu_mpi_cache_flush(MPI_COMM_WORLD, A_h[b_row*KB+b_aisle]);
if (A->blocks[b_row*KB+b_aisle].registered) starpu_mpi_cache_flush(MPI_COMM_WORLD, *A->blocks[b_row*KB+b_aisle].hdl);
}
}
}
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
stop = starpu_timing_now();
......@@ -657,27 +661,22 @@ int main(int argc, char *argv[])
if (check) {
Matrix* Acheck, *Bcheck, *Ccheck;
starpu_data_handle_t *Ach,*Bch,*Cch;
Acheck = alloc_matrix(MB,KB,BS,1,1);
Bcheck = alloc_matrix(KB,NB,BS,1,1);
Ccheck = alloc_matrix(MB,NB,BS,1,1);
Ach = malloc(MB*KB*sizeof(starpu_data_handle_t));
Bch = malloc(KB*NB*sizeof(starpu_data_handle_t));
Cch = malloc(MB*NB*sizeof(starpu_data_handle_t));
Acheck = alloc_matrix(MB,KB,BS,1,1,&tag);
Bcheck = alloc_matrix(KB,NB,BS,1,1,&tag);
Ccheck = alloc_matrix(MB,NB,BS,1,1,&tag);
register_matrix(Acheck,Ach,&tag,MB,KB,datatype,0,1,1,1,1,1);
register_matrix(Bcheck,Bch,&tag,KB,NB,datatype,0,1,1,1,1,1);
register_matrix(Ccheck,Cch,&tag,MB,NB,datatype,0,1,1,1,1,1);
register_matrix(Acheck,MB,KB,datatype,0,1,1,1,1,1,0);
register_matrix(Bcheck,KB,NB,datatype,0,1,1,1,1,1,0);
register_matrix(Ccheck,MB,NB,datatype,0,1,1,1,1,1,0);
copy_matrix(Acheck,Ach,A,A_h);
copy_matrix(Acheck,A);
if (verbose) print_matrix(A,"A");
if (verbose) print_matrix(Acheck,"Ac");
copy_matrix(Bcheck,Bch,B,B_h);
copy_matrix(Bcheck,B);
if (verbose) print_matrix(B,"B");
if (verbose) print_matrix(Bcheck,"Bc");
copy_matrix(Ccheck,Cch,C,C_h);
copy_matrix(Ccheck,C);
if (verbose) print_matrix(C,"C");
if (verbose) print_matrix(Ccheck,"Cc");
if (comm_rank == 0) {
......@@ -691,22 +690,22 @@ int main(int argc, char *argv[])
clargs->beta = b_aisle==0? beta : 1.0;
starpu_mpi_task_insert(MPI_COMM_WORLD, &gemm_cl,
STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
STARPU_R, Ach[b_row*KB+b_aisle],
STARPU_R, Bch[b_aisle*NB+b_col],
STARPU_RW, Cwh[b_row*NB+b_col], 0);
STARPU_R, *Acheck->blocks[b_row*KB+b_aisle].hdl,
STARPU_R, *Bcheck->blocks[b_aisle*NB+b_col].hdl,
STARPU_RW, *Cwork->blocks[b_row*NB+b_col].hdl, 0);
}
starpu_mpi_task_insert(MPI_COMM_WORLD, &nrm_cl,
STARPU_R, Cch[b_row*NB+b_col],
STARPU_R, Cwh[b_row*NB+b_col], 0);
STARPU_R, *Ccheck->blocks[b_row*NB+b_col].hdl,
STARPU_R, *Cwork->blocks[b_row*NB+b_col].hdl, 0);
}
}
starpu_mpi_wait_for_all(MPI_COMM_WORLD);
}
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
unregister_matrix(Acheck,Ach,MB,KB);
unregister_matrix(Bcheck,Bch,KB,NB);
unregister_matrix(Ccheck,Cch,MB,NB);
unregister_matrix(Cwork,Cwh,MB,NB);
unregister_matrix(Acheck,MB,KB);
unregister_matrix(Bcheck,KB,NB);
unregister_matrix(Ccheck,MB,NB);
unregister_matrix(Cwork,MB,NB);
barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
free_matrix(Acheck);
free_matrix(Bcheck);
......
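
Note that combining --prune-handles with --delay means blocks never touched by a locally inserted task are never registered at all, which is why the cache-flush loop above now guards on the new per-block registered flag rather than on a non-NULL entry in a handle array.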