Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
solverstack
mini-examples
starpu_example_dgemm
Commits
0045debc
Commit
0045debc
authored
Jan 08, 2022
by
Jego Antoine
Browse files
add outer code
parent
b887214c
Changes
3
Hide whitespace changes
Inline
Side-by-side
CMakeLists.txt
View file @
0045debc
cmake_minimum_required
(
VERSION 3.3
)
project
(
starpu_example_dgemm C Fortran
)
add_compile_options
(
-Wall -Wextra -pedantic
)
# -Werror)
#
add_compile_options(-Wall -Wextra -pedantic) # -Werror)
# Check that we do no try to configure/build inside the source directory
# ----------------------------------------------------------------------
if
(
${
CMAKE_SOURCE_DIR
}
STREQUAL
${
CMAKE_BINARY_DIR
}
)
...
...
fstarpu_example_dgemm.f90
View file @
0045debc
...
...
@@ -42,6 +42,7 @@ program fstarpu_example_dgemm
logical
::
provide_context
=
.false.
logical
::
warmup
=
.true.
logical
::
codelet_args
=
.false.
logical
::
outer
=
.false.
integer
(
c_int
)
::
comm_size
,
comm_rank
integer
(
c_int
),
target
::
comm_world
...
...
@@ -131,6 +132,8 @@ program fstarpu_example_dgemm
warmup
=
.false.
case
(
'-a'
)
codelet_args
=
.true.
case
(
'-o'
)
outer
=
.true.
! keep -e as an empty argument for debug purpose
end
select
end
do
...
...
@@ -157,6 +160,7 @@ program fstarpu_example_dgemm
if
(
provide_context
)
write
(
*
,
*
)
"(C)ontext provided at submission"
if
(
.not.
warmup
)
write
(
*
,
*
)
"(W)armup disabled"
if
(
codelet_args
)
write
(
*
,
*
)
"(A)rguments NOT by values"
if
(
outer
)
write
(
*
,
*
)
"(O)uter product submission"
write
(
*
,
'("========================================")'
)
end
if
ret
=
fstarpu_mpi_barrier
(
comm_world
)
...
...
@@ -189,9 +193,77 @@ program fstarpu_example_dgemm
call
system_clock
(
ts
)
! submit matrix multiplication
do
i
=
1
,
mb
do
j
=
1
,
nb
do
l
=
1
,
kb
if
(
outer
)
then
do
l
=
1
,
kb
do
i
=
1
,
mb
do
j
=
1
,
nb
call
submit_gemm_task
()
end
do
end
do
if
(
lflush
)
then
do
i
=
1
,
mb
if
(
c_associated
(
A
%
blocks
(
i
,
l
)
%
h
))
&
call
fstarpu_mpi_cache_flush
(
comm_world
,
A
%
blocks
(
i
,
l
)
%
h
)
end
do
do
j
=
1
,
nb
if
(
c_associated
(
B
%
blocks
(
l
,
j
)
%
h
))
&
call
fstarpu_mpi_cache_flush
(
comm_world
,
B
%
blocks
(
l
,
j
)
%
h
)
end
do
end
if
end
do
else
do
i
=
1
,
mb
do
j
=
1
,
nb
do
l
=
1
,
kb
call
submit_gemm_task
()
end
do
end
do
if
(
lflush
)
then
do
l
=
1
,
kb
if
(
c_associated
(
A
%
blocks
(
i
,
l
)
%
h
))
&
call
fstarpu_mpi_cache_flush
(
comm_world
,
A
%
blocks
(
i
,
l
)
%
h
)
end
do
end
if
end
do
end
if
ret
=
fstarpu_mpi_wait_for_all
(
comm_world
)
ret
=
fstarpu_mpi_barrier
(
comm_world
)
call
system_clock
(
te
,
tr
)
tf
=
max
(
real
(
te
-
ts
)/
real
(
tr
),
1e-20
)
gflops
=
2.0
*
m
*
n
*
k
/(
tf
*
10
**
9
)
if
(
comm_rank
.eq.
0.
and
.
(
.not.
warmup
.or.
trial
.gt.
1
))
&
write
(
*
,
'("RANK ",i3," -> took ",e15.8," s | ", e15.8," Gflop/s")'
)
&
comm_rank
,
tf
,
gflops
call
fstarpu_mpi_cache_flush_all_data
(
comm_world
)
! unregister matrices
call
unregister_matrix
(
A
,
mb
,
kb
)
call
unregister_matrix
(
B
,
kb
,
nb
)
call
unregister_matrix
(
C
,
mb
,
nb
)
end
do
if
(
provide_context
)
then
call
fstarpu_sched_ctx_delete
(
ctx
)
deallocate
(
procs
)
endif
call
fstarpu_codelet_free
(
cl_mm
)
call
fstarpu_codelet_free
(
cl_mm_args
)
call
fstarpu_codelet_free
(
cl_fill
)
call
fstarpu_shutdown
()
ret
=
fstarpu_mpi_shutdown
()
if
(
ret
/
=
0
)
then
write
(
*
,
'("fstarpu_mpi_shutdown status:",i4)'
)
ret
stop
1
end
if
contains
subroutine
submit_gemm_task
()
implicit
none
A_local
=
A
%
blocks
(
i
,
l
)
%
owner
==
comm_rank
B_local
=
B
%
blocks
(
l
,
j
)
%
owner
==
comm_rank
C_local
=
C
%
blocks
(
i
,
j
)
%
owner
==
comm_rank
...
...
@@ -236,50 +308,7 @@ program fstarpu_example_dgemm
else
!could write something
end
if
end
do
end
do
if
(
lflush
)
then
do
l
=
1
,
kb
if
(
c_associated
(
A
%
blocks
(
i
,
l
)
%
h
))
&
call
fstarpu_mpi_cache_flush
(
comm_world
,
A
%
blocks
(
i
,
l
)
%
h
)
end
do
end
if
end
do
ret
=
fstarpu_mpi_wait_for_all
(
comm_world
)
ret
=
fstarpu_mpi_barrier
(
comm_world
)
call
system_clock
(
te
,
tr
)
tf
=
max
(
real
(
te
-
ts
)/
real
(
tr
),
1e-20
)
gflops
=
2.0
*
m
*
n
*
k
/(
tf
*
10
**
9
)
if
(
comm_rank
.eq.
0.
and
.
(
.not.
warmup
.or.
trial
.gt.
1
))
&
write
(
*
,
'("RANK ",i3," -> took ",e15.8," s | ", e15.8," Gflop/s")'
)
&
comm_rank
,
tf
,
gflops
call
fstarpu_mpi_cache_flush_all_data
(
comm_world
)
! unregister matrices
call
unregister_matrix
(
A
,
mb
,
kb
)
call
unregister_matrix
(
B
,
kb
,
nb
)
call
unregister_matrix
(
C
,
mb
,
nb
)
end
do
if
(
provide_context
)
then
call
fstarpu_sched_ctx_delete
(
ctx
)
deallocate
(
procs
)
endif
call
fstarpu_codelet_free
(
cl_mm
)
call
fstarpu_codelet_free
(
cl_mm_args
)
call
fstarpu_codelet_free
(
cl_fill
)
call
fstarpu_shutdown
()
ret
=
fstarpu_mpi_shutdown
()
if
(
ret
/
=
0
)
then
write
(
*
,
'("fstarpu_mpi_shutdown status:",i4)'
)
ret
stop
1
end
if
contains
end
subroutine
submit_gemm_task
subroutine
initialize_codelets
()
implicit
none
...
...
starpu_example_dgemm.c
View file @
0045debc
...
...
@@ -65,6 +65,7 @@ static struct argp_option options[] = {
{
"nowarmup"
,
'w'
,
0
,
0
,
"If handed out to the program, register warmup run."
},
{
"byvalues"
,
'V'
,
0
,
0
,
"If handed out to the program, codelet arguments are handed out through STARPU_VALUE instead of cl_gemm_args struct."
},
{
"delay"
,
'D'
,
0
,
0
,
"If handed out to the program, delay handles registration."
},
{
"outer"
,
'O'
,
0
,
0
,
"If handed out to the program, submit tasks outer products wise."
},
{
0
}
};
...
...
@@ -81,6 +82,7 @@ struct arguments
int
warmup
;
int
values
;
int
delay
;
int
outer
;
};
...
...
@@ -152,6 +154,9 @@ parse_opt (int key, char *arg, struct argp_state *state)
case
'V'
:
arguments
->
values
=
1
;
break
;
case
'O'
:
arguments
->
outer
=
1
;
break
;
default:
return
ARGP_ERR_UNKNOWN
;
}
...
...
@@ -180,6 +185,7 @@ static int context = 0;
static
int
warmup
=
1
;
static
int
values
=
1
;
static
int
delay
=
0
;
static
int
outer
=
0
;
#define MB ((M)/(BS))
/* Number of blocks */
#define NB ((N)/(BS))
/* Number of blocks */
...
...
@@ -480,6 +486,72 @@ static void init_matrices(void)
if
(
verbose
)
printf
(
"[%d] Initialized matrices
\n
"
,
comm_rank
);
}
void
submit_gemm
(
int
b_row
,
int
b_col
,
int
b_aisle
,
double
alpha
,
double
beta
,
unsigned
ctx
)
{
double
zbeta
;
int
a_local
,
b_local
,
c_local
;
Block
*
Ail
,
*
Blj
,
*
Cij
;
Ail
=
&
A
->
blocks
[
b_row
*
KB
+
b_aisle
];
Blj
=
&
B
->
blocks
[
b_aisle
*
NB
+
b_col
];
Cij
=
&
C
->
blocks
[
b_row
*
NB
+
b_col
];
a_local
=
Ail
->
owner
==
comm_rank
;
b_local
=
Blj
->
owner
==
comm_rank
;
c_local
=
Cij
->
owner
==
comm_rank
;
// when prune and/or prune_handles are allowed needs to be clarified
//if ((!prune && !prune_handles) || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
// TODO : logic might be written more clearly (a/b/c_local may be redundant)
if
((
!
super_prune
||
(
c_local
||
(
a_local
&&
b_col
<=
Q
)
||
(
b_local
&&
b_row
<=
P
)
))
&&
(
!
prune
||
(
a_local
||
b_local
||
c_local
)))
{
if
(
delay
)
{
// printf("[%d] late registration i,j,l %d,%d,%d\n",comm_rank,b_row,b_col,b_aisle);
if
(
!
prune_handles
||
c_local
)
{
block_starpu_register
(
Ail
,
datatype
);
block_starpu_register
(
Blj
,
datatype
);
}
block_starpu_register
(
Cij
,
datatype
);
}
struct
cl_zgemm_args_s
*
clargs
=
NULL
;
if
(
c_local
)
{
if
(
verbose
)
printf
(
"[%d] exec. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)
\n
"
,
comm_rank
,
b_row
,
b_col
,
Cij
->
registered
,
Cij
->
hdl
,
Cij
->
owner
,
b_row
,
b_aisle
,
Ail
->
registered
,
Ail
->
hdl
,
Ail
->
owner
,
b_aisle
,
b_col
,
Blj
->
registered
,
Blj
->
hdl
,
Blj
->
owner
);
if
(
!
values
)
{
clargs
=
malloc
(
sizeof
(
struct
cl_zgemm_args_s
));
clargs
->
alpha
=
alpha
;
clargs
->
beta
=
b_aisle
==
0
?
beta
:
1
.
0
;
}
}
else
if
(
verbose
){
printf
(
"[%d] insert. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)
\n
"
,
comm_rank
,
b_row
,
b_col
,
Cij
->
registered
,
Cij
->
hdl
,
Cij
->
owner
,
b_row
,
b_aisle
,
Ail
->
registered
,
Ail
->
hdl
,
Ail
->
owner
,
b_aisle
,
b_col
,
Blj
->
registered
,
Blj
->
hdl
,
Blj
->
owner
);
}
if
(
context
)
{
starpu_mpi_task_insert
(
MPI_COMM_WORLD
,
&
gemm_cl
,
STARPU_CL_ARGS
,
clargs
,
sizeof
(
struct
cl_zgemm_args_s
),
STARPU_SCHED_CTX
,
ctx
,
STARPU_R
,
Ail
->
hdl
,
STARPU_R
,
Blj
->
hdl
,
STARPU_RW
,
Cij
->
hdl
,
0
);
}
else
if
(
values
)
{
zbeta
=
b_aisle
==
0
?
beta
:
1
.
0
;
starpu_mpi_task_insert
(
MPI_COMM_WORLD
,
&
gemm_cl
,
STARPU_VALUE
,
&
alpha
,
sizeof
(
double
),
STARPU_VALUE
,
&
zbeta
,
sizeof
(
double
),
STARPU_R
,
Ail
->
hdl
,
STARPU_R
,
Blj
->
hdl
,
STARPU_RW
,
Cij
->
hdl
,
0
);
}
else
{
starpu_mpi_task_insert
(
MPI_COMM_WORLD
,
&
gemm_cl
,
STARPU_CL_ARGS
,
clargs
,
sizeof
(
struct
cl_zgemm_args_s
),
STARPU_R
,
Ail
->
hdl
,
STARPU_R
,
Blj
->
hdl
,
STARPU_RW
,
Cij
->
hdl
,
0
);
}
}
else
{
// printf("[%d] NOT inserted C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
}
}
int
main
(
int
argc
,
char
*
argv
[])
{
...
...
@@ -503,6 +575,7 @@ int main(int argc, char *argv[])
arguments
.
prune_handles
=
0
;
arguments
.
context
=
0
;
arguments
.
delay
=
0
;
arguments
.
outer
=
0
;
arguments
.
warmup
=
1
;
arguments
.
values
=
0
;
argp_parse
(
&
argp
,
argc
,
argv
,
0
,
0
,
&
arguments
);
...
...
@@ -525,6 +598,7 @@ int main(int argc, char *argv[])
prune_handles
=
arguments
.
prune_handles
;
context
=
arguments
.
context
;
delay
=
arguments
.
delay
;
outer
=
arguments
.
outer
;
warmup
=
arguments
.
warmup
;
values
=
arguments
.
values
;
...
...
@@ -596,14 +670,15 @@ int main(int argc, char *argv[])
if
(
!
warmup
)
printf
(
"- Warmup disabled
\n
"
);
if
(
values
)
printf
(
"- Passing by values (instead of cl_args)
\n
"
);
if
(
delay
)
printf
(
"- Delayed handle registration enabled
\n
"
);
}
if
(
outer
)
printf
(
"- Submit tasks by outer products"
);
}
int
barrier_ret
,
trial
;
double
start
,
stop
;
double
alpha
=
3
.
14
,
beta
=
0
.
42
;
barrier_ret
=
starpu_mpi_barrier
(
MPI_COMM_WORLD
);
if
(
trace
)
starpu_fxt_start_profiling
();
unsigned
ctx
;
int
*
procs
;
unsigned
ctx
;
if
(
context
)
{
procs
=
(
int
*
)
malloc
(
ncpu
*
sizeof
(
int
));
starpu_worker_get_ids_by_type
(
STARPU_CPU_WORKER
,
procs
,
ncpu
);
...
...
@@ -630,88 +705,47 @@ int main(int argc, char *argv[])
barrier_ret
=
starpu_mpi_barrier
(
MPI_COMM_WORLD
);
start
=
starpu_timing_now
();
int
a_local
,
b_local
,
c_local
;
int
b_row
,
b_col
,
b_aisle
;
double
zbeta
;
Block
*
Ail
,
*
Blj
,
*
Cij
;
for
(
b_row
=
0
;
b_row
<
MB
;
b_row
++
)
{
for
(
b_col
=
0
;
b_col
<
NB
;
b_col
++
)
{
for
(
b_aisle
=
0
;
b_aisle
<
KB
;
b_aisle
++
)
{
Ail
=
&
A
->
blocks
[
b_row
*
KB
+
b_aisle
];
Blj
=
&
B
->
blocks
[
b_aisle
*
NB
+
b_col
];
Cij
=
&
C
->
blocks
[
b_row
*
NB
+
b_col
];
a_local
=
Ail
->
owner
==
comm_rank
;
b_local
=
Blj
->
owner
==
comm_rank
;
c_local
=
Cij
->
owner
==
comm_rank
;
// when prune and/or prune_handles are allowed needs to be clarified
//if ((!prune && !prune_handles) || (A->blocks[b_row*KB+b_aisle].owner == comm_rank || B->blocks[b_aisle*NB+b_col].owner == comm_rank || C->blocks[b_row*NB+b_col].owner == comm_rank)) {
// TODO : logic might be written more clearly (a/b/c_local may be redundant)
if
((
!
super_prune
||
(
c_local
||
(
a_local
&&
b_col
<=
Q
)
||
(
b_local
&&
b_row
<=
P
)
))
&&
(
!
prune
||
(
a_local
||
b_local
||
c_local
)))
{
if
(
delay
)
{
// printf("[%d] late registration i,j,l %d,%d,%d\n",comm_rank,b_row,b_col,b_aisle);
if
(
!
prune_handles
||
c_local
)
{
block_starpu_register
(
Ail
,
datatype
);
block_starpu_register
(
Blj
,
datatype
);
}
block_starpu_register
(
Cij
,
datatype
);
}
struct
cl_zgemm_args_s
*
clargs
=
NULL
;
if
(
c_local
)
{
if
(
verbose
)
printf
(
"[%d] exec. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)
\n
"
,
comm_rank
,
b_row
,
b_col
,
Cij
->
registered
,
Cij
->
hdl
,
Cij
->
owner
,
b_row
,
b_aisle
,
Ail
->
registered
,
Ail
->
hdl
,
Ail
->
owner
,
b_aisle
,
b_col
,
Blj
->
registered
,
Blj
->
hdl
,
Blj
->
owner
);
if
(
!
values
)
{
clargs
=
malloc
(
sizeof
(
struct
cl_zgemm_args_s
));
clargs
->
alpha
=
alpha
;
clargs
->
beta
=
b_aisle
==
0
?
beta
:
1
.
0
;
}
}
else
if
(
verbose
){
printf
(
"[%d] insert. C_%d,%d (%d-%p-%d) += A_%d,%d (%d-%p-%d) B_%d,%d (%d-%p-%d)
\n
"
,
comm_rank
,
b_row
,
b_col
,
Cij
->
registered
,
Cij
->
hdl
,
Cij
->
owner
,
b_row
,
b_aisle
,
Ail
->
registered
,
Ail
->
hdl
,
Ail
->
owner
,
b_aisle
,
b_col
,
Blj
->
registered
,
Blj
->
hdl
,
Blj
->
owner
);
}
if
(
context
)
{
starpu_mpi_task_insert
(
MPI_COMM_WORLD
,
&
gemm_cl
,
STARPU_CL_ARGS
,
clargs
,
sizeof
(
struct
cl_zgemm_args_s
),
STARPU_SCHED_CTX
,
ctx
,
STARPU_R
,
Ail
->
hdl
,
STARPU_R
,
Blj
->
hdl
,
STARPU_RW
,
Cij
->
hdl
,
0
);
}
else
if
(
values
)
{
zbeta
=
b_aisle
==
0
?
beta
:
1
.
0
;
starpu_mpi_task_insert
(
MPI_COMM_WORLD
,
&
gemm_cl
,
STARPU_VALUE
,
&
alpha
,
sizeof
(
double
),
STARPU_VALUE
,
&
zbeta
,
sizeof
(
double
),
STARPU_R
,
Ail
->
hdl
,
STARPU_R
,
Blj
->
hdl
,
STARPU_RW
,
Cij
->
hdl
,
0
);
}
else
{
starpu_mpi_task_insert
(
MPI_COMM_WORLD
,
&
gemm_cl
,
STARPU_CL_ARGS
,
clargs
,
sizeof
(
struct
cl_zgemm_args_s
),
STARPU_R
,
Ail
->
hdl
,
STARPU_R
,
Blj
->
hdl
,
STARPU_RW
,
Cij
->
hdl
,
0
);
}
}
else
{
// printf("[%d] NOT inserted C_%d,%d += A_%d,%d B_%d,%d\n",comm_rank, b_row,b_col, b_row,b_aisle, b_aisle,b_col);
}
}
}
if
(
flush
)
{
for
(
b_aisle
=
0
;
b_aisle
<
KB
;
b_aisle
++
)
{
Ail
=
&
A
->
blocks
[
b_row
*
KB
+
b_aisle
];
if
(
Ail
->
registered
)
starpu_mpi_cache_flush
(
MPI_COMM_WORLD
,
Ail
->
hdl
);
}
}
}
Block
*
Ail
,
*
Blj
,
*
Cij
;
if
(
outer
)
{
for
(
b_aisle
=
0
;
b_aisle
<
KB
;
b_aisle
++
)
{
for
(
b_row
=
0
;
b_row
<
MB
;
b_row
++
)
{
for
(
b_col
=
0
;
b_col
<
NB
;
b_col
++
)
{
submit_gemm
(
b_row
,
b_col
,
b_aisle
,
alpha
,
beta
,
ctx
);
}
}
if
(
flush
)
{
for
(
b_row
=
0
;
b_row
<
MB
;
b_row
++
)
{
Ail
=
&
A
->
blocks
[
b_row
*
KB
+
b_aisle
];
if
(
Ail
->
registered
)
starpu_mpi_cache_flush
(
MPI_COMM_WORLD
,
Ail
->
hdl
);
}
for
(
b_col
=
0
;
b_col
<
NB
;
b_col
++
)
{
Blj
=
&
B
->
blocks
[
b_aisle
*
NB
+
b_col
];
if
(
Blj
->
registered
)
starpu_mpi_cache_flush
(
MPI_COMM_WORLD
,
Blj
->
hdl
);
}
}
}
}
else
{
for
(
b_row
=
0
;
b_row
<
MB
;
b_row
++
)
{
for
(
b_col
=
0
;
b_col
<
NB
;
b_col
++
)
{
for
(
b_aisle
=
0
;
b_aisle
<
KB
;
b_aisle
++
)
{
submit_gemm
(
b_row
,
b_col
,
b_aisle
,
alpha
,
beta
,
ctx
);
}
}
if
(
flush
)
{
for
(
b_aisle
=
0
;
b_aisle
<
KB
;
b_aisle
++
)
{
Ail
=
&
A
->
blocks
[
b_row
*
KB
+
b_aisle
];
if
(
Ail
->
registered
)
starpu_mpi_cache_flush
(
MPI_COMM_WORLD
,
Ail
->
hdl
);
}
}
}
}
// printf("[%d] finished submission\n",comm_rank);
starpu_mpi_wait_for_all
(
MPI_COMM_WORLD
);
barrier_ret
=
starpu_mpi_barrier
(
MPI_COMM_WORLD
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment