Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
solverstack
Chameleon
Commits
96ec99ec
Commit
96ec99ec
authored
May 19, 2019
by
Mathieu Faverge
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'feature/gemm_summa' into 'master'
SUMMA GEMM See merge request
!154
parents
e02d3679
ef90cff0
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
1134 additions
and
226 deletions
+1134
-226
compute/pzgemm.c
compute/pzgemm.c
+208
-22
compute/pzhemm.c
compute/pzhemm.c
+264
-29
compute/pzsymm.c
compute/pzsymm.c
+352
-147
control/context.c
control/context.c
+21
-1
control/descriptor.c
control/descriptor.c
+21
-0
control/descriptor.h
control/descriptor.h
+2
-23
include/chameleon/constants.h
include/chameleon/constants.h
+2
-0
include/chameleon/struct.h
include/chameleon/struct.h
+2
-0
runtime/starpu/control/runtime_context.c
runtime/starpu/control/runtime_context.c
+4
-0
runtime/starpu/control/runtime_profiling.c
runtime/starpu/control/runtime_profiling.c
+2
-2
testing/testing_dgram.c
testing/testing_dgram.c
+1
-2
timing/CMakeLists.txt
timing/CMakeLists.txt
+2
-0
timing/time_zhemm_tile.c
timing/time_zhemm_tile.c
+87
-0
timing/time_zsymm_tile.c
timing/time_zsymm_tile.c
+87
-0
timing/timing_zauxiliary.c
timing/timing_zauxiliary.c
+64
-0
timing/timing_zauxiliary.h
timing/timing_zauxiliary.h
+15
-0
No files found.
compute/pzgemm.c
View file @
96ec99ec
...
...
@@ -26,16 +26,176 @@
#define A(m, n) A, m, n
#define B(m, n) B, m, n
#define C(m, n) C, m, n
#define WA(m, n) &WA, m, n
#define WB(m, n) &WB, m, n
/**
* Parallel tile matrix-matrix multiplication - dynamic scheduling
* Parallel tile matrix-matrix multiplication
* SUMMA algorithm for 2D block-cyclic data distribution.
*/
void
chameleon_pzgemm
(
cham_trans_t
transA
,
cham_trans_t
transB
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_sequence_t
*
sequence
,
RUNTIME_request_t
*
request
)
static
inline
void
chameleon_pzgemm_summa
(
CHAM_context_t
*
chamctxt
,
cham_trans_t
transA
,
cham_trans_t
transB
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_option_t
*
options
)
{
CHAM_context_t
*
chamctxt
;
RUNTIME_option_t
options
;
RUNTIME_sequence_t
*
sequence
=
options
->
sequence
;
int
m
,
n
,
k
,
p
,
q
,
KT
,
K
,
lp
,
lq
;
int
ldam
,
ldak
,
ldbn
,
ldbk
,
ldcm
;
int
tempmm
,
tempnn
,
tempkk
;
int
lookahead
,
myp
,
myq
;
CHAMELEON_Complex64_t
zbeta
;
CHAMELEON_Complex64_t
zone
=
(
CHAMELEON_Complex64_t
)
1
.
0
;
CHAM_desc_t
WA
,
WB
;
lookahead
=
chamctxt
->
lookahead
;
chameleon_desc_init
(
&
WA
,
CHAMELEON_MAT_ALLOC_TILE
,
ChamComplexDouble
,
C
->
mb
,
C
->
nb
,
(
C
->
mb
*
C
->
nb
),
C
->
mt
*
C
->
mb
,
C
->
nb
*
C
->
q
*
lookahead
,
0
,
0
,
C
->
mt
*
C
->
mb
,
C
->
nb
*
C
->
q
*
lookahead
,
C
->
p
,
C
->
q
,
NULL
,
NULL
,
NULL
);
chameleon_desc_init
(
&
WB
,
CHAMELEON_MAT_ALLOC_TILE
,
ChamComplexDouble
,
C
->
mb
,
C
->
nb
,
(
C
->
mb
*
C
->
nb
),
C
->
mb
*
C
->
p
*
lookahead
,
C
->
nt
*
C
->
nb
,
0
,
0
,
C
->
mb
*
C
->
p
*
lookahead
,
C
->
nt
*
C
->
nb
,
C
->
p
,
C
->
q
,
NULL
,
NULL
,
NULL
);
KT
=
transA
==
ChamNoTrans
?
A
->
nt
:
A
->
mt
;
K
=
transA
==
ChamNoTrans
?
A
->
n
:
A
->
m
;
myp
=
A
->
myrank
/
A
->
q
;
myq
=
A
->
myrank
%
A
->
q
;
/*
* A: ChamNoTrans / B: ChamNoTrans
*/
for
(
k
=
0
;
k
<
KT
;
k
++
)
{
lp
=
(
k
%
lookahead
)
*
C
->
p
;
lq
=
(
k
%
lookahead
)
*
C
->
q
;
tempkk
=
k
==
KT
-
1
?
K
-
k
*
A
->
nb
:
A
->
nb
;
zbeta
=
k
==
0
?
beta
:
zone
;
ldak
=
BLKLDD
(
A
,
k
);
ldbk
=
BLKLDD
(
B
,
k
);
/* Transfert ownership of the k column of A */
for
(
m
=
0
;
m
<
C
->
mt
;
m
++
)
{
tempmm
=
m
==
C
->
mt
-
1
?
C
->
m
-
m
*
C
->
mb
:
C
->
mb
;
ldam
=
BLKLDD
(
A
,
m
);
if
(
transA
==
ChamNoTrans
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempmm
,
tempkk
,
C
->
mb
,
A
(
m
,
k
),
ldam
,
WA
(
m
,
(
k
%
C
->
q
)
+
lq
),
WA
.
mb
);
RUNTIME_data_flush
(
sequence
,
A
(
m
,
k
)
);
for
(
q
=
1
;
q
<
C
->
q
;
q
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempmm
,
tempkk
,
C
->
mb
,
WA
(
m
,
((
k
+
q
-
1
)
%
C
->
q
)
+
lq
),
WA
.
mb
,
WA
(
m
,
((
k
+
q
)
%
C
->
q
)
+
lq
),
WA
.
mb
);
}
}
else
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempmm
,
C
->
mb
,
A
(
k
,
m
),
ldak
,
WA
(
m
,
(
k
%
C
->
q
)
+
lq
),
WA
.
mb
);
RUNTIME_data_flush
(
sequence
,
A
(
k
,
m
)
);
for
(
q
=
1
;
q
<
C
->
q
;
q
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempmm
,
C
->
mb
,
WA
(
m
,
((
k
+
q
-
1
)
%
C
->
q
)
+
lq
),
WA
.
mb
,
WA
(
m
,
((
k
+
q
)
%
C
->
q
)
+
lq
),
WA
.
mb
);
}
}
}
/* Transfert ownership of the k row of B */
for
(
n
=
0
;
n
<
C
->
nt
;
n
++
)
{
tempnn
=
n
==
C
->
nt
-
1
?
C
->
n
-
n
*
C
->
nb
:
C
->
nb
;
ldbn
=
BLKLDD
(
B
,
n
);
if
(
transB
==
ChamNoTrans
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempnn
,
C
->
mb
,
B
(
k
,
n
),
ldbk
,
WB
(
(
k
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
RUNTIME_data_flush
(
sequence
,
B
(
k
,
n
)
);
for
(
p
=
1
;
p
<
C
->
p
;
p
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempnn
,
C
->
mb
,
WB
(
((
k
+
p
-
1
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
,
WB
(
((
k
+
p
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
}
}
else
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempnn
,
tempkk
,
C
->
mb
,
B
(
n
,
k
),
ldbn
,
WB
(
(
k
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
RUNTIME_data_flush
(
sequence
,
B
(
n
,
k
)
);
for
(
p
=
1
;
p
<
C
->
p
;
p
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempnn
,
tempkk
,
C
->
mb
,
WB
(
((
k
+
p
-
1
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
,
WB
(
((
k
+
p
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
}
}
}
for
(
m
=
myp
;
m
<
C
->
mt
;
m
+=
C
->
p
)
{
tempmm
=
m
==
C
->
mt
-
1
?
C
->
m
-
m
*
C
->
mb
:
C
->
mb
;
ldcm
=
BLKLDD
(
C
,
m
);
for
(
n
=
myq
;
n
<
C
->
nt
;
n
+=
C
->
q
)
{
tempnn
=
n
==
C
->
nt
-
1
?
C
->
n
-
n
*
C
->
nb
:
C
->
nb
;
INSERT_TASK_zgemm
(
options
,
transA
,
transB
,
tempmm
,
tempnn
,
tempkk
,
A
->
mb
,
alpha
,
WA
(
m
,
myq
+
lq
),
WA
.
mb
,
/* lda * Z */
WB
(
myp
+
lp
,
n
),
WB
.
mb
,
/* ldb * Y */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
}
}
RUNTIME_desc_flush
(
&
WA
,
sequence
);
RUNTIME_desc_flush
(
&
WB
,
sequence
);
RUNTIME_desc_flush
(
C
,
sequence
);
chameleon_sequence_wait
(
chamctxt
,
sequence
);
chameleon_desc_destroy
(
&
WA
);
chameleon_desc_destroy
(
&
WB
);
}
/**
* Parallel tile matrix-matrix multiplication.
* Generic algorithm for any data distribution.
*/
static
inline
void
chameleon_pzgemm_generic
(
CHAM_context_t
*
chamctxt
,
cham_trans_t
transA
,
cham_trans_t
transB
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_option_t
*
options
)
{
RUNTIME_sequence_t
*
sequence
=
options
->
sequence
;
int
m
,
n
,
k
;
int
ldam
,
ldak
,
ldbn
,
ldbk
,
ldcm
;
...
...
@@ -44,12 +204,6 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
CHAMELEON_Complex64_t
zbeta
;
CHAMELEON_Complex64_t
zone
=
(
CHAMELEON_Complex64_t
)
1
.
0
;
chamctxt
=
chameleon_context_self
();
if
(
sequence
->
status
!=
CHAMELEON_SUCCESS
)
{
return
;
}
RUNTIME_options_init
(
&
options
,
chamctxt
,
sequence
,
request
);
for
(
m
=
0
;
m
<
C
->
mt
;
m
++
)
{
tempmm
=
m
==
C
->
mt
-
1
?
C
->
m
-
m
*
C
->
mb
:
C
->
mb
;
ldcm
=
BLKLDD
(
C
,
m
);
...
...
@@ -66,11 +220,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldbk
=
BLKLDD
(
B
,
k
);
zbeta
=
k
==
0
?
beta
:
zone
;
INSERT_TASK_zgemm
(
&
options
,
options
,
transA
,
transB
,
tempmm
,
tempnn
,
tempkn
,
A
->
mb
,
alpha
,
A
(
m
,
k
),
ldam
,
/* lda * Z */
B
(
k
,
n
),
ldbk
,
/* ldb * Y */
B
(
k
,
n
),
ldbk
,
/* ldb * Y */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
}
...
...
@@ -83,11 +237,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
tempkn
=
k
==
A
->
nt
-
1
?
A
->
n
-
k
*
A
->
nb
:
A
->
nb
;
zbeta
=
k
==
0
?
beta
:
zone
;
INSERT_TASK_zgemm
(
&
options
,
options
,
transA
,
transB
,
tempmm
,
tempnn
,
tempkn
,
A
->
mb
,
alpha
,
A
(
m
,
k
),
ldam
,
/* lda * Z */
B
(
n
,
k
),
ldbn
,
/* ldb * Z */
B
(
n
,
k
),
ldbn
,
/* ldb * Z */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
}
...
...
@@ -103,11 +257,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldbk
=
BLKLDD
(
B
,
k
);
zbeta
=
k
==
0
?
beta
:
zone
;
INSERT_TASK_zgemm
(
&
options
,
options
,
transA
,
transB
,
tempmm
,
tempnn
,
tempkm
,
A
->
mb
,
alpha
,
A
(
k
,
m
),
ldak
,
/* lda * X */
B
(
k
,
n
),
ldbk
,
/* ldb * Y */
B
(
k
,
n
),
ldbk
,
/* ldb * Y */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
}
...
...
@@ -121,11 +275,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
ldak
=
BLKLDD
(
A
,
k
);
zbeta
=
k
==
0
?
beta
:
zone
;
INSERT_TASK_zgemm
(
&
options
,
options
,
transA
,
transB
,
tempmm
,
tempnn
,
tempkm
,
A
->
mb
,
alpha
,
A
(
k
,
m
),
ldak
,
/* lda * X */
B
(
n
,
k
),
ldbn
,
/* ldb * Z */
B
(
n
,
k
),
ldbn
,
/* ldb * Z */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
}
...
...
@@ -142,5 +296,37 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
}
}
}
RUNTIME_options_finalize
(
&
options
,
chamctxt
);
(
void
)
chamctxt
;
}
/**
* Parallel tile matrix-matrix multiplication. wrapper.
*/
void
chameleon_pzgemm
(
cham_trans_t
transA
,
cham_trans_t
transB
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_sequence_t
*
sequence
,
RUNTIME_request_t
*
request
)
{
CHAM_context_t
*
chamctxt
;
RUNTIME_option_t
options
;
chamctxt
=
chameleon_context_self
();
if
(
sequence
->
status
!=
CHAMELEON_SUCCESS
)
{
return
;
}
RUNTIME_options_init
(
&
options
,
chamctxt
,
sequence
,
request
);
if
(
((
C
->
p
>
1
)
||
(
C
->
q
>
1
))
&&
(
C
->
get_rankof
==
chameleon_getrankof_2d
)
&&
(
chamctxt
->
generic_enabled
!=
CHAMELEON_TRUE
)
)
{
chameleon_pzgemm_summa
(
chamctxt
,
transA
,
transB
,
alpha
,
A
,
B
,
beta
,
C
,
&
options
);
}
else
{
chameleon_pzgemm_generic
(
chamctxt
,
transA
,
transB
,
alpha
,
A
,
B
,
beta
,
C
,
&
options
);
}
RUNTIME_options_finalize
(
&
options
,
chamctxt
);
}
compute/pzhemm.c
View file @
96ec99ec
...
...
@@ -23,20 +23,230 @@
*/
#include "control/common.h"
#define A(m,n) A, m, n
#define B(m,n) B, m, n
#define C(m,n) C, m, n
#define A(m, n) A, m, n
#define B(m, n) B, m, n
#define C(m, n) C, m, n
#define WA(m, n) &WA, m, n
#define WB(m, n) &WB, m, n
/**
* Parallel tile Hermitian matrix-matrix multiplication - dynamic scheduling
* Parallel tile hermitian matrix-matrix multiplication.
* SUMMA algorithm for 2D block-cyclic data distribution.
*/
void
chameleon_pzhemm
(
cham_side_t
side
,
cham_uplo_t
uplo
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_sequence_t
*
sequence
,
RUNTIME_request_t
*
request
)
static
inline
void
chameleon_pzhemm_summa
(
CHAM_context_t
*
chamctxt
,
cham_side_t
side
,
cham_uplo_t
uplo
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_option_t
*
options
)
{
CHAM_context_t
*
chamctxt
;
RUNTIME_option_t
options
;
RUNTIME_sequence_t
*
sequence
=
options
->
sequence
;
cham_trans_t
transA
,
transB
;
int
Am
,
An
,
m
,
n
,
k
,
p
,
q
,
KT
,
K
,
lp
,
lq
;
int
ldam
,
ldbk
,
ldbm
,
ldcm
;
int
tempmm
,
tempnn
,
tempkk
;
int
lookahead
,
myp
,
myq
;
CHAMELEON_Complex64_t
zbeta
;
CHAMELEON_Complex64_t
zone
=
(
CHAMELEON_Complex64_t
)
1
.
0
;
CHAM_desc_t
WA
,
WB
;
lookahead
=
chamctxt
->
lookahead
;
chameleon_desc_init
(
&
WA
,
CHAMELEON_MAT_ALLOC_TILE
,
ChamComplexDouble
,
C
->
mb
,
C
->
nb
,
(
C
->
mb
*
C
->
nb
),
C
->
mt
*
C
->
mb
,
C
->
nb
*
C
->
q
*
lookahead
,
0
,
0
,
C
->
mt
*
C
->
mb
,
C
->
nb
*
C
->
q
*
lookahead
,
C
->
p
,
C
->
q
,
NULL
,
NULL
,
NULL
);
chameleon_desc_init
(
&
WB
,
CHAMELEON_MAT_ALLOC_TILE
,
ChamComplexDouble
,
C
->
mb
,
C
->
nb
,
(
C
->
mb
*
C
->
nb
),
C
->
mb
*
C
->
p
*
lookahead
,
C
->
nt
*
C
->
nb
,
0
,
0
,
C
->
mb
*
C
->
p
*
lookahead
,
C
->
nt
*
C
->
nb
,
C
->
p
,
C
->
q
,
NULL
,
NULL
,
NULL
);
KT
=
side
==
ChamLeft
?
A
->
nt
:
A
->
mt
;
K
=
side
==
ChamLeft
?
A
->
n
:
A
->
m
;
myp
=
C
->
myrank
/
C
->
q
;
myq
=
C
->
myrank
%
C
->
q
;
for
(
k
=
0
;
k
<
KT
;
k
++
)
{
lp
=
(
k
%
lookahead
)
*
C
->
p
;
lq
=
(
k
%
lookahead
)
*
C
->
q
;
tempkk
=
k
==
KT
-
1
?
K
-
k
*
A
->
nb
:
A
->
nb
;
zbeta
=
k
==
0
?
beta
:
zone
;
ldbk
=
BLKLDD
(
B
,
k
);
/* Transfert ownership of the k column of A or B */
for
(
m
=
0
;
m
<
C
->
mt
;
m
++
)
{
tempmm
=
m
==
C
->
mt
-
1
?
C
->
m
-
m
*
C
->
mb
:
C
->
mb
;
if
(
side
==
ChamLeft
)
{
if
(
((
uplo
==
ChamUpper
)
&&
(
m
>
k
))
||
((
uplo
==
ChamLower
)
&&
(
m
<
k
))
)
{
Am
=
k
;
An
=
m
;
}
else
{
Am
=
m
;
An
=
k
;
}
ldam
=
BLKLDD
(
A
,
Am
);
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempmm
,
tempkk
,
C
->
mb
,
A
(
Am
,
An
),
ldam
,
WA
(
m
,
(
k
%
C
->
q
)
+
lq
),
WA
.
mb
);
RUNTIME_data_flush
(
sequence
,
A
(
Am
,
An
)
);
for
(
q
=
1
;
q
<
C
->
q
;
q
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempmm
,
tempkk
,
C
->
mb
,
WA
(
m
,
((
k
+
q
-
1
)
%
C
->
q
)
+
lq
),
WA
.
mb
,
WA
(
m
,
((
k
+
q
)
%
C
->
q
)
+
lq
),
WA
.
mb
);
}
}
else
{
ldbm
=
BLKLDD
(
B
,
m
);
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempmm
,
tempkk
,
C
->
mb
,
B
(
m
,
k
),
ldbm
,
WA
(
m
,
(
k
%
C
->
q
)
+
lq
),
WA
.
mb
);
RUNTIME_data_flush
(
sequence
,
B
(
m
,
k
)
);
for
(
q
=
1
;
q
<
C
->
q
;
q
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempmm
,
tempkk
,
C
->
mb
,
WA
(
m
,
((
k
+
q
-
1
)
%
C
->
q
)
+
lq
),
WA
.
mb
,
WA
(
m
,
((
k
+
q
)
%
C
->
q
)
+
lq
),
WA
.
mb
);
}
}
}
/* Transfert ownership of the k row of B, or A */
for
(
n
=
0
;
n
<
C
->
nt
;
n
++
)
{
tempnn
=
n
==
C
->
nt
-
1
?
C
->
n
-
n
*
C
->
nb
:
C
->
nb
;
if
(
side
==
ChamRight
)
{
if
(
((
uplo
==
ChamUpper
)
&&
(
n
<
k
))
||
((
uplo
==
ChamLower
)
&&
(
n
>
k
))
)
{
Am
=
n
;
An
=
k
;
}
else
{
Am
=
k
;
An
=
n
;
}
ldam
=
BLKLDD
(
A
,
Am
);
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempnn
,
C
->
mb
,
A
(
Am
,
An
),
ldam
,
WB
(
(
k
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
RUNTIME_data_flush
(
sequence
,
A
(
Am
,
An
)
);
for
(
p
=
1
;
p
<
C
->
p
;
p
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempnn
,
C
->
mb
,
WB
(
((
k
+
p
-
1
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
,
WB
(
((
k
+
p
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
}
}
else
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempnn
,
C
->
mb
,
B
(
k
,
n
),
ldbk
,
WB
(
(
k
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
RUNTIME_data_flush
(
sequence
,
B
(
k
,
n
)
);
for
(
p
=
1
;
p
<
C
->
p
;
p
++
)
{
INSERT_TASK_zlacpy
(
options
,
ChamUpperLower
,
tempkk
,
tempnn
,
C
->
mb
,
WB
(
((
k
+
p
-
1
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
,
WB
(
((
k
+
p
)
%
C
->
p
)
+
lp
,
n
),
WB
.
mb
);
}
}
}
/*
* ChamLeft / ChamLower
*/
for
(
m
=
myp
;
m
<
C
->
mt
;
m
+=
C
->
p
)
{
tempmm
=
m
==
C
->
mt
-
1
?
C
->
m
-
m
*
C
->
mb
:
C
->
mb
;
ldcm
=
BLKLDD
(
C
,
m
);
for
(
n
=
myq
;
n
<
C
->
nt
;
n
+=
C
->
q
)
{
tempnn
=
n
==
C
->
nt
-
1
?
C
->
n
-
n
*
C
->
nb
:
C
->
nb
;
if
(
side
==
ChamLeft
)
{
transB
=
ChamNoTrans
;
if
(
((
uplo
==
ChamUpper
)
&&
(
m
>
k
))
||
((
uplo
==
ChamLower
)
&&
(
m
<
k
))
)
{
transA
=
ChamConjTrans
;
}
else
{
transA
=
ChamNoTrans
;
}
}
else
{
transA
=
ChamNoTrans
;
if
(
((
uplo
==
ChamUpper
)
&&
(
n
<
k
))
||
((
uplo
==
ChamLower
)
&&
(
n
>
k
))
)
{
transB
=
ChamConjTrans
;
}
else
{
transB
=
ChamNoTrans
;
}
}
if
(
k
==
m
)
{
INSERT_TASK_zhemm
(
options
,
side
,
uplo
,
tempmm
,
tempnn
,
A
->
mb
,
alpha
,
WA
(
m
,
myq
+
lq
),
WA
.
mb
,
/* lda * Z */
WB
(
myp
+
lp
,
n
),
WB
.
mb
,
/* ldb * Y */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
else
{
INSERT_TASK_zgemm
(
options
,
transA
,
transB
,
tempmm
,
tempnn
,
tempkk
,
A
->
mb
,
alpha
,
WA
(
m
,
myq
+
lq
),
WA
.
mb
,
/* lda * Z */
WB
(
myp
+
lp
,
n
),
WB
.
mb
,
/* ldb * Y */
zbeta
,
C
(
m
,
n
),
ldcm
);
/* ldc * Y */
}
}
}
}
RUNTIME_desc_flush
(
&
WA
,
sequence
);
RUNTIME_desc_flush
(
&
WB
,
sequence
);
RUNTIME_desc_flush
(
C
,
sequence
);
chameleon_sequence_wait
(
chamctxt
,
sequence
);
chameleon_desc_destroy
(
&
WA
);
chameleon_desc_destroy
(
&
WB
);
}
/**
* Parallel tile hermitian matrix-matrix multiplication.
* Generic algorithm for any data distribution.
*/
static
inline
void
chameleon_pzhemm_generic
(
CHAM_context_t
*
chamctxt
,
cham_side_t
side
,
cham_uplo_t
uplo
,
CHAMELEON_Complex64_t
alpha
,
CHAM_desc_t
*
A
,
CHAM_desc_t
*
B
,
CHAMELEON_Complex64_t
beta
,
CHAM_desc_t
*
C
,
RUNTIME_option_t
*
options
)
{
int
k
,
m
,
n
;
int
ldam
,
ldan
,
ldak
,
ldbk
,
ldbm
,
ldcm
;
int
tempmm
,
tempnn
,
tempkn
,
tempkm
;
...
...
@@ -44,12 +254,6 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
CHAMELEON_Complex64_t
zbeta
;
CHAMELEON_Complex64_t
zone
=
(
CHAMELEON_Complex64_t
)
1
.
0
;
chamctxt
=
chameleon_context_self
();
if
(
sequence
->
status
!=
CHAMELEON_SUCCESS
)
{
return
;
}
RUNTIME_options_init
(
&
options
,
chamctxt
,
sequence
,
request
);
for
(
m
=
0
;
m
<
C
->
mt
;
m
++
)
{
tempmm
=
m
==
C
->
mt
-
1
?
C
->
m
-
m
*
C
->
mb
:
C
->
mb
;
ldcm
=
BLKLDD
(
C
,
m
);
...
...
@@ -68,7 +272,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta
=
k
==
0
?
beta
:
zone
;
if
(
k
<
m
)
{
INSERT_TASK_zgemm
(
&
options
,
options
,
ChamNoTrans
,
ChamNoTrans
,
tempmm
,
tempnn
,
tempkm
,
A
->
mb
,
alpha
,
A
(
m
,
k
),
ldam
,
/* lda * K */
...
...
@@ -78,7 +282,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else
{
if
(
k
==
m
)
{
INSERT_TASK_zhemm
(
&
options
,
options
,
side
,
uplo
,
tempmm
,
tempnn
,
A
->
mb
,
alpha
,
A
(
k
,
k
),
ldak
,
/* ldak * X */
...
...
@@ -87,7 +291,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else
{
INSERT_TASK_zgemm
(
&
options
,
options
,
ChamConjTrans
,
ChamNoTrans
,
tempmm
,
tempnn
,
tempkm
,
A
->
mb
,
alpha
,
A
(
k
,
m
),
ldak
,
/* ldak * X */
...
...
@@ -108,7 +312,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta
=
k
==
0
?
beta
:
zone
;
if
(
k
<
m
)
{
INSERT_TASK_zgemm
(
&
options
,
options
,
ChamConjTrans
,
ChamNoTrans
,
tempmm
,
tempnn
,
tempkm
,
A
->
mb
,
alpha
,
A
(
k
,
m
),
ldak
,
/* ldak * X */
...
...
@@ -118,7 +322,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
else
{
if
(
k
==
m
)
{
INSERT_TASK_zhemm
(
&
options
,
options
,
side
,
uplo
,
tempmm
,
tempnn
,
A
->
mb
,
alpha
,
A
(
k
,
k
),
ldak
,
/* ldak * K */
...
...
@@ -127,7 +331,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
}
else
{
INSERT_TASK_zgemm
(
&
options
,
options
,
ChamNoTrans
,
ChamNoTrans
,
tempmm
,
tempnn
,
tempkm
,
A
->
mb
,
alpha
,
A
(
m
,
k
),
ldam
,
/* lda * K */
...
...
@@ -151,7 +355,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
zbeta
=
k
==
0
?
beta
:
zone
;
if
(
k
<
n
)
{
INSERT_TASK_zgemm
(
&
options
,
options
,
ChamNoTrans
,
ChamConjTrans
,
tempmm
,
tempnn
,
tempkn
,
A
->
mb
,