Mentions légales du service
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
V
vector_transpose
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Model registry
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Admin message
GitLab upgrade completed. Current version is 17.11.3.
Show more breadcrumbs
TOLLENAERE Nicolas
vector_transpose
Commits
73b4b018
Commit
73b4b018
authored
5 years ago
by
TOLLENAERE Nicolas
Browse files
Options
Downloads
Patches
Plain Diff
Unroll
parent
33345e46
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
Makefile
+3
-1
3 additions, 1 deletion
Makefile
micro_kernels.c
+0
-52
0 additions, 52 deletions
micro_kernels.c
transpose_blocks.c
+24
-7
24 additions, 7 deletions
transpose_blocks.c
with
27 additions
and
60 deletions
Makefile
+
3
−
1
View file @
73b4b018
CC
=
gcc
BUILD
=
./build
FLAGS
=
-Wall
-O
3
-march
=
native
FLAGS
=
-Wall
-O
2
-march
=
core-avx2
LFLAGS
=
-lm
SRC
=
transpose_blocks.c utils.c
OBJ
=
$(
SRC:.c
=
.o
)
...
...
@@ -16,3 +16,5 @@ $(BUILD)/transpose_blocks.exe:$(patsubst %.o, $(BUILD)/%.o,$(OBJ))
# Compile one C source file into its object file under $(BUILD).
# Every object is also rebuilt when micro_kernels.h or utils.h changes.
$(BUILD)/%.o: %.c micro_kernels.h utils.h
	$(CC) $(FLAGS) -c $< -o $@ $(LFLAGS)
# Remove every object file and executable produced under ${BUILD}.
# `clean` is declared phony so a stray file named "clean" cannot mask it, and
# `rm -f` keeps the target from failing when there is nothing to delete.
.PHONY: clean
clean:
	rm -f ${BUILD}/*.o ${BUILD}/*.exe
This diff is collapsed.
Click to expand it.
micro_kernels.c
+
0
−
52
View file @
73b4b018
...
...
@@ -137,58 +137,6 @@ void trans_unpack_out_8x8_size(float* __restrict__ mat, float* __restrict__ matT
_mm256_store_ps
(
&
matT
[
cst2
+
7
*
size
],
t7
);
}
/*
 * Transpose one 8x8 tile of floats.
 *
 * The source tile starts at mat[i * MAT_SIZE + j] and the destination tile at
 * matT[j * MAT_SIZE + i]; both matrices use a row stride of MAT_SIZE floats.
 * All eight source rows are loaded before anything is stored.
 *
 * The aligned AVX load/store intrinsics are used, so every row address that is
 * touched in mat and matT must be 32-byte aligned.
 */
void trans_unpack_out_8x8(float* __restrict__ mat, float* __restrict__ matT, int i, int j) {
    __m256 rows[8];
    __m256 work[8];

    const int src_base = i * MAT_SIZE + j;
    const int dst_base = j * MAT_SIZE + i;

    /* Load the eight source rows of the tile. */
    for (int k = 0; k < 8; k++) {
        rows[k] = _mm256_load_ps(&mat[src_base + k * MAT_SIZE]);
    }

    /* Step 1: interleave the 32-bit elements of each pair of adjacent rows. */
    for (int k = 0; k < 8; k += 2) {
        work[k]     = _mm256_unpacklo_ps(rows[k], rows[k + 1]);
        work[k + 1] = _mm256_unpackhi_ps(rows[k], rows[k + 1]);
    }

    /* Step 2: combine element pairs coming from rows 0-3 and from rows 4-7. */
    for (int half = 0; half < 8; half += 4) {
        rows[half + 0] = _mm256_shuffle_ps(work[half + 0], work[half + 2], _MM_SHUFFLE(1, 0, 1, 0));
        rows[half + 1] = _mm256_shuffle_ps(work[half + 0], work[half + 2], _MM_SHUFFLE(3, 2, 3, 2));
        rows[half + 2] = _mm256_shuffle_ps(work[half + 1], work[half + 3], _MM_SHUFFLE(1, 0, 1, 0));
        rows[half + 3] = _mm256_shuffle_ps(work[half + 1], work[half + 3], _MM_SHUFFLE(3, 2, 3, 2));
    }

    /*
     * Step 3: exchange 128-bit halves between the two groups.
     * 0x20 keeps the low halves of both operands, 0x31 keeps the high halves.
     */
    for (int k = 0; k < 4; k++) {
        work[k]     = _mm256_permute2f128_ps(rows[k], rows[k + 4], 0x20);
        work[k + 4] = _mm256_permute2f128_ps(rows[k], rows[k + 4], 0x31);
    }

    /* Store the eight transposed rows into the destination tile. */
    for (int k = 0; k < 8; k++) {
        _mm256_store_ps(&matT[dst_base + k * MAT_SIZE], work[k]);
    }
}
void
trans_blend_out_8x8
(
float
*
mat
,
float
*
matT
,
int
i
,
int
j
)
{
__m256
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
r7
;
__m256
t0
,
t1
,
t2
,
t3
,
t4
,
t5
,
t6
,
t7
;
...
...
This diff is collapsed.
Click to expand it.
transpose_blocks.c
+
24
−
7
View file @
73b4b018
...
...
@@ -20,16 +20,33 @@ void function_name(float* __restrict__ mat, float* __restrict__ matT){\
}\
}\
}
/*
 * UNROLL_J_LOOP(block_call, mat, matT, i)
 *
 * Expands to a single compound statement that invokes
 *     block_call(mat, matT, (i), j)
 * once for every j = 0, BLOCK_SIZE, 2 * BLOCK_SIZE, ... strictly below MAT_SIZE.
 *
 * The main loop is unrolled four times. The trailing loop handles the blocks
 * left over when MAT_SIZE is not a multiple of 4 * BLOCK_SIZE, so block_call
 * is never invoked with j >= MAT_SIZE. When MAT_SIZE is a multiple of
 * 4 * BLOCK_SIZE the trailing loop does not execute.
 *
 * A plain brace block (rather than do/while(0)) is used so the macro still
 * expands to a complete statement whether or not the caller appends ';'.
 */
#define UNROLL_J_LOOP(block_call, mat, matT, i) \
{\
    int j = 0;\
    for (; j + 4 * (BLOCK_SIZE) <= (MAT_SIZE); j += 4 * (BLOCK_SIZE)) {\
        block_call(mat, matT, (i), j);\
        block_call(mat, matT, (i), j + (BLOCK_SIZE));\
        block_call(mat, matT, (i), j + 2 * (BLOCK_SIZE));\
        block_call(mat, matT, (i), j + 3 * (BLOCK_SIZE));\
    }\
    for (; j < (MAT_SIZE); j += (BLOCK_SIZE)) {\
        block_call(mat, matT, (i), j);\
    }\
}
// Kernels declaration
LOOP_OVER
(
transpose_blocks_naive
,
copy_naive_out_bxb
)
LOOP_OVER
(
transpose_blocks_sse_4x4
,
copy_sse_4x4
)
LOOP_OVER
(
sse_transpose
,
trans_unpack_out_4x4
)
/*
 * UNROLL_4_OVER(function_name, block_call)
 *
 * Defines `void function_name(float* mat, float* matT)`, which steps a block
 * row index from 0 up to MAT_SIZE in increments of BLOCK_SIZE and expands
 * UNROLL_J_LOOP for each of those rows.
 *
 * block_call must be usable as: void block_call(float*, float*, int i, int j)
 */
#define UNROLL_4_OVER(function_name, block_call) \
void function_name(float* __restrict__ mat, float* __restrict__ matT){\
    for (int row = 0; row < MAT_SIZE; row += BLOCK_SIZE) {\
        UNROLL_J_LOOP(block_call, mat, matT, row);\
    }\
}
//LOOP_OVER(transpose_blocks_naive, copy_naive_out_bxb)
UNROLL_4_OVER
(
transpose_blocks_naive
,
copy_naive_out_bxb
)
//LOOP_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
UNROLL_4_OVER
(
transpose_blocks_sse_4x4
,
copy_sse_4x4
)
#if HAS_AVX
LOOP_OVER
(
transpose_blocks_avx_8x8
,
copy_avx_8x8
)
LOOP_OVER
(
copy_all_avx_8x8
,
copy_contiguous_avx_8x8
)
LOOP_OVER
(
real_transpose
,
trans_unpack_out_8x8
)
//LOOP_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
UNROLL_4_OVER
(
transpose_blocks_avx_8x8
,
copy_avx_8x8
)
//LOOP_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
UNROLL_4_OVER
(
copy_all_avx_8x8
,
copy_contiguous_avx_8x8
)
//LOOP_OVER(real_transpose, trans_unpack_out_8x8)
UNROLL_4_OVER
(
real_transpose
,
trans_unpack_out_8x8
)
#endif
long
NBYTES
=
MAT_SIZE
*
MAT_SIZE
*
sizeof
(
float
);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment