From 73b4b01821c223cbd2580f777716b51c0444892d Mon Sep 17 00:00:00 2001
From: Nicolas Tollenaere <nicolas.tollenaere@inria.fr>
Date: Tue, 10 Sep 2019 14:21:10 +0200
Subject: [PATCH] Unroll

---
 Makefile           |  4 +++-
 micro_kernels.c    | 52 ----------------------------------------------
 transpose_blocks.c | 31 ++++++++++++++++++++-------
 3 files changed, 27 insertions(+), 60 deletions(-)

diff --git a/Makefile b/Makefile
index 7db3379..3cf2849 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CC=gcc
 BUILD=./build
-FLAGS=-Wall -O3 -march=native
+FLAGS=-Wall -O2 -march=core-avx2
 LFLAGS=-lm
 SRC=transpose_blocks.c utils.c
 OBJ= $(SRC:.c=.o)
@@ -16,3 +16,5 @@ $(BUILD)/transpose_blocks.exe:$(patsubst %.o, $(BUILD)/%.o,$(OBJ))
 $(BUILD)/%.o:%.c micro_kernels.h utils.h
 	$(CC) $(FLAGS) -o $@ -c $< $(LFLAGS)
 
+clean:
+	rm ${BUILD}/*.o ${BUILD}/*.exe
diff --git a/micro_kernels.c b/micro_kernels.c
index de11607..ebda448 100644
--- a/micro_kernels.c
+++ b/micro_kernels.c
@@ -137,58 +137,6 @@ void trans_unpack_out_8x8_size(float* __restrict__ mat, float* __restrict__ matT
     _mm256_store_ps(&matT[cst2 + 7 * size], t7);
 }
 
-void trans_unpack_out_8x8(float* __restrict__ mat, float* __restrict__ matT, int i, int j) {
-    __m256 r0, r1, r2, r3, r4, r5, r6, r7;
-    __m256 t0, t1, t2, t3, t4, t5, t6, t7;
-
-    int cst = i * MAT_SIZE + j;
-    r0 = _mm256_load_ps(&mat[cst + 0 * MAT_SIZE]);
-    r1 = _mm256_load_ps(&mat[cst + 1 * MAT_SIZE]);
-    r2 = _mm256_load_ps(&mat[cst + 2 * MAT_SIZE]);
-    r3 = _mm256_load_ps(&mat[cst + 3 * MAT_SIZE]);
-    r4 = _mm256_load_ps(&mat[cst + 4 * MAT_SIZE]);
-    r5 = _mm256_load_ps(&mat[cst + 5 * MAT_SIZE]);
-    r6 = _mm256_load_ps(&mat[cst + 6 * MAT_SIZE]);
-    r7 = _mm256_load_ps(&mat[cst + 7 * MAT_SIZE]);
-
-    t0 = _mm256_unpacklo_ps(r0, r1);
-    t1 = _mm256_unpackhi_ps(r0, r1);
-    t2 = _mm256_unpacklo_ps(r2, r3);
-    t3 = _mm256_unpackhi_ps(r2, r3);
-    t4 = _mm256_unpacklo_ps(r4, r5);
-    t5 = _mm256_unpackhi_ps(r4, r5);
-    t6 = _mm256_unpacklo_ps(r6, r7);
-    t7 = _mm256_unpackhi_ps(r6, r7);
-
-    r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));
-    r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));
-    r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));
-    r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));
-    r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));
-    r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));
-    r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));
-    r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));
-
-    t0 = _mm256_permute2f128_ps(r0, r4, 0x20);
-    t1 = _mm256_permute2f128_ps(r1, r5, 0x20);
-    t2 = _mm256_permute2f128_ps(r2, r6, 0x20);
-    t3 = _mm256_permute2f128_ps(r3, r7, 0x20);
-    t4 = _mm256_permute2f128_ps(r0, r4, 0x31);
-    t5 = _mm256_permute2f128_ps(r1, r5, 0x31);
-    t6 = _mm256_permute2f128_ps(r2, r6, 0x31);
-    t7 = _mm256_permute2f128_ps(r3, r7, 0x31);
-
-    int cst2 = j * MAT_SIZE + i;
-    _mm256_store_ps(&matT[cst2 + 0 * MAT_SIZE], t0);
-    _mm256_store_ps(&matT[cst2 + 1 * MAT_SIZE], t1);
-    _mm256_store_ps(&matT[cst2 + 2 * MAT_SIZE], t2);
-    _mm256_store_ps(&matT[cst2 + 3 * MAT_SIZE], t3);
-    _mm256_store_ps(&matT[cst2 + 4 * MAT_SIZE], t4);
-    _mm256_store_ps(&matT[cst2 + 5 * MAT_SIZE], t5);
-    _mm256_store_ps(&matT[cst2 + 6 * MAT_SIZE], t6);
-    _mm256_store_ps(&matT[cst2 + 7 * MAT_SIZE], t7);
-}
-
 void trans_blend_out_8x8(float* mat, float* matT, int i, int j) {
     __m256 r0, r1, r2, r3, r4, r5, r6, r7;
     __m256 t0, t1, t2, t3, t4, t5, t6, t7;
diff --git a/transpose_blocks.c b/transpose_blocks.c
index 79af953..31dd89e 100644
--- a/transpose_blocks.c
+++ b/transpose_blocks.c
@@ -20,16 +20,33 @@ void function_name(float* __restrict__ mat, float* __restrict__ matT){\
         }\
     }\
 }
+#define UNROLL_J_LOOP(block_call, mat, matT, i) \
+    for (int j = 0; j < MAT_SIZE; j += BLOCK_SIZE * 4) {\
+        block_call(mat, matT, (i), j);\
+        block_call(mat, matT, (i), j + BLOCK_SIZE);\
+        block_call(mat, matT, (i), j + 2 * BLOCK_SIZE);\
+        block_call(mat, matT, (i), j + 3 * BLOCK_SIZE);\
+    }
 
-// Kernels declaration
-LOOP_OVER(transpose_blocks_naive, copy_naive_out_bxb)
-LOOP_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
-LOOP_OVER(sse_transpose, trans_unpack_out_4x4)
+#define UNROLL_4_OVER(function_name, block_call/*void block_call(float*, float*, int i, int j)*/) \
+void function_name(float* __restrict__ mat, float* __restrict__ matT){\
+    for (int i = 0; i < MAT_SIZE; i += BLOCK_SIZE) {\
+        UNROLL_J_LOOP(block_call, mat, matT, i);\
+    }\
+}
+
+//LOOP_OVER(transpose_blocks_naive, copy_naive_out_bxb)
+UNROLL_4_OVER(transpose_blocks_naive, copy_naive_out_bxb)
+//LOOP_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
+UNROLL_4_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
 
 #if HAS_AVX
-LOOP_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
-LOOP_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
-LOOP_OVER(real_transpose, trans_unpack_out_8x8)
+//LOOP_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
+UNROLL_4_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
+//LOOP_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
+UNROLL_4_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
+//LOOP_OVER(real_transpose, trans_unpack_out_8x8)
+UNROLL_4_OVER(real_transpose, trans_unpack_out_8x8)
 #endif
 
 long NBYTES = MAT_SIZE * MAT_SIZE * sizeof(float);
-- 
GitLab
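For reference, a rough sketch of what one of the new declarations becomes after preprocessing, using UNROLL_4_OVER(transpose_blocks_naive, copy_naive_out_bxb) from this patch as the example. MAT_SIZE, BLOCK_SIZE and copy_naive_out_bxb are assumed to be the constants and micro-kernel already defined elsewhere in the repository, with MAT_SIZE a multiple of 4 * BLOCK_SIZE so the unrolled inner loop stays in bounds:

    /* Approximate expansion of UNROLL_4_OVER(transpose_blocks_naive, copy_naive_out_bxb):
     * the j loop now advances by 4 * BLOCK_SIZE per iteration and the four
     * block calls are written out explicitly (4x unroll of the inner loop). */
    void transpose_blocks_naive(float* __restrict__ mat, float* __restrict__ matT){
        for (int i = 0; i < MAT_SIZE; i += BLOCK_SIZE) {
            for (int j = 0; j < MAT_SIZE; j += BLOCK_SIZE * 4) {
                copy_naive_out_bxb(mat, matT, (i), j);
                copy_naive_out_bxb(mat, matT, (i), j + BLOCK_SIZE);
                copy_naive_out_bxb(mat, matT, (i), j + 2 * BLOCK_SIZE);
                copy_naive_out_bxb(mat, matT, (i), j + 3 * BLOCK_SIZE);
            }
        }
    }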