Commit 73b4b018 authored by TOLLENAERE Nicolas

Unroll

parent 33345e46
 CC=gcc
 BUILD=./build
-FLAGS=-Wall -O3 -march=native
+FLAGS=-Wall -O2 -march=core-avx2
 LFLAGS=-lm
 SRC=transpose_blocks.c utils.c
 OBJ= $(SRC:.c=.o)
@@ -16,3 +16,5 @@ $(BUILD)/transpose_blocks.exe:$(patsubst %.o, $(BUILD)/%.o,$(OBJ))
+$(BUILD)/%.o:%.c micro_kernels.h utils.h
+	$(CC) $(FLAGS) -o $@ -c $< $(LFLAGS)
 clean:
 	rm ${BUILD}/*.o ${BUILD}/*.exe
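The new -march=core-avx2 flag pins code generation to Haswell-class CPUs, so the AVX intrinsics used below compile without extra flags. With this Makefile, running mkdir -p build and then make should produce ./build/transpose_blocks.exe; this assumes the $(BUILD)/transpose_blocks.exe rule shown in the hunk header is the first (and therefore default) target, and that no rule creates the build directory itself.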
@@ -137,58 +137,6 @@ void trans_unpack_out_8x8_size(float* __restrict__ mat, float* __restrict__ matT
     _mm256_store_ps(&matT[cst2 + 7 * size], t7);
 }
-void trans_unpack_out_8x8(float* __restrict__ mat, float* __restrict__ matT, int i, int j) {
-    __m256 r0, r1, r2, r3, r4, r5, r6, r7;
-    __m256 t0, t1, t2, t3, t4, t5, t6, t7;
-    int cst = i * MAT_SIZE + j;
-    r0 = _mm256_load_ps(&mat[cst + 0 * MAT_SIZE]);
-    r1 = _mm256_load_ps(&mat[cst + 1 * MAT_SIZE]);
-    r2 = _mm256_load_ps(&mat[cst + 2 * MAT_SIZE]);
-    r3 = _mm256_load_ps(&mat[cst + 3 * MAT_SIZE]);
-    r4 = _mm256_load_ps(&mat[cst + 4 * MAT_SIZE]);
-    r5 = _mm256_load_ps(&mat[cst + 5 * MAT_SIZE]);
-    r6 = _mm256_load_ps(&mat[cst + 6 * MAT_SIZE]);
-    r7 = _mm256_load_ps(&mat[cst + 7 * MAT_SIZE]);
-    t0 = _mm256_unpacklo_ps(r0, r1);
-    t1 = _mm256_unpackhi_ps(r0, r1);
-    t2 = _mm256_unpacklo_ps(r2, r3);
-    t3 = _mm256_unpackhi_ps(r2, r3);
-    t4 = _mm256_unpacklo_ps(r4, r5);
-    t5 = _mm256_unpackhi_ps(r4, r5);
-    t6 = _mm256_unpacklo_ps(r6, r7);
-    t7 = _mm256_unpackhi_ps(r6, r7);
-    r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));
-    r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));
-    r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));
-    r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));
-    r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));
-    r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));
-    r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));
-    r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));
-    t0 = _mm256_permute2f128_ps(r0, r4, 0x20);
-    t1 = _mm256_permute2f128_ps(r1, r5, 0x20);
-    t2 = _mm256_permute2f128_ps(r2, r6, 0x20);
-    t3 = _mm256_permute2f128_ps(r3, r7, 0x20);
-    t4 = _mm256_permute2f128_ps(r0, r4, 0x31);
-    t5 = _mm256_permute2f128_ps(r1, r5, 0x31);
-    t6 = _mm256_permute2f128_ps(r2, r6, 0x31);
-    t7 = _mm256_permute2f128_ps(r3, r7, 0x31);
-    int cst2 = j * MAT_SIZE + i;
-    _mm256_store_ps(&matT[cst2 + 0 * MAT_SIZE], t0);
-    _mm256_store_ps(&matT[cst2 + 1 * MAT_SIZE], t1);
-    _mm256_store_ps(&matT[cst2 + 2 * MAT_SIZE], t2);
-    _mm256_store_ps(&matT[cst2 + 3 * MAT_SIZE], t3);
-    _mm256_store_ps(&matT[cst2 + 4 * MAT_SIZE], t4);
-    _mm256_store_ps(&matT[cst2 + 5 * MAT_SIZE], t5);
-    _mm256_store_ps(&matT[cst2 + 6 * MAT_SIZE], t6);
-    _mm256_store_ps(&matT[cst2 + 7 * MAT_SIZE], t7);
-}
 void trans_blend_out_8x8(float* mat, float* matT, int i, int j) {
     __m256 r0, r1, r2, r3, r4, r5, r6, r7;
     __m256 t0, t1, t2, t3, t4, t5, t6, t7;
......
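The trans_unpack_out_8x8 kernel removed above transposes one 8x8 tile in three stages: _mm256_unpacklo_ps/_mm256_unpackhi_ps interleave adjacent row pairs within each 128-bit lane, _mm256_shuffle_ps gathers 4-wide column fragments, and _mm256_permute2f128_ps swaps the lanes to assemble full transposed rows. Note that _mm256_load_ps and _mm256_store_ps require 32-byte alignment, so both matrices must be allocated aligned and MAT_SIZE must be a multiple of 8. As a sketch for checking any SIMD variant, here is a scalar reference for the same tile, assuming the same row-major MAT_SIZE layout as the code above (the function name is illustrative):

void trans_scalar_8x8(const float* mat, float* matT, int i, int j) {
    /* Element (i+r, j+c) of mat lands at (j+c, i+r) of matT,
       matching the cst/cst2 addressing of the AVX kernel. */
    for (int r = 0; r < 8; r++)
        for (int c = 0; c < 8; c++)
            matT[(j + c) * MAT_SIZE + (i + r)] = mat[(i + r) * MAT_SIZE + (j + c)];
}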
@@ -20,16 +20,33 @@ void function_name(float* __restrict__ mat, float* __restrict__ matT){\
 }\
 }\
 }
+#define UNROLL_J_LOOP(block_call, mat, matT, i) \
+for (int j = 0; j < MAT_SIZE; j += BLOCK_SIZE * 4) {\
+    block_call(mat, matT, (i), j);\
+    block_call(mat, matT, (i), j + BLOCK_SIZE);\
+    block_call(mat, matT, (i), j + 2 * BLOCK_SIZE);\
+    block_call(mat, matT, (i), j + 3 * BLOCK_SIZE);\
+}
 // Kernel declarations
-LOOP_OVER(transpose_blocks_naive, copy_naive_out_bxb)
-LOOP_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
-LOOP_OVER(sse_transpose, trans_unpack_out_4x4)
+#define UNROLL_4_OVER(function_name, block_call/*void block_call(float*, float*, int i, int j)*/) \
+void function_name(float* __restrict__ mat, float* __restrict__ matT){\
+    for (int i = 0; i < MAT_SIZE; i += BLOCK_SIZE) {\
+        UNROLL_J_LOOP(block_call, mat, matT, i);\
+    }\
+}
+//LOOP_OVER(transpose_blocks_naive, copy_naive_out_bxb)
+UNROLL_4_OVER(transpose_blocks_naive, copy_naive_out_bxb)
+//LOOP_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
+UNROLL_4_OVER(transpose_blocks_sse_4x4, copy_sse_4x4)
 #if HAS_AVX
-LOOP_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
-LOOP_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
-LOOP_OVER(real_transpose, trans_unpack_out_8x8)
+//LOOP_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
+UNROLL_4_OVER(transpose_blocks_avx_8x8, copy_avx_8x8)
+//LOOP_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
+UNROLL_4_OVER(copy_all_avx_8x8, copy_contiguous_avx_8x8)
+//LOOP_OVER(real_transpose, trans_unpack_out_8x8)
+UNROLL_4_OVER(real_transpose, trans_unpack_out_8x8)
 #endif
 long NBYTES = MAT_SIZE * MAT_SIZE * sizeof(float);
......
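For a concrete sense of what the change does, here is what UNROLL_4_OVER(real_transpose, trans_unpack_out_8x8) expands to after preprocessing, assuming BLOCK_SIZE is 8 (its actual value is defined elsewhere in the repository):

void real_transpose(float* __restrict__ mat, float* __restrict__ matT) {
    for (int i = 0; i < MAT_SIZE; i += 8) {
        /* Each iteration of the unrolled loop handles a 32-column
           strip of row-block i: four independent 8x8 tile transposes
           that the compiler and CPU can schedule in parallel. */
        for (int j = 0; j < MAT_SIZE; j += 32) {
            trans_unpack_out_8x8(mat, matT, i, j);
            trans_unpack_out_8x8(mat, matT, i, j + 8);
            trans_unpack_out_8x8(mat, matT, i, j + 16);
            trans_unpack_out_8x8(mat, matT, i, j + 24);
        }
    }
}

Both the original LOOP_OVER and the unrolled form assume MAT_SIZE is a multiple of BLOCK_SIZE; the unrolled j loop additionally requires a multiple of 4 * BLOCK_SIZE, otherwise it would run past the matrix edge and a remainder loop would be needed.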