Commit 548b8c7f authored by TOLLENAERE Nicolas
Browse files

Other kernel (without packing)

parent 3a5371fe
......@@ -23,20 +23,19 @@
//#define J_SIZE (1 << 12)
//#define K_SIZE (1 << 9)
#define I_SIZE (3 * (1 << 10))
#define J_SIZE (3 * (1 << 10))
#define J_SIZE (3 * (1 << 8))
#define K_SIZE (3 * (1 << 10))
// A[i, k], B[k, j] and C[i, j] are all stored row-major
void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restrict__ C,
IND_TYPE I, IND_TYPE J, IND_TYPE K) {
/*
[V j; U (2, j); U (6, i); S (192, k); Pack_var [k]; Pack B; S (12, j);
Pack A; A i; A k; A j]
[V j; U (2, j); U (6, i); S (192, k); Pack_var [k]; S (12, j); A i; A k; A j]
*/
int i, iall, il, j, j0, jp_0, jall, jl, k, k0, kp_0, kg0, kl0;
float * B0 = (float *)aligned_alloc(32, sizeof(float) * 3072);
float * A0 = (float *)aligned_alloc(32, sizeof(float) * 1152);
int i, j, j0, jp_0, k, k0, kp_0;
assert(6 <= I);
assert(192 <= J);
assert(192 <= K);
......@@ -51,16 +50,6 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
for (i = 0;
i < I;
i += 6){
for (kg0 = k0, kl0 = 0;
kg0 < MIN(k0 + 192, K);
kg0 += 1, kl0 += 1){
for (iall = i, il = 0;
iall < MIN(i + 6, I);
iall += 1, il += 1){
scal_0 = A[kg0 + K * iall];
A0[kl0 + 192 * il] = scal_0;
}
}
//Tiling dim j by 12
for (j = j0, jp_0 = 0;
j < MIN(j0 + 192, J);
......@@ -77,33 +66,23 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
mem_vec_9 = _mm256_load_ps(&C[J * (i + 4) + j + 8]);
mem_vec_10 = _mm256_load_ps(&C[J * (i + 5) + j]);
mem_vec_11 = _mm256_load_ps(&C[J * (i + 5) + j + 8]);
for (jall = j, jl = 0;
jall < MIN(j + 16, J);
jall += 8, jl += 8){
for (kg0 = k0, kl0 = 0;
kg0 < MIN(k0 + 192, K);
kg0 += 1, kl0 += 1){
vec_0 = _mm256_load_ps(&B[jall + J * kg0]);
_mm256_store_ps(&B0[jl + 16 * kl0], vec_0);
}
}
//Tiling dim k by 192
for (k = k0, kp_0 = 0;
k < MIN(k0 + 192, K);
k += 1, kp_0 += 1){
scal_0 = A0[kp_0];
scal_0 = A[K * i + k];
vec_1 = _mm256_set1_ps(scal_0);
vec_2 = _mm256_load_ps(&B0[16 * kp_0]);
vec_2 = _mm256_load_ps(&B[J * k + j]);
vec_0 = _mm256_fmadd_ps(mem_vec_0, vec_1, vec_2);
mem_vec_0 = vec_0;
vec_4 = _mm256_load_ps(&B0[16 * kp_0 + 8]);
vec_4 = _mm256_load_ps(&B[J * k + j + 8]);
vec_3 = _mm256_fmadd_ps(mem_vec_1, vec_1, vec_4);
mem_vec_1 = vec_3;
scal_1 = A0[192 + kp_0];
scal_1 = A[K * (i + 1) + k];
vec_6 = _mm256_set1_ps(scal_1);
vec_5 = _mm256_fmadd_ps(mem_vec_2, vec_6, vec_2);
......@@ -114,7 +93,7 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
vec_7 = _mm256_fmadd_ps(mem_vec_3, vec_6, vec_4);
mem_vec_3 = vec_7;
scal_2 = A0[384 + kp_0];
scal_2 = A[K * (i + 2) + k];
vec_9 = _mm256_set1_ps(scal_2);
vec_8 = _mm256_fmadd_ps(mem_vec_4, vec_9, vec_2);
......@@ -125,7 +104,7 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
vec_10 = _mm256_fmadd_ps(mem_vec_5, vec_9, vec_4);
mem_vec_5 = vec_10;
scal_3 = A0[576 + kp_0];
scal_3 = A[K * (i + 3) + k];
vec_12 = _mm256_set1_ps(scal_3);
vec_11 = _mm256_fmadd_ps(mem_vec_6, vec_12, vec_2);
......@@ -136,7 +115,7 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
vec_13 = _mm256_fmadd_ps(mem_vec_7, vec_12, vec_4);
mem_vec_7 = vec_13;
scal_4 = A0[768 + kp_0];
scal_4 = A[K * (i + 4) + k];
vec_15 = _mm256_set1_ps(scal_4);
vec_14 = _mm256_fmadd_ps(mem_vec_8, vec_15, vec_2);
......@@ -147,7 +126,7 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
vec_16 = _mm256_fmadd_ps(mem_vec_9, vec_15, vec_4);
mem_vec_9 = vec_16;
scal_5 = A0[960 + kp_0];
scal_5 = A[K * (i + 5) + k];
vec_18 = _mm256_set1_ps(scal_5);
vec_17 = _mm256_fmadd_ps(mem_vec_10, vec_18, vec_2);
......@@ -174,8 +153,7 @@ void gen_matmul( M_TYPE* __restrict__ A, M_TYPE* __restrict__ B, M_TYPE* __restr
}
}
}
free(B0);
free(A0);
}
// A[i, k], B[k, j] and C[i, j] are all stored row-major
......@@ -305,7 +283,7 @@ int main() {
gen_matmul(a, b, c, I_SIZE, J_SIZE, K_SIZE);
t2 = papi_get_timestamp();
//printf("GEN RESULT\n");
//is_matmul(a, b, c, I_SIZE, J_SIZE, K_SIZE);
is_matmul(a, b, c, I_SIZE, J_SIZE, K_SIZE);
d = delta(t1, t2);
time_str = from_timing(d);
printf("GEN : %s\n", time_str);
......
......@@ -21,7 +21,7 @@ let peak_perf (module A: Arch_t) =
let () =
let i = 3 * 1024
and j = 3 * 1024
and j = 3 * 256
and k = 3 * 1024 in
let cycles = peak_perf (module Sky_lake) i j k in
Printf.printf "i: %d, j: %d, k: %d, peak number of cycles:%d (%e)\n"
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment