-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdgemm_unrolled.cpp
42 lines (38 loc) · 1.43 KB
/
dgemm_unrolled.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include "dgemm_unrolled.h"
#include <x86intrin.h>
/*
Optimized version of DGEMM using C intrinsics to generate the AVX subword-parallel instructions
for the x86 and loop unrolling to create more opportunities for instruction-level parallelism.
We can see the impact of
instruction-level parallelism by unrolling the loop so that the multiple-issue,
out-of-order execution processor has more instructions to work with.
The function below is the unrolled version of function avx512, which contains
the C intrinsics to produce the AVX-512 instructions.
*/
void dgemm_unrolled(const uint32_t n, const double* A, const double* B, double* C)
{
constexpr uint32_t UNROLL = 4;
for( uint32_t i = 0; i < n; i += UNROLL * 8)
{
for( uint32_t j = 0; j < n; ++j)
{
__m512d c[UNROLL];
for( uint32_t r = 0; r < UNROLL; r++)
{
c[r] = _mm512_load_pd(C + i + r * 8 + j * n); //[ UNROLL];
}
for( uint32_t k = 0; k < n; k++ )
{
__m512d bb = _mm512_broadcastsd_pd(_mm_load_sd(B + j * n + k));
for( uint32_t r = 0; r < UNROLL; r++)
{
c[r] = _mm512_fmadd_pd(_mm512_load_pd(A + n * k + r * 8 + i), bb, c[r]);
}
}
for( uint32_t r = 0; r < UNROLL; r++)
{
_mm512_store_pd(C + i + r * 8 + j * n, c[r]);
}
}
}
}