Skip to content

Commit 5d4d6ab

Browse files
authored
[OpenBLAS_jll] Update to new build with BFloat16 kernels (JuliaLang#53059)
This also:

* drops a patch (`deps/patches/neoverse-generic-kernels.patch`) that is no longer needed, because the [old bug](OpenMathLib/OpenBLAS#2998) it worked around has been fixed upstream in OpenBLAS. This results in a ~5x speedup in the computation of `BLAS.nrm2` (and hence of `LinearAlgebra.norm` for vectors longer than `LinearAlgebra.NRM2_CUTOFF` (== 32) elements) when the neoversen1 kernels are used, e.g. by default on all Apple Silicon CPUs
* adds a regression test for the above bug
* updates the other patches that are applied when building OpenBLAS from source

Corresponding PR in Yggdrasil: JuliaPackaging/Yggdrasil#7202.
1 parent 5cf1021 commit 5d4d6ab

9 files changed

+301
-123
lines changed

deps/checksums/openblas

+92-92
Large diffs are not rendered by default.

deps/openblas.mk

+16-3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ endif
2020
# don't touch scheduler affinity since we manage this ourselves
2121
OPENBLAS_BUILD_OPTS += NO_AFFINITY=1
2222

23+
# Build BFloat16 kernels
24+
OPENBLAS_BUILD_OPTS += BUILD_BFLOAT16=1
25+
2326
# Build for all architectures - required for distribution
2427
ifeq ($(SANITIZE_MEMORY),1)
2528
OPENBLAS_BUILD_OPTS += TARGET=GENERIC
@@ -95,12 +98,22 @@ $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied: $(BUILDDIR)/
9598
patch -p1 -f < $(SRCDIR)/patches/openblas-ofast-power.patch
9699
echo 1 > $@
97100

98-
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
101+
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
102+
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
103+
patch -p1 -f < $(SRCDIR)/patches/openblas-avx512bf-kernels.patch
104+
echo 1 > $@
105+
106+
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied
107+
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
108+
patch -p1 -f < $(SRCDIR)/patches/openblas-gemv-multithreading.patch
109+
echo 1 > $@
110+
111+
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied
99112
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
100-
patch -p1 -f < $(SRCDIR)/patches/neoverse-generic-kernels.patch
113+
patch -p1 -f < $(SRCDIR)/patches/openblas-darwin-sve.patch
101114
echo 1 > $@
102115

103-
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied
116+
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied
104117
echo 1 > $@
105118

106119
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-compiled: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured

deps/patches/neoverse-generic-kernels.patch

-19
This file was deleted.

deps/patches/openblas-avx512bf-kernels.patch

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001
2+
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
3+
Date: Fri, 12 Jan 2024 00:10:56 +0100
4+
Subject: [PATCH 1/2] Add compiler test and flag for AVX512BF16 capability
5+
6+
---
7+
c_check | 22 ++++++++++++++++++++++
8+
1 file changed, 22 insertions(+)
9+
10+
diff --git a/c_check b/c_check
11+
index b5e4a9ad00..3e507be818 100755
12+
--- a/c_check
13+
+++ b/c_check
14+
@@ -244,6 +244,7 @@ case "$data" in
15+
esac
16+
17+
no_avx512=0
18+
+no_avx512bf=0
19+
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
20+
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
21+
tmpf="$tmpd/a.c"
22+
@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
23+
}
24+
25+
rm -rf "$tmpd"
26+
+ if [ "$no_avx512" -eq 0 ]; then
27+
+ tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
28+
+ tmpf="$tmpd/a.c"
29+
+ code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
30+
+ printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
31+
+ if [ "$compiler" = "PGI" ]; then
32+
+ args=" -tp cooperlake -c -o $tmpf.o $tmpf"
33+
+ else
34+
+ args=" -march=cooperlake -c -o $tmpf.o $tmpf"
35+
+ fi
36+
+ no_avx512bf=0
37+
+ {
38+
+ $compiler_name $flags $args >/dev/null 2>&1
39+
+ } || {
40+
+ no_avx512bf=1
41+
+ }
42+
+
43+
+ rm -rf "$tmpd"
44+
+ fi
45+
fi
46+
47+
no_rv64gv=0
48+
@@ -409,6 +429,7 @@ done
49+
[ "$makefile" = "-" ] && {
50+
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
51+
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
52+
+ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
53+
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
54+
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
55+
exit 0
56+
@@ -437,6 +458,7 @@ done
57+
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
58+
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
59+
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
60+
+ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
61+
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
62+
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
63+
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
64+
65+
From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001
66+
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
67+
Date: Fri, 12 Jan 2024 00:12:46 +0100
68+
Subject: [PATCH 2/2] Make AVX512 BFLOAT16 kernels conditional on compiler
69+
capability
70+
71+
---
72+
kernel/x86_64/KERNEL.COOPERLAKE | 3 ++-
73+
kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++
74+
2 files changed, 4 insertions(+), 1 deletion(-)
75+
76+
diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
77+
index dba94aea86..22b042029f 100644
78+
--- a/kernel/x86_64/KERNEL.COOPERLAKE
79+
+++ b/kernel/x86_64/KERNEL.COOPERLAKE
80+
@@ -1,5 +1,5 @@
81+
include $(KERNELDIR)/KERNEL.SKYLAKEX
82+
-
83+
+ifneq ($(NO_AVX512BF16), 1)
84+
SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c
85+
SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c
86+
SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c
87+
@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
88+
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
89+
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
90+
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
91+
+endif
92+
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
93+
index 3a832e9174..0ab2b4ddcf 100644
94+
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
95+
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
96+
@@ -1,5 +1,6 @@
97+
include $(KERNELDIR)/KERNEL.COOPERLAKE
98+
99+
+ifneq ($(NO_AVX512BF16), 1)
100+
SBGEMM_SMALL_M_PERMIT =
101+
SBGEMM_SMALL_K_NN =
102+
SBGEMM_SMALL_K_B0_NN =
103+
@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
104+
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
105+
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
106+
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
107+
+endif

deps/patches/openblas-darwin-sve.patch

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
From 03688a42622cf76e696859ce384e45aa26d927fc Mon Sep 17 00:00:00 2001
2+
From: Ian McInerney <i.mcinerney17@imperial.ac.uk>
3+
Date: Tue, 23 Jan 2024 10:29:57 +0000
4+
Subject: [PATCH] Build with proper aarch64 flags on Neoverse Darwin
5+
6+
We aren't affected by the problems in AppleClang that prompted this
7+
fallback to an older architecture.
8+
---
9+
Makefile.arm64 | 8 ++++----
10+
1 file changed, 4 insertions(+), 4 deletions(-)
11+
12+
diff --git a/Makefile.arm64 b/Makefile.arm64
13+
index ed52a9424..a8f3cb0f0 100644
14+
--- a/Makefile.arm64
15+
+++ b/Makefile.arm64
16+
@@ -135,11 +135,11 @@ ifeq ($(CORE), NEOVERSEN2)
17+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
18+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
19+
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
20+
-ifneq ($(OSNAME), Darwin)
21+
+#ifneq ($(OSNAME), Darwin)
22+
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
23+
-else
24+
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
25+
-endif
26+
+#else
27+
+#CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
28+
+#endif
29+
ifneq ($(F_COMPILER), NAG)
30+
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
31+
endif
32+
--
33+
2.43.0
34+

deps/patches/openblas-gemv-multithreading.patch

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001
2+
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
3+
Date: Wed, 17 Jan 2024 20:59:24 +0100
4+
Subject: [PATCH] Increase multithreading threshold by a factor of 50
5+
6+
---
7+
interface/gemv.c | 2 +-
8+
1 file changed, 1 insertion(+), 1 deletion(-)
9+
10+
diff --git a/interface/gemv.c b/interface/gemv.c
11+
index 1f07635799..2c121f1308 100644
12+
--- a/interface/gemv.c
13+
+++ b/interface/gemv.c
14+
@@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
15+
16+
#ifdef SMP
17+
18+
- if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
19+
+ if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
20+
nthreads = 1;
21+
else
22+
nthreads = num_cpu_avail(2);

deps/patches/openblas-ofast-power.patch

+20-8
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,29 @@
11
diff --git a/Makefile.power b/Makefile.power
2-
index 28a0bae0..b4869fbd 100644
2+
index aa1ca080a..42c417a78 100644
33
--- a/Makefile.power
44
+++ b/Makefile.power
5-
@@ -11,7 +11,7 @@ endif
6-
7-
ifeq ($(CORE), POWER10)
5+
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
86
ifneq ($(C_COMPILER), PGI)
7+
ifeq ($(C_COMPILER), GCC))
8+
ifeq ($(GCCVERSIONGTEQ10), 1)
99
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
1010
+CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
11-
ifeq ($(F_COMPILER), IBM)
12-
FCOMMON_OPT += -O2 -qrecur -qnosave
11+
else ifneq ($(GCCVERSIONGT4), 1)
12+
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
13+
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
14+
+CCOMMON_OPT += -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
15+
else
16+
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
17+
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
18+
+CCOMMON_OPT += -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
19+
endif
1320
else
14-
@@ -22,7 +22,7 @@ endif
21+
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
22+
+CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
23+
endif
24+
ifeq ($(F_COMPILER), IBM)
25+
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
26+
@@ -34,7 +34,7 @@ endif
1527

1628
ifeq ($(CORE), POWER9)
1729
ifneq ($(C_COMPILER), PGI)
@@ -20,7 +32,7 @@ index 28a0bae0..b4869fbd 100644
2032
ifeq ($(C_COMPILER), GCC)
2133
ifneq ($(GCCVERSIONGT4), 1)
2234
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
23-
@@ -59,7 +59,7 @@ endif
35+
@@ -70,7 +70,7 @@ endif
2436

2537
ifeq ($(CORE), POWER8)
2638
ifneq ($(C_COMPILER), PGI)

stdlib/LinearAlgebra/test/blas.jl

+9
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,15 @@ Random.seed!(100)
126126
@test BLAS.iamax(b) == findmax(fabs, b)[2] * (step(ind) >= 0)
127127
end
128128
end
129+
@testset "nrm2 with non-finite elements" begin
130+
# These tests would have caught <https://github.com/OpenMathLib/OpenBLAS/issues/2998>
131+
# when running on appropriate hardware.
132+
a = zeros(elty,n)
133+
a[begin] = elty(-Inf)
134+
@test BLAS.nrm2(a) === abs2(elty(Inf))
135+
a[begin] = elty(NaN)
136+
@test BLAS.nrm2(a) === abs2(elty(NaN))
137+
end
129138
@testset "scal" begin
130139
α = rand(elty)
131140
a = rand(elty,n)

stdlib/OpenBLAS_jll/Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "OpenBLAS_jll"
22
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
3-
version = "0.3.26+1"
3+
version = "0.3.26+2"
44

55
[deps]
66
# See note in `src/OpenBLAS_jll.jl` about this dependency.

0 commit comments

Comments
 (0)