[OpenBLAS_jll] Update to new build with BFloat16 kernels (JuliaLang#53059)

giordano · web-flow · commit 5d4d6ab498ab · 2024-01-26T10:17:58.000Z
This also:

* drops a patch (`deps/patches/neoverse-generic-kernels.patch`) that is no longer needed, because the [old bug](https://github.com/OpenMathLib/OpenBLAS/issues/2998) it addressed has been fixed upstream in OpenBLAS. This results in a ~5x speedup in the computation of `BLAS.nrm2` (and hence of `LinearAlgebra.norm` for vectors longer than `LinearAlgebra.NRM2_CUTOFF` (== 32) elements) when the neoversen1 kernels are used, e.g. by default on all Apple Silicon CPUs
* adds a regression test for the above bug
* updates the other patches applied when building OpenBLAS from source

Corresponding PR in Yggdrasil: JuliaPackaging/Yggdrasil#7202.
diff --git a/deps/checksums/openblas b/deps/checksums/openblas
diff --git a/deps/openblas.mk b/deps/openblas.mk
@@ -20,6 +20,9 @@ endif
 # don't touch scheduler affinity since we manage this ourselves
 OPENBLAS_BUILD_OPTS += NO_AFFINITY=1
 
+# Build BFloat16 kernels
+OPENBLAS_BUILD_OPTS += BUILD_BFLOAT16=1
+
 # Build for all architectures - required for distribution
 ifeq ($(SANITIZE_MEMORY),1)
 OPENBLAS_BUILD_OPTS += TARGET=GENERIC
@@ -95,12 +98,22 @@ $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied: $(BUILDDIR)/
 		patch -p1 -f < $(SRCDIR)/patches/openblas-ofast-power.patch
 	echo 1 > $@
 
-$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
+$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
+	cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
+		patch -p1 -f < $(SRCDIR)/patches/openblas-avx512bf-kernels.patch
+	echo 1 > $@
+
+$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied
+	cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
+		patch -p1 -f < $(SRCDIR)/patches/openblas-gemv-multithreading.patch
+	echo 1 > $@
+
+$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied
 	cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
-		patch -p1 -f < $(SRCDIR)/patches/neoverse-generic-kernels.patch
+		patch -p1 -f < $(SRCDIR)/patches/openblas-darwin-sve.patch
 	echo 1 > $@
 
-$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied
+$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied
 	echo 1 > $@
 
 $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-compiled: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured
diff --git a/deps/patches/neoverse-generic-kernels.patch b/deps/patches/neoverse-generic-kernels.patch
diff --git a/deps/patches/openblas-avx512bf-kernels.patch b/deps/patches/openblas-avx512bf-kernels.patch
@@ -0,0 +1,107 @@
+From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Fri, 12 Jan 2024 00:10:56 +0100
+Subject: [PATCH 1/2] Add compiler test and flag for AVX512BF16 capability
+
+---
+ c_check | 22 ++++++++++++++++++++++
+ 1 file changed, 22 insertions(+)
+
+diff --git a/c_check b/c_check
+index b5e4a9ad00..3e507be818 100755
+--- a/c_check
++++ b/c_check
+@@ -244,6 +244,7 @@ case "$data" in
+ esac
+ 
+ no_avx512=0
++no_avx512bf=0
+ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
+     tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+     tmpf="$tmpd/a.c"
+@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
+     }
+ 
+     rm -rf "$tmpd"
++    if [ "$no_avx512" -eq 0 ]; then
++    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
++    tmpf="$tmpd/a.c"
++    code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
++    printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
++    if [ "$compiler" = "PGI" ]; then
++        args=" -tp cooperlake -c -o $tmpf.o $tmpf"
++    else
++        args=" -march=cooperlake -c -o $tmpf.o $tmpf"
++    fi
++    no_avx512bf=0
++    {
++        $compiler_name $flags $args >/dev/null 2>&1
++    } || {
++        no_avx512bf=1
++    }
++
++    rm -rf "$tmpd"
++  fi
+ fi
+ 
+ no_rv64gv=0
+@@ -409,6 +429,7 @@ done
+  [ "$makefile" = "-" ] && {
+     [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
+     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
++    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
+     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
+     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
+     exit 0
+@@ -437,6 +458,7 @@ done
+     [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
+     [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
+     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
++    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
+     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
+     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
+     [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
+
+From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Fri, 12 Jan 2024 00:12:46 +0100
+Subject: [PATCH 2/2] Make AVX512 BFLOAT16 kernels conditional on compiler
+ capability
+
+---
+ kernel/x86_64/KERNEL.COOPERLAKE     | 3 ++-
+ kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++
+ 2 files changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
+index dba94aea86..22b042029f 100644
+--- a/kernel/x86_64/KERNEL.COOPERLAKE
++++ b/kernel/x86_64/KERNEL.COOPERLAKE
+@@ -1,5 +1,5 @@
+ include $(KERNELDIR)/KERNEL.SKYLAKEX
+-
++ifneq ($(NO_AVX512BF16), 1)
+ SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c
+ SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c
+ SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c
+@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
+ SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
++endif
+diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+index 3a832e9174..0ab2b4ddcf 100644
+--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
++++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+@@ -1,5 +1,6 @@
+ include $(KERNELDIR)/KERNEL.COOPERLAKE
+ 
++ifneq ($(NO_AVX512BF16), 1)
+ SBGEMM_SMALL_M_PERMIT =
+ SBGEMM_SMALL_K_NN     =
+ SBGEMM_SMALL_K_B0_NN  =
+@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
+ SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
++endif
diff --git a/deps/patches/openblas-darwin-sve.patch b/deps/patches/openblas-darwin-sve.patch
@@ -0,0 +1,34 @@
+From 03688a42622cf76e696859ce384e45aa26d927fc Mon Sep 17 00:00:00 2001
+From: Ian McInerney <i.mcinerney17@imperial.ac.uk>
+Date: Tue, 23 Jan 2024 10:29:57 +0000
+Subject: [PATCH] Build with proper aarch64 flags on Neoverse Darwin
+
+We aren't affected by the problems in AppleClang that prompted this
+fallback to an older architecture.
+---
+ Makefile.arm64 | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/Makefile.arm64 b/Makefile.arm64
+index ed52a9424..a8f3cb0f0 100644
+--- a/Makefile.arm64
++++ b/Makefile.arm64
+@@ -135,11 +135,11 @@ ifeq ($(CORE), NEOVERSEN2)
+ ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
+ ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
+-ifneq ($(OSNAME), Darwin)
++#ifneq ($(OSNAME), Darwin)
+ CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
+-else
+-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+-endif
++#else
++#CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
++#endif
+ ifneq ($(F_COMPILER), NAG)
+ FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
+ endif
+-- 
+2.43.0
+
diff --git a/deps/patches/openblas-gemv-multithreading.patch b/deps/patches/openblas-gemv-multithreading.patch
@@ -0,0 +1,22 @@
+From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Wed, 17 Jan 2024 20:59:24 +0100
+Subject: [PATCH] Increase multithreading threshold by a factor of 50
+
+---
+ interface/gemv.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/interface/gemv.c b/interface/gemv.c
+index 1f07635799..2c121f1308 100644
+--- a/interface/gemv.c
++++ b/interface/gemv.c
+@@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
+ 
+ #ifdef SMP
+ 
+-  if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
++  if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
+     nthreads = 1;
+   else
+     nthreads = num_cpu_avail(2);
diff --git a/deps/patches/openblas-ofast-power.patch b/deps/patches/openblas-ofast-power.patch
@@ -1,17 +1,29 @@
 diff --git a/Makefile.power b/Makefile.power
-index 28a0bae0..b4869fbd 100644
+index aa1ca080a..42c417a78 100644
 --- a/Makefile.power
 +++ b/Makefile.power
-@@ -11,7 +11,7 @@ endif
- 
- ifeq ($(CORE), POWER10)
+@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
  ifneq ($(C_COMPILER), PGI)
+ ifeq ($(C_COMPILER), GCC))
+ ifeq ($(GCCVERSIONGTEQ10), 1)
 -CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 +CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
- ifeq ($(F_COMPILER), IBM)
- FCOMMON_OPT += -O2 -qrecur -qnosave
+ else ifneq ($(GCCVERSIONGT4), 1)
+ $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
++CCOMMON_OPT += -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+ else
+ $(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
+-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
++CCOMMON_OPT += -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+ endif
  else
-@@ -22,7 +22,7 @@ endif
+-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
++CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+ endif
+ ifeq ($(F_COMPILER), IBM)
+ FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
+@@ -34,7 +34,7 @@ endif
  
  ifeq ($(CORE), POWER9)
  ifneq ($(C_COMPILER), PGI)
@@ -20,7 +32,7 @@ index 28a0bae0..b4869fbd 100644
  ifeq ($(C_COMPILER), GCC)
  ifneq ($(GCCVERSIONGT4), 1)
  $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
-@@ -59,7 +59,7 @@ endif
+@@ -70,7 +70,7 @@ endif
  
  ifeq ($(CORE), POWER8)
  ifneq ($(C_COMPILER), PGI)
diff --git a/stdlib/LinearAlgebra/test/blas.jl b/stdlib/LinearAlgebra/test/blas.jl
@@ -126,6 +126,15 @@ Random.seed!(100)
                 @test BLAS.iamax(b) == findmax(fabs, b)[2] * (step(ind) >= 0)
             end
         end
+        @testset "nrm2 with non-finite elements" begin
+            # These tests would have caught <https://github.com/OpenMathLib/OpenBLAS/issues/2998>
+            # when running on appropriate hardware.
+            a = zeros(elty,n)
+            a[begin] = elty(-Inf)
+            @test BLAS.nrm2(a) === abs2(elty(Inf))
+            a[begin] = elty(NaN)
+            @test BLAS.nrm2(a) === abs2(elty(NaN))
+        end
         @testset "scal" begin
             α = rand(elty)
             a = rand(elty,n)
diff --git a/stdlib/OpenBLAS_jll/Project.toml b/stdlib/OpenBLAS_jll/Project.toml
@@ -1,6 +1,6 @@
 name = "OpenBLAS_jll"
 uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.26+1"
+version = "0.3.26+2"
 
 [deps]
 # See note in `src/OpenBLAS_jll.jl` about this dependency.