
Commit c8dc049

Fix support for Intel Compute Runtime with VectorSize > 1
The fallback implementation of amd_bitalign() triggers a bug in Intel Compute Runtime (NEO) versions 23.22.26516.18 through 24.45.31740.9 inclusive (intel/intel-graphics-compiler#358). The bug affects all but the first component of the vectors, so the self-tests pass with VectorSize=1; for higher values of VectorSize, including the default VectorSize=2, approximately half of the self-tests fail, all in barrett32 kernels.

Add generic_bitalign(), which is always implemented using shifts, and use it in all cases where the destination is the same as one of the sources. When Intel Compute Runtime is detected, use 64-bit shifts in generic_bitalign(); for other platforms, keep using 32-bit shifts. Make amd_bitalign() an alias of generic_bitalign() on systems where amd_bitalign() is not available, so that it, too, expands to 64-bit shifts under Intel Compute Runtime.
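For reference, amd_bitalign(src0, src1, src2) returns the low 32 bits of the 64-bit concatenation (src0:src1) shifted right by src2, applied per vector component. A minimal scalar C sketch of the two emulations this commit chooses between (illustrative names, not part of the commit; the kernels only ever pass shifts in the range 1..31):

    #include <assert.h>
    #include <stdint.h>

    /* 64-bit emulation: one wide funnel shift (the Intel Compute Runtime path). */
    static uint32_t bitalign64(uint32_t hi, uint32_t lo, int s)
    {
        return (uint32_t)((((uint64_t)hi << 32) | lo) >> s);
    }

    /* 32-bit emulation: two narrow shifts (the path kept for other platforms).
       Undefined for s == 0 (hi << 32), a case the kernels never request. */
    static uint32_t bitalign32(uint32_t hi, uint32_t lo, int s)
    {
        return (hi << (32 - s)) | (lo >> s);
    }

    int main(void)
    {
        for (int s = 1; s < 32; s++)  /* both forms agree for 1..31 */
            assert(bitalign64(0xdeadbeefu, 0xcafef00du, s) ==
                   bitalign32(0xdeadbeefu, 0xcafef00du, s));
        return 0;
    }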
1 parent: b7da6f8

2 files changed (+46 -28)

src/barrett.cl (+27 -27)
@@ -253,27 +253,27 @@ Adding x*x to a few carries will not cascade the carry
 
 void shl_96(int96_v * const a)
 /* shiftleft a one bit */
-{ /* here, bitalign improves the 92-bit kernel, and slows down 76-bit */
+{ /* here, amd_bitalign improves the 92-bit kernel, and slows down 76-bit */
   a->d2 = amd_bitalign(a->d2, a->d1, 31);
   a->d1 = amd_bitalign(a->d1, a->d0, 31);
-// a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-// a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+// a->d2 = generic_bitalign(a->d2, a->d1, 31);
+// a->d1 = generic_bitalign(a->d1, a->d0, 31);
   a->d0 = a->d0 << 1;
 }
 
 void shl_192(int192_v * const a)
 /* shiftleft a one bit */
-{ /* in this function, bitalign slows down all kernels */
+{ /* in this function, amd_bitalign slows down all kernels */
 // a->d5 = amd_bitalign(a->d5, a->d4, 31);
 // a->d4 = amd_bitalign(a->d4, a->d3, 31);
 // a->d3 = amd_bitalign(a->d3, a->d2, 31);
 // a->d2 = amd_bitalign(a->d2, a->d1, 31);
 // a->d1 = amd_bitalign(a->d1, a->d0, 31);
-  a->d5 = (a->d5 << 1) | (a->d4 >> 31);
-  a->d4 = (a->d4 << 1) | (a->d3 >> 31);
-  a->d3 = (a->d3 << 1) | (a->d2 >> 31);
-  a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-  a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+  a->d5 = generic_bitalign(a->d5, a->d4, 31);
+  a->d4 = generic_bitalign(a->d4, a->d3, 31);
+  a->d3 = generic_bitalign(a->d3, a->d2, 31);
+  a->d2 = generic_bitalign(a->d2, a->d1, 31);
+  a->d1 = generic_bitalign(a->d1, a->d0, 31);
   a->d0 = a->d0 << 1;
 }
 
@@ -442,12 +442,12 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 
   // shiftleft nn 11 bits
 #ifndef DIV_160_96
-  nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
   nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-// nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-// nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+// nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
   nn.d0 = nn.d0 << 11;
 
   // q = q - nn
@@ -510,11 +510,11 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
   nn.d4 = nn.d3 >> 9;
 #endif
 // nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-// nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
 // nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-  nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
   nn.d0 = nn.d0 << 23;
 
   // q = q - nn
@@ -642,9 +642,9 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 #ifdef CHECKS_MODBASECASE
   nn.d4 = nn.d3 >> 17;
 #endif
-  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+  nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
   nn.d0 = nn.d0 << 15;
 
   // q = q - nn
@@ -877,12 +877,12 @@ DIV_160_96 here. */
 
   // shiftleft nn 11 bits
 #ifndef DIV_160_96
-  nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
   nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-// nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-// nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+// nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
   nn.d0 = nn.d0 << 11;
 
   // q = q - nn
@@ -945,11 +945,11 @@ DIV_160_96 here. */
   nn.d4 = nn.d3 >> 9;
 #endif
 // nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
   nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-// nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
 // nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-  nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
   nn.d0 = nn.d0 << 23;
 
   // q = q - nn
@@ -1077,9 +1077,9 @@ DIV_160_96 here. */
 #ifdef CHECKS_MODBASECASE
   nn.d4 = nn.d3 >> 17;
 #endif
-  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+  nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+  nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+  nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
   nn.d0 = nn.d0 << 15;
 
   // q = q - nn
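Throughout these hunks, a left shift by k bits across a word boundary, (hi << k) + (lo >> (32-k)), is rewritten as generic_bitalign(hi, lo, 32-k): bitalign shifts the concatenated pair right, so a left shift by 11 becomes a bitalign by 21, by 23 becomes 9, by 15 becomes 17, and by 1 becomes 31. A small scalar C check of that mapping (editor's illustration, not part of the commit):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the 32-bit generic_bitalign() from src/common.cl. */
    static uint32_t generic_bitalign(uint32_t high, uint32_t low, int shift)
    {
        return (high << (32 - shift)) | (low >> shift);
    }

    int main(void)
    {
        uint32_t d3 = 0x12345678u, d2 = 0x9abcdef0u;
        /* The two halves occupy disjoint bits, so the old '+' equals '|'. */
        assert(((d3 << 11) + (d2 >> 21)) == generic_bitalign(d3, d2, 21)); /* k = 11 */
        assert(((d3 << 23) + (d2 >>  9)) == generic_bitalign(d3, d2,  9)); /* k = 23 */
        assert(((d3 << 15) + (d2 >> 17)) == generic_bitalign(d3, d2, 17)); /* k = 15 */
        return 0;
    }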

src/common.cl (+19 -1)
@@ -170,6 +170,24 @@ uint popcount(uint x)
 #define ATOMIC_INC(x) ((x)++)
 #endif
 
+// generic_bitalign() emulates amd_bitalign() using shifts.
+#ifdef cl_intel_subgroups
+// Workaround for Intel Compute Runtime (NEO) versions 23.22.26516.18 to
+// 24.45.31740.9: https://github.com/intel/intel-graphics-compiler/issues/358
+// Use 64-bit shifts. They are faster than 32-bit shifts on Intel, so it's not
+// needed to limit this workaround to specific versions.
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+  return CONVERT_UINT_V(((CONVERT_ULONG_V(high) << 32) | CONVERT_ULONG_V(low)) >> shift);
+}
+#else
+// Use 32-bit shifts for other platforms.
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+  return (high << (32 - shift)) | (low >> shift);
+}
+#endif
+
 #ifdef cl_amd_media_ops
 #pragma OPENCL EXTENSION cl_amd_media_ops : enable
 #else
@@ -180,7 +198,7 @@ uint popcount(uint x)
 // Description
 //   dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31))
 //   similar operation applied to other components of the vectors.
-#define amd_bitalign(src0, src1, src2) (src0 << (32-src2)) | (src1 >> src2)
+#define amd_bitalign(src0, src1, src2) generic_bitalign(src0, src1, src2)
 #endif
 
 #ifdef cl_amd_media_ops2
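A side note on the last hunk (editor's observation, not claimed by the commit message): besides routing the fallback to 64-bit shifts on Intel, aliasing amd_bitalign() to a function call also removes a precedence hazard, because the old macro body was not parenthesized as a whole. A hypothetical expression shows how the old expansion could be torn apart:

    /* Old macro: #define amd_bitalign(src0, src1, src2) (src0 << (32-src2)) | (src1 >> src2) */
    r = amd_bitalign(a, b, 5) + 1;
    /* expanded to:             r = (a << (32-5)) | ((b >> 5) + 1);   -- '+' binds tighter than '|' */
    /* with the function alias: r = generic_bitalign(a, b, 5) + 1;    -- as intended */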
