
Commit a77cc90

Merge branch 'main' into penghuic/add_hook_random_failure
2 parents: 37cae65 + a868a2e

14 files changed: +639 -17 lines

.github/scripts/apply_torch_pr.py (+2)

@@ -13,6 +13,8 @@
     "https://github.com/pytorch/pytorch/pull/126516",
     # Modify the tolerance level in TIMM benchmark
     "https://github.com/pytorch/pytorch/pull/143739",
+    # Fix build error caused by incorrect namespace change by #144014
+    "https://github.com/pytorch/pytorch/pull/144450",
     ]
 )
 parser.add_argument('--extra-pr-list', '-e', nargs='+',default=[])

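The script keeps a default list of pytorch/pytorch PRs that are applied on top of the checkout; this commit adds https://github.com/pytorch/pytorch/pull/144450 to work around the namespace change from #144014. A minimal sketch of how the default list and the --extra-pr-list argument could be combined (only the argparse line appears in the diff; the merge step, variable names, and the example PR URL are assumptions for illustration):

import argparse

# Default PR whitelist as it reads after this commit (subset shown).
DEFAULT_PR_LIST = [
    "https://github.com/pytorch/pytorch/pull/126516",
    # Modify the tolerance level in TIMM benchmark
    "https://github.com/pytorch/pytorch/pull/143739",
    # Fix build error caused by incorrect namespace change by #144014
    "https://github.com/pytorch/pytorch/pull/144450",
]

parser = argparse.ArgumentParser()
parser.add_argument('--extra-pr-list', '-e', nargs='+', default=[])
args = parser.parse_args(["-e", "https://github.com/pytorch/pytorch/pull/111111"])

# Hypothetical merge step: combine and deduplicate while preserving order.
pr_list = list(dict.fromkeys(DEFAULT_PR_LIST + args.extra_pr_list))
for pr in pr_list:
    print("would apply", pr)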
.github/workflows/_linux_transformers.yml (+11 -1)

@@ -46,6 +46,8 @@ jobs:
   Torch-XPU-Transformers-Tests:
     runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
     env:
+      HF_HOME: ${{ github.workspace }}/.hf_home
+      HF_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       NEOReadDebugKeys: ${{ inputs.driver == 'rolling' && '1' || '0' }}
       DisableScratchPages: ${{ inputs.driver == 'rolling' && '1' || '0' }}
       python: ${{ inputs.python != '' && inputs.python || '3.10' }}
@@ -115,7 +117,7 @@ jobs:
           cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/transformers/tests_log/device_IDs.txt
           echo "xpu-smi output:"
           xpu-smi discovery -y --json --dump -1
-      - name: Sanitry check installed packages
+      - name: Sanity check installed packages
         run: |
           source activate huggingface_transformers_test
           # These checks are to exit earlier if for any reason Transformers
@@ -124,6 +126,9 @@ jobs:
           pip show torchaudio | grep Version | grep xpu
           pip show torchvision | grep Version | grep xpu
           python -c 'import torch; exit(not torch.xpu.is_available())'
+      - name: Clean HF home directory and cache
+        run: |
+          rm -rf ${{ env.HF_HOME }}
       - name: Run -k backbone tests
         env:
           TEST_CASE: 'tests_backbone'
@@ -212,6 +217,11 @@ jobs:
           FAILED_CASES=$(echo $FAILED_CASES | sed 's/^,//')
           echo "Failed cases: [$(echo $FAILED_CASES | sed 's/,/, /g')]"
           test -z "$FAILED_CASES"
+      - name: Clean HF home directory and cache
+        if: ${{ always() }}
+        run: |
+          du -sh ${{ env.HF_HOME }} || true
+          rm -rf ${{ env.HF_HOME }}
       - name: Print results table
         if: ${{ ! cancelled() }}
         run: |

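The workflow now routes all Hugging Face caches into the workspace via HF_HOME (and passes HF_TOKEN for gated models), then removes that directory both before the test steps and in an always() cleanup step, so stale or oversized caches cannot accumulate on the runner. A small sketch of how the cache location is resolved from that environment variable; the path is illustrative and huggingface_hub.constants is an internal module used here only to show the resolution:

import os

# Must be set before huggingface_hub/transformers are imported, which the
# workflow guarantees via the job-level env block; the path is illustrative.
os.environ.setdefault("HF_HOME", "/tmp/workspace/.hf_home")

from huggingface_hub import constants

print(constants.HF_HOME)       # .../.hf_home
print(constants.HF_HUB_CACHE)  # .../.hf_home/hub by default, where models land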
cmake/Codegen.cmake (+6 -2)

@@ -8,6 +8,7 @@ file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU.cpp)
+set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 
 if(WIN32)
@@ -47,6 +48,7 @@ endfunction(GEN_BACKEND)
 
 set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp)
 set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU.cpp)
+set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU.cpp)
 set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
 set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
 function(GEN_XPU file_yaml)
@@ -77,7 +79,7 @@ function(GEN_XPU file_yaml)
     --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
     --per-operator-headers
     --static-dispatch-backend
-    --backend-whitelist XPU SparseXPU
+    --backend-whitelist XPU SparseXPU NestedTensorXPU
     # --xpu: generate in-tree RegisterXPU.cpp for in-tree OPs
     --xpu
     # --update-aoti-c-shim: generate extend/c_shim_xpu.h
@@ -93,6 +95,7 @@ function(GEN_XPU file_yaml)
     # Codegen post-process
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
     COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
     ${SIMPLE_TRACE}
     WORKING_DIRECTORY ${TORCH_ROOT}
     DEPENDS
@@ -122,6 +125,7 @@ GEN_XPU(
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions.h
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU.cpp
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU.cpp
+  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU.cpp
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
   ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
 )
@@ -133,7 +137,7 @@ GEN_XPU(
 # $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
 list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
 
-list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH})
+list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterNestedTensorXPU_PATH})
 list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
 add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
 set(ATen_XPU_GEN_SRCS ${xpu_generated_src})

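With RegisterNestedTensorXPU.cpp generated, post-processed, and added to the backend whitelist, operators registered for the NestedTensorXPU dispatch key build into the XPU library alongside the dense and sparse registrations. A hypothetical smoke test, assuming an XPU-enabled PyTorch build that contains this change:

import torch

if torch.xpu.is_available():
    # Construct a nested tensor directly on the XPU device; its ops resolve
    # through the NestedTensorXPU registrations generated by the codegen above.
    nt = torch.nested.nested_tensor(
        [torch.randn(2, 3), torch.randn(4, 3)], device="xpu"
    )
    print(nt.is_nested, nt.device)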
src/ATen/native/xpu/TensorAdvancedIndexing.cpp (+47)

@@ -7,17 +7,24 @@
 #include <ATen/core/op_registration/adaption.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/IndexKernel.h>
+#include <ATen/native/ReductionType.h>
 #include <ATen/native/TensorAdvancedIndexing.h>
 #include <ATen/native/TensorAdvancedIndexingUtils.h>
 #include <ATen/native/TensorIterator.h>
+//#include <ATen/native/TensorFactories.cpp>
 #include <ATen/native/xpu/sycl/IndexingKernels.h>
 #include <ATen/native/xpu/sycl/ScatterGatherKernels.h>
+#include <ATen/ops/ones_like.h>
+#include <ATen/ops/zeros_like.h>
 #include <comm/RegisterUtils.h>
 #include <comm/xpu_aten.h>
 #include <torch/library.h>
 
 #include <ATen/ops/index_add_meta.h>
+#include <ATen/ops/index_reduce_meta.h>
 #include <xpu/ATen/ops/index_add_native.h>
+#include <xpu/ATen/ops/index_reduce_native.h> //generated
+//#include <xpu/ATen/ops/index_reduce_prod_native.h> //generated
 
 namespace at {
 
@@ -42,6 +49,7 @@ REGISTER_XPU_DISPATCH(index_fill_stub, &xpu::index_fill_kernel);
 REGISTER_XPU_DISPATCH(index_copy_stub, &xpu::index_copy_kernel);
 REGISTER_XPU_DISPATCH(put_stub, &xpu::put_kernel);
 REGISTER_XPU_DISPATCH(take_stub, &xpu::take_kernel);
+// REGISTER_XPU_DISPATCH(index_reduce_stub, &xpu::index_reduce_kernel);
 
 TORCH_IMPL_FUNC(index_add_xpu_out)
 (const Tensor& self,
@@ -126,5 +134,44 @@ Tensor count_nonzero_xpu(const Tensor& self, IntArrayRef dims) {
   return (self != 0).sum(dims);
 }
 
+TORCH_IMPL_FUNC(index_reduce_xpu_out)
+(const Tensor& self,
+ int64_t dim,
+ const Tensor& index,
+ const Tensor& source,
+ const c10::string_view reduce,
+ bool include_self,
+ const Tensor& result) {
+  TORCH_WARN_ONCE(
+      "index_reduce() is in beta and the API may change at any time.");
+  if (reduce == "prod") {
+    xpu::index_reduce_prod_kernel(
+        self, dim, index, source, include_self, ReductionType::PROD, result);
+  } else if (reduce == "mean") {
+    xpu::index_reduce_mean_kernel(
+        self, dim, index, source, include_self, ReductionType::MEAN, result);
+    auto counts = include_self ? ones_like(result) : zeros_like(result);
+    counts.index_add_(dim, index, ones_like(source));
+    counts.masked_fill_(counts == 0, 1);
+    if (result.is_floating_point() || result.is_complex()) {
+      result.div_(counts);
+    } else {
+      result.div_(counts, "floor");
+    }
+  } else if (reduce == "amax") {
+    xpu::index_reduce_amax_kernel(
+        self, dim, index, source, include_self, ReductionType::MAX, result);
+  } else if (reduce == "amin") {
+    xpu::index_reduce_amin_kernel(
+        self, dim, index, source, include_self, ReductionType::MIN, result);
+  } else {
+    TORCH_CHECK(
+        false,
+        "Only support prod, mean, amax or amin reduce operator. Input was ",
+        reduce,
+        ".");
+  }
+}
+
 } // namespace native
 } // namespace at

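The new index_reduce_xpu_out covers the four reduce modes; for "mean" it first accumulates a sum via the kernel and then divides by per-destination counts built with index_add_ (floor division for integral outputs), as the code above shows. A small CPU example of the semantics this implements, assuming a recent PyTorch that exposes Tensor.index_reduce:

import torch

self_ = torch.zeros(3)
index = torch.tensor([0, 0, 2])
source = torch.tensor([1.0, 3.0, 5.0])

# include_self=False: destination slots that receive values are treated as
# empty, so slot 0 averages {1, 3}, slot 1 keeps its self value (never
# indexed), and slot 2 averages {5}.
out = self_.index_reduce(0, index, source, "mean", include_self=False)
print(out)  # tensor([2., 0., 5.])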
src/ATen/native/xpu/XPUFallback.template (-1)

@@ -163,7 +163,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
       "_fft_r2c",
       "_flash_attention_forward",
       "geqrf",
-      "index_reduce.out",
       "linalg_cholesky_ex.L",
       "_linalg_det.result",
       "linalg_eig",

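Dropping "index_reduce.out" from the fallback list means the operator no longer routes through the CPU fallback and instead dispatches to the XPU implementation registered above. A hypothetical check on a machine with an Intel GPU and a build containing this commit:

import torch

if torch.xpu.is_available():
    x = torch.zeros(4, device="xpu")
    idx = torch.tensor([0, 1, 1, 3], device="xpu")
    src = torch.arange(4.0, device="xpu")
    # Previously this op fell back to CPU; it now runs in the native XPU kernel.
    out = x.index_reduce(0, idx, src, "amax", include_self=False)
    print(out)         # tensor([0., 2., 0., 3.], device='xpu:0')
    print(out.device)  # xpu:0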
src/ATen/native/xpu/sycl/Atomics.h (+6)

@@ -360,6 +360,8 @@ SYCL_ATOMIC_INTEGER(Mul, std::multiplies<int8_t>()(a, b), int8_t)
 SYCL_ATOMIC_INTEGER(Mul, std::multiplies<int16_t>()(a, b), int16_t)
 SYCL_ATOMIC_INTEGER(Mul, std::multiplies<int32_t>()(a, b), int32_t)
 SYCL_ATOMIC_INTEGER(Mul, std::multiplies<int64_t>()(a, b), int64_t)
+SYCL_ATOMIC_INTEGER(Mul, std::multiplies<uint32_t>()(a, b), uint32_t)
+SYCL_ATOMIC_INTEGER(Mul, std::multiplies<uint64_t>()(a, b), uint64_t)
 
 SYCL_ATOMIC_FP(Mul, std::multiplies<float>()(a, b), float)
 SYCL_ATOMIC_FP(Mul, std::multiplies<double>()(a, b), double)
@@ -391,6 +393,8 @@ SYCL_ATOMIC_INTEGER(Max, safe_max<int8_t>(a, b), int8_t)
 SYCL_ATOMIC_INTEGER(Max, safe_max<int16_t>(a, b), int16_t)
 SYCL_ATOMIC_INTEGER(Max, safe_max<int32_t>(a, b), int32_t)
 SYCL_ATOMIC_INTEGER(Max, safe_max<int64_t>(a, b), int64_t)
+SYCL_ATOMIC_INTEGER(Max, safe_max<uint32_t>(a, b), uint32_t)
+SYCL_ATOMIC_INTEGER(Max, safe_max<uint64_t>(a, b), uint64_t)
 
 SYCL_ATOMIC_FP(Max, safe_max<float>(a, b), float)
 SYCL_ATOMIC_FP(Max, safe_max<double>(a, b), double)
@@ -403,6 +407,8 @@ SYCL_ATOMIC_INTEGER(Min, safe_min<int8_t>(a, b), int8_t)
 SYCL_ATOMIC_INTEGER(Min, safe_min<int16_t>(a, b), int16_t)
 SYCL_ATOMIC_INTEGER(Min, safe_min<int32_t>(a, b), int32_t)
 SYCL_ATOMIC_INTEGER(Min, safe_min<int64_t>(a, b), int64_t)
+SYCL_ATOMIC_INTEGER(Min, safe_min<uint32_t>(a, b), uint32_t)
+SYCL_ATOMIC_INTEGER(Min, safe_min<uint64_t>(a, b), uint64_t)
 
 SYCL_ATOMIC_FP(Min, safe_min<float>(a, b), float)
 SYCL_ATOMIC_FP(Min, safe_min<double>(a, b), double)

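The new uint32_t/uint64_t instantiations of the atomic Mul/Max/Min helpers are presumably what the index_reduce and scatter-gather kernels need for unsigned integral dtypes. A small CPU illustration of an integral "prod" reduction (int64 here; unsigned dtypes are assumed to follow the same semantics where a build supports them):

import torch

x = torch.ones(2, dtype=torch.int64)
idx = torch.tensor([0, 0, 1])
src = torch.tensor([2, 3, 4], dtype=torch.int64)

# include_self=True (default): slot 0 -> 1 * 2 * 3 = 6, slot 1 -> 1 * 4 = 4.
print(x.index_reduce(0, idx, src, "prod"))  # tensor([6, 4])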