Commit 336a74b

Merge branch 'main' into chao/avg

2 parents 14fdda7 + b8c05de

4 files changed (+126 -117 lines)

CMakeLists.txt (+1 -1)

@@ -40,7 +40,7 @@ include(${TORCH_XPU_OPS_ROOT}/cmake/ONEMKL.cmake)
 include(${TORCH_XPU_OPS_ROOT}/cmake/BuildFlags.cmake)
 
 option(USE_XCCL "Build with XCCL support" OFF)
-option(USE_C10D_XCCL "Build with XCCL support for C10D" OFF)
+option(USE_C10D_XCCL "Build with XCCL support for C10D" ON)
 
 # -- [ Re-generate the macros file for https://github.com/pytorch/pytorch/pull/147161
 macro(update_caffe2_macros_file)

cmake/Codegen.cmake (+49 -81)

@@ -1,89 +1,60 @@
-if(Codegen_GPU_cmake_included)
+if(Codegen_XPU_cmake_included)
   return()
 endif()
-set(Codegen_GPU_cmake_included true)
+set(Codegen_XPU_cmake_included true)
 
-set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen/")
+set(BUILD_TORCH_XPU_ATEN_GENERATED "${CMAKE_BINARY_DIR}/xpu/ATen")
 file(MAKE_DIRECTORY ${BUILD_TORCH_XPU_ATEN_GENERATED})
 
-set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
-set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
-set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
-set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
-set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
+set(RegisterXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
+set(RegisterSparseXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
+set(RegisterSparseCsrXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
+set(RegisterNestedTensorXPU_GENERATED ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
+set(XPUFallback_TEMPLATE ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
+set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
+set(XPU_AOTI_SHIM_HEADER ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h)
+set(XPU_AOTI_SHIM_SOURCE ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
 
 if(WIN32)
   set(FILE_DISPLAY_CMD type)
-  # replace forward slash with back slash for compatibility with 'type' command on Windows
-  string(REPLACE "/" "\\" RegisterXPU_PATH_BACKSLASH "${RegisterXPU_PATH}")
-  string(REPLACE "/" "\\" XPUFallback_PATH_BACKSLASH "${XPUFallback_PATH}")
-  set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH_BACKSLASH} ">>" ${RegisterXPU_PATH_BACKSLASH})
 else()
   set(FILE_DISPLAY_CMD cat)
-  set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH} ">>" ${RegisterXPU_PATH})
 endif()
+file(TO_NATIVE_PATH "${RegisterXPU_GENERATED}" RegisterXPU_GENERATED_NATIVE)
+file(TO_NATIVE_PATH "${XPUFallback_TEMPLATE}" XPUFallback_TEMPLATE_NATIVE)
+set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_TEMPLATE_NATIVE} ">>" ${RegisterXPU_GENERATED_NATIVE})
 
-function(GEN_BACKEND file_yaml)
-  set(generated_files "")
-  foreach(f ${ARGN})
-    list(APPEND generated_files "${BUILD_TORCH_XPU_ATEN_GENERATED}/${f}")
-  endforeach()
-  file(GLOB_RECURSE depended_files ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml})
-  add_custom_command(
-    OUTPUT ${generated_files}
-    COMMAND
-    "${PYTHON_EXECUTABLE}" -m torchgen.gen_backend_stubs
-    --output_dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
-    --source_yaml ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml}
-    COMMAND
-    ${REGISTER_FALLBACK_CMD}
-    ${SIMPLE_TRACE}
-    WORKING_DIRECTORY ${TORCH_ROOT}
-    DEPENDS
-    ${depended_files}
-    ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml}
-    ${XPUFallback_PATH}
-  )
-endfunction(GEN_BACKEND)
-
-
-set(RegisterXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp)
-set(RegisterSparseXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp)
-set(RegisterSparseCsrXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp)
-set(RegisterNestedTensorXPU_PATH ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp)
-set(XPUFallback_PATH ${TORCH_XPU_OPS_ROOT}/src/ATen/native/xpu/XPUFallback.template)
-set(XPU_AOTI_INSTALL_DIR ${TORCH_ROOT}/torch/csrc/inductor/aoti_torch/generated/extend)
 function(GEN_XPU file_yaml)
   set(generated_files "")
   foreach(f ${ARGN})
     list(APPEND generated_files "${f}")
   endforeach()
-  file(GLOB_RECURSE depend_files ${TORCH_XPU_OPS_ROOT}/yaml/${file_yaml})
-  set(CODEGEN_TEMPLATE ${TORCH_XPU_OPS_ROOT}/yaml/)
+  set(CODEGEN_XPU_YAML_DIR ${TORCH_XPU_OPS_ROOT}/yaml)
 
   # Codegen prepare process
   if(WIN32)
-    string(REPLACE "/" "\\" DestPATH "${CODEGEN_TEMPLATE}templates")
-    string(REPLACE "/" "\\" SrcPATH "${CMAKE_SOURCE_DIR}/aten/src/ATen/templates")
+    file(TO_NATIVE_PATH "${CODEGEN_XPU_YAML_DIR}/templates" DestPATH)
+    file(TO_NATIVE_PATH "${CMAKE_SOURCE_DIR}/aten/src/ATen/templates" SrcPATH)
     execute_process(COMMAND cmd /c xcopy ${SrcPATH} ${DestPATH} /E /H /C /I /Y > nul)
-    string(REPLACE "/" "\\" RegisterXPU_PATH_BACKSLASH "${RegisterXPU_PATH}")
-    string(REPLACE "/" "\\" XPUFallback_PATH_BACKSLASH "${XPUFallback_PATH}")
-    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH_BACKSLASH} ">>" ${RegisterXPU_PATH_BACKSLASH})
   else()
-    execute_process(COMMAND ln -s ${CMAKE_SOURCE_DIR}/aten/src/ATen/templates ${CODEGEN_TEMPLATE}) # soft link to pytorch templates
-    set(REGISTER_FALLBACK_CMD ${FILE_DISPLAY_CMD} ${XPUFallback_PATH} ">>" ${RegisterXPU_PATH})
+    execute_process(COMMAND ln -s ${CMAKE_SOURCE_DIR}/aten/src/ATen/templates ${CODEGEN_XPU_YAML_DIR}) # soft link to pytorch templates
   endif()
-  add_custom_command(
-    OUTPUT ${generated_files}
-    COMMAND
+
+  set(XPU_CODEGEN_COMMAND
     "${PYTHON_EXECUTABLE}" -m torchgen.gen
-    --source-path ${TORCH_XPU_OPS_ROOT}/yaml/
+    --source-path ${CODEGEN_XPU_YAML_DIR}
     --install-dir ${BUILD_TORCH_XPU_ATEN_GENERATED}
     --per-operator-headers
-    --static-dispatch-backend
     --backend-whitelist XPU SparseXPU SparseCsrXPU NestedTensorXPU
-    # --xpu: generate in-tree RegisterXPU_0.cpp for in-tree OPs
     --xpu
+  )
+
+  add_custom_command(
+    COMMENT "Generating XPU ATen Codegen..."
+    OUTPUT ${generated_files}
+    COMMAND
+    ${XPU_CODEGEN_COMMAND}
+    --static-dispatch-backend
     # --update-aoti-c-shim: generate extend/c_shim_xpu.h
     --update-aoti-c-shim
     # --extend-aoti-c-shim: specify the extend/c_shim_xpu
@@ -95,16 +66,14 @@ function(GEN_XPU file_yaml)
     COMMAND
     ${REGISTER_FALLBACK_CMD}
     # Codegen post-process
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_PATH}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_PATH}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_PATH}
-    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_PATH}
-    ${SIMPLE_TRACE}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterXPU_GENERATED}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseXPU_GENERATED}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterSparseCsrXPU_GENERATED}
+    COMMAND "${PYTHON_EXECUTABLE}" ${TORCH_XPU_OPS_ROOT}/tools/codegen/remove_headers.py --register_xpu_path ${RegisterNestedTensorXPU_GENERATED}
    WORKING_DIRECTORY ${TORCH_ROOT}
    DEPENDS
-    ${depended_files}
-    ${TORCH_XPU_OPS_ROOT}/yaml/native/${file_yaml}
-    ${XPUFallback_PATH}
+    ${CODEGEN_XPU_YAML_DIR}/native/${file_yaml}
+    ${XPUFallback_TEMPLATE}
   )
 
   # Post codegen delete the copied templates folder only on Windows.
@@ -118,30 +87,29 @@ function(GEN_XPU file_yaml)
   endif()
 endfunction(GEN_XPU)
 
-# GEN_BACKEND(
-#   xpu_functions.yaml
-#   XPUNativeFunctions.h
-#   RegisterXPU_0.cpp)
-
 GEN_XPU(
   native_functions.yaml
   ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions.h
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterXPU_0.cpp
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseXPU_0.cpp
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterSparseCsrXPU_0.cpp
-  ${BUILD_TORCH_XPU_ATEN_GENERATED}/RegisterNestedTensorXPU_0.cpp
-  ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.h
-  ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp
+  ${BUILD_TORCH_XPU_ATEN_GENERATED}/XPUFunctions_inl.h
+  ${RegisterXPU_GENERATED}
+  ${RegisterSparseXPU_GENERATED}
+  ${RegisterSparseCsrXPU_GENERATED}
+  ${RegisterNestedTensorXPU_GENERATED}
+  ${XPU_AOTI_SHIM_HEADER}
+  ${XPU_AOTI_SHIM_SOURCE}
 )
 
-
 # The c_shim_xpu.cpp needs include files in ${CMAKE_BINARY_DIR}/xpu/ATen/ops/*.h
 # The include path is auto generated as "#include <ATen/ops/*.h>"
 # To follow the design of aoti codegen, here ${CMAKE_BINARY_DIR}/xpu is added to
 # $TORCH_XPU_OPS_INCLUDE_DIRS, so that "#include <ATen/ops/*.h>" works.
 list(APPEND TORCH_XPU_OPS_INCLUDE_DIRS ${CMAKE_BINARY_DIR}/xpu)
 
-list(APPEND xpu_generated_src ${RegisterXPU_PATH} ${RegisterSparseXPU_PATH} ${RegisterSparseCsrXPU_PATH} ${RegisterNestedTensorXPU_PATH})
-list(APPEND xpu_generated_src ${XPU_AOTI_INSTALL_DIR}/c_shim_xpu.cpp)
-add_custom_target(TORCH_XPU_GEN_TARGET DEPENDS ${xpu_generated_src})
+list(APPEND xpu_generated_src
+  ${RegisterXPU_GENERATED}
+  ${RegisterSparseXPU_GENERATED}
+  ${RegisterSparseCsrXPU_GENERATED}
+  ${RegisterNestedTensorXPU_GENERATED}
+  ${XPU_AOTI_SHIM_SOURCE}
+)
 set(ATen_XPU_GEN_SRCS ${xpu_generated_src})
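Note: ${REGISTER_FALLBACK_CMD} is a plain append (cat/type ... >>) of XPUFallback.template onto the generated RegisterXPU_0.cpp, so the fallback registration compiles inside the same registration unit. As a hedged sketch only (not the actual contents of XPUFallback.template; the function name and the CPU-fallback choice are illustrative), the kind of boxed-fallback registration such a template contributes looks like this in PyTorch's dispatcher API:

#include <ATen/native/CPUFallback.h>
#include <torch/library.h>

// Illustrative boxed fallback: route ops with no native XPU kernel to the
// CPU implementation via the dispatcher's boxed calling convention.
static void xpu_fallback(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack) {
  at::native::cpu_fallback(op, stack);
}

// "_" registers the fallback for every operator namespace on the XPU key.
TORCH_LIBRARY_IMPL(_, XPU, m) {
  m.fallback(torch::CppFunction::makeFromBoxedFunction<&xpu_fallback>());
}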

src/ATen/native/xpu/NMS.cpp (+2 -33)

@@ -42,39 +42,8 @@ Tensor nms(const Tensor& dets, const Tensor& scores, double iou_threshold_) {
       scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true));
   auto dets_sorted = dets.index_select(0, order_t).contiguous();
 
-  int dets_num = dets.size(0);
-  int col_blocks = (dets_num + nms_items_per_group - 1) / nms_items_per_group;
-
-  auto mask = nms_kernel(dets_sorted, iou_threshold);
-
-  at::Tensor mask_cpu = mask.to(at::kCPU);
-  unsigned long long* mask_host =
-      (unsigned long long*)mask_cpu.mutable_data_ptr();
-
-  std::vector<unsigned long long> remv(col_blocks);
-  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
-
-  at::Tensor keep =
-      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
-  int64_t* keep_out = keep.mutable_data_ptr<int64_t>();
-
-  int num_to_keep = 0;
-  for (int i = 0; i < dets_num; i++) {
-    int nblock = i / nms_items_per_group;
-    int inblock = i % nms_items_per_group;
-
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      keep_out[num_to_keep++] = i;
-      unsigned long long* p = mask_host + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
-
-  return order_t.index(
-      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
-           .to(order_t.device(), keep.scalar_type())});
+  auto keep = nms_kernel(dets_sorted, iou_threshold);
+  return order_t.masked_select(keep);
 }
 
 } // namespace at::native::xpu
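The removed lines were a host-side pass: copy the overlap bitmask to the CPU, run the greedy suppression loop, then narrow and index. After this change everything stays on the device: nms_kernel returns a boolean keep mask and masked_select picks the surviving indices in score order. For reference, the greedy reduction both versions compute is the following standalone sketch, distilled from the deleted loop (kItemsPerGroup stands in for nms_items_per_group, i.e. the 64 bits of one mask word):

#include <cstdint>
#include <vector>

constexpr int kItemsPerGroup = 64;

// mask holds dets_num x col_blocks 64-bit words; bit j of word (i, b) means
// box i overlaps box b*64+j above the IoU threshold. Boxes arrive in
// descending score order; keep box i unless an earlier kept box removed it.
std::vector<int64_t> greedy_keep(const uint64_t* mask, int dets_num) {
  int col_blocks = (dets_num + kItemsPerGroup - 1) / kItemsPerGroup;
  std::vector<uint64_t> removed(col_blocks, 0);
  std::vector<int64_t> keep;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / kItemsPerGroup;
    int inblock = i % kItemsPerGroup;
    if (!(removed[nblock] & (1ULL << inblock))) {
      keep.push_back(i);
      const uint64_t* row = mask + static_cast<size_t>(i) * col_blocks;
      for (int j = nblock; j < col_blocks; j++)
        removed[j] |= row[j]; // suppress every box that box i overlaps
    }
  }
  return keep;
}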

src/ATen/native/xpu/sycl/NMSKernel.cpp (+74 -2)

@@ -2,6 +2,7 @@
 #include <comm/SYCLContext.h>
 #include <comm/xpu_aten.h>
 
+#include <ATen/ceil_div.h>
 #include <ATen/native/xpu/sycl/NMSKernel.h>
 
 namespace at {
@@ -97,9 +98,68 @@ struct NMSKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
   sycl_local_acc_t<acc_t> slm_;
 };
 
+struct GatherKeepFromMask : public __SYCL_KER_CONFIG_CONVENTION__ {
+  void operator()(sycl::nd_item<1> item) const {
+    const int thread_id = item.get_local_id(0);
+
+    // Initialize removed
+    for (int i = thread_id; i < col_blocks_; i += nms_items_per_group) {
+      removed_[i] = 0;
+    }
+    item.barrier(sycl_local_fence);
+
+    for (int nblock = 0; nblock < col_blocks_; nblock++) {
+      auto removed_val = removed_[nblock];
+      item.barrier(sycl_local_fence);
+      const int i_offset = nblock * nms_items_per_group;
+
+      for (int inblock = 0; inblock < nms_items_per_group; inblock++) {
+        const int i = i_offset + inblock;
+        if (i >= n_boxes_)
+          break;
+
+        // Select a candidate, check if it should be kept
+        if (!(removed_val & (1ULL << inblock))) {
+          if (thread_id == 0) {
+            keep_[i] = true;
+          }
+          auto p = dev_mask_ + i * col_blocks_;
+
+          // Remove all bboxes which overlap the candidate
+          for (int j = thread_id; j < col_blocks_; j += nms_items_per_group) {
+            if (j >= nblock)
+              removed_[j] |= p[j];
+          }
+          item.barrier(sycl_local_fence);
+          removed_val = removed_[nblock];
+        }
+      }
+    }
+  }
+  GatherKeepFromMask(
+      bool* keep,
+      const unsigned long long* dev_mask,
+      const int n_boxes)
+      : keep_(keep),
+        dev_mask_(dev_mask),
+        n_boxes_(n_boxes),
+        col_blocks_(ceil_div(n_boxes, nms_items_per_group)) {}
+
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    removed_ = sycl_local_acc_t<unsigned long long>(col_blocks_, cgh);
+  }
+
+ private:
+  bool* keep_;
+  const unsigned long long* dev_mask_;
+  const int n_boxes_;
+  const int col_blocks_;
+  sycl_local_acc_t<unsigned long long> removed_;
+};
+
 Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold) {
   int dets_num = dets_sorted.size(0);
-  int col_blocks = (dets_num + nms_items_per_group - 1) / nms_items_per_group;
+  int col_blocks = ceil_div(dets_num, nms_items_per_group);
   auto mask = at::empty(
       {dets_num * col_blocks}, dets_sorted.options().dtype(at::kLong));
 
@@ -120,7 +180,19 @@ Tensor nms_kernel(const Tensor& dets_sorted, float iou_threshold) {
     sycl_kernel_submit(
         global_range, local_range, at::xpu::getCurrentSYCLQueue(), caller);
   });
-  return mask;
+
+  at::Tensor keep = at::zeros(
+      {dets_num}, dets_sorted.options().dtype(at::kBool).device(at::kXPU));
+  auto caller = GatherKeepFromMask(
+      keep.data_ptr<bool>(),
+      (unsigned long long*)mask.data_ptr<int64_t>(),
+      dets_num);
+  sycl_kernel_submit(
+      std::min(col_blocks, nms_items_per_group),
+      std::min(col_blocks, nms_items_per_group),
+      at::xpu::getCurrentSYCLQueue(),
+      caller);
+  return keep;
 }
 
 } // namespace xpu
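Two details worth noting. First, ceil_div from <ATen/ceil_div.h> replaces the hand-rolled (a + b - 1) / b rounding-up division. Second, GatherKeepFromMask is submitted with std::min(col_blocks, nms_items_per_group) as both the global and the local range, i.e. a single work-group, which is what makes the work-group barriers on the shared removed_ bitmap a valid synchronization strategy (and why keep starts from at::zeros: the kernel only ever writes true bits). A minimal sketch of the ceil_div identity, under the assumption of positive integers as used for col_blocks:

#include <cassert>

// Rounds the integer quotient up instead of down; same identity ceil_div uses.
constexpr int ceil_div_sketch(int a, int b) {
  return (a + b - 1) / b;
}

int main() {
  static_assert(ceil_div_sketch(256, 64) == 4, "exact multiple");
  static_assert(ceil_div_sketch(257, 64) == 5, "one extra block");
  static_assert(ceil_div_sketch(1, 64) == 1, "small input still gets a block");
  // e.g. dets_num = 1000 boxes -> 16 mask words (of 64 bits) per box
  static_assert(ceil_div_sketch(1000, 64) == 16, "");
  return 0;
}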
