Commit 26f3123

[GPU] Gemm indirect input1 optimization

1 parent 44e9a00 commit 26f3123

5 files changed, +53 -0 lines changed

src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp (+8)

@@ -111,7 +111,15 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
     void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
         _kernels.clear();
         if (!_kernels_data.empty() && !_kernels_data[0].kernels.empty()) {
+            auto expected = 0;
+            for (auto& kd : _kernels_data) {
+                for (auto& k : kd.kernels) {
+                    GPU_DEBUG_TRACE_DETAIL << k.code.kernelString->entry_point << "\n";
+                    expected++;
+                }
+            }
             auto compiled_kernels = kernels_cache.get_kernels(params);
+            GPU_DEBUG_TRACE_DETAIL << "Init kernels call, size: " << _kernels_data.size() << " compiled=" << compiled_kernels.size() << "\n";
             _kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
             // batch program hash and kernel entry point to find corresponding cl source code
             kernel_dump_info = std::make_pair(std::to_string(kernels_cache.get_kernel_batch_hash(params)),

src/plugins/intel_gpu/src/graph/primitive_inst.cpp (+3)

@@ -1513,6 +1513,9 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
             _outputs = allocate_outputs();
         }
     }
+    if (_node) {
+        GPU_DEBUG_TRACE_DETAIL << _node->type()->to_string(*_node) << "\n";
+    }
     if (_impl) {
         _impl->set_node_params(node);
         if (_impl->is_dynamic() && !_impl->is_cpu()) {

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/beam_table_update_ref.cl (+12)

@@ -4,13 +4,19 @@

 #include "include/batch_headers/common.cl"

+
+// printf("in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]", INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH);
+
 KERNEL(beam_table_update)(
     OPTIONAL_SHAPE_INFO_ARG
     __global const INPUT0_TYPE* state_prev,
     __global const INPUT1_TYPE* beam_idx,
     __global OUTPUT_TYPE* state_new,
     uchar is_state_set)
 {
+    if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0 && INPUT1_BATCH_NUM == 2) {
+        // printf("Bean content: %d %d\n", beam_idx[0], beam_idx[1]);
+    }
     const unsigned int b = (uint)get_global_id(0);
     const unsigned int s = (uint)get_global_id(1);

@@ -21,11 +27,17 @@ KERNEL(beam_table_update)(
         return;

     if (!is_state_set) {
+        // printf("%d %d. in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]. Init state_new[%d]=%d\n",
+        //        b, s, INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH, out_offset, b);
         state_new[out_offset] = TO_OUTPUT_TYPE(b);
     } else {
         if (s < INPUT0_BATCH_PITCH) {
+            // printf("%d %d. in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]. Reuse state_new[%d]=state_prev[%d](%d)\n",
+            //        b, s, INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH, out_offset, in_offset, state_prev[in_offset]);
             state_new[out_offset] = state_prev[in_offset];
         } else {
+            // printf("%d %d. in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]. New state_new[%d]=%d\n",
+            //        b, s, INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH, out_offset, b);
             state_new[out_offset] = b;
         }
     }

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl (+29)

@@ -306,11 +306,33 @@ KERNEL(gemm_tiled_opt)(
 #if INDIRECT_INPUT1
         if (do_indirect_load)
         {
+#if INPUT1_SIZE_X == 128 && INPUT1_FEATURE_NUM == 32 && defined(INPUT2_TYPE) && 0
+            const __global INPUT1_TYPE* b_ptr_new = input1;
+            uint b_new = beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (k * TILE_K), x)];
+            uint load_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_new, f, w, z, (k * TILE_K), x);
+            b_ptr_new += load_idx;
+            b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr_new) : 0;
+#elif INPUT1_SIZE_X == 128 && INPUT1_FEATURE_NUM == 32 && defined(INPUT2_TYPE) && 2
+            const __global INPUT1_TYPE* b_ptr_new = input1;
+            unroll_for (uint tile_n_load_idx = 0; tile_n_load_idx < TILE_N; tile_n_load_idx++) {
+                if (tile_n_offset + tile_n_load_idx >= N) {
+                    b_tile[tile_n_load_idx] = 0;
+                } else {
+                    // uint b_new = beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (k * TILE_K), tile_n_offset + tile_n_load_idx)];
+                    // uint load_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_new, f, w, z, (k * TILE_K), tile_n_offset + tile_n_load_idx);
+                    uint load_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (k * TILE_K) + sglid, tile_n_offset + tile_n_load_idx, beam_table);
+                    // b_tile[tile_n_load_idx] = BLOCK_READ_B(b_ptr_new + load_idx, 0);
+                    b_tile[tile_n_load_idx] = b_ptr_new[load_idx];
+                    // b_tile[tile_n_load_idx] = b_ptr_new[load_idx + sglid];
+                }
+            }
+#else
             unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) {
                 uint b_load_offset = (k * TILE_K) + b_load_id;
                 uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table);
                 b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx];
             }
+#endif
         }
         else
 #endif

@@ -354,7 +376,14 @@ KERNEL(gemm_tiled_opt)(
                     c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read[subtile_k_id], simd_local_id)),
                                          b_tile[subtile_k_id * SIMD_WIDTH + simd_local_id], c_tile[dot_id]);
 #else // TILE_K > SIMD_WIDTH
+#if INPUT1_SIZE_X == 128 && INPUT1_FEATURE_NUM == 32 && defined(INPUT2_TYPE) && 2
+                INPUT0_TYPE tmp = a_read * b_tile[simd_local_id];
+                INPUT0_TYPE res = sub_group_reduce_add(tmp);
+                if (sglid == simd_local_id)
+                    c_tile[dot_id] = res + c_tile[dot_id];
+#else
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_local_id)), b_tile[simd_local_id], c_tile[dot_id]);
+#endif
 #endif // TILE_K > SIMD_WIDTH
             }
         }
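
Note on the accumulation change above: the trailing `&& 0` keeps the first experimental variant disabled, while `&& 2` (non-zero) enables the second one for the hard-coded 128x32 shape, so only that branch plus the reduction in the second hunk are actually exercised. The reduction form lines up with the baseline because the indirect load in the `#elif` branch indexes B with `(k * TILE_K) + sglid` as the K coordinate and the tile column as the register index, i.e. a transposed register layout relative to the baseline (where the register index runs over K and the lane over output columns). Summing the per-lane products across the subgroup then reproduces the same per-column dot products as the broadcast+mad loop. Below is a minimal, self-contained sketch of that equivalence; it is not part of the commit, the kernel and buffer names are made up for illustration, and it assumes a 16-lane subgroup plus the cl_khr_subgroups and cl_intel_required_subgroup_size extensions.

#pragma OPENCL EXTENSION cl_khr_subgroups : enable

#define SG 16  // assumed subgroup width; matches the 16-lane SIMD the tiled GEMM targets

__attribute__((intel_reqd_sub_group_size(SG)))
__attribute__((reqd_work_group_size(SG, 1, 1)))
__kernel void dot_accum_demo(__global const float* A,      // SG values:      A[k]
                             __global const float* B,      // SG x SG values: B[k * SG + n]
                             __global float* C_diff)       // per-lane |baseline - reduce|
{
    const uint lane = get_sub_group_local_id();
    const float a_read = A[lane];                           // lane l holds A[l]

    // Baseline register layout: register index = k, lane = output column n.
    float b_tile[SG];
    for (uint k = 0; k < SG; k++)
        b_tile[k] = B[k * SG + lane];

    // Transposed layout used by the experimental indirect load: register index = n, lane = k.
    float b_tile_t[SG];
    for (uint n = 0; n < SG; n++)
        b_tile_t[n] = B[lane * SG + n];

    // Form 1 (baseline): broadcast A[k] to every lane, multiply by the lane's B[k][n].
    float c_broadcast = 0.0f;
    for (uint k = 0; k < SG; k++)
        c_broadcast = mad(sub_group_broadcast(a_read, k), b_tile[k], c_broadcast);

    // Form 2 (experimental): each lane contributes A[lane] * B[lane][n]; the subgroup
    // reduction yields the full dot product for column n, kept only by lane n.
    float c_reduce = 0.0f;
    for (uint n = 0; n < SG; n++) {
        float res = sub_group_reduce_add(a_read * b_tile_t[n]);
        if (lane == n)
            c_reduce += res;
    }

    C_diff[lane] = fabs(c_broadcast - c_reduce);            // expected to be ~0.0f
}

Per simd_local_id step the baseline form issues one broadcast and lets every lane accumulate, while the experimental form issues one subgroup reduction and lets a single lane accumulate; which is faster depends on how the hardware implements the two subgroup operations, which is presumably what this experiment is meant to measure.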

src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp (+1)

@@ -698,6 +698,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

     {
         ov::pass::Manager manager;
+        manager.m_visualize = false;
         manager.register_pass<ov::intel_gpu::ClampFP16Output>();
         manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
         manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
