Commit 26f3123

[GPU] Gemm indirect input1 optimization

1 parent 44e9a00 commit 26f3123

5 files changed, +53 -0 lines changed

src/plugins/intel_gpu/src/graph/impls/ocl/multi_stage_primitive.hpp (+8)

@@ -111,7 +111,15 @@ struct multi_stage_primitive : public typed_primitive_impl<PType> {
     void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
         _kernels.clear();
         if (!_kernels_data.empty() && !_kernels_data[0].kernels.empty()) {
+            auto expected = 0;
+            for (auto& kd : _kernels_data) {
+                for (auto& k : kd.kernels) {
+                    GPU_DEBUG_TRACE_DETAIL << k.code.kernelString->entry_point << "\n";
+                    expected++;
+                }
+            }
             auto compiled_kernels = kernels_cache.get_kernels(params);
+            GPU_DEBUG_TRACE_DETAIL << "Init kernels call, size: " << _kernels_data.size() << " compiled=" << compiled_kernels.size() << "\n";
             _kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
             // batch program hash and kernel entry point to find corresponding cl source code
             kernel_dump_info = std::make_pair(std::to_string(kernels_cache.get_kernel_batch_hash(params)),

src/plugins/intel_gpu/src/graph/primitive_inst.cpp (+3)

@@ -1513,6 +1513,9 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
             _outputs = allocate_outputs();
         }
     }
+    if (_node) {
+        GPU_DEBUG_TRACE_DETAIL << _node->type()->to_string(*_node) << "\n";
+    }
     if (_impl) {
         _impl->set_node_params(node);
         if (_impl->is_dynamic() && !_impl->is_cpu()) {

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/beam_table_update_ref.cl (+12)

@@ -4,13 +4,19 @@

 #include "include/batch_headers/common.cl"

+
+// printf("in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]", INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH);
+
 KERNEL(beam_table_update)(
     OPTIONAL_SHAPE_INFO_ARG
     __global const INPUT0_TYPE* state_prev,
     __global const INPUT1_TYPE* beam_idx,
     __global OUTPUT_TYPE* state_new,
     uchar is_state_set)
 {
+    if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0 && INPUT1_BATCH_NUM == 2) {
+        // printf("Bean content: %d %d\n", beam_idx[0], beam_idx[1]);
+    }
     const unsigned int b = (uint)get_global_id(0);
     const unsigned int s = (uint)get_global_id(1);

@@ -21,11 +27,17 @@ KERNEL(beam_table_update)(
         return;

     if (!is_state_set) {
+        // printf("%d %d. in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]. Init state_new[%d]=%d\n",
+        //        b, s, INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH, out_offset, b);
         state_new[out_offset] = TO_OUTPUT_TYPE(b);
     } else {
         if (s < INPUT0_BATCH_PITCH) {
+            // printf("%d %d. in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]. Reuse state_new[%d]=state_prev[%d](%d)\n",
+            //        b, s, INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH, out_offset, in_offset, state_prev[in_offset]);
             state_new[out_offset] = state_prev[in_offset];
         } else {
+            // printf("%d %d. in0 shape[%dx%d], in1 shape[%dx%d], out shape[%dx%d]. New state_new[%d]=%d\n",
+            //        b, s, INPUT0_BATCH_NUM, INPUT0_BATCH_PITCH, INPUT1_BATCH_NUM, INPUT1_BATCH_PITCH, OUTPUT_BATCH_NUM, OUTPUT_BATCH_PITCH, out_offset, b);
             state_new[out_offset] = b;
         }
     }

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl (+29)

@@ -306,11 +306,33 @@ KERNEL(gemm_tiled_opt)(
 #if INDIRECT_INPUT1
         if (do_indirect_load)
         {
+#if INPUT1_SIZE_X == 128 && INPUT1_FEATURE_NUM == 32 && defined(INPUT2_TYPE) && 0
+            const __global INPUT1_TYPE* b_ptr_new = input1;
+            uint b_new = beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (k * TILE_K), x)];
+            uint load_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_new, f, w, z, (k * TILE_K), x);
+            b_ptr_new += load_idx;
+            b_tile = (N > b_raw_global_id) ? VLOAD(0, b_ptr_new) : 0;
+#elif INPUT1_SIZE_X == 128 && INPUT1_FEATURE_NUM == 32 && defined(INPUT2_TYPE) && 2
+            const __global INPUT1_TYPE* b_ptr_new = input1;
+            unroll_for (uint tile_n_load_idx = 0; tile_n_load_idx < TILE_N; tile_n_load_idx++) {
+                if (tile_n_offset + tile_n_load_idx >= N) {
+                    b_tile[tile_n_load_idx] = 0;
+                } else {
+                    // uint b_new = beam_table[FUNC_CALL(get_bt_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (k * TILE_K), tile_n_offset + tile_n_load_idx)];
+                    // uint load_idx = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_new, f, w, z, (k * TILE_K), tile_n_offset + tile_n_load_idx);
+                    uint load_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (k * TILE_K) + sglid, tile_n_offset + tile_n_load_idx, beam_table);
+                    // b_tile[tile_n_load_idx] = BLOCK_READ_B(b_ptr_new + load_idx, 0);
+                    b_tile[tile_n_load_idx] = b_ptr_new[load_idx];
+                    // b_tile[tile_n_load_idx] = b_ptr_new[load_idx + sglid];
+                }
+            }
+#else
             unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) {
                 uint b_load_offset = (k * TILE_K) + b_load_id;
                 uint b_idx = FUNC_CALL(get_input1_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, b_load_offset, x, beam_table);
                 b_tile[b_load_id] = b_raw_global_id >= N ? 0 : input1[b_idx];
             }
+#endif
         }
         else
 #endif

@@ -354,7 +376,14 @@ KERNEL(gemm_tiled_opt)(
                     c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read[subtile_k_id], simd_local_id)),
                                          b_tile[subtile_k_id * SIMD_WIDTH + simd_local_id], c_tile[dot_id]);
 #else // TILE_K > SIMD_WIDTH
+#if INPUT1_SIZE_X == 128 && INPUT1_FEATURE_NUM == 32 && defined(INPUT2_TYPE) && 2
+                INPUT0_TYPE tmp = a_read * b_tile[simd_local_id];
+                INPUT0_TYPE res = sub_group_reduce_add(tmp);
+                if (sglid == simd_local_id)
+                    c_tile[dot_id] = res + c_tile[dot_id];
+#else
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_local_id)), b_tile[simd_local_id], c_tile[dot_id]);
+#endif
 #endif // TILE_K > SIMD_WIDTH
             }
         }
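
Note on the accumulation change above: the trailing `&& 0` keeps the first experimental variant disabled, while `&& 2` (non-zero) enables the second one for the hard-coded 128x32 shape, so only that branch plus the reduction in the second hunk are actually exercised. The reduction form lines up with the baseline because the indirect load in the `#elif` branch indexes B with `(k * TILE_K) + sglid` as the K coordinate and the tile column as the register index, i.e. a transposed register layout relative to the baseline (where the register index runs over K and the lane over output columns). Summing the per-lane products across the subgroup then reproduces the same per-column dot products as the broadcast+mad loop. Below is a minimal, self-contained sketch of that equivalence; it is not part of the commit, the kernel and buffer names are made up for illustration, and it assumes a 16-lane subgroup plus the cl_khr_subgroups and cl_intel_required_subgroup_size extensions.

#pragma OPENCL EXTENSION cl_khr_subgroups : enable

#define SG 16  // assumed subgroup width; matches the 16-lane SIMD the tiled GEMM targets

__attribute__((intel_reqd_sub_group_size(SG)))
__attribute__((reqd_work_group_size(SG, 1, 1)))
__kernel void dot_accum_demo(__global const float* A,      // SG values:      A[k]
                             __global const float* B,      // SG x SG values: B[k * SG + n]
                             __global float* C_diff)       // per-lane |baseline - reduce|
{
    const uint lane = get_sub_group_local_id();
    const float a_read = A[lane];                           // lane l holds A[l]

    // Baseline register layout: register index = k, lane = output column n.
    float b_tile[SG];
    for (uint k = 0; k < SG; k++)
        b_tile[k] = B[k * SG + lane];

    // Transposed layout used by the experimental indirect load: register index = n, lane = k.
    float b_tile_t[SG];
    for (uint n = 0; n < SG; n++)
        b_tile_t[n] = B[lane * SG + n];

    // Form 1 (baseline): broadcast A[k] to every lane, multiply by the lane's B[k][n].
    float c_broadcast = 0.0f;
    for (uint k = 0; k < SG; k++)
        c_broadcast = mad(sub_group_broadcast(a_read, k), b_tile[k], c_broadcast);

    // Form 2 (experimental): each lane contributes A[lane] * B[lane][n]; the subgroup
    // reduction yields the full dot product for column n, kept only by lane n.
    float c_reduce = 0.0f;
    for (uint n = 0; n < SG; n++) {
        float res = sub_group_reduce_add(a_read * b_tile_t[n]);
        if (lane == n)
            c_reduce += res;
    }

    C_diff[lane] = fabs(c_broadcast - c_reduce);            // expected to be ~0.0f
}

Per simd_local_id step the baseline form issues one broadcast and lets every lane accumulate, while the experimental form issues one subgroup reduction and lets a single lane accumulate; which is faster depends on how the hardware implements the two subgroup operations, which is presumably what this experiment is meant to measure.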

src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp (+1)

@@ -698,6 +698,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

     {
         ov::pass::Manager manager;
+        manager.m_visualize = false;
         manager.register_pass<ov::intel_gpu::ClampFP16Output>();
         manager.register_pass<ov::intel_gpu::ConvertMatMulToFullyConnected>();
         manager.register_pass<ov::intel_gpu::MoveFCReshapeToWeights>();
