[GPU][POC] clDNN gemv optimization for LLM second token #28976
base: master
Do we still need this logic? The optimized static impl should be selected at step 1), before this point.
It seems we need it to switch to the gemv kernel for the second token; let's double-check the details.
Confirmed it works well when running the LLM model without this logic, but in the dynamic shape case it will choose the fc_bf_tiled kernel rather than the gemv kernel for single-batch input. @sshlyapn Is there a better solution to this problem?
Please try to set the priority value in GetKernelsPriority() lower than for the bf_tiled kernel, something like FORCE_PRIORITY_3.
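For context, a minimal self-contained sketch of what that suggestion amounts to is below. It is not the PR's code: the stand-in types, the FullyConnected_GEMV class name, and the assumption that bf_tiled returns FORCE_PRIORITY_2 are illustrative only, and the real GetKernelsPriority() signature in kernel_selector can differ between OpenVINO branches.

```cpp
// Minimal self-contained sketch of the suggestion above, not the actual
// kernel_selector code. The stand-in types, the FullyConnected_GEMV class name,
// and the bf_tiled priority value are assumptions for illustration only.
#include <cstdint>

using KernelsPriority = uint32_t;                // stand-in for the kernel_selector typedef
constexpr KernelsPriority FORCE_PRIORITY_2 = 2;  // assumed bf_tiled priority
constexpr KernelsPriority FORCE_PRIORITY_3 = 3;  // suggested priority for the gemv kernel

struct Params {};                                // stand-in for kernel_selector::Params

// Hypothetical gemv kernel class, shown only to illustrate where the override would live.
struct FullyConnected_GEMV {
    KernelsPriority GetKernelsPriority(const Params& /*params*/) const {
        // A larger FORCE_PRIORITY_* value ranks a kernel below one returning a
        // smaller value, so this keeps gemv below bf_tiled in the selection order.
        return FORCE_PRIORITY_3;
    }
};
```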
It seems that doesn't work. The gemv impl only supports single-batch input, and in the dynamic shape case the input batch is not known before the FC impl is chosen, so fc_bf_tiled is selected first. Once the input shape is set, there is no chance to re-select a new FC impl, so we have to add the logic above to allow re-selecting the FC impl.
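A rough sketch of the re-selection logic being described, using hypothetical names (select_fc_impl, FcImpl) rather than the PR's actual types:

```cpp
// Illustrative sketch only, not the PR's implementation. It captures the idea
// that the gemv impl can only be chosen once the runtime batch is known.
#include <cstdint>

enum class FcImpl { GemvSingleBatch, BfTiled };

FcImpl select_fc_impl(int64_t batch, bool shape_is_static) {
    // At compilation time a dynamic-shaped FC has an unknown batch, so the
    // generic bf_tiled impl is selected by default.
    if (!shape_is_static)
        return FcImpl::BfTiled;
    // Once the actual shape is set (e.g. second-token generation with batch == 1),
    // re-select gemv instead of keeping the previously chosen bf_tiled impl.
    return (batch == 1) ? FcImpl::GemvSingleBatch : FcImpl::BfTiled;
}
```

Without a re-selection step after the shape becomes static, the first branch is the only one ever taken for dynamic models, which is why the extra logic is needed.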
Thanks @sshlyapn, great help in solving the dynamic shape issue!