[GPU] Updated GPU cache size retrieval and refined closest_pow_of_2 (openvinotoolkit#28059)

arshadlab · web-flow · commit b0a8c14d452e · 2024-12-17T07:43:32.000Z
Details:
Existing method for cache size calculation was static and need
continious updates to the sku table which was already being missed for
latest skus e.g DG2.
This update introduces a new member variable, max_global_cache_size, to
store the GPU's global cache size, obtained via the OpenCL property
CL_DEVICE_GLOBAL_MEM_CACHE_SIZE. The existing hard coded cache
calculations are removed. Additionally, the closest_pow_of_2 function
has been enhanced to return the nearest power of 2, favoring the upper
value if the input is within 30% of the range for the upper bound. These
changes improve memory management and ensure better utilization of GPU
resources towards bottle neck situations.

Tickets:
     CVS-159076

Signed-off-by: Arshad Mehmood &lt;arshad.mehmood@intel.com&gt;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
@@ -56,6 +56,7 @@ struct device_info {
     uint64_t max_local_mem_size;                ///< Maximum size of local memory arena in bytes.
     uint64_t max_global_mem_size;               ///< Maximum size of global device memory in bytes.
     uint64_t max_alloc_mem_size;                ///< Maximum size of memory object allocation in bytes.
+    uint64_t max_global_cache_size;             ///< Maximum size of cache memory bytes.
 
     uint64_t max_image2d_width;                 ///< Maximum image 2d width supported by the device.
     uint64_t max_image2d_height;                ///< Maximum image 2d height supported by the device.
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -797,12 +797,24 @@ uint32_t Plugin::get_optimal_batch_size(const ov::AnyMap& options) const {
     auto device_id = get_property(ov::device::id.name(), options).as<std::string>();
     auto context = get_default_contexts().at(device_id);
     const auto& device_info = context->get_engine().get_device_info();
-    auto next_pow_of_2 = [] (float x) {
-        return pow(2, ceil(std::log(x)/std::log(2)));
-    };
+
     auto closest_pow_of_2 = [] (float x) {
-        return pow(2, floor(std::log(x)/std::log(2)));
+        int lower_power = static_cast<int>(floor(std::log(x) / std::log(2)));
+        double lower_value = pow(2, lower_power);        // Current power of 2
+        double upper_value = pow(2, lower_power + 1);   // Next power of 2
+
+        // Determine the threshold (70% of the range between lower and upper values)
+        // If x is within the upper 30% of the range, return the upper power of 2.
+        double threshold = 0.7 * (upper_value - lower_value);
+
+        // Compare x with the threshold and return the appropriate power of 2
+        if (x - lower_value > threshold) {
+            return upper_value;  // Return the next power of 2
+        } else {
+            return lower_value;  // Return the current power of 2
+        }
     };
+
     auto model_param = options.find(ov::hint::model.name());
     if (model_param == options.end()) {
         GPU_DEBUG_INFO << "[OPTIMAL_BATCH_SIZE] ov::hint::model is not set: return 1" << std::endl;
@@ -816,31 +828,10 @@ uint32_t Plugin::get_optimal_batch_size(const ov::AnyMap& options) const {
     }
     GPU_DEBUG_INFO << "DEVICE_INFO:"
                    << "gfx_version.major, " << device_info.gfx_ver.major
-                   << "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor) << std::endl;
-    static std::map<cldnn::gfx_version, size_t> gen_kbytes_per_bank = {
-            {{12, 0, 0}, 480},  // TGL
-            {{12, 1, 0}, 2048}, // DG1
-            {{12, 5, 0}, 320},
-            {{12, 7, 0}, 512},
-    };
-    size_t L3_cache_size = device_info.gfx_ver.major && (device_info.gfx_ver.major <= 9)
-            ? 768 * 1024 // Gen9
-            : 2 * 768 * 1024;  //reasonable default when no arch has been detected (e.g. due to old driver ver)
-    cldnn::gfx_version gen = {device_info.gfx_ver.major, device_info.gfx_ver.minor, 0 /*ignore the revision*/};
-    auto val = gen_kbytes_per_bank.find(gen);
-    if (gen_kbytes_per_bank.end() != val) {
-        auto kbytes_per_bank = val->second;
-        auto num_banks_per_slice = device_info.num_sub_slices_per_slice > 4
-                                    ? next_pow_of_2(device_info.num_sub_slices_per_slice)
-                                    : 2 * device_info.num_sub_slices_per_slice;
-        L3_cache_size = kbytes_per_bank * 1024 * num_banks_per_slice * device_info.num_slices;
-        GPU_DEBUG_INFO << "DEVICE_INFO:"
-                        << "num_slices " << device_info.num_slices
-                        << ", num_sub_slices_per_slice " << device_info.num_sub_slices_per_slice
-                        << ", num_banks_per_slice " << num_banks_per_slice
-                        << ", gen_kbytes_per_bank : " << kbytes_per_bank
-                        << ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl;
-    }
+                   << "gfx_version.minor " << std::to_string(device_info.gfx_ver.minor)
+                   << "Cache size " << std::to_string(device_info.max_global_cache_size) << std::endl;
+
+    size_t L3_cache_size = device_info.max_global_cache_size;
     auto config = m_configs_map.at(device_id);
     auto cloned_model = clone_and_transform_model(model, config, context);
     ov::MemBandwidthPressure memPressure = ov::mem_bandwidth_pressure_tolerance(cloned_model, L3_cache_size);
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp
@@ -224,6 +224,7 @@ device_info init_device_info(const cl::Device& device, const cl::Context& contex
     info.max_local_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>());
     info.max_global_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>());
     info.max_alloc_mem_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>());
+    info.max_global_cache_size = static_cast<uint64_t>(device.getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>());
 
     info.supports_image = static_cast<uint8_t>(device.getInfo<CL_DEVICE_IMAGE_SUPPORT>());
     info.max_image2d_width = static_cast<uint64_t>(device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>());