Commit 48dfd16

[TESTS] Use FP32 inference precision, FP16 KV cache precision for pipelines (#1485)
OpenVINO plugins enable different kinds of optimizations by default, such as KV cache compression to int8 and FP16 inference precision, while in the GenAI tests we want to exercise the pipelines and compare them against HF / optimum without extra optimizations: https://github.com/openvinotoolkit/openvino.genai/blob/4db67aecac78885c6d1e302f348c9489e2154388/tests/python_tests/common.py#L318-L325 With this in place we can hopefully merge int8 KV cache by default for CB (#1206), because the tests will still compare against an FP16 KV cache, while official Validation remains responsible for validating against the reference via WWB metrics.
1 parent b04b28b commit 48dfd16

9 files changed: +21 −22 lines changed
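
The tests pin these defaults through the get_default_properties() helper in tests/python_tests/common.py (the lines linked in the commit message); every pipeline and optimum-intel model touched below is constructed with it. A minimal sketch of what that helper plausibly returns, assuming the OpenVINO hint properties that match the commit title (the exact body lives at the linked lines):

import openvino as ov
import openvino.properties.hint as hints

def get_default_properties():
    # Force FP32 inference and an FP16 KV cache so pipeline outputs can be
    # compared against HF / optimum without plugin-default optimizations
    # (int8 KV cache compression, FP16 inference precision, etc.).
    return {
        hints.inference_precision: ov.Type.f32,
        hints.kv_cache_precision: ov.Type.f16,
    }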

samples/export-requirements.txt (+1 −1)

@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen

src/python/openvino_genai/py_openvino_genai.pyi (+1 −1)

@@ -364,7 +364,7 @@ class ContinuousBatchingPipeline:
     def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, **kwargs) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:

src/python/py_continuous_batching_pipeline.cpp (+3 −4)

@@ -223,15 +223,14 @@ void init_continuous_batching_pipeline(py::module_& m) {
         py::arg("properties") = ov::AnyMap({}),
         py::arg("tokenizer_properties") = ov::AnyMap({}))

-        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
+            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"),
         py::arg("tokenizer"),
         py::arg("scheduler_config"),
-        py::arg("device"),
-        py::arg("properties") = ov::AnyMap({}))
+        py::arg("device"))

         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)

tests/python_tests/common.py (+1 −1)

@@ -306,7 +306,7 @@ def run_continuous_batching(
     if type(generation_configs) is not list:
         generation_configs = [generation_configs] * len(prompts)

-    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU')
+    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties())
     output = cb_pipe.generate(prompts, generation_configs)

     del cb_pipe

tests/python_tests/ov_genai_test_utils.py (+6 −6)

@@ -14,7 +14,7 @@
 import json

 import openvino_genai as ov_genai
-
+from common import get_default_properties

 def get_models_list():
     precommit_models = [
@@ -92,7 +92,7 @@ def read_model(params, **tokenizer_kwargs):

     if (models_path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
-                                                       compile=False, device='CPU')
+                                                       compile=False, device='CPU', ov_config=get_default_properties())
     else:
         ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
@@ -104,7 +104,7 @@ def read_model(params, **tokenizer_kwargs):
         hf_tokenizer.save_pretrained(models_path)

         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
-                                                       compile=False, device='CPU', load_in_8bit=False)
+                                                       compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_properties())
         opt_model.generation_config.save_pretrained(models_path)
         opt_model.config.save_pretrained(models_path)
         opt_model.save_pretrained(models_path)
@@ -114,7 +114,7 @@ def read_model(params, **tokenizer_kwargs):
         models_path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
+        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_properties()),
     )


@@ -178,7 +178,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
     with (temp_path / config_name).open('w') as f:
         json.dump(config_json, f)

-    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU')
+    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_properties())

     for _, config_name in configs:
         os.remove(temp_path / config_name)
@@ -188,4 +188,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):

 @functools.lru_cache(1)
 def get_continuous_batching(path):
-    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig())
+    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_properties())

tests/python_tests/requirements.txt (+1 −1)

@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest

tests/python_tests/test_continuous_batching.py (+2 −2)

@@ -9,7 +9,7 @@
 from pathlib import Path
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer

-from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
+from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
     get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -155,7 +155,7 @@ def test_post_oom_health(tmp_path, sampling_config):
     models_path : Path = tmp_path / model_id
     convert_models(opt_model, hf_tokenizer, models_path)

-    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
+    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_properties())

     # First run should return incomplete response
     output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])

tests/python_tests/test_kv_cache_eviction.py (+4 −4)

@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer

-from common import TESTS_ROOT, run_cb_pipeline_with_ref
+from common import TESTS_ROOT, run_cb_pipeline_with_ref, get_default_properties


 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -42,7 +42,7 @@ class ConvertedModel:
 @pytest.fixture(scope='module')
 def converted_model(tmp_path_factory):
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False)
+    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_properties())
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching

     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_properties())

     tokenizer = converted_model.tokenizer
tests/python_tests/test_vlm_pipeline.py (+2 −2)

@@ -7,7 +7,7 @@
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline, GenerationConfig
-from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters, get_default_properties

 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -19,7 +19,7 @@ def get_ov_model(cache):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties())
     processor.save_pretrained(model_dir)
     model.save_pretrained(model_dir)
     return model_dir
