Commit 48dfd16

[TESTS] Use FP32 inference precision, FP16 KV cache precision for pipelines (#1485)
OpenVINO plugins enable different kinds of optimizations by default, such as KV cache compression to int8 and FP16 inference precision, while in the GenAI tests we want to exercise the pipelines and compare them against HF / optimum without extra optimizations: https://github.com/openvinotoolkit/openvino.genai/blob/4db67aecac78885c6d1e302f348c9489e2154388/tests/python_tests/common.py#L318-L325 With this in place we can hopefully merge int8 KV cache by default for CB (#1206), because the tests will still compare against an FP16 KV cache, while official Validation remains responsible for validating against the reference via WWB metrics.
1 parent b04b28b commit 48dfd16

9 files changed: +21 −22 lines changed
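
The tests pin these defaults through the get_default_properties() helper in tests/python_tests/common.py (the lines linked in the commit message); every pipeline and optimum-intel model touched below is constructed with it. A minimal sketch of what that helper plausibly returns, assuming the OpenVINO hint properties that match the commit title (the exact body lives at the linked lines):

import openvino as ov
import openvino.properties.hint as hints

def get_default_properties():
    # Force FP32 inference and an FP16 KV cache so pipeline outputs can be
    # compared against HF / optimum without plugin-default optimizations
    # (int8 KV cache compression, FP16 inference precision, etc.).
    return {
        hints.inference_precision: ov.Type.f32,
        hints.kv_cache_precision: ov.Type.f16,
    }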

samples/export-requirements.txt (+1 −1)

@@ -2,7 +2,7 @@
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
 --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 openvino-tokenizers~=2025.0.0.0.dev
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen

src/python/openvino_genai/py_openvino_genai.pyi (+1 −1)

@@ -364,7 +364,7 @@ class ContinuousBatchingPipeline:
     def __init__(self, models_path: os.PathLike, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}, tokenizer_properties: dict[str, typing.Any] = {}) -> None:
         ...
     @typing.overload
-    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, properties: dict[str, typing.Any] = {}) -> None:
+    def __init__(self, models_path: os.PathLike, tokenizer: Tokenizer, scheduler_config: SchedulerConfig, device: str, **kwargs) -> None:
         ...
     @typing.overload
     def add_request(self, request_id: int, input_ids: openvino._pyopenvino.Tensor, generation_config: GenerationConfig) -> GenerationHandle:

src/python/py_continuous_batching_pipeline.cpp (+3 −4)

@@ -223,15 +223,14 @@ void init_continuous_batching_pipeline(py::module_& m) {
         py::arg("properties") = ov::AnyMap({}),
         py::arg("tokenizer_properties") = ov::AnyMap({}))

-        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+        .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
-            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::properties_to_any_map(plugin_config));
+            return std::make_unique<ContinuousBatchingPipeline>(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs));
         }),
         py::arg("models_path"),
         py::arg("tokenizer"),
         py::arg("scheduler_config"),
-        py::arg("device"),
-        py::arg("properties") = ov::AnyMap({}))
+        py::arg("device"))

         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)

tests/python_tests/common.py (+1 −1)

@@ -306,7 +306,7 @@ def run_continuous_batching(
     if type(generation_configs) is not list:
         generation_configs = [generation_configs] * len(prompts)

-    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU')
+    cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties())
     output = cb_pipe.generate(prompts, generation_configs)

     del cb_pipe

tests/python_tests/ov_genai_test_utils.py (+6 −6)

@@ -14,7 +14,7 @@
 import json

 import openvino_genai as ov_genai
-
+from common import get_default_properties

 def get_models_list():
     precommit_models = [
@@ -92,7 +92,7 @@ def read_model(params, **tokenizer_kwargs):

     if (models_path / "openvino_model.xml").exists():
         opt_model = OVModelForCausalLM.from_pretrained(models_path, trust_remote_code=True,
-                                                       compile=False, device='CPU')
+                                                       compile=False, device='CPU', ov_config=get_default_properties())
     else:
         ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(hf_tokenizer,
                                                                              with_detokenizer=True,
@@ -104,7 +104,7 @@ def read_model(params, **tokenizer_kwargs):
         hf_tokenizer.save_pretrained(models_path)

         opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True,
-                                                       compile=False, device='CPU', load_in_8bit=False)
+                                                       compile=False, device='CPU', load_in_8bit=False, ov_config=get_default_properties())
         opt_model.generation_config.save_pretrained(models_path)
         opt_model.config.save_pretrained(models_path)
         opt_model.save_pretrained(models_path)
@@ -114,7 +114,7 @@ def read_model(params, **tokenizer_kwargs):
         models_path,
         hf_tokenizer,
         opt_model,
-        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False),
+        ov_genai.LLMPipeline(models_path, 'CPU', ENABLE_MMAP=False, **get_default_properties()),
     )


@@ -178,7 +178,7 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):
     with (temp_path / config_name).open('w') as f:
         json.dump(config_json, f)

-    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU')
+    ov_pipe = ov_genai.LLMPipeline(temp_path, 'CPU', **get_default_properties())

     for _, config_name in configs:
         os.remove(temp_path / config_name)
@@ -188,4 +188,4 @@ def load_genai_pipe_with_configs(configs: List[Tuple], temp_path):

 @functools.lru_cache(1)
 def get_continuous_batching(path):
-    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig())
+    return ov_genai.LLMPipeline(path, 'CPU', scheduler_config=ov_genai.SchedulerConfig(), **get_default_properties())

tests/python_tests/requirements.txt (+1 −1)

@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 diffusers==0.32.1
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631
 numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64"
 onnx==1.17.0
 pytest

tests/python_tests/test_continuous_batching.py (+2 −2)

@@ -9,7 +9,7 @@
 from pathlib import Path
 from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer

-from common import get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
+from common import get_default_properties, get_hugging_face_models, convert_models, generate_and_compare_with_reference_text, \
     get_scheduler_config, get_greedy, run_cb_pipeline_with_ref, get_beam_search, get_greedy, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
@@ -155,7 +155,7 @@ def test_post_oom_health(tmp_path, sampling_config):
     models_path : Path = tmp_path / model_id
     convert_models(opt_model, hf_tokenizer, models_path)

-    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU")
+    cb_pipe = ContinuousBatchingPipeline(models_path, Tokenizer(models_path), scheduler_config, "CPU", **get_default_properties())

     # First run should return incomplete response
     output = cb_pipe.generate(["What is OpenVINO?"], [generation_config])

tests/python_tests/test_kv_cache_eviction.py (+4 −4)

@@ -15,7 +15,7 @@
 from openvino import serialize
 from transformers import AutoTokenizer

-from common import TESTS_ROOT, run_cb_pipeline_with_ref
+from common import TESTS_ROOT, run_cb_pipeline_with_ref, get_default_properties


 def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]:
@@ -42,7 +42,7 @@ class ConvertedModel:
 @pytest.fixture(scope='module')
 def converted_model(tmp_path_factory):
     model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False)
+    model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, load_in_8bit=False, compile=False, ov_config=get_default_properties())
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id
     model.save_pretrained(models_path)
@@ -112,8 +112,8 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     scheduler_config_opt.enable_prefix_caching = enable_prefix_caching

     models_path = converted_model.models_path
-    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU")
-    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU")
+    model_cb_noopt = ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", {}, get_default_properties())
+    model_cb_opt = ContinuousBatchingPipeline(models_path, scheduler_config_opt, "CPU", {}, get_default_properties())

     tokenizer = converted_model.tokenizer
tests/python_tests/test_vlm_pipeline.py (+2 −2)

@@ -7,7 +7,7 @@
 import transformers
 from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline, GenerationConfig
-from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters
+from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters, get_default_properties

 def get_ov_model(cache):
     model_dir = cache.mkdir("tiny-random-minicpmv-2_6")
@@ -19,7 +19,7 @@ def get_ov_model(cache):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True, ov_config=get_default_properties())
     processor.save_pretrained(model_dir)
     model.save_pretrained(model_dir)
     return model_dir
