Skip to content

Commit 89aba29

Browse files
authored
[TESTS] WhoWhatBench version update (#3241)
### Changes

Update whowhatbench version to the latest from GenAI

### Reason for changes

To use the latest features from whowhatbench
1 parent 9d5da2d commit 89aba29

File tree

8 files changed

+63
-65
lines changed

8 files changed

+63
-65
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
datasets
2-
whowhatbench @ git+https://github.com/andreyanufr/who_what_benchmark.git
3-
numpy>=1.23.5
2+
whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai#subdirectory=tools/who_what_benchmark
3+
numpy>=1.23.5,<2
44
openvino==2025.0
5-
optimum-intel[openvino]>=1.13.0
5+
optimum-intel>=1.13.0
66
transformers>=4.35.2
77
onnx==1.17.0
8-
numpy<2

examples/llm_compression/openvino/tiny_llama_synthetic_data/README.md

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
# Compress TinyLLama model using synthetic data
22

3-
This example demonstrates how to optimize Large Language Models (LLMs) using NNCF weight compression API & synthetic data for the advanced algorithms usage. The example applies 4/8-bit mixed-precision quantization & Scale Estimation algorithm to weights of Linear (Fully-connected) layers of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model.
4-
To evaluate the accuracy of the compressed model we measure similarity between two texts generated by the baseline and compressed models using [WhoWhatBench](https://github.com/openvinotoolkit/openvino.genai/tree/master/tools/who_what_benchmark) library.
3+
This example demonstrates how to optimize Large Language Models (LLMs) using NNCF weight compression API & synthetic data for the advanced algorithms usage. The example applies 4/8-bit mixed-precision quantization & Scale Estimation algorithm to weights of Linear (Fully-connected) layers of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model. This leads to a significant decrease in model footprint and performance improvement with OpenVINO.
54

65
The example includes the following steps:
76

8-
- Prepare `wikitext` dataset.
97
- Prepare `TinyLlama/TinyLlama-1.1B-Chat-v1.0` text-generation model in OpenVINO representation using [Optimum-Intel](https://huggingface.co/docs/optimum/intel/inference).
10-
- Compress weights of the model with NNCF Weight compression algorithm with Scale Estimation & `wikitext` dataset.
118
- Prepare `synthetic` dataset using `nncf.data.generate_text_data` method.
129
- Compress weights of the model with NNCF Weight compression algorithm with Scale Estimation & `synthetic` dataset.
13-
- Measure the similarity of the two models optimized with different datasets.
1410

1511
## Install requirements
1612

examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py

-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def main():
7777
scale_estimation=True,
7878
)
7979

80-
# Verify the model output in comparison to floating-point one
8180
input_ids = tokenizer("What is Python? ", return_tensors="pt").to(device=hf_model.device)
8281
max_new_tokens = 100
8382

Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
torch==2.5.1
22
datasets==3.0.1
3-
numpy>=1.23.5
3+
numpy>=1.23.5,<2
44
openvino==2025.0
5-
optimum-intel[openvino]>=1.13.0
5+
optimum-intel>=1.13.0
66
transformers>=4.35.2
77
onnx==1.17.0
8-
numpy<2

tests/post_training/data/ptq_reference_data.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ hf/hf-internal-testing/tiny-random-GPTNeoXForCausalLM_statefull_backend_OPTIMUM:
1818
metric_value: null
1919
hf/hf-internal-testing/tiny-random-GPTNeoXForCausalLM_stateless_backend_OPTIMUM:
2020
metric_value: null
21+
xfail_reason: "Issue-161969"
2122
hf/hf-internal-testing/tiny-random-gpt2_backend_FP32:
2223
metric_value: null
2324
hf/hf-internal-testing/tiny-random-gpt2_backend_OPTIMUM:

tests/post_training/data/wwb_ref_answers/tinyllama__tinyllama-1.1b-step-50k-105b/ref_qa.csv

+28-28
Large diffs are not rendered by default.

tests/post_training/requirements.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ pytest-split
1010

1111
librosa==0.10.0
1212
memory-profiler==0.61.0
13-
optimum-intel==1.15.2
14-
optimum==1.17.1
13+
optimum-intel==1.21.0
14+
optimum==1.23.3
1515
scikit-learn>=1.2.2,<=1.5.0
1616
soundfile==0.12.1
1717
tensorboard==2.13.0
1818
tensorflow-io==0.32.0
1919
timm==0.9.2
20-
transformers==4.38.2
21-
whowhatbench @ git+https://github.com/andreyanufr/who_what_benchmark@456d3584ce628f6c8605f37cd9a3ab2db1ebf933
22-
datasets==2.21.0
20+
transformers==4.46.3
21+
whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai.git@2025.0.0.0#subdirectory=tools/who_what_benchmark
22+
datasets==3.1.0

tests/post_training/test_quantize_conformance.py

+23-19
Original file line numberDiff line numberDiff line change
@@ -331,25 +331,29 @@ def test_ptq_quantization(
331331
err_msg = "Unknown exception"
332332
traceback.print_exc()
333333

334-
if pipeline is not None:
335-
pipeline.cleanup_cache()
336-
run_info = pipeline.run_info
337-
if err_msg:
338-
run_info.status = f"{run_info.status} | {err_msg}" if run_info.status else err_msg
339-
340-
captured = capsys.readouterr()
341-
write_logs(captured, pipeline)
342-
343-
if extra_columns:
344-
pipeline.collect_data_from_stdout(captured.out)
345-
else:
346-
run_info = create_short_run_info(test_model_param, err_msg, test_case_name)
347-
348-
run_info.time_total = time.perf_counter() - start_time
349-
ptq_result_data[test_case_name] = run_info
350-
351-
if err_msg:
352-
pytest.fail(err_msg)
334+
finally:
335+
if pipeline is not None:
336+
pipeline.cleanup_cache()
337+
run_info = pipeline.run_info
338+
if err_msg:
339+
run_info.status = f"{run_info.status} | {err_msg}" if run_info.status else err_msg
340+
341+
captured = capsys.readouterr()
342+
write_logs(captured, pipeline)
343+
344+
if extra_columns:
345+
pipeline.collect_data_from_stdout(captured.out)
346+
else:
347+
run_info = create_short_run_info(test_model_param, err_msg, test_case_name)
348+
349+
run_info.time_total = time.perf_counter() - start_time
350+
ptq_result_data[test_case_name] = run_info
351+
if "xfail_reason" in ptq_reference_data[test_case_name]:
352+
xfail_msg = f"XFAIL: {ptq_reference_data[test_case_name]['xfail_reason']} - {run_info.status}"
353+
run_info.status = xfail_msg
354+
pytest.xfail(xfail_msg)
355+
elif err_msg:
356+
pytest.fail(err_msg)
353357

354358

355359
@pytest.mark.parametrize("test_case_name", WC_TEST_CASES.keys())

0 commit comments

Comments (0)