Skip to content

Commit 89aba29

Browse files
authored
[TESTS] WhoWhatBench version update (#3241)
### Changes

Update whowhatbench version to the latest from GenAI

### Reason for changes

To use the latest features from whowhatbench
1 parent 9d5da2d commit 89aba29

File tree

8 files changed

+63
-65
lines changed

8 files changed

+63
-65
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
datasets
2-
whowhatbench @ git+https://github.com/andreyanufr/who_what_benchmark.git
3-
numpy>=1.23.5
2+
whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai#subdirectory=tools/who_what_benchmark
3+
numpy>=1.23.5,<2
44
openvino==2025.0
5-
optimum-intel[openvino]>=1.13.0
5+
optimum-intel>=1.13.0
66
transformers>=4.35.2
77
onnx==1.17.0
8-
numpy<2

examples/llm_compression/openvino/tiny_llama_synthetic_data/README.md

+1-5
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,12 @@
11
# Compress TinyLLama model using synthetic data
22

3-
This example demonstrates how to optimize Large Language Models (LLMs) using NNCF weight compression API & synthetic data for the advanced algorithms usage. The example applies 4/8-bit mixed-precision quantization & Scale Estimation algorithm to weights of Linear (Fully-connected) layers of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model.
4-
To evaluate the accuracy of the compressed model we measure similarity between two texts generated by the baseline and compressed models using [WhoWhatBench](https://github.com/openvinotoolkit/openvino.genai/tree/master/tools/who_what_benchmark) library.
3+
This example demonstrates how to optimize Large Language Models (LLMs) using NNCF weight compression API & synthetic data for the advanced algorithms usage. The example applies 4/8-bit mixed-precision quantization & Scale Estimation algorithm to weights of Linear (Fully-connected) layers of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) model. This leads to a significant decrease in model footprint and performance improvement with OpenVINO.
54

65
The example includes the following steps:
76

8-
- Prepare `wikitext` dataset.
97
- Prepare `TinyLlama/TinyLlama-1.1B-Chat-v1.0` text-generation model in OpenVINO representation using [Optimum-Intel](https://huggingface.co/docs/optimum/intel/inference).
10-
- Compress weights of the model with NNCF Weight compression algorithm with Scale Estimation & `wikitext` dataset.
118
- Prepare `synthetic` dataset using `nncf.data.generate_text_data` method.
129
- Compress weights of the model with NNCF Weight compression algorithm with Scale Estimation & `synthetic` dataset.
13-
- Measure the similarity of the two models optimized with different datasets.
1410

1511
## Install requirements
1612

examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py

-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def main():
7777
scale_estimation=True,
7878
)
7979

80-
# Verify the model output in comparison to floating-point one
8180
input_ids = tokenizer("What is Python? ", return_tensors="pt").to(device=hf_model.device)
8281
max_new_tokens = 100
8382

Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
torch==2.5.1
22
datasets==3.0.1
3-
numpy>=1.23.5
3+
numpy>=1.23.5,<2
44
openvino==2025.0
5-
optimum-intel[openvino]>=1.13.0
5+
optimum-intel>=1.13.0
66
transformers>=4.35.2
77
onnx==1.17.0
8-
numpy<2

tests/post_training/data/ptq_reference_data.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ hf/hf-internal-testing/tiny-random-GPTNeoXForCausalLM_statefull_backend_OPTIMUM:
1818
metric_value: null
1919
hf/hf-internal-testing/tiny-random-GPTNeoXForCausalLM_stateless_backend_OPTIMUM:
2020
metric_value: null
21+
xfail_reason: "Issue-161969"
2122
hf/hf-internal-testing/tiny-random-gpt2_backend_FP32:
2223
metric_value: null
2324
hf/hf-internal-testing/tiny-random-gpt2_backend_OPTIMUM:

tests/post_training/data/wwb_ref_answers/tinyllama__tinyllama-1.1b-step-50k-105b/ref_qa.csv

+28-28
Large diffs are not rendered by default.

tests/post_training/requirements.txt

+5-5
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ pytest-split
1010

1111
librosa==0.10.0
1212
memory-profiler==0.61.0
13-
optimum-intel==1.15.2
14-
optimum==1.17.1
13+
optimum-intel==1.21.0
14+
optimum==1.23.3
1515
scikit-learn>=1.2.2,<=1.5.0
1616
soundfile==0.12.1
1717
tensorboard==2.13.0
1818
tensorflow-io==0.32.0
1919
timm==0.9.2
20-
transformers==4.38.2
21-
whowhatbench @ git+https://github.com/andreyanufr/who_what_benchmark@456d3584ce628f6c8605f37cd9a3ab2db1ebf933
22-
datasets==2.21.0
20+
transformers==4.46.3
21+
whowhatbench @ git+https://github.com/openvinotoolkit/openvino.genai.git@2025.0.0.0#subdirectory=tools/who_what_benchmark
22+
datasets==3.1.0

tests/post_training/test_quantize_conformance.py

+23-19
Original file line numberDiff line numberDiff line change
@@ -331,25 +331,29 @@ def test_ptq_quantization(
331331
err_msg = "Unknown exception"
332332
traceback.print_exc()
333333

334-
if pipeline is not None:
335-
pipeline.cleanup_cache()
336-
run_info = pipeline.run_info
337-
if err_msg:
338-
run_info.status = f"{run_info.status} | {err_msg}" if run_info.status else err_msg
339-
340-
captured = capsys.readouterr()
341-
write_logs(captured, pipeline)
342-
343-
if extra_columns:
344-
pipeline.collect_data_from_stdout(captured.out)
345-
else:
346-
run_info = create_short_run_info(test_model_param, err_msg, test_case_name)
347-
348-
run_info.time_total = time.perf_counter() - start_time
349-
ptq_result_data[test_case_name] = run_info
350-
351-
if err_msg:
352-
pytest.fail(err_msg)
334+
finally:
335+
if pipeline is not None:
336+
pipeline.cleanup_cache()
337+
run_info = pipeline.run_info
338+
if err_msg:
339+
run_info.status = f"{run_info.status} | {err_msg}" if run_info.status else err_msg
340+
341+
captured = capsys.readouterr()
342+
write_logs(captured, pipeline)
343+
344+
if extra_columns:
345+
pipeline.collect_data_from_stdout(captured.out)
346+
else:
347+
run_info = create_short_run_info(test_model_param, err_msg, test_case_name)
348+
349+
run_info.time_total = time.perf_counter() - start_time
350+
ptq_result_data[test_case_name] = run_info
351+
if "xfail_reason" in ptq_reference_data[test_case_name]:
352+
xfail_msg = f"XFAIL: {ptq_reference_data[test_case_name]['xfail_reason']} - {run_info.status}"
353+
run_info.status = xfail_msg
354+
pytest.xfail(xfail_msg)
355+
elif err_msg:
356+
pytest.fail(err_msg)
353357

354358

355359
@pytest.mark.parametrize("test_case_name", WC_TEST_CASES.keys())

0 commit comments

Comments (0)