Added WWB tests (openvinotoolkit#880)

AlexKoff88 · web-flow · commit 20238a5b28c5 · 2024-09-19T19:00:19.000+04:00
diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
@@ -5,6 +5,7 @@ name: llm_bench Python Test
 
 env:
   LLM_BENCH_PYPATH: llm_bench/python
+  WWB_PATH: llm_bench/python/who_what_benchmark
 
 on:
   push:
@@ -40,6 +41,8 @@ jobs:
         python -m pip install flake8 pytest black
         GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt
         pip install openvino-nightly
+        GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt
+        GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }}
 
     - name: Lint with flake8
       run: |
@@ -68,6 +71,9 @@ jobs:
       run: |
         python ./llm_bench/python/convert.py --model_id segmind/tiny-sd --output_dir ./ov_models/tiny-sd --precision FP16
         python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1
+    - name: WWB Tests
+      run: |
+        python -m pytest  ./llm_bench/python/who_what_benchmark/tests
   stateful:
     runs-on: ubuntu-20.04
     steps:
@@ -82,3 +88,9 @@ jobs:
           python -m pip install openvino-nightly
           python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . --stateful
           grep beam_idx pytorch/dldt/FP32/openvino_model.xml
+      - name: WWB Tests
+        run: |
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install -r llm_bench/python/who_what_benchmark/requirements.txt
+          GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/
+          pip install pytest
+          python -m pytest  llm_bench/python/who_what_benchmark/tests
diff --git a/bandit.yml b/bandit.yml
@@ -131,7 +131,7 @@ any_other_function_with_shell_equals_true:
   - subprocess.check_output
   - subprocess.run
 assert_used:
-  skips: []
+  skips: ["llm_bench/python/who_what_benchmark/tests/test_*.py"]
 hardcoded_tmp_directory:
   tmp_dirs:
   - /tmp
diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli.py b/llm_bench/python/who_what_benchmark/tests/test_cli.py
@@ -0,0 +1,129 @@
+import subprocess  # nosec B404
+import os
+import shutil
+import tempfile
+import pandas as pd
+import pytest
+import logging
+
+from transformers import AutoTokenizer
+from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig
+
+
+logging.basicConfig(level=logging.INFO)  # configure root logging at INFO level
+logger = logging.getLogger(__name__)  # module-level logger used by the helpers below
+
+
+def run_wwb(args):
+    """Run ``wwb`` with ``args``, log the command and result, return the result."""
+    # ``args`` must be a list: it is concatenated onto ["wwb"].
+    command = ["wwb"] + args
+    logger.info(" ".join(command))
+    # Capture stdout/stderr as text so tests can assert on the CLI output.
+    completed = subprocess.run(command, capture_output=True, text=True)
+    logger.info(completed)
+    return completed
+
+
+model_id = "facebook/opt-125m"  # model identifier loaded by setup_module()
+tmp_dir = tempfile.mkdtemp()  # scratch directory; deleted in teardown_module()
+base_model_path = os.path.join(tmp_dir, "opt125m")  # model saved without a quantization config
+target_model_path = os.path.join(tmp_dir, "opt125m_int8")  # model saved with bits=8 quantization
+
+
+def setup_module():
+    """Save the baseline model and its 8-bit weight-quantized variant with a tokenizer."""
+    logger.info("Create models")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    exports = (
+        (base_model_path, {}),
+        (target_model_path, {"quantization_config": OVWeightQuantizationConfig(bits=8)}),
+    )
+    for path, kwargs in exports:
+        model = OVModelForCausalLM.from_pretrained(model_id, **kwargs)
+        model.save_pretrained(path)
+        tokenizer.save_pretrained(path)
+
+
+def teardown_module():
+    logger.info("Remove models")
+    shutil.rmtree(tmp_dir)  # removes the scratch directory holding both saved models
+
+
+def test_target_model():
+    """Without ``--verbose``, metrics are printed but no "## Reference text" section."""
+    cli_args = ["--base-model", base_model_path, "--target-model", target_model_path]
+    cli_args += ["--num-samples", "2", "--device", "CPU"]
+    proc = run_wwb(cli_args)
+    output = proc.stdout
+
+    assert proc.returncode == 0
+    assert "Metrics for model" in output
+    assert "## Reference text" not in output
+
+
+def test_gt_data():
+    """Generate ground truth from a dataset and check that two samples are saved."""
+    # Deliberately not a fixture: pytest never collects a fixture as a test.
+    # The temporary directory supplies a CSV path that does not exist yet and
+    # is removed even when an assertion fails.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        gt_path = os.path.join(temp_dir, "gt.csv")
+        result = run_wwb([
+            "--base-model", base_model_path,
+            "--gt-data", gt_path,
+            "--dataset", "EleutherAI/lambada_openai,en",
+            "--dataset-field", "text",
+            "--split", "test",
+            "--num-samples", "2",
+            "--device", "CPU"
+        ])
+        # Check the exit code first so a CLI failure is not masked by read_csv;
+        # run_wwb only returns after wwb exits, so no extra wait is needed.
+        assert result.returncode == 0
+        data = pd.read_csv(gt_path)
+        assert len(data["questions"].values) == 2
+
+
+def test_output_directory():
+    """With ``--output``, both metrics CSV files appear in the given directory."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        cli_args = ["--base-model", base_model_path, "--target-model", target_model_path]
+        cli_args += ["--num-samples", "2", "--device", "CPU", "--output", temp_dir]
+        proc = run_wwb(cli_args)
+
+        assert proc.returncode == 0
+        assert "Metrics for model" in proc.stdout
+        # File names are matched literally, including the "qustion" spelling.
+        expected_files = ("metrics_per_qustion.csv", "metrics.csv")
+        for file_name in expected_files:
+            assert os.path.exists(os.path.join(temp_dir, file_name))
+
+
+def test_verbose():
+    """With ``--verbose``, stdout includes the "## Reference text" section."""
+    cli_args = ["--base-model", base_model_path, "--target-model", target_model_path]
+    cli_args += ["--num-samples", "2", "--device", "CPU"]
+    cli_args.append("--verbose")
+    proc = run_wwb(cli_args)
+    output = proc.stdout
+
+    assert proc.returncode == 0
+    assert "## Reference text" in output
+
+
+def test_language_autodetect():
+    """The first question generated for a Qwen base model contains the Chinese "马克"."""
+    # The temporary directory supplies a CSV path that does not exist yet and
+    # is removed even when an assertion fails.
+    with tempfile.TemporaryDirectory() as temp_dir:
+        gt_path = os.path.join(temp_dir, "gt.csv")
+        result = run_wwb([
+            "--base-model", "Qwen/Qwen2-0.5B",
+            "--gt-data", gt_path,
+            "--num-samples", "2",
+            "--device", "CPU"
+        ])
+        # Check the exit code first so a CLI failure is not masked by read_csv.
+        assert result.returncode == 0
+        assert "马克" in pd.read_csv(gt_path)["questions"].values[0]