From 17d811d7140c69937722033a9d90147fb8543904 Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Mon, 17 Feb 2025 12:45:49 +0000 Subject: [PATCH 01/11] initial commit --- tools/llm_weight_compression/README.md | 29 ++ .../config_optimum_cli.json | 23 + tools/llm_weight_compression/requirements.txt | 1 + tools/llm_weight_compression/run.py | 405 ++++++++++++++++++ 4 files changed, 458 insertions(+) create mode 100644 tools/llm_weight_compression/README.md create mode 100644 tools/llm_weight_compression/config_optimum_cli.json create mode 100644 tools/llm_weight_compression/requirements.txt create mode 100644 tools/llm_weight_compression/run.py diff --git a/tools/llm_weight_compression/README.md b/tools/llm_weight_compression/README.md new file mode 100644 index 00000000000..c2d38caadcf --- /dev/null +++ b/tools/llm_weight_compression/README.md @@ -0,0 +1,29 @@ +# LLM Weight Compression Tool + +## Install + +```bash +python3.10 -m venv env +. env/bin/activate +pip install --upgrade pip + +pip install openvino==2025.0.0 +pip install nncf==2.15.0 +pip install "git+https://github.com/huggingface/optimum.git@v1.24.0" +pip install git+https://github.com/huggingface/optimum-intel.git@v1.22.0 + +# #whowhatbench +git clone --depth 1 --branch 2025.0.0.0 https://github.com/openvinotoolkit/openvino.genai.git + +cd openvino.genai/tools/who_what_benchmark +pip install . +``` + +```bash +# For test +python run.py \ +--model-id facebook/opt-125m \ +--config config_optimum_cli.json \ +--root-dir experiment_dir \ +--dump-packages +``` \ No newline at end of file diff --git a/tools/llm_weight_compression/config_optimum_cli.json b/tools/llm_weight_compression/config_optimum_cli.json new file mode 100644 index 00000000000..1441af5a366 --- /dev/null +++ b/tools/llm_weight_compression/config_optimum_cli.json @@ -0,0 +1,23 @@ +{ + "compression": { + "backend": "optimum_cli", + "params": [ + { + "task": ["text-generation"], + "weight_format": ["int4"], + "ratio": [0.2, 0.4], + "group_size": [64, 128], + "awq": [false, true], + "dataset": ["auto"] + } + ] + }, + "evaluation": { + "backend": "lm_eval", + "params": { + "tasks": ["wikitext"], + "device": "cpu", + "limit": 3 + } + } +} diff --git a/tools/llm_weight_compression/requirements.txt b/tools/llm_weight_compression/requirements.txt new file mode 100644 index 00000000000..464090415c4 --- /dev/null +++ b/tools/llm_weight_compression/requirements.txt @@ -0,0 +1 @@ +# TODO diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py new file mode 100644 index 00000000000..6f404543e18 --- /dev/null +++ b/tools/llm_weight_compression/run.py @@ -0,0 +1,405 @@ +# Copyright (c) 2025 Intel Corporation +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import itertools +import json +import shutil +import subprocess +from dataclasses import asdict +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Optional + +from optimum.intel import OVModelForCausalLM +from tabulate import tabulate +from transformers import AutoTokenizer + + +class CompressBackendType(Enum): + OPTIMUM_CLI = "optimum_cli" + NNCF = "nncf" + + +def export_base_model(model_id: str, base_model_dir: Path) -> None: + """ + Exports a base openvino model into the following folder structure + + {ROOT_DIR} + |-- {encoded model ID} + |-- fp32 + |-- openvino_model.xml + |-- openvino_model.bin + |-- ... + + :param model_id: A model ID of a model hosted on the [Hub](https://huggingface.co/models). + :param base_model_dir: A directory where the model should be saved. + """ + model = OVModelForCausalLM.from_pretrained( + model_id=model_id, export=True, load_in_8bit=False, load_in_4bit=False, compile=False, trust_remote_code=True + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + model.save_pretrained(base_model_dir) + tokenizer.save_pretrained(base_model_dir) + + +def dump_all_packages(output_file: str) -> None: + """ + Generates a list of all installed Python packages and save it to a file. + + :param output_file: The path to the file where the package list + should be saved. + """ + with open(output_file, "w") as f: + subprocess.run(["pip", "freeze"], stdout=f) + + +def load_json(path: str): + with open(path, encoding="utf8") as f: + return json.load(f) + + +def save_json(data, path: str, indent: int = 4): + with open(path, "w", encoding="utf8") as outfile: + json.dump(data, outfile, indent=indent) + + +# ------------------------------ Params Grid ------------------------------ + + +class Params: + """ """ + + def get_key(self) -> str: + """ """ + raise NotImplementedError + + def save_to_json(self, path: str) -> None: + """ + :param path: + """ + raise NotImplementedError + + +@dataclass +class OptimumCLIParams(Params): + # -------------------------------------- # + task: Optional[str] = None + trust_remote_code: Optional[bool] = True + weight_format: Optional[str] = "fp32" + # -------------------------------------- # + ratio: Optional[float] = None + sym: bool = False + group_size: Optional[int] = None + backup_precision: Optional[str] = None + dataset: Optional[str] = None + all_layers: bool = False + # -------------------------------------- # + awq: bool = False + scale_estimation: bool = False + gptq: bool = False + lora_correction: bool = False + + def get_key(self) -> str: + # Skipped: task, trust_remote_code + key_items = [] + key_items.append(f"{self.weight_format}") + if self.sym: + key_items.append("sym") + if self.ratio is not None: + key_items.append(f"r{self.ratio}") + if self.group_size is not None: + key_items.append(f"gs{self.group_size}") + if self.backup_precision is not None: + key_items.append(f"{self.backup_precision}") + if self.dataset: + key_items.append(f"{self.dataset}") + + for field_name in ["all_layers", "awq", "scale_estimation", "gptq", "lora_correction"]: + if getattr(self, field_name): + key_items.append(field_name) + + return "_".join(key_items) + + def save_to_json(self, path: str) -> None: + data = asdict(self) + save_json(data, path) + + +@dataclass +class NNCFAPIParams(Params): + pass + + +def optimum_cli_create_params_grid(compression_params: List[Dict[str, List[Any]]]) -> List[OptimumCLIParams]: + """ """ + params_grid 
= [] + for p in compression_params: + params_grid.extend(get_all_param_combinations(p, OptimumCLIParams)) + return params_grid + + +def nncf_create_params_grid(compression_params: List[Dict[str, List[Any]]]) -> List[NNCFAPIParams]: + raise NotImplementedError + + +def visualize_experiments(model_id: str, params_grid: List[Params]): + """ + :param model_id: + :param params_grid: + """ + rows = [[model_id, params.get_key()] for params in params_grid] + print("List of configurations to test out:") + print(tabulate(tabular_data=rows, headers=["Model ID", "Experiment"], tablefmt="mixed_grid")) + + +# ------------------------------ Params Grid ------------------------------ + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-id", type=str, required=True, help="A model ID of a model hosted on the [Hub](https://huggingface.co/models)") + parser.add_argument("--config", type=str, required=True) + parser.add_argument("--root-dir", type=str, required=True) + parser.add_argument("--show-only", action="store_true") + parser.add_argument("--dump-packages", action="store_true") + return parser.parse_args() + + +def encode_model_id(model_id): + """ + :param model_id: + """ + # return name.replace("/", "_").replace(".", "_") + if "/" in model_id: + model_id = "/".join(model_id.split("/")[1:]) + return model_id.replace("/", "_").replace(".", "_") + + +def run_command(command: str) -> None: + print(f"Run command: {command}") + subprocess.run(command, check=True, shell=True) + + +def run_optimum_cli( + model_id: Path, + output_dir: Path, + params: OptimumCLIParams, + log_filename: Optional[str] = None, +) -> None: + """ + :param model_id: + :param output_dir: + :param params: + :param log_filename: + """ + cmd_line = "optimum-cli" + cmd_line += " export openvino" + cmd_line += f" --model {model_id}" + if params.task: + cmd_line += f" --task {params.task}" + if params.trust_remote_code: + cmd_line += " --trust-remote-code" + if params.weight_format: + cmd_line += f" --weight-format {params.weight_format}" + if params.ratio: + cmd_line += f" --ratio {params.ratio}" + if params.sym: + cmd_line += " --sym" + if params.group_size: + cmd_line += f" --group-size {params.group_size}" + if params.backup_precision: + cmd_line += f" --backup-precision {params.backup_precision}" + if params.dataset: + cmd_line += f" --dataset {params.dataset}" + if params.all_layers: + cmd_line += " --all-layers" + if params.awq: + cmd_line += " --awq" + if params.scale_estimation: + cmd_line += " --scale-estimation" + if params.gptq: + cmd_line += " --gptq" + if params.lora_correction: + cmd_line += " --lora-correction" + + # output argument + cmd_line += f" {output_dir.as_posix()}" + + if log_filename: + optimum_cli_log = output_dir.joinpath("optimum_cli_log.txt") + cmd_line += f" 2>&1 | tee -a {optimum_cli_log.as_posix()}" + + return run_command(cmd_line) + + +def run_nncf( + model_id: Path, + output_dir: Path, + params: NNCFAPIParams, + log_filename: Optional[str] = None, +) -> None: + """ + :param model_id: + :param output_dir: + :param params: + :param log_filename: + """ + raise NotImplementedError + + +def get_all_param_combinations(experiment: Dict[str, List[Any]], cls) -> List[Params]: + keys = experiment.keys() + values = experiment.values() + combinations = [cls(**dict(zip(keys, combination))) for combination in itertools.product(*values)] + return combinations + + +class EvaluateBackendType(Enum): + LM_EVAL = "lm_eval" + + +def run_lm_eval_cli(model_dir: Path, evaluation_params: 
Dict[str, Any]): + """ + :param model_dir: + :param evaluation_params: + """ + cmd_line = "lm_eval" + cmd_line += f" --model openvino" + + tasks_arg = ",".join(evaluation_params["tasks"]) + cmd_line += f" --tasks {tasks_arg}" + + cmd_line += f" --model_args pretrained={model_dir.as_posix()}" + + num_fewshot = evaluation_params.get("num_fewshot") + if num_fewshot: + cmd_line += f" --num_fewshot {num_fewshot}" + + batch_size = evaluation_params.get("batch_size") + if batch_size: + cmd_line += f" --batch_size {batch_size}" + + device = evaluation_params.get("device") + if device: + cmd_line += f" --device {device}" + + cmd_line += f" --output_path {model_dir.as_posix()}" + + limit = evaluation_params.get("limit") + if limit: + cmd_line += f" --limit {limit}" + + cmd_line += f" --trust_remote_code" + + return run_command(cmd_line) + + +def evaluate(model_dir: Path, evaluation_config: Dict[str, Any]): + """ + """ + backend = EvaluateBackendType(evaluation_config["backend"]) + evaluation_params = evaluation_config["params"] + + print(f"Run evaluation ({backend.name}): {model_dir.as_posix()}") + + if backend == EvaluateBackendType.LM_EVAL: + run_lm_eval_cli(model_dir, evaluation_params) + else: + raise NotImplementedError + + +def compress(model_id: str, + root_model_dir: Path, + compression_config: Dict[str, Any], + show_only: bool = False) -> None: + """ + :param model_id: + :param root_model_dir: + :param compression_config: + """ + backend = CompressBackendType(compression_config["backend"]) + compression_params = compression_config["params"] + + if backend == CompressBackendType.OPTIMUM_CLI: + grid = optimum_cli_create_params_grid(compression_params) + elif backend == CompressBackendType.NNCF: + grid = nncf_create_params_grid(compression_params) + + visualize_experiments(model_id, grid) + + if show_only: + return + + for params in grid: + EXPERIMENT_DIR = root_model_dir / params.get_key() + if EXPERIMENT_DIR.exists(): + shutil.rmtree(EXPERIMENT_DIR) + EXPERIMENT_DIR.mkdir(exist_ok=True, parents=True) + + print(f"Applying configuration: {params.get_key()}") + + if backend == CompressBackendType.OPTIMUM_CLI: + params_filename = "optimum_cli_params.json" + run_optimum_cli(model_id, EXPERIMENT_DIR, params) + elif backend == CompressBackendType.NNCF: + params_filename = "nncf_params.json" + run_nncf(model_id, EXPERIMENT_DIR, params) + + # --------- Save params --------- + print(f"Saving compression parameters: {EXPERIMENT_DIR / params_filename}") + params.save_to_json(EXPERIMENT_DIR / params_filename) + + +def main(): + args = parse_args() + + ROOT_DIR = Path(args.root_dir) + ROOT_MODEL_DIR = ROOT_DIR / encode_model_id(args.model_id) + + # --------- Export base model --------- + BASE_MODEL_DIR = ROOT_MODEL_DIR / "fp32" + if BASE_MODEL_DIR.exists(): + shutil.rmtree(BASE_MODEL_DIR) + BASE_MODEL_DIR.mkdir(exist_ok=True, parents=True) + print(f"Saving a base model: {BASE_MODEL_DIR}") + export_base_model(args.model_id, BASE_MODEL_DIR) + + config = load_json(args.config) + + # --------- Compress --------- + compression_config = config["compression"] + compress(args.model_id, ROOT_MODEL_DIR, compression_config) + + if args.show_only: + return + + # --------- Evaluate --------- + evaluation_config = config["evaluation"] + for model_dir in ROOT_MODEL_DIR.iterdir(): + if not model_dir.is_dir(): + continue + + try: + evaluate(model_dir, evaluation_config) + except Exception as e: + print(e) + + # --------- Save extra info --------- + if args.dump_packages: + dump_all_packages(ROOT_MODEL_DIR / 
"versions.txt") + + +if __name__ == "__main__": + main() From a007a437b08cc34c04fa07c2c52116ca524a3e20 Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Tue, 18 Feb 2025 11:24:33 +0000 Subject: [PATCH 02/11] add lm_eval version --- tools/llm_weight_compression/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/llm_weight_compression/README.md b/tools/llm_weight_compression/README.md index c2d38caadcf..1294e740092 100644 --- a/tools/llm_weight_compression/README.md +++ b/tools/llm_weight_compression/README.md @@ -10,7 +10,8 @@ pip install --upgrade pip pip install openvino==2025.0.0 pip install nncf==2.15.0 pip install "git+https://github.com/huggingface/optimum.git@v1.24.0" -pip install git+https://github.com/huggingface/optimum-intel.git@v1.22.0 +pip install "git+https://github.com/huggingface/optimum-intel.git@v1.22.0" +pip install "git+https://github.com/EleutherAI/lm-evaluation-harness@v0.4.2" # #whowhatbench git clone --depth 1 --branch 2025.0.0.0 https://github.com/openvinotoolkit/openvino.genai.git From bd3a0014bb94db0814326c51798e4a78c4f95ff6 Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Tue, 18 Feb 2025 11:41:01 +0000 Subject: [PATCH 03/11] minor improvements --- tools/llm_weight_compression/run.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index 6f404543e18..117fb7947af 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -37,9 +37,10 @@ def export_base_model(model_id: str, base_model_dir: Path) -> None: {ROOT_DIR} |-- {encoded model ID} |-- fp32 - |-- openvino_model.xml - |-- openvino_model.bin - |-- ... + |-- model + |-- openvino_model.xml + |-- openvino_model.bin + |-- ... :param model_id: A model ID of a model hosted on the [Hub](https://huggingface.co/models). :param base_model_dir: A directory where the model should be saved. 
@@ -50,8 +51,8 @@ def export_base_model(model_id: str, base_model_dir: Path) -> None: tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - model.save_pretrained(base_model_dir) - tokenizer.save_pretrained(base_model_dir) + model.save_pretrained(base_model_dir.joinpath("model")) + tokenizer.save_pretrained(base_model_dir.joinpath("model")) def dump_all_packages(output_file: str) -> None: @@ -235,7 +236,7 @@ def run_optimum_cli( cmd_line += " --lora-correction" # output argument - cmd_line += f" {output_dir.as_posix()}" + cmd_line += f" {output_dir.joinpath('model').as_posix()}" if log_filename: optimum_cli_log = output_dir.joinpath("optimum_cli_log.txt") @@ -281,7 +282,7 @@ def run_lm_eval_cli(model_dir: Path, evaluation_params: Dict[str, Any]): tasks_arg = ",".join(evaluation_params["tasks"]) cmd_line += f" --tasks {tasks_arg}" - cmd_line += f" --model_args pretrained={model_dir.as_posix()}" + cmd_line += f" --model_args pretrained={model_dir.joinpath('model').as_posix()}" num_fewshot = evaluation_params.get("num_fewshot") if num_fewshot: @@ -295,7 +296,7 @@ def run_lm_eval_cli(model_dir: Path, evaluation_params: Dict[str, Any]): if device: cmd_line += f" --device {device}" - cmd_line += f" --output_path {model_dir.as_posix()}" + cmd_line += f" --output_path {model_dir.joinpath('lm_eval_results.json').as_posix()}" limit = evaluation_params.get("limit") if limit: From 6ec8952995a593bf4384073646f183d6d84cb631 Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Tue, 18 Feb 2025 12:59:18 +0000 Subject: [PATCH 04/11] add who_what_benchmark --- ...m_cli.json => config_optimum_lm_eval.json} | 0 .../config_optimum_wwb.json | 23 ++++++++++++++ tools/llm_weight_compression/run.py | 30 ++++++++++++++++--- 3 files changed, 49 insertions(+), 4 deletions(-) rename tools/llm_weight_compression/{config_optimum_cli.json => config_optimum_lm_eval.json} (100%) create mode 100644 tools/llm_weight_compression/config_optimum_wwb.json diff --git a/tools/llm_weight_compression/config_optimum_cli.json b/tools/llm_weight_compression/config_optimum_lm_eval.json similarity index 100% rename from tools/llm_weight_compression/config_optimum_cli.json rename to tools/llm_weight_compression/config_optimum_lm_eval.json diff --git a/tools/llm_weight_compression/config_optimum_wwb.json b/tools/llm_weight_compression/config_optimum_wwb.json new file mode 100644 index 00000000000..e78fc602f9b --- /dev/null +++ b/tools/llm_weight_compression/config_optimum_wwb.json @@ -0,0 +1,23 @@ +{ + "compression": { + "backend": "optimum_cli", + "params": [ + { + "task": ["text-generation"], + "weight_format": ["int4"], + "ratio": [0.2], + "group_size": [64], + "awq": [false], + "dataset": ["auto"] + } + ] + }, + "evaluation": { + "backend": "who_what_benchmark", + "params": { + "model_type": "text", + "device": "CPU", + "language": "en" + } + } +} diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index 117fb7947af..c6baa37cbb1 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -269,6 +269,7 @@ def get_all_param_combinations(experiment: Dict[str, List[Any]], cls) -> List[Pa class EvaluateBackendType(Enum): LM_EVAL = "lm_eval" + WHO_WHAT_BENCHMARK = "who_what_benchmark" def run_lm_eval_cli(model_dir: Path, evaluation_params: Dict[str, Any]): @@ -307,7 +308,27 @@ def run_lm_eval_cli(model_dir: Path, evaluation_params: Dict[str, Any]): return run_command(cmd_line) -def evaluate(model_dir: Path, evaluation_config: Dict[str, Any]): +def 
run_who_what_benchmark_cli(model_dir: Path, base_model_dir: Path, evaluation_params: Dict[str, Any]): + if model_dir.resolve() == base_model_dir.resolve(): + return + + language = evaluation_params['language'] + gt_data_filename = f"gt_{language}.csv" + + cmd_line = "wwb" + cmd_line += f" --base-model {base_model_dir.joinpath('model')}" + cmd_line += f" --target-model {model_dir.joinpath('model')}" + cmd_line += f" --gt-data {base_model_dir.joinpath(gt_data_filename)}" + cmd_line += f" --model-type {evaluation_params['model_type']}" + cmd_line += f" --device {evaluation_params['device']}" + cmd_line += f" --language {language}" + # cmd_line += " --hf" + cmd_line += f" --output {model_dir.as_posix()}" + + return run_command(cmd_line) + + +def evaluate(model_dir: Path, base_model_dir: Path, evaluation_config: Dict[str, Any]): """ """ backend = EvaluateBackendType(evaluation_config["backend"]) @@ -317,8 +338,9 @@ def evaluate(model_dir: Path, evaluation_config: Dict[str, Any]): if backend == EvaluateBackendType.LM_EVAL: run_lm_eval_cli(model_dir, evaluation_params) - else: - raise NotImplementedError + + if backend == EvaluateBackendType.WHO_WHAT_BENCHMARK: + run_who_what_benchmark_cli(model_dir, base_model_dir, evaluation_params) def compress(model_id: str, @@ -393,7 +415,7 @@ def main(): continue try: - evaluate(model_dir, evaluation_config) + evaluate(model_dir, BASE_MODEL_DIR, evaluation_config) except Exception as e: print(e) From 478aef01c964542498498c0f6d2655ba2afe68df Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Tue, 18 Feb 2025 13:48:23 +0000 Subject: [PATCH 05/11] minor --- tools/llm_weight_compression/run.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index c6baa37cbb1..27169eb545f 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -272,7 +272,7 @@ class EvaluateBackendType(Enum): WHO_WHAT_BENCHMARK = "who_what_benchmark" -def run_lm_eval_cli(model_dir: Path, evaluation_params: Dict[str, Any]): +def run_lm_eval(model_dir: Path, evaluation_params: Dict[str, Any]): """ :param model_dir: :param evaluation_params: @@ -308,7 +308,7 @@ def run_lm_eval_cli(model_dir: Path, evaluation_params: Dict[str, Any]): return run_command(cmd_line) -def run_who_what_benchmark_cli(model_dir: Path, base_model_dir: Path, evaluation_params: Dict[str, Any]): +def run_who_what_benchmark(model_dir: Path, base_model_dir: Path, evaluation_params: Dict[str, Any]): if model_dir.resolve() == base_model_dir.resolve(): return @@ -337,10 +337,10 @@ def evaluate(model_dir: Path, base_model_dir: Path, evaluation_config: Dict[str, print(f"Run evaluation ({backend.name}): {model_dir.as_posix()}") if backend == EvaluateBackendType.LM_EVAL: - run_lm_eval_cli(model_dir, evaluation_params) + run_lm_eval(model_dir, evaluation_params) if backend == EvaluateBackendType.WHO_WHAT_BENCHMARK: - run_who_what_benchmark_cli(model_dir, base_model_dir, evaluation_params) + run_who_what_benchmark(model_dir, base_model_dir, evaluation_params) def compress(model_id: str, From 6adf764ef8277124dd2ab189e5f5889df375b93f Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Tue, 18 Feb 2025 16:34:36 +0000 Subject: [PATCH 06/11] parse results --- tools/llm_weight_compression/run.py | 77 ++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index 27169eb545f..94f834846dd 100644 --- 
a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -25,6 +25,10 @@ from transformers import AutoTokenizer +LM_EVAL_RESULTS_FILENAME = "lm_eval_results.json" +OPTIMUM_CLI_PARAMS_FILENAME = "optimum_cli_params.json" + + class CompressBackendType(Enum): OPTIMUM_CLI = "optimum_cli" NNCF = "nncf" @@ -297,7 +301,7 @@ def run_lm_eval(model_dir: Path, evaluation_params: Dict[str, Any]): if device: cmd_line += f" --device {device}" - cmd_line += f" --output_path {model_dir.joinpath('lm_eval_results.json').as_posix()}" + cmd_line += f" --output_path {model_dir.joinpath(LM_EVAL_RESULTS_FILENAME).as_posix()}" limit = evaluation_params.get("limit") if limit: @@ -374,7 +378,7 @@ def compress(model_id: str, print(f"Applying configuration: {params.get_key()}") if backend == CompressBackendType.OPTIMUM_CLI: - params_filename = "optimum_cli_params.json" + params_filename = OPTIMUM_CLI_PARAMS_FILENAME run_optimum_cli(model_id, EXPERIMENT_DIR, params) elif backend == CompressBackendType.NNCF: params_filename = "nncf_params.json" @@ -385,6 +389,72 @@ def compress(model_id: str, params.save_to_json(EXPERIMENT_DIR / params_filename) +class ResultsParser: + + @staticmethod + def parse_lm_eval(path: Path): + + METRICS = [ + "acc", + "ppl", + "word_perplexity", + "exact_match,strict-match", + "perplexity", + "similarity", + "fdt_norm", + ] + METRICS.extend([metric + ",none" for metric in METRICS]) + + data = load_json(path) + limit = data.get("config", {}).get("limit", None) + results_section = data.get("results") + + results = [] + for task, task_results in results_section.items(): + res = {} + for metric, value in task_results.items(): + res["task"] = task + + if metric in METRICS: + metric = metric.replace(",none", "") + res[metric] = value + res["limit"] = limit + results.append(res) + + return results + + @staticmethod + def parse_optimum_params(path: Path, fields: List[str]): + data = load_json(path) + return {field_name: data[field_name] for field_name in fields} + + @staticmethod + def parse(root_model_dir: Path): + c = {} # configuration_key -> {/* data */} + + for model_dir in root_model_dir.iterdir(): + if not model_dir.is_dir(): + continue + + configuration_key = model_dir.name + + c[configuration_key] = {} + c[configuration_key]["model"] = root_model_dir.name + c[configuration_key]["configuration"] = configuration_key + + # Parse the `lm_eval_results.json` file + path = model_dir.joinpath(LM_EVAL_RESULTS_FILENAME) + if path.exists(): + c[configuration_key]["lm_eval"] = ResultsParser.parse_lm_eval(path) + + # Parse the `optimum_cli_params.json` file + path = model_dir.joinpath(OPTIMUM_CLI_PARAMS_FILENAME) + if path.exists(): + c[configuration_key]["optimum_params"] = ResultsParser.parse_optimum_params(path, ["weight_format", "ratio", "group_size"]) # TODO + + # print(json.dumps(c, indent=4)) + + def main(): args = parse_args() @@ -423,6 +493,9 @@ def main(): if args.dump_packages: dump_all_packages(ROOT_MODEL_DIR / "versions.txt") + # --------- Parse results --------- + ResultsParser.parse(ROOT_MODEL_DIR) + if __name__ == "__main__": main() From a0b0dce413ffe253b669fa95f44e6375f195a7b4 Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Tue, 18 Feb 2025 16:37:38 +0000 Subject: [PATCH 07/11] fix style --- tools/llm_weight_compression/run.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index 94f834846dd..4943d62088d 100644 --- 
a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -24,7 +24,6 @@ from tabulate import tabulate from transformers import AutoTokenizer - LM_EVAL_RESULTS_FILENAME = "lm_eval_results.json" OPTIMUM_CLI_PARAMS_FILENAME = "optimum_cli_params.json" @@ -174,7 +173,12 @@ def visualize_experiments(model_id: str, params_grid: List[Params]): def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--model-id", type=str, required=True, help="A model ID of a model hosted on the [Hub](https://huggingface.co/models)") + parser.add_argument( + "--model-id", + type=str, + required=True, + help="A model ID of a model hosted on the [Hub](https://huggingface.co/models)", + ) parser.add_argument("--config", type=str, required=True) parser.add_argument("--root-dir", type=str, required=True) parser.add_argument("--show-only", action="store_true") @@ -282,7 +286,7 @@ def run_lm_eval(model_dir: Path, evaluation_params: Dict[str, Any]): :param evaluation_params: """ cmd_line = "lm_eval" - cmd_line += f" --model openvino" + cmd_line += " --model openvino" tasks_arg = ",".join(evaluation_params["tasks"]) cmd_line += f" --tasks {tasks_arg}" @@ -307,7 +311,7 @@ def run_lm_eval(model_dir: Path, evaluation_params: Dict[str, Any]): if limit: cmd_line += f" --limit {limit}" - cmd_line += f" --trust_remote_code" + cmd_line += " --trust_remote_code" return run_command(cmd_line) @@ -316,7 +320,7 @@ def run_who_what_benchmark(model_dir: Path, base_model_dir: Path, evaluation_par if model_dir.resolve() == base_model_dir.resolve(): return - language = evaluation_params['language'] + language = evaluation_params["language"] gt_data_filename = f"gt_{language}.csv" cmd_line = "wwb" @@ -333,8 +337,7 @@ def run_who_what_benchmark(model_dir: Path, base_model_dir: Path, evaluation_par def evaluate(model_dir: Path, base_model_dir: Path, evaluation_config: Dict[str, Any]): - """ - """ + """ """ backend = EvaluateBackendType(evaluation_config["backend"]) evaluation_params = evaluation_config["params"] @@ -347,10 +350,7 @@ def evaluate(model_dir: Path, base_model_dir: Path, evaluation_config: Dict[str, run_who_what_benchmark(model_dir, base_model_dir, evaluation_params) -def compress(model_id: str, - root_model_dir: Path, - compression_config: Dict[str, Any], - show_only: bool = False) -> None: +def compress(model_id: str, root_model_dir: Path, compression_config: Dict[str, Any], show_only: bool = False) -> None: """ :param model_id: :param root_model_dir: @@ -450,7 +450,9 @@ def parse(root_model_dir: Path): # Parse the `optimum_cli_params.json` file path = model_dir.joinpath(OPTIMUM_CLI_PARAMS_FILENAME) if path.exists(): - c[configuration_key]["optimum_params"] = ResultsParser.parse_optimum_params(path, ["weight_format", "ratio", "group_size"]) # TODO + c[configuration_key]["optimum_params"] = ResultsParser.parse_optimum_params( + path, ["weight_format", "ratio", "group_size"] + ) # TODO # print(json.dumps(c, indent=4)) From caa1c91b8a4ac151c167be044cb688b196dafefc Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Wed, 19 Feb 2025 12:03:07 +0000 Subject: [PATCH 08/11] update --- tools/llm_weight_compression/README.md | 5 +- tools/llm_weight_compression/run.py | 86 ++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/tools/llm_weight_compression/README.md b/tools/llm_weight_compression/README.md index 1294e740092..cfeca63d2e3 100644 --- a/tools/llm_weight_compression/README.md +++ b/tools/llm_weight_compression/README.md @@ -12,6 +12,7 @@ pip install 
nncf==2.15.0 pip install "git+https://github.com/huggingface/optimum.git@v1.24.0" pip install "git+https://github.com/huggingface/optimum-intel.git@v1.22.0" pip install "git+https://github.com/EleutherAI/lm-evaluation-harness@v0.4.2" +pip install xlsxwriter # #whowhatbench git clone --depth 1 --branch 2025.0.0.0 https://github.com/openvinotoolkit/openvino.genai.git @@ -24,7 +25,7 @@ pip install . # For test python run.py \ --model-id facebook/opt-125m \ ---config config_optimum_cli.json \ +--config config_optimum_lm_eval.json \ --root-dir experiment_dir \ --dump-packages -``` \ No newline at end of file +``` diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index 4943d62088d..c0ed94bcc67 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -20,6 +20,8 @@ from pathlib import Path from typing import Any, Dict, List, Optional +import pandas as pd + from optimum.intel import OVModelForCausalLM from tabulate import tabulate from transformers import AutoTokenizer @@ -450,11 +452,84 @@ def parse(root_model_dir: Path): # Parse the `optimum_cli_params.json` file path = model_dir.joinpath(OPTIMUM_CLI_PARAMS_FILENAME) if path.exists(): + # TODO(andrey-churkin): Add more fields c[configuration_key]["optimum_params"] = ResultsParser.parse_optimum_params( path, ["weight_format", "ratio", "group_size"] - ) # TODO - - # print(json.dumps(c, indent=4)) + ) + + return c + + +def save_results(results: Dict[str, Dict[str, Any]], root_path: Path): +# { +# "int4_r0.2_gs64_auto": { +# "model": "opt-125m", +# "configuration": "int4_r0.2_gs64_auto", +# "lm_eval": [{...}, ...], +# "optimum_params": { +# "weight_format": "int4", +# "ratio": 0.2, +# "group_size": 64 +# } +# }, +# "fp32": { +# "model": "opt-125m", +# "configuration": "fp32", +# "lm_eval": [{...}, ...], +# }, +# ... +# } + rows: List[Dict[str, Any]] = [] + for val in results.values(): + row = { + "model": val["model"], + "configuration": val["configuration"], + } + # Add optimum params + row.update(val.get("optimum_params", {})) + + # Add lm_eval results + lm_eval = val.get("lm_eval", []) + for dct in lm_eval: + new_row = row.copy() + new_row.update(dct) + rows.append(new_row) + + pd.set_option("display.precision", 2) + df = pd.DataFrame(rows) + df.to_csv(root_path / "raw_results.csv") + + dump_to_excel(df, root_path / "results.xlsx") + + +def dump_to_excel(df, output_path: Path): + # to have all columns, not only pivot's values, but also index one. 
+ print(df.columns) + + writer = pd.ExcelWriter(output_path, engine="xlsxwriter") + df.to_excel(writer, sheet_name="all", index=False) + # (max_row, max_col) = df.shape + workbook = writer.book + worksheet = writer.sheets["all"] + + format1 = workbook.add_format({"num_format": "#,##0.00"}) + worksheet.set_column("A:X", 18, format1) + col_names = [{"header": col_name} for col_name in df.columns] + worksheet.add_table( + 0, + 0, + df.shape[0], + df.shape[1] - 1, + { + "columns": col_names, + # 'style' = option Format as table value and is case sensitive + # (look at the exact name into Excel) + "style": None, + }, + ) + worksheet.autofit() + workbook.close() + print("Path to parsed results: ", output_path.resolve()) def main(): @@ -496,7 +571,10 @@ def main(): dump_all_packages(ROOT_MODEL_DIR / "versions.txt") # --------- Parse results --------- - ResultsParser.parse(ROOT_MODEL_DIR) + results = ResultsParser.parse(ROOT_MODEL_DIR) + + # --------- Save results --------- + save_results(results, ROOT_MODEL_DIR) if __name__ == "__main__": From 59e188e020c8c5f6f45c06caa46459bc701e2086 Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Wed, 19 Feb 2025 15:22:56 +0000 Subject: [PATCH 09/11] update --- tools/llm_weight_compression/run.py | 36 ++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index c0ed94bcc67..dcab07411dd 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -28,6 +28,7 @@ LM_EVAL_RESULTS_FILENAME = "lm_eval_results.json" OPTIMUM_CLI_PARAMS_FILENAME = "optimum_cli_params.json" +WWB_METRICS_FILENAME = "metrics.csv" class CompressBackendType(Enum): @@ -394,7 +395,7 @@ def compress(model_id: str, root_model_dir: Path, compression_config: Dict[str, class ResultsParser: @staticmethod - def parse_lm_eval(path: Path): + def parse_lm_eval_metrics(path: Path): METRICS = [ "acc", @@ -425,6 +426,14 @@ def parse_lm_eval(path: Path): return results + @staticmethod + def parse_who_what_benchmark_metrics(path: Path): + # TODO(andrey-churkin): Clarify possible field names + df = pd.read_csv(path) + return { + "similarity": float(df['similarity'][0]), + } + @staticmethod def parse_optimum_params(path: Path, fields: List[str]): data = load_json(path) @@ -447,7 +456,13 @@ def parse(root_model_dir: Path): # Parse the `lm_eval_results.json` file path = model_dir.joinpath(LM_EVAL_RESULTS_FILENAME) if path.exists(): - c[configuration_key]["lm_eval"] = ResultsParser.parse_lm_eval(path) + c[configuration_key]["lm_eval"] = ResultsParser.parse_lm_eval_metrics(path) + + # Parse the WWB metrics file + path = model_dir.joinpath(WWB_METRICS_FILENAME) + if path.exists(): + # TODO(andrey-churkin): Find the format specification for the `metrics.csv` file + c[configuration_key]["who_what_benchmark"] = ResultsParser.parse_who_what_benchmark_metrics(path) # Parse the `optimum_cli_params.json` file path = model_dir.joinpath(OPTIMUM_CLI_PARAMS_FILENAME) @@ -485,15 +500,24 @@ def save_results(results: Dict[str, Dict[str, Any]], root_path: Path): "model": val["model"], "configuration": val["configuration"], } + # Add optimum params row.update(val.get("optimum_params", {})) + # Add who_what_benchmark results + row.update(val.get("who_what_benchmark", {})) # Add lm_eval results lm_eval = val.get("lm_eval", []) - for dct in lm_eval: - new_row = row.copy() - new_row.update(dct) - rows.append(new_row) + if lm_eval: + new_rows = [] + for dct in lm_eval: + new_row = row.copy() 
+ new_row.update(dct) + new_rows.append(new_row) + else: + new_rows = [row] + + rows.extend(new_rows) pd.set_option("display.precision", 2) df = pd.DataFrame(rows) From b5f72d5d9dc3fdc2fa0de6e77b1194ec97ecfe2c Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Wed, 19 Feb 2025 16:43:09 +0000 Subject: [PATCH 10/11] minor update --- tools/llm_weight_compression/run.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index dcab07411dd..24899a93e6a 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -428,11 +428,14 @@ def parse_lm_eval_metrics(path: Path): @staticmethod def parse_who_what_benchmark_metrics(path: Path): - # TODO(andrey-churkin): Clarify possible field names df = pd.read_csv(path) - return { - "similarity": float(df['similarity'][0]), - } + + val = {} + for name in df: + if name in ["similarity", "FDT", "FDT norm", "SDT", "SDT norm"]: + val[name] = float(df[name][0]) + + return val @staticmethod def parse_optimum_params(path: Path, fields: List[str]): From 3e4cfca6942df39b929e9f825c348e79569e1a3e Mon Sep 17 00:00:00 2001 From: Andrey Churkin Date: Thu, 20 Feb 2025 14:25:03 +0000 Subject: [PATCH 11/11] minor improvements --- tools/llm_weight_compression/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/llm_weight_compression/run.py b/tools/llm_weight_compression/run.py index 24899a93e6a..5ddc6d6e42a 100644 --- a/tools/llm_weight_compression/run.py +++ b/tools/llm_weight_compression/run.py @@ -167,7 +167,7 @@ def visualize_experiments(model_id: str, params_grid: List[Params]): :param params_grid: """ rows = [[model_id, params.get_key()] for params in params_grid] - print("List of configurations to test out:") + print(f"List of configurations to test out ({len(params_grid)}):") print(tabulate(tabular_data=rows, headers=["Model ID", "Experiment"], tablefmt="mixed_grid")) @@ -577,7 +577,7 @@ def main(): # --------- Compress --------- compression_config = config["compression"] - compress(args.model_id, ROOT_MODEL_DIR, compression_config) + compress(args.model_id, ROOT_MODEL_DIR, compression_config, show_only=args.show_only) if args.show_only: return
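
A note on the parameter grid: each entry in `compression.params` is expanded into the Cartesian product of its value lists, and every resulting combination becomes one experiment directory named by `OptimumCLIParams.get_key()`. A minimal self-contained sketch of that expansion, mirroring `get_all_param_combinations()` with the value lists copied from the `config_optimum_lm_eval.json` in this series (the config literal is inlined here rather than loaded from disk):

```python
import itertools

# Value lists copied from config_optimum_lm_eval.json in this series.
spec = {
    "task": ["text-generation"],
    "weight_format": ["int4"],
    "ratio": [0.2, 0.4],
    "group_size": [64, 128],
    "awq": [False, True],
    "dataset": ["auto"],
}

# Cartesian product over the value lists, as in get_all_param_combinations().
keys = list(spec.keys())
combos = [dict(zip(keys, values)) for values in itertools.product(*spec.values())]

# 1 task x 1 format x 2 ratios x 2 group sizes x 2 awq flags x 1 dataset
# = 8 experiments for this config.
print(len(combos))  # 8

# get_key()-style directory names, e.g. "int4_r0.2_gs64_auto" and
# "int4_r0.4_gs128_auto_awq" (the dataset token precedes the awq flag).
for c in combos:
    parts = [c["weight_format"], f"r{c['ratio']}", f"gs{c['group_size']}", c["dataset"]]
    if c["awq"]:
        parts.append("awq")
    print("_".join(parts))
```

Together with the `fp32` base export, the test command from the README therefore produces nine subdirectories under `{root-dir}/{encoded model ID}`, each holding a `model/` folder plus the dumped parameter and metric files.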
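
Similarly, the flattening done by `save_results()` can be previewed in isolation. The sketch below follows the dictionary shape documented in the `save_results` comment block (one collected record per configuration key); the metric values are made-up placeholders for illustration, not measured results:

```python
import pandas as pd

# Shape mirrors ResultsParser.parse(): configuration key -> collected data.
results = {
    "int4_r0.2_gs64_auto": {
        "model": "opt-125m",
        "configuration": "int4_r0.2_gs64_auto",
        "optimum_params": {"weight_format": "int4", "ratio": 0.2, "group_size": 64},
        "lm_eval": [{"task": "wikitext", "word_perplexity": 43.2, "limit": 3}],  # placeholder numbers
    },
    "fp32": {
        "model": "opt-125m",
        "configuration": "fp32",
        "lm_eval": [{"task": "wikitext", "word_perplexity": 41.8, "limit": 3}],  # placeholder numbers
    },
}

# Flatten as in save_results(): one output row per (configuration, lm_eval record),
# with optimum and who_what_benchmark fields merged into every row.
rows = []
for val in results.values():
    base = {"model": val["model"], "configuration": val["configuration"]}
    base.update(val.get("optimum_params", {}))
    base.update(val.get("who_what_benchmark", {}))
    for record in val.get("lm_eval", [{}]):
        row = base.copy()
        row.update(record)
        rows.append(row)

# Columns are the union of all row keys; missing values become NaN,
# matching what pd.DataFrame(rows) produces in save_results().
print(pd.DataFrame(rows))
```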