Support hallucination score, deepeval part

WenjingKangIntel · WenjingKangIntel · commit 9dd08842033f · 2025-03-07T12:55:45.000+08:00
Signed-off-by: Kang Wenjing <wenjing.kang@intel.com>
diff --git a/demos/virtual_ai_assistant_demo/test_vaa_hallucination.md b/demos/virtual_ai_assistant_demo/test_vaa_hallucination.md
@@ -0,0 +1,30 @@
+# Use deepeval to compute hallucination score
+## Prerequisite libraries
+1. [deepeval](https://github.com/confident-ai/deepeval)
+2. [Ollama](https://github.com/ollama/ollama/blob/main/README.md) 
+
+## How to set up
+1. Install deepeval:
+    ```
+    pip install -U deepeval
+    ```
+2. Install Ollama:
+    Follow the installation instructions in the [Ollama README](https://github.com/ollama/ollama/blob/main/README.md#ollama).
+
+3. Run Ollama, taking `deepseek-r1` as an example:
+    ```
+    ollama run deepseek-r1
+    ```
+4. Set deepeval to use Ollama for evaluation:
+    ```
+    deepeval set-ollama deepseek-r1
+    ```
+
+## How to run the test
+```
+python test_vaa_hallucination.py --personality /path/to/agribot_personality.yaml
+```
+
+## More to read
+[deepeval hallucination](https://docs.confident-ai.com/docs/metrics-hallucination)
+
diff --git a/demos/virtual_ai_assistant_demo/test_vaa_hallucination.py b/demos/virtual_ai_assistant_demo/test_vaa_hallucination.py
@@ -0,0 +1,139 @@
+import argparse
+import logging as log
+import os
+
+from typing import Set
+from pathlib import Path
+from tqdm import tqdm
+
+import openvino as ov
+import yaml
+
+from datasets import load_dataset
+from urllib.request import getproxies
+from deepeval.metrics import HallucinationMetric
+from deepeval.test_case import LLMTestCase
+from llama_index.core.chat_engine import SimpleChatEngine
+from llama_index.core.memory import ChatMemoryBuffer
+from llama_index.llms.openvino import OpenVINOLLM
+from transformers import AutoTokenizer
+
+proxies = getproxies()
+os.environ["http_proxy"]  = proxies["http"]
+os.environ["https_proxy"] = proxies["https"]
+os.environ["no_proxy"]    = "localhost, 127.0.0.1/8, ::1"
+from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
+
+
+# Maps the file name (basename) of a personality YAML file to the dataset name
+# that is passed to `load_dataset` when evaluating that personality.
+DATASET_MAPPING = {
+    "agribot_personality.yaml": "KisanVaani/agriculture-qa-english-only"
+}
+# Local directory under which the chat model and its tokenizer are saved.
+MODEL_DIR = Path("model")
+
+
+def get_available_devices() -> Set[str]:
+    core = ov.Core()
+    return {device.split(".")[0] for device in core.available_devices}
+
+
+def compute_deepeval_hallucination(inputs, outputs, contexts) -> float:
+    avg_score = 0.
+    for input, output, context in zip(inputs, outputs, contexts):
+        test_case = LLMTestCase(
+            input=input,
+            actual_output=output,
+            context=context
+        )
+        metric = HallucinationMetric(threshold=0.5)
+        metric.measure(test_case)
+        score = metric.score
+        # reason = metric.reason
+        avg_score += score / len(inputs)
+    return avg_score
+
+
+# this is necessary for thinking models e.g. deepseek
+def emphasize_thinking_mode(token: str) -> str:
+    return token + "<em><small>" if "<think>" in token else "</small></em>" + token if "</think>" in token else token
+
+
+def extract_personality_path(path):
+    return os.path.basename(path)
+
+
+def get_dataset_name(personality_file_path):
+    dataset_name = DATASET_MAPPING.get(extract_personality_path(personality_file_path), "")
+    assert dataset_name != ""
+    return dataset_name
+
+
+def load_chat_model(model_name: str, token: str = None) -> OpenVINOLLM:
+    model_path = MODEL_DIR / model_name    
+
+    # tokenizers are disabled anyway, this allows to avoid warning
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    if token is not None:
+        os.environ["HUGGING_FACE_HUB_TOKEN"] = token
+
+    ov_config = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": ""}
+    # load llama model and its tokenizer
+    if not model_path.exists():
+        log.info(f"Downloading {model_name}... It may take up to 1h depending on your Internet connection and model size.")     
+        
+        chat_tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
+        chat_tokenizer.save_pretrained(model_path)
+
+        # openvino models are used as is
+        is_openvino_model = model_name.split("/")[0] == "OpenVINO"
+        if is_openvino_model:
+            chat_model = OVModelForCausalLM.from_pretrained(model_name, export=False, compile=False, token=token)
+            chat_model.save_pretrained(model_path)
+        else:
+            log.info(f"Loading and quantizing {model_name} to INT4...")
+            log.info(f"Quantizing {model_name} to INT4... It may take significant amount of time depending on your machine power.")
+            quant_config = OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, quant_method="awq", group_size=128, dataset="wikitext2")
+            chat_model = OVModelForCausalLM.from_pretrained(model_name, export=True, compile=False, quantization_config=quant_config,
+                                                            token=token, trust_remote_code=True, library_name="transformers")
+            chat_model.save_pretrained(model_path)
+
+    device = "GPU" if "GPU" in get_available_devices() else "CPU"
+    return OpenVINOLLM(context_window=4096, model_id_or_path=str(model_path), max_new_tokens=1024, device_map=device,
+                       model_kwargs={"ov_config": ov_config, "library_name": "transformers"}, generate_kwargs={"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95})
+
+
+def run_test_deepeval(chat_model_name, personality_file_path, auth_token):
+    dataset_name = get_dataset_name(personality_file_path)
+    log.info("Loading dataset")
+    dataset = load_dataset(dataset_name)['train']
+    log.info("Dataset loading is finished")
+    inputs = dataset['question']
+    # We use question as context because the dataset lacks context
+    contexts = dataset['question']
+    contexts_res = [[context] for context in contexts]
+
+    with open(personality_file_path, "rb") as f:
+        chatbot_config = yaml.safe_load(f)
+
+    ov_llm = load_chat_model(chat_model_name, auth_token)
+    ov_chat_engine = SimpleChatEngine.from_defaults(llm=ov_llm, system_prompt=chatbot_config["system_configuration"],
+                                                memory=ChatMemoryBuffer.from_defaults())
+    outputs = []
+    for input in tqdm(inputs[:2]):
+        output = ov_chat_engine.chat(input).response
+        outputs.append(output)
+
+    final_score = compute_deepeval_hallucination(inputs[:2], outputs[:2], contexts_res[:2])
+    print(f"final_score is {final_score}")
+
+
+if __name__ == "__main__":
+    # set up logging
+    log.getLogger().setLevel(log.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--chat_model", type=str, default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", help="Path/name of the chat model")
+    parser.add_argument("--personality", type=str, default="healthcare_personality.yaml", help="Path to the YAML file with chatbot personality")
+    parser.add_argument("--hf_token", type=str, help="HuggingFace access token to get Llama3")
+
+    args = parser.parse_args()
+    run_test_deepeval(args.chat_model, Path(args.personality), args.hf_token)