Skip to content

Commit 9dd0884

Browse files
Support hallucination score, deepeval part
Signed-off-by: Kang Wenjing <wenjing.kang@intel.com>
1 parent 9cfe8f6 commit 9dd0884

File tree

2 files changed

+169
-0
lines changed

2 files changed

+169
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Use deepeval to compute the hallucination score
2+
## Prerequisite libraries
3+
1. [deepeval](https://github.com/confident-ai/deepeval)
4+
2. [Ollama](https://github.com/ollama/ollama/blob/main/README.md)
5+
6+
## How to set up
7+
1. Install deepeval:
8+
```
9+
pip install -U deepeval
10+
```
11+
2. Install Ollama:
12+
Follow the installation instructions in the [Ollama README](https://github.com/ollama/ollama/blob/main/README.md#ollama).
13+
14+
3. Run Ollama, taking `deepseek-r1` as an example:
15+
```
16+
ollama run deepseek-r1
17+
```
18+
4. Set deepeval to use Ollama for evaluation:
19+
```
20+
deepeval set-ollama deepseek-r1
21+
```
22+
23+
## How to run the test
24+
```
25+
python test_vaa_deepeval.py --personality /path/to/agribot_personality.yaml
26+
```
27+
28+
## More to read
29+
[deepeval hallucination](https://docs.confident-ai.com/docs/metrics-hallucination)
30+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
import argparse
2+
import logging as log
3+
import os
4+
5+
from typing import Set
6+
from pathlib import Path
7+
from tqdm import tqdm
8+
9+
import openvino as ov
10+
import yaml
11+
12+
from datasets import load_dataset
13+
from urllib.request import getproxies
14+
from deepeval.metrics import HallucinationMetric
15+
from deepeval.test_case import LLMTestCase
16+
from llama_index.core.chat_engine import SimpleChatEngine
17+
from llama_index.core.memory import ChatMemoryBuffer
18+
from llama_index.llms.openvino import OpenVINOLLM
19+
from transformers import AutoTokenizer
20+
21+
# Mirror the detected proxy settings into the lower-case environment variables.
# A proxy is exported only when one is actually configured: indexing the
# mapping unconditionally raises KeyError on machines without an HTTP/HTTPS
# proxy and aborts the script at import time.
proxies = getproxies()
if "http" in proxies:
    os.environ["http_proxy"] = proxies["http"]
if "https" in proxies:
    os.environ["https_proxy"] = proxies["https"]
os.environ["no_proxy"] = "localhost, 127.0.0.1/8, ::1"
25+
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
26+
27+
28+
# Directory, relative to the working directory, under which models are looked up and saved
MODEL_DIR = Path("model")

# Personality file name -> name of the dataset used to evaluate that personality
DATASET_MAPPING = {
    "agribot_personality.yaml": "KisanVaani/agriculture-qa-english-only",
}
32+
33+
34+
def get_available_devices() -> Set[str]:
    """Return the distinct base names of the devices OpenVINO reports.

    Each reported device name is cut at its first "." so that, for example,
    a name such as "GPU.0" is returned as "GPU".
    """
    base_names = set()
    for device_name in ov.Core().available_devices:
        base_name, _, _ = device_name.partition(".")
        base_names.add(base_name)
    return base_names
37+
38+
39+
def compute_deepeval_hallucination(inputs, outputs, contexts, threshold: float = 0.5) -> float:
    """Return the mean deepeval hallucination score over the given samples.

    Args:
        inputs: Prompts that were sent to the chat model.
        outputs: Model responses, aligned with ``inputs``.
        contexts: Per-sample context handed to ``LLMTestCase(context=...)``,
            aligned with ``inputs``.
        threshold: Threshold passed to ``HallucinationMetric``.

    Returns:
        The arithmetic mean of ``metric.score`` over every scored sample, or
        0.0 when there is nothing to score.
    """
    scores = []
    for prompt, answer, context in zip(inputs, outputs, contexts):
        test_case = LLMTestCase(
            input=prompt,
            actual_output=answer,
            context=context,
        )
        metric = HallucinationMetric(threshold=threshold)
        metric.measure(test_case)
        scores.append(metric.score)
    # zip() stops at the shortest sequence, so average over the samples that
    # were actually scored instead of dividing by len(inputs).
    return sum(scores) / len(scores) if scores else 0.
53+
54+
55+
# Needed for thinking models, e.g. DeepSeek
def emphasize_thinking_mode(token: str) -> str:
    """Attach small-italic HTML markup to the reasoning delimiters in *token*.

    A token containing "<think>" gets "<em><small>" appended. Otherwise a
    token containing "</think>" gets "</small></em>" prepended. The opening
    delimiter is checked first; any other token is returned unchanged.
    """
    if "<think>" in token:
        return token + "<em><small>"
    if "</think>" in token:
        return "</small></em>" + token
    return token
58+
59+
60+
def extract_personality_path(path):
    """Return the final component of *path* (the file name without directories)."""
    _directory, file_name = os.path.split(path)
    return file_name
62+
63+
64+
def get_dataset_name(personality_file_path):
    """Return the evaluation dataset registered for a personality file.

    Args:
        personality_file_path: Path to the personality YAML file; only its
            base name is used for the lookup in ``DATASET_MAPPING``.

    Returns:
        The dataset name mapped to that file name.

    Raises:
        ValueError: If no dataset name is registered for the file.
    """
    personality_name = extract_personality_path(personality_file_path)
    dataset_name = DATASET_MAPPING.get(personality_name, "")
    # Raise explicitly instead of asserting: asserts are stripped under
    # ``python -O`` and tell the user nothing about what is supported.
    if not dataset_name:
        supported = ", ".join(sorted(DATASET_MAPPING))
        raise ValueError(
            f"No evaluation dataset is registered for '{personality_name}'. "
            f"Supported personality files: {supported}"
        )
    return dataset_name
68+
69+
70+
def load_chat_model(model_name: str, token: str = None) -> OpenVINOLLM:
    """Build an ``OpenVINOLLM`` for *model_name*, preparing a local copy if absent.

    When ``MODEL_DIR / model_name`` does not exist, the tokenizer and the model
    are fetched and saved there. Models whose name starts with the "OpenVINO"
    segment are saved as downloaded; any other model is exported with an INT4
    weight-quantization config first.

    Args:
        model_name: Model identifier; also used as the sub-path under ``MODEL_DIR``.
        token: Optional access token. When given it is also exported as the
            ``HUGGING_FACE_HUB_TOKEN`` environment variable.

    Returns:
        An ``OpenVINOLLM`` that loads from the local model directory, placed on
        "GPU" when ``get_available_devices()`` lists it and on "CPU" otherwise.
    """
    local_dir = MODEL_DIR / model_name

    # tokenizers are disabled anyway, this allows to avoid warning
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    if token is not None:
        os.environ["HUGGING_FACE_HUB_TOKEN"] = token

    if not local_dir.exists():
        log.info(f"Downloading {model_name}... It may take up to 1h depending on your Internet connection and model size.")

        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        tokenizer.save_pretrained(local_dir)

        # models published under the "OpenVINO" name are used as is
        publisher = model_name.partition("/")[0]
        if publisher == "OpenVINO":
            ov_model = OVModelForCausalLM.from_pretrained(model_name, export=False, compile=False, token=token)
        else:
            log.info(f"Loading and quantizing {model_name} to INT4...")
            log.info(f"Quantizing {model_name} to INT4... It may take significant amount of time depending on your machine power.")
            int4_config = OVWeightQuantizationConfig(
                bits=4,
                sym=False,
                ratio=0.8,
                quant_method="awq",
                group_size=128,
                dataset="wikitext2",
            )
            ov_model = OVModelForCausalLM.from_pretrained(
                model_name,
                export=True,
                compile=False,
                quantization_config=int4_config,
                token=token,
                trust_remote_code=True,
                library_name="transformers",
            )
        # both branches persist the model next to the tokenizer
        ov_model.save_pretrained(local_dir)

    if "GPU" in get_available_devices():
        target_device = "GPU"
    else:
        target_device = "CPU"

    runtime_config = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": ""}
    sampling_options = {"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95}
    return OpenVINOLLM(
        context_window=4096,
        model_id_or_path=str(local_dir),
        max_new_tokens=1024,
        device_map=target_device,
        model_kwargs={"ov_config": runtime_config, "library_name": "transformers"},
        generate_kwargs=sampling_options,
    )
102+
103+
104+
def run_test_deepeval(chat_model_name, personality_file_path, auth_token, num_samples=2):
    """Answer dataset questions with the chat model and print the mean hallucination score.

    Args:
        chat_model_name: Model name forwarded to ``load_chat_model``.
        personality_file_path: Path to the personality YAML file. It selects
            the dataset and supplies the system prompt through its
            ``system_configuration`` key.
        auth_token: Access token forwarded to ``load_chat_model``.
        num_samples: Number of leading dataset questions to evaluate; ``None``
            evaluates all of them. Defaults to 2.
    """
    dataset_name = get_dataset_name(personality_file_path)
    log.info("Loading dataset")
    dataset = load_dataset(dataset_name)['train']
    log.info("Dataset loading is finished")

    # Slice once so generation and scoring always see the same samples.
    questions = dataset['question'][:num_samples]
    # We use question as context because the dataset lacks context
    contexts = [[question] for question in questions]

    with open(personality_file_path, "rb") as f:
        chatbot_config = yaml.safe_load(f)

    ov_llm = load_chat_model(chat_model_name, auth_token)
    ov_chat_engine = SimpleChatEngine.from_defaults(llm=ov_llm, system_prompt=chatbot_config["system_configuration"],
                                                    memory=ChatMemoryBuffer.from_defaults())

    # NOTE(review): one engine (and its memory buffer) serves every question, so
    # earlier exchanges may influence later answers - confirm this is intended.
    outputs = []
    for question in tqdm(questions):
        outputs.append(ov_chat_engine.chat(question).response)

    final_score = compute_deepeval_hallucination(questions, outputs, contexts)
    print(f"final_score is {final_score}")
127+
128+
129+
if __name__ == "__main__":
    # set up logging
    log.getLogger().setLevel(log.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("--chat_model", type=str, default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", help="Path/name of the chat model")
    # The default must be a personality that has an entry in DATASET_MAPPING;
    # "healthcare_personality.yaml" has none, so it always failed the dataset lookup.
    parser.add_argument("--personality", type=str, default="agribot_personality.yaml", help="Path to the YAML file with chatbot personality")
    parser.add_argument("--hf_token", type=str, help="HuggingFace access token for downloading gated models")

    args = parser.parse_args()
    run_test_deepeval(args.chat_model, Path(args.personality), args.hf_token)

0 commit comments

Comments
 (0)