Skip to content

Commit d016944

Browse files
Support hallucination score, deepeval part
Signed-off-by: Kang Wenjing <wenjing.kang@intel.com>
1 parent 9cfe8f6 commit d016944

File tree

2 files changed

+171
-0
lines changed

2 files changed

+171
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Use deepeval to compute hallucination score
2+
## Prerequisite libraries
3+
1. [deepeval](https://github.com/confident-ai/deepeval)
4+
2. [Ollama](https://github.com/ollama/ollama/blob/main/README.md)
5+
6+
## How to set up
7+
1. Install deepeval:
8+
```
9+
pip install -U deepeval
10+
```
11+
2. Install Ollama:
12+
Follow the installation instructions in the [Ollama README](https://github.com/ollama/ollama/blob/main/README.md#ollama).
13+
14+
3. Start Ollama with the model that will judge the answers, for example `deepseek-r1`:
15+
```
16+
ollama run deepseek-r1
17+
```
18+
4. Set deepeval to use Ollama for evaluation:
19+
```
20+
deepeval set-ollama deepseek-r1
21+
```
22+
23+
## How to run the test
24+
```
25+
python test_vaa_deepeval.py --personality /path/to/agribot_personality.yaml
26+
```
27+
28+
## More to read
29+
[deepeval hallucination](https://docs.confident-ai.com/docs/metrics-hallucination)
30+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
2+
import argparse
3+
import logging as log
4+
import os
5+
6+
from typing import Set
7+
from pathlib import Path
8+
from tqdm import tqdm
9+
10+
import openvino as ov
11+
import yaml
12+
13+
from datasets import load_dataset
14+
from urllib.request import getproxies
15+
from deepeval.metrics import HallucinationMetric
16+
from deepeval.test_case import LLMTestCase
17+
from llama_index.core.chat_engine import SimpleChatEngine
18+
from llama_index.core.memory import ChatMemoryBuffer
19+
from llama_index.llms.openvino import OpenVINOLLM
20+
from transformers import AutoTokenizer
21+
22+
# Propagate the proxies detected by urllib to the environment variables.
# getproxies() returns an empty dict when no proxy is configured, so each
# scheme is looked up defensively instead of indexing (which raised KeyError
# at import time on machines without a proxy).
proxies = getproxies()
for _scheme in ("http", "https"):
    _proxy_url = proxies.get(_scheme)
    if _proxy_url:
        os.environ[f"{_scheme}_proxy"] = _proxy_url
os.environ["no_proxy"] = "localhost, 127.0.0.1/8, ::1"
26+
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
27+
28+
29+
# Maps a personality file name (not the full path) to the Hugging Face dataset
# used to evaluate that personality.
DATASET_MAPPING = {
    "agribot_personality.yaml": "KisanVaani/agriculture-qa-english-only",
}
# Directory under which downloaded/exported chat models are stored.
MODEL_DIR = Path("model")
33+
34+
35+
def get_available_devices() -> Set[str]:
    """Return the base names of the devices reported by OpenVINO.

    Everything from the first "." onwards is dropped from each reported
    device name, and duplicates collapse because the result is a set.
    """
    device_families = set()
    for device_name in ov.Core().available_devices:
        family, _, _ = device_name.partition(".")
        device_families.add(family)
    return device_families
38+
39+
40+
def compute_deepeval_hallucination(inputs, outputs, contexts) -> float:
    """Return the mean deepeval hallucination score over the given samples.

    The three iterables are walked in lockstep; each triple becomes one
    ``LLMTestCase`` that is measured with ``HallucinationMetric(threshold=0.5)``.

    Args:
        inputs: Questions sent to the chat model.
        outputs: Answers produced by the chat model.
        contexts: Context passed to ``LLMTestCase`` for each sample.

    Returns:
        The arithmetic mean of the per-sample scores, or ``0.0`` when there is
        nothing to score.
    """
    scores = []
    for question, answer, context in zip(inputs, outputs, contexts):
        test_case = LLMTestCase(
            input=question,
            actual_output=answer,
            context=context,
        )
        # A fresh metric object is created for every sample.
        metric = HallucinationMetric(threshold=0.5)
        metric.measure(test_case)
        scores.append(metric.score)

    if not scores:
        return 0.0
    # Average over the samples actually scored: zip() stops at the shortest
    # argument, so dividing by len(inputs) would understate the mean whenever
    # the sequences differ in length.
    return sum(scores) / len(scores)
54+
55+
56+
# this is necessary for thinking models e.g. deepseek
57+
def emphasize_thinking_mode(token: str) -> str:
    """Add HTML emphasis markup around the thinking markers found in ``token``.

    A token containing ``<think>`` gets ``<em><small>`` appended; otherwise a
    token containing ``</think>`` gets ``</small></em>`` prepended. Any other
    token is returned unchanged. The opening marker takes precedence when both
    markers are present.
    """
    if "<think>" in token:
        return token + "<em><small>"
    if "</think>" in token:
        return "</small></em>" + token
    return token
59+
60+
61+
def extract_personality_path(path):
    """Return the last component (the file name) of ``path``."""
    _, file_name = os.path.split(path)
    return file_name
63+
64+
65+
def get_dataset_name(personality_file_path):
    """Return the dataset name mapped to the given personality file.

    Only the file name of ``personality_file_path`` is used for the lookup in
    ``DATASET_MAPPING``; the directory part is ignored.

    Args:
        personality_file_path: Path to the personality YAML file.

    Returns:
        The dataset name registered for that personality file.

    Raises:
        ValueError: If no dataset is registered for the file name.
    """
    personality_file = extract_personality_path(personality_file_path)
    dataset_name = DATASET_MAPPING.get(personality_file, "")
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, which would let an empty dataset name through.
    if not dataset_name:
        raise ValueError(
            f"No dataset is mapped to personality file '{personality_file}'. "
            f"Supported personality files: {sorted(DATASET_MAPPING)}"
        )
    return dataset_name
69+
70+
71+
def load_chat_model(model_name: str, token: str = None) -> OpenVINOLLM:
    """Prepare the chat model locally if needed and wrap it in an ``OpenVINOLLM``.

    When ``MODEL_DIR / model_name`` does not exist yet, the tokenizer and the
    model are fetched and saved there. Models whose name starts with the
    ``OpenVINO`` organisation are saved as they are; any other model is
    exported with a 4-bit AWQ weight quantization config.

    Args:
        model_name: Model identifier, e.g. ``"org/model"``.
        token: Optional access token; when given it is also exported as
            ``HUGGING_FACE_HUB_TOKEN``.

    Returns:
        An ``OpenVINOLLM`` reading from the local model directory, targeting
        "GPU" when one is available and "CPU" otherwise.
    """
    local_dir = MODEL_DIR / model_name

    # switching tokenizer parallelism off avoids a warning
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    if token is not None:
        os.environ["HUGGING_FACE_HUB_TOKEN"] = token

    # fetch the model and its tokenizer only when there is no local copy yet
    if not local_dir.exists():
        log.info(f"Downloading {model_name}... It may take up to 1h depending on your Internet connection and model size.")

        AutoTokenizer.from_pretrained(model_name, token=token).save_pretrained(local_dir)

        # models published under the OpenVINO organisation are used as is
        organisation = model_name.partition("/")[0]
        if organisation == "OpenVINO":
            prepared_model = OVModelForCausalLM.from_pretrained(
                model_name, export=False, compile=False, token=token
            )
        else:
            log.info(f"Loading and quantizing {model_name} to INT4...")
            log.info(f"Quantizing {model_name} to INT4... It may take significant amount of time depending on your machine power.")
            int4_config = OVWeightQuantizationConfig(
                bits=4,
                sym=False,
                ratio=0.8,
                quant_method="awq",
                group_size=128,
                dataset="wikitext2",
            )
            prepared_model = OVModelForCausalLM.from_pretrained(
                model_name,
                export=True,
                compile=False,
                quantization_config=int4_config,
                token=token,
                trust_remote_code=True,
                library_name="transformers",
            )
        prepared_model.save_pretrained(local_dir)

    target_device = "CPU"
    if "GPU" in get_available_devices():
        target_device = "GPU"

    runtime_config = {"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR": ""}
    sampling_options = {"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95}
    return OpenVINOLLM(
        context_window=4096,
        model_id_or_path=str(local_dir),
        max_new_tokens=1024,
        device_map=target_device,
        model_kwargs={"ov_config": runtime_config, "library_name": "transformers"},
        generate_kwargs=sampling_options,
    )
103+
104+
105+
def run_test_deepeval(chat_model_name, personality_file_path, auth_token, num_samples: int = 2):
    """Answer dataset questions with the chat model and print the hallucination score.

    The dataset is chosen from the personality file name, the first
    ``num_samples`` questions of its ``train`` split are sent to the chat
    engine, and the answers are scored with ``compute_deepeval_hallucination``.

    Args:
        chat_model_name: Name of the chat model passed to ``load_chat_model``.
        personality_file_path: Path to the personality YAML file; its
            ``system_configuration`` entry is used as the system prompt.
        auth_token: Access token forwarded to ``load_chat_model``.
        num_samples: Number of leading questions to evaluate. Defaults to 2,
            the previously hard-coded value; ``None`` evaluates every question.
    """
    dataset_name = get_dataset_name(personality_file_path)
    log.info("Loading dataset")
    dataset = load_dataset(dataset_name)['train']
    log.info("Dataset loading is finished")

    # Slice once up front so only the evaluated samples are processed.
    questions = dataset['question'][:num_samples]
    # We use question as context because the dataset lacks context
    contexts = [[question] for question in questions]

    with open(personality_file_path, "rb") as f:
        chatbot_config = yaml.safe_load(f)

    ov_llm = load_chat_model(chat_model_name, auth_token)
    # NOTE(review): one engine and one memory buffer are shared by all
    # questions, so later answers may be influenced by earlier turns -
    # confirm this is intended.
    ov_chat_engine = SimpleChatEngine.from_defaults(llm=ov_llm, system_prompt=chatbot_config["system_configuration"],
                                                    memory=ChatMemoryBuffer.from_defaults())
    answers = []
    for question in tqdm(questions):
        answers.append(ov_chat_engine.chat(question).response)

    final_score = compute_deepeval_hallucination(questions, answers, contexts)
    print(f"final_score is {final_score}")
128+
129+
130+
131+
if __name__ == "__main__":
    # set up logging
    log.getLogger().setLevel(log.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument("--chat_model", type=str, default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", help="Path/name of the chat model")
    # The default must be a file name that has an entry in DATASET_MAPPING;
    # the previous default (healthcare_personality.yaml) had none, so running
    # without --personality always failed in get_dataset_name.
    parser.add_argument("--personality", type=str, default="agribot_personality.yaml", help="Path to the YAML file with chatbot personality")
    parser.add_argument("--hf_token", type=str, help="HuggingFace access token for gated models")

    args = parser.parse_args()
    run_test_deepeval(args.chat_model, Path(args.personality), args.hf_token)

0 commit comments

Comments
 (0)