Commit 65e8362 (1 parent: d7d117a)

Added ability to compare results vs. llama.cpp (openvinotoolkit#1461)

Example:

```bash
rm -rf results/smollm2_N_FP16/gt.csv
mkdir -p results/smollm2_N_FP16

# References from PyTorch FP16
wwb --base-model HuggingFaceTB/SmolLM2-360M-Instruct --gt-data results/smollm2_N_FP16/gt.csv --hf --num-samples 4

# huggingface-cli download "bartowski/SmolLM2-360M-Instruct-GGUF" "SmolLM2-360M-Instruct-f16.gguf"
wwb --target-model models/SmolLM2-360M-Instruct-f16.gguf --gt-data results/smollm2_N_FP16/gt.csv --llamacpp --output results/smollm2_N_L_FP16 --num-samples
```
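Before running the comparison, it is worth confirming that the GGUF file loads with llama-cpp-python at all. A minimal sanity-check sketch, assuming the model has been downloaded to the `models/` directory used in the example above:

```python
# Quick check that the GGUF loads and generates with llama-cpp-python.
# The model path is the one assumed in the commit-message example; adjust as needed.
from llama_cpp import Llama

llm = Llama(model_path="models/SmolLM2-360M-Instruct-f16.gguf")
out = llm("The capital of France is", max_tokens=8, temperature=0.0)
print(out["choices"][0]["text"])
```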
3 files changed (+88 -20 lines)
tools/who_what_benchmark/whowhatbench/model_loaders.py (+17 -3)

@@ -41,8 +41,19 @@ def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None):
     return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device=device, **ov_config), model_dir, "text")


+def load_text_llamacpp_pipeline(model_dir):
+    try:
+        from llama_cpp import Llama
+    except ImportError:
+        logger.error(
+            "Failed to import llama_cpp package. Please install llama-cpp-python.")
+        exit(-1)
+    model = Llama(model_dir)
+    return model
+
+
 def load_text_model(
-    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False,
 ):
     if use_hf:
         logger.info("Using HF Transformers API")
@@ -53,6 +64,9 @@ def load_text_model(
     elif use_genai:
         logger.info("Using OpenVINO GenAI API")
         model = load_text_genai_pipeline(model_id, device, ov_config)
+    elif use_llamacpp:
+        logger.info("Using llama.cpp API")
+        model = load_text_llamacpp_pipeline(model_id)
     else:
         logger.info("Using Optimum API")
         from optimum.intel.openvino import OVModelForCausalLM
@@ -276,7 +290,7 @@ def load_inpainting_model(


 def load_model(
-    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+    model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False, use_llamacpp=False
 ):
     if model_id is None:
         return None
@@ -288,7 +302,7 @@ def load_model(
     ov_options = {}

     if model_type == "text":
-        return load_text_model(model_id, device, ov_options, use_hf, use_genai)
+        return load_text_model(model_id, device, ov_options, use_hf, use_genai, use_llamacpp)
     elif model_type == "text-to-image":
         return load_text2image_model(
             model_id, device, ov_options, use_hf, use_genai
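The new `use_llamacpp` flag threads from the CLI through `load_model` down to the llama.cpp loader above. A minimal sketch of calling the updated entry point directly; the import path and GGUF location are assumptions for illustration, not part of the diff:

```python
# Sketch: dispatching to the llama.cpp loader via the updated load_model signature.
# The module path and GGUF file below are assumed for illustration.
from whowhatbench.model_loaders import load_model

target = load_model(
    model_type="text",
    model_id="models/SmolLM2-360M-Instruct-f16.gguf",
    device="CPU",
    ov_config=None,
    use_hf=False,
    use_genai=False,
    use_llamacpp=True,  # new flag in this commit: returns a llama_cpp.Llama instance
)
```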

tools/who_what_benchmark/whowhatbench/text_evaluator.py (+18 -9)

@@ -108,6 +108,7 @@ def __init__(
         generation_config=None,
         generation_config_base=None,
         seqs_per_request=None,
+        use_chat_template=None,
     ) -> None:
         assert (
             base_model is not None or gt_data is not None
@@ -123,6 +124,7 @@ def __init__(
         self.generation_config_base = generation_config
         self.seqs_per_request = seqs_per_request
         self.generation_fn = gen_answer_fn
+        self.use_chat_template = use_chat_template
         if self.generation_config is not None:
             assert self.seqs_per_request is not None

@@ -202,15 +204,21 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
         return res

     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
-        def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question):
-            inputs = self.tokenizer(prompt, return_tensors="pt")
-
-            tokens = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
-
-            if crop_question:
-                tokens = tokens[:, inputs["input_ids"].shape[-1] :]
-
-            return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
+        def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question, use_chat_template=False):
+            if use_chat_template:
+                message = [{"role": "user", "content": prompt}]
+                inputs = tokenizer.apply_chat_template(message, tokenize=True, add_generation_prompt=True, return_tensors="pt")
+                tokens = model.generate(inputs, do_sample=False, max_new_tokens=max_new_tokens)
+                if crop_question:
+                    tokens = tokens[:, inputs.shape[-1]:]
+                res = self.tokenizer.decode(tokens[0], skip_special_tokens=True)
+                return res
+            else:
+                inputs = self.tokenizer(prompt, return_tensors="pt")
+                tokens = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
+                if crop_question:
+                    tokens = tokens[:, inputs["input_ids"].shape[-1] :]
+                return self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]

         gen_answer_fn = gen_answer_fn or default_gen_answer

@@ -250,6 +258,7 @@ def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question):
                         p,
                         self.max_new_tokens,
                         self._crop_question,
+                        self.use_chat_template
                     )
                 )
             else:
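The chat-template branch added to `default_gen_answer` relies on the standard Transformers chat-template API. A standalone sketch of the same flow, with an assumed model id and prompt for illustration:

```python
# Standalone sketch of the chat-template generation path added above.
# Model id and prompt are assumptions for illustration; any chat-tuned HF model works the same way.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "HuggingFaceTB/SmolLM2-360M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [{"role": "user", "content": "Who wrote Hamlet?"}]
inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
)
tokens = model.generate(inputs, do_sample=False, max_new_tokens=32)
# Crop the prompt tokens so only the newly generated answer remains (the crop_question=True path).
answer = tokenizer.decode(tokens[0, inputs.shape[-1]:], skip_special_tokens=True)
print(answer)
```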

tools/who_what_benchmark/whowhatbench/wwb.py (+53 -8)

@@ -40,6 +40,11 @@ def parse_args():
         default=None,
         help="Tokenizer for divergency metric. If not provided, it will be load from base_model or target_model.",
     )
+    parser.add_argument(
+        "--chat-template",
+        action="store_true",
+        help="Whether apply the default chat template.",
+    )
     parser.add_argument(
         "--gt-data",
         default=None,
@@ -137,6 +142,11 @@ def parse_args():
         action="store_true",
         help="Use LLMPipeline from transformers library to instantiate the model.",
     )
+    parser.add_argument(
+        "--llamacpp",
+        action="store_true",
+        help="Use llama-cpp-python to instantiate the model.",
+    )
     parser.add_argument(
         "--image-size",
         type=int,
@@ -190,9 +200,13 @@ def load_prompts(args):
 def load_tokenizer(args):
     tokenizer = None
     if args.tokenizer is not None:
-        tokenizer = AutoTokenizer.from_pretrained(
-            args.tokenizer, trust_remote_code=True
-        )
+        if args.llamacpp:
+            from llama_cpp.llama_tokenizer import LlamaHFTokenizer
+            tokenizer = LlamaHFTokenizer.from_pretrained(args.tokenizer)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(
+                args.tokenizer, trust_remote_code=True
+            )
     elif args.base_model is not None:
         tokenizer = AutoTokenizer.from_pretrained(
             args.base_model, trust_remote_code=True
@@ -246,8 +260,29 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str:
     return "".join(output)


-def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question):
-    return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
+    if use_chat_template:
+        model.start_chat()
+        result = model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+        model.finish_chat()
+        return result
+    else:
+        return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens)
+
+
+def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
+    if use_chat_template:
+        output = model.create_chat_completion(messages=[{"role": "user", "content": question}], max_tokens=max_new_tokens, temperature=0.0)
+        text = output["choices"][0]["message"]["content"]
+        if skip_question:
+            text = text[len(question):]
+        return text
+    else:
+        output = model(question, max_tokens=max_new_tokens, echo=True, temperature=0.0)
+        text = output["choices"][0]["text"]
+        if skip_question:
+            text = text[len(question):]
+        return text


 def genai_gen_image(model, prompt, num_inference_steps, generator=None):
@@ -322,7 +357,15 @@ def create_evaluator(base_model, args):
     prompts = load_prompts(args)

     if task == "text":
-        tokenizer = load_tokenizer(args)
+        tokenizer = load_tokenizer(args) if not args.llamacpp else None
+
+        if args.genai:
+            gen_answer_fn = genai_gen_text
+        elif args.llamacpp:
+            gen_answer_fn = llamacpp_gen_text
+        else:
+            gen_answer_fn = None
+
         return EvaluatorCLS(
             base_model=base_model,
             gt_data=args.gt_data,
@@ -331,7 +374,8 @@ def create_evaluator(base_model, args):
             similarity_model_id=args.data_encoder,
             num_samples=args.num_samples,
             language=args.language,
-            gen_answer_fn=genai_gen_text if args.genai else None,
+            gen_answer_fn=gen_answer_fn,
+            use_chat_template=args.chat_template,
         )
     elif task == "text-to-image":
         return EvaluatorCLS(
@@ -467,10 +511,11 @@ def main():
         args.ov_config,
         args.hf,
         args.genai,
+        args.llamacpp
     )
     all_metrics_per_question, all_metrics = evaluator.score(
         target_model,
-        evaluator.get_generation_fn() if args.genai else None,
+        evaluator.get_generation_fn() if args.genai or args.llamacpp else None,
         output_dir=args.output
     )
     logger.info("Metrics for model: %s", args.target_model)
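`llamacpp_gen_text` above wraps the two llama-cpp-python generation entry points: `create_chat_completion` when `--chat-template` is set and a plain completion call otherwise, both with `temperature=0.0` to mirror the greedy decoding (`do_sample=False`) used on the OpenVINO and HF sides. A minimal standalone sketch of the chat path, with an assumed GGUF path and question:

```python
# Sketch of the chat-completion path used by llamacpp_gen_text when --chat-template is set.
# The GGUF path and question are assumptions for illustration.
from llama_cpp import Llama

model = Llama("models/SmolLM2-360M-Instruct-f16.gguf")
output = model.create_chat_completion(
    messages=[{"role": "user", "content": "What is OpenVINO?"}],
    max_tokens=128,
    temperature=0.0,  # greedy decoding, matching do_sample=False elsewhere in wwb
)
print(output["choices"][0]["message"]["content"])
```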
