
Commit fdb5097

xin3he and chensuyue authored
upgrade lm_eval to 0.4.2 following ITREX (#1727)
Signed-off-by: xin3he <xin3.he@intel.com>
Co-authored-by: chensuyue <suyue.chen@intel.com>
1 parent 4351bc8 commit fdb5097

File tree

8 files changed: +97 -79 lines changed

.azure-pipelines/model-test-3x.yml (+1)

@@ -10,6 +10,7 @@ pr:
   include:
     - neural_compressor/common
    - neural_compressor/torch
+    - examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm
    - setup.py
    - requirements_pt.txt
    - .azure-pipelines/scripts/models

.github/checkgroup.yml (+1)

@@ -64,6 +64,7 @@ subprojects:
     paths:
       - "neural_compressor/common/**"
      - "neural_compressor/torch/**"
+      - "examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/**"
      - "setup.py"
      - "requirements_pt.txt"
      - ".azure-pipelines/scripts/models/**"

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt (+1 -1)

@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-lm-eval
+lm_eval==0.4.2
 peft
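The requirements files in this commit replace a floating lm-eval dependency (or a pinned git commit of lm-evaluation-harness) with the released lm_eval==0.4.2 that ITREX targets. A minimal sketch for asserting the pin at runtime; it assumes the installed distribution name matches the requirements line, and uses only the standard library:

    # Sketch: confirm the environment actually has the pinned lm_eval release.
    # importlib.metadata reads installed package metadata; lm_eval itself is not imported.
    from importlib.metadata import version

    installed = version("lm_eval")  # distribution name as written in requirements.txt
    assert installed == "0.4.2", f"expected lm_eval==0.4.2, found {installed}"
    print("lm_eval", installed)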

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py (+20 -25)

@@ -51,7 +51,7 @@
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
 parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
-                    type=str, help="tasks list for accuracy validation")
+                    type=str, help="tasks for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
 parser.add_argument("--sq", action="store_true")

@@ -372,39 +372,36 @@ def run_fn(model):
     )
     user_model.save(args.output_dir)

-if args.int8 or args.int8_bf16_mixed:
-    print("load int8 model")
-
-    from neural_compressor.torch.algorithms.static_quant import load
-
-    if args.ipex:
-        user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
-    else:
-        # TODO: WOQ save&load
-        print("Int8 model loading does not support WeightOnlyQuant now.")
-        pass
-else:
-    user_model, _ = get_user_model()
+# TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
+# if args.int8 or args.int8_bf16_mixed:
+#     print("load int8 model")
+
+#     # TODO: from neural_compressor.torch.quantization import load
+#     from neural_compressor.torch.algorithms.static_quant import load
+
+#     if args.ipex:
+#         user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
+#     else:
+#         # TODO: WOQ save&load
+#         print("Int8 model loading does not support WeightOnlyQuant now.")
+#         pass
+# else:
+#     user_model, _ = get_user_model()


 if args.accuracy:
     user_model.eval()
     from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
-        model="hf",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        model="hf",
         user_model=user_model,
-        tokenizer = tokenizer,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         device="cpu",
     )
     results = evaluate(eval_args)
-
-    dumped = json.dumps(results, indent=2)
-    if args.save_accuracy_path:
-        with open(args.save_accuracy_path, "w") as f:
-            f.write(dumped)
     for task_name in args.tasks.split(","):
         if task_name == "wikitext":
             acc = results["results"][task_name]["word_perplexity,none"]

@@ -415,16 +412,14 @@ def run_fn(model):

 if args.performance:
     user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     import time

     samples = args.iters * args.batch_size
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
     eval_args = LMEvalParser(
-        model="hf",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        model="hf",
         user_model=user_model,
-        tokenizer = tokenizer,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         limit=samples,
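The substantive change in this script is the move from a model_args string to the LMEvalParser arguments object that ITREX wraps around lm_eval 0.4.2, together with the ",none"-suffixed metric keys of the new results schema. A minimal standalone sketch of the new call pattern as it appears in this diff; the checkpoint name is illustrative, and it assumes intel-extension-for-transformers with lm_eval 0.4.2 is installed:

    # Sketch: ITREX-wrapped lm_eval 0.4.2 evaluation, mirroring the pattern in this commit.
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import (
        evaluate, LMEvalParser,
    )

    model_name = "facebook/opt-125m"  # illustrative checkpoint, not from the commit
    user_model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    eval_args = LMEvalParser(
        model="hf",             # backend selector; the in-memory model goes in user_model
        user_model=user_model,  # lets a quantized model be evaluated without reloading
        tokenizer=tokenizer,
        batch_size=8,
        tasks="lambada_openai",
        device="cpu",
    )
    results = evaluate(eval_args)
    # lm_eval 0.4.x keys metrics as "<metric>,<filter>", hence "acc,none".
    print(results["results"]["lambada_openai"]["acc,none"])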
(requirements file, +1 -1; file path not captured on this page)

@@ -1,7 +1,7 @@
 accelerate
 datasets
 einops
-intel_extension_for_transformers
+intel-extension-for-transformers
 optimum
 peft
 sentencepiece

@@ -10,4 +10,4 @@ torch
 tqdm
 tiktoken
 transformers_stream_generator
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm_eval==0.4.2

examples/pytorch/nlp/huggingface_models/language-modeling/pruning/eager/run_clm_sparsegpt.py (+13 -11)

@@ -51,7 +51,6 @@ def skip(*args, **kwargs):
 from timers import CPUTimer, GPUTimer
 from neural_compressor.training import WeightPruningConfig, prepare_pruning
 from neural_compressor.compression.pruner import (parse_auto_slim_config)
-from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate

 check_min_version("4.27.0.dev0")
 logger = logging.getLogger(__name__)

@@ -271,8 +270,8 @@ def parse_args():
                         help="Transformers parameter: use the external repo")

     # Evaluation config
-    parser.add_argument("--tasks", default=["lambada_openai"],
-                        help="Usually chosen with ['lambada_openai','hellaswag','winogrande','piqa']",
+    parser.add_argument("--tasks", default="lambada_openai",
+                        type=str, help="tasks for accuracy validation",
                         )
     parser.add_argument("--use_accelerate", action='store_true',
                         help="Usually use to accelerate evaluation for large models"

@@ -588,14 +587,17 @@ def group_texts(examples):
     model_args = f'pretrained={model_name},tokenizer={model_name},dtype={dtype},use_accelerate={args.use_accelerate},trust_remote_code={args.trust_remote_code}'
     eval_batch = args.per_device_eval_batch_size
     user_model = None if args.use_accelerate else model
-    results = evaluate(
-        model="hf-causal",
-        model_args=model_args,
-        user_model=user_model,
-        batch_size=eval_batch,
-        tasks=args.tasks,
-        device=device,
-    )
+
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
+        user_model=user_model,
+        tokenizer=tokenizer,
+        batch_size=eval_batch,
+        tasks=args.tasks,
+        device=device,
+    )
+    results = evaluate(eval_args)

 if __name__ == "__main__":
     main()
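The --tasks change above follows the convention adopted across this commit: one comma-separated string instead of a Python list (or nargs='+' in the quantization script below), split only when iterating over results. A two-line illustration using default task names from these scripts:

    # --tasks now arrives as a single comma-separated string; split it when reading results.
    tasks = "lambada_openai,hellaswag,winogrande,piqa"
    for task_name in tasks.split(","):
        print(task_name)  # one lm_eval task id per iteration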

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt (+1 -1)

@@ -10,4 +10,4 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm_eval==0.4.2

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py (+58 -39)

@@ -50,9 +50,8 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                    "hellaswag", "winogrande", "piqa", "wikitext"],
-                    type=str, help="tasks list for accuracy validation, text-generation and code-generation tasks are different.")
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
+                    type=str, help="tasks for accuracy validation, text-generation and code-generation tasks are different.")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
 parser.add_argument("--sq", action="store_true")

@@ -351,62 +350,82 @@ def eval_func(model):
 if args.accuracy:
     user_model.eval()
     if args.code_generation:
-        from intel_extension_for_transformers.llm.evaluation.lm_code_eval import evaluate
+        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         results = evaluate(
             model=user_model,
             tokenizer=tokenizer,
-            tasks=",".join(args.tasks),
+            tasks=args.tasks,
             batch_size=args.batch_size,
             args=args,
         )
+        for task_name in args.tasks:
+            if task_name == "truthfulqa_mc":
+                acc = results["results"][task_name]["mc1"]
+            else:
+                acc = results["results"][task_name]["acc"]
     else:
-        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-        results = evaluate(
-            model="hf-causal",
-            model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
             user_model=user_model,
+            tokenizer=tokenizer,
             batch_size=args.batch_size,
             tasks=args.tasks,
+            device="cpu",
         )
+        results = evaluate(eval_args)
+        for task_name in args.tasks.split(","):
+            if task_name == "wikitext":
+                acc = results["results"][task_name]["word_perplexity,none"]
+            else:
+                acc = results["results"][task_name]["acc,none"]

-    dumped = json.dumps(results, indent=2)
-    if args.save_accuracy_path:
-        with open(args.save_accuracy_path, "w") as f:
-            f.write(dumped)
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
-        else:
-            acc = results["results"][task_name]["acc"]
     print("Accuracy: %.5f" % acc)
     print('Batch size = %d' % args.batch_size)

 if args.performance:
-    user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
     import time
-
+    user_model.eval()
     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model \
-            + ',dtype=float32' + ",trust_remote_code=" + str(args.trust_remote_code),
-        user_model=user_model,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-        limit=samples,
-    )
-    end = time.time()
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
-        elif task_name == "truthfulqa_mc":
-            acc = results["results"][task_name]["mc1"]
-        else:
-            acc = results["results"][task_name]["acc"]
+
+    if args.code_generation:
+        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+        start = time.time()
+        results = evaluate(
+            model=user_model,
+            tokenizer=tokenizer,
+            tasks=args.tasks,
+            batch_size=args.batch_size,
+            args=args,
+        )
+        end = time.time()
+        for task_name in args.tasks:
+            if task_name == "truthfulqa_mc":
+                acc = results["results"][task_name]["mc1"]
+            else:
+                acc = results["results"][task_name]["acc"]
+    else:
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
+            user_model=user_model,
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            tasks=args.tasks,
+            device="cpu",
+        )
+        start = time.time()
+        results = evaluate(eval_args)
+        end = time.time()
+        for task_name in args.tasks.split(","):
+            if task_name == "wikitext":
+                acc = results["results"][task_name]["word_perplexity,none"]
+            else:
+                acc = results["results"][task_name]["acc,none"]
     print("Accuracy: %.5f" % acc)
     print('Throughput: %.3f samples/sec' % (samples / (end - start)))
     print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
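Every result lookup in this commit changes key because lm_eval 0.4.x namespaces each metric with its post-processing filter: "acc" becomes "acc,none" and "word_perplexity" becomes "word_perplexity,none". A small hypothetical helper (not part of the commit) that tolerates both layouts:

    # Sketch: read a metric from an lm_eval results dict across 0.3.x ("acc")
    # and 0.4.x ("acc,none") key layouts. Helper name is illustrative.
    def get_metric(results, task_name, metric="acc"):
        task_results = results["results"][task_name]
        # 0.4.x appends the filter name (default "none") after a comma.
        return task_results.get(f"{metric},none", task_results.get(metric))

    # Example: get_metric(results, "lambada_openai") returns accuracy under either schema.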
