                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                    "hellaswag", "winogrande", "piqa", "wikitext"],
-                    type=str, help="tasks list for accuracy validation, text-generation and code-generation tasks are different.")
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
+                    type=str, help="tasks for accuracy validation, text-generation and code-generation tasks are different.")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
 parser.add_argument("--sq", action="store_true")
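In this hunk, `--tasks` switches from an `nargs='+'` list to a single comma-separated string, so any consumer must now call `.split(",")` before iterating. A minimal sketch of the new contract, standalone and outside the patch:

```python
import argparse

# Hypothetical standalone check mirroring the patched argument definition:
# --tasks is now one comma-separated string rather than an nargs='+' list.
parser = argparse.ArgumentParser()
parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                    type=str)
args = parser.parse_args(["--tasks", "lambada_openai,piqa"])
print(args.tasks.split(","))  # ['lambada_openai', 'piqa']
```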
|
@@ -351,62 +350,82 @@ def eval_func(model):
 if args.accuracy:
     user_model.eval()
     if args.code_generation:
-        from intel_extension_for_transformers.llm.evaluation.lm_code_eval import evaluate
+        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         results = evaluate(
             model=user_model,
             tokenizer=tokenizer,
-            tasks=",".join(args.tasks),
+            tasks=args.tasks,
             batch_size=args.batch_size,
             args=args,
         )
+        for task_name in args.tasks.split(","):
+            if task_name == "truthfulqa_mc":
+                acc = results["results"][task_name]["mc1"]
+            else:
+                acc = results["results"][task_name]["acc"]
     else:
-        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-        results = evaluate(
-            model="hf-causal",
-            model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
             user_model=user_model,
+            tokenizer=tokenizer,
             batch_size=args.batch_size,
             tasks=args.tasks,
+            device="cpu",
         )
+        results = evaluate(eval_args)
+        for task_name in args.tasks.split(","):
+            if task_name == "wikitext":
+                acc = results["results"][task_name]["word_perplexity,none"]
+            else:
+                acc = results["results"][task_name]["acc,none"]
 
-    dumped = json.dumps(results, indent=2)
-    if args.save_accuracy_path:
-        with open(args.save_accuracy_path, "w") as f:
-            f.write(dumped)
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
-        else:
-            acc = results["results"][task_name]["acc"]
     print("Accuracy: %.5f" % acc)
     print('Batch size = %d' % args.batch_size)
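The new result keys (`acc,none`, `word_perplexity,none`) follow the `metric,filter` naming used by recent lm-evaluation-harness releases, where `none` is the default filter. A small illustrative sketch of reading a results dict in that shape (dummy numbers, not real output):

```python
# Dummy results dict in the "<metric>,<filter>" key shape; numbers are made up.
results = {"results": {"piqa": {"acc,none": 0.78},
                       "wikitext": {"word_perplexity,none": 12.3}}}

for task_name in "piqa,wikitext".split(","):
    metrics = results["results"][task_name]
    acc = metrics["word_perplexity,none"] if task_name == "wikitext" else metrics["acc,none"]
    print(task_name, acc)
```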
 
 if args.performance:
-    user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
     import time
-
+    user_model.eval()
     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
-        model_args='pretrained=' + args.model + ',tokenizer=' + args.model \
-                   + ',dtype=float32' + ",trust_remote_code=" + str(args.trust_remote_code),
-        user_model=user_model,
-        batch_size=args.batch_size,
-        tasks=args.tasks,
-        limit=samples,
-    )
-    end = time.time()
-    for task_name in args.tasks:
-        if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
-        elif task_name == "truthfulqa_mc":
-            acc = results["results"][task_name]["mc1"]
-        else:
-            acc = results["results"][task_name]["acc"]
+
+    if args.code_generation:
+        from intel_extension_for_transformers.transformers.llm.evaluation.bigcode_eval import evaluate
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+        start = time.time()
+        results = evaluate(
+            model=user_model,
+            tokenizer=tokenizer,
+            tasks=args.tasks,
+            batch_size=args.batch_size,
+            args=args,
+        )
+        end = time.time()
+        for task_name in args.tasks.split(","):
+            if task_name == "truthfulqa_mc":
+                acc = results["results"][task_name]["mc1"]
+            else:
+                acc = results["results"][task_name]["acc"]
+    else:
+        from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+        eval_args = LMEvalParser(
+            model="hf",
+            user_model=user_model,
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            tasks=args.tasks,
+            device="cpu",
+        )
+        start = time.time()
+        results = evaluate(eval_args)
+        end = time.time()
+        for task_name in args.tasks.split(","):
+            if task_name == "wikitext":
+                acc = results["results"][task_name]["word_perplexity,none"]
+            else:
+                acc = results["results"][task_name]["acc,none"]
410 | 429 | print("Accuracy: %.5f" % acc)
|
411 | 430 | print('Throughput: %.3f samples/sec' % (samples / (end - start)))
|
412 | 431 | print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
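For reference, the throughput and latency prints reduce to the arithmetic below (illustrative numbers, not measured results):

```python
# Illustrative numbers only, not measured results.
iters, batch_size = 100, 1
samples = iters * batch_size      # 100, matching samples = args.iters * args.batch_size
elapsed = 25.0                    # stands in for end - start (seconds)
print('Throughput: %.3f samples/sec' % (samples / elapsed))  # Throughput: 4.000 samples/sec
print('Latency: %.3f ms' % (elapsed * 1000 / samples))       # Latency: 250.000 ms
```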