add ipex model example for text-generation

jiqing-feng · jiqing-feng · commit 4f0248515fc8 · 2024-03-11T12:01:43.000-04:00
diff --git a/examples/ipex/text-generation/README.md b/examples/ipex/text-generation/README.md
@@ -0,0 +1,31 @@
+<!---
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+## Language generation
+
+Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-generation/run_generation.py).
+
+The original generation task only supported the PyTorch eager and graph model. By calling the `IPEXModelForCausalLM` class, we can now apply ipex optimizations to the eager and graph model for generation tasks.
+
+
+Example usage:
+### Use bf16 and JIT model
+```bash
+python run_generation.py \
+    --model_name_or_path=gpt2 \
+    --bf16 \
+    --jit
+```
diff --git a/examples/ipex/text-generation/requirements.txt b/examples/ipex/text-generation/requirements.txt
@@ -0,0 +1,3 @@
+sentencepiece
+protobuf
+torch >= 2.1.0
diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2024, INTEL CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
+"""
+
+
+import argparse
+import logging
+
+import torch
+from accelerate import PartialState
+from accelerate.utils import set_seed
+from transformers import AutoTokenizer
+
+from optimum.intel.ipex import IPEXModelForCausalLM
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
+
+
+def adjust_length_to_model(length, max_sequence_length):
+    if length < 0 and max_sequence_length > 0:
+        length = max_sequence_length
+    elif 0 < max_sequence_length < length:
+        length = max_sequence_length  # No generation bigger than model size
+    elif length < 0:
+        length = MAX_LENGTH  # avoid infinite loop
+    return length
+
+
+def sparse_model_config(model_config):
+    embedding_size = None
+    if hasattr(model_config, "hidden_size"):
+        embedding_size = model_config.hidden_size
+    elif hasattr(model_config, "n_embed"):
+        embedding_size = model_config.n_embed
+    elif hasattr(model_config, "n_embd"):
+        embedding_size = model_config.n_embd
+
+    num_head = None
+    if hasattr(model_config, "num_attention_heads"):
+        num_head = model_config.num_attention_heads
+    elif hasattr(model_config, "n_head"):
+        num_head = model_config.n_head
+
+    if embedding_size is None or num_head is None or num_head == 0:
+        raise ValueError("Check the model config")
+
+    num_embedding_size_per_head = int(embedding_size / num_head)
+    if hasattr(model_config, "n_layer"):
+        num_layer = model_config.n_layer
+    elif hasattr(model_config, "num_hidden_layers"):
+        num_layer = model_config.num_hidden_layers
+    else:
+        raise ValueError("Number of hidden layers couldn't be determined from the model config")
+
+    return num_layer, num_head, num_embedding_size_per_head
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pre-trained model or shortcut name",
+    )
+
+    parser.add_argument("--prompt", type=str, default="")
+    parser.add_argument("--length", type=int, default=20)
+    parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
+
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        default=1.0,
+        help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
+    )
+    parser.add_argument(
+        "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
+    )
+    parser.add_argument("--k", type=int, default=0)
+    parser.add_argument("--p", type=float, default=0.9)
+
+    parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.")
+    parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
+
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+    parser.add_argument(
+        "--use_cpu",
+        action="store_true",
+        help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
+    )
+    parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to use bfloat 16-bit precision (through INTEL AMX or AVX_512) instead of 32-bit",
+    )
+    parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
+    args = parser.parse_args()
+
+    if args.fp16 and args.bf16:
+        raise ValueError("You can only choose one of {fp16, bf16}")
+
+    torch_dtype = torch.float32
+    if args.fp16:
+        torch_dtype = torch.float16
+    if args.bf16:
+        torch_dtype = torch.bfloat16
+
+    # Initialize the distributed state.
+    distributed_state = PartialState(cpu=args.use_cpu)
+
+    logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16 or args.bf16}")
+
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Initialize the model and tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = IPEXModelForCausalLM.from_pretrained(args.model_name_or_path, export=args.jit, torch_dtype=torch_dtype)
+
+    # Set the model to the right device
+    model.to(distributed_state.device)
+
+    max_seq_length = getattr(model.config, "max_position_embeddings", 0)
+    args.length = adjust_length_to_model(args.length, max_sequence_length=max_seq_length)
+    logger.info(args)
+
+    prompt_text = args.prompt if args.prompt else input("Model prompt >>> ")
+
+    prefix = args.prefix if args.prefix else args.padding_text
+    encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt")
+    encoded_prompt = encoded_prompt.to(distributed_state.device)
+
+    if encoded_prompt.size()[-1] == 0:
+        input_ids = None
+    else:
+        input_ids = encoded_prompt
+
+    output_sequences = model.generate(
+        input_ids=input_ids,
+        max_length=args.length + len(encoded_prompt[0]),
+        temperature=args.temperature,
+        top_k=args.k,
+        top_p=args.p,
+        repetition_penalty=args.repetition_penalty,
+        do_sample=True,
+        num_return_sequences=args.num_return_sequences,
+    )
+
+    # Remove the batch dimension when returning multiple sequences
+    if len(output_sequences.shape) > 2:
+        output_sequences.squeeze_()
+
+    generated_sequences = []
+
+    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
+        print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
+        generated_sequence = generated_sequence.tolist()
+
+        # Decode text
+        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
+
+        # Remove all text after the stop token
+        text = text[: text.find(args.stop_token) if args.stop_token else None]
+
+        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
+        total_sequence = (
+            prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
+        )
+
+        generated_sequences.append(total_sequence)
+        print(total_sequence)
+
+    return generated_sequences
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+sentencepiece`
	`2`	`+protobuf`
	`3`	`+torch >= 2.1.0`