From 1b89624a38159620a2d24e8210e3459957e9200e Mon Sep 17 00:00:00 2001 From: "Feng, Jiqing" Date: Sun, 7 Jan 2024 21:16:32 -0800 Subject: [PATCH 01/30] define optimum-intel pipeline --- optimum/intel/pipelines/__init__.py | 410 ++++++++++++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 optimum/intel/pipelines/__init__.py diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py new file mode 100644 index 0000000000..4303ddfb59 --- /dev/null +++ b/optimum/intel/pipelines/__init__.py @@ -0,0 +1,410 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +from huggingface_hub import model_info +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto.configuration_auto import AutoConfig +from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer +from transformers.pipelines.base import ( + Pipeline, + PipelineRegistry, + get_default_model_and_revision, +) +from transformers.pipelines.text_generation import TextGenerationPipeline +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import ( + CONFIG_NAME, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + cached_file, + extract_commit_hash, + is_offline_mode, + is_torch_available, + logging, +) + +from ..generation.modeling import TSModelForCausalLM + + +if is_torch_available(): + import torch + from transformers.models.auto.modeling_auto import AutoModelForCausalLM + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +logger = logging.get_logger(__name__) + + +# Register all the supported tasks here +TASK_ALIASES = { + "sentiment-analysis": "text-classification", +} +SUPPORTED_TASKS = { + "text-generation": { + "impl": TextGenerationPipeline, + "pt": (AutoModelForCausalLM,) if is_torch_available() else (), + "default": {"model": {"pt": ("gpt2", "6c0e608")}}, + "type": "text", + }, +} + + +PIPELINE_REGISTRY = PipelineRegistry(supported_tasks=SUPPORTED_TASKS, task_aliases=TASK_ALIASES) + + +def get_supported_tasks() -> List[str]: + """ + Returns a list of supported task strings. + """ + return PIPELINE_REGISTRY.get_supported_tasks() + + +def get_task(model: str, token: Optional[str] = None, **deprecated_kwargs) -> str: + use_auth_token = deprecated_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.") + token = use_auth_token + + if is_offline_mode(): + raise RuntimeError("You cannot infer task automatically within `pipeline` when using offline mode") + try: + info = model_info(model, token=token) + except Exception as e: + raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}") + if not info.pipeline_tag: + raise RuntimeError( + f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically" + ) + if getattr(info, "library_name", "transformers") != "transformers": + raise RuntimeError(f"This model is meant to be used with {info.library_name} not with transformers") + task = info.pipeline_tag + return task + + +def check_task(task: str) -> Tuple[str, Dict, Any]: + """ + Checks an incoming task string, to validate it's correct and return the default Pipeline and Model classes, and + default models if they exist. + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"text-generation"` + + Returns: + (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name + (removed alias and options). The actual dictionary required to initialize the pipeline and some extra task + options for parametrized tasks like "translation_XX_to_YY" + + + """ + return PIPELINE_REGISTRY.check_task(task) + + +def clean_custom_task(task_info): + import transformers + + if "impl" not in task_info: + raise RuntimeError("This model introduces a custom pipeline without specifying its implementation.") + pt_class_names = task_info.get("pt", ()) + if isinstance(pt_class_names, str): + pt_class_names = [pt_class_names] + task_info["pt"] = tuple(getattr(transformers, c) for c in pt_class_names) + return task_info, None + + +def pipeline( + task: str = None, + model: Optional[Union[str, "PreTrainedModel"]] = None, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + revision: Optional[str] = None, + use_fast: bool = True, + token: Optional[Union[str, bool]] = None, + device: Optional[Union[int, str, "torch.device"]] = None, + device_map=None, + torch_dtype=None, + trust_remote_code: Optional[bool] = None, + model_kwargs: Dict[str, Any] = None, + pipeline_class: Optional[Any] = None, + **kwargs, +) -> Pipeline: + """ + Utility factory method to build a [`Pipeline`]. + + Pipelines are made of: + + - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. + - A [model](model) to make predictions from the inputs. + - Some (optional) post processing for enhancing model's output. + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"text-generation"`: will return a [`TextGenerationPipeline`]:. + + model (`str` or [`PreTrainedModel`], *optional*): + The model that will be used by the pipeline to make predictions. This can be a model identifier or an + actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). + + If not provided, the default for the `task` will be loaded. + config (`str` or [`PretrainedConfig`], *optional*): + The configuration that will be used by the pipeline to instantiate the model. This can be a model + identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`]. + + If not provided, the default configuration file for the requested model will be used. 
That means that if + `model` is given, its default configuration will be used. However, if `model` is not supplied, this + `task`'s default model's config is used instead. + tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): + The tokenizer that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. + + If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). + However, if `config` is also not given or not a string, then the default tokenizer for the given `task` + will be loaded. + revision (`str`, *optional*, defaults to `"main"`): + When passing a task name or a string model identifier: The specific model version to use. It can be a + branch name, a tag name, or a commit id, since we use a git-based system for storing models and other + artifacts on huggingface.co, so `revision` can be any identifier allowed by git. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + device (`int` or `str` or `torch.device`): + Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this + pipeline will be allocated. + device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set + `device_map="auto"` to compute the most optimized `device_map` automatically (see + [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload) + for more information). + + + + Do not use `device_map` AND `device` at the same time as they will conflict + + + + torch_dtype (`str` or `torch.dtype`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model + (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom code defined on the Hub in their own modeling, configuration, + tokenization or even pipeline files. This option should only be set to `True` for repositories you trust + and in which you have read the code, as it will execute code present on the Hub on your local machine. + model_kwargs (`Dict[str, Any]`, *optional*): + Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., + **model_kwargs)` function. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the specific pipeline init (see the documentation for the + corresponding pipeline class for possible values). + + Returns: + [`Pipeline`]: A suitable pipeline for the task. + + Examples: + + ```python + >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer + + >>> # Sentiment analysis pipeline + >>> analyzer = pipeline("sentiment-analysis") + + >>> # Question answering pipeline, specifying the checkpoint identifier + >>> oracle = pipeline( + ... 
"question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="bert-base-cased" + ... ) + + >>> # Named entity recognition pipeline, passing in a specific model and tokenizer + >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") + >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") + >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer) + ```""" + if model_kwargs is None: + model_kwargs = {} + + code_revision = kwargs.pop("code_revision", None) + commit_hash = kwargs.pop("_commit_hash", None) + + hub_kwargs = { + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, + "_commit_hash": commit_hash, + } + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model " + "being specified. " + "Please provide a task class or a model" + ) + + if task != "text-generation": + raise ValueError("Optimum-intel ipex optimization only supports text-generation task for now.") + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" + " may not be compatible with the default model. Please provide a PreTrainedModel class or a" + " path/identifier to a pretrained model when providing tokenizer." + ) + + if isinstance(model, Path): + model = str(model) + + if commit_hash is None: + pretrained_model_name_or_path = None + if isinstance(config, str): + pretrained_model_name_or_path = config + elif config is None and isinstance(model, str): + pretrained_model_name_or_path = model + + if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None: + # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible + resolved_config_file = cached_file( + pretrained_model_name_or_path, + CONFIG_NAME, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + **hub_kwargs, + ) + hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash) + else: + hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None) + + # Config is the primordial information item. + # Instantiate config if needed + if isinstance(config, str): + config = AutoConfig.from_pretrained( + config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config._commit_hash + elif config is None and isinstance(model, str): + config = AutoConfig.from_pretrained( + model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config._commit_hash + + if task is None and model is not None: + if not isinstance(model, str): + raise RuntimeError( + "Inferring the task automatically requires to check the hub with a model_id defined as a `str`. " + f"{model} is not a valid model_id." 
+ ) + task = get_task(model, token) + + normalized_task, targeted_task, task_options = check_task(task) + if pipeline_class is None: + pipeline_class = targeted_task["impl"] + + # Use default model/config/tokenizer for the task if no model is provided + if model is None: + model, default_revision = get_default_model_and_revision(targeted_task, "pt", task_options) + revision = revision if revision is not None else default_revision + logger.warning( + f"No model was supplied, defaulted to {model} and revision" + f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n" + "Using a pipeline without specifying a model name and revision in production is not recommended." + ) + if config is None and isinstance(model, str): + config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs) + hub_kwargs["_commit_hash"] = config._commit_hash + + if device_map is not None: + if "device_map" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those' + " arguments might conflict, use only one.)" + ) + if device is not None: + logger.warning( + "Both `device` and `device_map` are specified. `device` will override `device_map`. You" + " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`." + ) + model_kwargs["device_map"] = device_map + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + model_kwargs["torch_dtype"] = torch_dtype + + model_name = model if isinstance(model, str) else None + + # Load the correct model if possible + # Infer the framework from the model if not already defined + if isinstance(model, str): + model = TSModelForCausalLM.from_pretrained(model, config=config, export=True, **model_kwargs) + + model_config = model.config + hub_kwargs["_commit_hash"] = model.config._commit_hash + load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None + + if load_tokenizer: + # Try to infer tokenizer from model or config name (if provided as str) + if tokenizer is None: + if isinstance(model_name, str): + tokenizer = model_name + elif isinstance(config, str): + tokenizer = config + else: + # Impossible to guess what is the right tokenizer here + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." 
+ ) + + # Instantiate tokenizer if needed + if isinstance(tokenizer, (str, tuple)): + if isinstance(tokenizer, tuple): + # For tuple we have (tokenizer name, {kwargs}) + use_fast = tokenizer[1].pop("use_fast", use_fast) + tokenizer_identifier = tokenizer[0] + tokenizer_kwargs = tokenizer[1] + else: + tokenizer_identifier = tokenizer + tokenizer_kwargs = model_kwargs.copy() + tokenizer_kwargs.pop("torch_dtype", None) + + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs + ) + + if tokenizer is not None: + kwargs["tokenizer"] = tokenizer + + if torch_dtype is not None: + kwargs["torch_dtype"] = torch_dtype + + if device is not None: + kwargs["device"] = device + + return pipeline_class(model=model, framework="pt", task=task, **kwargs) From 2bf212208b98ca9c0eed509b556a3c74ceceb5f4 Mon Sep 17 00:00:00 2001 From: "Feng, Jiqing" Date: Sun, 7 Jan 2024 22:30:58 -0800 Subject: [PATCH 02/30] add tests and readme --- README.md | 40 +++++++++++++++++++++ optimum/intel/generation/modeling.py | 1 + optimum/intel/pipelines/__init__.py | 5 +++ tests/pipelines/test_pipelines.py | 53 ++++++++++++++++++++++++++++ 4 files changed, 99 insertions(+) create mode 100644 tests/pipelines/test_pipelines.py diff --git a/README.md b/README.md index 54d8371b5b..fbdc06a3cc 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,46 @@ where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. # Quick tour +## IPEX +### pipeline +Hugging Face pipelines provide a simple yet powerful abstraction to quickly set up inference. If you already have a pipeline from transformers, you can unlock the performance benefits of Optimum-Intel by just changing one line. +```diff +import torch +- from transformers.pipelines import pipeline ++ from optimum.intel.pipelines import pipeline + +pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) +pipe("Describe a real-world application of AI in sustainable energy.") +``` + +### generate +If you want control over advanced features like quantization and token selection strategies, we recommend using the generate() API. Just like with pipelines, switching from existing transformers code is super simple. +```diff +import torch +from transformers import AutoTokenizer, AutoConfig +- from transformers import AutoModelForCausalLM ++ from optimum.intel.generation.modeling import TSModelForCausalLM + +name = 'gpt2' +config = AutoConfig.from_pretrained(name, trust_remote_code=True) + +model = TSModelForCausalLM.from_pretrained( + name, + config=config, + torch_dtype=torch.bfloat16, + export=True, +) + +tokenizer = AutoTokenizer.from_pretrained(name) +input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. 
Can you write a whole Haiku in a single tweet?"] +model_inputs = tokenizer(input_sentence, return_tensors="pt") +generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) + +generated_ids = model.generate(**model_inputs, **generation_kwargs) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(output) +``` + ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index fd946ea607..b15cfe8328 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -428,5 +428,6 @@ def _from_transformers( force_download=force_download, cache_dir=cache_dir, local_files_only=local_files_only, + model_dtype=torch_dtype, **kwargs, ) diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py index 4303ddfb59..61329264a1 100644 --- a/optimum/intel/pipelines/__init__.py +++ b/optimum/intel/pipelines/__init__.py @@ -31,6 +31,7 @@ HUGGINGFACE_CO_RESOLVE_ENDPOINT, cached_file, extract_commit_hash, + is_ipex_available, is_offline_mode, is_torch_available, logging, @@ -39,6 +40,10 @@ from ..generation.modeling import TSModelForCausalLM +if is_ipex_available(): + import intel_extension_for_pytorch + + if is_torch_available(): import torch from transformers.models.auto.modeling_auto import AutoModelForCausalLM diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py new file mode 100644 index 0000000000..0c6382d29c --- /dev/null +++ b/tests/pipelines/test_pipelines.py @@ -0,0 +1,53 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import torch +from parameterized import parameterized +from transformers.pipelines import pipeline as transformers_pipeline + +from optimum.intel.generation.modeling import TSModelForCausalLM +from optimum.intel.pipelines import pipeline as ipex_pipeline + + +MODEL_NAMES = { + "bert": "hf-internal-testing/tiny-random-bert", + "distilbert": "hf-internal-testing/tiny-random-distilbert", + "roberta": "hf-internal-testing/tiny-random-roberta", + "bloom": "hf-internal-testing/tiny-random-bloom", + "gptj": "hf-internal-testing/tiny-random-gptj", + "gpt2": "hf-internal-testing/tiny-random-gpt2", + "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", +} + + +class PipelinesIntegrationTest(unittest.TestCase): + TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ("bloom", "gptj", "gpt2", "gpt_neo") + + @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) + def test_text_generation_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + inputs = "DeepSpeed is a machine learning framework for deep neural networks and deep reinforcement learning. It is written in C++ and is available for Linux, Mac OS X," + transformers_text_generator = transformers_pipeline("text-generation", model_id) + ipex_text_generator = ipex_pipeline("text-generation", model_id) + with torch.inference_mode(): + transformers_output = transformers_text_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_text_generator(inputs) + self.assertTrue(isinstance(ipex_text_generator.model, TSModelForCausalLM)) + self.assertTrue(isinstance(ipex_text_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) From db10723a9ec7f185c7c3c342efbd2f1cc51de433 Mon Sep 17 00:00:00 2001 From: "Feng, Jiqing" Date: Sun, 7 Jan 2024 22:33:25 -0800 Subject: [PATCH 03/30] fix pipelines example --- optimum/intel/pipelines/__init__.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py index 61329264a1..3ae08416fa 100644 --- a/optimum/intel/pipelines/__init__.py +++ b/optimum/intel/pipelines/__init__.py @@ -236,20 +236,11 @@ def pipeline( Examples: ```python - >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer + >>> import torch + >>> from optimum.intel.pipelines import pipeline - >>> # Sentiment analysis pipeline - >>> analyzer = pipeline("sentiment-analysis") - - >>> # Question answering pipeline, specifying the checkpoint identifier - >>> oracle = pipeline( - ... "question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="bert-base-cased" - ... 
) - - >>> # Named entity recognition pipeline, passing in a specific model and tokenizer - >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") - >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") - >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer) + >>> pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) + >>> pipe("Describe a real-world application of AI in sustainable energy.") ```""" if model_kwargs is None: model_kwargs = {} From 24f26db991bfc90baf581609550c7c5d263780ac Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 9 Jan 2024 03:37:14 -0500 Subject: [PATCH 04/30] fix readme codestyle --- README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index fbdc06a3cc..0506ea944f 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ import torch - from transformers.pipelines import pipeline + from optimum.intel.pipelines import pipeline -pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) +pipe = pipeline("text-generation", "gpt2", torch_dtype=torch.bfloat16) pipe("Describe a real-world application of AI in sustainable energy.") ``` @@ -61,17 +61,14 @@ from transformers import AutoTokenizer, AutoConfig - from transformers import AutoModelForCausalLM + from optimum.intel.generation.modeling import TSModelForCausalLM -name = 'gpt2' -config = AutoConfig.from_pretrained(name, trust_remote_code=True) - +config = AutoConfig.from_pretrained("gpt2") model = TSModelForCausalLM.from_pretrained( - name, + "gpt2", config=config, torch_dtype=torch.bfloat16, export=True, ) - -tokenizer = AutoTokenizer.from_pretrained(name) +tokenizer = AutoTokenizer.from_pretrained("gpt2") input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. 
Can you write a whole Haiku in a single tweet?"] model_inputs = tokenizer(input_sentence, return_tensors="pt") generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) From 39b7804ef303b2e0dcc0f5003ed85f99bcd7af4f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 9 Jan 2024 10:26:00 -0500 Subject: [PATCH 05/30] add _load_model in pipeline --- optimum/intel/pipelines/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py index 3ae08416fa..98615cd7d1 100644 --- a/optimum/intel/pipelines/__init__.py +++ b/optimum/intel/pipelines/__init__.py @@ -141,6 +141,11 @@ def clean_custom_task(task_info): return task_info, None +def _load_model(task, model, **kwargs): + if task == "text-generation": + return TSModelForCausalLM.from_pretrained(model, **kwargs) + + def pipeline( task: str = None, model: Optional[Union[str, "PreTrainedModel"]] = None, @@ -262,8 +267,8 @@ def pipeline( "Please provide a task class or a model" ) - if task != "text-generation": - raise ValueError("Optimum-intel ipex optimization only supports text-generation task for now.") + if task not in SUPPORTED_TASKS.keys(): + raise ValueError(f"Optimum-intel ipex optimization only supports {SUPPORTED_TASKS.keys()} task for now.") if model is None and tokenizer is not None: raise RuntimeError( @@ -358,7 +363,7 @@ def pipeline( # Load the correct model if possible # Infer the framework from the model if not already defined if isinstance(model, str): - model = TSModelForCausalLM.from_pretrained(model, config=config, export=True, **model_kwargs) + model = _load_model(task, model, config=config, export=True, **model_kwargs) model_config = model.config hub_kwargs["_commit_hash"] = model.config._commit_hash From d37ff185d698b33df49a90002fbc45abba3dc9ef Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 2 Apr 2024 10:50:11 -0400 Subject: [PATCH 06/30] update pipeline for optimum intel --- optimum/intel/pipelines/__init__.py | 412 +---------------------- optimum/intel/pipelines/pipeline_base.py | 326 ++++++++++++++++++ 2 files changed, 327 insertions(+), 411 deletions(-) create mode 100644 optimum/intel/pipelines/pipeline_base.py diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py index 98615cd7d1..02eb06cb39 100644 --- a/optimum/intel/pipelines/__init__.py +++ b/optimum/intel/pipelines/__init__.py @@ -1,411 +1 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import warnings -from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -from huggingface_hub import model_info -from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto.configuration_auto import AutoConfig -from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer -from transformers.pipelines.base import ( - Pipeline, - PipelineRegistry, - get_default_model_and_revision, -) -from transformers.pipelines.text_generation import TextGenerationPipeline -from transformers.tokenization_utils import PreTrainedTokenizer -from transformers.utils import ( - CONFIG_NAME, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, - cached_file, - extract_commit_hash, - is_ipex_available, - is_offline_mode, - is_torch_available, - logging, -) - -from ..generation.modeling import TSModelForCausalLM - - -if is_ipex_available(): - import intel_extension_for_pytorch - - -if is_torch_available(): - import torch - from transformers.models.auto.modeling_auto import AutoModelForCausalLM - - -if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel - from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - - -logger = logging.get_logger(__name__) - - -# Register all the supported tasks here -TASK_ALIASES = { - "sentiment-analysis": "text-classification", -} -SUPPORTED_TASKS = { - "text-generation": { - "impl": TextGenerationPipeline, - "pt": (AutoModelForCausalLM,) if is_torch_available() else (), - "default": {"model": {"pt": ("gpt2", "6c0e608")}}, - "type": "text", - }, -} - - -PIPELINE_REGISTRY = PipelineRegistry(supported_tasks=SUPPORTED_TASKS, task_aliases=TASK_ALIASES) - - -def get_supported_tasks() -> List[str]: - """ - Returns a list of supported task strings. - """ - return PIPELINE_REGISTRY.get_supported_tasks() - - -def get_task(model: str, token: Optional[str] = None, **deprecated_kwargs) -> str: - use_auth_token = deprecated_kwargs.pop("use_auth_token", None) - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") - token = use_auth_token - - if is_offline_mode(): - raise RuntimeError("You cannot infer task automatically within `pipeline` when using offline mode") - try: - info = model_info(model, token=token) - except Exception as e: - raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}") - if not info.pipeline_tag: - raise RuntimeError( - f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically" - ) - if getattr(info, "library_name", "transformers") != "transformers": - raise RuntimeError(f"This model is meant to be used with {info.library_name} not with transformers") - task = info.pipeline_tag - return task - - -def check_task(task: str) -> Tuple[str, Dict, Any]: - """ - Checks an incoming task string, to validate it's correct and return the default Pipeline and Model classes, and - default models if they exist. - - Args: - task (`str`): - The task defining which pipeline will be returned. Currently accepted tasks are: - - - `"text-generation"` - - Returns: - (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name - (removed alias and options). 
The actual dictionary required to initialize the pipeline and some extra task - options for parametrized tasks like "translation_XX_to_YY" - - - """ - return PIPELINE_REGISTRY.check_task(task) - - -def clean_custom_task(task_info): - import transformers - - if "impl" not in task_info: - raise RuntimeError("This model introduces a custom pipeline without specifying its implementation.") - pt_class_names = task_info.get("pt", ()) - if isinstance(pt_class_names, str): - pt_class_names = [pt_class_names] - task_info["pt"] = tuple(getattr(transformers, c) for c in pt_class_names) - return task_info, None - - -def _load_model(task, model, **kwargs): - if task == "text-generation": - return TSModelForCausalLM.from_pretrained(model, **kwargs) - - -def pipeline( - task: str = None, - model: Optional[Union[str, "PreTrainedModel"]] = None, - config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, - revision: Optional[str] = None, - use_fast: bool = True, - token: Optional[Union[str, bool]] = None, - device: Optional[Union[int, str, "torch.device"]] = None, - device_map=None, - torch_dtype=None, - trust_remote_code: Optional[bool] = None, - model_kwargs: Dict[str, Any] = None, - pipeline_class: Optional[Any] = None, - **kwargs, -) -> Pipeline: - """ - Utility factory method to build a [`Pipeline`]. - - Pipelines are made of: - - - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. - - A [model](model) to make predictions from the inputs. - - Some (optional) post processing for enhancing model's output. - - Args: - task (`str`): - The task defining which pipeline will be returned. Currently accepted tasks are: - - - `"text-generation"`: will return a [`TextGenerationPipeline`]:. - - model (`str` or [`PreTrainedModel`], *optional*): - The model that will be used by the pipeline to make predictions. This can be a model identifier or an - actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). - - If not provided, the default for the `task` will be loaded. - config (`str` or [`PretrainedConfig`], *optional*): - The configuration that will be used by the pipeline to instantiate the model. This can be a model - identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`]. - - If not provided, the default configuration file for the requested model will be used. That means that if - `model` is given, its default configuration will be used. However, if `model` is not supplied, this - `task`'s default model's config is used instead. - tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): - The tokenizer that will be used by the pipeline to encode data for the model. This can be a model - identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. - - If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` - is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). - However, if `config` is also not given or not a string, then the default tokenizer for the given `task` - will be loaded. - revision (`str`, *optional*, defaults to `"main"`): - When passing a task name or a string model identifier: The specific model version to use. 
It can be a - branch name, a tag name, or a commit id, since we use a git-based system for storing models and other - artifacts on huggingface.co, so `revision` can be any identifier allowed by git. - use_fast (`bool`, *optional*, defaults to `True`): - Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - device (`int` or `str` or `torch.device`): - Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this - pipeline will be allocated. - device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): - Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set - `device_map="auto"` to compute the most optimized `device_map` automatically (see - [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload) - for more information). - - - - Do not use `device_map` AND `device` at the same time as they will conflict - - - - torch_dtype (`str` or `torch.dtype`, *optional*): - Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model - (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). - trust_remote_code (`bool`, *optional*, defaults to `False`): - Whether or not to allow for custom code defined on the Hub in their own modeling, configuration, - tokenization or even pipeline files. This option should only be set to `True` for repositories you trust - and in which you have read the code, as it will execute code present on the Hub on your local machine. - model_kwargs (`Dict[str, Any]`, *optional*): - Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., - **model_kwargs)` function. - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the specific pipeline init (see the documentation for the - corresponding pipeline class for possible values). - - Returns: - [`Pipeline`]: A suitable pipeline for the task. - - Examples: - - ```python - >>> import torch - >>> from optimum.intel.pipelines import pipeline - - >>> pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) - >>> pipe("Describe a real-world application of AI in sustainable energy.") - ```""" - if model_kwargs is None: - model_kwargs = {} - - code_revision = kwargs.pop("code_revision", None) - commit_hash = kwargs.pop("_commit_hash", None) - - hub_kwargs = { - "revision": revision, - "token": token, - "trust_remote_code": trust_remote_code, - "_commit_hash": commit_hash, - } - - if task is None and model is None: - raise RuntimeError( - "Impossible to instantiate a pipeline without either a task or a model " - "being specified. " - "Please provide a task class or a model" - ) - - if task not in SUPPORTED_TASKS.keys(): - raise ValueError(f"Optimum-intel ipex optimization only supports {SUPPORTED_TASKS.keys()} task for now.") - - if model is None and tokenizer is not None: - raise RuntimeError( - "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" - " may not be compatible with the default model. Please provide a PreTrainedModel class or a" - " path/identifier to a pretrained model when providing tokenizer." 
- ) - - if isinstance(model, Path): - model = str(model) - - if commit_hash is None: - pretrained_model_name_or_path = None - if isinstance(config, str): - pretrained_model_name_or_path = config - elif config is None and isinstance(model, str): - pretrained_model_name_or_path = model - - if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None: - # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible - resolved_config_file = cached_file( - pretrained_model_name_or_path, - CONFIG_NAME, - _raise_exceptions_for_missing_entries=False, - _raise_exceptions_for_connection_errors=False, - **hub_kwargs, - ) - hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash) - else: - hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None) - - # Config is the primordial information item. - # Instantiate config if needed - if isinstance(config, str): - config = AutoConfig.from_pretrained( - config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs - ) - hub_kwargs["_commit_hash"] = config._commit_hash - elif config is None and isinstance(model, str): - config = AutoConfig.from_pretrained( - model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs - ) - hub_kwargs["_commit_hash"] = config._commit_hash - - if task is None and model is not None: - if not isinstance(model, str): - raise RuntimeError( - "Inferring the task automatically requires to check the hub with a model_id defined as a `str`. " - f"{model} is not a valid model_id." - ) - task = get_task(model, token) - - normalized_task, targeted_task, task_options = check_task(task) - if pipeline_class is None: - pipeline_class = targeted_task["impl"] - - # Use default model/config/tokenizer for the task if no model is provided - if model is None: - model, default_revision = get_default_model_and_revision(targeted_task, "pt", task_options) - revision = revision if revision is not None else default_revision - logger.warning( - f"No model was supplied, defaulted to {model} and revision" - f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n" - "Using a pipeline without specifying a model name and revision in production is not recommended." - ) - if config is None and isinstance(model, str): - config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs) - hub_kwargs["_commit_hash"] = config._commit_hash - - if device_map is not None: - if "device_map" in model_kwargs: - raise ValueError( - 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those' - " arguments might conflict, use only one.)" - ) - if device is not None: - logger.warning( - "Both `device` and `device_map` are specified. `device` will override `device_map`. You" - " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`." - ) - model_kwargs["device_map"] = device_map - if torch_dtype is not None: - if "torch_dtype" in model_kwargs: - raise ValueError( - 'You cannot use both `pipeline(... 
torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' - " arguments might conflict, use only one.)" - ) - model_kwargs["torch_dtype"] = torch_dtype - - model_name = model if isinstance(model, str) else None - - # Load the correct model if possible - # Infer the framework from the model if not already defined - if isinstance(model, str): - model = _load_model(task, model, config=config, export=True, **model_kwargs) - - model_config = model.config - hub_kwargs["_commit_hash"] = model.config._commit_hash - load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None - - if load_tokenizer: - # Try to infer tokenizer from model or config name (if provided as str) - if tokenizer is None: - if isinstance(model_name, str): - tokenizer = model_name - elif isinstance(config, str): - tokenizer = config - else: - # Impossible to guess what is the right tokenizer here - raise Exception( - "Impossible to guess which tokenizer to use. " - "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." - ) - - # Instantiate tokenizer if needed - if isinstance(tokenizer, (str, tuple)): - if isinstance(tokenizer, tuple): - # For tuple we have (tokenizer name, {kwargs}) - use_fast = tokenizer[1].pop("use_fast", use_fast) - tokenizer_identifier = tokenizer[0] - tokenizer_kwargs = tokenizer[1] - else: - tokenizer_identifier = tokenizer - tokenizer_kwargs = model_kwargs.copy() - tokenizer_kwargs.pop("torch_dtype", None) - - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs - ) - - if tokenizer is not None: - kwargs["tokenizer"] = tokenizer - - if torch_dtype is not None: - kwargs["torch_dtype"] = torch_dtype - - if device is not None: - kwargs["device"] = device - - return pipeline_class(model=model, framework="pt", task=task, **kwargs) +from .pipeline_base import pipeline diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py new file mode 100644 index 0000000000..c6bca6a4bf --- /dev/null +++ b/optimum/intel/pipelines/pipeline_base.py @@ -0,0 +1,326 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, Optional, Union + +from transformers import pipeline as transformers_pipeline +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer +from transformers.pipelines import ( + AudioClassificationPipeline, + FillMaskPipeline, + ImageClassificationPipeline, + QuestionAnsweringPipeline, + TextClassificationPipeline, + TextGenerationPipeline, + TokenClassificationPipeline, +) +from transformers.pipelines.base import Pipeline +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import ( + is_ipex_available, + is_torch_available, + logging, +) + + +if is_ipex_available(): + from ..ipex.modeling_base import ( + IPEXModel, + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, + ) + + IPEX_SUPPORTED_TASKS = { + "text-generation": { + "impl": TextGenerationPipeline, + "class": (IPEXModelForCausalLM,), + "default": "gpt2", + "type": "text", + }, + "fill-mask": { + "impl": FillMaskPipeline, + "class": (IPEXModelForMaskedLM,), + "default": "bert-base-cased", + "type": "text", + }, + "question-answering": { + "impl": QuestionAnsweringPipeline, + "class": (IPEXModelForQuestionAnswering,), + "default": "distilbert-base-cased-distilled-squad", + "type": "text", + }, + "image-classification": { + "impl": ImageClassificationPipeline, + "class": (IPEXModelForImageClassification,), + "default": "google/vit-base-patch16-224", + "type": "image", + }, + "text-classification": { + "impl": TextClassificationPipeline, + "class": (IPEXModelForSequenceClassification,), + "default": "distilbert-base-uncased-finetuned-sst-2-english", + "type": "text", + }, + "token-classification": { + "impl": TokenClassificationPipeline, + "class": (IPEXModelForTokenClassification,), + "default": "dbmdz/bert-large-cased-finetuned-conll03-english", + "type": "text", + }, + "audio-classification": { + "impl": AudioClassificationPipeline, + "class": (IPEXModelForAudioClassification,), + "default": "superb/hubert-base-superb-ks", + "type": "audio", + }, + } + + +def load_ipex_model( + model, + targeted_task, + SUPPORTED_TASKS, + model_kwargs: Optional[Dict[str, Any]] = None, +): + if model_kwargs is None: + model_kwargs = {} + + if model is None: + model_id = SUPPORTED_TASKS[targeted_task]["default"] + model = SUPPORTED_TASKS[targeted_task]["class"][0].from_pretrained(model_id, export=True) + elif isinstance(model, str): + ipex_model_class = SUPPORTED_TASKS[targeted_task]["class"][0] + model = ipex_model_class.from_pretrained(model, export=True, **model_kwargs) + elif isinstance(model, IPEXModel): + pass + else: + raise ValueError( + f"""Model {model} is not supported. Please provide a valid model either as string or IPEXModel. 
+ You can also provide non model then a default one will be used""" + ) + + return model + + +MAPPING_LOADING_FUNC = { + "ipex": load_ipex_model, +} + + +if is_torch_available(): + import torch + + +if TYPE_CHECKING: + from transformers.modeling_utils import PreTrainedModel + from transformers.tokenization_utils_fast import PreTrainedTokenizerFast + + +logger = logging.get_logger(__name__) + + +def pipeline( + task: str = None, + model: Optional[Union[str, "PreTrainedModel"]] = None, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + accelerator: Optional[str] = "ipex", + use_fast: bool = True, + device: Optional[Union[int, str, "torch.device"]] = None, + torch_dtype=None, + model_kwargs: Dict[str, Any] = None, + **kwargs, +) -> Pipeline: + """ + Utility factory method to build a [`Pipeline`]. + + Pipelines are made of: + + - A [tokenizer](tokenizer) in charge of mapping raw textual input to token. + - A [model](model) to make predictions from the inputs. + - Some (optional) post processing for enhancing model's output. + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"text-generation"`: will return a [`TextGenerationPipeline`]:. + + model (`str` or [`PreTrainedModel`], *optional*): + The model that will be used by the pipeline to make predictions. This can be a model identifier or an + actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). + + If not provided, the default for the `task` will be loaded. + config (`str` or [`PretrainedConfig`], *optional*): + The configuration that will be used by the pipeline to instantiate the model. This can be a model + identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`]. + + If not provided, the default configuration file for the requested model will be used. That means that if + `model` is given, its default configuration will be used. However, if `model` is not supplied, this + `task`'s default model's config is used instead. + tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): + The tokenizer that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. + + If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). + However, if `config` is also not given or not a string, then the default tokenizer for the given `task` + will be loaded. + accelerator (`str`, *optional*, defaults to `"ipex"`): + The optimization backends, choose from ["ipex", "inc", "openvino"]. + revision (`str`, *optional*, defaults to `"main"`): + When passing a task name or a string model identifier: The specific model version to use. It can be a + branch name, a tag name, or a commit id, since we use a git-based system for storing models and other + artifacts on huggingface.co, so `revision` can be any identifier allowed by git. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. 
If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + device (`int` or `str` or `torch.device`): + Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this + pipeline will be allocated. + device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set + `device_map="auto"` to compute the most optimized `device_map` automatically (see + [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload) + for more information). + + + + Do not use `device_map` AND `device` at the same time as they will conflict + + + + torch_dtype (`str` or `torch.dtype`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model + (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom code defined on the Hub in their own modeling, configuration, + tokenization or even pipeline files. This option should only be set to `True` for repositories you trust + and in which you have read the code, as it will execute code present on the Hub on your local machine. + model_kwargs (`Dict[str, Any]`, *optional*): + Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., + **model_kwargs)` function. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the specific pipeline init (see the documentation for the + corresponding pipeline class for possible values). + + Returns: + [`Pipeline`]: A suitable pipeline for the task. + + Examples: + + ```python + >>> import torch + >>> from optimum.intel.pipelines import pipeline + + >>> pipe = pipeline('text-generation', 'gpt2', torch_dtype=torch.bfloat16) + >>> pipe("Describe a real-world application of AI in sustainable energy.") + ```""" + if model_kwargs is None: + model_kwargs = {} + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model " + "being specified. " + "Please provide a task class or a model" + ) + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" + " may not be compatible with the default model. Please provide a PreTrainedModel class or a" + " path/identifier to a pretrained model when providing tokenizer." + ) + + if accelerator not in MAPPING_LOADING_FUNC: + raise ValueError(f'Accelerator {accelerator} is not supported. Supported accelerator is "ipex".') + + if accelerator == "ipex": + if task not in list(IPEX_SUPPORTED_TASKS.keys()): + raise ValueError( + f"Task {task} is not supported for the ONNX Runtime pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" + ) + + if isinstance(model, Path): + model = str(model) + + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... 
torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + model_kwargs["torch_dtype"] = torch_dtype + + model_name = model if isinstance(model, str) else None + + # Load the correct model if possible + # Infer the framework from the model if not already defined + model = MAPPING_LOADING_FUNC[accelerator](model, task, IPEX_SUPPORTED_TASKS, model_kwargs) + + model_config = model.config + load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None + + if load_tokenizer: + # Try to infer tokenizer from model or config name (if provided as str) + if tokenizer is None: + if isinstance(model_name, str): + tokenizer = model_name + elif isinstance(config, str): + tokenizer = config + else: + # Impossible to guess what is the right tokenizer here + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." + ) + + # Instantiate tokenizer if needed + if isinstance(tokenizer, (str, tuple)): + if isinstance(tokenizer, tuple): + # For tuple we have (tokenizer name, {kwargs}) + use_fast = tokenizer[1].pop("use_fast", use_fast) + tokenizer_identifier = tokenizer[0] + tokenizer_kwargs = tokenizer[1] + else: + tokenizer_identifier = tokenizer + tokenizer_kwargs = model_kwargs.copy() + tokenizer_kwargs.pop("torch_dtype", None) + + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **tokenizer_kwargs + ) + + if torch_dtype is not None: + kwargs["torch_dtype"] = torch_dtype + + if device is not None: + kwargs["device"] = device + + return transformers_pipeline( + task, + model=model, + tokenizer=tokenizer, + use_fast=use_fast, + **kwargs, + ) From 688241726b6483f9ef45e934430a62571539dc26 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 2 Apr 2024 10:52:39 -0400 Subject: [PATCH 07/30] update tests --- tests/pipelines/test_pipelines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 0c6382d29c..3ca7da9406 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -18,7 +18,7 @@ from parameterized import parameterized from transformers.pipelines import pipeline as transformers_pipeline -from optimum.intel.generation.modeling import TSModelForCausalLM +from optimum.intel.ipex.modeling_base import IPEXModelForCausalLM from optimum.intel.pipelines import pipeline as ipex_pipeline @@ -43,11 +43,11 @@ def test_text_generation_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] inputs = "DeepSpeed is a machine learning framework for deep neural networks and deep reinforcement learning. 
It is written in C++ and is available for Linux, Mac OS X," transformers_text_generator = transformers_pipeline("text-generation", model_id) - ipex_text_generator = ipex_pipeline("text-generation", model_id) + ipex_text_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") with torch.inference_mode(): transformers_output = transformers_text_generator(inputs) with torch.inference_mode(): ipex_output = ipex_text_generator(inputs) - self.assertTrue(isinstance(ipex_text_generator.model, TSModelForCausalLM)) + self.assertTrue(isinstance(ipex_text_generator.model, IPEXModelForCausalLM)) self.assertTrue(isinstance(ipex_text_generator.model.model, torch.jit.RecursiveScriptModule)) self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) From 64c546c599086c507bbdda2f06c402ccf70e6664 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 2 Apr 2024 10:59:32 -0400 Subject: [PATCH 08/30] remove readme --- README.md | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/README.md b/README.md index b7b6e82e86..c29a923745 100644 --- a/README.md +++ b/README.md @@ -44,43 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour -## IPEX -### pipeline -Hugging Face pipelines provide a simple yet powerful abstraction to quickly set up inference. If you already have a pipeline from transformers, you can unlock the performance benefits of Optimum-Intel by just changing one line. -```diff -import torch -- from transformers.pipelines import pipeline -+ from optimum.intel.pipelines import pipeline - -pipe = pipeline("text-generation", "gpt2", torch_dtype=torch.bfloat16) -pipe("Describe a real-world application of AI in sustainable energy.") -``` - -### generate -If you want control over advanced features like quantization and token selection strategies, we recommend using the generate() API. Just like with pipelines, switching from existing transformers code is super simple. -```diff -import torch -from transformers import AutoTokenizer, AutoConfig -- from transformers import AutoModelForCausalLM -+ from optimum.intel.generation.modeling import TSModelForCausalLM - -config = AutoConfig.from_pretrained("gpt2") -model = TSModelForCausalLM.from_pretrained( - "gpt2", - config=config, - torch_dtype=torch.bfloat16, - export=True, -) -tokenizer = AutoTokenizer.from_pretrained("gpt2") -input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. 
Can you write a whole Haiku in a single tweet?"] -model_inputs = tokenizer(input_sentence, return_tensors="pt") -generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) - -generated_ids = model.generate(**model_inputs, **generation_kwargs) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -print(output) -``` - ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: From 29ad8b2b53434e85d3e13f6b1f5617bdbc86ceba Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 3 Apr 2024 09:16:52 +0800 Subject: [PATCH 09/30] Update optimum/intel/pipelines/__init__.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/optimum/intel/pipelines/__init__.py b/optimum/intel/pipelines/__init__.py index 02eb06cb39..40a1e3ca56 100644 --- a/optimum/intel/pipelines/__init__.py +++ b/optimum/intel/pipelines/__init__.py @@ -1 +1,15 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .pipeline_base import pipeline From b5392c1e8e4732c47eb25e73b79f559c1ce14040 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sun, 7 Apr 2024 06:53:59 -0400 Subject: [PATCH 10/30] fix pipelines --- optimum/intel/ipex/inference.py | 4 + optimum/intel/ipex/modeling_base.py | 2 + optimum/intel/pipelines/pipeline_base.py | 158 ++++++++++---------- tests/{pipelines => ipex}/test_pipelines.py | 2 +- 4 files changed, 84 insertions(+), 82 deletions(-) rename tests/{pipelines => ipex}/test_pipelines.py (93%) diff --git a/optimum/intel/ipex/inference.py b/optimum/intel/ipex/inference.py index ccf2da9d80..a628ebe12e 100644 --- a/optimum/intel/ipex/inference.py +++ b/optimum/intel/ipex/inference.py @@ -97,6 +97,10 @@ def __init__( jit (`boolean = False`, *optional*): Enable jit to accelerate inference speed """ + logger.warning( + "`inference_mode` is deprecated and will be removed in v1.18.0. Use `pipeline` to load and export your model to TorchScript instead." 
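(A minimal usage sketch of the replacement this deprecation message points to; the model id and prompt are illustrative, reused from the pipeline docstring earlier in this series, and are not part of the patch.)

```python
# Recommended replacement for the deprecated `inference_mode` wrapper:
# the optimum-intel pipeline factory performs the IPEX/TorchScript export itself.
import torch

from optimum.intel.pipelines import pipeline

pipe = pipeline("text-generation", "gpt2", accelerator="ipex", torch_dtype=torch.bfloat16)
print(pipe("Describe a real-world application of AI in sustainable energy.")[0]["generated_text"])
```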
+ ) + if not is_ipex_available(): raise ImportError(IPEX_NOT_AVAILABLE_ERROR_MSG) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 0664a8e6ac..dfe48f5d4e 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -88,6 +88,8 @@ def ipex_jit_trace(model, task, use_cache): sample_inputs = prepare_jit_inputs(model, task, use_cache) model.config.return_dict = False + if "past_key_values" in sample_inputs.keys(): + model.config.use_cache = use_cache model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index c6bca6a4bf..bdc3ea07d0 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -14,9 +14,10 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from transformers import SequenceFeatureExtractor from transformers import pipeline as transformers_pipeline -from transformers.configuration_utils import PretrainedConfig -from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer +from transformers.feature_extraction_utils import PreTrainedFeatureExtractor +from transformers.onnx.utils import get_preprocessor from transformers.pipelines import ( AudioClassificationPipeline, FillMaskPipeline, @@ -34,6 +35,8 @@ logging, ) +from optimum.utils.file_utils import find_files_matching_pattern + if is_ipex_available(): from ..ipex.modeling_base import ( @@ -96,27 +99,61 @@ def load_ipex_model( model, targeted_task, + load_tokenizer, + tokenizer, + load_feature_extractor, + feature_extractor, SUPPORTED_TASKS, model_kwargs: Optional[Dict[str, Any]] = None, ): if model_kwargs is None: model_kwargs = {} + ipex_model_class = SUPPORTED_TASKS[targeted_task]["class"][0] + if model is None: model_id = SUPPORTED_TASKS[targeted_task]["default"] - model = SUPPORTED_TASKS[targeted_task]["class"][0].from_pretrained(model_id, export=True) + model = ipex_model_class.from_pretrained(model_id, export=True) elif isinstance(model, str): - ipex_model_class = SUPPORTED_TASKS[targeted_task]["class"][0] - model = ipex_model_class.from_pretrained(model, export=True, **model_kwargs) + model_id = model + ipex_file = find_files_matching_pattern( + model, + ".+?.pt", + glob_pattern="**/*.pt", + subfolder=model_kwargs.pop("subfolder", None), + use_auth_token=model_kwargs.pop("token", None), + revision=model_kwargs.pop("revision", "main"), + ) + export = len(ipex_file) == 0 + model = ipex_model_class.from_pretrained(model, export=export, **model_kwargs) elif isinstance(model, IPEXModel): - pass + if tokenizer is None and load_tokenizer: + for preprocessor in model.preprocessors: + if isinstance(preprocessor, (PreTrainedTokenizer, PreTrainedTokenizerFast)): + tokenizer = preprocessor + break + if tokenizer is None: + raise ValueError( + "Could not automatically find a tokenizer for the IPEXModel, you must pass a tokenizer explictly" + ) + if feature_extractor is None and load_feature_extractor: + for preprocessor in model.preprocessors: + if isinstance(preprocessor, SequenceFeatureExtractor): + feature_extractor = preprocessor + break + if feature_extractor is None: + raise ValueError( + "Could not automatically find a feature extractor for the IPEXModel, you must pass a " + "feature_extractor explictly" + ) + model_id = None else: raise ValueError( f"""Model {model} is not 
supported. Please provide a valid model either as string or IPEXModel. You can also provide non model then a default one will be used""" ) - return model + return model, model_id, tokenizer, feature_extractor MAPPING_LOADING_FUNC = { @@ -125,7 +162,7 @@ def load_ipex_model( if is_torch_available(): - import torch + pass if TYPE_CHECKING: @@ -139,11 +176,10 @@ def load_ipex_model( def pipeline( task: str = None, model: Optional[Union[str, "PreTrainedModel"]] = None, - config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, accelerator: Optional[str] = "ipex", use_fast: bool = True, - device: Optional[Union[int, str, "torch.device"]] = None, torch_dtype=None, model_kwargs: Dict[str, Any] = None, **kwargs, @@ -168,13 +204,6 @@ def pipeline( actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch). If not provided, the default for the `task` will be loaded. - config (`str` or [`PretrainedConfig`], *optional*): - The configuration that will be used by the pipeline to instantiate the model. This can be a model - identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`]. - - If not provided, the default configuration file for the requested model will be used. That means that if - `model` is given, its default configuration will be used. However, if `model` is not supplied, this - `task`'s default model's config is used instead. tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): The tokenizer that will be used by the pipeline to encode data for the model. This can be a model identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. @@ -185,37 +214,11 @@ def pipeline( will be loaded. accelerator (`str`, *optional*, defaults to `"ipex"`): The optimization backends, choose from ["ipex", "inc", "openvino"]. - revision (`str`, *optional*, defaults to `"main"`): - When passing a task name or a string model identifier: The specific model version to use. It can be a - branch name, a tag name, or a commit id, since we use a git-based system for storing models and other - artifacts on huggingface.co, so `revision` can be any identifier allowed by git. use_fast (`bool`, *optional*, defaults to `True`): Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - device (`int` or `str` or `torch.device`): - Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this - pipeline will be allocated. - device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): - Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set - `device_map="auto"` to compute the most optimized `device_map` automatically (see - [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload) - for more information). 
- - - - Do not use `device_map` AND `device` at the same time as they will conflict - - - torch_dtype (`str` or `torch.dtype`, *optional*): Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). - trust_remote_code (`bool`, *optional*, defaults to `False`): - Whether or not to allow for custom code defined on the Hub in their own modeling, configuration, - tokenization or even pipeline files. This option should only be set to `True` for repositories you trust - and in which you have read the code, as it will execute code present on the Hub on your local machine. model_kwargs (`Dict[str, Any]`, *optional*): Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function. @@ -261,6 +264,23 @@ def pipeline( f"Task {task} is not supported for the ONNX Runtime pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" ) + supported_tasks = IPEX_SUPPORTED_TASKS if accelerator == "ipex" else None + + no_feature_extractor_tasks = set() + no_tokenizer_tasks = set() + for _task, values in supported_tasks.items(): + if values["type"] == "text": + no_feature_extractor_tasks.add(_task) + elif values["type"] in {"image", "video"}: + no_tokenizer_tasks.add(_task) + elif values["type"] in {"audio"}: + no_tokenizer_tasks.add(_task) + elif values["type"] not in ["multimodal", "audio", "video"]: + raise ValueError(f"SUPPORTED_TASK {_task} contains invalid type {values['type']}") + + load_tokenizer = False if task in no_tokenizer_tasks else True + load_feature_extractor = False if task in no_feature_extractor_tasks else True + if isinstance(model, Path): model = str(model) @@ -272,51 +292,27 @@ def pipeline( ) model_kwargs["torch_dtype"] = torch_dtype - model_name = model if isinstance(model, str) else None - # Load the correct model if possible # Infer the framework from the model if not already defined - model = MAPPING_LOADING_FUNC[accelerator](model, task, IPEX_SUPPORTED_TASKS, model_kwargs) - - model_config = model.config - load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None - - if load_tokenizer: - # Try to infer tokenizer from model or config name (if provided as str) - if tokenizer is None: - if isinstance(model_name, str): - tokenizer = model_name - elif isinstance(config, str): - tokenizer = config - else: - # Impossible to guess what is the right tokenizer here - raise Exception( - "Impossible to guess which tokenizer to use. " - "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." 
- ) + model, model_id, tokenizer, feature_extractor = MAPPING_LOADING_FUNC[accelerator]( + model, + task, + load_tokenizer, + tokenizer, + load_feature_extractor, + feature_extractor, + IPEX_SUPPORTED_TASKS, + model_kwargs, + ) - # Instantiate tokenizer if needed - if isinstance(tokenizer, (str, tuple)): - if isinstance(tokenizer, tuple): - # For tuple we have (tokenizer name, {kwargs}) - use_fast = tokenizer[1].pop("use_fast", use_fast) - tokenizer_identifier = tokenizer[0] - tokenizer_kwargs = tokenizer[1] - else: - tokenizer_identifier = tokenizer - tokenizer_kwargs = model_kwargs.copy() - tokenizer_kwargs.pop("torch_dtype", None) - - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **tokenizer_kwargs - ) + if tokenizer is None and load_tokenizer: + tokenizer = get_preprocessor(model_id) + if feature_extractor is None and load_feature_extractor: + feature_extractor = get_preprocessor(model_id) if torch_dtype is not None: kwargs["torch_dtype"] = torch_dtype - if device is not None: - kwargs["device"] = device - return transformers_pipeline( task, model=model, diff --git a/tests/pipelines/test_pipelines.py b/tests/ipex/test_pipelines.py similarity index 93% rename from tests/pipelines/test_pipelines.py rename to tests/ipex/test_pipelines.py index 3ca7da9406..04b24eca7f 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -41,7 +41,7 @@ class PipelinesIntegrationTest(unittest.TestCase): @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) def test_text_generation_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] - inputs = "DeepSpeed is a machine learning framework for deep neural networks and deep reinforcement learning. It is written in C++ and is available for Linux, Mac OS X," + inputs = "Describe a real-world application of AI." transformers_text_generator = transformers_pipeline("text-generation", model_id) ipex_text_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") with torch.inference_mode(): From f294f746ca7680b5464c904c56b325f068ae68d2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sun, 7 Apr 2024 11:16:46 -0400 Subject: [PATCH 11/30] add all supported tasks testing --- optimum/intel/pipelines/pipeline_base.py | 73 ++----- tests/ipex/test_pipelines.py | 236 +++++++++++++++++++++-- 2 files changed, 245 insertions(+), 64 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index bdc3ea07d0..e9f38f58fe 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
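(A hedged sketch of what the reworked loading path enables for tasks that rely on a feature extractor rather than a tokenizer; the checkpoint is the tiny test model used in the test suite below and stands in for any audio-classification model.)

```python
# Audio classification through the IPEX pipeline: the factory loads the model,
# detects that the task needs a feature extractor, and fetches it automatically.
import numpy as np

from optimum.intel.pipelines import pipeline

classifier = pipeline(
    "audio-classification",
    "anton-l/wav2vec2-random-tiny-classifier",  # tiny test checkpoint, illustrative only
    accelerator="ipex",
)
print(classifier([np.random.random(16000)]))  # one second of fake 16 kHz audio
```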
+ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional, Union -from transformers import SequenceFeatureExtractor +from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer from transformers import pipeline as transformers_pipeline from transformers.feature_extraction_utils import PreTrainedFeatureExtractor -from transformers.onnx.utils import get_preprocessor from transformers.pipelines import ( AudioClassificationPipeline, FillMaskPipeline, @@ -35,8 +35,6 @@ logging, ) -from optimum.utils.file_utils import find_files_matching_pattern - if is_ipex_available(): from ..ipex.modeling_base import ( @@ -99,13 +97,11 @@ def load_ipex_model( model, targeted_task, - load_tokenizer, - tokenizer, - load_feature_extractor, - feature_extractor, SUPPORTED_TASKS, model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, ): + export = kwargs.pop("export", True) if model_kwargs is None: model_kwargs = {} @@ -116,44 +112,25 @@ def load_ipex_model( model = ipex_model_class.from_pretrained(model_id, export=True) elif isinstance(model, str): model_id = model - ipex_file = find_files_matching_pattern( - model, - ".+?.pt", - glob_pattern="**/*.pt", - subfolder=model_kwargs.pop("subfolder", None), - use_auth_token=model_kwargs.pop("token", None), - revision=model_kwargs.pop("revision", "main"), - ) - export = len(ipex_file) == 0 + try: + config = AutoConfig.from_pretrained(model) + torchscript = getattr(config, "torchscript", None) + export = False if torchscript else export + except RuntimeError: + logger.warning( + "config file not found, please pass `export` to decide whether we should export this model. `export` defaullt to True" + ) + model = ipex_model_class.from_pretrained(model, export=export, **model_kwargs) elif isinstance(model, IPEXModel): - if tokenizer is None and load_tokenizer: - for preprocessor in model.preprocessors: - if isinstance(preprocessor, (PreTrainedTokenizer, PreTrainedTokenizerFast)): - tokenizer = preprocessor - break - if tokenizer is None: - raise ValueError( - "Could not automatically find a tokenizer for the IPEXModel, you must pass a tokenizer explictly" - ) - if feature_extractor is None and load_feature_extractor: - for preprocessor in model.preprocessors: - if isinstance(preprocessor, SequenceFeatureExtractor): - feature_extractor = preprocessor - break - if feature_extractor is None: - raise ValueError( - "Could not automatically find a feature extractor for the IPEXModel, you must pass a " - "feature_extractor explictly" - ) model_id = None else: raise ValueError( - f"""Model {model} is not supported. Please provide a valid model either as string or IPEXModel. + f"""Model {model} is not supported. Please provide a valid model name or path or a IPEXModel. 
You can also provide non model then a default one will be used""" ) - return model, model_id, tokenizer, feature_extractor + return model, model_id MAPPING_LOADING_FUNC = { @@ -294,21 +271,12 @@ def pipeline( # Load the correct model if possible # Infer the framework from the model if not already defined - model, model_id, tokenizer, feature_extractor = MAPPING_LOADING_FUNC[accelerator]( - model, - task, - load_tokenizer, - tokenizer, - load_feature_extractor, - feature_extractor, - IPEX_SUPPORTED_TASKS, - model_kwargs, - ) + model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, **kwargs) - if tokenizer is None and load_tokenizer: - tokenizer = get_preprocessor(model_id) - if feature_extractor is None and load_feature_extractor: - feature_extractor = get_preprocessor(model_id) + if load_tokenizer and model_id and tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained(model_id) + if load_feature_extractor and model_id and feature_extractor is None: + feature_extractor = AutoFeatureExtractor.from_pretrained(model_id) if torch_dtype is not None: kwargs["torch_dtype"] = torch_dtype @@ -317,6 +285,7 @@ def pipeline( task, model=model, tokenizer=tokenizer, + feature_extractor=feature_extractor, use_fast=use_fast, **kwargs, ) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 04b24eca7f..89a27ab2c8 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -13,41 +13,253 @@ # limitations under the License. import unittest +from tempfile import TemporaryDirectory +import numpy as np import torch from parameterized import parameterized +from transformers import AutoTokenizer from transformers.pipelines import pipeline as transformers_pipeline -from optimum.intel.ipex.modeling_base import IPEXModelForCausalLM +from optimum.intel.ipex.modeling_base import ( + IPEXModelForAudioClassification, + IPEXModelForCausalLM, + IPEXModelForImageClassification, + IPEXModelForMaskedLM, + IPEXModelForQuestionAnswering, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, +) from optimum.intel.pipelines import pipeline as ipex_pipeline MODEL_NAMES = { + "albert": "hf-internal-testing/tiny-random-albert", + "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", "bert": "hf-internal-testing/tiny-random-bert", + "bart": "hf-internal-testing/tiny-random-bart", + "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", + "bloom": "hf-internal-testing/tiny-random-BloomModel", + "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", + "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", - "roberta": "hf-internal-testing/tiny-random-roberta", - "bloom": "hf-internal-testing/tiny-random-bloom", - "gptj": "hf-internal-testing/tiny-random-gptj", + "electra": "hf-internal-testing/tiny-random-electra", + "flaubert": "hf-internal-testing/tiny-random-flaubert", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", + "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "levit": 
"hf-internal-testing/tiny-random-LevitModel", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama2": "Jiqing/tiny_random_llama2", + "marian": "sshleifer/tiny-marian-en-de", + "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", + "mobilevit": "hf-internal-testing/tiny-random-mobilevit", + "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", + "mt5": "stas/mt5-tiny-random", + "opt": "hf-internal-testing/tiny-random-OPTModel", + "phi": "echarlaix/tiny-random-PhiForCausalLM", + "resnet": "hf-internal-testing/tiny-random-resnet", + "roberta": "hf-internal-testing/tiny-random-roberta", + "roformer": "hf-internal-testing/tiny-random-roformer", + "squeezebert": "hf-internal-testing/tiny-random-squeezebert", + "t5": "hf-internal-testing/tiny-random-t5", + "unispeech": "hf-internal-testing/tiny-random-unispeech", + "vit": "hf-internal-testing/tiny-random-vit", + "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", + "xlm": "hf-internal-testing/tiny-random-xlm", } class PipelinesIntegrationTest(unittest.TestCase): - TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ("bloom", "gptj", "gpt2", "gpt_neo") + COMMON_SUPPORTED_ARCHITECTURES = ( + "albert", + "bert", + "distilbert", + "electra", + "flaubert", + "roberta", + "roformer", + "squeezebert", + "xlm", + ) + TEXT_GENERATION_SUPPORTED_ARCHITECTURES = ( + "bart", + "gpt_bigcode", + "blenderbot", + "blenderbot-small", + "bloom", + "codegen", + "gpt2", + "gpt_neo", + "gpt_neox", + "llama", + "llama2", + "mistral", + "mpt", + "opt", + ) + QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES = ( + "bert", + "distilbert", + "roberta", + ) + AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "unispeech", + "wav2vec2", + ) + IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES = ( + "beit", + "mobilenet_v1", + "mobilenet_v2", + "mobilevit", + "resnet", + "vit", + ) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_token_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("token-classification", model_id) + ipex_generator = ipex_pipeline("token-classification", model_id, accelerator="ipex") + inputs = "Hello I'm Omar and I live in Zürich." 
+ with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForTokenClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_sequence_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-classification", model_id) + ipex_generator = ipex_pipeline("text-classification", model_id, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertEqual(transformers_output[0]["label"], ipex_output[0]["label"]) + self.assertAlmostEqual(transformers_output[0]["score"], ipex_output[0]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_fill_mask_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + inputs = "The Milky Way is a galaxy." + transformers_generator = transformers_pipeline("fill-mask", model_id) + ipex_generator = ipex_pipeline("fill-mask", model_id, accelerator="ipex") + mask_token = transformers_generator.tokenizer.mask_token + inputs = inputs.replace("", mask_token) + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForMaskedLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["token"], ipex_output[i]["token"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES) def test_text_generation_pipeline_inference(self, model_arch): model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("text-generation", model_id) + ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") inputs = "Describe a real-world application of AI." 
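(A usage-level sketch: generation arguments are forwarded to the underlying transformers text-generation pipeline, so the usual decoding knobs apply; the values below are illustrative.)

```python
from optimum.intel.pipelines import pipeline

generator = pipeline("text-generation", "hf-internal-testing/tiny-random-gpt2", accelerator="ipex")
print(generator("Describe a real-world application of AI.", max_new_tokens=32, do_sample=False))
```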
- transformers_text_generator = transformers_pipeline("text-generation", model_id) - ipex_text_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex") with torch.inference_mode(): - transformers_output = transformers_text_generator(inputs) + transformers_output = transformers_generator(inputs) with torch.inference_mode(): - ipex_output = ipex_text_generator(inputs) - self.assertTrue(isinstance(ipex_text_generator.model, IPEXModelForCausalLM)) - self.assertTrue(isinstance(ipex_text_generator.model.model, torch.jit.RecursiveScriptModule)) + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForCausalLM)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) self.assertEqual(transformers_output[0]["generated_text"], ipex_output[0]["generated_text"]) + + @parameterized.expand(QUESTION_ANSWERING_SUPPORTED_ARCHITECTURES) + def test_question_answering_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("question-answering", model_id) + ipex_generator = ipex_pipeline("question-answering", model_id, accelerator="ipex") + question = "How many programming languages does BLOOM support?" + context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." + with torch.inference_mode(): + transformers_output = transformers_generator(question=question, context=context) + with torch.inference_mode(): + ipex_output = ipex_generator(question=question, context=context) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForQuestionAnswering)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output["score"], ipex_output["score"], delta=1e-4) + self.assertEqual(transformers_output["start"], ipex_output["start"]) + self.assertEqual(transformers_output["end"], ipex_output["end"]) + + @parameterized.expand(AUDIO_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_audio_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("audio-classification", model_id) + ipex_generator = ipex_pipeline("audio-classification", model_id, accelerator="ipex") + inputs = [np.random.random(16000)] + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForAudioClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertAlmostEqual(transformers_output[0][0]["score"], ipex_output[0][0]["score"], delta=1e-2) + self.assertAlmostEqual(transformers_output[0][1]["score"], ipex_output[0][1]["score"], delta=1e-2) + + @parameterized.expand(IMAGE_CLASSIFICATION_SUPPORTED_ARCHITECTURES) + def test_image_classification_pipeline_inference(self, model_arch): + model_id = MODEL_NAMES[model_arch] + transformers_generator = transformers_pipeline("image-classification", model_id) + ipex_generator = ipex_pipeline("image-classification", model_id, accelerator="ipex") + inputs = "http://images.cocodataset.org/val2017/000000039769.jpg" + with torch.inference_mode(): + transformers_output = transformers_generator(inputs) + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertEqual(len(transformers_output), 
len(ipex_output)) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForImageClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + for i in range(len(transformers_output)): + self.assertEqual(transformers_output[i]["label"], ipex_output[i]["label"]) + self.assertAlmostEqual(transformers_output[i]["score"], ipex_output[i]["score"], delta=1e-4) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_ipex_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", model, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) + + @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) + def test_pipeline_load_from_jit_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True) + save_dir = TemporaryDirectory().name + model.save_pretrained(save_dir) + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_generator = ipex_pipeline("text-classification", save_dir, tokenizer=tokenizer, accelerator="ipex") + inputs = "This restaurant is awesome" + with torch.inference_mode(): + ipex_output = ipex_generator(inputs) + self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) + self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) + self.assertGreaterEqual(ipex_output[0]["score"], 0.0) From 7510036e30ded0f7b01701e2087ad24cf388beed Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 15 Apr 2024 09:44:47 -0400 Subject: [PATCH 12/30] add hub_kwargs and model_kwargs on tokenizer and feature_extractor --- optimum/intel/ipex/modeling_base.py | 2 -- optimum/intel/pipelines/pipeline_base.py | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index dfe48f5d4e..0664a8e6ac 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -88,8 +88,6 @@ def ipex_jit_trace(model, task, use_cache): sample_inputs = prepare_jit_inputs(model, task, use_cache) model.config.return_dict = False - if "past_key_values" in sample_inputs.keys(): - model.config.use_cache = use_cache model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index e9f38f58fe..d6dd9e2132 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -109,7 +109,7 @@ def load_ipex_model( if model is None: model_id = SUPPORTED_TASKS[targeted_task]["default"] - model = ipex_model_class.from_pretrained(model_id, export=True) + model = ipex_model_class.from_pretrained(model_id, export=True, **model_kwargs) elif isinstance(model, str): model_id = model try: @@ -258,6 +258,15 @@ def pipeline( load_tokenizer = False if task in no_tokenizer_tasks else True 
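(A hedged sketch of how the hub-related options introduced here surface to callers, using the parameter names from the final signature in this series; the revision value is illustrative.)

```python
from optimum.intel.pipelines import pipeline

classifier = pipeline(
    "text-classification",
    "hf-internal-testing/tiny-random-bert",
    accelerator="ipex",
    revision="main",          # forwarded to the model, tokenizer and feature extractor loaders
    trust_remote_code=False,  # likewise collected into `hub_kwargs`
)
print(classifier("This restaurant is awesome"))
```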
load_feature_extractor = False if task in no_feature_extractor_tasks else True + commit_hash = kwargs.pop("_commit_hash", None) + + hub_kwargs = { + "revision": kwargs.pop("revision", None), + "token": kwargs.pop("use_auth_token", None), + "trust_remote_code": kwargs.pop("trust_remote_code", None), + "_commit_hash": commit_hash, + } + if isinstance(model, Path): model = str(model) @@ -274,9 +283,9 @@ def pipeline( model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, **kwargs) if load_tokenizer and model_id and tokenizer is None: - tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) if load_feature_extractor and model_id and feature_extractor is None: - feature_extractor = AutoFeatureExtractor.from_pretrained(model_id) + feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) if torch_dtype is not None: kwargs["torch_dtype"] = torch_dtype From 9e8ce0edbf36cc00c19f6b4cc5e51936d4853de5 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 25 Apr 2024 05:31:48 -0400 Subject: [PATCH 13/30] add hub_kwargs and default pipeline tests --- optimum/exporters/openvino/model_patcher.py | 6 +++--- optimum/intel/pipelines/pipeline_base.py | 9 ++++++--- tests/ipex/test_pipelines.py | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 3649c163c6..96df156cc6 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -327,9 +327,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index d6dd9e2132..5350e1e65d 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -99,6 +99,7 @@ def load_ipex_model( targeted_task, SUPPORTED_TASKS, model_kwargs: Optional[Dict[str, Any]] = None, + hub_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ): export = kwargs.pop("export", True) @@ -109,7 +110,7 @@ def load_ipex_model( if model is None: model_id = SUPPORTED_TASKS[targeted_task]["default"] - model = ipex_model_class.from_pretrained(model_id, export=True, **model_kwargs) + model = ipex_model_class.from_pretrained(model_id, export=True, **model_kwargs, **hub_kwargs) elif isinstance(model, str): model_id = model try: @@ -121,7 +122,7 @@ def load_ipex_model( "config file not found, please pass `export` to decide whether we should export this model. 
`export` defaullt to True" ) - model = ipex_model_class.from_pretrained(model, export=export, **model_kwargs) + model = ipex_model_class.from_pretrained(model, export=export, **model_kwargs, **hub_kwargs) elif isinstance(model, IPEXModel): model_id = None else: @@ -280,7 +281,9 @@ def pipeline( # Load the correct model if possible # Infer the framework from the model if not already defined - model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, **kwargs) + model, model_id = MAPPING_LOADING_FUNC[accelerator]( + model, task, supported_tasks, model_kwargs, hub_kwargs, **kwargs + ) if load_tokenizer and model_id and tokenizer is None: tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 89a27ab2c8..ac1c31ef6c 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -22,6 +22,7 @@ from transformers.pipelines import pipeline as transformers_pipeline from optimum.intel.ipex.modeling_base import ( + IPEXModel, IPEXModelForAudioClassification, IPEXModelForCausalLM, IPEXModelForImageClassification, @@ -122,6 +123,15 @@ class PipelinesIntegrationTest(unittest.TestCase): "resnet", "vit", ) + SUPPORT_TASKS = ( + "text-generation", + "fill-mask", + "question-answering", + "image-classification", + "text-classification", + "token-classification", + "audio-classification", + ) @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_token_classification_pipeline_inference(self, model_arch): @@ -263,3 +273,8 @@ def test_pipeline_load_from_jit_model(self, model_arch): self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) self.assertGreaterEqual(ipex_output[0]["score"], 0.0) + + @parameterized.expand(SUPPORT_TASKS) + def test_pipeline_with_default_model(self, task): + ipex_generator = ipex_pipeline(task, accelerator="ipex") + self.assertTrue(isinstance(ipex_generator.model, IPEXModel)) From 5013fe7df1dea102efbec140149e6dfdc355ff1d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Sun, 28 Apr 2024 11:09:36 -0400 Subject: [PATCH 14/30] fix _from_transformers args --- optimum/intel/ipex/modeling_base.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 3961c1f3af..8fd5dacba3 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -151,35 +151,17 @@ def _from_transformers( model_id: str, config: PretrainedConfig, use_cache: bool = True, - use_auth_token: Optional[Union[bool, str]] = None, - revision: Optional[str] = None, - force_download: bool = False, - cache_dir: str = HUGGINGFACE_HUB_CACHE, - subfolder: str = "", - local_files_only: bool = False, - torch_dtype: Optional[Union[str, "torch.dtype"]] = None, - trust_remote_code: bool = False, + **model_kwargs, ): if is_torch_version("<", "2.1.0"): raise ImportError("`torch>=2.0.0` is needed to trace your model") task = cls.export_feature - model_kwargs = { - "revision": revision, - "use_auth_token": use_auth_token, - "cache_dir": cache_dir, - "subfolder": subfolder, - "local_files_only": local_files_only, - "force_download": force_download, - "torch_dtype": torch_dtype, - "trust_remote_code": trust_remote_code, - } - model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) traced_model = 
ipex_jit_trace(model, task, use_cache) config.torchscript = True - config.torch_dtype = torch_dtype + config.torch_dtype = model_kwargs.get("torch_dtype", None) return cls(traced_model, config=config, model_save_dir=model_id, use_cache=use_cache, warmup=False) From a39112fd49ef70e28b1ca42c952c3c89a6c5e1ef Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 29 Apr 2024 11:30:44 -0400 Subject: [PATCH 15/30] rm default pipeline test --- tests/ipex/test_pipelines.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index ac1c31ef6c..585219f00b 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -123,15 +123,6 @@ class PipelinesIntegrationTest(unittest.TestCase): "resnet", "vit", ) - SUPPORT_TASKS = ( - "text-generation", - "fill-mask", - "question-answering", - "image-classification", - "text-classification", - "token-classification", - "audio-classification", - ) @parameterized.expand(COMMON_SUPPORTED_ARCHITECTURES) def test_token_classification_pipeline_inference(self, model_arch): @@ -273,8 +264,3 @@ def test_pipeline_load_from_jit_model(self, model_arch): self.assertTrue(isinstance(ipex_generator.model, IPEXModelForSequenceClassification)) self.assertTrue(isinstance(ipex_generator.model.model, torch.jit.RecursiveScriptModule)) self.assertGreaterEqual(ipex_output[0]["score"], 0.0) - - @parameterized.expand(SUPPORT_TASKS) - def test_pipeline_with_default_model(self, task): - ipex_generator = ipex_pipeline(task, accelerator="ipex") - self.assertTrue(isinstance(ipex_generator.model, IPEXModel)) From f401b55d323bf893281989b673262335bb6be0b4 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 08:53:15 +0800 Subject: [PATCH 16/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 5350e1e65d..5ebcfd72d6 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -139,9 +139,6 @@ def load_ipex_model( } -if is_torch_available(): - pass - if TYPE_CHECKING: from transformers.modeling_utils import PreTrainedModel From e784dd2ac65110318157630adb126f3082a3fb8f Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 08:53:33 +0800 Subject: [PATCH 17/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 5ebcfd72d6..4498a11b3d 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -98,8 +98,10 @@ def load_ipex_model( model, targeted_task, SUPPORTED_TASKS, + subfolder: str = "", + token: Optional[Union[bool, str]] = None, + revision: str = "main", model_kwargs: Optional[Dict[str, Any]] = None, - hub_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ): export = kwargs.pop("export", True) From 6fb886398d3ccfb994b4485f8e0a20c033361424 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 08:53:43 +0800 Subject: 
[PATCH 18/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 4498a11b3d..426137d9b5 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -155,10 +155,12 @@ def pipeline( model: Optional[Union[str, "PreTrainedModel"]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, - accelerator: Optional[str] = "ipex", use_fast: bool = True, - torch_dtype=None, - model_kwargs: Dict[str, Any] = None, + token: Optional[Union[str, bool]] = None, + accelerator: Optional[str] = "ort", + revision: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + *model_kwargs, **kwargs, ) -> Pipeline: """ From 79ae3d95eea11094637b83852c4a7d61f398a1dc Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 09:21:28 +0800 Subject: [PATCH 19/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 426137d9b5..6c78800a96 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -257,8 +257,8 @@ def pipeline( elif values["type"] not in ["multimodal", "audio", "video"]: raise ValueError(f"SUPPORTED_TASK {_task} contains invalid type {values['type']}") - load_tokenizer = False if task in no_tokenizer_tasks else True - load_feature_extractor = False if task in no_feature_extractor_tasks else True + load_tokenizer = task not in no_tokenizer_tasks + load_feature_extractor = task not in no_feature_extractor_tasks commit_hash = kwargs.pop("_commit_hash", None) From cfbcf9f0b01d0b4d5357552e40568ff311c661a6 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 09:21:49 +0800 Subject: [PATCH 20/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 6c78800a96..e545149fd2 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -291,8 +291,6 @@ def pipeline( if load_feature_extractor and model_id and feature_extractor is None: feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) - if torch_dtype is not None: - kwargs["torch_dtype"] = torch_dtype return transformers_pipeline( task, @@ -300,5 +298,6 @@ def pipeline( tokenizer=tokenizer, feature_extractor=feature_extractor, use_fast=use_fast, + torch_dtype=torch_dtype, **kwargs, ) From 3760e1eade71172a8e3864ffed2e669e4c88284e Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 09:23:05 +0800 Subject: [PATCH 21/30] Update optimum/intel/pipelines/pipeline_base.py 
Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index e545149fd2..f99af4e9bf 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -286,9 +286,9 @@ def pipeline( model, task, supported_tasks, model_kwargs, hub_kwargs, **kwargs ) - if load_tokenizer and model_id and tokenizer is None: + if load_tokenizer and tokenizer is None: tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) - if load_feature_extractor and model_id and feature_extractor is None: + if load_feature_extractor and feature_extractor is None: feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) From 6d4726bfb61bc851fd21f53f22c66f73197cac64 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 09:28:16 +0800 Subject: [PATCH 22/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index f99af4e9bf..f6b09216f6 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -240,7 +240,7 @@ def pipeline( if accelerator == "ipex": if task not in list(IPEX_SUPPORTED_TASKS.keys()): raise ValueError( - f"Task {task} is not supported for the ONNX Runtime pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" + f"Task {task} is not supported for the IPEX pipeline. Supported tasks are { list(IPEX_SUPPORTED_TASKS.keys())}" ) supported_tasks = IPEX_SUPPORTED_TASKS if accelerator == "ipex" else None From 4effaa4d10ea346abca2ebabb1eac376cd1e10b0 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 6 May 2024 09:28:23 +0800 Subject: [PATCH 23/30] Update optimum/intel/pipelines/pipeline_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/pipelines/pipeline_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index f6b09216f6..49545d8dd1 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -235,7 +235,7 @@ def pipeline( ) if accelerator not in MAPPING_LOADING_FUNC: - raise ValueError(f'Accelerator {accelerator} is not supported. Supported accelerator is "ipex".') + raise ValueError(f'Accelerator {accelerator} is not supported. 
Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.') if accelerator == "ipex": if task not in list(IPEX_SUPPORTED_TASKS.keys()): From bf2ae084ddc5df123c91d4184e662aabcf76928e Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 6 May 2024 06:13:40 -0400 Subject: [PATCH 24/30] fix comments --- optimum/intel/ipex/modeling_base.py | 2 ++ optimum/intel/pipelines/pipeline_base.py | 45 +++++++++--------------- tests/ipex/test_pipelines.py | 1 - 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 0b688ab4b1..73aa7e8881 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -161,6 +161,7 @@ def _from_transformers( local_files_only: bool = False, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, trust_remote_code: bool = False, + _commit_hash: str = None, ): if use_auth_token is not None: warnings.warn( @@ -186,6 +187,7 @@ def _from_transformers( "force_download": force_download, "torch_dtype": torch_dtype, "trust_remote_code": trust_remote_code, + "_commit_hash": _commit_hash, } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 49545d8dd1..d7c2d0d7a7 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -15,6 +15,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional, Union +import torch from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer from transformers import pipeline as transformers_pipeline from transformers.feature_extraction_utils import PreTrainedFeatureExtractor @@ -31,7 +32,6 @@ from transformers.tokenization_utils import PreTrainedTokenizer from transformers.utils import ( is_ipex_available, - is_torch_available, logging, ) @@ -98,13 +98,9 @@ def load_ipex_model( model, targeted_task, SUPPORTED_TASKS, - subfolder: str = "", - token: Optional[Union[bool, str]] = None, - revision: str = "main", model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, + hub_kwargs: Optional[Dict[str, Any]] = None, ): - export = kwargs.pop("export", True) if model_kwargs is None: model_kwargs = {} @@ -118,15 +114,13 @@ def load_ipex_model( try: config = AutoConfig.from_pretrained(model) torchscript = getattr(config, "torchscript", None) - export = False if torchscript else export + export = False if torchscript else True except RuntimeError: - logger.warning( - "config file not found, please pass `export` to decide whether we should export this model. `export` defaullt to True" - ) - + logger.warning("We will use IPEXModel with export=True to export the model") + export = True model = ipex_model_class.from_pretrained(model, export=export, **model_kwargs, **hub_kwargs) elif isinstance(model, IPEXModel): - model_id = None + model_id = getattr(model.config, "name_or_path", None) else: raise ValueError( f"""Model {model} is not supported. Please provide a valid model name or path or a IPEXModel. 
@@ -141,7 +135,6 @@ def load_ipex_model( } - if TYPE_CHECKING: from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils_fast import PreTrainedTokenizerFast @@ -160,8 +153,9 @@ def pipeline( accelerator: Optional[str] = "ort", revision: Optional[str] = None, trust_remote_code: Optional[bool] = None, - *model_kwargs, - **kwargs, + torch_dtype: Optional[Union[str, torch.dtype]] = None, + commit_hash: Optional[str] = None, + **model_kwargs, ) -> Pipeline: """ Utility factory method to build a [`Pipeline`]. @@ -201,9 +195,6 @@ def pipeline( model_kwargs (`Dict[str, Any]`, *optional*): Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function. - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the specific pipeline init (see the documentation for the - corresponding pipeline class for possible values). Returns: [`Pipeline`]: A suitable pipeline for the task. @@ -235,7 +226,9 @@ def pipeline( ) if accelerator not in MAPPING_LOADING_FUNC: - raise ValueError(f'Accelerator {accelerator} is not supported. Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.') + raise ValueError( + f'Accelerator {accelerator} is not supported. Supported accelerator is {", ".join(MAPPING_LOADING_FUNC)}.' + ) if accelerator == "ipex": if task not in list(IPEX_SUPPORTED_TASKS.keys()): @@ -260,12 +253,10 @@ def pipeline( load_tokenizer = task not in no_tokenizer_tasks load_feature_extractor = task not in no_feature_extractor_tasks - commit_hash = kwargs.pop("_commit_hash", None) - hub_kwargs = { - "revision": kwargs.pop("revision", None), - "token": kwargs.pop("use_auth_token", None), - "trust_remote_code": kwargs.pop("trust_remote_code", None), + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, "_commit_hash": commit_hash, } @@ -282,16 +273,13 @@ def pipeline( # Load the correct model if possible # Infer the framework from the model if not already defined - model, model_id = MAPPING_LOADING_FUNC[accelerator]( - model, task, supported_tasks, model_kwargs, hub_kwargs, **kwargs - ) + model, model_id = MAPPING_LOADING_FUNC[accelerator](model, task, supported_tasks, model_kwargs, hub_kwargs) if load_tokenizer and tokenizer is None: tokenizer = AutoTokenizer.from_pretrained(model_id, **hub_kwargs, **model_kwargs) if load_feature_extractor and feature_extractor is None: feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **hub_kwargs, **model_kwargs) - return transformers_pipeline( task, model=model, @@ -299,5 +287,4 @@ def pipeline( feature_extractor=feature_extractor, use_fast=use_fast, torch_dtype=torch_dtype, - **kwargs, ) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index 585219f00b..89a27ab2c8 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -22,7 +22,6 @@ from transformers.pipelines import pipeline as transformers_pipeline from optimum.intel.ipex.modeling_base import ( - IPEXModel, IPEXModelForAudioClassification, IPEXModelForCausalLM, IPEXModelForImageClassification, From 184a6106b81af2fc297750ccf7a0f2141ad81e3a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 14 May 2024 16:18:43 +0200 Subject: [PATCH 25/30] Update optimum/exporters/openvino/model_patcher.py --- optimum/exporters/openvino/model_patcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py 
index 7498a28d2e..f68e873d40 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -341,9 +341,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po
             offset = 0
         mask_shape = attention_mask.shape
         mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
-        causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
-            mask_slice
-        )
+        causal_mask[
+            : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
+        ] = mask_slice
 
     if (
         self.config._attn_implementation == "sdpa"

From abe8704a9a6c51a881ad8747914e863989169ae2 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Wed, 15 May 2024 14:04:28 +0800
Subject: [PATCH 26/30] Update optimum/intel/ipex/modeling_base.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/intel/ipex/modeling_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py
index 73aa7e8881..d2963d55a1 100644
--- a/optimum/intel/ipex/modeling_base.py
+++ b/optimum/intel/ipex/modeling_base.py
@@ -194,7 +194,7 @@ def _from_transformers(
         traced_model = ipex_jit_trace(model, task, use_cache)
 
         config.torchscript = True
-        config.torch_dtype = model_kwargs.get("torch_dtype", None)
+        config.torch_dtype = torch_dtype
 
         return cls(traced_model, config=config, model_save_dir=model_id, use_cache=use_cache, warmup=False)
 

From aa4d4e6f8aee521aa78f9b7091bfb5bb6f66033f Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Wed, 15 May 2024 14:04:37 +0800
Subject: [PATCH 27/30] Update optimum/intel/pipelines/pipeline_base.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/intel/pipelines/pipeline_base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py
index d7c2d0d7a7..10e8e50ab4 100644
--- a/optimum/intel/pipelines/pipeline_base.py
+++ b/optimum/intel/pipelines/pipeline_base.py
@@ -113,8 +113,7 @@ def load_ipex_model(
         model_id = model
         try:
             config = AutoConfig.from_pretrained(model)
-            torchscript = getattr(config, "torchscript", None)
-            export = False if torchscript else True
+            export = not getattr(config, "torchscript", False)
         except RuntimeError:
             logger.warning("We will use IPEXModel with export=True to export the model")
             export = True

From ea756b0b110e1c102ec1485964e740a337cd69f4 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Wed, 15 May 2024 14:04:46 +0800
Subject: [PATCH 28/30] Update optimum/intel/pipelines/pipeline_base.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/intel/pipelines/pipeline_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py
index 10e8e50ab4..9abe37b4bd 100644
--- a/optimum/intel/pipelines/pipeline_base.py
+++ b/optimum/intel/pipelines/pipeline_base.py
@@ -92,7 +92,8 @@
             "type": "audio",
         },
     }
-
+else:
+    IPEX_SUPPORTED_TASKS = {}
 
 def load_ipex_model(
     model,

From 7f92191f6e4c94dfaa3cc0e8c6cf62b2bcb642b3 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Wed, 15 May 2024 14:05:11 +0800
Subject: [PATCH 29/30] Update optimum/intel/pipelines/pipeline_base.py

Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
---
 optimum/intel/pipelines/pipeline_base.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py
index 9abe37b4bd..7000b7ed8e 100644
--- a/optimum/intel/pipelines/pipeline_base.py
+++ b/optimum/intel/pipelines/pipeline_base.py
@@ -30,10 +30,8 @@
 )
 from transformers.pipelines.base import Pipeline
 from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.utils import (
-    is_ipex_available,
-    logging,
-)
+from transformers.utils import logging
+from optimum.intel.utils import is_ipex_available
 
 
 if is_ipex_available():

From 30aec8a23b0ad66bfb3f49e2224df50bfff709ad Mon Sep 17 00:00:00 2001
From: jiqing-feng
Date: Wed, 15 May 2024 10:08:32 -0400
Subject: [PATCH 30/30] fix style

---
 optimum/intel/pipelines/pipeline_base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py
index 7000b7ed8e..65e6cfb782 100644
--- a/optimum/intel/pipelines/pipeline_base.py
+++ b/optimum/intel/pipelines/pipeline_base.py
@@ -31,6 +31,7 @@
 from transformers.pipelines.base import Pipeline
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.utils import logging
+
 from optimum.intel.utils import is_ipex_available
 
 
@@ -93,6 +94,7 @@
 else:
     IPEX_SUPPORTED_TASKS = {}
 
+
 def load_ipex_model(
     model,
     targeted_task,
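
As a quick illustration of the API this patch series converges on (not part of the patches themselves), the snippet below sketches how the optimum-intel pipeline factory could be called with the IPEX accelerator. It assumes `pipeline` is re-exported from `optimum.intel.pipelines` and that "text-generation" is registered in IPEX_SUPPORTED_TASKS; the model name and generation arguments are placeholders chosen for the example.

import torch

from optimum.intel.pipelines import pipeline  # assumed re-export of pipeline_base.pipeline

# "ipex" dispatches to load_ipex_model(), which exports the checkpoint through
# IPEXModel when the loaded config does not already carry a torchscript flag.
pipe = pipeline(
    "text-generation",
    model="gpt2",                # placeholder checkpoint for illustration
    accelerator="ipex",
    torch_dtype=torch.bfloat16,  # forwarded to the underlying from_pretrained call
)

print(pipe("Intel Extension for PyTorch makes", max_new_tokens=16)[0]["generated_text"])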