fix merge conflict #1

Merged 1 commit on Sep 10, 2024

Changes from all commits
1 change: 1 addition & 0 deletions notebooks/ipex/text_generation.ipynb
@@ -22,6 +22,7 @@
"source": [
"import torch\n",
"from transformers import AutoTokenizer\n",
"\n",
"from optimum.intel.ipex import IPEXModelForCausalLM"
]
},
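The hunk above, and most of the notebook hunks that follow, apply the same mechanical change: imports are regrouped and re-sorted so that third-party packages and the repository's own optimum.intel modules sit in separate blocks. This matches an isort/ruff-style layout that treats optimum as a first-party package; the formatter configuration itself is not part of this diff, so the sketch below only illustrates the resulting grouping.

import os  # standard library first

import torch  # third-party packages next, sorted alphabetically
from transformers import AutoTokenizer

from optimum.intel.ipex import IPEXModelForCausalLM  # first-party package last, after a blank line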
49 changes: 37 additions & 12 deletions notebooks/openvino/optimum_openvino_inference.ipynb
@@ -78,6 +78,7 @@
"source": [
"from optimum.intel import OVModelForQuestionAnswering\n",
"\n",
"\n",
"# Load PyTorch model from the Hub and export to OpenVINO in the background\n",
"model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad\", export=True)\n",
"\n",
@@ -122,6 +123,7 @@
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n",
"tokenizer.save_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")"
]
@@ -182,9 +184,11 @@
}
],
"source": [
"from optimum.intel import OVModelForQuestionAnswering\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForQuestionAnswering\n",
"\n",
"\n",
"model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n",
"ov_pipe = pipeline(\"question-answering\", model=model, tokenizer=tokenizer)\n",
@@ -240,9 +244,11 @@
],
"source": [
"import torch\n",
"from optimum.intel import OVModelForQuestionAnswering\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForQuestionAnswering\n",
"\n",
"\n",
"model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n",
"\n",
@@ -324,9 +330,11 @@
}
],
"source": [
"from optimum.intel import OVModelForQuestionAnswering\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForQuestionAnswering\n",
"\n",
"\n",
"model = OVModelForQuestionAnswering.from_pretrained(\n",
" \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\", compile=False\n",
")\n",
@@ -411,6 +419,7 @@
"source": [
"from openvino.runtime import Core\n",
"\n",
"\n",
"for device in Core().available_devices:\n",
" print(device, Core().get_property(device, \"FULL_DEVICE_NAME\"))"
]
@@ -528,10 +537,12 @@
}
],
"source": [
"from datasets import load_dataset\n",
"from IPython.display import Audio\n",
"from optimum.intel import OVModelForAudioClassification\n",
"from transformers import AutoFeatureExtractor, pipeline\n",
"from datasets import load_dataset\n",
"\n",
"from optimum.intel import OVModelForAudioClassification\n",
"\n",
"\n",
"model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n",
"model = OVModelForAudioClassification.from_pretrained(model_id)\n",
@@ -638,9 +649,11 @@
}
],
"source": [
"from optimum.intel import OVModelForCausalLM\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForCausalLM\n",
"\n",
"\n",
"model_id = \"helenai/gpt2-ov\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"model = OVModelForCausalLM.from_pretrained(model_id)\n",
@@ -704,9 +717,11 @@
],
"source": [
"from IPython.display import Image\n",
"from optimum.intel import OVModelForImageClassification\n",
"from transformers import AutoImageProcessor, pipeline\n",
"\n",
"from optimum.intel import OVModelForImageClassification\n",
"\n",
"\n",
"model_id = \"helenai/microsoft-swin-tiny-patch4-window7-224-ov\"\n",
"model = OVModelForImageClassification.from_pretrained(model_id, compile=False)\n",
"image_processor = AutoImageProcessor.from_pretrained(model_id)\n",
@@ -766,9 +781,11 @@
}
],
"source": [
"from optimum.intel import OVModelForMaskedLM\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForMaskedLM\n",
"\n",
"\n",
"model_id = \"helenai/bert-base-uncased-ov\"\n",
"model = OVModelForMaskedLM.from_pretrained(model_id)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
@@ -835,9 +852,11 @@
}
],
"source": [
"from optimum.intel import OVModelForQuestionAnswering\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForQuestionAnswering\n",
"\n",
"\n",
"# Load the model and tokenizer saved in Part 1 of this notebook. Or use the line below to load them from the hub\n",
"# model_id = \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\"\n",
"model_id = \"distilbert-base-uncased-distilled-squad-ov-fp32\"\n",
@@ -890,9 +909,11 @@
}
],
"source": [
"from optimum.intel import OVModelForSeq2SeqLM\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForSeq2SeqLM\n",
"\n",
"\n",
"model_id = \"helenai/t5-small-ov\"\n",
"model = OVModelForSeq2SeqLM.from_pretrained(model_id, compile=False, trust_remote_code=True)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
@@ -998,9 +1019,11 @@
}
],
"source": [
"from optimum.intel import OVModelForSequenceClassification\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForSequenceClassification\n",
"\n",
"\n",
"model_id = \"helenai/papluca-xlm-roberta-base-language-detection-ov\"\n",
"model = OVModelForSequenceClassification.from_pretrained(model_id)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
@@ -1047,9 +1070,11 @@
}
],
"source": [
"from optimum.intel import OVModelForTokenClassification\n",
"from transformers import AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVModelForTokenClassification\n",
"\n",
"\n",
"model_id = \"helenai/dslim-bert-base-NER-ov-fp32\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"model = OVModelForTokenClassification.from_pretrained(model_id)\n",
27 changes: 16 additions & 11 deletions notebooks/openvino/quantized_generation_demo.ipynb
@@ -45,6 +45,7 @@
"import os\n",
"\n",
"from transformers import AutoTokenizer\n",
"\n",
"from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig"
]
},
@@ -211,6 +212,7 @@
"source": [
"from transformers import TextStreamer\n",
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"\n",
@@ -294,15 +296,15 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors='pt') \n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"\n",
"out = stateless_model.generate(\n",
" **inputs,\n",
" max_new_tokens=128,\n",
" streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" prompt_lookup_num_tokens=3,\n",
") "
")"
]
},
{
@@ -442,6 +444,7 @@
"outputs": [],
"source": [
"from functools import wraps\n",
"\n",
"import numpy as np\n",
"\n",
"\n",
@@ -458,15 +461,15 @@
" if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n",
" raise RuntimeError(\"Always use a new instance, don't reuse!\")\n",
" self.model_forward = self.model.forward\n",
" \n",
"\n",
" @wraps(self.model_forward)\n",
" def forward_wrapper(**kwargs):\n",
" self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n",
" self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n",
" return self.model_forward(**kwargs)\n",
" \n",
"\n",
" self.model.forward = forward_wrapper\n",
" \n",
"\n",
" # wrap generate method\n",
" self.model_generate = self.model.generate\n",
"\n",
@@ -494,7 +497,7 @@
" self.seq_lens = [sl[1:] for sl in self.seq_lens]\n",
" # Add window size for output to ease calculation later\n",
" for ws, sl in zip(self.win_sizes, self.seq_lens):\n",
" ws.append(0) \n",
" ws.append(0)\n",
"\n",
" def acceptance_rate(self, return_mean=True, normalize=False):\n",
" # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n",
@@ -533,8 +536,9 @@
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"from datasets import load_dataset\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"dataset_name = \"openai_humaneval\"\n",
"dataset_subset_name = None\n",
@@ -590,10 +594,10 @@
"from threading import Thread\n",
"\n",
"from transformers import (\n",
" TextIteratorStreamer,\n",
" GenerationConfig,\n",
" StoppingCriteria,\n",
" StoppingCriteriaList,\n",
" GenerationConfig,\n",
" TextIteratorStreamer,\n",
")\n",
"\n",
"\n",
@@ -690,7 +694,7 @@
" prompt_char = \"\"\n",
" history[-1][1] = prompt_char\n",
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
" \n",
"\n",
" streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
" # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n",
@@ -770,6 +774,7 @@
"source": [
"import gradio as gr\n",
"\n",
"\n",
"try:\n",
" demo.close()\n",
"except:\n",
@@ -808,7 +813,7 @@
" history: conversation history\n",
" Returns:\n",
" updated history\n",
" \"\"\" \n",
" \"\"\"\n",
" history[-1][1] = None\n",
" return history\n",
"\n",
4 changes: 3 additions & 1 deletion notebooks/openvino/question_answering_quantization.ipynb
@@ -51,9 +51,11 @@
"import transformers\n",
"from evaluate import evaluator\n",
"from openvino.runtime import Core\n",
"from optimum.intel import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n",
"from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
"\n",
"from optimum.intel import OVConfig, OVModelForQuestionAnswering, OVQuantizationConfig, OVQuantizer\n",
"\n",
"\n",
"transformers.logging.set_verbosity_error()\n",
"datasets.logging.set_verbosity_error()"
]
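The reordered imports above feed this notebook's post-training static quantization flow. A rough sketch of how those classes typically fit together; the dataset, sample count, preprocessing, and save directory below are assumptions for illustration, not taken from the diff:

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from optimum.intel import OVConfig, OVModelForQuestionAnswering, OVQuantizationConfig, OVQuantizer

model_id = "distilbert-base-uncased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantizer = OVQuantizer.from_pretrained(model)
calibration_dataset = quantizer.get_calibration_dataset(
    "squad",
    preprocess_function=lambda ex: tokenizer(ex["question"], ex["context"], truncation=True, max_length=384),
    num_samples=100,
    dataset_split="train",
)
quantizer.quantize(
    calibration_dataset=calibration_dataset,
    save_directory="distilbert-squad-int8",
    ov_config=OVConfig(quantization_config=OVQuantizationConfig()),
)
int8_model = OVModelForQuestionAnswering.from_pretrained("distilbert-squad-int8")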
Original file line number Diff line number Diff line change
@@ -46,15 +46,18 @@
"outputs": [],
"source": [
"import time\n",
"from pathlib import Path\n",
"\n",
"import datasets\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import transformers\n",
"from pathlib import Path\n",
"from openvino.runtime import Core\n",
"\n",
"from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n",
"from optimum.intel.openvino.configuration import OVQuantizationMethod\n",
"\n",
"\n",
"transformers.logging.set_verbosity_error()\n",
"datasets.logging.set_verbosity_error()"
]
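Judging by the OVStableDiffusionPipeline and OVWeightQuantizationConfig imports, the hunk above belongs to a Stable Diffusion quantization notebook (its file header is not shown here). A minimal sketch of loading and running the OpenVINO pipeline those classes wrap; the model id and prompt are assumptions:

from optimum.intel import OVStableDiffusionPipeline

pipe = OVStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", export=True)
image = pipe("sailing ship in a storm, oil painting", num_inference_steps=20).images[0]
image.save("ship.png")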
4 changes: 2 additions & 2 deletions optimum/exporters/ipex/model_patcher.py
@@ -29,11 +29,11 @@
from .modeling_utils import (
_IPEX_MINIMUM_VERSION_FOR_PATCHING,
_gpt2_block_forward,
_ipex_rms_layer_norm_forward,
_IPEXFalconDecoderLayer,
_IPEXGPT2Attention,
_IPEXIntermediate,
_IPEXLlamaDecoderLayer,
_llama_layer_norm_forward,
_llama_model_forward,
)

@@ -79,7 +79,7 @@ def _patch_llama_model(model):
2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add)
"""
convert_functions(model, LlamaModel, "forward", _llama_model_forward)
convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward)
convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
return model
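The convert_functions and convert_class helpers used above are not shown in this diff. Conceptually, a forward-patching helper of this kind walks the module tree and rebinds the forward of every matching submodule; a sketch of that idea, not the repository's actual implementation:

def convert_functions(module, target_cls, attr_name, new_fn):
    # Rebind `attr_name` (e.g. "forward") of every submodule of the target class
    # to the replacement function, bound as a method of that submodule.
    for submodule in module.modules():
        if isinstance(submodule, target_cls):
            setattr(submodule, attr_name, new_fn.__get__(submodule, submodule.__class__))

The hunk itself only swaps which replacement is registered for LlamaRMSNorm: _llama_layer_norm_forward instead of _ipex_rms_layer_norm_forward.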

245 changes: 148 additions & 97 deletions optimum/exporters/ipex/modeling_utils.py

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions optimum/exporters/openvino/model_patcher.py
@@ -404,9 +404,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c
offset = 0
mask_shape = attention_mask.shape
mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
causal_mask[
: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
] = mask_slice
causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
mask_slice
)

if (
self.config._attn_implementation == "sdpa"
@@ -1966,9 +1966,9 @@ def _dbrx_update_causal_mask_legacy(
offset = 0
mask_shape = attention_mask.shape
mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
causal_mask[
: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]
] = mask_slice
causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
mask_slice
)

if (
self.config._attn_implementation == "sdpa"
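Both hunks in this file are formatting-only: the subscripted assignment target now stays on one line and the assigned value is wrapped instead. For reference, a small self-contained illustration of what the statement does, copying the attention-mask-derived slice into the matching region of a preallocated causal mask (shapes are made up for the example):

import torch

min_dtype = torch.finfo(torch.float32).min
causal_mask = torch.full((1, 1, 4, 8), min_dtype)                    # preallocated mask
attention_mask = torch.tensor([[[[1.0, 1, 1, 1, 1, 0, 0, 0]] * 4]])  # 1 = keep, 0 = padding
offset = 0
mask_shape = attention_mask.shape
mask_slice = (attention_mask.eq(0.0)).to(dtype=torch.float32) * min_dtype
causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = (
    mask_slice
)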
52 changes: 11 additions & 41 deletions optimum/intel/ipex/modeling_base.py
@@ -154,7 +154,7 @@ def __init__(
self._device = torch.device("cpu")

# CPU only support jit model for now.
if export:
if export and self._device.type == "cpu":
if isinstance(model, torch.jit.RecursiveScriptModule):
logger.warning("The model has been exported already.")
else:
@@ -251,7 +251,6 @@ def _from_pretrained(
)
token = use_auth_token

task = cls.export_feature
commit_hash = kwargs.pop("_commit_hash", None)

model_kwargs = {
@@ -263,49 +262,11 @@ def _from_pretrained(
"force_download": force_download,
}

model = TasksManager.get_model_from_task(task, model_id, **model_kwargs)

if is_torch_xpu_available(check_device=True):
model.to("xpu:0")
if _is_patched_with_ipex(model, task):
model = _patch_model(model)
else:
model = ipex_jit_trace(model, task, use_cache)
config.torchscript = True
config.torch_dtype = torch_dtype
return cls(model, config=config, model_save_dir=model_id, use_cache=use_cache, warmup=False)

@classmethod
def _from_pretrained(
cls,
model_id: Union[str, Path],
config: PretrainedConfig,
use_auth_token: Optional[Union[bool, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
force_download: bool = False,
cache_dir: str = HUGGINGFACE_HUB_CACHE,
file_name: Optional[str] = WEIGHTS_NAME,
local_files_only: bool = False,
subfolder: str = "",
**kwargs,
):
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if token is not None:
raise ValueError(
"Both the arguments `use_auth_token` and `token` were specified, which is not supported. Please specify only `token`."
)
token = use_auth_token

if not getattr(config, "torchscript", False):
logger.warning("Detect torchscript is false. Convert to torchscript model!")

if is_torch_version("<", "2.1.0"):
raise ImportError("`torch>=2.0.0` is needed to trace your model")
raise ImportError("`torch>=2.1.0` is needed to trace your model")

task = cls.export_feature
config.torch_dtype = torch_dtype
@@ -318,6 +279,15 @@ def _from_pretrained(
_commit_hash=commit_hash,
**model_kwargs,
)
if is_torch_xpu_available(check_device=True):
model.to("xpu:0")
if _is_patched_with_ipex(model, task):
model = _patch_model(model)
else:
use_cache = kwargs.get("use_cache", True)
model = ipex_jit_trace(model, task, use_cache)
config.torchscript = True
config.torch_dtype = torch_dtype

return cls(model, config=config, export=True, **kwargs)
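The consolidated _from_pretrained above loads the checkpoint through TasksManager.get_model_from_task, moves it to XPU when one is available, applies the IPEX patches when the architecture supports them, and otherwise falls back to a TorchScript trace. From the user's side the entry point is unchanged; a minimal usage sketch with gpt2 as a stand-in checkpoint:

import torch
from transformers import AutoTokenizer

from optimum.intel.ipex import IPEXModelForCausalLM

model_id = "gpt2"  # stand-in checkpoint
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("The weather today is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))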

12 changes: 6 additions & 6 deletions optimum/intel/openvino/modeling_base.py
@@ -84,9 +84,9 @@ def __init__(
for idx, key in enumerate(model.inputs):
names = tuple(key.get_names())
input_names[next((name for name in names if "/" not in name), names[0])] = idx
input_dtypes[
next((name for name in names if "/" not in name), names[0])
] = key.get_element_type().get_type_name()
input_dtypes[next((name for name in names if "/" not in name), names[0])] = (
key.get_element_type().get_type_name()
)
self.input_names = input_names
self.input_dtypes = input_dtypes

@@ -95,9 +95,9 @@ def __init__(
for idx, key in enumerate(model.outputs):
names = tuple(key.get_names())
output_names[next((name for name in names if "/" not in name), names[0])] = idx
output_dtypes[
next((name for name in names if "/" not in name), names[0])
] = key.get_element_type().get_type_name()
output_dtypes[next((name for name in names if "/" not in name), names[0])] = (
key.get_element_type().get_type_name()
)

self.output_names = output_names
self.output_dtypes = output_dtypes
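Both hunks here are likewise formatting-only. The loops build name-to-index and name-to-dtype lookup tables for the model's input and output ports, preferring tensor names that do not contain "/". An equivalent, spelled-out version of the input-side loop (assuming model is an openvino.runtime.Model):

input_names, input_dtypes = {}, {}
for idx, key in enumerate(model.inputs):
    names = tuple(key.get_names())
    # Prefer a "clean" tensor name without "/"; fall back to the first name otherwise
    preferred = next((name for name in names if "/" not in name), names[0])
    input_names[preferred] = idx
    input_dtypes[preferred] = key.get_element_type().get_type_name()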