
Commit 6589fd2

Authored Mar 26, 2024
Merge branch 'huggingface:main' into jit_memory
2 parents a3fb5b8 + c2d267a · commit 6589fd2

12 files changed (+68 −77 lines)
 

‎README.md

+20 lines

````diff
@@ -202,6 +202,26 @@ Quantization aware training (QAT) is applied in order to simulate the effects of
 You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index).
 
 
+## IPEX
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations: both operator optimization (replacement with customized IPEX operators) and graph-level optimization (like operator fusion) will be applied to your model.
+```diff
+  from transformers import AutoTokenizer, pipeline
+- from transformers import AutoModelForCausalLM
++ from optimum.intel import IPEXModelForCausalLM
+
+
+  model_id = "gpt2"
+- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
+  tokenizer = AutoTokenizer.from_pretrained(model_id)
+  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+  results = pipe("He's a dreadful magician and")
+
+```
+
+For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction).
+
+
 ## Running the examples
 
 Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference.
````
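For readers trying the new README snippet outside the diff context, a minimal self-contained version might look like the sketch below. It assumes `optimum-intel` is installed with the `ipex` extra; `import torch` is added here because the snippet uses `torch.bfloat16` even though the README excerpt does not show the import.

```python
# Minimal sketch of the README example above (assumes the `ipex` extra is installed).
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForCausalLM

model_id = "gpt2"
# export=True loads the PyTorch checkpoint, exports it via TorchScript and
# applies IPEX operator- and graph-level optimizations.
model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
print(results)
```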

‎optimum/intel/neural_compressor/modeling_base.py

+2 −6 lines

```diff
@@ -32,9 +32,9 @@
     AutoModelForSequenceClassification,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
-    XLNetLMHeadModel,
 )
 from transformers.modeling_utils import no_init_weights
 from transformers.models.auto.auto_factory import _get_model_class
@@ -84,6 +84,7 @@ def __init__(
         self._device = getattr(self.model, "device", None) or torch.device(
             "cuda:0" if torch.cuda.is_available() else "cpu"
         )
+        self.generation_config = GenerationConfig.from_model_config(config)
 
         # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
         # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
@@ -247,11 +248,6 @@ class INCModelForVision2Seq(INCModel):
     export_feature = "image-to-text"
 
 
-class INCModelForXLNetLM(INCModel):
-    auto_model_class = XLNetLMHeadModel
-    export_feature = "fill-mask"
-
-
 class INCModelForCausalLM(INCModel, BaseModelForCausalLM):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
```
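The new `self.generation_config = GenerationConfig.from_model_config(config)` line gives every `INCModel` a generation config derived from its model config, which `generate()` consults. A small illustration of that transformers API follows; it is not code from this repository, and the model id is only an example.

```python
# Illustration of GenerationConfig.from_model_config: it copies the legacy
# generation-related fields of a model config into a standalone GenerationConfig.
from transformers import AutoConfig, GenerationConfig

config = AutoConfig.from_pretrained("gpt2")
generation_config = GenerationConfig.from_model_config(config)
print(generation_config.bos_token_id, generation_config.eos_token_id)
```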

‎optimum/intel/neural_compressor/quantization.py

−57 lines

```diff
@@ -15,7 +15,6 @@
 import copy
 import inspect
 import logging
-import warnings
 from enum import Enum
 from itertools import chain
 from pathlib import Path
@@ -31,18 +30,9 @@
 from neural_compressor.quantization import fit
 from torch.utils.data import DataLoader, RandomSampler
 from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForMaskedLM,
-    AutoModelForMultipleChoice,
-    AutoModelForQuestionAnswering,
-    AutoModelForSeq2SeqLM,
-    AutoModelForSequenceClassification,
-    AutoModelForTokenClassification,
-    AutoModelForVision2Seq,
     DataCollator,
     PretrainedConfig,
     PreTrainedModel,
-    XLNetLMHeadModel,
     default_data_collator,
 )
 
@@ -71,7 +61,6 @@
     INCModelForSequenceClassification,
     INCModelForTokenClassification,
     INCModelForVision2Seq,
-    INCModelForXLNetLM,
 )
 from .utils import INCDataLoader, _cfgs_to_fx_cfgs
 
@@ -538,49 +527,3 @@ def _apply_quantization_from_config(q_config: Dict, model: torch.nn.Module) -> t
     q_model = convert(q_model, mapping=q_mapping, inplace=True)
 
     return q_model
-
-
-class IncQuantizedModel(INCModel):
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        warnings.warn(
-            f"The class `{cls.__name__}` has been depreciated and will be removed in optimum-intel v1.12, please use "
-            f"`{cls.__name__.replace('IncQuantized', 'INC')}` instead."
-        )
-        return super().from_pretrained(*args, **kwargs)
-
-
-class IncQuantizedModelForQuestionAnswering(IncQuantizedModel):
-    auto_model_class = AutoModelForQuestionAnswering
-
-
-class IncQuantizedModelForSequenceClassification(IncQuantizedModel):
-    auto_model_class = AutoModelForSequenceClassification
-
-
-class IncQuantizedModelForTokenClassification(IncQuantizedModel):
-    auto_model_class = AutoModelForTokenClassification
-
-
-class IncQuantizedModelForMultipleChoice(IncQuantizedModel):
-    auto_model_class = AutoModelForMultipleChoice
-
-
-class IncQuantizedModelForSeq2SeqLM(IncQuantizedModel):
-    auto_model_class = AutoModelForSeq2SeqLM
-
-
-class IncQuantizedModelForCausalLM(IncQuantizedModel):
-    auto_model_class = AutoModelForCausalLM
-
-
-class IncQuantizedModelForMaskedLM(IncQuantizedModel):
-    auto_model_class = AutoModelForMaskedLM
-
-
-class IncQuantizedModelForXLNetLM(IncQuantizedModel):
-    auto_model_class = XLNetLMHeadModel
-
-
-class IncQuantizedModelForVision2Seq(IncQuantizedModel):
-    auto_model_class = AutoModelForVision2Seq
```
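The deprecated `IncQuantizedModelForXxx` aliases removed above already pointed users at their `INCModelForXxx` replacements in the warning they used to emit. A hedged migration sketch (the checkpoint name is hypothetical):

```python
# Migration sketch for the removed aliases. The checkpoint name is hypothetical;
# INCModelForCausalLM is the replacement named by the old deprecation warning.
from optimum.intel import INCModelForCausalLM

# previously: IncQuantizedModelForCausalLM.from_pretrained(...)
model = INCModelForCausalLM.from_pretrained("my-org/my-int8-gpt2")  # hypothetical quantized checkpoint
```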

‎optimum/intel/openvino/modeling.py

+1 −1 lines

```diff
@@ -137,7 +137,7 @@ def to(self, device: str):
             self._device = device.upper()
             self.request = None
         else:
-            logger.warning(f"device must be of type {str} but got {type(device)} instead")
+            logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
         return self
```
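The `warning` → `debug` change above (and the matching ones in the diffusion and seq2seq modeling files below) affects what happens when `to()` receives something other than a device string: the call remains a no-op that returns the model, but it no longer surfaces a user-facing warning. A hedged sketch of that behaviour, with an illustrative model id:

```python
# Sketch of OVModel.to() behaviour after this change (model id is illustrative).
import torch
from optimum.intel import OVModelForSequenceClassification

model = OVModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", export=True
)
model.to("cpu")                # string devices are accepted
model.to(torch.device("cpu"))  # non-strings are ignored; now only logged at DEBUG level
```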

‎optimum/intel/openvino/modeling_base.py

+4 −1 lines

```diff
@@ -64,7 +64,10 @@ def __init__(
         self.model_save_dir = model_save_dir
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
-        self.ov_config = ov_config if ov_config is not None else {"PERFORMANCE_HINT": "LATENCY"}
+        self.ov_config = ov_config if ov_config is not None else {}
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
+
         self.preprocessors = kwargs.get("preprocessors", [])
         enable_compilation = kwargs.get("compile", True)
```
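Previously the `LATENCY` hint was only applied when no `ov_config` was passed at all; with this change (repeated in the seq2seq and diffusion base classes below) it is merged into any user-provided `ov_config` that does not set `PERFORMANCE_HINT` itself. A minimal sketch of the resulting behaviour, using a hypothetical standalone helper that mirrors the `__init__` logic rather than the actual class:

```python
# Hypothetical helper mirroring the default-merging logic added above.
from typing import Dict, Optional

def resolve_ov_config(ov_config: Optional[Dict[str, str]]) -> Dict[str, str]:
    resolved = ov_config if ov_config is not None else {}
    # LATENCY is only injected when the caller did not choose a hint themselves.
    if resolved.get("PERFORMANCE_HINT") is None:
        resolved["PERFORMANCE_HINT"] = "LATENCY"
    return resolved

print(resolve_ov_config(None))                                # {'PERFORMANCE_HINT': 'LATENCY'}
print(resolve_ov_config({"NUM_STREAMS": "2"}))                # hint added alongside user options
print(resolve_ov_config({"PERFORMANCE_HINT": "THROUGHPUT"}))  # user choice preserved
```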

‎optimum/intel/openvino/modeling_base_seq2seq.py

+4 lines

```diff
@@ -67,6 +67,10 @@ def __init__(
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
         self.ov_config = ov_config if ov_config is not None else {}
+
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
+
         self.preprocessors = kwargs.get("preprocessors", [])
 
         if self.is_dynamic:
```

‎optimum/intel/openvino/modeling_diffusion.py

+3 −1 lines

```diff
@@ -101,6 +101,8 @@ def __init__(
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
         self.ov_config = ov_config if ov_config is not None else {}
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
 
         # This attribute is needed to keep one reference on the temporary directory, since garbage collecting
         # would end-up removing the directory containing the underlying OpenVINO model
@@ -456,7 +458,7 @@ def to(self, device: str):
             self._device = device.upper()
             self.clear_requests()
         else:
-            logger.warning(f"device must be of type {str} but got {type(device)} instead")
+            logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
         return self
```

‎optimum/intel/openvino/modeling_seq2seq.py

+1 −1 lines

```diff
@@ -285,7 +285,7 @@ def to(self, device: str):
             self.decoder_with_past._device = self._device
             self.clear_requests()
         else:
-            logger.warning(f"device must be of type {str} but got {type(device)} instead")
+            logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
         return self
```

‎optimum/intel/version.py

+1 −1 lines

```diff
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.16.0.dev0"
+__version__ = "1.17.0.dev0"
```

‎setup.py

+3 −3 lines

```diff
@@ -28,8 +28,8 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36.0,<4.39.0",
-    "optimum @ git+https://github.com/huggingface/optimum.git#egg=optimum",
+    "transformers>=4.36.0,<4.40.0",
+    "optimum~=1.18",
     "datasets>=1.4.0",
     "sentencepiece",
     "scipy",
@@ -61,7 +61,7 @@
     "openvino": ["openvino>=2023.3", "nncf>=2.8.1"],
     "openvino-tokenizers": ["openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.8.1"],
-    "ipex": ["intel-extension-for-pytorch"],
+    "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
```

‎tests/openvino/test_modeling.py

+28 −6 lines

```diff
@@ -116,6 +116,9 @@ def test_load_from_hub_and_save_model(self):
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         loaded_model = OVModelForSequenceClassification.from_pretrained(self.OV_MODEL_ID)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT"), "LATENCY")
         loaded_model_outputs = loaded_model(**tokens)
 
         # Test specifying ov_config with throughput hint and manual cache dir
@@ -134,7 +137,10 @@ def test_load_from_hub_and_save_model(self):
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_XML_FILE_NAME in folder_contents)
             self.assertTrue(OV_XML_FILE_NAME.replace(".xml", ".bin") in folder_contents)
-            model = OVModelForSequenceClassification.from_pretrained(tmpdirname)
+            model = OVModelForSequenceClassification.from_pretrained(tmpdirname, ov_config={"NUM_STREAMS": 2})
+            # Test that PERFORMANCE_HINT is set to LATENCY by default even with ov_config provided
+            self.assertEqual(model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+            self.assertEqual(model.request.get_property("PERFORMANCE_HINT"), "LATENCY")
 
         outputs = model(**tokens)
         self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
@@ -150,6 +156,9 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache):
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         loaded_model = OVModelForCausalLM.from_pretrained(model_id, use_cache=use_cache)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        self.assertEqual(loaded_model.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
         loaded_model_outputs = loaded_model(**tokens)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
@@ -172,6 +181,11 @@ def test_load_from_hub_and_save_seq2seq_model(self):
         loaded_model = OVModelForSeq2SeqLM.from_pretrained(self.OV_SEQ2SEQ_MODEL_ID, compile=False)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
         loaded_model.to("cpu")
+        loaded_model.compile()
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        self.assertEqual(loaded_model.decoder.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
+
         loaded_model_outputs = loaded_model.generate(**tokens)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
@@ -192,6 +206,10 @@
     def test_load_from_hub_and_save_stable_diffusion_model(self):
         loaded_pipeline = OVStableDiffusionPipeline.from_pretrained(self.OV_DIFFUSION_MODEL_ID, compile=False)
         self.assertIsInstance(loaded_pipeline.config, Dict)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_pipeline.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        loaded_pipeline.compile()
+        self.assertEqual(loaded_pipeline.unet.request.get_property("PERFORMANCE_HINT"), "LATENCY")
         batch_size, height, width = 2, 16, 16
         np.random.seed(0)
         inputs = {
@@ -501,7 +519,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "qwen",
         "qwen2",
         "stablelm",
-        # "starcoder2", # TODO: enable with next transformers release
+        "starcoder2",
+        "phi",
     )
     GENERATION_LENGTH = 100
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
@@ -524,16 +543,15 @@ def test_compare_to_transformers(self, model_arch):
 
         model_kwargs = {}
         if model_arch in self.REMOTE_CODE_MODELS:
-            model_kwargs = {
-                "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
-                "trust_remote_code": True,
-            }
+            model_kwargs = {"trust_remote_code": True}
+
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
         self.assertEqual(
             ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful
         )
+        set_seed(SEED)
         transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         if model_arch == "qwen":
@@ -570,6 +588,10 @@ def test_pipeline(self, model_arch):
                 "trust_remote_code": True,
             }
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+
+        if model_arch == "qwen":
+            tokenizer._convert_tokens_to_ids = lambda x: 0
+
         model = OVModelForCausalLM.from_pretrained(
             model_id, export=True, use_cache=False, compile=False, **model_kwargs
         )
```

‎tests/openvino/utils_tests.py

+1 line

```diff
@@ -70,6 +70,7 @@
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
     "pegasus": "hf-internal-testing/tiny-random-pegasus",
     "pix2struct": "fxmarty/pix2struct-tiny-random",
+    "phi": "hf-internal-testing/tiny-random-PhiForCausalLM",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
     "qwen": "katuni4ka/tiny-random-qwen",
     "qwen2": "Qwen/Qwen1.5-0.5B",
```
