
Commit 6589fd2

Authored Mar 26, 2024
Merge branch 'huggingface:main' into jit_memory
2 parents a3fb5b8 + c2d267a · commit 6589fd2

12 files changed (+68 −77 lines)
 

‎README.md

+20 lines

````diff
@@ -202,6 +202,26 @@ Quantization aware training (QAT) is applied in order to simulate the effects of
 You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index).
 
 
+## IPEX
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations: both operator optimization (replacement with customized IPEX operators) and graph-level optimization (like operator fusion) will be applied to your model.
+```diff
+  from transformers import AutoTokenizer, pipeline
+- from transformers import AutoModelForCausalLM
++ from optimum.intel import IPEXModelForCausalLM
+
+
+  model_id = "gpt2"
+- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
+  tokenizer = AutoTokenizer.from_pretrained(model_id)
+  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+  results = pipe("He's a dreadful magician and")
+
+```
+
+For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction).
+
+
 ## Running the examples
 
 Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference.
````
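For readers trying the new README snippet outside the diff context, a minimal self-contained version might look like the sketch below. It assumes `optimum-intel` is installed with the `ipex` extra; `import torch` is added here because the snippet uses `torch.bfloat16` even though the README excerpt does not show the import.

```python
# Minimal sketch of the README example above (assumes the `ipex` extra is installed).
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForCausalLM

model_id = "gpt2"
# export=True loads the PyTorch checkpoint, exports it via TorchScript and
# applies IPEX operator- and graph-level optimizations.
model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
print(results)
```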

‎optimum/intel/neural_compressor/modeling_base.py

+2 −6 lines

```diff
@@ -32,9 +32,9 @@
     AutoModelForSequenceClassification,
     AutoModelForTokenClassification,
     AutoModelForVision2Seq,
+    GenerationConfig,
     GenerationMixin,
     PretrainedConfig,
-    XLNetLMHeadModel,
 )
 from transformers.modeling_utils import no_init_weights
 from transformers.models.auto.auto_factory import _get_model_class
@@ -84,6 +84,7 @@ def __init__(
         self._device = getattr(self.model, "device", None) or torch.device(
             "cuda:0" if torch.cuda.is_available() else "cpu"
         )
+        self.generation_config = GenerationConfig.from_model_config(config)
 
         # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
         # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
@@ -247,11 +248,6 @@ class INCModelForVision2Seq(INCModel):
     export_feature = "image-to-text"
 
 
-class INCModelForXLNetLM(INCModel):
-    auto_model_class = XLNetLMHeadModel
-    export_feature = "fill-mask"
-
-
 class INCModelForCausalLM(INCModel, BaseModelForCausalLM):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
```
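The new `self.generation_config = GenerationConfig.from_model_config(config)` line gives every `INCModel` a generation config derived from its model config, which `generate()` consults. A small illustration of that transformers API follows; it is not code from this repository, and the model id is only an example.

```python
# Illustration of GenerationConfig.from_model_config: it copies the legacy
# generation-related fields of a model config into a standalone GenerationConfig.
from transformers import AutoConfig, GenerationConfig

config = AutoConfig.from_pretrained("gpt2")
generation_config = GenerationConfig.from_model_config(config)
print(generation_config.bos_token_id, generation_config.eos_token_id)
```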

‎optimum/intel/neural_compressor/quantization.py

−57 lines

```diff
@@ -15,7 +15,6 @@
 import copy
 import inspect
 import logging
-import warnings
 from enum import Enum
 from itertools import chain
 from pathlib import Path
@@ -31,18 +30,9 @@
 from neural_compressor.quantization import fit
 from torch.utils.data import DataLoader, RandomSampler
 from transformers import (
-    AutoModelForCausalLM,
-    AutoModelForMaskedLM,
-    AutoModelForMultipleChoice,
-    AutoModelForQuestionAnswering,
-    AutoModelForSeq2SeqLM,
-    AutoModelForSequenceClassification,
-    AutoModelForTokenClassification,
-    AutoModelForVision2Seq,
     DataCollator,
     PretrainedConfig,
     PreTrainedModel,
-    XLNetLMHeadModel,
     default_data_collator,
 )
 
@@ -71,7 +61,6 @@
     INCModelForSequenceClassification,
     INCModelForTokenClassification,
     INCModelForVision2Seq,
-    INCModelForXLNetLM,
 )
 from .utils import INCDataLoader, _cfgs_to_fx_cfgs
 
@@ -538,49 +527,3 @@ def _apply_quantization_from_config(q_config: Dict, model: torch.nn.Module) -> t
     q_model = convert(q_model, mapping=q_mapping, inplace=True)
 
     return q_model
-
-
-class IncQuantizedModel(INCModel):
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        warnings.warn(
-            f"The class `{cls.__name__}` has been depreciated and will be removed in optimum-intel v1.12, please use "
-            f"`{cls.__name__.replace('IncQuantized', 'INC')}` instead."
-        )
-        return super().from_pretrained(*args, **kwargs)
-
-
-class IncQuantizedModelForQuestionAnswering(IncQuantizedModel):
-    auto_model_class = AutoModelForQuestionAnswering
-
-
-class IncQuantizedModelForSequenceClassification(IncQuantizedModel):
-    auto_model_class = AutoModelForSequenceClassification
-
-
-class IncQuantizedModelForTokenClassification(IncQuantizedModel):
-    auto_model_class = AutoModelForTokenClassification
-
-
-class IncQuantizedModelForMultipleChoice(IncQuantizedModel):
-    auto_model_class = AutoModelForMultipleChoice
-
-
-class IncQuantizedModelForSeq2SeqLM(IncQuantizedModel):
-    auto_model_class = AutoModelForSeq2SeqLM
-
-
-class IncQuantizedModelForCausalLM(IncQuantizedModel):
-    auto_model_class = AutoModelForCausalLM
-
-
-class IncQuantizedModelForMaskedLM(IncQuantizedModel):
-    auto_model_class = AutoModelForMaskedLM
-
-
-class IncQuantizedModelForXLNetLM(IncQuantizedModel):
-    auto_model_class = XLNetLMHeadModel
-
-
-class IncQuantizedModelForVision2Seq(IncQuantizedModel):
-    auto_model_class = AutoModelForVision2Seq
```
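The deprecated `IncQuantizedModelForXxx` aliases removed above already pointed users at their `INCModelForXxx` replacements in the warning they used to emit. A hedged migration sketch (the checkpoint name is hypothetical):

```python
# Migration sketch for the removed aliases. The checkpoint name is hypothetical;
# INCModelForCausalLM is the replacement named by the old deprecation warning.
from optimum.intel import INCModelForCausalLM

# previously: IncQuantizedModelForCausalLM.from_pretrained(...)
model = INCModelForCausalLM.from_pretrained("my-org/my-int8-gpt2")  # hypothetical quantized checkpoint
```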

‎optimum/intel/openvino/modeling.py

+1 −1 lines

```diff
@@ -137,7 +137,7 @@ def to(self, device: str):
             self._device = device.upper()
             self.request = None
         else:
-            logger.warning(f"device must be of type {str} but got {type(device)} instead")
+            logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
         return self
```
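The `warning` → `debug` change above (and the matching ones in the diffusion and seq2seq modeling files below) affects what happens when `to()` receives something other than a device string: the call remains a no-op that returns the model, but it no longer surfaces a user-facing warning. A hedged sketch of that behaviour, with an illustrative model id:

```python
# Sketch of OVModel.to() behaviour after this change (model id is illustrative).
import torch
from optimum.intel import OVModelForSequenceClassification

model = OVModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", export=True
)
model.to("cpu")                # string devices are accepted
model.to(torch.device("cpu"))  # non-strings are ignored; now only logged at DEBUG level
```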

‎optimum/intel/openvino/modeling_base.py

+4 −1 lines

```diff
@@ -64,7 +64,10 @@ def __init__(
         self.model_save_dir = model_save_dir
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
-        self.ov_config = ov_config if ov_config is not None else {"PERFORMANCE_HINT": "LATENCY"}
+        self.ov_config = ov_config if ov_config is not None else {}
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
+
         self.preprocessors = kwargs.get("preprocessors", [])
         enable_compilation = kwargs.get("compile", True)
```
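Previously the `LATENCY` hint was only applied when no `ov_config` was passed at all; with this change (repeated in the seq2seq and diffusion base classes below) it is merged into any user-provided `ov_config` that does not set `PERFORMANCE_HINT` itself. A minimal sketch of the resulting behaviour, using a hypothetical standalone helper that mirrors the `__init__` logic rather than the actual class:

```python
# Hypothetical helper mirroring the default-merging logic added above.
from typing import Dict, Optional

def resolve_ov_config(ov_config: Optional[Dict[str, str]]) -> Dict[str, str]:
    resolved = ov_config if ov_config is not None else {}
    # LATENCY is only injected when the caller did not choose a hint themselves.
    if resolved.get("PERFORMANCE_HINT") is None:
        resolved["PERFORMANCE_HINT"] = "LATENCY"
    return resolved

print(resolve_ov_config(None))                                # {'PERFORMANCE_HINT': 'LATENCY'}
print(resolve_ov_config({"NUM_STREAMS": "2"}))                # hint added alongside user options
print(resolve_ov_config({"PERFORMANCE_HINT": "THROUGHPUT"}))  # user choice preserved
```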

‎optimum/intel/openvino/modeling_base_seq2seq.py

+4 lines

```diff
@@ -67,6 +67,10 @@ def __init__(
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
         self.ov_config = ov_config if ov_config is not None else {}
+
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
+
         self.preprocessors = kwargs.get("preprocessors", [])
 
         if self.is_dynamic:
```

‎optimum/intel/openvino/modeling_diffusion.py

+3 −1 lines

```diff
@@ -101,6 +101,8 @@ def __init__(
         self._device = device.upper()
         self.is_dynamic = dynamic_shapes
         self.ov_config = ov_config if ov_config is not None else {}
+        if self.ov_config.get("PERFORMANCE_HINT") is None:
+            self.ov_config["PERFORMANCE_HINT"] = "LATENCY"
 
         # This attribute is needed to keep one reference on the temporary directory, since garbage collecting
         # would end-up removing the directory containing the underlying OpenVINO model
@@ -456,7 +458,7 @@ def to(self, device: str):
             self._device = device.upper()
             self.clear_requests()
         else:
-            logger.warning(f"device must be of type {str} but got {type(device)} instead")
+            logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
         return self
```

‎optimum/intel/openvino/modeling_seq2seq.py

+1 −1 lines

```diff
@@ -285,7 +285,7 @@ def to(self, device: str):
             self.decoder_with_past._device = self._device
             self.clear_requests()
         else:
-            logger.warning(f"device must be of type {str} but got {type(device)} instead")
+            logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
         return self
```

‎optimum/intel/version.py

+1 −1 lines

```diff
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.16.0.dev0"
+__version__ = "1.17.0.dev0"
```

‎setup.py

+3 −3 lines

```diff
@@ -28,8 +28,8 @@
 
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36.0,<4.39.0",
-    "optimum @ git+https://github.com/huggingface/optimum.git#egg=optimum",
+    "transformers>=4.36.0,<4.40.0",
+    "optimum~=1.18",
     "datasets>=1.4.0",
     "sentencepiece",
     "scipy",
@@ -61,7 +61,7 @@
     "openvino": ["openvino>=2023.3", "nncf>=2.8.1"],
     "openvino-tokenizers": ["openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.8.1"],
-    "ipex": ["intel-extension-for-pytorch"],
+    "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
```

‎tests/openvino/test_modeling.py

+28 −6 lines

```diff
@@ -116,6 +116,9 @@ def test_load_from_hub_and_save_model(self):
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         loaded_model = OVModelForSequenceClassification.from_pretrained(self.OV_MODEL_ID)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT"), "LATENCY")
         loaded_model_outputs = loaded_model(**tokens)
 
         # Test specifying ov_config with throughput hint and manual cache dir
@@ -134,7 +137,10 @@ def test_load_from_hub_and_save_model(self):
             folder_contents = os.listdir(tmpdirname)
             self.assertTrue(OV_XML_FILE_NAME in folder_contents)
             self.assertTrue(OV_XML_FILE_NAME.replace(".xml", ".bin") in folder_contents)
-            model = OVModelForSequenceClassification.from_pretrained(tmpdirname)
+            model = OVModelForSequenceClassification.from_pretrained(tmpdirname, ov_config={"NUM_STREAMS": 2})
+            # Test that PERFORMANCE_HINT is set to LATENCY by default even with ov_config provided
+            self.assertEqual(model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+            self.assertEqual(model.request.get_property("PERFORMANCE_HINT"), "LATENCY")
 
         outputs = model(**tokens)
         self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits))
@@ -150,6 +156,9 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache):
         tokens = tokenizer("This is a sample input", return_tensors="pt")
         loaded_model = OVModelForCausalLM.from_pretrained(model_id, use_cache=use_cache)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        self.assertEqual(loaded_model.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
         loaded_model_outputs = loaded_model(**tokens)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
@@ -172,6 +181,11 @@ def test_load_from_hub_and_save_seq2seq_model(self):
         loaded_model = OVModelForSeq2SeqLM.from_pretrained(self.OV_SEQ2SEQ_MODEL_ID, compile=False)
         self.assertIsInstance(loaded_model.config, PretrainedConfig)
         loaded_model.to("cpu")
+        loaded_model.compile()
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_model.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        self.assertEqual(loaded_model.decoder.request.get_compiled_model().get_property("PERFORMANCE_HINT"), "LATENCY")
+
         loaded_model_outputs = loaded_model.generate(**tokens)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
@@ -192,6 +206,10 @@
     def test_load_from_hub_and_save_stable_diffusion_model(self):
         loaded_pipeline = OVStableDiffusionPipeline.from_pretrained(self.OV_DIFFUSION_MODEL_ID, compile=False)
         self.assertIsInstance(loaded_pipeline.config, Dict)
+        # Test that PERFORMANCE_HINT is set to LATENCY by default
+        self.assertEqual(loaded_pipeline.ov_config.get("PERFORMANCE_HINT"), "LATENCY")
+        loaded_pipeline.compile()
+        self.assertEqual(loaded_pipeline.unet.request.get_property("PERFORMANCE_HINT"), "LATENCY")
         batch_size, height, width = 2, 16, 16
         np.random.seed(0)
         inputs = {
@@ -501,7 +519,8 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "qwen",
         "qwen2",
         "stablelm",
-        # "starcoder2", # TODO: enable with next transformers release
+        "starcoder2",
+        "phi",
     )
     GENERATION_LENGTH = 100
     IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
@@ -524,16 +543,15 @@ def test_compare_to_transformers(self, model_arch):
 
         model_kwargs = {}
         if model_arch in self.REMOTE_CODE_MODELS:
-            model_kwargs = {
-                "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
-                "trust_remote_code": True,
-            }
+            model_kwargs = {"trust_remote_code": True}
+
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
         self.assertEqual(
             ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful
         )
+        set_seed(SEED)
         transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         if model_arch == "qwen":
@@ -570,6 +588,10 @@ def test_pipeline(self, model_arch):
                 "trust_remote_code": True,
             }
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
+
+        if model_arch == "qwen":
+            tokenizer._convert_tokens_to_ids = lambda x: 0
+
         model = OVModelForCausalLM.from_pretrained(
             model_id, export=True, use_cache=False, compile=False, **model_kwargs
         )
```

‎tests/openvino/utils_tests.py

+1 line

```diff
@@ -70,6 +70,7 @@
     "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel",
     "pegasus": "hf-internal-testing/tiny-random-pegasus",
     "pix2struct": "fxmarty/pix2struct-tiny-random",
+    "phi": "hf-internal-testing/tiny-random-PhiForCausalLM",
     "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel",
     "qwen": "katuni4ka/tiny-random-qwen",
     "qwen2": "Qwen/Qwen1.5-0.5B",
```
