huggingface
diff --git a/‎.github/workflows/test_inc.yml
+11-4 b/‎.github/workflows/test_inc.yml
+11-4
diff --git a/‎.github/workflows/test_ipex.yml
+1 b/‎.github/workflows/test_ipex.yml
+1
diff --git a/‎.github/workflows/test_offline.yaml
+40 b/‎.github/workflows/test_offline.yaml
+40
diff --git a/‎.github/workflows/test_openvino.yml
+5-1 b/‎.github/workflows/test_openvino.yml
+5-1
diff --git a/‎.github/workflows/test_openvino_examples.yml
+10-10 b/‎.github/workflows/test_openvino_examples.yml
+10-10
diff --git a/‎.github/workflows/test_openvino_notebooks.yml
+2-2 b/‎.github/workflows/test_openvino_notebooks.yml
+2-2
diff --git a/‎README.md
+10-3 b/‎README.md
+10-3
diff --git a/‎docs/source/optimization_ov.mdx
+3-2 b/‎docs/source/optimization_ov.mdx
+3-2
diff --git a/‎examples/neural_compressor/language-modeling/run_clm.py
+25-27 b/‎examples/neural_compressor/language-modeling/run_clm.py
+25-27
diff --git a/‎examples/openvino/audio-classification/requirements.txt
+2-1 b/‎examples/openvino/audio-classification/requirements.txt
+2-1
diff --git a/‎examples/openvino/audio-classification/run_audio_classification.py
+1-1 b/‎examples/openvino/audio-classification/run_audio_classification.py
+1-1
diff --git a/‎examples/openvino/image-classification/requirements.txt
+1 b/‎examples/openvino/image-classification/requirements.txt
+1
diff --git a/‎examples/openvino/image-classification/run_image_classification.py
+1-1 b/‎examples/openvino/image-classification/run_image_classification.py
+1-1
diff --git a/‎examples/openvino/question-answering/requirements.txt
+1 b/‎examples/openvino/question-answering/requirements.txt
+1
diff --git a/‎examples/openvino/question-answering/run_qa.py
+1-1 b/‎examples/openvino/question-answering/run_qa.py
+1-1
diff --git a/‎examples/openvino/question-answering/trainer_qa.py
+1-1 b/‎examples/openvino/question-answering/trainer_qa.py
+1-1
diff --git a/‎examples/openvino/text-classification/requirements.txt
+2-1 b/‎examples/openvino/text-classification/requirements.txt
+2-1
diff --git a/‎examples/openvino/text-classification/run_glue.py
+1-1 b/‎examples/openvino/text-classification/run_glue.py
+1-1
@@ -32,11 +32,18 @@ jobs:
         python -m pip install --upgrade pip
         pip install cmake
         pip install py-cpuinfo
-        pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
         pip install .[neural-compressor,diffusers,tests]
-        pip install intel-extension-for-pytorch==2.1.100
-        pip install intel-extension-for-transformers==1.3.2
+        pip install intel-extension-for-transformers
         pip install peft
+
     - name: Test with Pytest
       run: |
-        pytest tests/neural_compressor/
+        pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
+    - name: Test IPEX
+      run: |
+        pip uninstall -y intel-extension-for-transformers
+        pip install torch==2.1.0 torchaudio==2.1.0 torchvision==0.16 --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install intel-extension-for-pytorch==2.1.100
+        pytest tests/neural_compressor/test_ipex.py
+
@@ -30,6 +30,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
+        pip install torch==2.2 torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
         pip install .[ipex,tests]
     - name: Test with Pytest
       run: |
 
@@ -0,0 +1,40 @@
+name: Offline usage / Python - Test
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in progress
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9"]  # quoted so YAML keeps a string; a bare 3.10 would be read as the float 3.1
+        os: [ubuntu-latest]
+
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install .[tests,openvino]
+      - name: Test  # download the model first, then repeat each operation with HF_HUB_OFFLINE=1
+        run: |
+          HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2
+          HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation
+          # Same download/export pair without overriding HF_HOME.
+          huggingface-cli download hf-internal-testing/tiny-random-gpt2
+          HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation
+          # Run the hub-loading tests once normally, then again with HF_HUB_OFFLINE=1.
+          pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv
+          HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv
@@ -35,7 +35,11 @@ jobs:
         pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/ --ignore test_modeling_basic --durations=0
+        pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
+    - name: Test basic
+      run: |
+        pip uninstall -y nncf
+        pytest tests/openvino/test_modeling_basic.py
     - name: Test openvino-nightly
       run: |
         pip uninstall -y openvino
 
@@ -7,11 +7,11 @@ on:
   push:
     paths:
     - '.github/workflows/test_openvino_examples.yml'
-    - 'examples/openvino/*'
+    - 'examples/openvino/**'
   pull_request:
     paths:
     - '.github/workflows/test_openvino_examples.yml'
-    - 'examples/openvino/*'
+    - 'examples/openvino/**'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -22,9 +22,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.10"]
+        python-version: ["3.8", "3.11"]
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
     - uses: actions/checkout@v2
@@ -35,12 +35,12 @@ jobs:
 
     - name: Install dependencies
       run: |
-        pip install optimum[openvino] jstyleson nncf pytest
-        pip install -r examples/openvino/audio-classification/requirements.txt
-        pip install -r examples/openvino/image-classification/requirements.txt
-        pip install -r examples/openvino/question-answering/requirements.txt
-        pip install -r examples/openvino/text-classification/requirements.txt
+        pip install .[openvino] jstyleson pytest
+        pip install -r examples/openvino/audio-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -r examples/openvino/image-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -r examples/openvino/question-answering/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install -r examples/openvino/text-classification/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
     - name: Test examples
       run: |
-        python -m pytest examples/openvino/test_examples.py
+        python -m pytest examples/openvino/test_examples.py
@@ -23,9 +23,9 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.10"]
+        python-version: ["3.8", "3.11"]
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     steps:
     - uses: actions/checkout@v2
 
@@ -78,12 +78,18 @@ It is possible to export your model to the [OpenVINO IR](https://docs.openvino.a
 optimum-cli export openvino --model gpt2 ov_model
 ```
 
-You can also apply 8-bit weight-only quantization when exporting your model : the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision.
+You can also apply 8-bit weight-only quantization when exporting your model: the model's linear, embedding and convolution weights will be quantized to INT8, while the activations will be kept in floating-point precision.
 
 ```plain
 optimum-cli export openvino --model gpt2 --weight-format int8 ov_model
 ```
 
+Quantization in hybrid mode can be applied to a Stable Diffusion pipeline during model export. This involves applying hybrid post-training quantization to the UNet model and weight-only quantization to the rest of the pipeline components. In hybrid mode, the weights of MatMul and Embedding layers are quantized, as are the activations of the other layers.
+
+```plain
+optimum-cli export openvino --model stabilityai/stable-diffusion-2-1 --dataset conceptual_captions --weight-format int8 ov_model
+```
+
 To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov).
 
 #### Inference:
@@ -122,7 +128,7 @@ Post-training static quantization introduces an additional calibration step wher
 
 ```python
 from functools import partial
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
@@ -145,7 +151,8 @@ calibration_dataset = quantizer.get_calibration_dataset(
 # The directory where the quantized model will be saved
 save_dir = "nncf_results"
 # Apply static quantization and save the resulting model in the OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Load the quantized model
 optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 ```
 
@@ -84,7 +84,7 @@ Here is how to apply static quantization on a fine-tuned DistilBERT given your o
 
 ```python
 from transformers import AutoTokenizer
-from optimum.intel import OVQuantizer, OVModelForSequenceClassification,
+from optimum.intel import OVQuantizer, OVModelForSequenceClassification, OVConfig, OVQuantizationConfig
 
 model_id = "distilbert-base-uncased-finetuned-sst-2-english"
 model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
@@ -95,7 +95,8 @@ save_dir = "ptq_model"
 quantizer = OVQuantizer.from_pretrained(model)
 
 # Apply static quantization and export the resulting quantized model to OpenVINO IR format
-quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)
+ov_config = OVConfig(quantization_config=OVQuantizationConfig())
+quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=save_dir)
 # Save the tokenizer
 tokenizer.save_pretrained(save_dir)
 ```
 
@@ -57,15 +57,11 @@
 from transformers.utils.versions import require_version
 
 from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
-from optimum.intel.utils.import_utils import (
-    INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR,
-    is_intel_extension_for_transformers_available,
-)
-
+from optimum.intel.utils.import_utils import ITREX_IMPORT_ERROR, is_itrex_available
 
-if is_intel_extension_for_transformers_available():
-    from intel_extension_for_transformers.transformers.utils.config import WeightOnlyQuantConfig
 
+if is_itrex_available():
+    from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
@@ -227,8 +223,9 @@ class OptimizationArguments:
         metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
     )
     quantization_methodology: str = field(
-        default="RTN",
-        metadata={"help": "Quantization methodology for weight only quantization. Choose from 'RTN' and 'GPTQ'."},
+        default="rtn",
+        # field() rejects a `choices` keyword (TypeError), so the allowed values are declared through metadata.
+        metadata={"choices": ["rtn", "gptq"], "help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
     )
     damp_percent: float = field(
         default=0.01,
@@ -658,26 +655,27 @@ def compute_metrics(eval_preds):
             else:
                 recipes = {}
             if optim_args.quantization_approach == "weight_only":
-                if not is_intel_extension_for_transformers_available():
-                    raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
+                if not is_itrex_available():
+                    raise ImportError(ITREX_IMPORT_ERROR.format("WeightOnly quantization"))
                 if optim_args.apply_pruning or optim_args.apply_distillation:
                     raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
-                if optim_args.quantization_methodology == "GPTQ":
-                    algorithm_args = {
-                        "act_order": False,
-                        "percdamp": optim_args.damp_percent,
-                        "block_size": optim_args.gptq_block_size,
-                        "nsamples": optim_args.num_calibration_samples,
-                        "use_max_length": optim_args.use_max_length,
-                        "pad_max_length": optim_args.pad_max_length,
-                    }
-                quantization_config = WeightOnlyQuantConfig(
-                    weight_dtype=optim_args.weight_dtype,
-                    group_size=optim_args.group_size,
-                    scheme=optim_args.weight_only_scheme,
-                    algorithm=optim_args.quantization_methodology,
-                    algorithm_args=algorithm_args if optim_args.quantization_methodology == "GPTQ" else None,
-                )
+
+                algorithm_args = {
+                    "weight_dtype": optim_args.weight_dtype,
+                    "sym": optim_args.weight_only_scheme == "sym",
+                    "group_size": optim_args.group_size,
+                }
+
+                if optim_args.quantization_methodology == "gptq":
+                    quantization_config = GPTQConfig(
+                        damp_percent=optim_args.damp_percent,
+                        nsamples=optim_args.num_calibration_samples,
+                        blocksize=optim_args.gptq_block_size,
+                        **algorithm_args,
+                    )
+                else:
+                    quantization_config = RtnConfig(**algorithm_args)
+
             else:
                 quantization_config = PostTrainingQuantConfig(
                     approach=optim_args.quantization_approach, recipes=recipes
 
@@ -1,4 +1,5 @@
 datasets>=1.14.0
 evaluate
 librosa
-torchaudio
+torchaudio
+accelerate
@@ -35,7 +35,7 @@
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
-from optimum.intel.openvino import OVConfig, OVTrainer, OVTrainingArguments
+from optimum.intel import OVConfig, OVTrainer, OVTrainingArguments
 
 
 logger = logging.getLogger(__name__)
 
@@ -2,3 +2,4 @@ datasets >= 1.8.0
 torch >= 1.9.0
 torchvision>=0.6.0
 evaluate
+accelerate
@@ -52,7 +52,7 @@
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
-from optimum.intel.openvino import OVConfig, OVTrainer, OVTrainingArguments
+from optimum.intel import OVConfig, OVTrainer, OVTrainingArguments
 
 
 logger = logging.getLogger(__name__)
 
@@ -1,3 +1,4 @@
 datasets >= 1.8.0
 torch >= 1.9.0
 evaluate
+accelerate
@@ -49,7 +49,7 @@
 from transformers.utils.versions import require_version
 from utils_qa import postprocess_qa_predictions
 
-from optimum.intel.openvino import OVConfig, OVTrainingArguments
+from optimum.intel import OVConfig, OVTrainingArguments
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 
@@ -20,7 +20,7 @@
 import torch.nn.functional as F
 from transformers.trainer_utils import PredictionOutput
 
-from optimum.intel.openvino.trainer import OVTrainer
+from optimum.intel import OVTrainer
 
 
 class QuestionAnsweringOVTrainer(OVTrainer):
 
@@ -4,4 +4,5 @@ scipy
 scikit-learn
 protobuf
 torch >= 1.3
-evaluate
+evaluate
+accelerate
@@ -46,7 +46,7 @@
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
 
-from optimum.intel.openvino import OVConfig, OVTrainer, OVTrainingArguments
+from optimum.intel import OVConfig, OVTrainer, OVTrainingArguments
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.