
Commit 6815773

Merge branch 'main' into cli-awq

2 parents e8cc0e9 + f06f504


42 files changed: +1191 −209 lines

.github/workflows/test_inc.yml (+1 −2)

@@ -32,7 +32,7 @@ jobs:
 python -m pip install --upgrade pip
 pip install cmake
 pip install py-cpuinfo
-pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cpu
+pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu
 pip install .[neural-compressor,diffusers,tests]
 pip install intel-extension-for-transformers
 pip install peft
@@ -43,7 +43,6 @@ jobs:
 - name: Test IPEX
 run: |
 pip uninstall -y intel-extension-for-transformers
-pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --extra-index-url https://download.pytorch.org/whl/cpu
 pip install intel-extension-for-pytorch==2.3.0
 pytest tests/neural_compressor/test_ipex.py

.github/workflows/test_ipex.yml (+2 −0)

@@ -18,6 +18,7 @@ jobs:
 fail-fast: false
 matrix:
 python-version: [3.8, 3.9]
+transformers-version: [4.39.0, 4.41.2]
 os: [ubuntu-latest]

 runs-on: ${{ matrix.os }}
@@ -32,6 +33,7 @@ jobs:
 python -m pip install --upgrade pip
 pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu
 pip install .[ipex,tests]
+pip install transformers==${{ matrix.transformers-version }}
 - name: Test with Pytest
 run: |
 pytest tests/ipex/
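
A quick way to confirm which pin from the new matrix actually landed in a given job's environment is an interpreter check; a minimal sketch (the expected values come from the matrix above, nothing else is assumed):

```python
# Sanity check for the CI matrix above: print which transformers release
# is installed (expected 4.39.0 or 4.41.2, depending on the matrix entry).
import transformers

print(transformers.__version__)
```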

.github/workflows/test_openvino.yml (+4 −3)

@@ -17,14 +17,14 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: [3.8, 3.11]
+python-version: ["3.8", "3.12"]
 os: [ubuntu-latest]

 runs-on: ${{ matrix.os }}
 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: Setup Python ${{ matrix.python-version }}
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}
 - name: Install dependencies
@@ -46,3 +46,4 @@ jobs:
 pip install openvino-nightly
 python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
 optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
+
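
For context, the `python -c` one-liner above is the workflow's export smoke test; spelled out as a script it reads as follows (same call and arguments as the workflow, no additions):

```python
from optimum.intel import OVModelForCausalLM

# Export a tiny test checkpoint to OpenVINO on the fly; compile=False skips
# device compilation, so only the export path itself is exercised.
model = OVModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-gpt2",
    export=True,
    compile=False,
)
```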

.github/workflows/test_openvino_basic.yml (+13 −8)

@@ -24,16 +24,16 @@ jobs:
 matrix:
 # Testing lower and upper bound of supported Python versions
 # This also ensures that the test fails if dependencies break for Python 3.7
-python-version: ["3.8", "3.11"]
-transformers: ['transformers']
+python-version: ["3.8", "3.12"]
 optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git']
+os: ["ubuntu-22.04", "windows-latest"]

-runs-on: ubuntu-20.04
+runs-on: ${{ matrix.os }}

 steps:
-- uses: actions/checkout@v2
+- uses: actions/checkout@v4
 - name: Setup Python ${{ matrix.python-version }}
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: ${{ matrix.python-version }}

@@ -43,12 +43,17 @@ jobs:
 # optimum or transformers to a specific version
 # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
 pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }}
+pip install .[tests] openvino onnxruntime ${{ matrix.optimum}}

-- name: Pip freeze
+- name: Pip freeze
 run: pip freeze

 - name: Test with Pytest
 run: |
 pytest tests/openvino/test_modeling_basic.py
-RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
+
+- name: Slow tests
+run: |
+pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
+env:
+RUN_SLOW: 1
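
For context on the `-m "run_slow"` filter used above: pytest selects only tests carrying that marker, while the `env:` block exports `RUN_SLOW=1` for the step. A hedged sketch of a test gated this way (the marker usage and test name are illustrative, not part of this commit):

```python
import os

import pytest

# Hypothetical slow test selected by `pytest -m "run_slow"`. It additionally
# skips itself unless the workflow exported RUN_SLOW=1 for this step.
@pytest.mark.run_slow
@pytest.mark.skipif(os.environ.get("RUN_SLOW") != "1", reason="set RUN_SLOW=1 to run")
def test_modeling_slow_path():
    ...
```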

.github/workflows/test_openvino_examples.yml (+1 −1)

@@ -22,7 +22,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.8", "3.11"]
+python-version: ["3.8", "3.12"]

 runs-on: ubuntu-22.04

.github/workflows/test_openvino_notebooks.yml (+1 −1)

@@ -23,7 +23,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-python-version: ["3.8", "3.11"]
+python-version: ["3.8", "3.12"]

 runs-on: ubuntu-22.04

README.md (+5 −0)

@@ -239,3 +239,8 @@ Do not forget to install requirements for every example:
 cd <example-folder>
 pip install -r requirements.txt
 ```
+
+
+## Gaudi
+
+To train your model on [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html), check out [Optimum Habana](https://github.com/huggingface/optimum-habana), which provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks. After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard), which is designed to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel hardware. Models submitted to the leaderboard will be evaluated on the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness.

docs/source/inference.mdx (+14 −4)

@@ -28,8 +28,12 @@ As shown in the table below, each task is associated with a class enabling to automatically load your model.
 | `image-classification` | `OVModelForImageClassification` |
 | `feature-extraction` | `OVModelForFeatureExtraction` |
 | `fill-mask` | `OVModelForMaskedLM` |
-| `text-generation` | `OVModelForCausalLM` |
-| `text2text-generation` | `OVModelForSeq2SeqLM` |
+| `image-classification` | `OVModelForImageClassification` |
+| `audio-classification` | `OVModelForAudioClassification` |
+| `text-generation-with-past` | `OVModelForCausalLM` |
+| `text2text-generation-with-past` | `OVModelForSeq2SeqLM` |
+| `automatic-speech-recognition` | `OVModelForSpeechSeq2Seq` |
+| `image-to-text` | `OVModelForVision2Seq` |


 ### Export
@@ -42,14 +46,20 @@ optimum-cli export openvino --model gpt2 ov_model

 The example above illustrates exporting a checkpoint from the 🤗 Hub. When exporting a local model, first make sure that you saved both the model's weights and tokenizer files in the same directory (`local_path`).
 When using the CLI, pass the `local_path` to the model argument instead of the checkpoint name of the model hosted on the Hub and provide the `--task` argument. You can review the list of supported tasks in the 🤗 [Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). If the task argument is not provided, it will default to the model architecture without any task-specific head.
-Here we set the `task` to `text-generation-with-past`, with the `-with-past` suffix enabling the re-use of the pre-computed key/values hidden-states `use_cache=True`.
+The `-with-past` suffix enables the re-use of the pre-computed key/values hidden-states and is the recommended option. To export the model without them (equivalent to `use_cache=False`), remove this suffix.

 ```bash
 optimum-cli export openvino --model local_path --task text-generation-with-past ov_model
 ```

 To export your model in fp16, you can add `--weight-format fp16` when exporting your model.

+<Tip warning={true}>
+
+Models larger than 1 billion parameters are exported to the OpenVINO format with 8-bit weights by default. You can disable it with `--weight-format fp32`.
+
+</Tip>
+
 Once the model is exported, you can load the OpenVINO model using:

 ```python
@@ -126,7 +136,7 @@ model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)

 <Tip warning={true}>

-`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`.
+If not specified, `load_in_8bit` will be set to `True` by default when models larger than 1 billion parameters are exported to the OpenVINO format (with `export=True`). You can disable it with `load_in_8bit=False`.

 </Tip>
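
To make the two defaults described in the documentation changes above concrete, here is a minimal sketch combining them (assuming the OpenVINO extras of optimum-intel are installed; `ov_model` is just a local output directory):

```python
from optimum.intel import OVModelForCausalLM

# export=True converts the checkpoint to the OpenVINO format on the fly;
# load_in_8bit=False opts out of the 8-bit weight compression that would
# otherwise apply by default to models larger than 1 billion parameters.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=False)
model.save_pretrained("ov_model")
```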

docs/source/optimization_ov.mdx (+1 −1)

@@ -44,7 +44,7 @@ model.save_pretrained(saving_directory)

 <Tip warning={true}>

-`load_in_8bit` is enabled by default for the models larger than 1 billion parameters. You can disable it with `load_in_8bit=False`.
+If not specified, `load_in_8bit` will be set to `True` by default when models larger than 1 billion parameters are exported to the OpenVINO format (with `export=True`). You can disable it with `load_in_8bit=False`.

 </Tip>

docs/source/reference_ov.mdx (+74 −17)

@@ -14,56 +14,113 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->

-# Reference
+# Models

-## OVModelForFeatureExtraction
+## Natural Language Processing

-[[autodoc]] openvino.modeling.OVModelForFeatureExtraction
+The following classes are available for the following natural language processing tasks.
+
+### OVModelForCausalLM
+
+[[autodoc]] openvino.modeling_decoder.OVModelForCausalLM
+    - forward
+    - generate

-## OVModelForMaskedLM
+### OVModelForMaskedLM

 [[autodoc]] openvino.modeling.OVModelForMaskedLM
+    - forward
+
+### OVModelForSeq2SeqLM
+
+[[autodoc]] openvino.modeling_seq2seq.OVModelForSeq2SeqLM
+    - forward

-## OVModelForQuestionAnswering
+### OVModelForQuestionAnswering

 [[autodoc]] openvino.modeling.OVModelForQuestionAnswering
+    - forward

-## OVModelForSequenceClassification
+### OVModelForSequenceClassification

 [[autodoc]] openvino.modeling.OVModelForSequenceClassification
+    - forward

-## OVModelForTokenClassification
+### OVModelForTokenClassification

 [[autodoc]] openvino.modeling.OVModelForTokenClassification
+    - forward

-## OVModelForAudioClassification
+
+## Audio
+
+The following classes are available for the following audio tasks.
+
+### OVModelForAudioClassification

 [[autodoc]] openvino.modeling.OVModelForAudioClassification
+    - forward

-## OVModelForAudioFrameClassification
+### OVModelForAudioFrameClassification

 [[autodoc]] openvino.modeling.OVModelForAudioFrameClassification
+    - forward

-## OVModelForCTC
+### OVModelForCTC

 [[autodoc]] openvino.modeling.OVModelForCTC
+    - forward

-## OVModelForAudioXVector
+### OVModelForAudioXVector

 [[autodoc]] openvino.modeling.OVModelForAudioXVector
+    - forward
+
+### OVModelForSpeechSeq2Seq
+
+[[autodoc]] openvino.modeling_seq2seq.OVModelForSpeechSeq2Seq
+    - forward
+
+
+## Computer Vision

-## OVModelForImageClassification
+The following classes are available for the following computer vision tasks.
+
+### OVModelForImageClassification

 [[autodoc]] openvino.modeling.OVModelForImageClassification
+    - forward

-## OVModelForCausalLM

-[[autodoc]] openvino.modeling_decoder.OVModelForCausalLM
+## Multimodal

-## OVModelForSeq2SeqLM
+The following classes are available for the following multimodal tasks.

-[[autodoc]] openvino.modeling_seq2seq.OVModelForSeq2SeqLM
+### OVModelForVision2Seq
+
+[[autodoc]] openvino.modeling_seq2seq.OVModelForVision2Seq
+    - forward
+
+### OVModelForPix2Struct
+
+[[autodoc]] openvino.modeling_seq2seq.OVModelForPix2Struct
+    - forward
+
+## Custom Tasks
+
+### OVModelForCustomTasks
+
+[[autodoc]] openvino.modeling.OVModelForCustomTasks
+    - forward
+
+### OVModelForFeatureExtraction
+
+[[autodoc]] openvino.modeling.OVModelForFeatureExtraction
+    - forward
+
+
+# Quantization

-## OVQuantizer
+### OVQuantizer

 [[autodoc]] openvino.quantization.OVQuantizer

examples/openvino/image-classification/configs/swin-base-jpqd.json (−2)

@@ -36,8 +36,6 @@
 "ignored_scopes": [
 "{re}.*__add___[0-1]",
 "{re}.*layer_norm_0",
-"{re}.*matmul_1",
-"{re}.*__truediv__*"
 ]
 }
 ]

examples/openvino/question-answering/configs/bert-base-jpqd.json (−2)

@@ -36,8 +36,6 @@
 "ignored_scopes": [
 "{re}.*__add___[0-1]",
 "{re}.*layer_norm_0",
-"{re}.*matmul_1",
-"{re}.*__truediv__*"
 ]
 }
 ]

examples/openvino/text-classification/configs/bert-base-jpqd.json (−2)

@@ -40,8 +40,6 @@
 "ignored_scopes": [
 "{re}.*__add___[0-1]",
 "{re}.*layer_norm_0",
-"{re}.*matmul_1",
-"{re}.*__truediv__*"
 ]
 }
 ]

notebooks/openvino/quantized_generation_demo.ipynb (+1 −1)

@@ -32,7 +32,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# ! pip install optimum[openvino,nncf] torch"
+"# ! pip install optimum[openvino,nncf] torch==2.2.2"
 ]
 },
 {

optimum/exporters/ipex/model_patcher.py (+7 −4)

@@ -23,6 +23,7 @@
 from optimum.intel.utils.import_utils import is_ipex_version

 from .modeling_utils import (
+    _IPEX_MINIMUM_VERSION_FOR_PATCHING,
     _IPEXLlamaDecoderLayerRef,
     _llama_attn_forward,
     _llama_layer_norm_forward,
@@ -62,18 +63,20 @@ def patch_op(m, target_m, new_op_name, new_op):


 def _patch_llama_model(model):
-    if is_ipex_version("<", "2.5.0"):
-        raise ImportError("Only ipex version > 2.3.0 supports RotaryEmbedding and IndirectAccessKVCache")
+    if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
+        raise ImportError(
+            f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports RotaryEmbedding and IndirectAccessKVCacheAttention"
+        )

-    from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCache, RotaryEmbedding
+    from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, RotaryEmbedding

     ipex_rope = RotaryEmbedding(
         model.config.max_position_embeddings,
         model.config.hidden_size // model.config.num_attention_heads,
         model.config.rope_theta,
         model.config.architectures[0],
     )
-    ipex_scale_dot_product = IndirectAccessKVCache(text_max_length=model.config.max_position_embeddings)
+    ipex_scale_dot_product = IndirectAccessKVCacheAttention(text_max_length=model.config.max_position_embeddings)
     patch_op(model, LlamaAttention, "ipex_rope", ipex_rope)
     patch_op(model, LlamaAttention, "ipex_scale_dot_product", ipex_scale_dot_product)
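
The guard introduced above uses `is_ipex_version` to compare the installed IPEX release against a minimum before patching. A minimal sketch of the same pattern (the hard-coded `"2.3.0"` and the `require_min_ipex` helper are illustrative assumptions; the commit imports the real constant from `.modeling_utils`):

```python
from optimum.intel.utils.import_utils import is_ipex_version

# Illustrative stand-in for the constant imported from .modeling_utils.
_IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0"  # assumed value


def require_min_ipex(feature: str) -> None:
    # Hypothetical helper mirroring the guard in _patch_llama_model: refuse
    # to patch when the installed IPEX predates the minimum version.
    if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
        raise ImportError(
            f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports {feature}"
        )


require_min_ipex("RotaryEmbedding and IndirectAccessKVCacheAttention")
```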
