
Commit d867903: Merge branch 'main' into onnx

2 parents: 795a618 + 512d5c6

(Note: in large commits, GitHub hides some file contents and file names by default; a few file names below are therefore not shown.)

53 files changed: +1580, -1588 lines

.github/workflows/test_export_onnx_cli.yml (+14, -8)

```diff
@@ -2,9 +2,11 @@ name: Exporters ONNX CLI / Python - Test

 on:
   push:
-    branches: [main]
+    branches:
+      - main
   pull_request:
-    branches: [main]
+    branches:
+      - main

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -19,16 +21,20 @@ jobs:
         os: [ubuntu-20.04]

     runs-on: ${{ matrix.os }}
+
     steps:
-      - uses: actions/checkout@v2
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
       - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install dependencies for pytorch export
+
+      - name: Install dependencies
         run: |
           pip install .[tests,exporters,diffusers]
-      - name: Test with unittest
-        working-directory: tests
+
+      - name: Test with pytest
         run: |
-          pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
+          pytest tests/exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0
```
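
A quick note on the trigger change above: the two spellings of the branch filter are equivalent YAML, so that part of the change is purely cosmetic. A minimal sketch (hypothetical workflow, not part of this commit) showing both forms:

```yaml
# Both forms parse to the same list; the block sequence (adopted in this
# commit) yields one-line diffs when branches are added or removed later.
on:
  push:
    branches: [main]   # flow sequence: old style
  pull_request:
    branches:          # block sequence: new style
      - main
```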

.github/workflows/test_onnxruntime.yml (+6, -6)

```diff
@@ -1,12 +1,12 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 name: ONNX Runtime / Python - Test

 on:
   push:
-    branches: [main]
+    branches:
+      - main
   pull_request:
-    branches: [main]
+    branches:
+      - main

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -58,10 +58,10 @@ jobs:

       - name: Test with pytest (in series)
         run: |
-          pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv -s
+          pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv

       - name: Test with pytest (in parallel)
         run: |
-          pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto
+          pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -n auto
         env:
           HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
```
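
The concurrency stanza visible in the context lines above is shared across the workflows in this commit; a sketch of how it behaves:

```yaml
# For pull requests, github.head_ref groups runs by source branch, so a new
# push cancels the still-running tests for the previous commit. For events
# without a head_ref (push, schedule, workflow_dispatch), the unique run_id
# is used instead, which effectively disables cancellation for those runs.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
```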
+41, -17 (file name hidden)

```diff
@@ -1,30 +1,54 @@
-name: ONNX Runtime / Test GPU
+name: ONNX Runtime GPU / Python - Test

 on:
   workflow_dispatch:
   schedule:
-    - cron: 0 1 */3 * * # at 1am every 3 days
+    - cron: 0 7 * * * # every day at 7am UTC
   pull_request:
-    types: [opened, synchronize, reopened, labeled]
-  # uncomment to enable on PR merge on main branch:
-  #push:
-  #  branches:
-  #    - main
+    branches:
+      - main
+    types:
+      - opened
+      - labeled
+      - reopened
+      - unlabeled
+      - synchronize
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true

 jobs:
-  do-the-job:
-    if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }}
-    name: Start self-hosted EC2 runner
+  build:
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains(github.event.pull_request.labels.*.name, 'gpu') ||
+      contains(github.event.pull_request.labels.*.name, 'onnxruntime-gpu')
+      }}
+
     runs-on:
       group: aws-g6-4xlarge-plus
-    env:
-      AWS_REGION: us-east-1
+
+    container:
+      image: nvcr.io/nvidia/tensorrt:24.12-py3
+      options: --gpus all
+
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
-      - name: Build image
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
         run: |
-          docker build -f tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu -t onnxruntime-gpu .
-      - name: Test with unittest within docker container
+          pip install --upgrade pip
+          pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+          pip install .[tests,onnxruntime-gpu,diffusers]
+
+      - name: Test with pytest
         run: |
-          docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime-gpu:latest
+          pytest tests/onnxruntime -m "cuda_ep_test or trt_ep_test" --durations=0 -vvvv -n auto
```
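
The `if:` expression above implements label gating. A minimal standalone sketch of the same pattern (hypothetical workflow and label names, not part of this commit):

```yaml
on:
  workflow_dispatch:
  pull_request:
    # Listing 'labeled' and 'unlabeled' among the event types ensures the
    # gate below is re-evaluated whenever a label is added or removed.
    types: [opened, labeled, unlabeled, reopened, synchronize]

jobs:
  gpu-tests:
    # Skipped unless manually dispatched or the PR carries the trigger label.
    if: ${{
      (github.event_name == 'workflow_dispatch') ||
      contains(github.event.pull_request.labels.*.name, 'gpu')
      }}
    runs-on: ubuntu-latest
    steps:
      - run: echo "label gate passed"
```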
+37, -20 (file name hidden)

```diff
@@ -1,33 +1,50 @@
-name: ONNX Runtime slow / Python - Test
+name: ONNX Runtime Slow / Python - Test

 on:
   workflow_dispatch:
   schedule:
-    - cron: 0 7 * * * # every day at 7am
+    - cron: 0 7 * * * # every day at 7am UTC
+  pull_request:
+    branches:
+      - main
+    types:
+      - opened
+      - labeled
+      - reopened
+      - unlabeled
+      - synchronize

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true

 jobs:
   build:
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.9"]
-        os: [ubuntu-20.04]
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains(github.event.pull_request.labels.*.name, 'slow') ||
+      contains(github.event.pull_request.labels.*.name, 'onnxruntime-slow')
+      }}
+
+    runs-on:
+      group: aws-general-8-plus

-    runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies for export
-        run: |
-          pip install .[tests,onnxruntime,diffusers]
-      - name: Test with unittest
-        working-directory: tests
-        run: |
-          RUN_SLOW=1 pytest onnxruntime -s -m "run_slow" --durations=0
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python 3.9
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          pip install .[tests,onnxruntime,diffusers]
+
+      - name: Test with pytest
+        run: |
+          RUN_SLOW=1 pytest tests/onnxruntime -m "run_slow" --durations=0 -vvvv
```
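
Minor design note: `RUN_SLOW=1` is set inline in the shell command here; the step-level `env` block used elsewhere in this commit would work just as well. A hypothetical equivalent:

```yaml
      - name: Test with pytest
        env:
          RUN_SLOW: "1"   # same effect as prefixing the command with RUN_SLOW=1
        run: |
          pytest tests/onnxruntime -m "run_slow" --durations=0 -vvvv
```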

.github/workflows/test_onnxruntime_train.yml (-26)

This file was deleted.

+66 (new file, name hidden)

```diff
@@ -0,0 +1,66 @@
+name: ONNX Runtime Training / Python - Test
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: 0 7 * * * # every day at 7am UTC
+  pull_request:
+    branches:
+      - main
+    types:
+      - opened
+      - labeled
+      - reopened
+      - unlabeled
+      - synchronize
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    if: ${{
+      (github.event_name == 'push') ||
+      (github.event_name == 'workflow_dispatch') ||
+      contains( github.event.pull_request.labels.*.name, 'training') ||
+      contains( github.event.pull_request.labels.*.name, 'onnxruntime-training')
+      }}
+
+    runs-on:
+      group: aws-g6-4xlarge-plus
+
+    container:
+      image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+      options: --gpus all
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install dependencies
+        env:
+          TORCH_CUDA_ARCH_LIST: "5.0 6.0 7.0 7.5 8.0 8.6 9.0+PTX"
+        run: |
+          pip install --upgrade pip
+          pip install --no-cache-dir "torch<2.6" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+          pip install --no-cache-dir torch-ort onnxruntime-training && python -m torch_ort.configure
+          pip install --no-cache-dir evaluate absl-py rouge_score seqeval sacrebleu nltk scikit-learn
+          pip install .[tests,onnxruntime-training]
+
+      - name: Test with pytest (trainer)
+        run: |
+          RUN_SLOW=1 pytest tests/onnxruntime-training/test_trainer.py --durations=0 -vvvv
+        env:
+          HF_DATASETS_TRUST_REMOTE_CODE: 1
+
+      - name: Test with pytest (examples)
+        run: |
+          RUN_SLOW=1 pytest tests/onnxruntime-training/test_examples.py --durations=0 -vvvv
+        env:
+          HF_DATASETS_TRUST_REMOTE_CODE: 1
```
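
One scoping detail in the new training workflow: `TORCH_CUDA_ARCH_LIST` is declared at the step level, so it only affects the install step. A hypothetical job-level alternative, if later steps also needed to compile CUDA extensions:

```yaml
jobs:
  build:
    env:
      # Job-level variables are inherited by every step in the job.
      TORCH_CUDA_ARCH_LIST: "5.0 6.0 7.0 7.5 8.0 8.6 9.0+PTX"
```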

README.md (+7)

````diff
@@ -239,6 +239,13 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op

 ### ONNX Runtime

+
+Before you begin, make sure you have all the necessary libraries installed :
+
+```bash
+pip install optimum[onnxruntime-training]
+```
+
 ```diff
 - from transformers import Trainer, TrainingArguments
 + from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments
````

docs/source/bettertransformer/overview.mdx (+3, -3)

```diff
@@ -16,7 +16,7 @@ specific language governing permissions and limitations under the License.

 ## Quickstart

-Since its 1.13 version, [PyTorch released](https://pytorch.org/blog/PyTorch-1.13-release/) the stable version of a fast path for its standard Transformer APIs that provides out of the box performance improvements for transformer-based models. You can benefit from interesting speedup on most consumer-type devices, including CPUs, older and newer versions of NIVIDIA GPUs.
+Since its 1.13 version, [PyTorch released](https://pytorch.org/blog/PyTorch-1.13-release/) the stable version of a fast path for its standard Transformer APIs that provides out of the box performance improvements for transformer-based models. You can benefit from interesting speedup on most consumer-type devices, including CPUs, older and newer versions of NVIDIA GPUs.
 You can now use this feature in 🤗 Optimum together with Transformers and use it for major models in the Hugging Face ecosystem.

 In the 2.0 version, PyTorch includes a native scaled dot-product attention operator (SDPA) as part of `torch.nn.functional`. This function encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the [official documentation](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) for more information, and [this blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) for benchmarks.
@@ -54,13 +54,13 @@ The list of supported model below:
 - [DeiT](https://arxiv.org/abs/2012.12877)
 - [Electra](https://arxiv.org/abs/2003.10555)
 - [Ernie](https://arxiv.org/abs/1904.09223)
-- [Falcon](https://arxiv.org/abs/2306.01116) (No need to use BetterTransformer, it is [directy supported by Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention))
+- [Falcon](https://arxiv.org/abs/2306.01116) (No need to use BetterTransformer, it is [directly supported by Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention))
 - [FSMT](https://arxiv.org/abs/1907.06616)
 - [GPT2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
 - [GPT-j](https://huggingface.co/EleutherAI/gpt-j-6B)
 - [GPT-neo](https://github.com/EleutherAI/gpt-neo)
 - [GPT-neo-x](https://arxiv.org/abs/2204.06745)
-- [GPT BigCode](https://arxiv.org/abs/2301.03988) (SantaCoder, StarCoder - no need to use BetterTransformer, it is [directy supported by Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention))
+- [GPT BigCode](https://arxiv.org/abs/2301.03988) (SantaCoder, StarCoder - no need to use BetterTransformer, it is [directly supported by Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention))
 - [HuBERT](https://arxiv.org/pdf/2106.07447.pdf)
 - [LayoutLM](https://arxiv.org/abs/1912.13318)
 - [Llama & Llama2](https://arxiv.org/abs/2302.13971) (No need to use BetterTransformer, it is [directy supported by Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention)
```

docs/source/bettertransformer/tutorials/contribute.mdx (+2, -2)

````diff
@@ -112,7 +112,7 @@ Now, make sure to fill all the necessary attributes, the list of attributes are:

 Note that these attributes correspond to all the components that are necessary to run a Transformer Encoder module, check the figure 1 on the ["Attention Is All You Need"](https://arxiv.org/pdf/1706.03762.pdf) paper.

-Once you filled all these attributes (sometimes the `query`, `key` and `value` layers needs to be "contigufied", check the [`modeling_encoder.py`](https://github.com/huggingface/optimum/blob/main/optimum/bettertransformer/models/encoder_models.py) file to understand more.)
+Once you filled all these attributes (sometimes the `query`, `key` and `value` layers needs to be "contiguified", check the [`modeling_encoder.py`](https://github.com/huggingface/optimum/blob/main/optimum/bettertransformer/models/encoder_models.py) file to understand more.)

 Make sure also to add the lines:
 ```python
@@ -125,7 +125,7 @@ self.validate_bettertransformer()

 First of all, start with the line `super().forward_checker()`, this is needed so that the parent class can run all the safety checkers before.

-After the first forward pass, the hidden states needs to be *nested* using the attention mask. Once they are nested, the attention mask is not needed anymore, therefore can be set to `None`. This is how the forward pass is built for `Bert`, these lines should remain pretty much similar accross models, but sometimes the shapes of the attention masks are different across models.
+After the first forward pass, the hidden states needs to be *nested* using the attention mask. Once they are nested, the attention mask is not needed anymore, therefore can be set to `None`. This is how the forward pass is built for `Bert`, these lines should remain pretty much similar across models, but sometimes the shapes of the attention masks are different across models.
 ```python
 super().forward_checker()
````
docs/source/bettertransformer/tutorials/convert.mdx (+3, -3)

````diff
@@ -45,7 +45,7 @@ Sometimes you can directly load your model on your GPU devices using `accelerate

 ## Step 2: Set your model on your preferred device

-If you did not used `device_map="auto"` to load your model (or if your model does not support `device_map="auto"`), you can manually set your model to a GPU:
+If you did not use `device_map="auto"` to load your model (or if your model does not support `device_map="auto"`), you can manually set your model to a GPU:
 ```python
 >>> model = model.to(0) # or model.to("cuda:0")
 ```
@@ -92,7 +92,7 @@ You can also use `transformers.pipeline` as usual and pass the converted model d
 >>> ...
 ```

-Please refer to the [official documentation of `pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines) for further usage. If you face into any issue, do not hesitate to open an isse on GitHub!
+Please refer to the [official documentation of `pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines) for further usage. If you run into any issue, do not hesitate to open an issue on GitHub!

 ## Training compatibility

@@ -113,4 +113,4 @@ model = BetterTransformer.transform(model)
 model = BetterTransformer.reverse(model)
 model.save_pretrained("fine_tuned_model")
 model.push_to_hub("fine_tuned_model")
-```
+```
````

docs/source/onnxruntime/usage_guides/models.mdx (+1, -1)

```diff
@@ -16,7 +16,7 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op
 - from transformers import AutoModelForCausalLM
 + from optimum.onnxruntime import ORTModelForCausalLM

-- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint
+- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") # PyTorch checkpoint
 + model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
```
