diff --git a/.github/workflows/test_export_onnx_cli.yml b/.github/workflows/test_export_onnx_cli.yml index 41394fef92..13c92c7561 100644 --- a/.github/workflows/test_export_onnx_cli.yml +++ b/.github/workflows/test_export_onnx_cli.yml @@ -2,9 +2,11 @@ name: Exporters ONNX CLI / Python - Test on: push: - branches: [main] + branches: + - main pull_request: - branches: [main] + branches: + - main concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -19,16 +21,20 @@ jobs: os: [ubuntu-20.04] runs-on: ${{ matrix.os }} + steps: - - uses: actions/checkout@v2 + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export + + - name: Install dependencies run: | pip install .[tests,exporters,diffusers] - - name: Test with unittest - working-directory: tests + + - name: Test with pytest run: | - pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0 + pytest tests/exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0 diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 55abfd5683..bf7f15e263 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -1,12 +1,12 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: ONNX Runtime / Python - Test on: push: - branches: [main] + branches: + - main pull_request: - branches: [main] + branches: + - main concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -58,10 +58,10 @@ jobs: - name: Test with pytest (in series) run: | - pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv -s + pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv - name: Test with pytest (in parallel) run: | - pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto + pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -n auto env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} diff --git a/.github/workflows/test_onnxruntime_gpu.yml b/.github/workflows/test_onnxruntime_gpu.yml index e2337de710..45c9bb89b7 100644 --- a/.github/workflows/test_onnxruntime_gpu.yml +++ b/.github/workflows/test_onnxruntime_gpu.yml @@ -1,30 +1,54 @@ -name: ONNX Runtime / Test GPU +name: ONNX Runtime GPU / Python - Test on: workflow_dispatch: schedule: - - cron: 0 1 */3 * * # at 1am every 3 days + - cron: 0 7 * * * # every day at 7am UTC pull_request: - types: [opened, synchronize, reopened, labeled] - # uncomment to enable on PR merge on main branch: - #push: - # branches: - # - main + branches: + - main + types: + - opened + - labeled + - reopened + - unlabeled + - synchronize + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true jobs: - do-the-job: - if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }} - name: Start self-hosted EC2 runner + build: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 
'workflow_dispatch') || + contains(github.event.pull_request.labels.*.name, 'gpu') || + contains(github.event.pull_request.labels.*.name, 'onnxruntime-gpu') + }} + runs-on: group: aws-g6-4xlarge-plus - env: - AWS_REGION: us-east-1 + + container: + image: nvcr.io/nvidia/tensorrt:24.12-py3 + options: --gpus all + steps: - name: Checkout - uses: actions/checkout@v2 - - name: Build image + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Install dependencies run: | - docker build -f tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu -t onnxruntime-gpu . - - name: Test with unittest within docker container + pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 + pip install .[tests,onnxruntime-gpu,diffusers] + + - name: Test with pytest run: | - docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime-gpu:latest + pytest tests/onnxruntime -m "cuda_ep_test or trt_ep_test" --durations=0 -vvvv -n auto diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml index 031aeceea2..603b44c4fe 100644 --- a/.github/workflows/test_onnxruntime_slow.yml +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -1,9 +1,18 @@ -name: ONNX Runtime slow / Python - Test +name: ONNX Runtime Slow / Python - Test on: workflow_dispatch: schedule: - - cron: 0 7 * * * # every day at 7am + - cron: 0 7 * * * # every day at 7am UTC + pull_request: + branches: + - main + types: + - opened + - labeled + - reopened + - unlabeled + - synchronize concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -11,23 +20,31 @@ concurrency: jobs: build: - strategy: - fail-fast: false - matrix: - python-version: ["3.9"] - os: [ubuntu-20.04] + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains(github.event.pull_request.labels.*.name, 'slow') || + contains(github.event.pull_request.labels.*.name, 'onnxruntime-slow') + }} + + runs-on: + group: aws-general-8-plus - runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for export - run: | - pip install .[tests,onnxruntime,diffusers] - - name: Test with unittest - working-directory: tests - run: | - RUN_SLOW=1 pytest onnxruntime -s -m "run_slow" --durations=0 + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[tests,onnxruntime,diffusers] + + - name: Test with pytest + run: | + RUN_SLOW=1 pytest tests/onnxruntime -m "run_slow" --durations=0 -vvvv diff --git a/.github/workflows/test_onnxruntime_train.yml b/.github/workflows/test_onnxruntime_train.yml deleted file mode 100644 index 09a3a2090b..0000000000 --- a/.github/workflows/test_onnxruntime_train.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: ONNX Runtime / Test ORTTrainer - -on: - workflow_dispatch: - schedule: - - cron: 0 1 */3 * * # at 1am every 3 days - pull_request: - types: [opened, synchronize, reopened, labeled] - -jobs: - do-the-job: - 
if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'training')}} - name: Run ORTTrainer test - runs-on: - group: aws-g6-4xlarge-plus - env: - AWS_REGION: us-east-1 - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Build image - run: | - docker build -f tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer -t onnxruntime/train . - - name: Run test within docker container - run: | - docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime/train:latest diff --git a/.github/workflows/test_onnxruntime_training.yml b/.github/workflows/test_onnxruntime_training.yml new file mode 100644 index 0000000000..c4b4348bcd --- /dev/null +++ b/.github/workflows/test_onnxruntime_training.yml @@ -0,0 +1,66 @@ +name: ONNX Runtime Training / Python - Test + +on: + workflow_dispatch: + schedule: + - cron: 0 7 * * * # every day at 7am UTC + pull_request: + branches: + - main + types: + - opened + - labeled + - reopened + - unlabeled + - synchronize + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'training') || + contains( github.event.pull_request.labels.*.name, 'onnxruntime-training') + }} + + runs-on: + group: aws-g6-4xlarge-plus + + container: + image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 + options: --gpus all + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Install dependencies + env: + TORCH_CUDA_ARCH_LIST: "5.0 6.0 7.0 7.5 8.0 8.6 9.0+PTX" + run: | + pip install --upgrade pip + pip install --no-cache-dir "torch<2.6" torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + pip install --no-cache-dir torch-ort onnxruntime-training && python -m torch_ort.configure + pip install --no-cache-dir evaluate absl-py rouge_score seqeval sacrebleu nltk scikit-learn + pip install .[tests,onnxruntime-training] + + - name: Test with pytest (trainer) + run: | + RUN_SLOW=1 pytest tests/onnxruntime-training/test_trainer.py --durations=0 -vvvv + env: + HF_DATASETS_TRUST_REMOTE_CODE: 1 + + - name: Test with pytest (examples) + run: | + RUN_SLOW=1 pytest tests/onnxruntime-training/test_examples.py --durations=0 -vvvv + env: + HF_DATASETS_TRUST_REMOTE_CODE: 1 diff --git a/examples/onnxruntime/training/image-classification/run_image_classification.py b/examples/onnxruntime/training/image-classification/run_image_classification.py index c5d5aabe27..c2bcb86aa0 100644 --- a/examples/onnxruntime/training/image-classification/run_image_classification.py +++ b/examples/onnxruntime/training/image-classification/run_image_classification.py @@ -333,6 +333,7 @@ def compute_metrics(p): token=model_args.token, trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + attn_implementation="eager", ) image_processor = AutoImageProcessor.from_pretrained( model_args.image_processor_name or model_args.model_name_or_path, diff --git a/examples/onnxruntime/training/language-modeling/run_clm.py b/examples/onnxruntime/training/language-modeling/run_clm.py index 10c0622ec9..9481e182a1 100644 --- a/examples/onnxruntime/training/language-modeling/run_clm.py +++ 
b/examples/onnxruntime/training/language-modeling/run_clm.py @@ -442,9 +442,12 @@ def main(): trust_remote_code=model_args.trust_remote_code, torch_dtype=torch_dtype, low_cpu_mem_usage=model_args.low_cpu_mem_usage, + attn_implementation="eager", ) else: - model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) + model = AutoModelForCausalLM.from_config( + config, trust_remote_code=model_args.trust_remote_code, attn_implementation="eager" + ) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") diff --git a/examples/onnxruntime/training/language-modeling/run_mlm.py b/examples/onnxruntime/training/language-modeling/run_mlm.py index d032210fa5..e25c70a297 100755 --- a/examples/onnxruntime/training/language-modeling/run_mlm.py +++ b/examples/onnxruntime/training/language-modeling/run_mlm.py @@ -430,10 +430,13 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, low_cpu_mem_usage=model_args.low_cpu_mem_usage, + attn_implementation="eager", ) else: logger.info("Training new model from scratch") - model = AutoModelForMaskedLM.from_config(config, trust_remote_code=model_args.trust_remote_code) + model = AutoModelForMaskedLM.from_config( + config, trust_remote_code=model_args.trust_remote_code, attn_implementation="eager" + ) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. diff --git a/examples/onnxruntime/training/question-answering/run_qa.py b/examples/onnxruntime/training/question-answering/run_qa.py index 08b581a1a8..c63f7f6a19 100644 --- a/examples/onnxruntime/training/question-answering/run_qa.py +++ b/examples/onnxruntime/training/question-answering/run_qa.py @@ -364,6 +364,7 @@ def main(): revision=model_args.model_revision, token=model_args.token, trust_remote_code=model_args.trust_remote_code, + attn_implementation="eager", ) # Tokenizer check: this script requires a fast tokenizer. 
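Note (editor, not part of the patch): every ONNX Runtime training example script patched above now passes attn_implementation="eager" when the model is instantiated. A minimal sketch of that loading pattern follows, using a hypothetical checkpoint name that does not come from the patch; the motivation, presumably that the SDPA attention path selected by recent transformers releases does not export cleanly through ORTModule, is an assumption and is not stated in the diff.

    from transformers import AutoConfig, AutoModelForSequenceClassification

    model_name = "bert-base-uncased"  # hypothetical checkpoint, not taken from the patch
    config = AutoConfig.from_pretrained(model_name)

    # Same kwarg the patch adds to every example script: force the eager attention
    # implementation instead of the library default.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        config=config,
        attn_implementation="eager",
    )
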
diff --git a/examples/onnxruntime/training/summarization/run_summarization.py b/examples/onnxruntime/training/summarization/run_summarization.py index 83ec61f225..c6a80e626d 100644 --- a/examples/onnxruntime/training/summarization/run_summarization.py +++ b/examples/onnxruntime/training/summarization/run_summarization.py @@ -458,6 +458,7 @@ def main(): revision=model_args.model_revision, token=model_args.token, trust_remote_code=model_args.trust_remote_code, + attn_implementation="eager", ) if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): diff --git a/examples/onnxruntime/training/text-classification/run_classification.py b/examples/onnxruntime/training/text-classification/run_classification.py index 6600e26c36..1edcc3a999 100755 --- a/examples/onnxruntime/training/text-classification/run_classification.py +++ b/examples/onnxruntime/training/text-classification/run_classification.py @@ -527,6 +527,7 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + attn_implementation="eager", ) model.config.pad_token_id = model.config.eos_token_id diff --git a/examples/onnxruntime/training/text-classification/run_glue.py b/examples/onnxruntime/training/text-classification/run_glue.py index f3f04657af..27e14199b4 100644 --- a/examples/onnxruntime/training/text-classification/run_glue.py +++ b/examples/onnxruntime/training/text-classification/run_glue.py @@ -404,6 +404,7 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + attn_implementation="eager", ) # Preprocessing the raw_datasets diff --git a/examples/onnxruntime/training/token-classification/run_ner.py b/examples/onnxruntime/training/token-classification/run_ner.py index 55ddfa2cf0..102249fc51 100644 --- a/examples/onnxruntime/training/token-classification/run_ner.py +++ b/examples/onnxruntime/training/token-classification/run_ner.py @@ -405,6 +405,7 @@ def get_label_list(labels): token=model_args.token, trust_remote_code=model_args.trust_remote_code, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, + attn_implementation="eager", ) if tokenizer.pad_token is None: diff --git a/examples/onnxruntime/training/translation/run_translation.py b/examples/onnxruntime/training/translation/run_translation.py index 0b6a36d12f..f54246be33 100644 --- a/examples/onnxruntime/training/translation/run_translation.py +++ b/examples/onnxruntime/training/translation/run_translation.py @@ -408,6 +408,7 @@ def main(): revision=model_args.model_revision, token=model_args.token, trust_remote_code=model_args.trust_remote_code, + attn_implementation="eager", ) # Set decoder_start_token_id diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index e6618568c0..43468a15c0 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -154,7 +154,7 @@ class OnnxConfig(ExportConfig, ABC): "feature-extraction": OrderedDict({"last_hidden_state": {0: "batch_size", 1: "sequence_length"}}), "fill-mask": OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}}), "image-classification": OrderedDict({"logits": {0: "batch_size"}}), - "image-segmentation": OrderedDict({"logits": {0: "batch_size", 1: "num_labels", 2: "height", 3: "width"}}), + "image-segmentation": OrderedDict({"logits": {0: "batch_size", 2: "height", 3: "width"}}), "image-to-text": OrderedDict({"logits": {0: 
"batch_size", 1: "sequence_length"}}), "image-to-image": OrderedDict( {"reconstruction": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}} diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 78e1ca418d..f420ab39c6 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -52,6 +52,8 @@ FalconDummyPastKeyValuesGenerator, GemmaDummyPastKeyValuesGenerator, GPTBigCodeDummyPastKeyValuesGenerator, + LongformerDummyTextInputGenerator, + MCTCTDummyAudioInputGenerator, MistralDummyPastKeyValuesGenerator, NormalizedConfig, NormalizedEncoderDecoderConfig, @@ -171,6 +173,19 @@ class RemBertOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class LongformerOnnxConfig(BertOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (LongformerDummyTextInputGenerator,) + DEFAULT_ONNX_OPSET = 14 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + inputs = super().inputs + + inputs["global_attention_mask"] = inputs["attention_mask"] + + return inputs + + class MegatronBertOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 11 @@ -776,7 +791,6 @@ def flatten_past_key_values(self, flattened_output, name, idx, t): class BartOnnxConfig(M2M100OnnxConfig): DEFAULT_ONNX_OPSET = 14 # Bart now uses F.scaled_dot_product_attention by default for torch>=2.1.1. MIN_TORCH_VERSION = version.parse("2.1.2") - pass class MBartOnnxConfig(BartOnnxConfig): @@ -791,21 +805,19 @@ class BlenderbotSmallOnnxConfig(BartOnnxConfig): pass -# big_bird and bigbird_pegasus are unsupported for now as block sparse attention is written in pure python and numpy in transformers. -# Thus, the case attention_type == "block_sparse" is unusable. -# Even with rewritting this part in pure PyTorch, torch.onnx.export is then prohibitively slow. 
-# References: https://github.com/pytorch/pytorch/issues/63734 & https://github.com/pytorch/pytorch/issues/94821 -""" class BigBirdOnnxConfig(DistilBertOnnxConfig): pass + class BigBirdPegasusOnnxConfig(BartOnnxConfig): - def generate_dummy_inputs_for_validation(self, reference_model_inputs: Dict[str, Any]) -> Dict[str, Any]: - if self._behavior is ConfigBehavior.ENCODER: - # TODO: check why the attention mask is not present in the exported model - reference_model_inputs.pop("attention_mask") - return super().generate_dummy_inputs_for_validation(reference_model_inputs) -""" + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + inputs = super().inputs + if self._config.attention_type == "block_sparse": + # BigBirdPegasusEncoder creates its own attention_mask internally + # https://github.com/huggingface/transformers/blob/v4.48.0/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py#L1875 + inputs.pop("attention_mask", None) + return inputs class PegasusOnnxConfig(BartOnnxConfig): @@ -828,8 +840,10 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs = super().outputs + if self.task == "feature-extraction": common_outputs["last_hidden_state"] = {0: "batch_size"} + return common_outputs @@ -981,7 +995,14 @@ class PoolFormerOnnxConfig(ViTOnnxConfig): class SegformerOnnxConfig(YolosOnnxConfig): - pass + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + outputs = super().outputs + + if self.task == "image-segmentation": + outputs["logits"] = {0: "batch_size"} + + return outputs class MobileNetV1OnnxConfig(ViTOnnxConfig): @@ -1669,6 +1690,17 @@ def inputs(self) -> Dict[str, Dict[int, str]]: "pixel_values": dynamic_axis, } + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + outputs = super().outputs + + if "logits" in outputs: + # default is {0: "batch_size", 1: "sequence_length"} where sequence_length is dynamic axis + # but perceiver always return the same max sequence length in the second dimension + outputs["logits"] = {0: "batch_size"} + + return outputs + def generate_dummy_inputs(self, framework: str = "pt", **kwargs): self.is_generating_dummy_inputs = True dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) @@ -1738,23 +1770,16 @@ def inputs(self) -> Dict[str, Dict[int, str]]: return {"input_values": {0: "batch_size"}} -# TODO: currently disabled because an operator seems not supported by ONNX. 
-# class MCTCTDummyAudioInputGenerator(DummyAudioInputGenerator): -# def generate(self, input_name: str, framework: str = "pt"): -# shape = [self.batch_size, self.sequence_length, self.normalized_config.input_features_per_channel] -# if input_name == "input_features": -# return self.random_float_tensor(shape, min_value=-1, max_value=1, framework=framework) -# return super().generate(input_name, framework=framework) -# -# -# class MCTCTOnnxConfig(OnnxConfig): -# NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(input_features_per_channel="input_feat_per_channel", allow_new=True) -# DUMMY_INPUT_GENERATOR_CLASSES = (MCTCTDummyAudioInputGenerator,) -# DEFAULT_ONNX_OPSET = 13 -# -# @property -# def inputs(self) -> Dict[str, Dict[int, str]]: -# return {"input_features": {0: "batch_size", 1: "sequence_classification"}} +class MCTCTOnnxConfig(OnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + input_features_per_channel="input_feat_per_channel", allow_new=True + ) + DUMMY_INPUT_GENERATOR_CLASSES = (MCTCTDummyAudioInputGenerator,) + DEFAULT_ONNX_OPSET = 13 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return {"input_features": {0: "batch_size", 1: "sequence_classification"}} class WhisperOnnxConfig(AudioToTextOnnxConfig): @@ -2349,6 +2374,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: if self.use_past_in_inputs: self.add_past_key_values(common_inputs, direction="inputs") + if self._behavior is ConfigBehavior.DECODER: common_inputs["encoder_outputs"] = {0: "batch_size", 1: "encoder_sequence_length"} diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 00349143c7..47a6ae08ca 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -449,31 +449,26 @@ class TasksManager: "question-answering", onnx="RemBertOnnxConfig", ), - # For big-bird and bigbird-pegasus being unsupported, refer to model_configs.py - # "big-bird": supported_tasks_mapping( - # "feature-extraction", - # "fill-mask", - # # the logic for text-generation is not supported for big-bird - # # "text-generation", - # "text-classification", - # "multiple-choice", - # "token-classification", - # "question-answering", - # onnx="BigBirdOnnxConfig", - # # TODO: check model_config.py to know why it cannot be enabled yet. 
- # # tflite="BigBirdTFLiteConfig", - # ), - # "bigbird-pegasus": supported_tasks_mapping( - # "feature-extraction", - # "feature-extraction-with-past", - # "text-generation", - # "text-generation-with-past", - # "text2text-generation", - # "text2text-generation-with-past", - # "text-classification", - # "question-answering", - # onnx="BigBirdPegasusOnnxConfig", - # ), + "big-bird": supported_tasks_mapping( + "feature-extraction", + "fill-mask", + "text-classification", + "multiple-choice", + "token-classification", + "question-answering", + onnx="BigBirdOnnxConfig", + ), + "bigbird-pegasus": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text2text-generation", + "text2text-generation-with-past", + "text-classification", + "question-answering", + onnx="BigBirdPegasusOnnxConfig", + ), "blenderbot": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -807,15 +802,15 @@ class TasksManager: "text2text-generation-with-past", onnx="LongT5OnnxConfig", ), - # "longformer": supported_tasks_mapping( - # "feature-extraction", - # "fill-mask", - # "multiple-choice", - # "question-answering", - # "text-classification", - # "token-classification", - # onnx_config_cls="models.longformer.LongformerOnnxConfig", - # ), + "longformer": supported_tasks_mapping( + "feature-extraction", + "fill-mask", + "multiple-choice", + "question-answering", + "text-classification", + "token-classification", + onnx="LongformerOnnxConfig", + ), "marian": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -861,12 +856,11 @@ class TasksManager: "text-classification", onnx="MistralOnnxConfig", ), - # TODO: enable once the missing operator is supported. - # "mctct": supported_tasks_mapping( - # "feature-extraction", - # "automatic-speech-recognition", - # onnx="MCTCTOnnxConfig", - # ), + "mctct": supported_tasks_mapping( + "feature-extraction", + "automatic-speech-recognition", + onnx="MCTCTOnnxConfig", + ), "mobilebert": supported_tasks_mapping( "feature-extraction", "fill-mask", @@ -1313,7 +1307,7 @@ class TasksManager: "transformers": _SUPPORTED_MODEL_TYPE, } _UNSUPPORTED_CLI_MODEL_TYPE = { - # diffusers model types + # diffusers model part "clip-text", "clip-text-with-projection", "flux-transformer-2d", @@ -1326,8 +1320,8 @@ class TasksManager: "clip-text-with-projection", "siglip-text-model", "siglip-text-with-projection", - # redundant model types - "trocr", # same as vision-encoder-decoder + # transformers model part + "trocr", # the decoder of a trocr vision-encoder-decoder } _SUPPORTED_CLI_MODEL_TYPE = ( set(_SUPPORTED_MODEL_TYPE.keys()) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 845780cafa..4e9f23b2d1 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -26,7 +26,7 @@ from ..utils.logging import warn_once from .io_binding import TypeHelper from .modeling_ort import ORTModel -from .utils import get_ordered_input_names, logging +from .utils import logging logger = logging.get_logger(__name__) @@ -38,6 +38,11 @@ class ORTModelPart: It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. 
""" + # should be in an ORTMixin + _prepare_io_binding = ORTModel._prepare_io_binding + _prepare_output_buffer = ORTModel._prepare_output_buffer + _output_shape_inference = ORTModel._output_shape_inference + _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs @@ -48,10 +53,12 @@ def __init__(self, session: InferenceSession, parent_model: "ORTModel"): self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} self.output_dtypes = {output_key.name: output_key.type for output_key in session.get_outputs()} - self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward) + self.input_shapes = {input_key.name: input_key.shape for input_key in session.get_inputs()} + self.output_shapes = {output_key.name: output_key.shape for output_key in session.get_outputs()} @property def device(self): @@ -118,27 +125,26 @@ def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, use_torch = isinstance(input_ids, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.parent_model.use_io_binding: - model_inputs = [input_ids] - if "attention_mask" in self.input_names: - model_inputs.append(attention_mask) - io_binding, output_shapes, output_buffers = self.parent_model._prepare_io_binding( - self.session, - *model_inputs, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } - io_binding.synchronize_inputs() - self.session.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.parent_model.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.session, model_inputs) + + if self.device.type == "cpu": + self.session.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) last_hidden_state = model_outputs["last_hidden_state"] @@ -257,9 +263,7 @@ def forward( decoder_attention_mask: Optional[torch.LongTensor] = None, encoder_attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - labels: Optional[torch.LongTensor] = None, cache_position: Optional[torch.Tensor] = None, - use_cache_branch: None = None, ) -> Seq2SeqLMOutput: # Adding use_cache_branch in the signature here is just a hack for IO Binding @@ -279,6 +283,17 @@ def forward( input_ids, past_key_values, cache_position, use_torch=use_torch ) + model_inputs = { + "input_ids": input_ids, + "encoder_hidden_states": encoder_hidden_states, + "decoder_attention_mask": decoder_attention_mask, + "encoder_attention_mask": encoder_attention_mask, + "use_cache_branch": 
use_cache_branch_tensor, + "cache_position": cache_position, + } + if past_key_values is not None: + model_inputs.update(zip(self.key_value_input_names, past_key_values)) + if self.parent_model.use_io_binding: known_output_shapes = self.compute_past_key_values_output_shapes( input_ids, @@ -286,53 +301,27 @@ def forward( use_cache_branch=use_cache_branch_tensor.item() if use_cache_branch_tensor is not None else None, past_key_values=past_key_values, ) - outputs_to_not_bind = self.get_outputs_not_to_bind(use_merged_cache) - # TODO: fix transformers generate to have contiguous input_ids here already - # For an unknown reason, calling `contiguous()` here is necessary to not have errors - # on CPU EP with batch size > 1, despite it being also called in _prepare_io_binding.g - model_inputs = [input_ids.contiguous()] - - if "encoder_hidden_states" in self.input_names: - model_inputs.append(encoder_hidden_states) - - if "decoder_attention_mask" in self.input_names: - model_inputs.append(decoder_attention_mask) - - if "encoder_attention_mask" in self.input_names: - model_inputs.append(encoder_attention_mask) - - if past_key_values is not None: - model_inputs += past_key_values - - if "labels" in self.input_names: - model_inputs.append(labels) - known_output_shapes.update({"loss": []}) - - if use_cache_branch_tensor is not None: - model_inputs.append(use_cache_branch_tensor) - - if "cache_position" in self.input_names: - model_inputs.append(cache_position) - - io_binding, output_shapes, output_buffers = self.parent_model._prepare_io_binding( + io_binding, output_shapes, output_buffers = self._prepare_io_binding( self.session, - *model_inputs, + model_inputs, known_output_shapes=known_output_shapes, - ordered_input_names=self._ordered_input_names, outputs_to_not_bind=outputs_to_not_bind, ) + if self.device.type == "cpu": + self.session.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + # Set -1 for sequence_length as it could be larger than the real sequence_length for name, shape in output_shapes.items(): if name in self.key_value_output_names: output_shapes[name] = shape[:2] + (-1,) + shape[3:] - io_binding.synchronize_inputs() - self.session.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() - # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the # self-attention layer and 2 to the cross-attention layer) out_past_key_values = () @@ -350,7 +339,7 @@ def forward( if not self.use_past_in_outputs: out_past_key_values = None - elif not self.use_past_in_inputs or use_merged_no_cache: + elif not self.use_past_in_inputs or use_merged_no_cache or self.no_cross_attention_cache: out_past_key_values = tuple( out_past_key_values[i : i + self.num_pkv] for i in range(0, len(out_past_key_values), self.num_pkv) ) @@ -382,21 +371,9 @@ def forward( else: raise ValueError("Unsupported num_pkv") else: - model_inputs = { - "input_ids": input_ids, - "encoder_hidden_states": encoder_hidden_states, - "decoder_attention_mask": decoder_attention_mask, - "encoder_attention_mask": encoder_attention_mask, - "use_cache_branch": use_cache_branch_tensor, - "cache_position": cache_position, - "labels": labels, - } - if past_key_values is not None: - model_inputs.update(zip(self.key_value_input_names, past_key_values)) - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) 
onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) # TODO: using a new variable out_past_key_values is memory inefficient, # past_key_values is not used anymore at this point diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 7c7a8fb839..9afa1bf19a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -209,7 +209,6 @@ def forward( attention_mask: Optional[torch.FloatTensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - labels: Optional[torch.LongTensor] = None, use_cache_branch: bool = None, **kwargs, ) -> CausalLMOutputWithPast: @@ -218,8 +217,7 @@ def forward( self.raise_on_numpy_input_io_binding(use_torch) known_output_shapes = {} - use_cache_branch = None - loss = None + if self.use_cache: if past_key_values is not None: # Flatten the past_key_values (gpt_bigcode has fused key/value cache, so no need to flatten it) @@ -233,35 +231,28 @@ def forward( input_ids, past_key_values, use_torch ) - if self.use_io_binding: - # TODO: fix transformers generate to have contiguous input_ids here already - # For an unknown reason, calling `contiguous()` here is necessary to not have errors - # on CPU EP with batch size > 1, despite it being also called in _prepare_io_binding. - # I suspect the reason is the contiguous python list that messes something up? - model_inputs = [input_ids.contiguous()] - - if "attention_mask" in self.input_names: - model_inputs.append(attention_mask) - - if "position_ids" in self.input_names: - if position_ids is None: - raise ValueError("position_ids was not passed but is a required input for this ONNX model.") - model_inputs.append(position_ids.contiguous()) - - if past_key_values is not None: - model_inputs += past_key_values + # Create position_ids on the fly for batch generation + if "position_ids" in self.input_names and position_ids is None and attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) - if use_cache_branch is not None: - model_inputs.append(use_cache_branch) + model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "use_cache_branch": use_cache_branch, + } - if "labels" in self.input_names: - model_inputs.append(labels) - known_output_shapes.update({"loss": []}) + if past_key_values is not None: + model_inputs.update( + zip(self.key_value_input_names, past_key_values), + ) - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - *model_inputs, - known_output_shapes=known_output_shapes, - ordered_input_names=self._ordered_input_names, + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding( + self.model, model_inputs, known_output_shapes=known_output_shapes ) if self.device.type == "cpu": @@ -271,32 +262,19 @@ def forward( self.model.run_with_iobinding(io_binding) io_binding.synchronize_outputs() + loss = output_buffers.get("loss", None) + logits = output_buffers["logits"].view(output_shapes["logits"]) + if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer(2 for the self-attention) past_key_values = tuple( 
output_buffers[name].view(output_shapes[name]) for name in self.key_value_output_names ) - logits = output_buffers["logits"].view(output_shapes["logits"]) - - if "loss" in self.output_names: - loss = output_buffers["loss"].view(output_shapes["loss"]) else: - model_inputs = { - "input_ids": input_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, - "use_cache_branch": use_cache_branch, - "labels": labels, - } - if past_key_values is not None: - model_inputs.update( - zip(self.key_value_input_names, past_key_values), - ) - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) loss = model_outputs.get("loss", None) logits = model_outputs["logits"] diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index a55eb064fa..e9633343c7 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -72,7 +72,6 @@ ONNX_WEIGHTS_NAME, check_io_binding, get_device_for_provider, - get_ordered_input_names, get_provider_for_device, parse_device, validate_provider_availability, @@ -276,8 +275,6 @@ def __init__( self.output_names = {output_key.name: idx for idx, output_key in enumerate(model.get_outputs())} self.output_dtypes = {output_key.name: output_key.type for output_key in model.get_outputs()} - self._ordered_input_names = get_ordered_input_names(self.input_names.keys(), func=self.forward) - @property def dtype(self) -> torch.dtype: """ @@ -773,43 +770,23 @@ def _output_shape_inference(self, axis_name: Union[str, int], dimensions: Dict[s """ if isinstance(axis_name, int): return axis_name - # It is actually covered below, but this is to make things faster. + elif axis_name in dimensions: return dimensions[axis_name] - # Tokens is going to be populated by iterating over every match for the self.output_shape_inference_pattern. - # This pattern matches 4 things: axis names, integer values, operators (+, -, *, /) and parenthesis. - tokens = [] - for idx, match_ in enumerate(re.finditer(self.output_shape_inference_pattern, axis_name)): - groups = match_.groups() - matched_group = None - for idx, group in enumerate(groups): - if group is not None: - matched_group = idx - break - - # For every match except an axis name, we simply append the content of the match to the tokens list. - # For an axis name, we check if it is specified in the `dimensions` dictionary. If for some reason it is - # not there, or its value not an integer, the shape inference process stops and we return the axis name as - # is. - if matched_group == 0: - dim = dimensions.get(groups[0], None) - if dim is None or not isinstance(dim, int): - return axis_name - tokens.append(str(dim)) - else: - tokens.append(groups[matched_group]) + # faster way to do the same thing, assuming the axis names are well defined (by us in the exporter config) + tokens = axis_name.split(" ") + for idx, token in enumerate(tokens): + if token in dimensions: + tokens[idx] = str(dimensions[token]) - # Here it should not be problematic to use eval since anything not matching the pattern would trigger an - # exception. return int(eval(" ".join(tokens))) # TODO: this method is bloated with state arguments (that are accesible using self) why ? 
def _prepare_io_binding( self, model: ort.InferenceSession, - *model_inputs: torch.Tensor, - ordered_input_names: List[str], + model_inputs: Dict[str, torch.Tensor], known_output_shapes: Optional[Dict[str, Tuple[int]]] = None, outputs_to_not_bind: Optional[Union[Set[str], str]] = None, ) -> Tuple[ort.IOBinding, Dict[str, Tuple[int]], Dict[str, torch.Tensor]]: @@ -819,10 +796,8 @@ def _prepare_io_binding( Args: model (`ort.InferenceSession`): The model for which we want to bind the inputs and outputs. - *model_inputs: - The inputs of the model. - ordered_input_names (`List[str]`): - Names of the inputs, that must match with the order of model_inputs. + model_inputs (`Dict[str, torch.Tensor]`): + The inputs to bind to the model. known_output_shapes (`Optional[Dict[str, Tuple[int]]]`, defaults to `None`): It can be hard to infer all the output shapes from the inputs only. For instance for the past key / values. It is possible to explicitely pass the shape via this argument. @@ -835,36 +810,39 @@ def _prepare_io_binding( """ io_binding = model.io_binding() - name_to_np_type = TypeHelper.get_io_numpy_type_map(model) + input_shapes = {} + for input_name in self.input_names.keys(): + input_shapes[input_name] = model_inputs[input_name].shape - input_name_to_shape = {} - for idx, tensor in enumerate(model_inputs): - if tensor is None: - continue - name = ordered_input_names[idx] - tensor = tensor.contiguous() - input_name_to_shape[name] = tensor.shape + if not model_inputs[input_name].is_contiguous(): + model_inputs[input_name] = model_inputs[input_name].contiguous() + + tensor_dtype = model_inputs[input_name].dtype + expected_dtype = TypeHelper.ort_type_to_torch_type(self.input_dtypes[input_name]) + if tensor_dtype != expected_dtype: + model_inputs[input_name] = model_inputs[input_name].to(expected_dtype) - data_ptr = tensor.data_ptr() - if "past" in name and data_ptr == 0: + data_ptr = model_inputs[input_name].data_ptr() + if data_ptr == 0: # During first generation, sequence_length can be 0 when use_cache=True, which results in data_ptr to also be 0. # To keep compatibility with IO binding, we pass the data pointer of input_ids instead. This will have no impact because past_key_values will not be used during the first generation. 
- data_ptr = model_inputs[0].data_ptr() + data_ptr = model_inputs["input_ids"].data_ptr() io_binding.bind_input( - name, - tensor.device.type, + input_name, + self.device.type, IOBindingHelper.get_device_index(self.device), - name_to_np_type[name], - tuple(tensor.shape), + TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]), + model_inputs[input_name].shape, data_ptr, ) + dimensions = {} for input_ in model.get_inputs(): shape = input_.shape for idx, axis in enumerate(shape): if isinstance(axis, str): - dimensions[axis] = input_name_to_shape[input_.name][idx] + dimensions[axis] = input_shapes[input_.name][idx] output_shapes = {} output_buffers = {} @@ -887,32 +865,25 @@ def _prepare_io_binding( output_shape = [] for axis_name in output_node.shape: output_shape.append(self._output_shape_inference(axis_name, dimensions)) + output_buffer = self._prepare_output_buffer(model, output_shape, output_name) + data_ptr = output_buffer.data_ptr() + io_binding.bind_output( output_name, - output_buffer.device.type, + self.device.type, IOBindingHelper.get_device_index(self.device), - name_to_np_type[output_name], + TypeHelper.ort_type_to_numpy_type(output_node.type), output_shape, - output_buffer.data_ptr(), + data_ptr, ) - output_shapes[output_name] = output_shape + output_buffers[output_name] = output_buffer + output_shapes[output_name] = output_shape return io_binding, output_shapes, output_buffers - def prepare_io_binding( - self, *model_inputs, ordered_input_names, outputs_to_not_bind=None, known_output_shapes=None - ): - return self._prepare_io_binding( - self.model, - *model_inputs, - ordered_input_names=ordered_input_names, - known_output_shapes=known_output_shapes, - outputs_to_not_bind=outputs_to_not_bind, - ) - def raise_on_numpy_input_io_binding(self, use_torch: bool): """ Raises an error if IO Binding is requested although the tensor used are numpy arrays. @@ -928,29 +899,57 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool): ) def _prepare_onnx_inputs( - self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray] + self, use_torch: bool, model_inputs: Dict[str, Union[torch.Tensor, np.ndarray]] ) -> Dict[str, np.ndarray]: + """ + Prepares the inputs for ONNX Runtime by converting them to numpy arrays with the expected dtype. + + Args: + use_torch (`bool`): + Whether the inputs are torch.Tensor or not. + inputs (`Dict[str, Union[torch.Tensor, np.ndarray]]`): + The inputs to prepare for ONNX Runtime. + + Returns: + `Dict[str, np.ndarray]`: The inputs prepared for ONNX Runtime. 
+ """ + onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx + for input_name in self.input_names.keys(): - onnx_inputs[input_name] = inputs.pop(input_name) + if model_inputs.get(input_name, None) is None: + raise ValueError(f"Input {input_name} is required by model but not provided.") if use_torch: - onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) + onnx_inputs[input_name] = model_inputs[input_name].numpy(force=True) + else: + onnx_inputs[input_name] = model_inputs[input_name] - if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: - onnx_inputs[input_name] = onnx_inputs[input_name].astype( - TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) - ) + expected_dtype = TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) + + if onnx_inputs[input_name].dtype != expected_dtype: + onnx_inputs[input_name] = onnx_inputs[input_name].astype(expected_dtype) return onnx_inputs def _prepare_onnx_outputs( - self, use_torch: bool, *onnx_outputs: np.ndarray + self, use_torch: bool, onnx_outputs: List[np.ndarray] ) -> Dict[str, Union[torch.Tensor, np.ndarray]]: + """ + Prepares the outputs from ONNX Runtime by converting them to torch.Tensor if requested. + + Args: + use_torch (`bool`): + Whether the outputs should be torch.Tensor or not. + onnx_outputs (`List[np.ndarray]`): + The outputs from ONNX Runtime. + + Returns: + `Dict[str, Union[torch.Tensor, np.ndarray]]`: The outputs prepared for the user. + """ + model_outputs = {} - # converts onnxruntime outputs into tensor for standard outputs for output_name, idx in self.output_names.items(): model_outputs[output_name] = onnx_outputs[idx] @@ -1088,26 +1087,28 @@ def forward( if token_type_ids is None and "token_type_ids" in self.input_names: token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, - attention_mask, - token_type_ids, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) if "last_hidden_state" in self.output_names: last_hidden_state = model_outputs["last_hidden_state"] @@ -1243,29 +1244,31 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if token_type_ids is None and "token_type_ids" 
in self.input_names: + if "token_type_ids" in self.input_names and token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, - attention_mask, - token_type_ids, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -1338,28 +1341,25 @@ def forward( if token_type_ids is None and "token_type_ids" in self.input_names: token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, - attention_mask, - token_type_ids, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() - # TODO: this is the same routine in all io binding branches, should we refactor it into a prepare_io_binding_outputs method? 
start_logits = output_buffers["start_logits"].view(output_shapes["start_logits"]) end_logits = output_buffers["end_logits"].view(output_shapes["end_logits"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) start_logits = model_outputs["start_logits"] end_logits = model_outputs["end_logits"] @@ -1448,26 +1448,28 @@ def forward( if token_type_ids is None and "token_type_ids" in self.input_names: token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, - attention_mask, - token_type_ids, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -1541,26 +1543,28 @@ def forward( if token_type_ids is None and "token_type_ids" in self.input_names: token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, - attention_mask, - token_type_ids, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - - onnx_inputs = 
self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -1627,26 +1631,28 @@ def forward( if token_type_ids is None and "token_type_ids" in self.input_names: token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_ids, - attention_mask, - token_type_ids, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -1717,24 +1723,26 @@ def forward( use_torch = isinstance(pixel_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - pixel_values, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "pixel_values": pixel_values, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"pixel_values": pixel_values} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -1805,24 +1813,26 @@ def forward( use_torch = isinstance(pixel_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers 
= self.prepare_io_binding( - pixel_values, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "pixel_values": pixel_values, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"pixel_values": pixel_values} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -1932,25 +1942,27 @@ def forward( use_torch = isinstance(model_input, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - model_input, - attention_mask, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + self.input_name: model_input, + "attention_mask": attention_mask, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {self.input_name: model_input, "attention_mask": attention_mask} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -2009,35 +2021,36 @@ def forward( use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: - input_size = input_values.shape[1] - output_sizes = [] + model_inputs = { + "input_values": input_values, + } - def _conv_output_size(input_size, kernel_size, stride): - return (input_size - kernel_size) // stride + 1 + if self.use_io_binding: + batch_size = input_values.shape[0] + final_input_size = input_values.shape[-1] for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): - input_size = _conv_output_size(input_size, kernel_size, stride) - output_sizes.append(input_size) + final_input_size = (final_input_size - kernel_size) // stride + 1 - known_output_shapes = {"logits": [input_values.shape[0], output_sizes[-1], self.config.vocab_size]} + known_output_shapes = {"logits": [batch_size, final_input_size, 
self.config.vocab_size]} - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_values, ordered_input_names=self._ordered_input_names, known_output_shapes=known_output_shapes + io_binding, output_shapes, output_buffers = self._prepare_io_binding( + self.model, model_inputs, known_output_shapes=known_output_shapes ) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) else: - model_inputs = {"input_values": input_values} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -2104,25 +2117,28 @@ def forward( use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - input_values, ordered_input_names=self._ordered_input_names - ) + model_inputs = { + "input_values": input_values, + } + + if self.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs) # run inference with binding & synchronize in case of multiple CUDA streams - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() logits = output_buffers["logits"].view(output_shapes["logits"]) embeddings = output_buffers["embeddings"].view(output_shapes["embeddings"]) else: - model_inputs = {"input_values": input_values} - - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] embeddings = model_outputs["embeddings"] @@ -2182,14 +2198,14 @@ def forward( use_torch = isinstance(input_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: + if self.use_io_binding: raise NotImplementedError() else: model_inputs = {"input_values": input_values} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) logits = model_outputs["logits"] @@ -2241,29 +2257,33 @@ def forward( ): use_torch = isinstance(pixel_values, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: - 
input_shapes = pixel_values.shape - io_binding, output_shapes, output_buffers = self.prepare_io_binding( - pixel_values, - ordered_input_names=self._ordered_input_names, - known_output_shapes={ - "reconstruction": [ - input_shapes[0], - input_shapes[1], - input_shapes[2] * self.config.upscale, - input_shapes[3] * self.config.upscale, - ] - }, + + model_inputs = { + "pixel_values": pixel_values, + } + + if self.use_io_binding: + batch_size, num_channels, height, width = pixel_values.shape + known_output_shapes = { + "reconstruction": [batch_size, num_channels, height * self.config.upscale, width * self.config.upscale] + } + + io_binding, output_shapes, output_buffers = self._prepare_io_binding( + self.model, model_inputs, known_output_shapes=known_output_shapes ) - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + reconstruction = output_buffers["reconstruction"].view(output_shapes["reconstruction"]) else: - model_inputs = {"pixel_values": pixel_values} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) reconstruction = model_outputs["reconstruction"] return ImageSuperResolutionOutput(reconstruction=reconstruction) @@ -2318,23 +2338,26 @@ def forward(self, **model_inputs: Union[torch.Tensor, np.ndarray]): use_torch = isinstance(next(iter(model_inputs.values())), torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) - if self.device.type == "cuda" and self.use_io_binding: + if self.use_io_binding: # TODO: should this be used in favor of `model.prepare_io_binding`? 
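For reference, the hunks above precompute `known_output_shapes` so IO-binding output buffers can be allocated up front: a CTC head's logits length follows the convolutional feature extractor, and a super-resolution reconstruction is the input spatial size scaled by `config.upscale`. A standalone sketch of that arithmetic, with illustrative (wav2vec2-like) kernel, stride and vocabulary values rather than any particular checkpoint's config:

# Standalone sketch of the shape arithmetic used above (all concrete numbers are illustrative).
def conv_output_length(length: int, kernel_sizes, strides) -> int:
    # Each 1D conv layer shrinks the sequence: floor((L - kernel) / stride) + 1.
    for kernel, stride in zip(kernel_sizes, strides):
        length = (length - kernel) // stride + 1
    return length

# A wav2vec2-like feature extractor: 16000 samples (1 s at 16 kHz) -> 49 frames.
frames = conv_output_length(16000, kernel_sizes=[10, 3, 3, 3, 3, 2, 2], strides=[5, 2, 2, 2, 2, 2, 2])
logits_shape = [1, frames, 32]  # [batch_size, frames, vocab_size]

# Super-resolution: output spatial dims scale with config.upscale.
batch, channels, height, width, upscale = 1, 3, 64, 64, 2
reconstruction_shape = [batch, channels, height * upscale, width * upscale]  # [1, 3, 128, 128]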
io_binding = IOBindingHelper.prepare_io_binding(self, **model_inputs) # run inference with binding - io_binding.synchronize_inputs() - self.model.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() model_outputs = {} for name, output in zip(self.output_names.keys(), io_binding._iobinding.get_outputs()): model_outputs[name] = IOBindingHelper.to_pytorch(output) else: - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) onnx_outputs = self.model.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) # converts output to namedtuple for pipelines post-processing return ModelOutput(**model_outputs) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index fba8152582..a3063826be 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -23,7 +23,6 @@ from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -import numpy as np import torch from huggingface_hub import hf_hub_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -363,43 +362,28 @@ def forward( use_torch = isinstance(input_features, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) - if self.parent_model.device.type == "cuda" and self.parent_model.use_io_binding: - model_inputs = ( - [input_features, attention_mask] if "attention_mask" in self.input_names else [input_features] - ) - io_binding, output_shapes, output_buffers = self.parent_model._prepare_io_binding( - self.session, - *model_inputs, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "input_features": input_features, + "attention_mask": attention_mask, + } - io_binding.synchronize_inputs() - self.session.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.parent_model.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.session, model_inputs) - last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) - else: - if use_torch: - onnx_inputs = {"input_features": input_features.cpu().detach().numpy()} - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask.cpu().detach().numpy() + if self.device.type == "cpu": + self.session.run_with_iobinding(io_binding) else: - onnx_inputs = {"input_features": input_features} - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask + io_binding.synchronize_inputs() + self.session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() - # TODO: Replace with a better solution - # attention_mask is exported with int64 datatype and tokenizer produces int32 input - # for speech2text model. Hence, the input is type casted for inference. 
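The refactor above converges the forward passes on one dispatch: build a `model_inputs` dict, then either bind buffers and call `run_with_iobinding` (synchronizing inputs and outputs when the session is not on CPU, to cover multiple CUDA streams) or fall back to `_prepare_onnx_inputs` / `run` / `_prepare_onnx_outputs`. A condensed sketch of that pattern, assuming the helper signatures used in this patch:

# Condensed sketch of the dispatch pattern introduced above; helper names follow the patch,
# their exact signatures are assumed.
def run_model(self, model_inputs: dict, use_torch: bool) -> dict:
    if self.use_io_binding:
        io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.model, model_inputs)
        if self.device.type == "cpu":
            self.model.run_with_iobinding(io_binding)
        else:
            # synchronize in case several CUDA streams are involved
            io_binding.synchronize_inputs()
            self.model.run_with_iobinding(io_binding)
            io_binding.synchronize_outputs()
        return {name: output_buffers[name].view(output_shapes[name]) for name in output_buffers}
    else:
        onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs)
        onnx_outputs = self.model.run(None, onnx_inputs)
        return self._prepare_onnx_outputs(use_torch, onnx_outputs)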
- if "attention_mask" in self.input_names: - if self.session.get_inputs()[1].type == "tensor(int64)": - onnx_inputs["attention_mask"] = onnx_inputs["attention_mask"].astype(np.int64) - - outputs = self.session.run(None, onnx_inputs) + last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) + else: + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) - last_hidden_state = outputs[self.output_names["last_hidden_state"]] - if use_torch: - last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device) + last_hidden_state = model_outputs["last_hidden_state"] return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -422,60 +406,30 @@ def forward( use_torch = isinstance(pixel_values, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) - if self.parent_model.device.type == "cuda" and self.parent_model.use_io_binding: - known_output_shapes = self.compute_encoder_known_output_shapes(pixel_values) + model_inputs = { + "pixel_values": pixel_values, + } - io_binding, output_shapes, output_buffers = self.parent_model._prepare_io_binding( - self.session, - pixel_values, - known_output_shapes=known_output_shapes, - ordered_input_names=self._ordered_input_names, - ) + if self.parent_model.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.session, model_inputs) - io_binding.synchronize_inputs() - self.session.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.device.type == "cpu": + self.session.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) else: - if use_torch: - onnx_inputs = {"pixel_values": pixel_values.cpu().detach().numpy()} - else: - onnx_inputs = {"pixel_values": pixel_values} - - outputs = self.session.run(None, onnx_inputs) + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) - last_hidden_state = outputs[self.output_names["last_hidden_state"]] - if use_torch: - last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device) + last_hidden_state = model_outputs["last_hidden_state"] return BaseModelOutput(last_hidden_state=last_hidden_state) - def compute_encoder_known_output_shapes(self, pixel_values: torch.FloatTensor) -> Dict[str, List[int]]: - if self.normalized_config.config.model_type == "donut-swin": - # TODO: kind of weird to export to ONNX with dynamic output shape if it is in fact static... - encoder_sequence_length = ( - self.normalized_config.config.image_size[0] - * self.normalized_config.config.image_size[1] - // self.normalized_config.config.hidden_size - ) - elif self.normalized_config.config.model_type in ["vit", "deit"]: - return None - else: - raise ValueError( - f"Unsupported encoder model type {self.normalized_config.config.model_type} for ORTForVisionSeq2Seq with IOBinding." - "Currently supported models are vit, donut-swin and deit." - "Please submit a PR to add support for this model type." 
- ) - - return { - "last_hidden_state": [ - pixel_values.shape[0], # batch size - encoder_sequence_length, - self.normalized_config.config.hidden_size, - ] - } - class ORTEncoderForPix2Struct(ORTEncoder): """ @@ -496,41 +450,28 @@ def forward( use_torch = isinstance(flattened_patches, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) - if self.parent_model.device.type == "cuda" and self.parent_model.use_io_binding: - model_inputs = ( - [flattened_patches, attention_mask] if "attention_mask" in self.input_names else [flattened_patches] - ) - io_binding, output_shapes, output_buffers = self.parent_model._prepare_io_binding( - self.session, - *model_inputs, - ordered_input_names=self._ordered_input_names, - ) + model_inputs = { + "flattened_patches": flattened_patches, + "attention_mask": attention_mask, + } - io_binding.synchronize_inputs() - self.session.run_with_iobinding(io_binding) - io_binding.synchronize_outputs() + if self.parent_model.use_io_binding: + io_binding, output_shapes, output_buffers = self._prepare_io_binding(self.session, model_inputs) - last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) - else: - if use_torch: - onnx_inputs = {"flattened_patches": flattened_patches.cpu().detach().numpy()} - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask.cpu().detach().numpy() + if self.device.type == "cpu": + self.session.run_with_iobinding(io_binding) else: - onnx_inputs = {"flattened_patches": flattened_patches} - if "attention_mask" in self.input_names: - onnx_inputs["attention_mask"] = attention_mask - - if "attention_mask" in self.input_names: - if self.session.get_inputs()[1].type == "tensor(int64)": - onnx_inputs["attention_mask"] = onnx_inputs["attention_mask"].astype(np.int64) + io_binding.synchronize_inputs() + self.session.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() - outputs = self.session.run(None, onnx_inputs) - - last_hidden_state = outputs[self.output_names["last_hidden_state"]] + last_hidden_state = output_buffers["last_hidden_state"].view(output_shapes["last_hidden_state"]) + else: + onnx_inputs = self._prepare_onnx_inputs(use_torch, model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, onnx_outputs) - if use_torch: - last_hidden_state = torch.from_numpy(last_hidden_state).to(self.device) + last_hidden_state = model_outputs["last_hidden_state"] return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -1164,7 +1105,6 @@ def forward( decoder_input_ids: Optional[torch.LongTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - labels: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: # Encode if needed : first prediction pass @@ -1181,7 +1121,6 @@ def forward( past_key_values=past_key_values, encoder_hidden_states=encoder_outputs.last_hidden_state, encoder_attention_mask=attention_mask, - labels=labels, ) return Seq2SeqLMOutput( @@ -1297,7 +1236,6 @@ def forward( decoder_input_ids: Optional[torch.LongTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - labels: Optional[torch.LongTensor] = None, cache_position: Optional[torch.Tensor] = None, **kwargs, ) -> Seq2SeqLMOutput: @@ -1316,7 +1254,6 @@ def forward( encoder_hidden_states=encoder_outputs.last_hidden_state, 
encoder_attention_mask=attention_mask, cache_position=cache_position, - labels=labels, ) return Seq2SeqLMOutput( @@ -1477,10 +1414,8 @@ def forward( decoder_input_ids: Optional[torch.LongTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - labels: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: - # Encode if needed : first prediction pass if encoder_outputs is None: encoder_outputs = self.encoder(pixel_values=pixel_values) @@ -1489,17 +1424,18 @@ def forward( if past_key_values is None or not self.use_cache or self.use_merged else self.decoder_with_past ) + decoder_outputs = model( input_ids=decoder_input_ids, past_key_values=past_key_values, encoder_hidden_states=encoder_outputs.last_hidden_state, - labels=labels, ) return Seq2SeqLMOutput( - loss=decoder_outputs.get("loss", None), + loss=decoder_outputs.loss, logits=decoder_outputs.logits, past_key_values=decoder_outputs.past_key_values, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, ) def prepare_inputs_for_generation( @@ -1577,42 +1513,33 @@ def forward( decoder_attention_mask: Optional[torch.BoolTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - labels: Optional[torch.LongTensor] = None, **kwargs, ) -> Seq2SeqLMOutput: - # Encode if needed : first prediction pass - # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( flattened_patches=flattened_patches, attention_mask=attention_mask, ) - # TODO: for some reason the attention_mask for pix2struct is a float in transformers and not an int64. This messes up with the exporter - # hardcodes int64 input dtype for the attention mask. This workaround is quite ugly, it should be fixed rather in the ONNX exporter. - if isinstance(attention_mask, torch.Tensor): - attention_mask = attention_mask.to(torch.int64) - else: - attention_mask = attention_mask.astype(np.int64) - model = ( self.decoder - if past_key_values is None or not self.use_cache or self.use_merged + if self.use_merged or not self.use_cache or past_key_values is None else self.decoder_with_past ) + decoder_outputs = model( input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, past_key_values=past_key_values, encoder_hidden_states=encoder_outputs.last_hidden_state, encoder_attention_mask=attention_mask, - labels=labels, ) return Seq2SeqLMOutput( - loss=decoder_outputs.get("loss", None), + loss=decoder_outputs.loss, logits=decoder_outputs.logits, past_key_values=decoder_outputs.past_key_values, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, ) def prepare_inputs_for_generation( diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 4c6ad2553d..47a98e19c8 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -14,6 +14,7 @@ """ The ORTTrainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task with ONNX Runtime. 
""" + import functools import math import os @@ -27,8 +28,8 @@ # Integrations must be imported before ML frameworks: # isort: off +import safetensors from transformers.integrations import hp_params - from transformers.utils import is_accelerate_available from packaging import version @@ -58,7 +59,7 @@ from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import Trainer -from transformers.trainer_callback import TrainerCallback, TrainerState +from transformers.trainer_callback import ExportableState, TrainerCallback, TrainerState from transformers.trainer_pt_utils import ( get_model_param_count, get_module_class_from_name, @@ -77,6 +78,8 @@ ) from transformers.training_args import ParallelMode from transformers.utils import ( + SAFE_WEIGHTS_NAME, + WEIGHTS_NAME, is_apex_available, is_sagemaker_dp_enabled, is_sagemaker_mp_enabled, @@ -119,11 +122,12 @@ # Name of the files used for checkpointing TRAINER_STATE_NAME = "trainer_state.json" +TRAINING_ARGS_NAME = "training_args.bin" logger = logging.get_logger(__name__) -class ModuleWithLoss(nn.Module): +class ModuleWithLoss(PreTrainedModel): def __init__(self, model, args, label_smoother): super().__init__() self._original_model = model @@ -131,11 +135,11 @@ def __init__(self, model, args, label_smoother): # Label smoothing self.label_smoother = label_smoother - def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs): + def forward(self, inputs: Dict[str, Union[torch.Tensor, Any]], return_outputs, num_items_in_batch): # The compute_model_plus_loss_internal is assigned once the class is instantiated. # It should have same signature as Trainer.compute_loss(). # We do this to avoid potential un-synced states if we duplicated compute loss codes . - return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs) + return self.compute_model_plus_loss_internal(self._original_model, inputs, return_outputs, num_items_in_batch) @property def module(self): @@ -291,14 +295,14 @@ def _set_signature_columns_if_needed(self): # Labels may be named label or label_ids, the default data collator handles that. self._signature_columns += list(set(["label", "label_ids"] + self.label_names)) - def compute_loss(self, model_with_loss, inputs, return_outputs=False): + def compute_loss(self, model_with_loss, inputs, return_outputs=False, num_items_in_batch=None): # Run model forward + loss compute. if isinstance(self.model, ModuleWithLoss): # ORTModule Does not support the BatchEncoding Type so we have to convert to a dict. 
dict_inputs = dict(inputs.items()) - return model_with_loss(dict_inputs, return_outputs) + return model_with_loss(dict_inputs, return_outputs, num_items_in_batch) else: - return super().compute_loss(model_with_loss, inputs, return_outputs) + return super().compute_loss(model_with_loss, inputs, return_outputs, num_items_in_batch) def train( self, @@ -508,8 +512,13 @@ def _inner_training_loop( if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) - self.state = TrainerState() + self.state = TrainerState( + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ] + ) self.state.is_hyper_param_search = trial is not None + self.state.train_batch_size = self._train_batch_size # Compute absolute values for logging, eval, and save if given as ratio if args.logging_steps is not None: @@ -798,12 +807,16 @@ def get_dataloader_sampler(dataloader): self.lr_scheduler.step() model.zero_grad() - grad_norm: Optional[float] = None self.state.global_step += 1 self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + if is_transformers_version(">=", "4.47.0"): + self._maybe_log_save_evaluate( + tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time + ) + else: + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) else: self.control = self.callback_handler.on_substep_end(args, self.state, self.control) @@ -818,8 +831,13 @@ def get_dataloader_sampler(dataloader): self.control.should_training_stop = True self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + if is_transformers_version(">=", "4.47.0"): + self._maybe_log_save_evaluate( + tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time + ) + else: + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) if DebugOption.TPU_METRICS_DEBUG in self.args.debug: logger.warning( "You enabled PyTorch/XLA debug metrics which is not supported by ONNX " @@ -1072,3 +1090,39 @@ def get_ort_optimizer_cls_and_kwargs(args: ORTTrainingArguments) -> Tuple[Any, A else: raise ValueError(f"ORTTrainer cannot instantiate unsupported optimizer: {args.optim}") return optimizer_cls, optimizer_kwargs + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + # If we are executing this function, we are the process zero, so we don't check for that. + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + + supported_classes = (PreTrainedModel,) + # Save a trained model and configuration using `save_pretrained()`. 
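The training loop above gates `_maybe_log_save_evaluate` on the installed transformers version, because 4.47.0 added a `start_time` parameter that older releases do not accept. A condensed restatement of that gate, using the `is_transformers_version` helper this patch already relies on:

from optimum.utils import is_transformers_version

def maybe_log_save_evaluate_compat(trainer, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time):
    # transformers >= 4.47.0 expects an extra start_time argument; older releases do not.
    if is_transformers_version(">=", "4.47.0"):
        return trainer._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time)
    return trainer._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)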
+ # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, supported_classes): + if state_dict is None: + state_dict = self.model.state_dict() + + if isinstance(self.accelerator.unwrap_model(self.model), supported_classes): + self.accelerator.unwrap_model(self.model).save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + if self.args.save_safetensors: + safetensors.torch.save_model( + self.model, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"} + ) + else: + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + self.model.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + + if self.processing_class is not None: + self.processing_class.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 79375d958f..be395927cc 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -114,7 +114,7 @@ class ORTConfigManager: "bart": "bart", "bert": "bert", "big-bird": "bert", - # "bigbird-pegasus": None, # bug in `fusion_skiplayernorm.py` + "bigbird-pegasus": "bart", "blenderbot": "bert", "bloom": "gpt2", "camembert": "bert", @@ -129,7 +129,6 @@ class ORTConfigManager: "gpt-neox": "gpt2", "gptj": "gpt2", "granite": "gpt2", - # longt5 with O4 results in segmentation fault "longt5": "bert", "llama": "gpt2", "marian": "bart", diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index c870e49fad..b4097f0f80 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -51,7 +51,6 @@ is_transformers_available, is_transformers_version, require_numpy_strictly_lower, - torch_version, ) from .input_generators import ( DEFAULT_DUMMY_SHAPES, @@ -87,6 +86,8 @@ FalconDummyPastKeyValuesGenerator, GemmaDummyPastKeyValuesGenerator, GPTBigCodeDummyPastKeyValuesGenerator, + LongformerDummyTextInputGenerator, + MCTCTDummyAudioInputGenerator, MistralDummyPastKeyValuesGenerator, MultiQueryPastKeyValuesGenerator, ) diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 6c25c72475..8da1df5fac 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -15,16 +15,18 @@ import importlib.metadata import importlib.util -import inspect import operator as op from collections import OrderedDict from contextlib import contextmanager -from typing import Tuple, Union +from logging import getLogger +from typing import List, Optional, Tuple, Union import numpy as np from packaging import version +logger = getLogger(__name__) + TORCH_MINIMUM_VERSION = version.parse("1.11.0") TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0") DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0") @@ -37,16 +39,42 @@ STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt} -def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]: - # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version +def _is_package_available( + pkg_name: str, + return_version: bool = False, + pkg_distributions: Optional[List[str]] = None, +) -> Union[Tuple[bool, str], bool]: + """ 
+ Check if a package is available in the current environment and not just an importable module by checking its version. + Optionally return the version of the package. + + Args: + pkg_name (str): The name of the package to check. + return_version (bool): Whether to return the version of the package. + pkg_distributions (Optional[List[str]]): A list of package distributions (e.g. "package-name", "package-name-gpu", etc.) to check for the package. + + Returns: + Union[Tuple[bool, str], bool]: A tuple of the package availability and the version of the package if `return_version` is `True`. + """ + package_exists = importlib.util.find_spec(pkg_name) is not None package_version = "N/A" + + if pkg_distributions is None: + pkg_distributions = [pkg_name] + else: + pkg_distributions.append(pkg_name) + if package_exists: - try: - package_version = importlib.metadata.version(pkg_name) - package_exists = True - except importlib.metadata.PackageNotFoundError: - package_exists = False + for pkg in pkg_distributions: + try: + package_version = importlib.metadata.version(pkg) + package_exists = True + break + except importlib.metadata.PackageNotFoundError: + package_exists = False + pass + if return_version: return package_exists, package_version else: @@ -64,45 +92,55 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _diffusers_available, _diffusers_version = _is_package_available("diffusers", return_version=True) _transformers_available, _transformers_version = _is_package_available("transformers", return_version=True) _torch_available, _torch_version = _is_package_available("torch", return_version=True) - -# importlib.metadata.version seem to not be robust with the ONNX Runtime extensions (`onnxruntime-gpu`, etc.) -_onnxruntime_available = _is_package_available("onnxruntime", return_version=False) - -# TODO : Remove -torch_version = version.parse(importlib.metadata.version("torch")) if _torch_available else None - - -# Note: _is_package_available("tensorflow") fails for tensorflow-cpu. Please test any changes to the line below -# with tensorflow-cpu to make sure it still works! -_tf_available = importlib.util.find_spec("tensorflow") is not None -_tf_version = None -if _tf_available: - candidates = ( +_onnxruntime_available, _onnxruntime_version = _is_package_available( + "onnxruntime", + return_version=True, + pkg_distributions=[ + "onnxruntime-gpu", + "onnxruntime-rocm", + "onnxruntime-training", + # list in https://github.com/microsoft/onnxruntime/blob/main/setup.py#L56C1-L98C91 + "onnxruntime-training-rocm", + "onnxruntime-training-cpu", + "onnxruntime-openvino", + "onnxruntime-vitisai", + "onnxruntime-armnn", + "onnxruntime-cann", + "onnxruntime-dnnl", + "onnxruntime-acl", + "onnxruntime-tvm", + "onnxruntime-qnn", + "onnxruntime-migraphx", + "ort-migraphx-nightly", + "ort-rocm-nightly", + ], +) +_tf_available, _tf_version = _is_package_available( + "tensorflow", + return_version=True, + pkg_distributions=[ "tensorflow", "tensorflow-cpu", "tensorflow-gpu", + "tensorflow-rocm", + "tensorflow-macos", + "tensorflow-aarch64", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "tf-nightly-rocm", + "tf-nightly-macos", "intel-tensorflow", "intel-tensorflow-avx512", - "tensorflow-rocm", - "tensorflow-macos", - "tensorflow-aarch64", + ], +) + +if _tf_available and version.parse(_tf_version) < version.parse("2"): + logger.warning( + "TensorFlow 2.0 or higher is required to use the TensorFlow backend. 
" + "Please install the latest version of TensorFlow, or switch to another backend." ) - # For the metadata, we have to look for both tensorflow and tensorflow-cpu - for pkg in candidates: - try: - _tf_version = importlib.metadata.version(pkg) - break - except importlib.metadata.PackageNotFoundError: - pass - _tf_available = _tf_version is not None -if _tf_available: - if version.parse(_tf_version) < version.parse("2"): - _tf_available = False -_tf_version = _tf_version or "N/A" + _tf_available = False # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 @@ -168,14 +206,6 @@ def is_onnx_available(): def is_onnxruntime_available(): - try: - # Try to import the source file of onnxruntime - if you run the tests from `tests` the function gets - # confused since there a folder named `onnxruntime` in `tests`. Therefore, `_onnxruntime_available` - # will be set to `True` even if not installed. - mod = importlib.import_module("onnxruntime") - inspect.getsourcefile(mod) - except Exception: - return False return _onnxruntime_available diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index e4545a8473..6a265061fd 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -383,6 +383,7 @@ class DummyTextInputGenerator(DummyInputGenerator): "input_ids", "attention_mask", "encoder_attention_mask", + "global_attention_mask", "token_type_ids", "position_ids", ) @@ -425,24 +426,47 @@ def __init__( self.padding_side = padding_side self.normalized_config = normalized_config - def generate( - self, - input_name: str, - framework: str = "pt", - int_dtype: str = "int64", - float_dtype: str = "fp32", - ): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): min_value = 0 max_value = 2 if input_name != "input_ids" else self.vocab_size - shape = [self.batch_size, self.sequence_length] + if self.task == "multiple-choice": shape = [self.batch_size, self.num_choices, self.sequence_length] - if "mask" in input_name: + else: + shape = [self.batch_size, self.sequence_length] + + if input_name in ["attention_mask", "encoder_attention_mask"]: return self.random_mask_tensor(shape, padding_side=self.padding_side, framework=framework, dtype=int_dtype) else: return self.random_int_tensor(shape, max_value, min_value=min_value, framework=framework, dtype=int_dtype) +class LongformerDummyTextInputGenerator(DummyTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "input_ids", + "attention_mask", + "token_type_ids", + "global_attention_mask", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "global_attention_mask": + attention_mask = super().generate( + "attention_mask", framework=framework, int_dtype=int_dtype, float_dtype=float_dtype + ) + + if framework == "pt": + global_attention_mask = torch.zeros_like(attention_mask) + elif framework == "tf": + global_attention_mask = tf.zeros_like(attention_mask) + else: + global_attention_mask = np.zeros_like(attention_mask) + + return global_attention_mask + + return super().generate(input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype) + + class DummyXPathSeqInputGenerator(DummyTextInputGenerator): """ Generates dummy xpath sequences. 
@@ -1559,3 +1583,12 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int framework=framework, dtype=float_dtype, ) + + +class MCTCTDummyAudioInputGenerator(DummyAudioInputGenerator): + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "input_features": + shape = [self.batch_size, self.sequence_length, self.normalized_config.input_features_per_channel] + return self.random_float_tensor(shape, min_value=-1, max_value=1, framework=framework, dtype=float_dtype) + + return super().generate(input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype) diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 053417b20b..3f497b5920 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -234,8 +234,8 @@ class NormalizedConfigManager: "albert": NormalizedTextConfig, "bart": BartLikeNormalizedTextConfig, "bert": NormalizedTextConfig, - # "big_bird": NormalizedTextConfig, - # "bigbird_pegasus": BartLikeNormalizedTextConfig, + "big-bird": NormalizedTextConfig, + "bigbird-pegasus": BartLikeNormalizedTextConfig, "blenderbot": BartLikeNormalizedTextConfig, "blenderbot-small": BartLikeNormalizedTextConfig, "bloom": NormalizedTextConfig.with_args(num_layers="n_layer"), diff --git a/setup.py b/setup.py index 7e61560faf..d132975aa4 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,15 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "accelerate", # ORTTrainer requires it. + "transformers>=4.36,<4.49.0", + ], + "onnxruntime-training": [ + "torch-ort", + "onnxruntime-training>=1.11.0", + "datasets>=1.2.1", + "accelerate", + "evaluate", + "protobuf>=3.20.1", "transformers>=4.36,<4.49.0", ], "exporters": [ @@ -84,7 +92,6 @@ "datasets<=2.16", "transformers>=4.36,<4.38", ], - # "executorch": "optimum-executorch", "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", "openvino": "optimum-intel[openvino]>=1.18.0", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index bc6fafc53c..8705765bb2 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -37,6 +37,7 @@ PYTORCH_EXPORT_MODELS_TINY = { "albert": "hf-internal-testing/tiny-random-AlbertModel", + "audio-spectrogram-transformer": "hf-internal-testing/tiny-random-ASTModel", "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", "bert": { "hf-internal-testing/tiny-random-BertModel": [ @@ -50,8 +51,8 @@ "nreimers/BERT-Tiny_L-2_H-128_A-2": ["feature-extraction"], }, "bart": "hf-internal-testing/tiny-random-bart", - # "big-bird": "hf-internal-testing/tiny-random-BigBirdModel", - # "bigbird-pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", + "big-bird": "hf-internal-testing/tiny-random-BigBirdModel", + "bigbird-pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", @@ -73,7 +74,7 @@ "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel", - "detr": "hf-internal-testing/tiny-random-DetrModel", # hf-internal-testing/tiny-random-detr is larger + "detr": "hf-internal-testing/tiny-random-DetrModel", "distilbert": 
"hf-internal-testing/tiny-random-DistilBertModel", "dpt": "hf-internal-testing/tiny-random-DPTModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", @@ -114,17 +115,19 @@ "lilt": "hf-internal-testing/tiny-random-LiltModel", "llama": "fxmarty/tiny-llama-fast-tokenizer", "longt5": "fxmarty/tiny-random-working-LongT5Model", - # "longformer": "allenai/longformer-base-4096", + "longformer": "hf-internal-testing/tiny-random-LongformerModel", "m2m-100": "hf-internal-testing/tiny-random-m2m_100", "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "maskformer": "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation", "mbart": "hf-internal-testing/tiny-random-mbart", + "mctct": "hf-internal-testing/tiny-random-MCTCTModel", + "megatron-bert": "hf-internal-testing/tiny-random-MegatronBertModel", "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet-v2": "hf-internal-testing/tiny-random-MobileNetV2Model", - "mobilenet-v1": "google/mobilenet_v1_0.75_192", + "mobilenet-v1": "hf-internal-testing/tiny-random-MobileNetV1Model", "mobilevit": "hf-internal-testing/tiny-random-mobilevit", "modernbert": "hf-internal-testing/tiny-random-ModernBertForMaskedLM", "mpnet": "hf-internal-testing/tiny-random-MPNetModel", @@ -147,7 +150,6 @@ "phi": "echarlaix/tiny-random-PhiForCausalLM", "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", - # "rembert": "google/rembert", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", @@ -172,7 +174,7 @@ "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification", "vits": "echarlaix/tiny-random-vits", "yolos": "hf-internal-testing/tiny-random-YolosModel", - "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken + "whisper": "optimum-internal-testing/tiny-random-whisper", "hubert": "hf-internal-testing/tiny-random-HubertModel", "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", @@ -197,10 +199,6 @@ "hf-internal-testing/tiny-random-UniSpeechSatForPreTraining": ["audio-frame-classification"], "hf-internal-testing/tiny-random-UniSpeechSatForXVector": ["audio-xvector"], }, - "audio-spectrogram-transformer": "Ericwang/tiny-random-ast", - "megatron-bert": "hf-internal-testing/tiny-random-MegatronBertModel", - # Disabled for now because some operator seems to not be supported by ONNX. 
- # "mctct": "hf-internal-testing/tiny-random-MCTCTModel", "speech-to-text": "hf-internal-testing/tiny-random-Speech2TextModel", "speecht5": "hf-internal-testing/tiny-random-SpeechT5ForTextToSpeech", "xlm": "hf-internal-testing/tiny-random-XLMModel", @@ -218,38 +216,39 @@ }, } - +# TODO: enable export slow tests PYTORCH_EXPORT_MODELS_LARGE = { "albert": "albert-base-v2", + "audio-spectrogram-transformer": "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", "beit": "microsoft/beit-base-patch16-224", "bert": "bert-base-cased", "bart": "facebook/bart-base", - # "big-bird": "google/bigbird-roberta-base", - # "bigbird-pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", + "big-bird": "google/bigbird-roberta-base", + "bigbird-pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot-small": "facebook/blenderbot_small-90M", "blenderbot": "facebook/blenderbot-90M", - "bloom": "hf-internal-testing/tiny-random-BloomModel", # Not using bigscience/bloom-560m because it goes OOM. + "bloom": "bigscience/bloom-560m", "camembert": "camembert-base", "clip": "openai/clip-vit-base-patch32", "convbert": "YituTech/conv-bert-base", "convnext": "facebook/convnext-tiny-224", - "codegen": "hf-internal-testing/tiny-random-CodeGenModel", # Not using Salesforce/codegen-350M-multi because it takes too much time for testing. + "codegen": "Salesforce/codegen-350M-multi", "data2vec-text": "facebook/data2vec-text-base", "data2vec-vision": "facebook/data2vec-vision-base", "data2vec-audio": "facebook/data2vec-audio-base", - "deberta": "hf-internal-testing/tiny-random-DebertaModel", # Not using microsoft/deberta-base because it takes too much time for testing. - "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model", # Not using microsoft/deberta-v2-xlarge because it takes too much time for testing. + "deberta": "microsoft/deberta-base", + "deberta-v2": "microsoft/deberta-v2-xlarge", "deit": "facebook/deit-small-patch16-224", - "detr": "hf-internal-testing/tiny-random-detr", # Not using facebook/detr-resnet-50 because it takes too much time for testing. + "detr": "facebook/detr-resnet-50", "distilbert": "distilbert-base-cased", "electra": "google/electra-base-generator", "encoder-decoder": "patrickvonplaten/bert2bert_cnn_daily_mail", - "flaubert": "hf-internal-testing/tiny-random-flaubert", # TODO + "flaubert": "flaubert/flaubert_small_cased", "gemma": "google/gemma-2b", "gpt2": "gpt2", "gpt-neo": "EleutherAI/gpt-neo-125M", "gpt-neox": "EleutherAI/gpt-neox-20b", - "gptj": "anton-l/gpt-j-tiny-random", # TODO + "gptj": "architext/gptj-162M", "groupvit": "nvidia/groupvit-gcc-yfcc", "hiera": "facebook/hiera-tiny-224-in1k-hf", "ibert": "kssteven/ibert-roberta-base", @@ -259,26 +258,26 @@ "layoutlmv3": "microsoft/layoutlmv3-base", "lilt": "SCUT-DLVCLab/lilt-roberta-en-base", "llama": "decapoda-research/llama-65b-hf", - "longt5": "fxmarty/tiny-random-working-LongT5Model", # Not using google/long-t5-local-base because it takes too much time for testing. - # "longformer": "allenai/longformer-base-4096", - "m2m-100": "hf-internal-testing/tiny-random-m2m_100", # Not using facebook/m2m100_418M because it takes too much time for testing. 
+ "longt5": "google/long-t5-local-base", + "longformer": "allenai/longformer-base-4096", + "m2m-100": "facebook/m2m100_418M", "marian": "Helsinki-NLP/opus-mt-en-de", "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "maskformer": "facebook/maskformer-swin-tiny-coco", "mbart": "sshleifer/tiny-mbart", "mgp-str": "alibaba-damo/mgp-str-base", "mobilebert": "google/mobilebert-uncased", - # "mobilenet_v1": "google/mobilenet_v1_0.75_192", - # "mobilenet_v2": "google/mobilenet_v2_0.35_96", + "mobilenet_v1": "google/mobilenet_v1_0.75_192", + "mobilenet_v2": "google/mobilenet_v2_0.35_96", "mobilevit": "apple/mobilevit-small", "modernbert": "answerdotai/ModernBERT-base", "mpt": "mosaicml/mpt-7b", - "mt5": "lewtun/tiny-random-mt5", # Not using google/mt5-small because it takes too much time for testing. + "mt5": "google/mt5-small", "musicgen": "facebook/musicgen-small", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", "owlv2": "google/owlv2-base-patch16", "owlvit": "google/owlvit-base-patch32", - "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. + "perceiver": "deepmind/language-perceiver", "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", @@ -308,9 +307,7 @@ "sew-d": "asapp/sew-d-tiny-100k-ft-ls100h", "unispeech": "microsoft/unispeech-1350-en-353-fr-ft-1h", "unispeech-sat": "microsoft/unispeech-sat-base", - "audio-spectrogram-transformer": "nielsr/audio-spectogram-transformer-finetuned-audioset-10-10-0.4593", - # Disabled for now because some operator seems to not be supported by ONNX. - # "mctct": "speechbrain/m-ctc-t-large", + "mctct": "speechbrain/m-ctc-t-large", "speech-to-text": "codenamewei/speech-to-text", "xlm": "xlm-clm-ende-1024", "xlm-roberta": "Unbabel/xlm-roberta-comet-small", diff --git a/tests/onnxruntime/ds_configs/ds_config_zero_stage_1.json b/tests/onnxruntime-training/ds_configs/ds_config_zero_stage_1.json similarity index 100% rename from tests/onnxruntime/ds_configs/ds_config_zero_stage_1.json rename to tests/onnxruntime-training/ds_configs/ds_config_zero_stage_1.json diff --git a/tests/onnxruntime/ds_configs/ds_config_zero_stage_2.json b/tests/onnxruntime-training/ds_configs/ds_config_zero_stage_2.json similarity index 100% rename from tests/onnxruntime/ds_configs/ds_config_zero_stage_2.json rename to tests/onnxruntime-training/ds_configs/ds_config_zero_stage_2.json diff --git a/tests/onnxruntime/ds_configs/ds_config_zero_stage_3.json b/tests/onnxruntime-training/ds_configs/ds_config_zero_stage_3.json similarity index 100% rename from tests/onnxruntime/ds_configs/ds_config_zero_stage_3.json rename to tests/onnxruntime-training/ds_configs/ds_config_zero_stage_3.json diff --git a/tests/onnxruntime/ds_configs/ds_config_zero_stage_inifinity.json b/tests/onnxruntime-training/ds_configs/ds_config_zero_stage_inifinity.json similarity index 100% rename from tests/onnxruntime/ds_configs/ds_config_zero_stage_inifinity.json rename to tests/onnxruntime-training/ds_configs/ds_config_zero_stage_inifinity.json diff --git a/tests/onnxruntime/training/nightly_test_examples.py b/tests/onnxruntime-training/test_examples.py similarity index 91% rename from tests/onnxruntime/training/nightly_test_examples.py rename to tests/onnxruntime-training/test_examples.py index a16913a097..5873f238af 100644 --- a/tests/onnxruntime/training/nightly_test_examples.py +++ 
b/tests/onnxruntime-training/test_examples.py @@ -25,7 +25,7 @@ class ORTTrainerExampleTest(unittest.TestCase): def test_text_classification(self): subprocess.run( - "cp ../examples/onnxruntime/training/text-classification/run_glue.py ./", + "cp examples/onnxruntime/training/text-classification/run_glue.py ./", shell=True, ) @@ -51,7 +51,7 @@ def test_text_classification(self): def test_token_classification(self): subprocess.run( - "cp ../examples/onnxruntime/training/token-classification/run_ner.py ./", + "cp examples/onnxruntime/training/token-classification/run_ner.py ./", shell=True, ) @@ -75,7 +75,7 @@ def test_token_classification(self): def test_translation(self): subprocess.run( - "cp ../examples/onnxruntime/training/translation/run_translation.py ./", + "cp examples/onnxruntime/training/translation/run_translation.py ./", shell=True, ) @@ -105,7 +105,7 @@ def test_translation(self): @pytest.mark.skip(reason="skip for now") def test_summarization(self): subprocess.run( - "cp ../examples/onnxruntime/training/summarization/run_summarization.py ./", + "cp examples/onnxruntime/training/summarization/run_summarization.py ./", shell=True, ) @@ -139,7 +139,7 @@ def test_stable_diffusion_txt2img(self): @pytest.mark.skip(reason="skip for now") def test_question_answering(self): subprocess.run( - "cp ../examples/onnxruntime/training/question-answering/run_qa.py ./", + "cp examples/onnxruntime/training/question-answering/run_qa.py ./", shell=True, ) @@ -166,7 +166,7 @@ def test_question_answering(self): @pytest.mark.skip(reason="skip for now") def test_language_modeling(self): subprocess.run( - "cp ../examples/onnxruntime/training/question-answering/run_qa.py ./", + "cp examples/onnxruntime/training/question-answering/run_qa.py ./", shell=True, ) @@ -194,7 +194,7 @@ def test_language_modeling(self): @pytest.mark.skip(reason="skip for now") def test_image_classification(self): subprocess.run( - "cp ../examples/onnxruntime/training/image-classification/run_image_classification.py ./", + "cp examples/onnxruntime/training/image-classification/run_image_classification.py ./", shell=True, ) diff --git a/tests/onnxruntime/training/nightly_test_trainer.py b/tests/onnxruntime-training/test_trainer.py similarity index 97% rename from tests/onnxruntime/training/nightly_test_trainer.py rename to tests/onnxruntime-training/test_trainer.py index e24ee30617..ac4413c639 100644 --- a/tests/onnxruntime/training/nightly_test_trainer.py +++ b/tests/onnxruntime-training/test_trainer.py @@ -60,11 +60,11 @@ nltk.download("punkt") _ENCODERS_TO_TEST = { - ("distilbert", "distilbert-base-cased"), + ("distilbert", "distilbert-base-uncased"), } _DECODERS_TO_TEST = { - ("gpt2", "gpt2"), + ("gpt2", "distilgpt2"), } _SEQ2SEQ_MODELS_TO_TEST = { @@ -78,11 +78,6 @@ "data_collator": default_data_collator, "data_collator_class": DataCollatorWithPadding, }, - # "token-classification": { - # "dataset": ["conll2003"], - # "metric": ["seqeval"], - # "data_collator_class": DataCollatorForTokenClassification, - # }, } _DECODER_TASKS_DATASETS_CONFIGS = { @@ -235,7 +230,7 @@ def load_and_prepare(task): def load_and_prepare_glue(model_name, data_metric_config, max_seq_length, padding="max_length", **kwargs): # Prepare model - model = AutoModelForSequenceClassification.from_pretrained(model_name) + model = AutoModelForSequenceClassification.from_pretrained(model_name, attn_implementation="eager") tokenizer = AutoTokenizer.from_pretrained(model_name) # Prepare dataset @@ -295,7 +290,9 @@ def load_and_prepare_ner(model_name, 
data_metric_config, max_seq_length, padding label_list = dataset["train"].features[f"{task}_tags"].feature.names # Prepare model - model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list)) + model = AutoModelForTokenClassification.from_pretrained( + model_name, num_labels=len(label_list), attn_implementation="eager" + ) if model_name.split("-")[0] in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, add_prefix_space=True) else: @@ -387,7 +384,7 @@ def load_and_prepare_clm(model_name, data_metric_config, max_seq_length, padding metric = load(*data_metric_config["metric"]) # Prepare model - model = AutoModelForCausalLM.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation="eager") tokenizer = AutoTokenizer.from_pretrained(model_name) # Prepare dataset @@ -462,7 +459,7 @@ def compute_metrics(eval_pred): def load_and_prepare_xsum(model_name, data_metric_config, _, **kwargs): # Prepare model - model = AutoModelForSeq2SeqLM.from_pretrained(model_name) + model = AutoModelForSeq2SeqLM.from_pretrained(model_name, attn_implementation="eager") tokenizer = AutoTokenizer.from_pretrained(model_name) # Load dataset and metric @@ -600,7 +597,7 @@ def test_trainer_fp32(self, test_name, model_name, task, data_metric_config): trainer.train() trainer.save_model() trainer.evaluate() - trainer.predict(test_dataset) + # trainer.predict(test_dataset) gc.collect() @slow @@ -639,7 +636,7 @@ def test_trainer_fp32_with_label_smoothing(self, test_name, model_name, task, da trainer.train() trainer.save_model() trainer.evaluate() - trainer.predict(test_dataset) + # trainer.predict(test_dataset) gc.collect() @slow @@ -678,7 +675,7 @@ def test_trainer_fp16(self, test_name, model_name, task, data_metric_config): trainer.train() trainer.save_model() trainer.evaluate() - trainer.predict(test_dataset) + # trainer.predict(test_dataset) gc.collect() @@ -730,7 +727,7 @@ def test_trainer_fp16_ds_stage1(self, test_name, model_name, task, data_metric_c weight_decay=self.weight_decay, logging_dir=tmp_dir, fp16=True, - deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_1.json", + deepspeed="tests/onnxruntime-training/ds_configs/ds_config_zero_stage_1.json", ) trainer, _ = get_ort_trainer( @@ -769,7 +766,7 @@ def test_trainer_fp16_ds_stage2(self, test_name, model_name, task, data_metric_c weight_decay=self.weight_decay, logging_dir=tmp_dir, fp16=True, - deepspeed="onnxruntime/ds_configs/ds_config_zero_stage_2.json", + deepspeed="tests/onnxruntime-training/ds_configs/ds_config_zero_stage_2.json", ) trainer, _ = get_ort_trainer( diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu b/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu deleted file mode 100644 index 696d863f34..0000000000 --- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu +++ /dev/null @@ -1,26 +0,0 @@ -# use version with CUDA 11.8 and TensorRT 8.5.1.7 to match ORT 1.14 requirements -FROM nvcr.io/nvidia/tensorrt:24.02-py3 -CMD nvidia-smi - -# Ignore interactive questions during `docker build` -ENV DEBIAN_FRONTEND noninteractive - -# Install and update tools to minimize security vulnerabilities -RUN apt-get update -RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ - apt-get clean -RUN unattended-upgrade -RUN apt-get autoremove -y - -RUN 
python -m pip install -U pip - -RUN pip install transformers torch onnxruntime-gpu -RUN pip install datasets evaluate diffusers scipy - -# Install Optimum -COPY . /workspace/optimum -RUN pip install /workspace/optimum[onnxruntime-gpu,tests] - -ENV TEST_LEVEL=1 -CMD pytest onnxruntime/test_*.py --durations=0 -s -vvvvv -m cuda_ep_test -m trt_ep_test diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer b/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer deleted file mode 100644 index 82fece1cf5..0000000000 --- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 The HuggingFace Team All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Use nvidia/cuda image -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -CMD nvidia-smi - -# Ignore interactive questions during `docker build` -ENV DEBIAN_FRONTEND noninteractive - -# Bash shell -RUN chsh -s /bin/bash -SHELL ["/bin/bash", "-c"] - -# Versions -ARG PYTHON_VERSION=3.9 -ARG TORCH_CUDA_VERSION=cu118 -ARG TORCH_VERSION=2.0.0 -ARG TORCHVISION_VERSION=0.15.1 - -# Install and update tools to minimize security vulnerabilities -RUN apt-get update -RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ - bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ - apt-get clean -RUN unattended-upgrade -RUN apt-get autoremove -y - -# Install miniconda (comes with python 3.9 default) -ARG BUILD_USER=onnxruntimedev -ARG MINICONDA_PREFIX=/home/$BUILD_USER/miniconda3 -RUN apt-get install curl - -ARG CONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh -RUN curl -fSsL --insecure ${CONDA_URL} -o install-conda.sh && \ - /bin/bash ./install-conda.sh -b -p $MINICONDA_PREFIX && \ - $MINICONDA_PREFIX/bin/conda clean -ya && \ - $MINICONDA_PREFIX/bin/conda install -y python=${PYTHON_VERSION} - -ENV PATH=$MINICONDA_PREFIX/bin:${PATH} - -ARG PYTHON_EXE=$MINICONDA_PREFIX/bin/python - -# (Optional) Intall test dependencies -RUN $PYTHON_EXE -m pip install git+https://github.com/huggingface/transformers -RUN $PYTHON_EXE -m pip install datasets accelerate evaluate coloredlogs absl-py rouge_score seqeval scipy sacrebleu nltk scikit-learn parameterized sentencepiece -RUN $PYTHON_EXE -m pip install deepspeed mpi4py -# RUN $PYTHON_EXE -m pip install optuna ray sigopt wandb - -# PyTorch -RUN $PYTHON_EXE -m pip install onnx ninja -RUN $PYTHON_EXE -m pip install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} -f https://download.pytorch.org/whl/${TORCH_CUDA_VERSION} - -# ORT Module -RUN $PYTHON_EXE -m pip install onnxruntime-training==1.16.3 -f https://download.onnxruntime.ai/onnxruntime_stable_cu118.html -RUN $PYTHON_EXE -m pip install torch-ort -ENV TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" -RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2 -RUN $PYTHON_EXE -m 
torch_ort.configure - -# https://github.com/vllm-project/vllm/issues/1726 -RUN pip uninstall nvidia-nccl-cu12 -y - -# Install Optimum -COPY . /workspace/optimum -RUN pip install /workspace/optimum[tests] - -ENV TEST_LEVEL=1 -CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_trainer.py --durations=0 -CMD RUN_SLOW=1 pytest -v -rs onnxruntime/training/nightly_test_examples.py --durations=0 diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 5ff2509264..749e078456 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -25,8 +25,8 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image from parameterized import parameterized +from testing_utils import MODEL_NAMES, SEED, ORTModelTestMixin from transformers.testing_utils import require_torch_gpu -from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin from optimum.onnxruntime import ( ORTDiffusionPipeline, @@ -281,16 +281,18 @@ def test_negative_prompt(self, model_arch: str): grid_parameters( { "model_arch": SUPPORTED_ARCHITECTURES, - "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + "provider": ["CUDAExecutionProvider", "TensorrtExecutionProvider"], } ) ) - @pytest.mark.rocm_ep_test @pytest.mark.cuda_ep_test @pytest.mark.trt_ep_test @require_torch_gpu @require_diffusers def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: + self.skipTest("Testing a single arch for TensorrtExecutionProvider") + model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) @@ -519,16 +521,18 @@ def test_image_reproducibility(self, model_arch: str): grid_parameters( { "model_arch": SUPPORTED_ARCHITECTURES, - "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + "provider": ["CUDAExecutionProvider", "TensorrtExecutionProvider"], } ) ) - @pytest.mark.rocm_ep_test @pytest.mark.cuda_ep_test @pytest.mark.trt_ep_test @require_torch_gpu @require_diffusers def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: + self.skipTest("Testing a single arch for TensorrtExecutionProvider") + model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) @@ -759,16 +763,18 @@ def test_image_reproducibility(self, model_arch: str): grid_parameters( { "model_arch": SUPPORTED_ARCHITECTURES, - "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + "provider": ["CUDAExecutionProvider", "TensorrtExecutionProvider"], } ) ) - @pytest.mark.rocm_ep_test @pytest.mark.cuda_ep_test @pytest.mark.trt_ep_test @require_torch_gpu @require_diffusers def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: + self.skipTest("Testing a single arch for TensorrtExecutionProvider") + model_args = {"test_name": test_name, "model_arch": model_arch} self._setup(model_args) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c341bd88a9..9ea0483e35 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -16,7 
+16,6 @@ import os import subprocess import tempfile -import time import unittest from pathlib import Path from typing import Dict @@ -26,12 +25,12 @@ import onnxruntime import pytest import requests -import timm import torch from huggingface_hub import HfApi from huggingface_hub.constants import default_cache_path from parameterized import parameterized from PIL import Image +from testing_utils import MODEL_NAMES, SEED, ORTModelTestMixin from transformers import ( AutoConfig, AutoFeatureExtractor, @@ -65,7 +64,6 @@ from transformers.models.swin2sr.configuration_swin2sr import Swin2SRConfig from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import get_gpu_count, require_torch_gpu, slow -from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin from optimum.exporters import TasksManager from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS, main_export @@ -130,21 +128,12 @@ logger = logging.get_logger() -class Timer(object): - def __enter__(self): - self.elapsed = time.perf_counter() - return self - - def __exit__(self, type, value, traceback): - self.elapsed = (time.perf_counter() - self.elapsed) * 1e3 - - class ORTModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.TEST_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" self.LOCAL_MODEL_PATH = "tests/assets/onnx" self.ONNX_MODEL_ID = "philschmid/distilbert-onnx" + self.TINY_ONNX_MODEL_ID = "fxmarty/resnet-tiny-beans" self.FAIL_ONNX_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" self.ONNX_SEQ2SEQ_MODEL_ID = "optimum/t5-small" @@ -255,6 +244,16 @@ def test_load_model_cuda_provider(self): self.assertListEqual(model.model.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_torch_gpu + @pytest.mark.trt_ep_test + def test_load_model_tensorrt_provider(self): + model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="TensorrtExecutionProvider") + self.assertListEqual( + model.providers, ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"] + ) + self.assertListEqual(model.model.get_providers(), model.providers) + self.assertEqual(model.device, torch.device("cuda:0")) + @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -774,7 +773,6 @@ def test_seq2seq_model_on_gpu_id(self): model.decoder_with_past.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1" ) - # test string device input for to() @require_torch_gpu @pytest.mark.cuda_ep_test def test_seq2seq_model_on_gpu_str(self): @@ -1054,7 +1052,7 @@ def test_save_load_ort_model_with_external_data(self): def test_save_load_decoder_model_with_external_data(self, use_cache: bool): with tempfile.TemporaryDirectory() as tmpdirname: model = ORTModelForCausalLM.from_pretrained( - "gpt2-large", use_cache=use_cache, export=True, use_merged=False, use_io_binding=False + "gpt2-large", export=True, use_cache=use_cache, use_merged=False, use_io_binding=False ) model.save_pretrained(tmpdirname) @@ -1265,9 +1263,7 @@ def test_trust_remote_code(self): ort_logits = ort_model(**inputs).logits - self.assertTrue( - torch.allclose(pt_logits, ort_logits, atol=1e-4), f" Maxdiff: {torch.abs(pt_logits - ort_logits).max()}" - ) + torch.testing.assert_close(pt_logits, ort_logits, atol=1e-4, rtol=1e-4) @parameterized.expand(("", "onnx")) def test_loading_with_config_not_from_subfolder(self, subfolder): @@ -1289,8 +1285,8 @@ class 
ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): "albert", "bart", "bert", - # "big_bird", - # "bigbird_pegasus", + "big_bird", + "bigbird_pegasus", "camembert", "convbert", "data2vec_text", @@ -1354,11 +1350,14 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.end_logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.start_logits), transformers_outputs.start_logits, atol=1e-4) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.start_logits), + transformers_outputs.start_logits, + atol=self.ATOL, + rtol=self.RTOL, ) - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.end_logits), transformers_outputs.end_logits, atol=1e-4) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.end_logits), transformers_outputs.end_logits, atol=self.ATOL, rtol=self.RTOL ) gc.collect() @@ -1458,14 +1457,18 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForQuestionAnswering.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) io_model = ORTModelForQuestionAnswering.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) tokenizer = get_preprocessor(model_id) - tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt") + tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**tokens) io_outputs = io_model(**tokens) @@ -1475,8 +1478,12 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.end_logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.start_logits, io_outputs.start_logits)) - self.assertTrue(torch.equal(onnx_outputs.end_logits, io_outputs.end_logits)) + torch.testing.assert_close( + torch.Tensor(io_outputs.start_logits), onnx_outputs.start_logits, atol=self.ATOL, rtol=self.RTOL + ) + torch.testing.assert_close( + torch.Tensor(io_outputs.end_logits), onnx_outputs.end_logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -1485,7 +1492,7 @@ class ORTModelForMaskedLMIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ "albert", "bert", - # "big_bird", + "big_bird", "camembert", "convbert", "data2vec_text", @@ -1544,7 +1551,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -1630,16 +1639,19 @@ def test_compare_to_io_binding(self, model_arch): self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForMaskedLM.from_pretrained(self.onnx_model_dirs[model_arch], use_io_binding=False).to( - "cuda" + onnx_model = ORTModelForMaskedLM.from_pretrained( + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" ) - io_model = 
ORTModelForMaskedLM.from_pretrained(self.onnx_model_dirs[model_arch], use_io_binding=True).to( - "cuda" + io_model = ORTModelForMaskedLM.from_pretrained( + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" ) + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) + tokenizer = get_preprocessor(model_id) - MASK_TOKEN = tokenizer.mask_token - tokens = tokenizer([f"The capital of France is {MASK_TOKEN}."] * 2, return_tensors="pt") + tokens = tokenizer([f"The capital of France is {tokenizer.mask_token}."] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**tokens) io_outputs = io_model(**tokens) @@ -1647,7 +1659,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(io_outputs.logits, onnx_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -1657,8 +1669,8 @@ class ORTModelForSequenceClassificationIntegrationTest(ORTModelTestMixin): "albert", "bart", "bert", - # "big_bird", - # "bigbird_pegasus", + "big_bird", + "bigbird_pegasus", "bloom", "camembert", "convbert", @@ -1725,7 +1737,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -1840,14 +1854,18 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForSequenceClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) io_model = ORTModelForSequenceClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) tokenizer = get_preprocessor(model_id) - tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt") + tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**tokens) io_outputs = io_model(**tokens) @@ -1855,7 +1873,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -1864,7 +1882,7 @@ class ORTModelForTokenClassificationIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ "albert", "bert", - # "big_bird", + "big_bird", "bloom", "camembert", "convbert", @@ -1926,7 +1944,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, 
atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -2020,14 +2040,18 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForTokenClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) io_model = ORTModelForTokenClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) tokenizer = get_preprocessor(model_id) - tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt") + tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**tokens) io_outputs = io_model(**tokens) @@ -2035,7 +2059,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -2083,10 +2107,11 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.last_hidden_state, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue( - torch.allclose( - torch.Tensor(onnx_outputs.last_hidden_state), transformers_outputs.last_hidden_state, atol=1e-4 - ) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.last_hidden_state), + transformers_outputs.last_hidden_state, + atol=self.ATOL, + rtol=self.RTOL, ) gc.collect() @@ -2178,14 +2203,18 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForFeatureExtraction.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) io_model = ORTModelForFeatureExtraction.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) tokenizer = get_preprocessor(model_id) - tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt") + tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**tokens) io_outputs = io_model(**tokens) @@ -2193,7 +2222,9 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.last_hidden_state, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.last_hidden_state, io_outputs.last_hidden_state)) + torch.testing.assert_close( + onnx_outputs.last_hidden_state, io_outputs.last_hidden_state, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -2206,7 +2237,9 @@ def test_default_token_type_ids(self): token_type_ids = tokens.pop("token_type_ids") outs = model(token_type_ids=token_type_ids, **tokens) outs_without_token_type_ids = model(**tokens) - self.assertTrue(np.allclose(outs.last_hidden_state, outs_without_token_type_ids.last_hidden_state)) + torch.testing.assert_close( + outs.last_hidden_state, outs_without_token_type_ids.last_hidden_state, atol=self.ATOL, 
rtol=self.RTOL + ) gc.collect() @@ -2215,7 +2248,7 @@ class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ "albert", "bert", - # "big_bird", + "big_bird", "camembert", "convbert", "data2vec_text", @@ -2274,7 +2307,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -2287,24 +2322,25 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForMultipleChoice.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") - io_model = ORTModelForMultipleChoice.from_pretrained(self.onnx_model_dirs[model_arch], use_io_binding=True).to( - "cuda" + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) + io_model = ORTModelForMultipleChoice.from_pretrained( + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" ) - tokenizer = get_preprocessor(model_id) + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) + num_choices = 4 - first_sentence = ["The sky is blue due to the shorter wavelength of blue light."] * num_choices start = "The color of the sky is" + tokenizer = get_preprocessor(model_id) + first_sentence = ["The sky is blue due to the shorter wavelength of blue light."] * num_choices second_sentence = [start + "blue", start + "green", start + "red", start + "yellow"] inputs = tokenizer(first_sentence, second_sentence, truncation=True, padding=True) - # Unflatten the tokenized inputs values expanding it to the shape [batch_size, num_choices, seq_length] for k, v in inputs.items(): inputs[k] = [v[i : i + num_choices] for i in range(0, len(v), num_choices)] - - inputs = dict(inputs.convert_to_tensors(tensor_type="pt")) + inputs = dict(inputs.convert_to_tensors(tensor_type="pt").to("cuda")) onnx_outputs = onnx_model(**inputs) io_outputs = io_model(**inputs) @@ -2313,7 +2349,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(io_outputs.logits, onnx_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -2354,8 +2390,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): ORTMODEL_CLASS = ORTModelForCausalLM TASK = "text-generation" - GENERATION_LENGTH = 90 - SPEEDUP_CACHE = 1.1 + GENERATION_LENGTH = 100 @parameterized.expand([(False,), (True,)]) @pytest.mark.run_in_series @@ -2471,10 +2506,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertIsInstance(onnx_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue( - torch.allclose(onnx_outputs.logits, transformers_outputs.logits, atol=1e-4), - f"Maxdiff: {(onnx_outputs.logits - transformers_outputs.logits).abs()}", - ) + torch.testing.assert_close(onnx_outputs.logits, transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL) # Compare batched generation. 
tokenizer.pad_token_id = tokenizer.eos_token_id @@ -2516,13 +2548,11 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach set_seed(SEED) with torch.no_grad(): transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + set_seed(SEED) onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config) - self.assertTrue( - torch.equal(onnx_outputs, transformers_outputs), - f"Failed with generation config : {gen_config}, transformers outputs {transformers_outputs}, ONNX model outputs {onnx_outputs}", - ) + torch.testing.assert_close(onnx_outputs, transformers_outputs, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -2665,7 +2695,6 @@ def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: st gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch): model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) @@ -2677,34 +2706,25 @@ def test_compare_with_and_without_past_key_values(self, model_arch): text = "My Name is Philipp and i live" tokens = tokenizer(text, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) + generation_length = 10 # model has a short max length + model_with_pkv = ORTModelForCausalLM.from_pretrained( self.onnx_model_dirs[model_arch + "_True"], use_cache=True, use_io_binding=False ) - _ = model_with_pkv.generate(**tokens) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 + ) model_without_pkv = ORTModelForCausalLM.from_pretrained( self.onnx_model_dirs[model_arch + "_False"], use_cache=False, use_io_binding=False ) - _ = model_without_pkv.generate(**tokens) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) - - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], tokens["input_ids"].shape[1] + self.GENERATION_LENGTH) - self.assertEqual(outputs_model_without_pkv.shape[1], tokens["input_ids"].shape[1] + self.GENERATION_LENGTH) + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 + ) - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + torch.testing.assert_close(outputs_model_with_pkv, outputs_model_without_pkv, atol=self.ATOL, rtol=self.RTOL) + self.assertEqual(outputs_model_with_pkv.shape[1], tokens["input_ids"].shape[1] + generation_length) + self.assertEqual(outputs_model_without_pkv.shape[1], tokens["input_ids"].shape[1] + generation_length) @parameterized.expand(grid_parameters({"model_arch": 
SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): @@ -2747,7 +2767,7 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode outputs_model_not_merged = model_not_merged.generate(**tokens) outputs_model_merged = model_merged.generate(**tokens) - self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) + torch.testing.assert_close(outputs_model_not_merged, outputs_model_merged, atol=self.ATOL, rtol=self.RTOL) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) @@ -2765,11 +2785,17 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForCausalLM.from_pretrained( - self.onnx_model_dirs[test_name], use_cache=use_cache, use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[test_name], + use_cache=use_cache, + use_io_binding=False, + provider="CUDAExecutionProvider", + ) io_model = ORTModelForCausalLM.from_pretrained( - self.onnx_model_dirs[test_name], use_cache=use_cache, use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[test_name], + use_cache=use_cache, + use_io_binding=True, + provider="CUDAExecutionProvider", + ) tokenizer = get_preprocessor(model_id) tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") @@ -2788,7 +2814,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(io_outputs.logits, onnx_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -2800,10 +2826,15 @@ def test_compare_generation_to_io_binding(self, test_name: str, model_arch: str, self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForCausalLM.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=False).to( - "cuda" + onnx_model = ORTModelForCausalLM.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=False, provider="CUDAExecutionProvider" + ) + io_model = ORTModelForCausalLM.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=True, provider="CUDAExecutionProvider" ) - io_model = ORTModelForCausalLM.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True).to("cuda") + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) tokenizer = get_preprocessor(model_id) tokens = tokenizer( @@ -2811,11 +2842,12 @@ def test_compare_generation_to_io_binding(self, test_name: str, model_arch: str, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None, ).to("cuda") + onnx_outputs = onnx_model.generate(**tokens) io_outputs = io_model.generate(**tokens) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) + torch.testing.assert_close(io_outputs, onnx_outputs, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -2841,8 +2873,6 @@ class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin): "vit", ] - TIMM_SUPPORTED_ARCHITECTURES = ["default-timm-config"] - FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} ORTMODEL_CLASS = ORTModelForImageClassification TASK = "image-classification" @@ -2868,54 +2898,6 @@ def test_load_vanilla_transformers_which_is_not_supported(self): 
self.assertIn("only supports the tasks", str(context.exception)) - @parameterized.expand(TIMM_SUPPORTED_ARCHITECTURES) - @pytest.mark.run_slow - @pytest.mark.timm_test - @slow - def test_compare_to_timm(self, model_arch): - model_args = {"test_name": model_arch, "model_arch": model_arch} - - self._setup(model_args) - - model_ids = self._get_model_ids(model_arch) - for model_id in model_ids: - onnx_model = ORTModelForImageClassification.from_pretrained( - self._get_onnx_model_dir(model_id, model_arch, model_arch) - ) - - self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) - self.assertIsInstance(onnx_model.config, PretrainedConfig) - - set_seed(SEED) - timm_model = timm.create_model(model_id, pretrained=True) - timm_model = timm_model.eval() - - # get model specific transforms (normalization, resize) - data_config = timm.data.resolve_model_data_config(timm_model) - transforms = timm.data.create_transform(**data_config, is_training=False) - - url = ( - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" - ) - image = Image.open(requests.get(url, stream=True).raw) - inputs = transforms(image).unsqueeze(0) - - with torch.no_grad(): - timm_outputs = timm_model(inputs) - - for input_type in ["pt", "np"]: - if input_type == "np": - inputs = inputs.cpu().detach().numpy() - onnx_outputs = onnx_model(inputs) - - self.assertIn("logits", onnx_outputs) - self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) - - # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), timm_outputs, atol=1e-4)) - - gc.collect() - @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -2946,7 +2928,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), trtfs_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), trtfs_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -3046,16 +3030,26 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForImageClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], + use_io_binding=False, + provider="CUDAExecutionProvider", + provider_options={"cudnn_conv_algo_search": "DEFAULT"}, + ) io_model = ORTModelForImageClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], + use_io_binding=True, + provider="CUDAExecutionProvider", + provider_options={"cudnn_conv_algo_search": "DEFAULT"}, + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) preprocessor = get_preprocessor(model_id) url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) - inputs = preprocessor(images=[image] * 2, return_tensors="pt") + inputs = preprocessor(images=[image] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**inputs) io_outputs = io_model(**inputs) @@ -3063,10 +3057,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - 
self.assertTrue( - torch.allclose(onnx_outputs.logits, io_outputs.logits, atol=1e-4), - f" Maxdiff: {torch.abs(onnx_outputs.logits - io_outputs.logits).max()}", - ) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -3113,7 +3104,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), trtfs_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), trtfs_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -3211,16 +3204,20 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForSemanticSegmentation.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) io_model = ORTModelForSemanticSegmentation.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) preprocessor = get_preprocessor(model_id) url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) - inputs = preprocessor(images=[image] * 2, return_tensors="pt") + inputs = preprocessor(images=[image] * 2, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**inputs) io_outputs = io_model(**inputs) @@ -3228,10 +3225,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue( - torch.allclose(onnx_outputs.logits, io_outputs.logits, atol=1e-4), - f" Maxdiff: {torch.abs(onnx_outputs.logits - io_outputs.logits).max()}", - ) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -3296,7 +3290,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -3395,16 +3391,19 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForAudioClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) io_model = ORTModelForAudioClassification.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=True - ).to("cuda") + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) - processor = AutoFeatureExtractor.from_pretrained(model_id) data = self._generate_random_audio_data() + processor = AutoFeatureExtractor.from_pretrained(model_id) + input_values = processor(data, return_tensors="pt").to("cuda") - input_values = processor(data, return_tensors="pt") onnx_outputs = 
onnx_model(**input_values) io_outputs = io_model(**input_values) @@ -3412,7 +3411,7 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.allclose(onnx_outputs.logits, io_outputs.logits, atol=1e-4)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -3475,7 +3474,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -3487,17 +3488,26 @@ def test_compare_to_io_binding(self, model_arch): self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForCTC.from_pretrained( self.onnx_model_dirs[model_arch], use_io_binding=False, - ).to("cuda") - onnx_model.use_io_binding = False - io_model = ORTModelForCTC.from_pretrained(self.onnx_model_dirs[model_arch], use_io_binding=True).to("cuda") + provider="CUDAExecutionProvider", + provider_options={"cudnn_conv_algo_search": "DEFAULT"}, + ) + io_model = ORTModelForCTC.from_pretrained( + self.onnx_model_dirs[model_arch], + use_io_binding=True, + provider="CUDAExecutionProvider", + provider_options={"cudnn_conv_algo_search": "DEFAULT"}, + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) - processor = AutoFeatureExtractor.from_pretrained(model_id) data = self._generate_random_audio_data() - input_values = processor(data, return_tensors="pt") + processor = AutoFeatureExtractor.from_pretrained(model_id) + input_values = processor(data, return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**input_values) io_outputs = io_model(**input_values) @@ -3505,7 +3515,9 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), io_outputs.logits, atol=1e-1)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), io_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -3563,9 +3575,11 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.embeddings, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.embeddings), transformers_outputs.embeddings, atol=1e-4) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.embeddings), transformers_outputs.embeddings, atol=self.ATOL, rtol=self.RTOL ) gc.collect() @@ -3579,16 +3593,19 @@ def test_compare_to_io_binding(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForAudioXVector.from_pretrained( - self.onnx_model_dirs[model_arch], use_io_binding=False - ).to("cuda") - io_model = ORTModelForAudioXVector.from_pretrained(self.onnx_model_dirs[model_arch], use_io_binding=True).to( - "cuda" + self.onnx_model_dirs[model_arch], use_io_binding=False, provider="CUDAExecutionProvider" + ) + io_model = 
ORTModelForAudioXVector.from_pretrained( + self.onnx_model_dirs[model_arch], use_io_binding=True, provider="CUDAExecutionProvider" ) - processor = AutoFeatureExtractor.from_pretrained(model_id) + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) + data = self._generate_random_audio_data() + processor = AutoFeatureExtractor.from_pretrained(model_id) + input_values = processor(data, return_tensors="pt").to("cuda") - input_values = processor(data, return_tensors="pt") onnx_outputs = onnx_model(**input_values) io_outputs = io_model(**input_values) @@ -3597,8 +3614,8 @@ def test_compare_to_io_binding(self, model_arch): self.assertIsInstance(io_outputs.embeddings, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.allclose(onnx_outputs.logits, io_outputs.logits, atol=1e-4)) - self.assertTrue(torch.allclose(onnx_outputs.embeddings, io_outputs.embeddings, atol=1e-4)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) + torch.testing.assert_close(onnx_outputs.embeddings, io_outputs.embeddings, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -3646,6 +3663,7 @@ def test_compare_to_transformers(self, model_arch): with torch.no_grad(): transformers_outputs = transformers_model(**input_values) + for input_type in ["pt", "np"]: input_values = processor(self._generate_random_audio_data(), return_tensors=input_type) onnx_outputs = onnx_model(**input_values) @@ -3654,7 +3672,9 @@ def test_compare_to_transformers(self, model_arch): self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -3662,7 +3682,7 @@ def test_compare_to_transformers(self, model_arch): class ORTModelForSeq2SeqLMIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ "bart", - # "bigbird_pegasus", + "bigbird_pegasus", "blenderbot", "blenderbot_small", "encoder-decoder", @@ -3685,7 +3705,6 @@ class ORTModelForSeq2SeqLMIntegrationTest(ORTModelTestMixin): TASK = "text2text-generation" GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 def _get_model_ids(self, model_arch): model_ids = MODEL_NAMES[model_arch] @@ -3889,8 +3908,8 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL ) gc.collect() @@ -4098,15 +4117,17 @@ def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: st gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch: str): - if model_arch == "m2m_100": - self.skipTest("m2m_100 comparison with/without pkv fail or is not supported") model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) model_args = {"test_name": model_arch + "_True", "model_arch": model_arch, "use_cache": True} 
self._setup(model_args) + if model_arch == "m2m_100": + generation_length = 20 # model's predefined maximum length + else: + generation_length = self.GENERATION_LENGTH + model_ids = self._get_model_ids(model_arch) for model_id in model_ids: if ( @@ -4123,31 +4144,23 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str): self._get_onnx_model_dir(model_id, model_arch, model_arch + "_True"), use_cache=True ) - _ = model_with_pkv.generate(**tokens) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 + ) model_without_pkv = ORTModelForSeq2SeqLM.from_pretrained( self._get_onnx_model_dir(model_id, model_arch, model_arch + "_False"), use_cache=False ) - _ = model_without_pkv.generate(**tokens) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 + ) - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + torch.testing.assert_close( + outputs_model_with_pkv, outputs_model_without_pkv, rtol=self.RTOL, atol=self.ATOL + ) + self.assertEqual(outputs_model_with_pkv.shape[1], generation_length + 1) + self.assertEqual(outputs_model_without_pkv.shape[1], generation_length + 1) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): @@ -4196,7 +4209,7 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode outputs_model_not_merged = model_not_merged.generate(**tokens) outputs_model_merged = model_merged.generate(**tokens) - self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) + torch.testing.assert_close(outputs_model_not_merged, outputs_model_merged, rtol=self.RTOL, atol=self.ATOL) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) @@ -4226,11 +4239,17 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: continue onnx_model = ORTModelForSeq2SeqLM.from_pretrained( - self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=False, use_cache=use_cache - ).to("cuda") + self._get_onnx_model_dir(model_id, model_arch, test_name), + use_io_binding=False, + use_cache=use_cache, + provider="CUDAExecutionProvider", + ) io_model = ORTModelForSeq2SeqLM.from_pretrained( - self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=True, use_cache=use_cache - 
).to("cuda") + self._get_onnx_model_dir(model_id, model_arch, test_name), + use_io_binding=True, + use_cache=use_cache, + provider="CUDAExecutionProvider", + ) self.assertFalse(onnx_model.use_io_binding) self.assertTrue(io_model.use_io_binding) @@ -4240,8 +4259,9 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 if model_arch == "encoder-decoder": decoder_start_token_id = tokenizer.cls_token_id - - decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} + decoder_inputs = { + "decoder_input_ids": torch.ones((2, 1), dtype=torch.long).to("cuda") * decoder_start_token_id + } onnx_outputs = onnx_model(**tokens, **decoder_inputs) io_outputs = io_model(**tokens, **decoder_inputs) @@ -4250,7 +4270,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -4265,6 +4285,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: ) ) @require_torch_gpu + @pytest.mark.cuda_ep_test def test_compare_generation_to_io_binding( self, test_name: str, @@ -4295,25 +4316,34 @@ def test_compare_generation_to_io_binding( continue onnx_model = ORTModelForSeq2SeqLM.from_pretrained( - self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=False, use_cache=use_cache - ).to("cuda") + self._get_onnx_model_dir(model_id, model_arch, test_name), + use_io_binding=False, + use_cache=use_cache, + provider="CUDAExecutionProvider", + ) io_model = ORTModelForSeq2SeqLM.from_pretrained( - self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=True, use_cache=use_cache - ).to("cuda") + self._get_onnx_model_dir(model_id, model_arch, test_name), + use_io_binding=True, + use_cache=use_cache, + provider="CUDAExecutionProvider", + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) tokenizer = get_preprocessor(model_id) tokens = tokenizer("This is a sample output", return_tensors="pt").to("cuda") + onnx_outputs = onnx_model.generate(**tokens, num_beams=num_beams) io_outputs = io_model.generate(**tokens, num_beams=num_beams) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) + torch.testing.assert_close(onnx_outputs, io_outputs, atol=self.ATOL, rtol=self.RTOL) gc.collect() class ORTModelForSpeechSeq2SeqIntegrationTest(ORTModelTestMixin): - # TODO: speech_to_text should be tested SUPPORTED_ARCHITECTURES = ["whisper", "speech_to_text"] FULL_GRID = { @@ -4326,7 +4356,6 @@ class ORTModelForSpeechSeq2SeqIntegrationTest(ORTModelTestMixin): TASK = "automatic-speech-recognition" GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 def _generate_random_audio_data(self): np.random.seed(10) @@ -4464,28 +4493,33 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) - new_tokens = 20 # 
because tiny random speech to text model has a max_position_embeddings of 20 + if model_arch == "speech_to_text": + generation_length = 20 + else: + generation_length = self.GENERATION_LENGTH with torch.no_grad(): transformers_outputs = transformers_model.generate( **features["pt"], - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, + max_new_tokens=generation_length, + min_new_tokens=generation_length, do_sample=False, num_beams=1, ) onnx_outputs = onnx_model.generate( **features["pt"], - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, + max_new_tokens=generation_length, + min_new_tokens=generation_length, do_sample=False, num_beams=1, ) - self.assertTrue(torch.equal(onnx_outputs, transformers_outputs)) + torch.testing.assert_close(torch.Tensor(onnx_outputs), transformers_outputs, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -4576,7 +4610,6 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, use_cache: b self.assertTrue(isinstance(outputs["text"], str)) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch: str): model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) @@ -4593,42 +4626,34 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str): self.onnx_model_dirs[model_arch + "_True"], use_cache=True ) - generation_length = self.GENERATION_LENGTH - self.GENERATION_LENGTH = 10 - _ = model_with_pkv.generate(**features) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) + if model_arch == "speech_to_text": + generation_length = 20 + else: + generation_length = self.GENERATION_LENGTH + + outputs_model_with_pkv = model_with_pkv.generate( + **features, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 + ) model_without_pkv = ORTModelForSpeechSeq2Seq.from_pretrained( self.onnx_model_dirs[model_arch + "_False"], use_cache=False ) - _ = model_without_pkv.generate(**features) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + outputs_model_without_pkv = model_without_pkv.generate( + **features, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 + ) + + torch.testing.assert_close(outputs_model_with_pkv, outputs_model_without_pkv, rtol=self.RTOL, atol=self.ATOL) if model_arch == "whisper" and is_transformers_version(">=", "4.48"): - gen_length = self.GENERATION_LENGTH + out_length = generation_length elif model_arch == "whisper" and is_transformers_version(">=", "4.43"): - gen_length = self.GENERATION_LENGTH + 2 + out_length = generation_length + 2 else: - gen_length = self.GENERATION_LENGTH + 1 - - self.assertEqual(outputs_model_with_pkv.shape[1], gen_length) - self.assertEqual(outputs_model_without_pkv.shape[1], gen_length) + out_length = generation_length + 1 - self.GENERATION_LENGTH = generation_length - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv 
latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + self.assertEqual(outputs_model_with_pkv.shape[1], out_length) + self.assertEqual(outputs_model_without_pkv.shape[1], out_length) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): @@ -4667,18 +4692,16 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode self.assertEqual(model_merged.decoder_with_past, None) self.assertEqual(model_merged.use_merged, True) - generation_length = self.GENERATION_LENGTH - self.GENERATION_LENGTH = 10 + generation_length = 10 outputs_model_not_merged = model_not_merged.generate( - **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + **features, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 ) outputs_model_merged = model_merged.generate( - **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + **features, min_new_tokens=generation_length, max_new_tokens=generation_length, num_beams=1 ) - self.GENERATION_LENGTH = generation_length - self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) + torch.testing.assert_close(outputs_model_not_merged, outputs_model_merged, rtol=self.RTOL, atol=self.ATOL) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) @@ -4686,9 +4709,6 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode @require_torch_gpu @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - model_args = { "test_name": test_name, "model_arch": model_arch, @@ -4699,31 +4719,38 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForSpeechSeq2Seq.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=False - ).to("cuda") - io_model = ORTModelForSpeechSeq2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True).to( - "cuda" + self.onnx_model_dirs[test_name], + use_io_binding=False, + provider="CUDAExecutionProvider", + provider_options={ + "cudnn_conv_algo_search": "DEFAULT", + }, + ) + io_model = ORTModelForSpeechSeq2Seq.from_pretrained( + self.onnx_model_dirs[test_name], + use_io_binding=True, + provider="CUDAExecutionProvider", + provider_options={ + "cudnn_conv_algo_search": "DEFAULT", + }, ) self.assertFalse(onnx_model.use_io_binding) self.assertTrue(io_model.use_io_binding) processor = get_preprocessor(model_id) - data = self._generate_random_audio_data() - features = processor.feature_extractor([data] * 2, return_tensors="pt").to("cuda") + inputs = processor([data] * 2, return_tensors="pt").to("cuda") + inputs["decoder_input_ids"] = torch.ones((2, 1), dtype=torch.long).to("cuda") - decoder_start_token_id = onnx_model.config.decoder_start_token_id - decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} - - onnx_outputs = onnx_model(**features, **decoder_inputs) - io_outputs = 
io_model(**features, **decoder_inputs) + onnx_outputs = onnx_model(**inputs) + io_outputs = io_model(**inputs) self.assertTrue("logits" in io_outputs) self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -4733,7 +4760,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: "model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True], - "num_beams": [1, 5], + "num_beams": [1, 3], } ) ) @@ -4760,22 +4787,24 @@ def test_compare_generation_to_io_binding( model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForSpeechSeq2Seq.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=False - ).to("cuda") - io_model = ORTModelForSpeechSeq2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True).to( - "cuda" + self.onnx_model_dirs[test_name], use_io_binding=False, provider="CUDAExecutionProvider" + ) + io_model = ORTModelForSpeechSeq2Seq.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=True, provider="CUDAExecutionProvider" ) - processor = get_preprocessor(model_id) + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) + processor = get_preprocessor(model_id) data = self._generate_random_audio_data() - features = processor.feature_extractor(data, return_tensors="pt").to("cuda") + features = processor(data, return_tensors="pt").to("cuda") onnx_outputs = onnx_model.generate(**features, num_beams=num_beams) io_outputs = io_model.generate(**features, num_beams=num_beams) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) + torch.testing.assert_close(onnx_outputs, io_outputs, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -4825,7 +4854,9 @@ def test_compare_to_transformers(self, model_arch: str): self.assertIsInstance(onnx_outputs, ImageSuperResolutionOutput) self.assertTrue("reconstruction" in onnx_outputs) self.assertIsInstance(onnx_outputs.reconstruction, torch.Tensor) - self.assertTrue(torch.allclose(onnx_outputs.reconstruction, transformers_outputs.reconstruction, atol=1e-4)) + torch.testing.assert_close( + onnx_outputs.reconstruction, transformers_outputs.reconstruction, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -4924,7 +4955,9 @@ class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): TASK = "image-to-text" GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 + + ATOL = 1e-3 + RTOL = 1e-3 def _get_sample_image(self): url = "http://images.cocodataset.org/val2017/000000039769.jpg" @@ -4996,55 +5029,46 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) + image_processor, tokenizer = self._get_preprocessors(model_id) transformers_model = AutoModelForVision2Seq.from_pretrained(model_id) - feature_extractor, tokenizer = self._get_preprocessors(model_id) data = self._get_sample_image() + inputs = image_processor(data, return_tensors="pt") + inputs["decoder_input_ids"] = tokenizer("This is a sample output", return_tensors="pt").input_ids - start_token = "" - decoder_start_token_id = tokenizer.encode(start_token)[0] + with torch.no_grad(): + transformers_outputs = transformers_model(**inputs, use_cache=True) - extra_inputs = [{}, {}] + for input_type in ["pt", "np"]: + inputs = image_processor(data, 
return_tensors=input_type) + inputs["decoder_input_ids"] = tokenizer("This is a sample output", return_tensors=input_type).input_ids - for extra_inps in extra_inputs: - features = feature_extractor(data, return_tensors="pt") - decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + onnx_outputs = onnx_model(**inputs, use_cache=use_cache) - with torch.no_grad(): - transformers_outputs = transformers_model(**features, **decoder_inputs, **extra_inps, use_cache=True) - for input_type in ["pt", "np"]: - features = feature_extractor(data, return_tensors=input_type) - - if input_type == "np": - decoder_inputs = {"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} - - if "past_key_values" in extra_inps: - del extra_inps["past_key_values"] # test only with pytorch + self.assertTrue("logits" in onnx_outputs) + self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) - onnx_outputs = onnx_model(**features, **decoder_inputs, **extra_inps) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) - self.assertTrue("logits" in onnx_outputs) - self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) - self.assertTrue( - torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3) + if use_cache: + self.assertEqual( + len(onnx_outputs["past_key_values"]), + len(transformers_outputs["past_key_values"]), ) - - if use_cache: + for i in range(len(onnx_outputs["past_key_values"])): self.assertEqual( - len(onnx_outputs["past_key_values"]), len(transformers_outputs["past_key_values"]) + len(onnx_outputs["past_key_values"][i]), + len(transformers_outputs["past_key_values"][i]), ) - self.assertEqual( - len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) - ) - for i in range(len(onnx_outputs["past_key_values"])): - for ort_pkv, trfs_pkv in zip( - onnx_outputs["past_key_values"][i], transformers_outputs["past_key_values"][i] - ): - ort_pkv = torch.Tensor(ort_pkv) - self.assertTrue( - torch.allclose(ort_pkv, trfs_pkv, atol=1e-3), - f" Maxdiff: {torch.abs(ort_pkv - trfs_pkv).max()}", - ) + for j in range(len(onnx_outputs["past_key_values"][i])): + torch.testing.assert_close( + torch.Tensor(onnx_outputs["past_key_values"][i][j]), + transformers_outputs["past_key_values"][i][j], + atol=self.ATOL, + rtol=self.RTOL, + ) gc.collect() @@ -5145,7 +5169,6 @@ def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, use_cache: b self.assertTrue(isinstance(outputs[0]["generated_text"], str)) @parameterized.expand(SUPPORTED_ARCHITECTURES[:1]) - @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch: str): model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) @@ -5161,41 +5184,29 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str): model_with_pkv = ORTModelForVision2Seq.from_pretrained( self.onnx_model_dirs[model_arch + "_True"], use_cache=True ) - _ = model_with_pkv.generate(**features) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) + + outputs_model_with_pkv = model_with_pkv.generate( + 
**features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) model_without_pkv = ORTModelForVision2Seq.from_pretrained( self.onnx_model_dirs[model_arch + "_False"], use_cache=False ) - _ = model_without_pkv.generate(**features) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + outputs_model_without_pkv = model_without_pkv.generate( + **features, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) + + torch.testing.assert_close(outputs_model_with_pkv, outputs_model_without_pkv, rtol=self.RTOL, atol=self.ATOL) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) - @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) ) @require_torch_gpu @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - model_args = { "test_name": test_name, "model_arch": model_arch, @@ -5205,21 +5216,25 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForVision2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=False).to( - "cuda" + onnx_model = ORTModelForVision2Seq.from_pretrained( + self.onnx_model_dirs[test_name], + use_io_binding=False, + provider="CUDAExecutionProvider", + provider_options={"cudnn_conv_algo_search": "DEFAULT"}, ) - io_model = ORTModelForVision2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True).to( - "cuda" + io_model = ORTModelForVision2Seq.from_pretrained( + self.onnx_model_dirs[test_name], + use_io_binding=True, + provider="CUDAExecutionProvider", + provider_options={"cudnn_conv_algo_search": "DEFAULT"}, ) self.assertFalse(onnx_model.use_io_binding) self.assertTrue(io_model.use_io_binding) - feature_extractor, tokenizer = self._get_preprocessors(model_id) - data = self._get_sample_image() + feature_extractor, tokenizer = self._get_preprocessors(model_id) pixel_values = feature_extractor([data] * 2, return_tensors="pt").pixel_values.to("cuda") - decoder_start_token_id = onnx_model.config.decoder.bos_token_id decoder_input_ids = torch.full((2, 1), decoder_start_token_id, dtype=torch.long).to("cuda") @@ -5230,7 +5245,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self.assertIsInstance(io_outputs.logits, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -5249,9 +5264,6 @@ def test_compare_to_io_binding(self, 
test_name: str, model_arch: str, use_cache: def test_compare_generation_to_io_binding( self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool, num_beams: int ): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - model_args = { "test_name": test_name, "model_arch": model_arch, @@ -5261,23 +5273,25 @@ def test_compare_generation_to_io_binding( self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForVision2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=False).to( - "cuda" + onnx_model = ORTModelForVision2Seq.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=False, provider="CUDAExecutionProvider" ) - io_model = ORTModelForVision2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True).to( - "cuda" + io_model = ORTModelForVision2Seq.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=True, provider="CUDAExecutionProvider" ) - feature_extractor, tokenizer = self._get_preprocessors(model_id) + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) data = self._get_sample_image() + feature_extractor, _ = self._get_preprocessors(model_id) features = feature_extractor(data, return_tensors="pt").to("cuda") onnx_outputs = onnx_model.generate(**features, num_beams=num_beams) io_outputs = io_model.generate(**features, num_beams=num_beams) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) + torch.testing.assert_close(onnx_outputs, io_outputs, atol=self.ATOL, rtol=self.RTOL) gc.collect() @@ -5353,13 +5367,23 @@ def test_default_pipeline_and_model_device(self, *args, **kwargs): @require_torch_gpu @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, *args, **kwargs): - model_arch, model_id = args + _, model_id = args + set_seed(SEED) - onnx_model = ORTModelForCustomTasks.from_pretrained(model_id, use_io_binding=False).to("cuda") + onnx_model = ORTModelForCustomTasks.from_pretrained( + model_id, use_io_binding=False, provider="CUDAExecutionProvider" + ) set_seed(SEED) - io_model = ORTModelForCustomTasks.from_pretrained(model_id, use_io_binding=True).to("cuda") + io_model = ORTModelForCustomTasks.from_pretrained( + model_id, use_io_binding=True, provider="CUDAExecutionProvider" + ) + + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) + tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt") + tokens = tokenizer("This is a sample output", return_tensors="pt").to("cuda") + onnx_outputs = onnx_model(**tokens) io_outputs = io_model(**tokens) @@ -5367,7 +5391,9 @@ def test_compare_to_io_binding(self, *args, **kwargs): self.assertIsInstance(io_outputs.pooler_output, torch.Tensor) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.pooler_output, io_outputs.pooler_output)) + torch.testing.assert_close( + onnx_outputs.pooler_output, io_outputs.pooler_output, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @@ -5385,7 +5411,6 @@ class ORTModelForPix2StructTest(ORTModelTestMixin): TASK = "image-to-text" # is it fine as well with visual-question-answering? 
GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 IMAGE = Image.open( requests.get( @@ -5441,9 +5466,6 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") - if use_cache is False: - self.skipTest("skip") - model_args = { "test_name": test_name, "model_arch": model_arch, @@ -5459,111 +5481,82 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach if use_merged is False: model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_NAME) self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, False) + self.assertFalse(onnx_model.use_merged) else: model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_MERGED_NAME) self.assertTrue(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, True) + self.assertTrue(onnx_model.use_merged) self.assertIsInstance(onnx_model.decoder, ORTDecoderForSeq2Seq) - if onnx_model.use_cache is True and onnx_model.use_merged is False: + if use_cache is True and use_merged is False: self.assertIsInstance(onnx_model.decoder_with_past, ORTDecoderForSeq2Seq) - if onnx_model.use_cache is True and onnx_model.use_merged is True: + if use_cache is True and use_merged is True: self.assertTrue(onnx_model.decoder_with_past is None) - self.assertIsInstance(onnx_model.config, PretrainedConfig) - set_seed(SEED) + transformers_model = Pix2StructForConditionalGeneration.from_pretrained(model_id) + + preprocessor = get_preprocessor(model_id) questions = [ "Who am I?", "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud and this is long long very long and super long my dear", ] - - transformers_model = Pix2StructForConditionalGeneration.from_pretrained(model_id) - preprocessor = get_preprocessor(model_id) - inputs = preprocessor(images=[self.IMAGE, self.IMAGE], text=questions, padding=True, return_tensors="pt") - del inputs["decoder_attention_mask"] - del inputs["decoder_input_ids"] - - decoder_start_token_id = transformers_model.config.decoder_start_token_id - decoder_inputs = { - "decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id, - "decoder_attention_mask": torch.ones((2, 1), dtype=torch.int64), - } with torch.no_grad(): - transformers_outputs = transformers_model(**inputs, **decoder_inputs) + transformers_outputs = transformers_model(**inputs) for input_type in ["pt", "np"]: inputs = preprocessor( images=[self.IMAGE, self.IMAGE], text=questions, padding=True, return_tensors=input_type ) - del inputs["decoder_attention_mask"] - del inputs["decoder_input_ids"] - if input_type == "np": - decoder_inputs = { - "decoder_input_ids": np.ones((2, 1), dtype=np.int64) * decoder_start_token_id, - "decoder_attention_mask": np.ones((2, 1), dtype=np.int64), - } - - onnx_outputs = onnx_model(**inputs, **decoder_inputs) + onnx_outputs = onnx_model(**inputs) self.assertTrue("logits" in onnx_outputs) self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + torch.testing.assert_close( + torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=self.ATOL, rtol=self.RTOL + ) gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the 
slow tests def test_compare_with_and_without_past_key_values(self, model_arch: str): - if model_arch == "m2m_100": - return # TODO: this test is failing for m2m_100 model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) model_args = {"test_name": model_arch + "_True", "model_arch": model_arch, "use_cache": True} self._setup(model_args) + model_with_pkv = ORTModelForPix2Struct.from_pretrained( + self.onnx_model_dirs[model_arch + "_True"], use_cache=True + ) + model_without_pkv = ORTModelForPix2Struct.from_pretrained( + self.onnx_model_dirs[model_arch + "_False"], use_cache=False + ) + model_id = MODEL_NAMES[model_arch] preprocessor = get_preprocessor(model_id) - question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud" inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - del inputs["decoder_attention_mask"] - del inputs["decoder_input_ids"] - model_with_pkv = ORTModelForPix2Struct.from_pretrained( - self.onnx_model_dirs[model_arch + "_True"], use_cache=True + outputs_model_with_pkv = model_with_pkv.generate( + **inputs, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) - - _ = model_with_pkv.generate(**inputs) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **inputs, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) - - model_without_pkv = ORTModelForPix2Struct.from_pretrained( - self.onnx_model_dirs[model_arch + "_False"], use_cache=False + outputs_model_without_pkv = model_without_pkv.generate( + **inputs, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 ) - _ = model_without_pkv.generate(**inputs) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **inputs, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) + self.assertEqual( + (outputs_model_with_pkv.shape[1], outputs_model_without_pkv.shape[1]), + ( + inputs["decoder_input_ids"].shape[1] + self.GENERATION_LENGTH + 1, + inputs["decoder_input_ids"].shape[1] + self.GENERATION_LENGTH + 1, + ), + ) - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + torch.testing.assert_close(outputs_model_with_pkv, outputs_model_without_pkv, rtol=self.RTOL, atol=self.ATOL) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): @@ -5582,41 +5575,37 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode } self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - preprocessor = get_preprocessor(model_id) - - question = "What does the label 15 represent? 
(1) lava (2) core (3) tunnel (4) ash cloud" - inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") - del inputs["decoder_attention_mask"] - del inputs["decoder_input_ids"] - - model_not_merged_dir = self.onnx_model_dirs[test_name + "_False"] - model_merged_dir = self.onnx_model_dirs[test_name + "_True"] - - model_not_merged = ORTModelForPix2Struct.from_pretrained(model_not_merged_dir) - not_merged_onnx_path = Path(model_not_merged_dir, ONNX_DECODER_NAME) + model_not_merged = ORTModelForPix2Struct.from_pretrained(self.onnx_model_dirs[test_name + "_False"]) + not_merged_onnx_path = Path(self.onnx_model_dirs[test_name + "_False"], ONNX_DECODER_NAME) self.assertFalse(has_onnx_input(not_merged_onnx_path, "use_cache_branch")) self.assertEqual(model_not_merged.use_merged, False) - model_merged = ORTModelForPix2Struct.from_pretrained(model_merged_dir) - merged_onnx_path = Path(model_merged_dir, ONNX_DECODER_MERGED_NAME) + model_merged = ORTModelForPix2Struct.from_pretrained(self.onnx_model_dirs[test_name + "_True"]) + merged_onnx_path = Path(self.onnx_model_dirs[test_name + "_True"], ONNX_DECODER_MERGED_NAME) self.assertTrue(has_onnx_input(merged_onnx_path, "use_cache_branch")) self.assertEqual(model_merged.decoder_with_past, None) self.assertEqual(model_merged.use_merged, True) - outputs_model_not_merged = model_not_merged.generate(**inputs) - outputs_model_merged = model_merged.generate(**inputs) + model_id = MODEL_NAMES[model_arch] + preprocessor = get_preprocessor(model_id) + question = "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud" + inputs = preprocessor(images=self.IMAGE, text=question, return_tensors="pt") + + outputs_model_not_merged = model_not_merged.generate( + **inputs, max_new_tokens=self.GENERATION_LENGTH, min_new_tokens=self.GENERATION_LENGTH + ) + outputs_model_merged = model_merged.generate( + **inputs, max_new_tokens=self.GENERATION_LENGTH, min_new_tokens=self.GENERATION_LENGTH + ) - self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) + torch.testing.assert_close(outputs_model_not_merged, outputs_model_merged, rtol=self.RTOL, atol=self.ATOL) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) ) + @require_torch_gpu @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - model_args = { "test_name": test_name, "model_arch": model_arch, @@ -5626,36 +5615,32 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForPix2Struct.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=False) - io_model = ORTModelForPix2Struct.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True) + onnx_model = ORTModelForPix2Struct.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=False, provider="CUDAExecutionProvider" + ) + io_model = ORTModelForPix2Struct.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=True, provider="CUDAExecutionProvider" + ) self.assertFalse(onnx_model.use_io_binding) self.assertTrue(io_model.use_io_binding) preprocessor = get_preprocessor(model_id) + question = ["What does the label 15 represent? 
(1) lava (2) core (3) tunnel (4) ash cloud", "Who are you?"] + inputs = preprocessor(images=[self.IMAGE, self.IMAGE], text=question, padding=True, return_tensors="pt").to( + "cuda" + ) - question = [ - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud and this is even longer and longer and longer and longer and hey", - "Who are you?", - ] - inputs = preprocessor(images=[self.IMAGE, self.IMAGE], text=question, padding=True, return_tensors="pt") - del inputs["decoder_attention_mask"] - del inputs["decoder_input_ids"] - decoder_start_token_id = onnx_model.config.decoder_start_token_id - decoder_inputs = { - "decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id, - "decoder_attention_mask": torch.ones((2, 1), dtype=torch.int64), - } - - onnx_outputs = onnx_model(**inputs, **decoder_inputs) - io_outputs = io_model(**inputs, **decoder_inputs) + onnx_outputs = onnx_model(**inputs) + io_outputs = io_model(**inputs) self.assertTrue("logits" in io_outputs) - self.assertIsInstance(io_outputs.logits, torch.Tensor) + self.assertTrue("encoder_last_hidden_state" in io_outputs) - self.assertTrue(torch.allclose(onnx_outputs.logits, io_outputs.logits, atol=1e-4)) + self.assertIsInstance(io_outputs.logits, torch.Tensor) + self.assertIsInstance(io_outputs.encoder_last_hidden_state, torch.Tensor) - gc.collect() + torch.testing.assert_close(onnx_outputs.logits, io_outputs.logits, atol=self.ATOL, rtol=self.RTOL) @parameterized.expand( grid_parameters( @@ -5667,17 +5652,11 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: } ) ) + @require_torch_gpu + @pytest.mark.cuda_ep_test def test_compare_generation_to_io_binding( - self, - test_name: str, - model_arch: str, - use_cache: bool, - use_merged: bool, - num_beams: int, + self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool, num_beams: int ): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - model_args = { "test_name": test_name, "model_arch": model_arch, @@ -5687,22 +5666,27 @@ def test_compare_generation_to_io_binding( self._setup(model_args) model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForPix2Struct.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=False) - io_model = ORTModelForPix2Struct.from_pretrained(self.onnx_model_dirs[test_name], use_io_binding=True) + onnx_model = ORTModelForPix2Struct.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=False, provider="CUDAExecutionProvider" + ) + io_model = ORTModelForPix2Struct.from_pretrained( + self.onnx_model_dirs[test_name], use_io_binding=True, provider="CUDAExecutionProvider" + ) - preprocessor = get_preprocessor(model_id) + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) + preprocessor = get_preprocessor(model_id) question = ["What does the label 15 represent? 
(1) lava (2) core (3) tunnel (4) ash cloud", "Who are you?"] - inputs = preprocessor(images=[self.IMAGE, self.IMAGE], text=question, padding=True, return_tensors="pt") - del inputs["decoder_attention_mask"] - del inputs["decoder_input_ids"] + inputs = preprocessor(images=[self.IMAGE, self.IMAGE], text=question, padding=True, return_tensors="pt").to( + "cuda" + ) + onnx_outputs = onnx_model.generate(**inputs, num_beams=num_beams) io_outputs = io_model.generate(**inputs, num_beams=num_beams) # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) - - gc.collect() + torch.testing.assert_close(onnx_outputs, io_outputs, atol=self.ATOL, rtol=self.RTOL) class TestBothExportersORTModel(unittest.TestCase): diff --git a/tests/onnxruntime/test_optimization.py b/tests/onnxruntime/test_optimization.py index 82109fcd11..e699eed9fa 100644 --- a/tests/onnxruntime/test_optimization.py +++ b/tests/onnxruntime/test_optimization.py @@ -25,10 +25,10 @@ import pytest import torch from parameterized import parameterized +from testing_utils import MODEL_NAMES from transformers import AutoTokenizer from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import require_torch_gpu -from utils_onnxruntime_tests import MODEL_NAMES from optimum.exporters import TasksManager from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS @@ -92,7 +92,7 @@ class ORTOptimizerTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_MODEL_ID = ( (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-bart"), (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-bert"), - # (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-big_bird"), + (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-big_bird"), (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-distilbert"), (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-electra"), (ORTModelForCausalLM, "hf-internal-testing/tiny-random-gpt2"), @@ -251,7 +251,7 @@ class ORTOptimizerForSeq2SeqLMIntegrationTest(ORTOptimizerTestMixin): "bart", "blenderbot", "blenderbot_small", - # "longt5", + "longt5", "m2m_100", "marian", "mbart", @@ -346,10 +346,6 @@ def test_optimization_levels_cpu(self, test_name: str, model_arch: str, use_cach @pytest.mark.cuda_ep_test def test_optimization_levels_gpu(self, test_name: str, model_arch: str, use_cache: bool, optimization_level: str): for use_io_binding in [False, True]: - # TODO: investigate why marian with IO Binding fails - if model_arch == "marian" and use_io_binding is True: - continue - self._test_optimization_levels( test_name=test_name, model_arch=model_arch, diff --git a/tests/onnxruntime/test_timm.py b/tests/onnxruntime/test_timm.py new file mode 100644 index 0000000000..c51bcc01a0 --- /dev/null +++ b/tests/onnxruntime/test_timm.py @@ -0,0 +1,88 @@ +import gc + +import onnxruntime +import pytest +import requests +import timm +import torch +from parameterized import parameterized +from PIL import Image +from testing_utils import ORTModelTestMixin +from transformers import PretrainedConfig +from transformers.testing_utils import slow + +from optimum.onnxruntime import ORTModelForImageClassification + + +class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin): + TIMM_SUPPORTED_MODELS = [ + "timm/inception_v3.tf_adv_in1k", + "timm/tf_efficientnet_b0.in1k", + "timm/cspdarknet53.ra_in1k", + "timm/cspresnet50.ra_in1k", + "timm/cspresnext50.ra_in1k", + "timm/densenet121.ra_in1k", 
+ "timm/dla102.in1k", + "timm/dpn107.mx_in1k", + "timm/ecaresnet101d.miil_in1k", + "timm/efficientnet_b1_pruned.in1k", + "timm/inception_resnet_v2.tf_ens_adv_in1k", + "timm/fbnetc_100.rmsp_in1k", + "timm/xception41.tf_in1k", + "timm/senet154.gluon_in1k", + "timm/seresnext26d_32x4d.bt_in1k", + "timm/hrnet_w18.ms_aug_in1k", + "timm/inception_v3.gluon_in1k", + "timm/inception_v4.tf_in1k", + "timm/mixnet_s.ft_in1k", + "timm/mnasnet_100.rmsp_in1k", + "timm/mobilenetv2_100.ra_in1k", + "timm/mobilenetv3_small_050.lamb_in1k", + "timm/nasnetalarge.tf_in1k", + "timm/tf_efficientnet_b0.ns_jft_in1k", + "timm/pnasnet5large.tf_in1k", + "timm/regnetx_002.pycls_in1k", + "timm/regnety_002.pycls_in1k", + "timm/res2net101_26w_4s.in1k", + "timm/res2next50.in1k", + "timm/resnest101e.in1k", + "timm/spnasnet_100.rmsp_in1k", + "timm/resnet18.fb_swsl_ig1b_ft_in1k", + "timm/tresnet_l.miil_in1k", + ] + + @parameterized.expand(TIMM_SUPPORTED_MODELS) + @pytest.mark.run_slow + @slow + def test_compare_to_timm(self, model_id): + onnx_model = ORTModelForImageClassification.from_pretrained(model_id, export=True) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) + self.assertIsInstance(onnx_model.config, PretrainedConfig) + + timm_model = timm.create_model(model_id, pretrained=True) + timm_model = timm_model.eval() + + # get model specific transforms (normalization, resize) + data_config = timm.data.resolve_model_data_config(timm_model) + transforms = timm.data.create_transform(**data_config, is_training=False) + + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + inputs = transforms(image).unsqueeze(0) + + with torch.no_grad(): + timm_outputs = timm_model(inputs) + + for input_type in ["pt", "np"]: + if input_type == "np": + inputs = inputs.cpu().detach().numpy() + + onnx_outputs = onnx_model(inputs) + + self.assertIn("logits", onnx_outputs) + self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + + # compare tensor outputs + torch.testing.assert_close(torch.Tensor(onnx_outputs.logits), timm_outputs, atol=self.ATOL, rtol=self.RTOL) + + gc.collect() diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/testing_utils.py similarity index 79% rename from tests/onnxruntime/utils_onnxruntime_tests.py rename to tests/onnxruntime/testing_utils.py index 02ced3be3a..5e12ef78d2 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/testing_utils.py @@ -25,14 +25,16 @@ from optimum.exporters import TasksManager +SEED = 42 + MODEL_NAMES = { "albert": "hf-internal-testing/tiny-random-AlbertModel", "audio_spectrogram_transformer": "Ericwang/tiny-random-ast", "beit": "hf-internal-testing/tiny-random-BeitForImageClassification", "bert": "hf-internal-testing/tiny-random-BertModel", "bart": "hf-internal-testing/tiny-random-bart", - # "big_bird": "hf-internal-testing/tiny-random-BigBirdModel", - # "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", + "big_bird": "hf-internal-testing/tiny-random-BigBirdModel", + "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot_small": "hf-internal-testing/tiny-random-BlenderbotModel", "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", @@ -47,43 +49,6 @@ "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", "deberta": 
"hf-internal-testing/tiny-random-DebertaModel", "deberta_v2": "hf-internal-testing/tiny-random-DebertaV2Model", - "default-timm-config": { - "timm/inception_v3.tf_adv_in1k": ["image-classification"], - "timm/tf_efficientnet_b0.in1k": ["image-classification"], - "timm/resnetv2_50x1_bit.goog_distilled_in1k": ["image-classification"], - "timm/cspdarknet53.ra_in1k": ["image-classification"], - "timm/cspresnet50.ra_in1k": ["image-classification"], - "timm/cspresnext50.ra_in1k": ["image-classification"], - "timm/densenet121.ra_in1k": ["image-classification"], - "timm/dla102.in1k": ["image-classification"], - "timm/dpn107.mx_in1k": ["image-classification"], - "timm/ecaresnet101d.miil_in1k": ["image-classification"], - "timm/efficientnet_b1_pruned.in1k": ["image-classification"], - "timm/inception_resnet_v2.tf_ens_adv_in1k": ["image-classification"], - "timm/fbnetc_100.rmsp_in1k": ["image-classification"], - "timm/xception41.tf_in1k": ["image-classification"], - "timm/senet154.gluon_in1k": ["image-classification"], - "timm/seresnext26d_32x4d.bt_in1k": ["image-classification"], - "timm/hrnet_w18.ms_aug_in1k": ["image-classification"], - "timm/inception_v3.gluon_in1k": ["image-classification"], - "timm/inception_v4.tf_in1k": ["image-classification"], - "timm/mixnet_s.ft_in1k": ["image-classification"], - "timm/mnasnet_100.rmsp_in1k": ["image-classification"], - "timm/mobilenetv2_100.ra_in1k": ["image-classification"], - "timm/mobilenetv3_small_050.lamb_in1k": ["image-classification"], - "timm/nasnetalarge.tf_in1k": ["image-classification"], - "timm/tf_efficientnet_b0.ns_jft_in1k": ["image-classification"], - "timm/pnasnet5large.tf_in1k": ["image-classification"], - "timm/regnetx_002.pycls_in1k": ["image-classification"], - "timm/regnety_002.pycls_in1k": ["image-classification"], - "timm/res2net101_26w_4s.in1k": ["image-classification"], - "timm/res2next50.in1k": ["image-classification"], - "timm/resnest101e.in1k": ["image-classification"], - "timm/spnasnet_100.rmsp_in1k": ["image-classification"], - "timm/resnet18.fb_swsl_ig1b_ft_in1k": ["image-classification"], - "timm/wide_resnet101_2.tv_in1k": ["image-classification"], - "timm/tresnet_l.miil_in1k": ["image-classification"], - }, "deit": "hf-internal-testing/tiny-random-DeiTModel", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "detr": "hf-internal-testing/tiny-random-detr", @@ -92,9 +57,7 @@ "dpt": "hf-internal-testing/tiny-random-DPTModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", "encoder-decoder": { - "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ - "text2text-generation", - ], + "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": ["text2text-generation"], "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], }, "falcon": "fxmarty/really-tiny-falcon-testing", @@ -168,13 +131,11 @@ "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "wavlm": "hf-internal-testing/tiny-random-WavlmModel", "xlm": "hf-internal-testing/tiny-random-XLMModel", - "xlm_qa": "hf-internal-testing/tiny-random-XLMForQuestionAnsweringSimple", # issue with default hf-internal-testing in transformers QA pipeline post-processing + "xlm_qa": "hf-internal-testing/tiny-random-XLMForQuestionAnsweringSimple", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", "yolos": "hf-internal-testing/tiny-random-YolosModel", } -SEED = 42 - class ORTModelTestMixin(unittest.TestCase): TENSOR_ALIAS_TO_TYPE = { @@ -182,6 +143,9 @@ class 
ORTModelTestMixin(unittest.TestCase): "np": np.ndarray, } + ATOL = 1e-4 + RTOL = 1e-4 + TASK = None ORTMODEL_CLASS = None