@@ -290,6 +290,7 @@ def test_load_seq2seq_model_from_empty_cache(self):
         _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True)
 
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_load_model_cuda_provider(self):
         model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="CUDAExecutionProvider")
         self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"])
@@ -321,6 +322,7 @@ def test_load_seq2seq_model_without_past_from_hub(self):
         self.assertIsInstance(model.config, PretrainedConfig)
 
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_load_seq2seq_model_cuda_provider(self):
         model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="CUDAExecutionProvider")
         self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"])
@@ -419,6 +421,7 @@ def test_missing_execution_provider(self):
         """
 
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_model_on_gpu(self):
         model = ORTModel.from_pretrained(self.ONNX_MODEL_ID)
         gpu = torch.device("cuda")
@@ -428,6 +431,7 @@ def test_model_on_gpu(self):
 
     # test string device input for to()
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_model_on_gpu_str(self):
         model = ORTModel.from_pretrained(self.ONNX_MODEL_ID)
         gpu = torch.device("cuda")
@@ -449,6 +453,7 @@ def test_passing_session_options_seq2seq(self):
         self.assertEqual(model.decoder.session.get_session_options().intra_op_num_threads, 3)
 
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_passing_provider_options(self):
         model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="CUDAExecutionProvider")
         self.assertEqual(model.model.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "1")
@@ -490,6 +495,7 @@ def test_model_on_gpu_id(self):
         self.assertEqual(model.model.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1")
 
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_passing_provider_options_seq2seq(self):
         model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="CUDAExecutionProvider")
         self.assertEqual(
@@ -590,6 +596,7 @@ def test_seq2seq_model_on_cpu_str(self):
         self.assertListEqual(model.providers, ["CPUExecutionProvider"])
 
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_seq2seq_model_on_gpu(self):
         model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True)
         gpu = torch.device("cuda")
@@ -631,6 +638,7 @@ def test_seq2seq_model_on_gpu_id(self):
 
     # test string device input for to()
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_seq2seq_model_on_gpu_str(self):
         model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True)
         model.to("cuda")
@@ -988,6 +996,7 @@ def test_pipeline_model_is_none(self):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]:
             self.skipTest("testing a single arch for TensorrtExecutionProvider")
@@ -1012,6 +1021,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1139,6 +1149,7 @@ def test_pipeline_model_is_none(self):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1160,6 +1171,7 @@ def test_pipeline_on_gpu(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1298,6 +1310,7 @@ def test_pipeline_model_is_none(self):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]:
             self.skipTest("testing a single arch for TensorrtExecutionProvider")
@@ -1340,6 +1353,7 @@ def test_pipeline_zero_shot_classification(self):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1463,6 +1477,7 @@ def test_pipeline_model_is_none(self):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]:
             self.skipTest("testing a single arch for TensorrtExecutionProvider")
@@ -1487,6 +1502,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1583,6 +1599,7 @@ def test_pipeline_model_is_none(self):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]:
             self.skipTest("testing a single arch for TensorrtExecutionProvider")
@@ -1605,6 +1622,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1699,6 +1717,7 @@ def test_compare_to_transformers(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -1849,6 +1868,7 @@ def test_pipeline_model_is_none(self):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -1871,6 +1891,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool):
     # TRT EP compile time can be long, so we don't test all archs
     @parameterized.expand(grid_parameters({"model_arch": ["gpt2"], "use_cache": [True, False]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -1953,6 +1974,7 @@ def test_compare_with_and_without_past_key_values(self, model_arch):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -1976,6 +1998,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_generation_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2094,6 +2117,7 @@ def test_pipeline_model_is_none(self):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]:
             self.skipTest("testing a single arch for TensorrtExecutionProvider")
@@ -2120,6 +2144,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -2227,6 +2252,7 @@ def test_pipeline_model_is_none(self):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]:
             self.skipTest("testing a single arch for TensorrtExecutionProvider")
@@ -2253,6 +2279,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, model_arch):
         model_args = {"test_name": model_arch, "model_arch": model_arch}
         self._setup(model_args)
@@ -2277,7 +2304,7 @@ def test_compare_to_io_binding(self, model_arch):
 
         # compare tensor outputs
         self.assertTrue(
-            torch.allclose(onnx_outputs.logits, io_outputs.logits),
+            torch.allclose(onnx_outputs.logits, io_outputs.logits, atol=1e-5),
             f"Maxdiff: {torch.abs(onnx_outputs.logits - io_outputs.logits).max()}",
         )
 
@@ -2426,6 +2453,7 @@ def test_pipeline_model_is_none(self):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2450,6 +2478,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool):
     # TRT EP compile time can be long, so we don't test all archs
     @parameterized.expand(grid_parameters({"model_arch": ["t5"], "use_cache": [True, False]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2537,6 +2566,7 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2567,6 +2597,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_generation_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2696,6 +2727,7 @@ def test_pipeline_speech_recognition(self, test_name: str, model_arch: str, use_cache: bool):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2761,6 +2793,7 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2794,6 +2827,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
 
     @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]}))
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_generation_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache}
         self._setup(model_args)
@@ -2950,6 +2984,7 @@ def test_pipeline_image_to_text(self, test_name: str, model_arch: str, use_cache: bool):
         )
     )
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool):
         model_args = {
             "test_name": test_name,
@@ -3049,6 +3084,7 @@ def test_pipeline_ort_model(self, *args, **kwargs):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_MODEL_ID.items())
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_pipeline_on_gpu(self, *args, **kwargs):
         model_arch, model_id = args
         onnx_model = ORTModelForCustomTasks.from_pretrained(model_id)
@@ -3071,6 +3107,7 @@ def test_default_pipeline_and_model_device(self, *args, **kwargs):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_MODEL_ID.items())
     @require_torch_gpu
+    @pytest.mark.gpu_test
     def test_compare_to_io_binding(self, *args, **kwargs):
         model_arch, model_id = args
         set_seed(SEED)
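Note: `gpu_test` is a custom pytest marker, so it needs to be registered for `pytest --strict-markers` runs to pass and for `-m gpu_test` selection to work without unknown-mark warnings. This diff does not show where that registration lives; a minimal sketch, assuming a `conftest.py` at the test root (hypothetical placement, not part of this diff), would be:

def pytest_configure(config):
    # Register the custom "gpu_test" marker so pytest does not flag it as
    # an unknown mark and GPU-only tests can be selected or deselected.
    config.addinivalue_line("markers", "gpu_test: test requires a CUDA-capable GPU")

With the marker registered, the GPU suite can be run in isolation with `pytest -m gpu_test tests/onnxruntime/test_modeling.py`, or excluded on CPU-only runners with `pytest -m "not gpu_test"`.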