Schedule nightly slow tests #653

Merged · 9 commits · Apr 10, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test_openvino.yml
@@ -35,7 +35,7 @@ jobs:
pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
- name: Test with Pytest
run: |
pytest tests/openvino/ --ignore test_modeling_basic
pytest tests/openvino/ --ignore test_modeling_basic --durations=0
- name: Test openvino-nightly
run: |
pip uninstall -y openvino
6 changes: 3 additions & 3 deletions .github/workflows/test_openvino_basic.yml
@@ -25,7 +25,7 @@ jobs:
# Testing lower and upper bound of supported Python versions
# This also ensures that the test fails if dependencies break for Python 3.8
python-version: ["3.8", "3.11"]
transformers: ['transformers', 'git+https://github.com/huggingface/transformers.git']
transformers: ['transformers']
optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git']

runs-on: ubuntu-20.04
@@ -42,7 +42,7 @@ jobs:
# Install openvino manually to prevent dependency conflicts when .[openvino] pins
# optimum or transformers to a specific version
# Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }}

- name: Pip freeze
@@ -51,4 +51,4 @@
- name: Test with Pytest
run: |
pytest tests/openvino/test_modeling_basic.py

RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
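A note on the new nightly command: `-m "run_slow"` selects only tests carrying the `run_slow` marker, `RUN_SLOW=1` is the environment flag the `slow` decorator from `transformers.testing_utils` checks before letting those tests run, and `--durations=0` makes pytest report the runtime of every test. Below is a minimal sketch of the marker registration this selection relies on; the file name and wording are assumptions, not part of this PR:

```python
# conftest.py (sketch): registering the custom marker keeps pytest from warning
# about an unknown "run_slow" mark when the nightly job filters with -m "run_slow".
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "run_slow: heavy tests that only run in the scheduled nightly job"
    )
```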
1 change: 1 addition & 0 deletions setup.py
@@ -52,6 +52,7 @@
"auto-gptq",
"transformers_stream_generator",
"einops",
"tiktoken",
]

QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]
108 changes: 60 additions & 48 deletions tests/openvino/test_modeling.py
@@ -20,6 +20,7 @@
from typing import Dict

import numpy as np
import pytest
import requests
import timm
import torch
@@ -53,6 +54,7 @@
set_seed,
)
from transformers.onnx.utils import get_preprocessor
from transformers.testing_utils import slow
from utils_tests import MODEL_NAMES

from optimum.intel import (
@@ -364,6 +366,8 @@ def test_compare_to_transformers(self, model_arch):
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_pipeline(self, model_arch):
model_id = MODEL_NAMES[model_arch]
model = OVModelForQuestionAnswering.from_pretrained(model_id, export=True)
@@ -379,6 +383,8 @@ def test_pipeline(self, model_arch):
del model
gc.collect()

@pytest.mark.run_slow
@slow
def test_metric(self):
model_id = "distilbert-base-cased-distilled-squad"
set_seed(SEED)
@@ -431,6 +437,8 @@ def test_compare_to_transformers(self, model_arch):
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_pipeline(self, model_arch):
model_id = MODEL_NAMES[model_arch]
model = OVModelForTokenClassification.from_pretrained(model_id, export=True)
@@ -481,6 +489,8 @@ def test_compare_to_transformers(self, model_arch):
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_pipeline(self, model_arch):
model_id = MODEL_NAMES[model_arch]
model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True)
@@ -526,9 +536,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"phi",
"internlm2",
"orion",
"falcon",
)
GENERATION_LENGTH = 100
IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion")

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@@ -553,37 +563,63 @@ def test_compare_to_transformers(self, model_arch):
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
self.assertIsInstance(ov_model.config, PretrainedConfig)
self.assertTrue(ov_model.use_cache)
self.assertEqual(
ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful
)
set_seed(SEED)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
if model_arch == "qwen":
transformers_model.to(torch.float32)
tokens = tokenizer(
"This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None
)
ov_outputs = ov_model(**tokens)
tokens = tokenizer("This is a sample output", return_tensors="pt")

ov_outputs = ov_model(**tokens)
self.assertTrue("logits" in ov_outputs)
self.assertIsInstance(ov_outputs.logits, torch.Tensor)
self.assertTrue("past_key_values" in ov_outputs)
self.assertIsInstance(ov_outputs.past_key_values, tuple)
is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL
is_stateful = ov_model.config.model_type not in not_stateful
self.assertEqual(ov_model.stateful, is_stateful)
if is_stateful:
self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)

set_seed(SEED)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
if model_arch == "qwen":
transformers_model.to(torch.float32)

with torch.no_grad():
transformers_outputs = transformers_model(**tokens)

# Compare tensor outputs
self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4))

# Qwen tokenizer does not support padding
if model_arch == "qwen":
return

if model_arch != "chatglm":
tokenizer.pad_token_id = tokenizer.eos_token_id
# Compare batched generation
tokenizer.padding_side = "left"
tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
ov_model.generation_config.eos_token_id = None
transformers_model.generation_config.eos_token_id = None
ov_model.config.eos_token_id = None
transformers_model.config.eos_token_id = None
gen_config = GenerationConfig(
max_new_tokens=30,
min_new_tokens=30,
num_beams=3,
do_sample=False,
eos_token_id=None,
)

ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
self.assertTrue(torch.allclose(ov_outputs, transformers_outputs))

del transformers_model
del ov_model
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_pipeline(self, model_arch):
model_kwargs = {}
model_id = MODEL_NAMES[model_arch]
@@ -613,35 +649,6 @@ def test_pipeline(self, model_arch):
del model
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
def test_multiple_inputs(self, model_arch):
model_id = MODEL_NAMES[model_arch]
set_seed(SEED)
if model_arch == "qwen":
self.skipTest("Qwen tokenizer does not support padding")
model_kwargs = {}
if model_arch in self.REMOTE_CODE_MODELS:
model_kwargs = {
"config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
"trust_remote_code": True,
}
model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
tokenizer.pad_token = tokenizer.eos_token
texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"]
tokens = tokenizer(texts, padding=True, return_tensors="pt")
generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2)
outputs = model.generate(**tokens, generation_config=generation_config)
self.assertIsInstance(outputs, torch.Tensor)
self.assertEqual(outputs.shape[0], 3)
# test that generation result is reproducible
outputs2 = model.generate(**tokens, generation_config=generation_config)
self.assertIsInstance(outputs2, torch.Tensor)
self.assertEqual(outputs2.shape[0], 3)
self.assertTrue(torch.allclose(outputs2, outputs))
del model
gc.collect()

def test_model_and_decoder_same_device(self):
model_id = MODEL_NAMES["gpt2"]
model = OVModelForCausalLM.from_pretrained(model_id, export=True)
@@ -667,12 +674,11 @@ def test_compare_with_and_without_past_key_values(self):
self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv))
self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH)
self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH)
if self.IS_SUPPORT_STATEFUL:
model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True)
outputs_model_stateful = model_stateful.generate(
**tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
)
self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful))
model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True)
outputs_model_stateful = model_stateful.generate(
**tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
)
self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful))

del model_with_pkv
del model_without_pkv
@@ -851,6 +857,8 @@ def test_compare_to_transformers(self, model_arch):
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_pipeline(self, model_arch):
model_id = MODEL_NAMES[model_arch]
model = OVModelForImageClassification.from_pretrained(model_id, export=True)
@@ -981,6 +989,8 @@ def test_pipeline(self, model_arch):
gc.collect()

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_generate_utils(self, model_arch):
model_id = MODEL_NAMES[model_arch]
model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True)
Expand Down Expand Up @@ -1438,6 +1448,8 @@ def test_load_vanilla_transformers_which_is_not_supported(self):
self.assertIn("only supports the tasks", str(context.exception))

@parameterized.expand(SUPPORTED_ARCHITECTURES)
@pytest.mark.run_slow
@slow
def test_generate_utils(self, model_arch: str):
model_id = MODEL_NAMES[model_arch]
model = OVModelForVision2Seq.from_pretrained(model_id, export=True)
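The recurring change in this file pairs `@pytest.mark.run_slow`, which lets the nightly workflow select tests with `-m "run_slow"`, with `@slow` from `transformers.testing_utils`, which skips the test unless `RUN_SLOW=1` is set. A rough, simplified approximation of that gating (not the actual transformers implementation):

```python
import os
import unittest

import pytest


def slow(test_case):
    # Approximation of transformers.testing_utils.slow: skip unless RUN_SLOW=1.
    return unittest.skipUnless(os.environ.get("RUN_SLOW", "0") == "1", "test is slow")(test_case)


class ExampleIntegrationTest(unittest.TestCase):
    @pytest.mark.run_slow  # selectable via: pytest -m "run_slow"
    @slow                  # skipped in the default CI jobs, runs when RUN_SLOW=1
    def test_pipeline(self):
        self.assertTrue(True)
```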
1 change: 1 addition & 0 deletions tests/openvino/utils_tests.py
@@ -42,6 +42,7 @@
"donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
"electra": "hf-internal-testing/tiny-random-electra",
"gemma": "fxmarty/tiny-random-GemmaForCausalLM",
"falcon": "fxmarty/really-tiny-falcon-testing",
"flaubert": "hf-internal-testing/tiny-random-flaubert",
"gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
"gpt2": "hf-internal-testing/tiny-random-gpt2",
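The new `falcon` entry pairs with the `"falcon"` architecture added to `SUPPORTED_ARCHITECTURES` in `test_modeling.py`: the parameterized causal-LM tests resolve the tiny checkpoint through `MODEL_NAMES` before exporting it. A minimal sketch of that lookup, assuming `tests/openvino` is on the import path as it is in CI:

```python
from optimum.intel import OVModelForCausalLM
from utils_tests import MODEL_NAMES

# "falcon" now resolves to the tiny test checkpoint registered above.
model_id = MODEL_NAMES["falcon"]  # "fxmarty/really-tiny-falcon-testing"
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True)
```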