From e7610aaf32e90894f6099bf04f297f271387d06a Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 15:20:59 +0200
Subject: [PATCH 1/9] Schedule nightly slow tests

---
 .github/workflows/test_openvino_basic.yml |  4 ++--
 tests/openvino/test_modeling.py           | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml
index effb99a84d..c2626e91e5 100644
--- a/.github/workflows/test_openvino_basic.yml
+++ b/.github/workflows/test_openvino_basic.yml
@@ -25,7 +25,7 @@ jobs:
         # Testing lower and upper bound of supported Python versions
         # This also ensures that the test fails if dependencies break for Python 3.7
         python-version: ["3.8", "3.11"]
-        transformers: ['transformers', 'git+https://github.com/huggingface/transformers.git']
+        transformers: ['transformers']
         optimum: ['optimum', 'git+https://github.com/huggingface/optimum.git']
 
     runs-on: ubuntu-20.04
@@ -51,4 +51,4 @@ jobs:
     - name: Test with Pytest
       run: |
         pytest tests/openvino/test_modeling_basic.py
-
+        RUN_SLOW=1 pytest tests/openvino/test_modeling.py -s -m "run_slow" --durations=0
\ No newline at end of file
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 32fc255a1f..70d345495c 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -20,6 +20,7 @@
 from typing import Dict
 
 import numpy as np
+import pytest
 import requests
 import timm
 import torch
@@ -53,6 +54,7 @@
     set_seed,
 )
 from transformers.onnx.utils import get_preprocessor
+from transformers.testing_utils import slow
 from utils_tests import MODEL_NAMES
 
 from optimum.intel import (
@@ -364,6 +366,8 @@ def test_compare_to_transformers(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_pipeline(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         model = OVModelForQuestionAnswering.from_pretrained(model_id, export=True)
@@ -379,6 +383,8 @@ def test_pipeline(self, model_arch):
         del model
         gc.collect()
 
+    @pytest.mark.run_slow
+    @slow
     def test_metric(self):
         model_id = "distilbert-base-cased-distilled-squad"
         set_seed(SEED)
@@ -431,6 +437,8 @@ def test_compare_to_transformers(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_pipeline(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         model = OVModelForTokenClassification.from_pretrained(model_id, export=True)
@@ -481,6 +489,8 @@ def test_compare_to_transformers(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_pipeline(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         model = OVModelForFeatureExtraction.from_pretrained(model_id, export=True)
@@ -851,6 +861,8 @@ def test_compare_to_transformers(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_pipeline(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         model = OVModelForImageClassification.from_pretrained(model_id, export=True)
@@ -981,6 +993,8 @@ def test_pipeline(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_generate_utils(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True)
@@ -1438,6 +1452,8 @@ def test_load_vanilla_transformers_which_is_not_supported(self):
         self.assertIn("only supports the tasks", str(context.exception))
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_generate_utils(self, model_arch: str):
         model_id = MODEL_NAMES[model_arch]
         model = OVModelForVision2Seq.from_pretrained(model_id, export=True)

From aed62ba017ecfbb5ecde1a0cd835a27140691472 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 16:11:58 +0200
Subject: [PATCH 2/9] add needed dependency

---
 .github/workflows/test_openvino_basic.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml
index c2626e91e5..3135e6c004 100644
--- a/.github/workflows/test_openvino_basic.yml
+++ b/.github/workflows/test_openvino_basic.yml
@@ -42,7 +42,7 @@ jobs:
         # Install openvino manually to prevent dependency conflicts when .[openvino] pins
         # optimum or transformers to a specific version
         # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
-        pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
         pip install .[tests] openvino onnx onnxruntime ${{ matrix.optimum}} ${{ matrix.transformers }}
 
     - name: Pip freeze        

From d83cce32bfdc873b28add0ebce20acc991c42c1d Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 16:25:06 +0200
Subject: [PATCH 3/9] set test to slow

---
 tests/openvino/test_modeling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 70d345495c..28b6642f61 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -594,6 +594,8 @@ def test_compare_to_transformers(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_pipeline(self, model_arch):
         model_kwargs = {}
         model_id = MODEL_NAMES[model_arch]

From c95dee5d10424186283b4aaa944a6ccefe7c81f6 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 17:35:53 +0200
Subject: [PATCH 4/9] add duration

---
 .github/workflows/test_openvino.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml
index ba5b09ff81..bff5cb525f 100644
--- a/.github/workflows/test_openvino.yml
+++ b/.github/workflows/test_openvino.yml
@@ -35,7 +35,7 @@ jobs:
         pip install .[openvino,openvino-tokenizers,tests,diffusers] onnxruntime
     - name: Test with Pytest
       run: |
-        pytest tests/openvino/ --ignore test_modeling_basic
+        pytest tests/openvino/ --ignore tests/openvino/test_modeling_basic.py --durations=0
     - name: Test openvino-nightly
       run: |
         pip uninstall -y openvino

From af8b85b74ab569c3d782f26298a0f785f00cad1f Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 17:48:29 +0200
Subject: [PATCH 5/9] convert to slow

---
 tests/openvino/test_modeling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 28b6642f61..e715f4b6fb 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -626,6 +626,8 @@ def test_pipeline(self, model_arch):
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @pytest.mark.run_slow
+    @slow
     def test_multiple_inputs(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)

From 8a4ff70354295c07ae608c59a4bd5511fea9e468 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 17:48:39 +0200
Subject: [PATCH 6/9] add dependency

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index e80d0ea448..a8c43f51d4 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@
     "auto-gptq",
     "transformers_stream_generator",
     "einops",
+    "tiktoken",
 ]
 
 QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]

From 6d091a00b255cbc5464edafa991507c5b173b328 Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 18:21:37 +0200
Subject: [PATCH 7/9] merge tests

---
 tests/openvino/test_modeling.py | 92 +++++++++++++++------------------
 tests/openvino/utils_tests.py   |  1 +
 2 files changed, 43 insertions(+), 50 deletions(-)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index e715f4b6fb..2229bc0ecd 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -536,9 +536,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
         "phi",
         "internlm2",
         "orion",
+        "falcon",
     )
     GENERATION_LENGTH = 100
-    IS_SUPPORT_STATEFUL = is_openvino_version(">=", "2023.3")
     REMOTE_CODE_MODELS = ("chatglm", "minicpm", "baichuan2", "jais", "qwen", "internlm2", "olmo", "orion")
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
@@ -563,36 +563,60 @@ def test_compare_to_transformers(self, model_arch):
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
-        self.assertEqual(
-            ov_model.stateful, self.IS_SUPPORT_STATEFUL and ov_model.config.model_type not in not_stateful
-        )
-        set_seed(SEED)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+        self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
-        if model_arch == "qwen":
-            transformers_model.to(torch.float32)
-        tokens = tokenizer(
-            "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None
-        )
-        ov_outputs = ov_model(**tokens)
+        tokens = tokenizer("This is a sample output", return_tensors="pt")
 
+        ov_outputs = ov_model(**tokens)
         self.assertTrue("logits" in ov_outputs)
         self.assertIsInstance(ov_outputs.logits, torch.Tensor)
         self.assertTrue("past_key_values" in ov_outputs)
         self.assertIsInstance(ov_outputs.past_key_values, tuple)
-        is_stateful = ov_model.config.model_type not in not_stateful and self.IS_SUPPORT_STATEFUL
+        is_stateful = ov_model.config.model_type not in not_stateful
         self.assertEqual(ov_model.stateful, is_stateful)
         if is_stateful:
             self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)
+
+        set_seed(SEED)
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+        if model_arch == "qwen":
+            transformers_model.to(torch.float32)
+
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
 
         # Compare tensor outputs
         self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4))
+
+        # Qwen tokenizer does not support padding
+        if model_arch == "qwen":
+            return
+
+        # Compare batched generation.
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        tokenizer.padding_side = "left"
+        tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
+        ov_model.generation_config.eos_token_id = None
+        transformers_model.generation_config.eos_token_id = None
+        ov_model.config.eos_token_id = None
+        transformers_model.config.eos_token_id = None
+        gen_config = GenerationConfig(
+            max_new_tokens=30,
+            min_new_tokens=30,
+            num_beams=3,
+            do_sample=False,
+            eos_token_id=None,
+        )
+
+        ov_outputs = ov_model.generate(**tokens, generation_config=gen_config)
+        transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config)
+        self.assertTrue(torch.equal(ov_outputs, transformers_outputs))  # token ids must match exactly
+
         del transformers_model
         del ov_model
         gc.collect()
 
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @pytest.mark.run_slow
     @slow
@@ -625,37 +649,6 @@ def test_pipeline(self, model_arch):
         del model
         gc.collect()
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @pytest.mark.run_slow
-    @slow
-    def test_multiple_inputs(self, model_arch):
-        model_id = MODEL_NAMES[model_arch]
-        set_seed(SEED)
-        if model_arch == "qwen":
-            self.skipTest("Qwen tokenizer does not support padding")
-        model_kwargs = {}
-        if model_arch in self.REMOTE_CODE_MODELS:
-            model_kwargs = {
-                "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True),
-                "trust_remote_code": True,
-            }
-        model = OVModelForCausalLM.from_pretrained(model_id, export=True, compile=False, **model_kwargs)
-        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
-        tokenizer.pad_token = tokenizer.eos_token
-        texts = ["this is a simple input", "this is a second simple input", "this is a third simple input"]
-        tokens = tokenizer(texts, padding=True, return_tensors="pt")
-        generation_config = GenerationConfig(encoder_no_repeat_ngram_size=0, max_new_tokens=20, num_beams=2)
-        outputs = model.generate(**tokens, generation_config=generation_config)
-        self.assertIsInstance(outputs, torch.Tensor)
-        self.assertEqual(outputs.shape[0], 3)
-        # test that generation result is reproducible
-        outputs2 = model.generate(**tokens, generation_config=generation_config)
-        self.assertIsInstance(outputs2, torch.Tensor)
-        self.assertEqual(outputs2.shape[0], 3)
-        self.assertTrue(torch.allclose(outputs2, outputs))
-        del model
-        gc.collect()
-
     def test_model_and_decoder_same_device(self):
         model_id = MODEL_NAMES["gpt2"]
         model = OVModelForCausalLM.from_pretrained(model_id, export=True)
@@ -681,12 +674,11 @@ def test_compare_with_and_without_past_key_values(self):
         self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv))
         self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH)
         self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH)
-        if self.IS_SUPPORT_STATEFUL:
-            model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True)
-            outputs_model_stateful = model_stateful.generate(
-                **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
-            )
-            self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful))
+        model_stateful = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True, stateful=True)
+        outputs_model_stateful = model_stateful.generate(
+            **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
+        )
+        self.assertTrue(torch.equal(outputs_model_without_pkv, outputs_model_stateful))
 
         del model_with_pkv
         del model_without_pkv
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index e7f62f1f61..73224c81b2 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -42,6 +42,7 @@
     "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder",
     "electra": "hf-internal-testing/tiny-random-electra",
     "gemma": "fxmarty/tiny-random-GemmaForCausalLM",
+    "falcon": "fxmarty/really-tiny-falcon-testing",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
     "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
     "gpt2": "hf-internal-testing/tiny-random-gpt2",

From 9fea0b34ffbc43e5b70f5cbeb48c4130a8750fcc Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Mon, 8 Apr 2024 18:42:56 +0200
Subject: [PATCH 8/9] fix format

---
 tests/openvino/test_modeling.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 2229bc0ecd..4b268b5b62 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -616,7 +616,6 @@ def test_compare_to_transformers(self, model_arch):
         del ov_model
         gc.collect()
 
-
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @pytest.mark.run_slow
     @slow

From 1eeba6b47a677a2141359408d6e53af24addce8b Mon Sep 17 00:00:00 2001
From: Ella Charlaix <ella@huggingface.co>
Date: Tue, 9 Apr 2024 15:40:08 +0200
Subject: [PATCH 9/9] fix test for chatglm

---
 tests/openvino/test_modeling.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index 4b268b5b62..907c767310 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -592,8 +592,9 @@ def test_compare_to_transformers(self, model_arch):
         if model_arch == "qwen":
             return
 
-        # Compare batched generation.
-        tokenizer.pad_token_id = tokenizer.eos_token_id
+        if model_arch != "chatglm":
+            tokenizer.pad_token_id = tokenizer.eos_token_id
+        # Compare batched generation
         tokenizer.padding_side = "left"
         tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True)
         ov_model.generation_config.eos_token_id = None