@@ -155,7 +155,34 @@ class OVWeightCompressionTest(unittest.TestCase):

    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),)
    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = (
-        (OVModelForCausalLM, "opt125m", 64, 477),
+        (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 46),
+    )
+
+    LOAD_IN_4_BITS_SCOPE = (
+        (
+            OVModelForCausalLM,
+            "hf-internal-testing/tiny-random-gpt2",
+            dict(mode=nncf.CompressWeightsMode.INT4_ASYM, group_size=-1, ratio=0.8),
+            16,
+        ),
+        (
+            OVModelForCausalLM,
+            "hf-internal-testing/tiny-random-gpt2",
+            dict(
+                mode=nncf.CompressWeightsMode.INT4_ASYM,
+                group_size=-1,
+                ignored_scope=nncf.IgnoredScope(names=["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]),
+            ),
+            6,
+        ),
+        (
+            OVModelForCausalLM,
+            "hf-internal-testing/tiny-random-gpt2",
+            dict(mode=nncf.CompressWeightsMode.INT4_ASYM, group_size=-1, ratio=0.8, all_layers=True),
+            22,
+        ),
+        # TODO: uncomment after fix
+        # (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", dict(mode=nncf.CompressWeightsMode.INT4_SYM, group_size=-1, ratio=0.8, sensitivity_metric=nncf.SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE, dataset="ptb"), 16),
    )

    SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = (
@@ -249,37 +276,26 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i

    @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS)
    @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above")
-    def test_ovmodel_4bit_weight_compression_stateful(self, model_cls, model_name, expected_int8, expected_int4):
+    def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
        task = model_cls.export_feature

        with tempfile.TemporaryDirectory() as tmp_dir:
            model_id = MODEL_NAMES[model_name]
            transformers_model = model_cls.from_pretrained(model_id, export=True, stateful=True)
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token

            quantizer = OVQuantizer.from_pretrained(transformers_model, task=task)
-            quantizer.quantize(
-                save_directory=tmp_dir,
-                weights_only=True,
-                quantization_config=OVWeightQuantizationConfig(mode=nncf.CompressWeightsMode.INT4_SYM, ratio=0.8),
-            )
+            quantizer.quantize(save_directory=tmp_dir, weights_only=True)
            model = model_cls.from_pretrained(tmp_dir)
-            self.assertTrue(model.stateful)
-            self.assertTrue(model.use_cache)

-            _, num_int8, num_int4 = get_num_quantized_nodes(model)
-            self.assertEqual(expected_int8, num_int8)
-            self.assertEqual(expected_int4, num_int4)
+            _, num_int8, _ = get_num_quantized_nodes(model)
+            self.assertEqual(expected_ov_int8, num_int8)

            tokens = tokenizer("This is a sample input", return_tensors="pt")
            outputs = model(**tokens)
-
            self.assertTrue("logits" in outputs)
-            self.assertTrue("past_key_values" in outputs)
-            self.assertIsInstance(outputs.past_key_values, tuple)
-            self.assertTrue(len(outputs.past_key_values) == 1 and len(outputs.past_key_values[0]) == 0)

    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION)
    def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
@@ -298,6 +314,21 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type):
            _, num_int8, _ = get_num_quantized_nodes(model)
            self.assertEqual(expected_ov_int8[i], num_int8)

+    @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
+    def test_ovmodel_4bit_auto_compression(self, model_cls, model_id, quantization_config, expected_ov_int4):
+        task = model_cls.export_feature
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model = model_cls.from_pretrained(
+                model_id, export=True, load_in_4bit=True, quantization_config=quantization_config
+            )
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+
+            _, _, num_int4 = get_num_quantized_nodes(model)
+            self.assertEqual(expected_ov_int4, num_int4)
+
    @parameterized.expand(((OVModelForCausalLM, "gpt2"),))
    @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above")
    def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type):
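For reference, the load_in_4bit path exercised by test_ovmodel_4bit_auto_compression above can be driven directly from user code. A minimal sketch, assuming optimum-intel with this change applied plus nncf and transformers installed; it simply mirrors the first LOAD_IN_4_BITS_SCOPE entry, so the dict-style quantization_config is taken from the test parameters, not from a separately documented API:

import nncf
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

# Export the tiny GPT-2 test model to OpenVINO and compress its weights to 4 bit (INT4_ASYM),
# mirroring the first LOAD_IN_4_BITS_SCOPE entry above.
model = OVModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-gpt2",
    export=True,
    load_in_4bit=True,
    quantization_config=dict(mode=nncf.CompressWeightsMode.INT4_ASYM, group_size=-1, ratio=0.8),
)

# Quick smoke test, as the stateful compression test above does.
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
outputs = model(**tokenizer("This is a sample input", return_tensors="pt"))
assert "logits" in outputs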