 import numpy as np
 from datasets import load_dataset
 from parameterized import parameterized
+import openvino.runtime as ov
 import nncf
 from transformers import (
     AutoModelForQuestionAnswering,
@@ -154,7 +155,8 @@ class OVWeightCompressionTest(unittest.TestCase):
     )
 
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 64, 365),)
-    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = (
+    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS = ((OVModelForCausalLM, "hf-internal-testing/tiny-random-OPTForCausalLM", 16, 136),)
+    SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = (
         (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 44, 46),
     )
 
@@ -170,7 +172,7 @@ class OVWeightCompressionTest(unittest.TestCase):
             "hf-internal-testing/tiny-random-gpt2",
             dict(
                 mode=nncf.CompressWeightsMode.INT4_ASYM,
-                group_size=-1,
+                group_size=32,
                 ignored_scope=nncf.IgnoredScope(names=["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]),
             ),
             6,
@@ -297,7 +299,7 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i
         outputs = model(**tokens)
         self.assertTrue("logits" in outputs)
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS)
     @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above")
     def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, expected_pt_int8, expected_ov_int8):
         task = model_cls.export_feature
@@ -351,6 +353,49 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_id, quantization_c
 
         _, num_int4, _ = get_num_quantized_nodes(model)
         self.assertEqual(expected_ov_int4, num_int4)
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTO_COMPRESSED_MATMULS)
+    def test_ovmodel_4bit_auto_compression_with_custom_dataset(self, model_cls, model_id, expected_int8, expected_int4):
+        task = model_cls.export_feature
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        dataset_name, dataset_config_name, column = _TASK_TO_DATASET[task]
+        dataset = load_dataset(dataset_name, dataset_config_name, split="test")
+
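+        # The transform must return inputs matching the exported OpenVINO model's
+        # signature; stateful causal LM exports take an additional `beam_idx`
+        # input, so it is filled in below alongside the tokenizer outputs.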
+        def transform_fn(data, tokenizer):
+            tokenized_text = tokenizer(data[column], return_tensors="np")
+            input_ids = tokenized_text["input_ids"]
+            attention_mask = tokenized_text["attention_mask"]
+            inputs = {}
+            inputs["input_ids"] = input_ids
+            inputs["attention_mask"] = attention_mask
+            batch_size = input_ids.shape[0]
+            inputs["beam_idx"] = np.arange(batch_size, dtype=int)
+            return inputs
+
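+        # nncf.Dataset pairs the raw samples with the transform; transform_fn is
+        # applied lazily to each sample while NNCF collects statistics.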
+        quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, tokenizer=tokenizer))
+        model = model_cls.from_pretrained(
+            model_id,
+            export=True,
+            load_in_4bit=True,
+            quantization_config=OVWeightQuantizationConfig(
+                mode=nncf.CompressWeightsMode.INT4_SYM, group_size=-1, ratio=0.8, dataset=quantization_dataset
+            ),
+        )
+
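+        # With ratio=0.8, roughly 80% of the weights are compressed to INT4 and
+        # the rest stay INT8, so both node counts are verified.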
+        _, num_int8, num_int4 = get_num_quantized_nodes(model)
+        self.assertEqual(expected_int8, num_int8)
+        self.assertEqual(expected_int4, num_int4)
 
     @parameterized.expand(((OVModelForCausalLM, "gpt2"),))
     @unittest.skipIf(not IS_SUPPORT_STATEFUL, "Stateful models supported only in 2023.3 and above")