@@ -72,7 +72,7 @@
class QuantizationTest(INCTestMixin):
    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
        ("text-classification", "bert", 21),
-        # ("text-generation", "bloom", 21),
+        ("text-generation", "bloom", 21),
    )

    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + (
@@ -88,12 +88,14 @@ class QuantizationTest(INCTestMixin):
    @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC)
    def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls):
        model_name = MODEL_NAMES[model_arch]
-        quantization_config = PostTrainingQuantConfig(approach="dynamic")
        model_class = ORT_SUPPORTED_TASKS[task]["class"][0]
        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        save_onnx_model = False
+
        quantized_model = None
+        save_onnx_model = False
        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
+        quantization_config = PostTrainingQuantConfig(approach="dynamic")
+
        with tempfile.TemporaryDirectory() as tmp_dir:
            for backend in ["torch", "ort"]:
                if backend == "torch":
@@ -104,8 +106,8 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                quantizer = INCQuantizer.from_pretrained(model, task=task)
                quantizer.quantize(
                    quantization_config=quantization_config,
-                    save_directory=tmp_dir,
                    save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                )
                if backend == "torch":
                    quantized_model = quantizer._quantized_model
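
For reference, here is a minimal standalone sketch of the dynamic post-training quantization flow this test exercises. It assumes the public `optimum-intel` and `neural-compressor` import paths; the `bert-base-uncased` checkpoint and the `text-classification` task are illustrative stand-ins for the parameterized values:

```python
# Hedged sketch of the dynamic PTQ flow from test_dynamic_quantization.
# Assumptions: public import paths; checkpoint/task chosen for illustration.
import tempfile

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from neural_compressor.config import PostTrainingQuantConfig
from optimum.intel import INCQuantizer

model_name = "bert-base-uncased"  # illustrative checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dynamic quantization computes activation ranges at runtime,
# so no calibration dataset is needed.
quantization_config = PostTrainingQuantConfig(approach="dynamic")
quantizer = INCQuantizer.from_pretrained(model, task="text-classification")

with tempfile.TemporaryDirectory() as tmp_dir:
    quantizer.quantize(
        quantization_config=quantization_config,
        save_onnx_model=False,
        save_directory=tmp_dir,
    )
```

The test runs this flow once against the PyTorch model and once against the ONNX Runtime export, which is why the diff carries a backend loop and passes `save_directory` on every iteration.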
@@ -130,28 +132,29 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

+        quantized_model = None
        save_onnx_model = False
        op_type_dict = (
            {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
            if save_onnx_model
            else None
        )
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
        quantization_config = PostTrainingQuantConfig(approach="static", op_type_dict=op_type_dict)
-        quantized_model = None

        with tempfile.TemporaryDirectory() as tmp_dir:
            for backend in ["torch", "ort"]:
                if backend == "torch":
                    model = model_class.auto_model_class.from_pretrained(model_name)
                else:
-                    model = model_class.from_pretrained(model_name, export=True)
+                    model = model_class.from_pretrained(model_name, export=True, **model_kwargs)
                quantizer = INCQuantizer.from_pretrained(model, task=task)
                calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
                quantizer.quantize(
                    quantization_config=quantization_config,
                    calibration_dataset=calibration_dataset,
-                    save_directory=tmp_dir,
                    save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                )
                if backend == "torch":
                    quantized_model = quantizer._quantized_model
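
Static quantization follows the same pattern but additionally needs calibration data, which the test builds with its private `_generate_dataset` helper. Below is a hedged sketch continuing from the dynamic example above (reusing its imports, `model_name`, and `tokenizer`); the `glue`/`sst2` dataset and `num_samples=100` are illustrative choices, not the test's values:

```python
# Sketch of the static PTQ flow from test_static_quantization, continuing
# the previous example. Dataset choice and sample count are assumptions.
def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

model = AutoModelForSequenceClassification.from_pretrained(model_name)  # fresh FP32 copy
quantization_config = PostTrainingQuantConfig(approach="static")
quantizer = INCQuantizer.from_pretrained(model, task="text-classification")

# Static quantization calibrates activation ranges on real samples.
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=preprocess_function,
    num_samples=100,
    dataset_split="train",
)

with tempfile.TemporaryDirectory() as tmp_dir:
    quantizer.quantize(
        quantization_config=quantization_config,
        calibration_dataset=calibration_dataset,
        save_onnx_model=False,
        save_directory=tmp_dir,
    )
```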