From df72e9f2dbbe5a7a949c00f3efb73fc9cf60a65b Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 2 May 2024 09:55:48 +0200
Subject: [PATCH 1/4] added bert static test

---
 tests/neural_compressor/test_optimization.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 5d99306df1..68492b17be 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -72,7 +72,7 @@ class QuantizationTest(INCTestMixin):
     SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
         ("text-classification", "bert", 21),
-        # ("text-generation", "bloom", 21),
+        ("text-generation", "bloom", 21),
     )
 
     SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + (
         ("fill-mask", "bert", 22),
         ("token-classification", "albert", 26),
     )
@@ -88,12 +88,14 @@ class QuantizationTest(INCTestMixin):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_DYNAMIC)
     def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls):
         model_name = MODEL_NAMES[model_arch]
-        quantization_config = PostTrainingQuantConfig(approach="dynamic")
         model_class = ORT_SUPPORTED_TASKS[task]["class"][0]
         tokenizer = AutoTokenizer.from_pretrained(model_name)
-        save_onnx_model = False
+        quantized_model = None
+        save_onnx_model = False
         model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
+        quantization_config = PostTrainingQuantConfig(approach="dynamic")
+
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
                 if backend == "torch":
@@ -104,8 +106,8 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 quantizer.quantize(
                     quantization_config=quantization_config,
-                    save_directory=tmp_dir,
                     save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                 )
                 if backend == "torch":
                     quantized_model = quantizer._quantized_model
@@ -130,28 +132,29 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
+        quantized_model = None
         save_onnx_model = False
         op_type_dict = (
             {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
             if save_onnx_model
             else None
         )
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
         quantization_config = PostTrainingQuantConfig(approach="static", op_type_dict=op_type_dict)
-        quantized_model = None
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
                 if backend == "torch":
                     model = model_class.auto_model_class.from_pretrained(model_name)
                 else:
-                    model = model_class.from_pretrained(model_name, export=True)
+                    model = model_class.from_pretrained(model_name, export=True, **model_kwargs)
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
                 quantizer.quantize(
                     quantization_config=quantization_config,
                     calibration_dataset=calibration_dataset,
-                    save_directory=tmp_dir,
                     save_onnx_model=save_onnx_model,
+                    save_directory=tmp_dir,
                 )
                 if backend == "torch":
                     quantized_model = quantizer._quantized_model

From e9f3aa369befe0f5ee97b0db1c73984261da8122 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Thu, 2 May 2024 14:25:07 +0200
Subject: [PATCH 2/4] fix test for models that require position ids

---
 tests/neural_compressor/test_optimization.py | 16 +++++++---------
 tests/neural_compressor/utils_tests.py       | 11 +++++++++++
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 68492b17be..da42586139 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -70,12 +70,13 @@ class QuantizationTest(INCTestMixin):
 
-    SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = (
+    SUPPORTED_ARCHITECTURES_STATIC = (
+        ("text-generation", "gpt_neo", 17),
         ("text-classification", "bert", 21),
         ("text-generation", "bloom", 21),
     )
 
-    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS + (
+    SUPPORTED_ARCHITECTURES_DYNAMIC = SUPPORTED_ARCHITECTURES_STATIC + (
         ("fill-mask", "bert", 22),
         ("token-classification", "albert", 26),
     )
 
@@ -123,7 +124,7 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
                 load_inc_model=True,
             )
 
-    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS)
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_STATIC)
     def test_static_quantization(self, task, model_arch, expected_quantized_matmuls):
         num_samples = 10
         model_name = MODEL_NAMES[model_arch]
@@ -134,13 +135,8 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
 
         quantized_model = None
         save_onnx_model = False
-        op_type_dict = (
-            {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
-            if save_onnx_model
-            else None
-        )
+        quantization_config = PostTrainingQuantConfig(approach="static")
         model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
-        quantization_config = PostTrainingQuantConfig(approach="static", op_type_dict=op_type_dict)
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
@@ -148,8 +144,10 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
                     model = model_class.auto_model_class.from_pretrained(model_name)
                 else:
                     model = model_class.from_pretrained(model_name, export=True, **model_kwargs)
+
                 quantizer = INCQuantizer.from_pretrained(model, task=task)
                 calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples)
+
                 quantizer.quantize(
                     quantization_config=quantization_config,
                     calibration_dataset=calibration_dataset,
diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py
index c91270355a..a6d09954f5 100644
--- a/tests/neural_compressor/utils_tests.py
+++ b/tests/neural_compressor/utils_tests.py
@@ -47,6 +47,7 @@
 from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME
 from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification
 from optimum.pipelines import ORT_SUPPORTED_TASKS
+from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS
 
 if is_ipex_available():
     from optimum.intel import (
@@ -135,6 +136,13 @@ def _generate_dataset(quantizer, tokenizer, num_samples=10):
         num_samples=num_samples,
         dataset_split="train",
     )
+    model_type = quantizer._original_model.config.model_type.replace("_", "-")
+    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
+        dataset = dataset.map(
+            lambda x: {
+                "position_ids": np.arange(len(x["input_ids"])),
+            }
+        )
     return dataset
 
@@ -187,6 +195,9 @@ def check_model_outputs(
             self.assertEqual(expected_quantized_matmuls, num_quantized_matmul)
 
         ort_model = ORT_SUPPORTED_TASKS[task]["class"][0].from_pretrained(save_directory, **model_kwargs)
+        model_type = ort_model.config.model_type.replace("_", "-")
+        if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
+            tokens["position_ids"] = torch.arange(len(tokens["input_ids"])).unsqueeze(0)
         ort_outputs = ort_model(**tokens)
         self.assertTrue("logits" in ort_outputs)
         # self.assertTrue(torch.allclose(ort_outputs.logits, outputs, atol=1e-2))

From 3fe585332715717702b3c7916dff0fc1270354ca Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 3 May 2024 10:16:00 +0200
Subject: [PATCH 3/4] remove io_binding

---
 tests/neural_compressor/test_optimization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index da42586139..c5e5af8a23 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -94,7 +94,7 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
 
         quantized_model = None
         save_onnx_model = False
-        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
+        model_kwargs = {"use_cache": False} if task == "text-generation" else {}
         quantization_config = PostTrainingQuantConfig(approach="dynamic")
 
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -136,7 +136,7 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
         quantized_model = None
         save_onnx_model = False
         quantization_config = PostTrainingQuantConfig(approach="static")
-        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
+        model_kwargs = {"use_cache": False} if task == "text-generation" else {}
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:

From 5bd07f7f18595fd7db6ab321b2700409ed17cbe4 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Fri, 3 May 2024 10:28:00 +0200
Subject: [PATCH 4/4] optimum refuses io binding without kv cache

---
 tests/neural_compressor/test_optimization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index c5e5af8a23..da42586139 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -94,7 +94,7 @@ def test_dynamic_quantization(self, task, model_arch, expected_quantized_matmuls
 
         quantized_model = None
         save_onnx_model = False
-        model_kwargs = {"use_cache": False} if task == "text-generation" else {}
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
         quantization_config = PostTrainingQuantConfig(approach="dynamic")
 
         with tempfile.TemporaryDirectory() as tmp_dir:
@@ -136,7 +136,7 @@ def test_static_quantization(self, task, model_arch, expected_quantized_matmuls)
         quantized_model = None
         save_onnx_model = False
         quantization_config = PostTrainingQuantConfig(approach="static")
-        model_kwargs = {"use_cache": False} if task == "text-generation" else {}
+        model_kwargs = {"use_cache": False, "use_io_binding": False} if task == "text-generation" else {}
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             for backend in ["torch", "ort"]:
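
Note on patch 2: some decoder architectures (gpt_neo among them) are exported to ONNX with an explicit position_ids input, and the optimum exporter tracks these in MODEL_TYPES_REQUIRING_POSITION_IDS, which the patch imports. Below is a minimal standalone sketch of the pattern the patch applies at inference time; the toy token values and the hard-coded model type are illustrative assumptions, not taken from the tests.

    import torch
    from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS

    # Stand-in for tokenizer("This is a sample input", return_tensors="pt").
    tokens = {
        "input_ids": torch.tensor([[101, 7592, 2088, 102]]),
        "attention_mask": torch.tensor([[1, 1, 1, 1]]),
    }

    # config.model_type uses underscores while the exporter registry uses
    # dashes, hence the replace("_", "-") seen in the patch.
    model_type = "gpt_neo".replace("_", "-")

    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
        # One monotonically increasing position per token, with a leading
        # batch axis so the tensor matches the (batch, seq_len) layout of
        # input_ids.
        seq_len = tokens["input_ids"].shape[-1]
        tokens["position_ids"] = torch.arange(seq_len).unsqueeze(0)

    print(tokens["position_ids"])  # tensor([[0, 1, 2, 3]])

The calibration path in _generate_dataset applies the same idea per sample: inside dataset.map each x["input_ids"] is a plain list of token ids, so np.arange(len(x["input_ids"])) yields one position per token.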
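Note on patches 3 and 4: they flip the same two lines back and forth. Patch 3 drops use_io_binding=False from the text-generation model kwargs, and patch 4 restores it because, as its subject says, optimum refuses IO binding when the decoder is exported without its KV cache (use_cache=False). A hedged sketch of the combination the series settles on; the tiny checkpoint name is an assumption for illustration, not from the patch.

    from optimum.onnxruntime import ORTModelForCausalLM

    # With use_cache=False the exported decoder produces no KV-cache outputs,
    # and optimum rejects IO binding for such models, so the two flags are
    # always passed together in these tests.
    model = ORTModelForCausalLM.from_pretrained(
        "hf-internal-testing/tiny-random-gpt2",  # hypothetical tiny checkpoint
        export=True,
        use_cache=False,
        use_io_binding=False,
    )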