
Commit e9cb48c

improve UT coverage of PT Utils and Quantization (#1842)
* update UTs

Signed-off-by: xin3he <xin3.he@intel.com>
Signed-off-by: xinhe3 <xinhe3@habana.ai>
1 parent 6b27383 commit e9cb48c

File tree: 14 files changed, +346 −254 lines

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt (+2)

@@ -11,3 +11,5 @@ neural-compressor
 intel-extension-for-transformers
 lm_eval==0.4.2
 peft
+auto_round
+intel_extension_for_pytorch

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py (+3 −2)

@@ -236,9 +236,10 @@ def get_user_model():
     # 3.x api
     if args.approach == 'weight_only':
         from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
-        from neural_compressor.torch.utils import get_double_quant_config
+        from neural_compressor.torch.utils import get_double_quant_config_dict
         weight_sym = True if args.woq_scheme == "sym" else False
-        double_quant_config_dict = get_double_quant_config(args.double_quant_type)
+        if args.double_quant_type is not None:
+            double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type)

         if args.woq_algo == "RTN":
             if args.double_quant_type is not None:
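
A minimal sketch of the updated call pattern, assuming only what the diff above shows: the helper was renamed to get_double_quant_config_dict and no longer accepts None, so the caller guards the lookup itself. The preset value below is illustrative.

# Sketch of the guarded call site; "BNB_NF4" is the preset the helper
# defaults to elsewhere in this commit, used here as an example value.
from neural_compressor.torch.utils import get_double_quant_config_dict

double_quant_type = "BNB_NF4"  # stands in for args.double_quant_type
double_quant_config_dict = None
if double_quant_type is not None:
    double_quant_config_dict = get_double_quant_config_dict(double_quant_type)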

neural_compressor/torch/algorithms/pt2e_quant/core.py (+1 −1)

@@ -26,7 +26,7 @@
 from neural_compressor.common.utils import logger
 from neural_compressor.torch.algorithms.base_algorithm import Quantizer
 from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter
-from neural_compressor.torch.utils import create_xiq_quantizer_from_pt2e_config
+from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config


 class W8A8PT2EQuantizer(Quantizer):
neural_compressor/torch/algorithms/pt2e_quant/utility.py (new file, +79; path inferred from the import change above)

@@ -0,0 +1,79 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+import torch
+import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
+from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
+from torch.ao.quantization.quantizer import QuantizationSpec
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer
+
+
+def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
+    dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
+    select_dtype = dtype_mapping[dtype]
+    min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
+    qscheme_mapping = {
+        "per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
+        "per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
+    }
+    observer_mapping = {
+        "placeholder": PlaceholderObserver,
+        "minmax": MinMaxObserver,
+        "kl": HistogramObserver,
+    }
+    # Force to use placeholder observer for dynamic quantization
+    if is_dynamic:
+        algo = "placeholder"
+    # algo
+    observer_or_fake_quant_ctr = observer_mapping[algo]
+    # qscheme
+    qscheme = qscheme_mapping[granularity][sym]
+    quantization_spec = QuantizationSpec(
+        dtype=select_dtype,
+        quant_min=min_max_mapping[select_dtype][0],
+        quant_max=min_max_mapping[select_dtype][1],
+        observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
+        qscheme=qscheme,
+        is_dynamic=is_dynamic,
+    )
+    return quantization_spec
+
+
+def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
+    default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
+    input_act_quant_spec = create_quant_spec_from_config(
+        inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic
+    )
+    weight_quant_spec = create_quant_spec_from_config(
+        inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo
+    )
+    quant_config = QuantizationConfig(
+        input_activation=input_act_quant_spec,
+        output_activation=default_quant_config.output_activation,
+        weight=weight_quant_spec,
+        bias=default_quant_config.bias,
+        is_qat=False,
+    )
+    return quant_config
+
+
+def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
+    quantizer = xiq.X86InductorQuantizer()
+    # set global
+    global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
+    quantizer.set_global(global_config)
+    # Skip the local config for now (need torch 2.4)
+    return quantizer
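
For orientation, a hedged usage sketch of the relocated helper: any object exposing the act_*/w_* attributes read by _map_inc_config_to_torch_quant_config can drive it. The SimpleNamespace stand-in and the attribute values below are illustrative assumptions, not part of the commit.

# Hedged sketch: build an X86InductorQuantizer from an INC-style config object.
from types import SimpleNamespace

from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config

inc_config = SimpleNamespace(
    act_dtype="uint8", act_sym=False, act_granularity="per_tensor", act_algo="kl",
    w_dtype="int8", w_sym=True, w_granularity="per_channel", w_algo="minmax",
)
quantizer = create_xiq_quantizer_from_pt2e_config(inc_config, is_dynamic=False)
# quantizer now carries a global QuantizationConfig mirroring inc_config and
# can be handed to the PT2E prepare/convert flow.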

neural_compressor/torch/utils/auto_accelerator.py (+12 −6)

@@ -98,7 +98,7 @@ class CUDA_Accelerator:
     return accelerator_registry.register_accelerator_impl(name=name, priority=priority)


-class Auto_Accelerator(ABC):
+class Auto_Accelerator(ABC):  # pragma: no cover
     @classmethod
     @abstractmethod
     def is_available(cls) -> bool:

@@ -175,7 +175,7 @@ def synchronize(self):


 @register_accelerator(name="cuda", priority=PRIORITY_CUDA)
-class CUDA_Accelerator(Auto_Accelerator):
+class CUDA_Accelerator(Auto_Accelerator):  # pragma: no cover
     def __init__(self) -> None:
         self._name = "cuda"

@@ -211,7 +211,7 @@ def empty_cache(self):


 @register_accelerator(name="xpu", priority=PRIORITY_XPU)
-class XPU_Accelerator(Auto_Accelerator):
+class XPU_Accelerator(Auto_Accelerator):  # pragma: no cover
     def __init__(self) -> None:
         self._name = "xpu"

@@ -250,7 +250,7 @@ def empty_cache(self):


 @register_accelerator(name="hpu", priority=PRIORITY_HPU)
-class HPU_Accelerator(Auto_Accelerator):
+class HPU_Accelerator(Auto_Accelerator):  # pragma: no cover
     def __init__(self) -> None:
         self._name = "hpu"

@@ -275,7 +275,10 @@ def synchronize(self):
         return torch.hpu.synchronize()

     def set_device(self, device_index):
-        return torch.hpu.set_device(device_index)
+        try:
+            torch.hpu.set_device(device_index)
+        except Exception as e:
+            logger.warning(e)

     def current_device(self):
         return torch.hpu.current_device()

@@ -287,7 +290,10 @@ def device(self, device_index=None):
         return torch.hpu.device(device_index)

     def empty_cache(self):
-        return torch.hpu.empty_cache()
+        try:
+            torch.hpu.empty_cache()
+        except Exception as e:
+            logger.warning(e)

     def mark_step(self):
         return htcore.mark_step()
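
The `# pragma: no cover` markers exclude the device-specific classes from coverage accounting, and the HPU methods now log failures instead of raising. A short sketch of how the detected accelerator is typically consumed; the accelerator import and current_device_name() call mirror the test change later in this commit, while moving a tensor to it is an illustrative assumption.

# Hedged sketch: query the auto-detected accelerator and place a tensor on it.
import torch

from neural_compressor.torch.utils import accelerator

device = accelerator.current_device_name()  # e.g. "cpu" or an HPU/CUDA/XPU device string
x = torch.randn(2, 2).to(device)
# On HPU, set_device()/empty_cache() failures are now logged as warnings
# rather than propagated, so this path degrades gracefully.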

neural_compressor/torch/utils/environ.py (+25 −15)

@@ -13,24 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import importlib
 import sys

 import torch
 from packaging.version import Version

-# pylint:disable=import-error
-try:
-    import habana_frameworks.torch.hpex
-
-    _hpex_available = True
-except:
-    _hpex_available = False
-
-
-def is_hpex_available():
-    return _hpex_available
-

+################ Check imported sys.module first to decide behavior #################
 def is_ipex_imported() -> bool:
     for name, _ in sys.modules.items():
         if name == "intel_extension_for_pytorch":

@@ -45,11 +35,29 @@ def is_transformers_imported() -> bool:
     return False


-try:
-    import intel_extension_for_pytorch as ipex
+################ Check available sys.module to decide behavior #################
+def is_package_available(package_name):
+    from importlib.util import find_spec
+
+    package_spec = find_spec(package_name)
+    return package_spec is not None

+
+## check hpex
+if is_package_available("habana_frameworks"):
+    _hpex_available = True
+else:
+    _hpex_available = False
+
+
+def is_hpex_available():
+    return _hpex_available
+
+
+## check ipex
+if is_package_available("intel_extension_for_pytorch"):
     _ipex_available = True
-except:
+else:
     _ipex_available = False


@@ -60,6 +68,8 @@ def is_ipex_available():
 def get_ipex_version():
     if is_ipex_available():
         try:
+            import intel_extension_for_pytorch as ipex
+
             ipex_version = ipex.__version__.split("+")[0]
         except ValueError as e:  # pragma: no cover
             assert False, "Got an unknown version of intel_extension_for_pytorch: {}".format(e)
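
The availability checks now rely on importlib's find_spec instead of importing the heavy extensions at module load. A minimal sketch of the same pattern; the package names are just examples.

# Minimal sketch of the find_spec-based check: it answers "could this be
# imported?" without actually importing the package.
from importlib.util import find_spec


def is_package_available(package_name: str) -> bool:
    return find_spec(package_name) is not None


print(is_package_available("torch"))          # True wherever torch is installed
print(is_package_available("no_such_pkg"))    # False, and nothing was imported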

neural_compressor/torch/utils/utility.py (+1 −65)

@@ -16,10 +16,6 @@
 from typing import Callable, Dict, List, Tuple, Union

 import torch
-import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
-from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
-from torch.ao.quantization.quantizer import QuantizationSpec
-from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer
 from typing_extensions import TypeAlias

 from neural_compressor.common import logger

@@ -120,11 +116,9 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) ->
     return filter_result


-def get_double_quant_config(double_quant_type):
+def get_double_quant_config_dict(double_quant_type="BNB_NF4"):
     from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS

-    if double_quant_type is None:
-        return {}
     assert double_quant_type in DOUBLE_QUANT_CONFIGS, "Supported double quant configs: {}".format(
         list(DOUBLE_QUANT_CONFIGS.keys())
     )

@@ -170,61 +164,3 @@ def postprocess_model(model, mode, quantizer):
     elif mode == Mode.CONVERT or mode == Mode.QUANTIZE:
         if getattr(model, "quantizer", False):
             del model.quantizer
-
-
-def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
-    dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
-    select_dtype = dtype_mapping[dtype]
-    min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
-    qscheme_mapping = {
-        "per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
-        "per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
-    }
-    observer_mapping = {
-        "placeholder": PlaceholderObserver,
-        "minmax": MinMaxObserver,
-        "kl": HistogramObserver,
-    }
-    # Force to use placeholder observer for dynamic quantization
-    if is_dynamic:
-        algo = "placeholder"
-    # algo
-    observer_or_fake_quant_ctr = observer_mapping[algo]
-    # qscheme
-    qscheme = qscheme_mapping[granularity][sym]
-    quantization_spec = QuantizationSpec(
-        dtype=select_dtype,
-        quant_min=min_max_mapping[select_dtype][0],
-        quant_max=min_max_mapping[select_dtype][1],
-        observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
-        qscheme=qscheme,
-        is_dynamic=is_dynamic,
-    )
-    return quantization_spec
-
-
-def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
-    default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
-    input_act_quant_spec = create_quant_spec_from_config(
-        inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic
-    )
-    weight_quant_spec = create_quant_spec_from_config(
-        inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo
-    )
-    quant_config = QuantizationConfig(
-        input_activation=input_act_quant_spec,
-        output_activation=default_quant_config.output_activation,
-        weight=weight_quant_spec,
-        bias=default_quant_config.bias,
-        is_qat=False,
-    )
-    return quant_config
-
-
-def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
-    quantizer = xiq.X86InductorQuantizer()
-    # set global
-    global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
-    quantizer.set_global(global_config)
-    # Skip the local config for now (need torch 2.4)
-    return quantizer
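
A hedged sketch of the renamed helper's new contract: it defaults to the "BNB_NF4" preset and asserts on unknown names instead of tolerating None. Expanding the returned dict into RTNConfig mirrors how the example script appears to consume it, but that expansion is an assumption, not something shown verbatim in this diff.

# Hedged sketch of the new contract; RTNConfig(**...) is an assumed usage.
from neural_compressor.torch.quantization import RTNConfig
from neural_compressor.torch.utils import get_double_quant_config_dict

cfg_dict = get_double_quant_config_dict()  # defaults to the "BNB_NF4" preset
quant_config = RTNConfig(**cfg_dict)       # assumption: preset keys match RTNConfig fields

try:
    get_double_quant_config_dict("NOT_A_PRESET")
except AssertionError as err:
    print(err)  # lists the supported double-quant preset names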

requirements_pt.txt (+1 −2)

@@ -1,5 +1,4 @@
-auto-round
-intel_extension_for_pytorch
+numpy
 peft==0.10.0
 psutil
 py-cpuinfo

test/3x/torch/quantization/weight_only/test_rtn.py (+3 −1)

@@ -14,7 +14,7 @@
     prepare,
     quantize,
 )
-from neural_compressor.torch.utils import accelerator
+from neural_compressor.torch.utils import accelerator, is_hpex_available

 device = accelerator.current_device_name()

@@ -76,6 +76,8 @@ def test_int_params(self, bits, use_sym, group_size, group_dim):
         model = convert(model)
         out = model(self.example_inputs)[0]
         assert (out != self.label).any(), "WOQ output should be different with raw output"
+        if is_hpex_available():
+            assert "hpu" in out.device, "Neural Compressor should run on HPU when HPEX is available."
         if (bits, use_sym, group_size, group_dim) == (8, True, -1, 1):
             assert torch.allclose(out, self.label, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."
         if (bits, use_sym, group_size, group_dim) == [(4, True, 128, 0), (4, True, 32, 1)]:
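
The same guard can protect device assertions in other tests. A hedged pytest-style sketch, where the tiny model and input are placeholders and only accelerator and is_hpex_available come from this commit:

# Hedged sketch of an HPEX-aware device assertion in a standalone test.
import torch

from neural_compressor.torch.utils import accelerator, is_hpex_available


def test_runs_on_detected_device():
    device = accelerator.current_device_name()
    model = torch.nn.Linear(4, 2).to(device)
    out = model(torch.randn(1, 4).to(device))
    if is_hpex_available():
        assert "hpu" in str(out.device), "expected HPU execution when HPEX is available"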

test/3x/torch/requirements.txt (+2)

@@ -1,4 +1,6 @@
+auto_round
 expecttest
+intel_extension_for_pytorch
 numpy
 peft==0.10.0
 prettytable
