
Commit f025f05

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 7ed9c0f commit f025f05

15 files changed (+72, -38 lines)


neural_compressor/common/utils/__init__.py (+4 -4)

@@ -28,22 +28,22 @@
 VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", None))
 NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE  # 32
 NUM_EXPERTS_GROUPS = 8
-NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS # 4
+NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS  # 4
 FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK  # 4


 import sys
 import pdb

+
 class ForkedPdb(pdb.Pdb):
     """A Pdb subclass that may be used
-    from a forked multiprocessing child
+    from a forked multiprocessing child."""

-    """
     def interaction(self, *args, **kwargs):
         _stdin = sys.stdin
         try:
-            sys.stdin = open('/dev/stdin')
+            sys.stdin = open("/dev/stdin")
             pdb.Pdb.interaction(self, *args, **kwargs)
         finally:
             sys.stdin = _stdin
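For orientation, ForkedPdb (reformatted in the hunk above) is the usual trick for dropping into pdb from a forked worker, where the parent process owns stdin. A minimal usage sketch follows; the worker function and breakpoint placement are invented for illustration, not part of the commit.

# Illustrative only: how a ForkedPdb-style debugger is typically invoked from a
# forked child process. The worker below is made up for the example.
import multiprocessing


def worker(x):
    # To debug interactively, uncomment the next two lines; ForkedPdb re-opens
    # /dev/stdin so pdb can read input inside the forked child.
    # from neural_compressor.common.utils import ForkedPdb  # import path assumed from the diff
    # ForkedPdb().set_trace()
    return x * x


if __name__ == "__main__":
    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(worker, [1, 2, 3]))  # [1, 4, 9]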

neural_compressor/evaluation/hf_eval/hf_datasets/__init__.py (+1 -1)

@@ -13,4 +13,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
+# limitations under the License.

neural_compressor/evaluation/lm_eval/utils.py (+1)

@@ -22,6 +22,7 @@

 from neural_compressor.common import logger

+
 class LMEvalParser:
     def __init__(
         self,

neural_compressor/torch/algorithms/weight_only/rtn.py (-1)

@@ -37,7 +37,6 @@

 from .modules import INCWeightOnlyLinear
 from .utility import cast_fp8, quant_tensor, search_clip
-from .modules import INCWeightOnlyLinear

 if is_transformers_imported():
     import transformers

neural_compressor/torch/algorithms/weight_only/save_load.py (+18 -12)

@@ -26,14 +26,14 @@
 from neural_compressor.torch.utils import (
     HPU_SAFE_WEIGHTS_NAME,
     HPU_WEIGHT_NAME,
+    LM_HEAD_NAMES,
     QCONFIG_NAME,
     WEIGHT_NAME,
     SaveLoadFormat,
+    get_accelerator,
+    get_enum_from_format,
     logger,
     set_module,
-    get_enum_from_format,
-    LM_HEAD_NAMES,
-    get_accelerator,
 )

 from .modules import HPUWeightOnlyLinear, INCWeightOnlyLinear, MulLinear

@@ -899,6 +899,7 @@ def _load_remaining_pretrained_weight(self, model):
        }

        import transformers
+
        if transformers.__version__ < "4.45.0":
            params_dict["loaded_state_dict_keys"] = self.loaded_state_dict_keys

@@ -965,8 +966,9 @@ def change_config_to_hf_format(config_mappings):
        "true_sequential": True,
        "model_name_or_path": None,
        "model_file_base_name": "model",
-        "quant_method": "gptq" # INC is using AutoGPTQ format for RTN, GPTQ, AWQ, and TEQ
+        "quant_method": "gptq",  # INC is using AutoGPTQ format for RTN, GPTQ, AWQ, and TEQ
    }
+
    def _is_lm_head(name):
        for lm_head_name in LM_HEAD_NAMES:
            if re.match(lm_head_name, name):

@@ -993,17 +995,21 @@ def _is_lm_head(name):
    else:
        assert bits == config.bits, "bits should be the same for all modules, got {bits} and {config.bits}."
    assert sym == config.use_sym, "sym should be the same for all modules, got {sym} and {config.use_sym}."
-    assert group_size == config.group_size, \
-        "group_size should be the same for all modules, got {group_size} and {config.group_size}."
+    assert (
+        group_size == config.group_size
+    ), "group_size should be the same for all modules, got {group_size} and {config.group_size}."
    if hasattr(config, "percdamp"):
-        assert damp_percent == config.percdamp, \
-            "percdamp should be the same for all modules, got {damp_percent} and {config.percdamp}."
+        assert (
+            damp_percent == config.percdamp
+        ), "percdamp should be the same for all modules, got {damp_percent} and {config.percdamp}."
    if hasattr(config, "act_order"):
-        assert desc_act == config.act_order, \
-            "act_order should be the same for all modules, got {desc_act} and {config.act_order}."
+        assert (
+            desc_act == config.act_order
+        ), "act_order should be the same for all modules, got {desc_act} and {config.act_order}."
    if hasattr(config, "true_sequential"):
-        assert true_sequential == config.true_sequential, \
-            "true_sequential should be the same for all modules, got {true_sequential} and {config.true_sequential}."
+        assert (
+            true_sequential == config.true_sequential
+        ), "true_sequential should be the same for all modules, got {true_sequential} and {config.true_sequential}."
    default_quantization_config["bits"] = bits
    default_quantization_config["group_size"] = group_size
    default_quantization_config["damp_percent"] = damp_percent
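For context, the keys assembled in change_config_to_hf_format end up as an AutoGPTQ-style quantization_config block. A hedged sketch of the resulting shape; the key names come from the diff above, while the concrete numeric values are invented for illustration.

# Illustrative only: the AutoGPTQ-style block the function above assembles.
# Values here are made up; only the key names are taken from the diff.
quantization_config = {
    "quant_method": "gptq",  # INC uses the AutoGPTQ format for RTN, GPTQ, AWQ, and TEQ
    "bits": 4,
    "group_size": 128,
    "damp_percent": 0.01,
    "sym": True,
    "desc_act": False,
    "true_sequential": True,
    "model_name_or_path": None,
    "model_file_base_name": "model",
}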

neural_compressor/torch/quantization/config.py (+2)

@@ -1785,9 +1785,11 @@ def get_default_hqq_config() -> HQQConfig:

 from ..algorithms.fp8_quant._core.common import get_white_list

+
 @register_config(framework_name=FRAMEWORK_NAME, algo_name=FP8_QUANT)
 class FP8Config(TorchBaseConfig):
     """Config class for FP8 quantization."""
+
     name = FP8_QUANT

     def __init__(

neural_compressor/torch/quantization/save_load_entry.py (+8 -3)

@@ -32,6 +32,7 @@
     FP8_QUANT: FP8Config,
 }

+
 def save(model, checkpoint_dir="saved_results", format="default"):
     """Save quantized model.

@@ -50,6 +51,7 @@ def save(model, checkpoint_dir="saved_results", format="default"):
     # fp8_quant
     if isinstance(config_object, FP8Config):
         from neural_compressor.torch.algorithms import fp8_quant
+
         if format == SaveLoadFormat.DEFAULT:
             format = SaveLoadFormat.HUGGINGFACE
         fp8_quant.save(model, checkpoint_dir, format)

@@ -136,17 +138,20 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         return qmodel.to(device)
     elif format == SaveLoadFormat.HUGGINGFACE:
         import transformers
+
         config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
         # use config to check which algorithm is used.
         if (
-            "fp8_config" in config.quantization_config or
+            "fp8_config" in config.quantization_config
+            or
             # for FP8 LLMs for vLLM (https://huggingface.co/neuralmagic).
             (
-                "quant_method" in config.quantization_config and
-                config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
+                "quant_method" in config.quantization_config
+                and config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
             )
         ):
             from neural_compressor.torch.algorithms import fp8_quant
+
             return fp8_quant.load(model_name_or_path, format=format, device=device, **kwargs)
         else:
             from neural_compressor.torch.algorithms import weight_only
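The condition reformatted in load() above decides whether a Hugging Face checkpoint is routed to the FP8 loader. A standalone sketch of that rule; the helper name is hypothetical, only the checked keys and values come from the diff.

# Minimal sketch of the routing rule shown above. "is_fp8_checkpoint" is a
# hypothetical helper name, not part of neural_compressor.
def is_fp8_checkpoint(quantization_config: dict) -> bool:
    if "fp8_config" in quantization_config:
        return True  # INC's own FP8 checkpoints
    # FP8 LLMs for vLLM (https://huggingface.co/neuralmagic)
    return quantization_config.get("quant_method") in ("fp8", "compressed-tensors")


print(is_fp8_checkpoint({"quant_method": "compressed-tensors"}))  # True
print(is_fp8_checkpoint({"quant_method": "gptq", "bits": 4}))     # False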

neural_compressor/torch/utils/auto_accelerator.py (+5 -4)

@@ -33,7 +33,6 @@
 import torch

 from neural_compressor.common.utils import LazyImport, logger
-from functools import lru_cache

 htcore = LazyImport("habana_frameworks.torch.core")

@@ -42,15 +41,17 @@
 PRIORITY_CUDA = 90
 PRIORITY_CPU = 80

+
 class INCAcceleratorType(Enum):
     CUDA = auto()
     CPU = auto()
-    EXTERNAL_ACCELERATOR_MAX = auto() # differentiate between external to intel accelerators
+    EXTERNAL_ACCELERATOR_MAX = auto()  # differentiate between external to intel accelerators
     XPU = auto()
-    GAUDI_MIN = auto() # differentiate between any gaudi to xpu
+    GAUDI_MIN = auto()  # differentiate between any gaudi to xpu
     GAUDI2 = auto()
     GAUDI3 = auto()

+
 class AcceleratorRegistry:
     """Accelerator Registry."""

@@ -161,7 +162,7 @@ def synchronize(self):
         pass

     @abstractmethod
-    def get_inc_accelerator_type(self)->INCAcceleratorType:
+    def get_inc_accelerator_type(self) -> INCAcceleratorType:
         """Synchronize the accelerator."""
         pass
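One detail worth noting in the INCAcceleratorType hunk: because the members are declared with auto(), their values increase in declaration order, so the EXTERNAL_ACCELERATOR_MAX and GAUDI_MIN markers support range-style comparisons. A small sketch under that assumption; the is_gaudi helper is invented for illustration.

# Sketch only: mirrors the enum from the diff; is_gaudi is a hypothetical helper
# that relies on auto() assigning increasing values in declaration order.
from enum import Enum, auto


class INCAcceleratorType(Enum):
    CUDA = auto()
    CPU = auto()
    EXTERNAL_ACCELERATOR_MAX = auto()  # differentiate between external to intel accelerators
    XPU = auto()
    GAUDI_MIN = auto()  # differentiate between any gaudi to xpu
    GAUDI2 = auto()
    GAUDI3 = auto()


def is_gaudi(accel: INCAcceleratorType) -> bool:
    return accel.value > INCAcceleratorType.GAUDI_MIN.value


print(is_gaudi(INCAcceleratorType.GAUDI2), is_gaudi(INCAcceleratorType.XPU))  # True False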

neural_compressor/torch/utils/bit_packer.py (+11)

@@ -52,6 +52,7 @@ def pack_array_with_numba_b4_c32(
     )
     return packed_array

+
 @register_pack_func(4, 16)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b4_c16(

@@ -67,6 +68,7 @@ def pack_array_with_numba_b4_c16(
     )
     return packed_array

+
 @register_pack_func(4, 8)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b4_c8(

@@ -77,6 +79,7 @@ def pack_array_with_numba_b4_c8(
         packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111)
     return packed_array

+
 @register_pack_func(4, 64)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b4_c64(

@@ -104,6 +107,7 @@ def pack_array_with_numba_b4_c64(
     )
     return packed_array

+
 @register_pack_func(8, 32)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c32(

@@ -119,6 +123,7 @@ def pack_array_with_numba_b8_c32(
     )
     return packed_array

+
 @register_pack_func(8, 16)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c16(

@@ -134,6 +139,7 @@ def pack_array_with_numba_b8_c16(
     )
     return packed_array

+
 @register_pack_func(8, 8)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c8(

@@ -144,6 +150,7 @@ def pack_array_with_numba_b8_c8(
         packed_array[:, i] = raw_array[:, i * n_pack] & 0b11111111
     return packed_array

+
 @register_pack_func(8, 64)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c64(

@@ -163,6 +170,7 @@ def pack_array_with_numba_b8_c64(
     )
     return packed_array

+
 @register_pack_func(2, 32)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c32(

@@ -190,6 +198,7 @@ def pack_array_with_numba_b2_c32(
     )
     return packed_array

+
 @register_pack_func(2, 16)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c16(

@@ -209,6 +218,7 @@ def pack_array_with_numba_b2_c16(
     )
     return packed_array

+
 @register_pack_func(2, 8)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c8(

@@ -224,6 +234,7 @@ def pack_array_with_numba_b2_c8(
     )
     return packed_array

+
 @register_pack_func(2, 64)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c64(
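The pack_array_with_numba_* functions touched above pack several low-bit values into one wider integer lane. Below is a NumPy-only restatement of the 4-bit-into-8-bit case, mirroring the loop body visible in the b4_c8 hunk; the function name and test input are illustrative, and the numba decorators are deliberately omitted.

# Illustrative NumPy restatement of the 4-bit -> 8-bit packing loop shown in the
# pack_array_with_numba_b4_c8 hunk above.
import numpy as np


def pack_b4_into_u8(raw_array: np.ndarray) -> np.ndarray:
    n_pack = 8 // 4  # two 4-bit values per uint8 lane
    rows, cols = raw_array.shape
    packed_array = np.zeros((rows, cols // n_pack), dtype=np.uint8)
    for i in range(packed_array.shape[1]):
        packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111)
    return packed_array


print(pack_b4_into_u8(np.array([[1, 2, 3, 4]], dtype=np.uint8)))  # [[33 67]], i.e. 0x21, 0x43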

neural_compressor/torch/utils/environ.py (+6 -3)

@@ -15,12 +15,12 @@
 """Intel Neural Compressor PyTorch environment check."""

 import importlib
-import sys
 import os
+import sys

 import torch
-from packaging.version import Version
 import torch.distributed
+from packaging.version import Version

 from neural_compressor.common.utils import logger

@@ -235,15 +235,17 @@ def is_tbb_available(): # pragma: no cover
         return False
     return True

+
 def show_mem_info(loglevel="info"):
     hpu_mem_mb = get_used_hpu_mem_MB()
     from neural_compressor.common.utils import logger
+
     show_fn = getattr(logger, loglevel)
     rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1
     show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB")
     cpu_mem_mb = get_used_cpu_mem_MB()
     show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB")
-
+

 def get_used_hpu_mem_MB():
     """Get HPU used memory: MiB."""

@@ -259,6 +261,7 @@ def get_used_hpu_mem_MB():
 def get_used_cpu_mem_MB():
     """Get the amount of CPU memory used by the current process in MiB (Mebibytes)."""
     import psutil
+
     process = psutil.Process()
     mem_info = process.memory_info()
     used_cpu_mem = round(mem_info.rss / 1024**2, 3)
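get_used_cpu_mem_MB, visible above, is just the process resident set size reported by psutil. A standalone sketch of the same computation; the function name here is a stand-in.

# Standalone restatement of what get_used_cpu_mem_MB() above computes:
# the resident set size of the current process, in MiB.
import psutil


def used_cpu_mem_mb() -> float:
    mem_info = psutil.Process().memory_info()
    return round(mem_info.rss / 1024**2, 3)


print(f"Used CPU memory: {used_cpu_mem_mb()} MiB")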

neural_compressor/torch/utils/llm_utility.py (+9 -1)

@@ -16,10 +16,13 @@

 def initialize_model_and_tokenizer(model_name_or_path, use_load=False, device="cpu"):
     import transformers
-    from neural_compressor.torch.utils import local_rank, world_size, logger
+
+    from neural_compressor.torch.utils import local_rank, logger, world_size
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
     if use_load:
         from neural_compressor.torch.quantization import load
+
         model = load(model_name_or_path, format="huggingface", device=device)
         model, tokenizer = update_tokenizer(model, tokenizer)
         return model, tokenizer

@@ -37,6 +40,7 @@ def initialize_model_and_tokenizer(model_name_or_path, use_load=False, device="c
         "keep_module_on_host": True,
     }
     import deepspeed
+
     ds_model = deepspeed.init_inference(model, **ds_inference_kwargs)
     model = ds_model.module
     model.eval()

@@ -95,10 +99,14 @@ def __getitem__(self, idx):
     dataloader = DataLoader(tokenized_dataset, batch_size=bs, shuffle=True)
     return dataloader

+
 def llm_benchmark(model, batch_size, input_length, warmup_iters=3, total_iters=20):
     import time
+
     import torch
+
     from neural_compressor.torch.utils import get_accelerator, logger
+
     cur_accelerator = get_accelerator()
     # this is a simple example to show the performance benefit of quantization
     example_inputs = torch.ones((batch_size, input_length), dtype=torch.long)

neural_compressor/torch/utils/utility.py (+2 -3)

@@ -32,7 +32,7 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-from neural_compressor.torch.utils import is_transformers_imported, SaveLoadFormat
+from neural_compressor.torch.utils import SaveLoadFormat, is_transformers_imported

 if is_transformers_imported():
     import transformers

@@ -737,5 +737,4 @@ def get_enum_from_format(format):
             return obj
         elif format.upper() == obj.name:
             return obj
-    raise ValueError(
-        f"Invalid format value ('{format}'). Enter one of [{[m.name for m in SaveLoadFormat]}]")
+    raise ValueError(f"Invalid format value ('{format}'). Enter one of [{[m.name for m in SaveLoadFormat]}]")
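The raise reformatted above sits at the end of a name-or-member lookup over SaveLoadFormat. A self-contained sketch of that pattern, using a stand-in enum whose members are assumed; the real SaveLoadFormat may differ.

# Sketch of the lookup pattern in get_enum_from_format above, with a stand-in enum.
from enum import Enum


class SaveLoadFormat(Enum):
    DEFAULT = "default"
    HUGGINGFACE = "huggingface"


def get_enum_from_format(format):
    for obj in SaveLoadFormat:
        if format == obj or str(format).upper() == obj.name:
            return obj
    raise ValueError(f"Invalid format value ('{format}'). Enter one of {[m.name for m in SaveLoadFormat]}")


print(get_enum_from_format("huggingface"))  # SaveLoadFormat.HUGGINGFACE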

test/3x/torch/algorithms/fp8_quant/tester.py (+1 -1)

@@ -40,7 +40,7 @@
 QUANT_MODES_QUANT_ONLY = [QuantMode.QUANTIZE]

 DTYPE_TO_HPDTYPE_STR = {
-    torch.bfloat16: "BF16",
+    torch.bfloat16: "BF16",
     torch.float16: "FP16",
     torch.float32: "FP32",
 }
