
Commit f025f05

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 7ed9c0f commit f025f05

15 files changed (+72, -38 lines)


neural_compressor/common/utils/__init__.py (+4 -4)

@@ -28,22 +28,22 @@
 VLLM_EP_SIZE = int(os.getenv("VLLM_EP_SIZE", None))
 NUM_EXPERTS_PER_EP_RANK = DEEPSEEK_EXPERTS // VLLM_EP_SIZE  # 32
 NUM_EXPERTS_GROUPS = 8
-NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS # 4
+NUM_EXPERTS_PER_GROUP_PER_RANK = NUM_EXPERTS_PER_EP_RANK // NUM_EXPERTS_GROUPS  # 4
 FUSED_MOE_EXPERTS = NUM_EXPERTS_PER_GROUP_PER_RANK  # 4


 import sys
 import pdb

+
 class ForkedPdb(pdb.Pdb):
     """A Pdb subclass that may be used
-    from a forked multiprocessing child
+    from a forked multiprocessing child."""

-    """
     def interaction(self, *args, **kwargs):
         _stdin = sys.stdin
         try:
-            sys.stdin = open('/dev/stdin')
+            sys.stdin = open("/dev/stdin")
             pdb.Pdb.interaction(self, *args, **kwargs)
         finally:
             sys.stdin = _stdin
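For orientation, ForkedPdb (reformatted in the hunk above) is the usual trick for dropping into pdb from a forked worker, where the parent process owns stdin. A minimal usage sketch follows; the worker function and breakpoint placement are invented for illustration, not part of the commit.

# Illustrative only: how a ForkedPdb-style debugger is typically invoked from a
# forked child process. The worker below is made up for the example.
import multiprocessing


def worker(x):
    # To debug interactively, uncomment the next two lines; ForkedPdb re-opens
    # /dev/stdin so pdb can read input inside the forked child.
    # from neural_compressor.common.utils import ForkedPdb  # import path assumed from the diff
    # ForkedPdb().set_trace()
    return x * x


if __name__ == "__main__":
    with multiprocessing.Pool(processes=2) as pool:
        print(pool.map(worker, [1, 2, 3]))  # [1, 4, 9]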

neural_compressor/evaluation/hf_eval/hf_datasets/__init__.py (+1 -1)

@@ -13,4 +13,4 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
\ No newline at end of file
+# limitations under the License.

neural_compressor/evaluation/lm_eval/utils.py (+1)

@@ -22,6 +22,7 @@

 from neural_compressor.common import logger

+
 class LMEvalParser:
     def __init__(
         self,

neural_compressor/torch/algorithms/weight_only/rtn.py (-1)

@@ -37,7 +37,6 @@

 from .modules import INCWeightOnlyLinear
 from .utility import cast_fp8, quant_tensor, search_clip
-from .modules import INCWeightOnlyLinear

 if is_transformers_imported():
     import transformers

neural_compressor/torch/algorithms/weight_only/save_load.py (+18 -12)

@@ -26,14 +26,14 @@
 from neural_compressor.torch.utils import (
     HPU_SAFE_WEIGHTS_NAME,
     HPU_WEIGHT_NAME,
+    LM_HEAD_NAMES,
     QCONFIG_NAME,
     WEIGHT_NAME,
     SaveLoadFormat,
+    get_accelerator,
+    get_enum_from_format,
     logger,
     set_module,
-    get_enum_from_format,
-    LM_HEAD_NAMES,
-    get_accelerator,
 )

 from .modules import HPUWeightOnlyLinear, INCWeightOnlyLinear, MulLinear

@@ -899,6 +899,7 @@ def _load_remaining_pretrained_weight(self, model):
        }

        import transformers
+
        if transformers.__version__ < "4.45.0":
            params_dict["loaded_state_dict_keys"] = self.loaded_state_dict_keys

@@ -965,8 +966,9 @@ def change_config_to_hf_format(config_mappings):
        "true_sequential": True,
        "model_name_or_path": None,
        "model_file_base_name": "model",
-        "quant_method": "gptq" # INC is using AutoGPTQ format for RTN, GPTQ, AWQ, and TEQ
+        "quant_method": "gptq",  # INC is using AutoGPTQ format for RTN, GPTQ, AWQ, and TEQ
    }
+
    def _is_lm_head(name):
        for lm_head_name in LM_HEAD_NAMES:
            if re.match(lm_head_name, name):

@@ -993,17 +995,21 @@ def _is_lm_head(name):
    else:
        assert bits == config.bits, "bits should be the same for all modules, got {bits} and {config.bits}."
    assert sym == config.use_sym, "sym should be the same for all modules, got {sym} and {config.use_sym}."
-    assert group_size == config.group_size, \
-        "group_size should be the same for all modules, got {group_size} and {config.group_size}."
+    assert (
+        group_size == config.group_size
+    ), "group_size should be the same for all modules, got {group_size} and {config.group_size}."
    if hasattr(config, "percdamp"):
-        assert damp_percent == config.percdamp, \
-            "percdamp should be the same for all modules, got {damp_percent} and {config.percdamp}."
+        assert (
+            damp_percent == config.percdamp
+        ), "percdamp should be the same for all modules, got {damp_percent} and {config.percdamp}."
    if hasattr(config, "act_order"):
-        assert desc_act == config.act_order, \
-            "act_order should be the same for all modules, got {desc_act} and {config.act_order}."
+        assert (
+            desc_act == config.act_order
+        ), "act_order should be the same for all modules, got {desc_act} and {config.act_order}."
    if hasattr(config, "true_sequential"):
-        assert true_sequential == config.true_sequential, \
-            "true_sequential should be the same for all modules, got {true_sequential} and {config.true_sequential}."
+        assert (
+            true_sequential == config.true_sequential
+        ), "true_sequential should be the same for all modules, got {true_sequential} and {config.true_sequential}."
    default_quantization_config["bits"] = bits
    default_quantization_config["group_size"] = group_size
    default_quantization_config["damp_percent"] = damp_percent
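For context, the keys assembled in change_config_to_hf_format end up as an AutoGPTQ-style quantization_config block. A hedged sketch of the resulting shape; the key names come from the diff above, while the concrete numeric values are invented for illustration.

# Illustrative only: the AutoGPTQ-style block the function above assembles.
# Values here are made up; only the key names are taken from the diff.
quantization_config = {
    "quant_method": "gptq",  # INC uses the AutoGPTQ format for RTN, GPTQ, AWQ, and TEQ
    "bits": 4,
    "group_size": 128,
    "damp_percent": 0.01,
    "sym": True,
    "desc_act": False,
    "true_sequential": True,
    "model_name_or_path": None,
    "model_file_base_name": "model",
}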

neural_compressor/torch/quantization/config.py (+2)

@@ -1785,9 +1785,11 @@ def get_default_hqq_config() -> HQQConfig:

 from ..algorithms.fp8_quant._core.common import get_white_list

+
 @register_config(framework_name=FRAMEWORK_NAME, algo_name=FP8_QUANT)
 class FP8Config(TorchBaseConfig):
     """Config class for FP8 quantization."""
+
     name = FP8_QUANT

     def __init__(

neural_compressor/torch/quantization/save_load_entry.py (+8 -3)

@@ -32,6 +32,7 @@
     FP8_QUANT: FP8Config,
 }

+
 def save(model, checkpoint_dir="saved_results", format="default"):
     """Save quantized model.

@@ -50,6 +51,7 @@ def save(model, checkpoint_dir="saved_results", format="default"):
     # fp8_quant
     if isinstance(config_object, FP8Config):
         from neural_compressor.torch.algorithms import fp8_quant
+
         if format == SaveLoadFormat.DEFAULT:
             format = SaveLoadFormat.HUGGINGFACE
         fp8_quant.save(model, checkpoint_dir, format)

@@ -136,17 +138,20 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         return qmodel.to(device)
     elif format == SaveLoadFormat.HUGGINGFACE:
         import transformers
+
         config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
         # use config to check which algorithm is used.
         if (
-            "fp8_config" in config.quantization_config or
+            "fp8_config" in config.quantization_config
+            or
             # for FP8 LLMs for vLLM (https://huggingface.co/neuralmagic).
             (
-                "quant_method" in config.quantization_config and
-                config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
+                "quant_method" in config.quantization_config
+                and config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
             )
         ):
             from neural_compressor.torch.algorithms import fp8_quant
+
             return fp8_quant.load(model_name_or_path, format=format, device=device, **kwargs)
         else:
             from neural_compressor.torch.algorithms import weight_only
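The condition reformatted in load() above decides whether a Hugging Face checkpoint is routed to the FP8 loader. A standalone sketch of that rule; the helper name is hypothetical, only the checked keys and values come from the diff.

# Minimal sketch of the routing rule shown above. "is_fp8_checkpoint" is a
# hypothetical helper name, not part of neural_compressor.
def is_fp8_checkpoint(quantization_config: dict) -> bool:
    if "fp8_config" in quantization_config:
        return True  # INC's own FP8 checkpoints
    # FP8 LLMs for vLLM (https://huggingface.co/neuralmagic)
    return quantization_config.get("quant_method") in ("fp8", "compressed-tensors")


print(is_fp8_checkpoint({"quant_method": "compressed-tensors"}))  # True
print(is_fp8_checkpoint({"quant_method": "gptq", "bits": 4}))     # False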

neural_compressor/torch/utils/auto_accelerator.py (+5 -4)

@@ -33,7 +33,6 @@
 import torch

 from neural_compressor.common.utils import LazyImport, logger
-from functools import lru_cache

 htcore = LazyImport("habana_frameworks.torch.core")

@@ -42,15 +41,17 @@
 PRIORITY_CUDA = 90
 PRIORITY_CPU = 80

+
 class INCAcceleratorType(Enum):
     CUDA = auto()
     CPU = auto()
-    EXTERNAL_ACCELERATOR_MAX = auto() # differentiate between external to intel accelerators
+    EXTERNAL_ACCELERATOR_MAX = auto()  # differentiate between external to intel accelerators
     XPU = auto()
-    GAUDI_MIN = auto() # differentiate between any gaudi to xpu
+    GAUDI_MIN = auto()  # differentiate between any gaudi to xpu
     GAUDI2 = auto()
     GAUDI3 = auto()

+
 class AcceleratorRegistry:
     """Accelerator Registry."""

@@ -161,7 +162,7 @@ def synchronize(self):
         pass

     @abstractmethod
-    def get_inc_accelerator_type(self)->INCAcceleratorType:
+    def get_inc_accelerator_type(self) -> INCAcceleratorType:
         """Synchronize the accelerator."""
         pass
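One detail worth noting in the INCAcceleratorType hunk: because the members are declared with auto(), their values increase in declaration order, so the EXTERNAL_ACCELERATOR_MAX and GAUDI_MIN markers support range-style comparisons. A small sketch under that assumption; the is_gaudi helper is invented for illustration.

# Sketch only: mirrors the enum from the diff; is_gaudi is a hypothetical helper
# that relies on auto() assigning increasing values in declaration order.
from enum import Enum, auto


class INCAcceleratorType(Enum):
    CUDA = auto()
    CPU = auto()
    EXTERNAL_ACCELERATOR_MAX = auto()  # differentiate between external to intel accelerators
    XPU = auto()
    GAUDI_MIN = auto()  # differentiate between any gaudi to xpu
    GAUDI2 = auto()
    GAUDI3 = auto()


def is_gaudi(accel: INCAcceleratorType) -> bool:
    return accel.value > INCAcceleratorType.GAUDI_MIN.value


print(is_gaudi(INCAcceleratorType.GAUDI2), is_gaudi(INCAcceleratorType.XPU))  # True False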

neural_compressor/torch/utils/bit_packer.py (+11)

@@ -52,6 +52,7 @@ def pack_array_with_numba_b4_c32(
     )
     return packed_array

+
 @register_pack_func(4, 16)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b4_c16(

@@ -67,6 +68,7 @@ def pack_array_with_numba_b4_c16(
     )
     return packed_array

+
 @register_pack_func(4, 8)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b4_c8(

@@ -77,6 +79,7 @@ def pack_array_with_numba_b4_c8(
         packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111)
     return packed_array

+
 @register_pack_func(4, 64)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b4_c64(

@@ -104,6 +107,7 @@ def pack_array_with_numba_b4_c64(
     )
     return packed_array

+
 @register_pack_func(8, 32)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c32(

@@ -119,6 +123,7 @@ def pack_array_with_numba_b8_c32(
     )
     return packed_array

+
 @register_pack_func(8, 16)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c16(

@@ -134,6 +139,7 @@ def pack_array_with_numba_b8_c16(
     )
     return packed_array

+
 @register_pack_func(8, 8)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c8(

@@ -144,6 +150,7 @@ def pack_array_with_numba_b8_c8(
         packed_array[:, i] = raw_array[:, i * n_pack] & 0b11111111
     return packed_array

+
 @register_pack_func(8, 64)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b8_c64(

@@ -163,6 +170,7 @@ def pack_array_with_numba_b8_c64(
     )
     return packed_array

+
 @register_pack_func(2, 32)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c32(

@@ -190,6 +198,7 @@ def pack_array_with_numba_b2_c32(
     )
     return packed_array

+
 @register_pack_func(2, 16)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c16(

@@ -209,6 +218,7 @@ def pack_array_with_numba_b2_c16(
     )
     return packed_array

+
 @register_pack_func(2, 8)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c8(

@@ -224,6 +234,7 @@ def pack_array_with_numba_b2_c8(
     )
     return packed_array

+
 @register_pack_func(2, 64)
 @numba.jit(nopython=True, parallel=True)
 def pack_array_with_numba_b2_c64(
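The pack_array_with_numba_* functions touched above pack several low-bit values into one wider integer lane. Below is a NumPy-only restatement of the 4-bit-into-8-bit case, mirroring the loop body visible in the b4_c8 hunk; the function name and test input are illustrative, and the numba decorators are deliberately omitted.

# Illustrative NumPy restatement of the 4-bit -> 8-bit packing loop shown in the
# pack_array_with_numba_b4_c8 hunk above.
import numpy as np


def pack_b4_into_u8(raw_array: np.ndarray) -> np.ndarray:
    n_pack = 8 // 4  # two 4-bit values per uint8 lane
    rows, cols = raw_array.shape
    packed_array = np.zeros((rows, cols // n_pack), dtype=np.uint8)
    for i in range(packed_array.shape[1]):
        packed_array[:, i] = ((raw_array[:, i * n_pack + 1] & 0b1111) << 4) | (raw_array[:, i * n_pack] & 0b1111)
    return packed_array


print(pack_b4_into_u8(np.array([[1, 2, 3, 4]], dtype=np.uint8)))  # [[33 67]], i.e. 0x21, 0x43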

neural_compressor/torch/utils/environ.py (+6 -3)

@@ -15,12 +15,12 @@
 """Intel Neural Compressor PyTorch environment check."""

 import importlib
-import sys
 import os
+import sys

 import torch
-from packaging.version import Version
 import torch.distributed
+from packaging.version import Version

 from neural_compressor.common.utils import logger

@@ -235,15 +235,17 @@ def is_tbb_available(): # pragma: no cover
         return False
     return True

+
 def show_mem_info(loglevel="info"):
     hpu_mem_mb = get_used_hpu_mem_MB()
     from neural_compressor.common.utils import logger
+
     show_fn = getattr(logger, loglevel)
     rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else -1
     show_fn(f"[Rank {rank}] Used HPU memory: {hpu_mem_mb // 1000} GB {hpu_mem_mb % 1000} MB")
     cpu_mem_mb = get_used_cpu_mem_MB()
     show_fn(f"[Rank {rank}] Used CPU memory: {cpu_mem_mb // 1000} GB {cpu_mem_mb % 1000} MB")
-
+

 def get_used_hpu_mem_MB():
     """Get HPU used memory: MiB."""

@@ -259,6 +261,7 @@ def get_used_hpu_mem_MB():
 def get_used_cpu_mem_MB():
     """Get the amount of CPU memory used by the current process in MiB (Mebibytes)."""
     import psutil
+
     process = psutil.Process()
     mem_info = process.memory_info()
     used_cpu_mem = round(mem_info.rss / 1024**2, 3)
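get_used_cpu_mem_MB, visible above, is just the process resident set size reported by psutil. A standalone sketch of the same computation; the function name here is a stand-in.

# Standalone restatement of what get_used_cpu_mem_MB() above computes:
# the resident set size of the current process, in MiB.
import psutil


def used_cpu_mem_mb() -> float:
    mem_info = psutil.Process().memory_info()
    return round(mem_info.rss / 1024**2, 3)


print(f"Used CPU memory: {used_cpu_mem_mb()} MiB")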

neural_compressor/torch/utils/llm_utility.py (+9 -1)

@@ -16,10 +16,13 @@

 def initialize_model_and_tokenizer(model_name_or_path, use_load=False, device="cpu"):
     import transformers
-    from neural_compressor.torch.utils import local_rank, world_size, logger
+
+    from neural_compressor.torch.utils import local_rank, logger, world_size
+
     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
     if use_load:
         from neural_compressor.torch.quantization import load
+
         model = load(model_name_or_path, format="huggingface", device=device)
         model, tokenizer = update_tokenizer(model, tokenizer)
         return model, tokenizer

@@ -37,6 +40,7 @@ def initialize_model_and_tokenizer(model_name_or_path, use_load=False, device="c
         "keep_module_on_host": True,
     }
     import deepspeed
+
     ds_model = deepspeed.init_inference(model, **ds_inference_kwargs)
     model = ds_model.module
     model.eval()

@@ -95,10 +99,14 @@ def __getitem__(self, idx):
     dataloader = DataLoader(tokenized_dataset, batch_size=bs, shuffle=True)
     return dataloader

+
 def llm_benchmark(model, batch_size, input_length, warmup_iters=3, total_iters=20):
     import time
+
     import torch
+
     from neural_compressor.torch.utils import get_accelerator, logger
+
     cur_accelerator = get_accelerator()
     # this is a simple example to show the performance benefit of quantization
     example_inputs = torch.ones((batch_size, input_length), dtype=torch.long)

neural_compressor/torch/utils/utility.py (+2 -3)

@@ -32,7 +32,7 @@
     detect_processor_type_based_on_hw,
     logger,
 )
-from neural_compressor.torch.utils import is_transformers_imported, SaveLoadFormat
+from neural_compressor.torch.utils import SaveLoadFormat, is_transformers_imported

 if is_transformers_imported():
     import transformers

@@ -737,5 +737,4 @@ def get_enum_from_format(format):
             return obj
         elif format.upper() == obj.name:
             return obj
-    raise ValueError(
-        f"Invalid format value ('{format}'). Enter one of [{[m.name for m in SaveLoadFormat]}]")
+    raise ValueError(f"Invalid format value ('{format}'). Enter one of [{[m.name for m in SaveLoadFormat]}]")
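The raise reformatted above sits at the end of a name-or-member lookup over SaveLoadFormat. A self-contained sketch of that pattern, using a stand-in enum whose members are assumed; the real SaveLoadFormat may differ.

# Sketch of the lookup pattern in get_enum_from_format above, with a stand-in enum.
from enum import Enum


class SaveLoadFormat(Enum):
    DEFAULT = "default"
    HUGGINGFACE = "huggingface"


def get_enum_from_format(format):
    for obj in SaveLoadFormat:
        if format == obj or str(format).upper() == obj.name:
            return obj
    raise ValueError(f"Invalid format value ('{format}'). Enter one of {[m.name for m in SaveLoadFormat]}")


print(get_enum_from_format("huggingface"))  # SaveLoadFormat.HUGGINGFACE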

test/3x/torch/algorithms/fp8_quant/tester.py (+1 -1)

@@ -40,7 +40,7 @@
 QUANT_MODES_QUANT_ONLY = [QuantMode.QUANTIZE]

 DTYPE_TO_HPDTYPE_STR = {
-    torch.bfloat16: "BF16",
+    torch.bfloat16: "BF16",
     torch.float16: "FP16",
     torch.float32: "FP32",
 }
