Commit 3603a0b

fix format and convert v2 to v1
Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
1 parent 5979473 commit 3603a0b

3 files changed: +17 -12 lines changed

optimum/gptq/quantizer.py (+13 -9)

@@ -29,10 +29,10 @@
 
 from ..utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available
 from ..utils.modeling_utils import recurse_getattr
+from ..version import __version__ as optimum_version
 from .constants import GPTQ_CONFIG
 from .data import get_dataset, prepare_dataset
 from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
-from ..version import __version__ as optimum_version
 
 
 if is_accelerate_available():
@@ -43,11 +43,11 @@
     from accelerate.hooks import remove_hook_from_module
 
 if is_auto_gptq_available():
+    from auto_gptq import __version__ as autogptq_version
     from auto_gptq import exllama_set_max_input_length
     from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init
     from auto_gptq.quantization import GPTQ
     from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
-    from auto_gptq import __version__ as autogptq_version
 
 if is_gptqmodel_available():
     from gptqmodel import exllama_set_max_input_length
@@ -128,8 +128,7 @@ def __init__(
                 Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
                 i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
             backend (`str`, *optional*):
-                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only
-                valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
+                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
             use_cuda_fp16 (`bool`, defaults to `False`):
                 Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16.
             model_seqlen (`Optional[int]`, defaults to `None`):
@@ -246,7 +245,7 @@ def to_dict(self):
 
         if gptq_dict.get("meta") is None:
            gptq_dict["meta"] = {}
-
+
         meta = gptq_dict["meta"]
         # store both optimum:version and gptq_lib:version into quantize_config.meta.quantizer
         if meta.get("quantizer") is None:
@@ -719,7 +718,9 @@ class StoreAttr(object):
             pass
 
         if is_gptqmodel_available():
-            model, _ = hf_convert_gptq_v1_to_v2_format(model, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
+            model, _ = hf_convert_gptq_v1_to_v2_format(
+                model, self.bits, self.quant_linear, self.checkpoint_format, self.meta
+            )
 
         model.quantize_config = StoreAttr()
         model.quantize_config.desc_act = self.desc_act
@@ -790,9 +791,12 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa
         """
 
         # convert gptqmodel internal gptq_v2 format to v1 for max compatibility
-        model, converted = hf_convert_gptq_v2_to_v1_format(model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
-        if converted:
-            self.checkpoint_format = "gptq"
+        if is_gptqmodel_available():
+            model, converted = hf_convert_gptq_v2_to_v1_format(
+                model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta
+            )
+            if converted:
+                self.checkpoint_format = "gptq"
 
         os.makedirs(save_dir, exist_ok=True)
         model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
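
For context on the save-path change above, here is a minimal usage sketch of where the newly guarded v2-to-v1 conversion runs. The checkpoint name, dataset, and GPTQQuantizer arguments are illustrative assumptions, not part of this commit.

# Illustrative sketch, not part of the commit: quantize with optimum's
# GPTQQuantizer and save. With this change, save() only calls
# hf_convert_gptq_v2_to_v1_format() when gptqmodel is installed; in an
# auto-gptq-only environment the conversion step is skipped entirely.
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

model_id = "facebook/opt-125m"  # any causal LM checkpoint; name is illustrative
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

quantizer = GPTQQuantizer(bits=4, dataset="c4", model_seqlen=2048)
quantized_model = quantizer.quantize_model(model, tokenizer)

# save() rewrites gptqmodel's internal gptq_v2 checkpoint to the v1 "gptq"
# format, and resets checkpoint_format only when a conversion actually
# happened, per the save() hunk above.
quantizer.save(quantized_model, "opt-125m-gptq-4bit")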

optimum/utils/import_utils.py (+1 -1)

@@ -52,7 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0")
 DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0")
 AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99")  # Allows 0.5.0.dev0
-GPTQMODEL_MINIMUM_VERSION = version.parse("1.3.99")  # Allows 1.4.0.dev0
+GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.1")  # Allows 1.4.0.dev0
 
 
 # This is the minimal required version to support some ONNX Runtime features
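
For reference, GPTQMODEL_MINIMUM_VERSION is a packaging.version object, so the gate reduces to comparing it against the installed distribution's version. Below is a minimal sketch of such a check using only the standard library and packaging; the helper name is hypothetical, not optimum's actual implementation.

import importlib.metadata

from packaging import version

GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.1")


def gptqmodel_meets_minimum() -> bool:
    """Hypothetical helper: True only if an installed gptqmodel is >= 1.4.1."""
    try:
        installed = version.parse(importlib.metadata.version("gptqmodel"))
    except importlib.metadata.PackageNotFoundError:
        # gptqmodel is not installed at all
        return False
    return installed >= GPTQMODEL_MINIMUM_VERSION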

tests/gptq/test_quantization.py (+3 -2)

@@ -193,7 +193,6 @@ class GPTQTestCUDA(GPTQTest):
     expected_fp16_perplexity = 38
     expected_quantized_perplexity = 45
 
-
     def test_perplexity(self):
         """
         A simple test to check if the model conversion has been done correctly by checking on the
@@ -309,7 +308,9 @@ def test_exllama_serialization(self):
                 save_folder=tmpdirname,
                 device_map={"": self.device_for_inference},
             )
-            self.check_quantized_layers_type(quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2")
+            self.check_quantized_layers_type(
+                quantized_model_from_saved, "exllama" if is_gptqmodel_available else "exllamav2"
+            )
 
             # transformers and auto-gptq compatibility
             # quantized models are more compatible with device map than
