29 | 29 |
30 | 30 | from ..utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available
31 | 31 | from ..utils.modeling_utils import recurse_getattr
    | 32 | +from ..version import __version__ as optimum_version
32 | 33 | from .constants import GPTQ_CONFIG
33 | 34 | from .data import get_dataset, prepare_dataset
34 | 35 | from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen
35 |     | -from ..version import __version__ as optimum_version
36 | 36 |
37 | 37 |
38 | 38 | if is_accelerate_available():
43 | 43 |     from accelerate.hooks import remove_hook_from_module
44 | 44 |
45 | 45 | if is_auto_gptq_available():
    | 46 | +    from auto_gptq import __version__ as autogptq_version
46 | 47 |     from auto_gptq import exllama_set_max_input_length
47 | 48 |     from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init
48 | 49 |     from auto_gptq.quantization import GPTQ
49 | 50 |     from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear
50 |     | -    from auto_gptq import __version__ as autogptq_version
51 | 51 |
52 | 52 | if is_gptqmodel_available():
53 | 53 |     from gptqmodel import exllama_set_max_input_length
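Both guarded blocks bind the same helper names (e.g. `exllama_set_max_input_length`), and the gptqmodel block runs last, so its bindings win for those names when both libraries are installed. A minimal sketch of that selection logic, assuming the availability helpers are importable from `optimum.utils` (illustrative only, not part of this change):

```python
# Illustrative only: mirrors the import guards above to report which GPTQ
# backend library optimum would bind its helpers from.
from optimum.utils import is_auto_gptq_available, is_gptqmodel_available

if is_gptqmodel_available():
    backend_lib = "gptqmodel"   # imported last, so it takes precedence
elif is_auto_gptq_available():
    backend_lib = "auto-gptq"
else:
    raise ImportError("GPTQ quantization requires either gptqmodel or auto-gptq")

print(f"GPTQ kernels will come from {backend_lib}")
```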
@@ -128,8 +128,7 @@ def __init__(
128 | 128 |                 Properties, such as tooling:version, that do not directly contribute to quantization or quant inference are stored in meta.
129 | 129 |                 i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
130 | 130 |             backend (`str`, *optional*):
131 |     | -                Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only
132 |     | -                valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
    | 131 | +                Controls which gptq kernel is used. Valid values for gptqmodel are `auto`, `auto_trainable` and more; for auto-gptq, the only valid values are None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
133 | 132 |             use_cuda_fp16 (`bool`, defaults to `False`):
134 | 133 |                 Whether or not to use the optimized cuda kernel for fp16 model. The model needs to be in fp16.
135 | 134 |             model_seqlen (`Optional[int]`, defaults to `None`):
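For reference, a minimal sketch of how the new `backend` argument might be passed when constructing a quantizer. This assumes `GPTQQuantizer` exposes the keyword exactly as the docstring above describes; the other settings are just example values:

```python
# Sketch, not the canonical API surface: `backend` is assumed to be accepted
# by GPTQQuantizer.__init__ as documented in the hunk above.
from optimum.gptq import GPTQQuantizer

# With gptqmodel installed, a kernel can be requested explicitly; with
# auto-gptq, only None (the default) or "auto_trainable" are valid.
quantizer = GPTQQuantizer(
    bits=4,
    dataset="c4",
    group_size=128,
    backend="auto_trainable",
)
```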
@@ -246,7 +245,7 @@ def to_dict(self):
246 | 245 |
247 | 246 |         if gptq_dict.get("meta") is None:
248 | 247 |             gptq_dict["meta"] = {}
249 |     | -
    | 248 | +
250 | 249 |         meta = gptq_dict["meta"]
251 | 250 |         # store both optimum:version and gptq_lib:version into quantize_config.meta.quantizer
252 | 251 |         if meta.get("quantizer") is None:
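The effect of this hunk is easiest to see in the serialized config: the tooling versions end up under `meta.quantizer`, matching the docstring example above. A hypothetical shape of the resulting dict (keys other than `meta` and placeholder version strings are illustrative):

```python
# Hypothetical output of the quantizer's to_dict() after this change; the
# version strings are placeholders and do not affect quantization itself.
config = {
    "bits": 4,
    "group_size": 128,
    "sym": True,
    "checkpoint_format": "gptq",
    "meta": {
        "quantizer": ["optimum:<optimum_version>", "gptqmodel:<gptqmodel_version>"],
    },
}
```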
@@ -719,7 +718,9 @@ class StoreAttr(object):
719 | 718 |             pass
720 | 719 |
721 | 720 |         if is_gptqmodel_available():
722 |     | -            model, _ = hf_convert_gptq_v1_to_v2_format(model, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
    | 721 | +            model, _ = hf_convert_gptq_v1_to_v2_format(
    | 722 | +                model, self.bits, self.quant_linear, self.checkpoint_format, self.meta
    | 723 | +            )
723 | 724 |
724 | 725 |         model.quantize_config = StoreAttr()
725 | 726 |         model.quantize_config.desc_act = self.desc_act
@@ -790,9 +791,12 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa
790 | 791 |         """
791 | 792 |
792 | 793 |         # convert gptqmodel internal gptq_v2 format to v1 for max compatibility
793 |     | -        model, converted = hf_convert_gptq_v2_to_v1_format(model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta)
794 |     | -        if converted:
795 |     | -            self.checkpoint_format = "gptq"
    | 794 | +        if is_gptqmodel_available():
    | 795 | +            model, converted = hf_convert_gptq_v2_to_v1_format(
    | 796 | +                model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta
    | 797 | +            )
    | 798 | +            if converted:
    | 799 | +                self.checkpoint_format = "gptq"
796 | 800 |
797 | 801 |         os.makedirs(save_dir, exist_ok=True)
798 | 802 |         model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization)
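Taken together with the quantization entry points, the guarded save path above slots into the usual optimum flow roughly as follows; the model id, output directory, and quantization settings are placeholders:

```python
# Rough end-to-end usage under the new save path (illustrative settings only).
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer

model_id = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantizer = GPTQQuantizer(bits=4, dataset="c4", model_seqlen=2048)
quantized_model = quantizer.quantize_model(model, tokenizer)

# When gptqmodel is available, save() first converts the in-memory gptq_v2
# tensors back to the v1 "gptq" checkpoint format (see the hunk above),
# then writes the shards to disk.
quantizer.save(quantized_model, "opt-125m-gptq-4bit")
```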
|