Enable GPTQModel #2064
Merged · +227 −61

Commits (28)
3b6ddfc  align gptq check to transformers for supporting cpu (jiqing-feng)
50a405a  fix comment (jiqing-feng)
1cce05b  Merge branch 'huggingface:main' into gptq (jiqing-feng)
9bb7694  gptqmodel (jiqing-feng)
4709869  compatible with auto-gptq (jiqing-feng)
32d434f  fix compatible with auto-gptq (jiqing-feng)
0640350  fix compatible with auto-gptq linear (jiqing-feng)
8c1c142  revert unrelated changes (jiqing-feng)
27d2f2b  gptqmodel need use checkpoint_format (#1) (LRL-ModelCloud)
b65cd7f  Mod backend code (#2) (LRL-ModelCloud)
20e81c4  fix format and log (jiqing-feng)
1bad53e  fix version check (jiqing-feng)
ea29c3c  enable gptqmodel tests (jiqing-feng)
aa3d558  update check quant type (jiqing-feng)
5979473  Fix optimum compat (#3) (ZX-ModelCloud)
3603a0b  fix format and convert v2 to v1 (jiqing-feng)
32b0e7d  [Fix] all tensors not same device (#5) (ZX-ModelCloud)
dea8a47  fix format (jiqing-feng)
69cf2e3  add gptqmodel tests which contains cpu (jiqing-feng)
7312b7a  fix all auto-gptq tests (jiqing-feng)
f9b30c1  revert tests (jiqing-feng)
19e7261  rm gptqmodel yaml (jiqing-feng)
7125fe9  fix comment (jiqing-feng)
b61ef4a  enable real cpu tests by fp32 (jiqing-feng)
d81ce2e  fix test model name (jiqing-feng)
f2b9688  keep the original device setting when using auto-gptq (jiqing-feng)
c446522  Update optimum/gptq/quantizer.py (jiqing-feng)
c8a6528  Update optimum/gptq/quantizer.py (jiqing-feng)
Changes from 1 commit: ea29c3c9d25e6619d6746ec1596ad4b3095f15a3 ("enable gptqmodel tests")
Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
@@ -26,28 +26,31 @@
 from optimum.gptq.eval import evaluate_perplexity
 from optimum.gptq.utils import get_block_name_with_pattern, get_preceding_modules, get_seqlen
 from optimum.utils import recurse_getattr
-from optimum.utils.import_utils import is_accelerate_available, is_auto_gptq_available
-from optimum.utils.testing_utils import require_auto_gptq, require_torch_gpu
+from optimum.utils.import_utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available
+from optimum.utils.testing_utils import require_gptq, require_torch_gpu


 if is_auto_gptq_available():
     from auto_gptq import AutoGPTQForCausalLM
-    from auto_gptq.utils.import_utils import dynamically_import_QuantLinear
+    from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear

+if is_gptqmodel_available():
+    from gptqmodel import GPTQModel
+    from gptqmodel.utils.importer import hf_select_quant_linear

 if is_accelerate_available():
     from accelerate import init_empty_weights


 @slow
-@require_auto_gptq
-@require_torch_gpu
+@require_gptq
 class GPTQTest(unittest.TestCase):
-    model_name = "bigscience/bloom-560m"
+    model_name = "Felladrin/Llama-68M-Chat-v1"

     expected_fp16_perplexity = 30
     expected_quantized_perplexity = 34

-    expected_compression_ratio = 1.66
+    expected_compression_ratio = 1.2577

     bits = 4
     group_size = 128
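For orientation, here is a minimal sketch (not part of the diff) of how the dual-backend imports above can be used to decide which GPTQ library a script will run against. Both availability helpers come from optimum.utils.import_utils exactly as in the hunk; the preference for gptqmodel over auto-gptq mirrors the if/else branches later in this file, and the function name resolve_gptq_backend is hypothetical.

# Illustrative sketch: decide which GPTQ backend is in play.
from optimum.utils.import_utils import is_auto_gptq_available, is_gptqmodel_available


def resolve_gptq_backend() -> str:
    if is_gptqmodel_available():
        # gptqmodel supports CPU quantization, which is why the base test now defaults to "cpu".
        return "gptqmodel"
    if is_auto_gptq_available():
        # auto-gptq is the legacy, CUDA-oriented backend.
        return "auto-gptq"
    raise RuntimeError("Neither gptqmodel nor auto-gptq is installed.")


if __name__ == "__main__":
    print(resolve_gptq_backend())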
@@ -56,8 +59,8 @@ class GPTQTest(unittest.TestCase):
     exllama_config = None
     cache_block_outputs = True
     modules_in_block_to_quantize = None
-    device_map_for_quantization = "cuda"
-    device_for_inference = 0
+    device_map_for_quantization = "cpu"
+    device_for_inference = "cpu"
     dataset = [
         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
     ]
@@ -104,33 +107,36 @@ def test_memory_footprint(self):
         self.assertAlmostEqual(self.fp16_mem / self.quantized_mem, self.expected_compression_ratio, places=2)

     def test_perplexity(self):
-        """
-        A simple test to check if the model conversion has been done correctly by checking on the
-        the perplexity of the converted models
-        """
-
-        self.assertEqual(int(self.fp16_ppl), self.expected_fp16_perplexity)
-        self.assertEqual(int(self.quantized_ppl), self.expected_quantized_perplexity)
+        pass

[Review thread on this change]
Comment: why remove the tests?
Reply: CPU doesn't support this test yet, so we moved it to the CUDA test.

     def test_quantized_layers_class(self):
         """
         A simple test to check if the model conversion has been done correctly by checking on the
         the class type of the linear layers of the converted models
         """
-        QuantLinear = dynamically_import_QuantLinear(
-            use_triton=False,
-            use_qigen=False,
-            desc_act=self.desc_act,
-            group_size=self.group_size,
-            bits=self.bits,
-            disable_exllama=self.disable_exllama or self.exllama_config["version"] != 1,
-            disable_exllamav2=self.disable_exllama or self.exllama_config["version"] != 2,
-        )
-        self.assertTrue(self.quantized_model.transformer.h[0].mlp.dense_4h_to_h.__class__ == QuantLinear)
+        if is_gptqmodel_available():
+            QuantLinear = hf_select_quant_linear(
+                bits=self.bits,
+                group_size=self.group_size,
+                desc_act=self.desc_act,
+                sym=True,
+                device_map=self.device_map_for_quantization,
+                pack=False,
+            )
+        else:
+            QuantLinear = hf_select_quant_linear(
+                use_triton=False,
+                desc_act=self.desc_act,
+                group_size=self.group_size,
+                bits=self.bits,
+                disable_exllama=self.disable_exllama or self.exllama_config["version"] != 1,
+                disable_exllamav2=self.disable_exllama or self.exllama_config["version"] != 2,
+            )
+        self.assertTrue(self.quantized_model.model.layers[0].mlp.gate_proj.__class__ == QuantLinear)

     def check_quantized_layers_type(self, model, value):
-        self.assertTrue(model.transformer.h[0].mlp.dense_4h_to_h.QUANT_TYPE == value)
+        self.assertTrue(model.model.layers[0].mlp.gate_proj.QUANT_TYPE == value)

     def test_serialization(self):
         """
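As a standalone, hedged illustration of the backend selection in test_quantized_layers_class above: the literal values 4, 128, False and "cpu" stand in for the test-class attributes, and the exllama flags are simply set to True here; the keyword arguments follow the calls shown in the diff.

# Illustrative only: pick the quantized-linear class the same way the test does.
from optimum.utils.import_utils import is_gptqmodel_available

if is_gptqmodel_available():
    from gptqmodel.utils.importer import hf_select_quant_linear

    QuantLinear = hf_select_quant_linear(
        bits=4,
        group_size=128,
        desc_act=False,
        sym=True,
        device_map="cpu",  # gptqmodel can select a CPU-capable kernel
        pack=False,
    )
else:
    from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear

    QuantLinear = hf_select_quant_linear(
        use_triton=False,
        desc_act=False,
        group_size=128,
        bits=4,
        disable_exllama=True,
        disable_exllamav2=True,
    )

print(QuantLinear)  # the class used for quantized linear layers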
@@ -152,31 +158,45 @@ def test_serialization(self):
                 disable_exllama=self.disable_exllama,
                 exllama_config=self.exllama_config,
             )
-            if self.disable_exllama:
-                self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old")
-            else:
-                self.check_quantized_layers_type(quantized_model_from_saved, "exllama")
+            # Only auto-gptq need to check the quant type
+            if is_auto_gptq_available() and not is_gptqmodel_available():
+                if self.disable_exllama:
+                    self.check_quantized_layers_type(quantized_model_from_saved, "cuda-old")
+                else:
+                    self.check_quantized_layers_type(quantized_model_from_saved, "exllama")

             # transformers and auto-gptq compatibility
             # quantized models are more compatible with device map than
             # device context managers (they're never used in transformers testing suite)
             _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference})
-            _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference})
+            if is_gptqmodel_available():
+                _ = GPTQModel.load(tmpdirname, device_map={"": self.device_for_inference})
+            else:
+                _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference})


-class GPTQTestCPUInit(GPTQTest):
-    device_map_for_quantization = "cpu"
+@require_torch_gpu
+class GPTQTestCUDA(GPTQTest):
+    device_map_for_quantization = "cuda"
+    device_for_inference = 0
+    expected_compression_ratio = 1.66

     def test_perplexity(self):
-        pass
+        """
+        A simple test to check if the model conversion has been done correctly by checking on the
+        the perplexity of the converted models
+        """
+        self.assertEqual(int(self.fp16_ppl), self.expected_fp16_perplexity)
+        self.assertEqual(int(self.quantized_ppl), self.expected_quantized_perplexity)


-class GPTQTestExllama(GPTQTest):
+class GPTQTestExllama(GPTQTestCUDA):
     disable_exllama = False
     exllama_config = {"version": 1}


-class GPTQTestActOrder(GPTQTest):
+class GPTQTestActOrder(GPTQTestCUDA):
     disable_exllama = True
     desc_act = True
@@ -209,7 +229,10 @@ def test_exllama_serialization(self):
             # quantized models are more compatible with device map than
             # device context managers (they're never used in transformers testing suite)
             _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference})
-            _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference})
+            if is_gptqmodel_available():
+                _ = GPTQModel.load(tmpdirname, device_map={"": self.device_for_inference})
+            else:
+                _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference})

     def test_exllama_max_input_length(self):
         """
@@ -246,7 +269,7 @@ def test_exllama_max_input_length(self):
         quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)


-class GPTQTestExllamav2(GPTQTest):
+class GPTQTestExllamav2(GPTQTestCUDA):
     desc_act = False
     disable_exllama = True
     exllama_config = {"version": 2}
@@ -279,25 +302,27 @@ def test_exllama_serialization(self):
             # quantized models are more compatible with device map than
             # device context managers (they're never used in transformers testing suite)
             _ = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map={"": self.device_for_inference})
-            _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference})
+            if is_gptqmodel_available():
+                _ = GPTQModel.load(tmpdirname, device_map={"": self.device_for_inference})
+            else:
+                _ = AutoGPTQForCausalLM.from_quantized(tmpdirname, device_map={"": self.device_for_inference})


-class GPTQTestNoBlockCaching(GPTQTest):
+class GPTQTestNoBlockCaching(GPTQTestCUDA):
     cache_block_outputs = False


-class GPTQTestModuleQuant(GPTQTest):
+class GPTQTestModuleQuant(GPTQTestCUDA):
     # all layers are quantized apart from self_attention.dense
     modules_in_block_to_quantize = [
-        ["self_attention.query_key_value"],
-        ["mlp.dense_h_to_4h"],
-        ["mlp.dense_4h_to_h"],
+        ["self_attn.q_proj"],
+        ["mlp.gate_proj"],
     ]
     expected_compression_ratio = 1.577

     def test_not_converted_layers(self):
         # self_attention.dense should not be converted
-        self.assertTrue(self.quantized_model.transformer.h[0].self_attention.dense.__class__.__name__ == "Linear")
+        self.assertTrue(self.quantized_model.model.layers[0].self_attn.k_proj.__class__.__name__ == "Linear")


 class GPTQUtilsTest(unittest.TestCase):
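To tie the pieces together, here is a hedged, end-to-end sketch of what the new CPU test path exercises: quantizing a small model on CPU in fp32 with optimum's GPTQQuantizer. It assumes gptqmodel is installed and an optimum build that includes this PR; the model name matches the one the updated tests use, the calibration sentence is the test dataset, and the output directory is hypothetical.

# Illustrative end-to-end sketch, not taken from the diff.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.gptq import GPTQQuantizer

model_name = "Felladrin/Llama-68M-Chat-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# fp32 on CPU, matching the "enable real cpu tests by fp32" commit
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

quantizer = GPTQQuantizer(
    bits=4,
    group_size=128,
    dataset=["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."],
)
quantized_model = quantizer.quantize_model(model, tokenizer)
quantizer.save(quantized_model, "./quantized-model")  # hypothetical output directory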
Review comment: please use a more standard model; we don't want to rely on a model that might vanish.