
Enhance 3.x torch WOQ load #1877

Closed · wants to merge 21 commits · showing changes from 6 commits
317 changes: 187 additions & 130 deletions neural_compressor/torch/algorithms/weight_only/modules.py

Large diffs are not rendered by default.

544 changes: 347 additions & 197 deletions neural_compressor/torch/algorithms/weight_only/save_load.py

Large diffs are not rendered by default.
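modules.py and save_load.py carry the bulk of the change: device-specific WOQ linear modules (`INCWeightOnlyLinear`, `HPUWeightOnlyLinear`) and a `WOQModelLoader` class that rebuilds them at load time. A rough sketch of the loader flow, pieced together from the tests further down (the saved directory and FP32 model are placeholders; the caching behavior is taken from the test comments, not from the hidden diff itself):

```python
import copy

import transformers

from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.utils import LoadFormat

# Placeholder FP32 model; it must match the architecture the checkpoint was quantized from.
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM"  # hypothetical tiny model id
)

# First load on HPU: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear,
# and the converted weights are cached (quantized_hpu_weight.pt) for later loads.
loader = WOQModelLoader(
    model_name_or_path="saved_results",        # directory written by q_model.save(...)
    original_model=copy.deepcopy(fp32_model),
    format=LoadFormat.DEFAULT,
    device="hpu",
)
hpu_model = loader.load_woq_model()

# Second load reuses the cached HPU weights and maps linear -> HPUWeightOnlyLinear directly.
hpu_model_again = WOQModelLoader(
    model_name_or_path="saved_results",
    original_model=copy.deepcopy(fp32_model),
    format=LoadFormat.DEFAULT,
    device="hpu",
).load_woq_model()
```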

10 changes: 6 additions & 4 deletions neural_compressor/torch/quantization/load_entry.py
@@ -65,14 +65,14 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
Defaults to None.
format (str, optional): 'default' for loading INC quantized model.
'huggingface' for loading huggingface WOQ causal language model. Defaults to "default".
device (str, optional): 'cpu', 'hpu' or 'cuda'. specify the device the model will be loaded to.
device (str, optional): 'cpu', 'hpu'. specify the device the model will be loaded to.
currently only used for weight-only quantization.
kwargs (remaining dictionary of keyword arguments, optional):
remaining dictionary of keyword arguments for loading huggingface models.
Will be passed to the huggingface model's `__init__` method, such as 'trust_remote_code', 'revision'.
Returns:
The quantized model
"""
# TODO: When loading WOQ model, use different WeightOnlyLinear module according to device.
if format == LoadFormat.DEFAULT.value:
from neural_compressor.common.base_config import ConfigRegistry

@@ -92,7 +92,8 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
if isinstance(config_object, (RTNConfig, GPTQConfig, AWQConfig, TEQConfig, AutoRoundConfig)): # WOQ
from neural_compressor.torch.algorithms import weight_only

return weight_only.load(model_name_or_path, original_model, format=LoadFormat.DEFAULT)
qmodel = weight_only.load(model_name_or_path, original_model, format=LoadFormat.DEFAULT, device=device)
return qmodel.to(device)

original_model.qconfig = config_mapping
if isinstance(config_object, FP8Config): # FP8
@@ -103,6 +104,7 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
# now only support load huggingface WOQ causal language model
from neural_compressor.torch.algorithms import weight_only

return weight_only.load(model_name_or_path, format=LoadFormat.HUGGINGFACE, **kwargs)
qmodel = weight_only.load(model_name_or_path, format=LoadFormat.HUGGINGFACE, device=device, **kwargs)
return qmodel.to(device)
else:
raise ValueError("`format` in load function can only be 'huggingface' or 'default', but get {}".format(format))
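The change above threads `device` through to `weight_only.load` and moves the returned model onto that device. A minimal usage sketch of the updated entry point (the saved directory and the tiny FP32 model id are placeholders; the HF model id is the one used in the new test below):

```python
import copy

import torch
import transformers

from neural_compressor.torch.quantization import load

# Placeholder FP32 model; use the same architecture the checkpoint was quantized from.
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM"  # hypothetical tiny model id
)

# INC-format checkpoint: "saved_results" is the directory written by q_model.save(...).
qmodel = load("saved_results", original_model=copy.deepcopy(fp32_model), format="default", device="cpu")

# HuggingFace WOQ model: no original model needed; extra kwargs go to the HF __init__.
hf_qmodel = load(
    "TheBloke/TinyLlama-1.1B-python-v0.1-GPTQ",
    format="huggingface",
    device="cpu",
    torch_dtype=torch.float32,
)
```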
5 changes: 3 additions & 2 deletions neural_compressor/torch/utils/utility.py
@@ -29,8 +29,9 @@
# All constants for torch
WHITE_MODULE_LIST = [torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d]


WEIGHT_NAME = "quantized_model.pt"
HPU_SAFE_WEIGHTS_NAME = "hpu_model.safetensors"
WEIGHT_NAME = "quantized_weight.pt"
HPU_WEIGHT_NAME = "quantized_hpu_weight.pt"
QCONFIG_NAME = "qconfig.json"


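The renamed constants suggest the INC checkpoint now keeps separate default and HPU weight files. A hypothetical helper showing how a loader could pick between them (the function and directory layout are assumptions for illustration, not part of this PR):

```python
import os

WEIGHT_NAME = "quantized_weight.pt"          # default INC checkpoint weights
HPU_WEIGHT_NAME = "quantized_hpu_weight.pt"  # cached HPU-converted weights
QCONFIG_NAME = "qconfig.json"                # quantization config written by save()


def pick_weight_file(saved_dir: str, device: str) -> str:
    """Hypothetical helper: prefer the cached HPU weight file when loading on HPU."""
    hpu_path = os.path.join(saved_dir, HPU_WEIGHT_NAME)
    if device == "hpu" and os.path.exists(hpu_path):
        return hpu_path  # second load: reuse cached HPU weights
    return os.path.join(saved_dir, WEIGHT_NAME)  # first load (or CPU): default weights
```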
56 changes: 50 additions & 6 deletions test/3x/torch/quantization/weight_only/test_awq.py
@@ -24,6 +24,14 @@ def get_gpt_j():
return tiny_gptj


def get_woq_linear_num(model, woq_module_type_name):
woq_linear_num = 0
for _, module in model.named_modules():
if module.__class__.__name__ == woq_module_type_name:
woq_linear_num += 1
return woq_linear_num


class TestAWQQuant:
@classmethod
def setup_class(self):
@@ -110,6 +118,10 @@ def calib_func(model):
), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."

def test_save_and_load(self):
from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.quantization import load
from neural_compressor.torch.utils import LoadFormat

@torch.no_grad()
def calib_func(model):
for i in range(2):
@@ -130,10 +142,42 @@ def calib_func(model):
q_model.save("saved_results")
inc_out = q_model(self.example_inputs)[0]

from neural_compressor.torch.quantization import load

# loading compressed model
# 1. loading compressed model (format=INC, device="cpu")
# linear -> INCWeightOnlyLinear
loaded_model = load("saved_results", copy.deepcopy(self.tiny_gptj))
loaded_out = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, loaded_out), "Unexpected result. Please double check."
assert isinstance(loaded_model.lm_head, WeightOnlyLinear), "loading compressed model failed."
output1 = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, output1), "Unexpected result. Please double check."
assert (
get_woq_linear_num(loaded_model, "INCWeightOnlyLinear") == 31
), "Incorrect number of INCWeightOnlyLinear modules"

# 2. loading compressed model (format=INC, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save quantized_hpu_weight.pt to local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 31
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = loaded_model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using quantized_hpu_weight.pt saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 31
), "Incorrect number of HPUWeightOnlyLinear modules"
output3 = loaded_model(self.example_inputs)[0]

assert torch.equal(
output2, output3
), "The model loaded the second time is different from the model loaded the first time"
58 changes: 50 additions & 8 deletions test/3x/torch/quantization/weight_only/test_gptq.py
@@ -32,6 +32,14 @@ def run_fn(model):
model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device))


def get_woq_linear_num(model, woq_module_type_name):
woq_linear_num = 0
for _, module in model.named_modules():
if module.__class__.__name__ == woq_module_type_name:
woq_linear_num += 1
return woq_linear_num


class TestGPTQQuant:
def setup_class(self):
self.tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained(
@@ -242,6 +250,10 @@ def run_fn_conv1d(model):
assert torch.allclose(out2, out1, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."

def test_save_and_load(self):
from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.quantization import load
from neural_compressor.torch.utils import LoadFormat

fp32_model = copy.deepcopy(self.tiny_gptj)
quant_config = get_default_gptq_config()
prepared_model = prepare(fp32_model, quant_config)
@@ -251,12 +263,42 @@ def test_save_and_load(self):
q_model.save("saved_results")
inc_out = q_model(self.example_inputs)[0]

from neural_compressor.torch.quantization import load

# loading compressed model
# 1. loading compressed model (format=INC, device="cpu")
# linear -> INCWeightOnlyLinear
loaded_model = load("saved_results", copy.deepcopy(self.tiny_gptj))
loaded_out = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, loaded_out), "Unexpected result. Please double check."
assert isinstance(
loaded_model.transformer.h[0].attn.k_proj, WeightOnlyLinear
), "loading compressed model failed."
output1 = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, output1), "Unexpected result. Please double check."
assert (
get_woq_linear_num(loaded_model, "INCWeightOnlyLinear") == 30
), "Incorrect number of INCWeightOnlyLinear modules"

# 2. loading compressed model (format=INC, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save quantized_hpu_weight.pt to local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 30
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = loaded_model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using quantized_hpu_weight.pt saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 30
), "Incorrect number of HPUWeightOnlyLinear modules"
output3 = loaded_model(self.example_inputs)[0]

assert torch.equal(
output2, output3
), "The model loaded the second time is different from the model loaded the first time"
107 changes: 107 additions & 0 deletions test/3x/torch/quantization/weight_only/test_load.py
@@ -0,0 +1,107 @@
import copy
import shutil

import huggingface_hub
import torch
import transformers

from neural_compressor.common import logger
from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.utils import LoadFormat, accelerator

device = accelerator.current_device_name()


class TestHFModelLoad:
def setup_class(self):
self.model_name = "TheBloke/TinyLlama-1.1B-python-v0.1-GPTQ"
self.example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long).to(device)

self.local_hf_model = "./TinyLlama-1.1B-python-v0.1-GPTQ"
huggingface_hub.snapshot_download(self.model_name, local_dir=self.local_hf_model)

def teardown_class(self):
shutil.rmtree("TinyLlama-1.1B-python-v0.1-GPTQ", ignore_errors=True)
shutil.rmtree("saved_results", ignore_errors=True)

def get_woq_linear_num(self, model, woq_module_type_name):
woq_linear_num = 0
for _, module in model.named_modules():
if module.__class__.__name__ == woq_module_type_name:
woq_linear_num += 1
return woq_linear_num

def test_load_hf_woq_model(self):
from neural_compressor.torch.quantization import load

# 1. huggingface model_id (format=huggingface, device="cpu")
qmodel = load(
model_name_or_path=self.model_name, format="huggingface", torch_dtype=torch.float32
) # 'torch_dtype=torch.float32' for cpu test
assert (
self.get_woq_linear_num(qmodel, "INCWeightOnlyLinear") == 154
), "Incorrect number of INCWeightOnlyLinear modules"
output = qmodel(self.example_inputs)[0]
assert len(output) > 0, "Not loading the model correctly"

# 2. huggingface model_id (format=huggingface, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save hpu_model.safetensors to local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.model_name, format=LoadFormat.HUGGINGFACE, device="hpu", torch_dtype=torch.float32
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output1 = model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using hpu_model.safetensors saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.model_name, format=LoadFormat.HUGGINGFACE, device="hpu", torch_dtype=torch.float32
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = model(self.example_inputs)[0]

assert torch.equal(
output1, output2
), "The model loaded the second time is different from the model loaded the first time"

# 3. huggingface local model_path (format=huggingface, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save hpu_model.safetensors to local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.local_hf_model,
format=LoadFormat.HUGGINGFACE,
device="hpu",
torch_dtype=torch.float32,
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output1 = model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using hpu_model.safetensors saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.local_hf_model,
format=LoadFormat.HUGGINGFACE,
device="hpu",
torch_dtype=torch.float32,
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = model(self.example_inputs)[0]

assert torch.equal(
output1, output2
), "The model loaded the second time is different from the model loaded the first time"


test = TestHFModelLoad()
test.setup_class()
test.test_load_hf_woq_model()
test.teardown_class()
24 changes: 0 additions & 24 deletions test/3x/torch/quantization/weight_only/test_load_woq_hf_model.py

This file was deleted.
