
Enhance 3.x torch WOQ load #1877

Closed · wants to merge 21 commits · showing changes from 6 commits
317 changes: 187 additions & 130 deletions neural_compressor/torch/algorithms/weight_only/modules.py

Large diffs are not rendered by default.

544 changes: 347 additions & 197 deletions neural_compressor/torch/algorithms/weight_only/save_load.py

Large diffs are not rendered by default.
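modules.py and save_load.py carry the bulk of the change: device-specific WOQ linear modules (`INCWeightOnlyLinear`, `HPUWeightOnlyLinear`) and a `WOQModelLoader` class that rebuilds them at load time. A rough sketch of the loader flow, pieced together from the tests further down (the saved directory and FP32 model are placeholders; the caching behavior is taken from the test comments, not from the hidden diff itself):

```python
import copy

import transformers

from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.utils import LoadFormat

# Placeholder FP32 model; it must match the architecture the checkpoint was quantized from.
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM"  # hypothetical tiny model id
)

# First load on HPU: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear,
# and the converted weights are cached (quantized_hpu_weight.pt) for later loads.
loader = WOQModelLoader(
    model_name_or_path="saved_results",        # directory written by q_model.save(...)
    original_model=copy.deepcopy(fp32_model),
    format=LoadFormat.DEFAULT,
    device="hpu",
)
hpu_model = loader.load_woq_model()

# Second load reuses the cached HPU weights and maps linear -> HPUWeightOnlyLinear directly.
hpu_model_again = WOQModelLoader(
    model_name_or_path="saved_results",
    original_model=copy.deepcopy(fp32_model),
    format=LoadFormat.DEFAULT,
    device="hpu",
).load_woq_model()
```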

10 changes: 6 additions & 4 deletions neural_compressor/torch/quantization/load_entry.py
@@ -65,14 +65,14 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
Defaults to None.
format (str, optional): 'default' for loading INC quantized model.
'huggingface' for loading huggingface WOQ causal language model. Defaults to "default".
device (str, optional): 'cpu', 'hpu' or 'cuda'. specify the device the model will be loaded to.
device (str, optional): 'cpu', 'hpu'. specify the device the model will be loaded to.
currently only used for weight-only quantization.
kwargs (remaining dictionary of keyword arguments, optional):
remaining dictionary of keyword arguments for loading huggingface models.
Will be passed to the huggingface model's `__init__` method, such as 'trust_remote_code', 'revision'.
Returns:
The quantized model
"""
# TODO: When loading WOQ model, use different WeightOnlyLinear module according to device.
if format == LoadFormat.DEFAULT.value:
from neural_compressor.common.base_config import ConfigRegistry

@@ -92,7 +92,8 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
if isinstance(config_object, (RTNConfig, GPTQConfig, AWQConfig, TEQConfig, AutoRoundConfig)): # WOQ
from neural_compressor.torch.algorithms import weight_only

return weight_only.load(model_name_or_path, original_model, format=LoadFormat.DEFAULT)
qmodel = weight_only.load(model_name_or_path, original_model, format=LoadFormat.DEFAULT, device=device)
return qmodel.to(device)

original_model.qconfig = config_mapping
if isinstance(config_object, FP8Config): # FP8
@@ -103,6 +104,7 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
# now only support load huggingface WOQ causal language model
from neural_compressor.torch.algorithms import weight_only

return weight_only.load(model_name_or_path, format=LoadFormat.HUGGINGFACE, **kwargs)
qmodel = weight_only.load(model_name_or_path, format=LoadFormat.HUGGINGFACE, device=device, **kwargs)
return qmodel.to(device)
else:
raise ValueError("`format` in load function can only be 'huggingface' or 'default', but get {}".format(format))
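The change above threads `device` through to `weight_only.load` and moves the returned model onto that device. A minimal usage sketch of the updated entry point (the saved directory and the tiny FP32 model id are placeholders; the HF model id is the one used in the new test below):

```python
import copy

import torch
import transformers

from neural_compressor.torch.quantization import load

# Placeholder FP32 model; use the same architecture the checkpoint was quantized from.
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM"  # hypothetical tiny model id
)

# INC-format checkpoint: "saved_results" is the directory written by q_model.save(...).
qmodel = load("saved_results", original_model=copy.deepcopy(fp32_model), format="default", device="cpu")

# HuggingFace WOQ model: no original model needed; extra kwargs go to the HF __init__.
hf_qmodel = load(
    "TheBloke/TinyLlama-1.1B-python-v0.1-GPTQ",
    format="huggingface",
    device="cpu",
    torch_dtype=torch.float32,
)
```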
5 changes: 3 additions & 2 deletions neural_compressor/torch/utils/utility.py
@@ -29,8 +29,9 @@
# All constants for torch
WHITE_MODULE_LIST = [torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d]


WEIGHT_NAME = "quantized_model.pt"
HPU_SAFE_WEIGHTS_NAME = "hpu_model.safetensors"
WEIGHT_NAME = "quantized_weight.pt"
HPU_WEIGHT_NAME = "quantized_hpu_weight.pt"
QCONFIG_NAME = "qconfig.json"


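The renamed constants suggest the INC checkpoint now keeps separate default and HPU weight files. A hypothetical helper showing how a loader could pick between them (the function and directory layout are assumptions for illustration, not part of this PR):

```python
import os

WEIGHT_NAME = "quantized_weight.pt"          # default INC checkpoint weights
HPU_WEIGHT_NAME = "quantized_hpu_weight.pt"  # cached HPU-converted weights
QCONFIG_NAME = "qconfig.json"                # quantization config written by save()


def pick_weight_file(saved_dir: str, device: str) -> str:
    """Hypothetical helper: prefer the cached HPU weight file when loading on HPU."""
    hpu_path = os.path.join(saved_dir, HPU_WEIGHT_NAME)
    if device == "hpu" and os.path.exists(hpu_path):
        return hpu_path  # second load: reuse cached HPU weights
    return os.path.join(saved_dir, WEIGHT_NAME)  # first load (or CPU): default weights
```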
56 changes: 50 additions & 6 deletions test/3x/torch/quantization/weight_only/test_awq.py
@@ -24,6 +24,14 @@ def get_gpt_j():
return tiny_gptj


def get_woq_linear_num(model, woq_module_type_name):
woq_linear_num = 0
for _, module in model.named_modules():
if module.__class__.__name__ == woq_module_type_name:
woq_linear_num += 1
return woq_linear_num


class TestAWQQuant:
@classmethod
def setup_class(self):
@@ -110,6 +118,10 @@ def calib_func(model):
), "The results of calling `convert` + `prepare` and calling `quantize` should be equal."

def test_save_and_load(self):
from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.quantization import load
from neural_compressor.torch.utils import LoadFormat

@torch.no_grad()
def calib_func(model):
for i in range(2):
@@ -130,10 +142,42 @@ def calib_func(model):
q_model.save("saved_results")
inc_out = q_model(self.example_inputs)[0]

from neural_compressor.torch.quantization import load

# loading compressed model
# 1. loading compressed model (format=INC, device="cpu")
# linear -> INCWeightOnlyLinear
loaded_model = load("saved_results", copy.deepcopy(self.tiny_gptj))
loaded_out = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, loaded_out), "Unexpected result. Please double check."
assert isinstance(loaded_model.lm_head, WeightOnlyLinear), "loading compressed model failed."
output1 = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, output1), "Unexpected result. Please double check."
assert (
get_woq_linear_num(loaded_model, "INCWeightOnlyLinear") == 31
), "Incorrect number of INCWeightOnlyLinear modules"

# 2. loading compressed model (format=INC, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save quantized_hpu_weight.pt to local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 31
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = loaded_model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using quantized_hpu_weight.pt saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 31
), "Incorrect number of HPUWeightOnlyLinear modules"
output3 = loaded_model(self.example_inputs)[0]

assert torch.equal(
output2, output3
), "The model loaded the second time is different from the model loaded the first time"
58 changes: 50 additions & 8 deletions test/3x/torch/quantization/weight_only/test_gptq.py
@@ -32,6 +32,14 @@ def run_fn(model):
model(torch.tensor([[40, 50, 60]], dtype=torch.long).to(device))


def get_woq_linear_num(model, woq_module_type_name):
woq_linear_num = 0
for _, module in model.named_modules():
if module.__class__.__name__ == woq_module_type_name:
woq_linear_num += 1
return woq_linear_num


class TestGPTQQuant:
def setup_class(self):
self.tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained(
@@ -242,6 +250,10 @@ def run_fn_conv1d(model):
assert torch.allclose(out2, out1, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."

def test_save_and_load(self):
from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.quantization import load
from neural_compressor.torch.utils import LoadFormat

fp32_model = copy.deepcopy(self.tiny_gptj)
quant_config = get_default_gptq_config()
prepared_model = prepare(fp32_model, quant_config)
@@ -251,12 +263,42 @@ def test_save_and_load(self):
q_model.save("saved_results")
inc_out = q_model(self.example_inputs)[0]

from neural_compressor.torch.quantization import load

# loading compressed model
# 1. loading compressed model (format=INC, device="cpu")
# linear -> INCWeightOnlyLinear
loaded_model = load("saved_results", copy.deepcopy(self.tiny_gptj))
loaded_out = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, loaded_out), "Unexpected result. Please double check."
assert isinstance(
loaded_model.transformer.h[0].attn.k_proj, WeightOnlyLinear
), "loading compressed model failed."
output1 = loaded_model(self.example_inputs)[0]
assert torch.allclose(inc_out, output1), "Unexpected result. Please double check."
assert (
get_woq_linear_num(loaded_model, "INCWeightOnlyLinear") == 30
), "Incorrect number of INCWeightOnlyLinear modules"

# 2. loading compressed model (format=INC, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save quantized_hpu_weight.pt to local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 30
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = loaded_model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using quantized_hpu_weight.pt saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path="saved_results",
original_model=copy.deepcopy(self.tiny_gptj),
format=LoadFormat.DEFAULT,
device="hpu",
)
loaded_model = model_loader.load_woq_model()
assert (
get_woq_linear_num(loaded_model, "HPUWeightOnlyLinear") == 30
), "Incorrect number of HPUWeightOnlyLinear modules"
output3 = loaded_model(self.example_inputs)[0]

assert torch.equal(
output2, output3
), "The model loaded the second time is different from the model loaded the first time"
107 changes: 107 additions & 0 deletions test/3x/torch/quantization/weight_only/test_load.py
@@ -0,0 +1,107 @@
import copy
import shutil

import huggingface_hub
import torch
import transformers

from neural_compressor.common import logger
from neural_compressor.torch.algorithms.weight_only.save_load import WOQModelLoader
from neural_compressor.torch.utils import LoadFormat, accelerator

device = accelerator.current_device_name()


class TestHFModelLoad:
def setup_class(self):
self.model_name = "TheBloke/TinyLlama-1.1B-python-v0.1-GPTQ"
self.example_inputs = torch.tensor([[10, 20, 30, 40, 50, 60]], dtype=torch.long).to(device)

self.local_hf_model = "./TinyLlama-1.1B-python-v0.1-GPTQ"
huggingface_hub.snapshot_download(self.model_name, local_dir=self.local_hf_model)

def teardown_class(self):
shutil.rmtree("TinyLlama-1.1B-python-v0.1-GPTQ", ignore_errors=True)
shutil.rmtree("saved_results", ignore_errors=True)

def get_woq_linear_num(self, model, woq_module_type_name):
woq_linear_num = 0
for _, module in model.named_modules():
if module.__class__.__name__ == woq_module_type_name:
woq_linear_num += 1
return woq_linear_num

def test_load_hf_woq_model(self):
from neural_compressor.torch.quantization import load

# 1. huggingface model_id (format=huggingface, device="cpu")
qmodel = load(
model_name_or_path=self.model_name, format="huggingface", torch_dtype=torch.float32
) # 'torch_dtype=torch.float32' for cpu test
assert (
self.get_woq_linear_num(qmodel, "INCWeightOnlyLinear") == 154
), "Incorrect number of INCWeightOnlyLinear modules"
output = qmodel(self.example_inputs)[0]
assert len(output) > 0, "Not loading the model correctly"

# 2. huggingface model_id (format=huggingface, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save hpu_model.safetensors to local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.model_name, format=LoadFormat.HUGGINGFACE, device="hpu", torch_dtype=torch.float32
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output1 = model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using hpu_model.safetensors saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.model_name, format=LoadFormat.HUGGINGFACE, device="hpu", torch_dtype=torch.float32
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = model(self.example_inputs)[0]

assert torch.equal(
output1, output2
), "The model loaded the second time is different from the model loaded the first time"

# 3. huggingface local model_path (format=huggingface, device="hpu")
# first load: linear -> INCWeightOnlyLinear -> HPUWeightOnlyLinear, save hpu_model.safetensors to local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.local_hf_model,
format=LoadFormat.HUGGINGFACE,
device="hpu",
torch_dtype=torch.float32,
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output1 = model(self.example_inputs)[0]

# second load: linear -> HPUWeightOnlyLinear using hpu_model.safetensors saved in local cache dir
model_loader = WOQModelLoader(
model_name_or_path=self.local_hf_model,
format=LoadFormat.HUGGINGFACE,
device="hpu",
torch_dtype=torch.float32,
)
model = model_loader.load_woq_model()
assert (
self.get_woq_linear_num(model, "HPUWeightOnlyLinear") == 154
), "Incorrect number of HPUWeightOnlyLinear modules"
output2 = model(self.example_inputs)[0]

assert torch.equal(
output1, output2
), "The model loaded the second time is different from the model loaded the first time"


test = TestHFModelLoad()
test.setup_class()
test.test_load_hf_woq_model()
test.teardown_class()
24 changes: 0 additions & 24 deletions test/3x/torch/quantization/weight_only/test_load_woq_hf_model.py

This file was deleted.
