
Commit 2c81219

Fix gptq device_map = "cpu" (#1662)
* fix gptq cpu device_map
* fix test
* remove default dict
1 parent 3b4f5ac commit 2c81219

File tree: 2 files changed, +22 -12 lines changed

optimum/gptq/quantizer.py (+13 -10)
@@ -332,20 +332,23 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None):
         use_cache = model.config.use_cache
         model.config.use_cache = False
 
+        # If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
         if hasattr(model, "hf_device_map"):
             devices = list(model.hf_device_map.values())
+            has_device_map = True
             if "disk" in devices:
                 raise ValueError("disk offload is not supported with GPTQ quantization")
-            if "cpu" in devices and len(model.hf_device_map) > 1:
-                logger.info("Cpu offload is not recommended. There might be some issues with the memory")
-                hook = None
-                for name, device in model.hf_device_map.items():
-                    if device == "cpu":
-                        module = recurse_getattr(model, name)
-                        remove_hook_from_module(module, recurse=True)
-                        module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
-            # If the model has a device_map, we don't move to model. We have already dispatched the hook that will do the work
-            has_device_map = True
+            if "cpu" in devices or torch.device("cpu") in devices:
+                if len(model.hf_device_map) > 1:
+                    logger.info("Cpu offload is not recommended. There might be some issues with the memory")
+                    hook = None
+                    for name, device in model.hf_device_map.items():
+                        if device == "cpu":
+                            module = recurse_getattr(model, name)
+                            remove_hook_from_module(module, recurse=True)
+                            module, hook = cpu_offload_with_hook(module, prev_module_hook=hook)
+                else:
+                    has_device_map = False
 
         if hasattr(model, "dtype"):
             self.use_cuda_fp16 = model.dtype == torch.float16
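
With this change, a model loaded with device_map="cpu" (an hf_device_map whose only device is the CPU) is treated like a plain CPU model (has_device_map ends up False) rather than as a partial CPU-offload setup, so quantization proceeds as it would for a model loaded without a device map. A minimal sketch of the workflow this fix targets, assuming the public optimum.gptq API; the model id, dataset, and quantization settings below are illustrative and not taken from this commit:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.gptq import GPTQQuantizer

model_id = "facebook/opt-125m"  # illustrative small model, not from this commit
tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="cpu" gives the model an hf_device_map whose only device is the CPU;
# after this fix the quantizer no longer mistakes that for CPU offload.
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="cpu"
)

quantizer = GPTQQuantizer(bits=4, dataset="c4", model_seqlen=2048)
quantized_model = quantizer.quantize_model(model, tokenizer)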

tests/gptq/test_quantization.py (+9 -2)
@@ -54,7 +54,7 @@ class GPTQTest(unittest.TestCase):
     exllama_config = None
     cache_block_outputs = True
     modules_to_quantize_inside_block = None
-
+    device_map_for_quantization = "cuda"
     dataset = [
         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
     ]
@@ -66,7 +66,7 @@ def setUpClass(cls):
         Setup quantized model
         """
         cls.model_fp16 = AutoModelForCausalLM.from_pretrained(
-            cls.model_name, torch_dtype=torch.float16, device_map={"": 0}
+            cls.model_name, torch_dtype=torch.float16, device_map=cls.device_map_for_quantization
         )
         cls.mem_fp16 = cls.model_fp16.get_memory_footprint()
 
@@ -168,6 +168,13 @@ def test_serialization(self):
         self.check_inference_correctness(quantized_model_from_saved)
 
 
+class GPTQTestCPUInit(GPTQTest):
+    device_map_for_quantization = "cpu"
+
+    def test_generate_quality(self):
+        self.check_inference_correctness(self.quantized_model.to(0))
+
+
 class GPTQTestExllama(GPTQTest):
     disable_exllama = False
     exllama_config = {"version": 1}
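
The new GPTQTestCPUInit subclass reruns the whole GPTQTest suite with device_map_for_quantization = "cpu" and overrides only test_generate_quality, moving the quantized model to GPU 0 before checking generation. The same pattern in user code might look like this sketch, continuing from the example above (prompt and generation settings are illustrative):

# Mirrors GPTQTestCPUInit.test_generate_quality: quantize starting from a
# CPU-loaded model, then move the quantized model to a GPU for inference.
# Assumes `quantized_model` and `tokenizer` from the previous sketch and an
# available CUDA device.
quantized_model = quantized_model.to(0)

inputs = tokenizer("auto-gptq is", return_tensors="pt").to(0)
output_ids = quantized_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))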

0 commit comments
