Commit 3b627f4

Enable loading of torchscript model with INC and add warning (#540)

1 parent 1b5c3cb · commit 3b627f4
2 files changed (+63, -11)

optimum/intel/neural_compressor/modeling_base.py (+42, -11)
@@ -40,6 +40,8 @@
 from transformers.models.auto.auto_factory import _get_model_class
 from transformers.utils.generic import ContextManagers

+from optimum.intel.generation import BaseModelForCausalLM
+
 from ...modeling_base import OptimizedModel
 from ..utils.import_utils import _torch_version, is_torch_version
 from .configuration import INCConfig
@@ -83,11 +85,6 @@ def __init__(
             "cuda:0" if torch.cuda.is_available() else "cpu"
         )

-        if getattr(self.config, "backend", None) == "ipex":
-            raise NotImplementedError(
-                "`INCModel` does not supported the loading of model resulting from IPEX, please use `IPEXModel` to load your model instead instead"
-            )
-
         # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
         # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
         AutoConfig.register(self.base_model_prefix, AutoConfig)
@@ -143,11 +140,19 @@ def _from_pretrained(
                 f"Please check if torch quantization the model was obtained with is compatible with {_torch_version}."
             )

+        if getattr(config, "backend", None) == "ipex" or getattr(config, "torchscript", False):
+            logger.warning(
+                f"Using `{cls.__name__}` to load a TorchScript model will be deprecated in v1.15.0, to load your model please use `{cls.__name__.replace('INC', 'IPEX')}` instead."
+            )
+            model = torch.jit.load(model_cache_path)
+            model = torch.jit.freeze(model.eval())
+            return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs)
+
         model_class = _get_model_class(config, cls.auto_model_class._model_mapping)
         # Load the state dictionary of the model to verify whether the model to get the quantization config
         state_dict = torch.load(model_cache_path, map_location="cpu")
-        q_config = state_dict.get("best_configure", None)

+        q_config = state_dict.get("best_configure", None)
         if q_config is None:
             model = model_class.from_pretrained(model_save_dir)
         else:
@@ -169,10 +174,13 @@ def _from_pretrained(
     def _save_pretrained(self, save_directory: Union[str, Path]):
         output_path = os.path.join(save_directory, WEIGHTS_NAME)

-        state_dict = self.model.state_dict()
-        if self._q_config:
-            state_dict["best_configure"] = self._q_config
-        torch.save(state_dict, output_path)
+        if isinstance(self.model, torch.nn.Module):
+            state_dict = self.model.state_dict()
+            if self._q_config:
+                state_dict["best_configure"] = self._q_config
+            torch.save(state_dict, output_path)
+        else:
+            torch.jit.save(self.model, output_path)

         if self.inc_config:
             self.inc_config.save_pretrained(save_directory)
@@ -244,6 +252,29 @@ class INCModelForXLNetLM(INCModel):
     export_feature = "fill-mask"


-class INCModelForCausalLM(INCModel):
+class INCModelForCausalLM(INCModel, BaseModelForCausalLM):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
+    forward = BaseModelForCausalLM.forward
+    generate = BaseModelForCausalLM.generate
+    can_generate = BaseModelForCausalLM.can_generate
+
+    def __init__(
+        self,
+        model,
+        config: PretrainedConfig = None,
+        model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        q_config: Dict = None,
+        inc_config: Dict = None,
+        use_cache: bool = True,
+        **kwargs,
+    ):
+        super(INCModelForCausalLM, self).__init__(
+            model=model,
+            config=config,
+            model_save_dir=model_save_dir,
+            q_config=q_config,
+            inc_config=inc_config,
+            use_cache=use_cache,
+            **kwargs,
+        )
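
With this change, a checkpoint whose config sets torchscript=True (or backend="ipex") is loaded through torch.jit.load and torch.jit.freeze instead of the usual state_dict path, and a deprecation warning points users to the IPEXModel counterpart. A minimal sketch of the resulting user-facing flow, using the tiny TorchScript test model exercised in the test suite below (the subfolder layout is taken from that test; the import path is standard optimum-intel usage, the generation parameters are illustrative):

    from transformers import AutoTokenizer

    from optimum.intel import INCModelForCausalLM

    # The config of this repo sets `torchscript=True`, so loading now goes
    # through the new `torch.jit.load` + `torch.jit.freeze` branch and logs:
    #   "Using `INCModelForCausalLM` to load a TorchScript model will be
    #    deprecated in v1.15.0, to load your model please use
    #    `IPEXModelForCausalLM` instead."
    model = INCModelForCausalLM.from_pretrained(
        "echarlaix/tiny-random-gpt2-torchscript", use_cache=True, subfolder="model_with_pkv"
    )

    tokenizer = AutoTokenizer.from_pretrained("echarlaix/tiny-random-gpt2-torchscript")
    tokens = tokenizer("This is a sample input", return_tensors="pt")

    # `generate` is inherited from `BaseModelForCausalLM` (see the class diff above).
    outputs = model.generate(**tokens, max_length=32, num_beams=1)
    print(tokenizer.decode(outputs[0]))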

tests/neural_compressor/test_modeling.py (+21)
@@ -122,3 +122,24 @@ def test_pipeline(self, model_id, task):
         inputs *= 2

         pipe(*inputs)
+
+    def test_compare_with_and_without_past_key_values(self):
+        model_id = "echarlaix/tiny-random-gpt2-torchscript"
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokens = tokenizer("This is a sample input", return_tensors="pt")
+
+        model_with_pkv = INCModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv")
+
+        outputs_with_pkv = model_with_pkv.generate(
+            **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
+        )
+        model_without_pkv = INCModelForCausalLM.from_pretrained(
+            model_id, use_cache=False, subfolder="model_without_pkv"
+        )
+
+        outputs_without_pkv = model_without_pkv.generate(
+            **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
+        )
+        self.assertEqual(outputs_with_pkv.shape[1], self.GENERATION_LENGTH)
+        self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH)
+        self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv))
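
For context, `_save_pretrained` now mirrors the loading branch: an eager torch.nn.Module is still serialized via its state_dict (with the "best_configure" entry when a quantization config is present), while a jitted model is written with torch.jit.save. A hedged sketch of how a TorchScript checkpoint of this shape can be produced in the first place (the tiny model id and tracing inputs are illustrative assumptions, not part of this commit):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # `torchscript=True` makes the model return tuples rather than dicts,
    # which makes it traceable; it is also the config flag the new loading
    # branch checks for.
    model = AutoModelForCausalLM.from_pretrained(
        "hf-internal-testing/tiny-random-gpt2", torchscript=True
    )
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
    model.eval()

    dummy = tokenizer("This is a sample input", return_tensors="pt")
    traced = torch.jit.trace(model, [dummy["input_ids"]])

    # WEIGHTS_NAME ("pytorch_model.bin") is the file `_from_pretrained`
    # hands to `torch.jit.load`.
    torch.jit.save(traced, "pytorch_model.bin")
    model.config.save_pretrained(".")  # the saved config carries `torchscript=True`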
