huggingface
diff --git a/.github/workflows/test_inc.yml
+1-1 b/.github/workflows/test_inc.yml
+1-1
diff --git a/.github/workflows/test_offline.yaml
+40 b/.github/workflows/test_offline.yaml
+40
diff --git a/examples/neural_compressor/language-modeling/run_clm.py
+4-7 b/examples/neural_compressor/language-modeling/run_clm.py
+4-7
diff --git a/optimum/commands/export/openvino.py
+5-1 b/optimum/commands/export/openvino.py
+5-1
diff --git a/optimum/exporters/openvino/__main__.py
+2-1 b/optimum/exporters/openvino/__main__.py
+2-1
diff --git a/optimum/intel/generation/modeling.py
+3-2 b/optimum/intel/generation/modeling.py
+3-2
diff --git a/optimum/intel/ipex/modeling_base.py
+3-2 b/optimum/intel/ipex/modeling_base.py
+3-2
diff --git a/optimum/intel/neural_compressor/__init__.py
+1-1 b/optimum/intel/neural_compressor/__init__.py
+1-1
diff --git a/optimum/intel/neural_compressor/modeling_base.py
+4-7 b/optimum/intel/neural_compressor/modeling_base.py
+4-7
diff --git a/optimum/intel/neural_compressor/quantization.py
+19-86 b/optimum/intel/neural_compressor/quantization.py
+19-86
@@ -33,7 +33,7 @@ jobs:
         pip install cmake
         pip install py-cpuinfo
         pip install .[neural-compressor,diffusers,tests]
-        pip install intel-extension-for-transformers==1.4.0
+        pip install intel-extension-for-transformers
         pip install peft
 
     - name: Test with Pytest
 
@@ -0,0 +1,40 @@
+name: Offline usage / Python - Test
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9]
+        os: [ubuntu-latest]
+
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          pip install .[tests,openvino]
+      - name: Test
+        run: |
+          HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2
+          HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation
+
+          huggingface-cli download hf-internal-testing/tiny-random-gpt2
+          HF_HUB_OFFLINE=1 optimum-cli export openvino --model hf-internal-testing/tiny-random-gpt2 gpt2_openvino --task text-generation
+
+          pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv
+          HF_HUB_OFFLINE=1 pytest tests/openvino/test_modeling.py -k "test_load_from_hub" -s -vvvvv
@@ -57,13 +57,10 @@
 from transformers.utils.versions import require_version
 
 from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
-from optimum.intel.utils.import_utils import (
-    INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR,
-    is_intel_extension_for_transformers_available,
-)
+from optimum.intel.utils.import_utils import ITREX_IMPORT_ERROR, is_itrex_available
 
 
-if is_intel_extension_for_transformers_available():
+if is_itrex_available():
     from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -658,8 +655,8 @@ def compute_metrics(eval_preds):
             else:
                 recipes = {}
             if optim_args.quantization_approach == "weight_only":
-                if not is_intel_extension_for_transformers_available():
-                    raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("WeightOnly quantization"))
+                if not is_itrex_available():
+                    raise ImportError(ITREX_IMPORT_ERROR.format("WeightOnly quantization"))
                 if optim_args.apply_pruning or optim_args.apply_distillation:
                     raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")
 
 
@@ -18,6 +18,8 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
+
 from ...exporters import TasksManager
 from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available
 from ..base import BaseOptimumCLICommand, CommandInfo
@@ -47,7 +49,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             f" {str(TasksManager.get_all_tasks())}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
         ),
     )
-    optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
+    optional_group.add_argument(
+        "--cache_dir", type=str, default=HUGGINGFACE_HUB_CACHE, help="Path indicating where to store cache."
+    )
     optional_group.add_argument(
         "--framework",
         type=str,
 
@@ -16,6 +16,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from requests.exceptions import ConnectionError as RequestsConnectionError
 from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
 
@@ -48,7 +49,7 @@ def main_export(
     task: str = "auto",
     device: str = "cpu",
     framework: Optional[str] = None,
-    cache_dir: Optional[str] = None,
+    cache_dir: str = HUGGINGFACE_HUB_CACHE,
     trust_remote_code: bool = False,
     pad_token_id: Optional[int] = None,
     subfolder: str = "",
 
@@ -21,6 +21,7 @@
 
 import torch
 from huggingface_hub import hf_hub_download
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.generation import GenerationMixin
 from transformers.modeling_outputs import CausalLMOutputWithPast
@@ -357,7 +358,7 @@ def _from_pretrained(
         token: Optional[Union[bool, str]] = None,
         revision: Optional[Union[str, None]] = None,
         force_download: bool = False,
-        cache_dir: Optional[str] = None,
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
         file_name: Optional[str] = WEIGHTS_NAME,
         local_files_only: bool = False,
         use_cache: bool = True,
@@ -403,7 +404,7 @@ def _from_transformers(
         token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
-        cache_dir: Optional[str] = None,
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
         subfolder: str = "",
         local_files_only: bool = False,
         use_cache: bool = True,
 
@@ -22,6 +22,7 @@
 import intel_extension_for_pytorch as ipex
 import torch
 from huggingface_hub import hf_hub_download
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from intel_extension_for_pytorch.cpu._auto_kernel_selection import _enable_tpp
 from intel_extension_for_pytorch.transformers.optimize import get_dummy_input
 from transformers import (
@@ -154,7 +155,7 @@ def _from_transformers(
         token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
         force_download: bool = False,
-        cache_dir: Optional[str] = None,
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
         subfolder: str = "",
         local_files_only: bool = False,
         torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
@@ -193,7 +194,7 @@ def _from_pretrained(
         token: Optional[Union[bool, str]] = None,
         revision: Optional[Union[str, None]] = None,
         force_download: bool = False,
-        cache_dir: Optional[str] = None,
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
         file_name: Optional[str] = WEIGHTS_NAME,
         local_files_only: bool = False,
         subfolder: str = "",
 
@@ -12,7 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-from ..utils.import_utils import is_diffusers_available, is_intel_extension_for_transformers_available
+from ..utils.import_utils import is_diffusers_available
 from .configuration import INCConfig
 from .modeling_base import (
     INCModel,
 
@@ -20,6 +20,7 @@
 
 import torch
 from huggingface_hub import hf_hub_download
+from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from neural_compressor.utils.pytorch import load
 from transformers import (
     AutoConfig,
@@ -43,11 +44,7 @@
 from optimum.intel.generation import BaseModelForCausalLM
 
 from ...modeling_base import OptimizedModel
-from ..utils.import_utils import (
-    _torch_version,
-    is_intel_extension_for_transformers_available,
-    is_torch_version,
-)
+from ..utils.import_utils import _torch_version, is_itrex_available, is_torch_version
 from .configuration import INCConfig
 from .utils import WEIGHTS_NAME
 
@@ -105,7 +102,7 @@ def _from_pretrained(
         token: Optional[Union[bool, str]] = None,
         revision: Optional[Union[str, None]] = None,
         force_download: bool = False,
-        cache_dir: Optional[str] = None,
+        cache_dir: str = HUGGINGFACE_HUB_CACHE,
         file_name: str = WEIGHTS_NAME,
         local_files_only: bool = False,
         subfolder: str = "",
@@ -137,7 +134,7 @@ def _from_pretrained(
         model_save_dir = Path(model_cache_path).parent
         inc_config = None
         msg = None
-        if is_intel_extension_for_transformers_available():
+        if is_itrex_available():
             try:
                 quantization_config = PretrainedConfig.from_pretrained(model_save_dir / "quantize_config.json")
                 algorithm = getattr(quantization_config, "quant_method", None)
 
@@ -19,11 +19,10 @@
 from enum import Enum
 from itertools import chain
 from pathlib import Path
-from typing import Callable, Dict, Optional, Union
+from typing import Callable, Optional, Union
 
 import torch
 from datasets import Dataset, load_dataset
-from neural_compressor.adaptor.pytorch import PyTorch_FXAdaptor, _cfg_to_qconfig, _propagate_qconfig
 from neural_compressor.config import PostTrainingQuantConfig
 from neural_compressor.experimental.export import torch_to_int8_onnx
 from neural_compressor.model.onnx_model import ONNXModel
@@ -47,14 +46,14 @@
 
 from ..utils.constant import _TASK_ALIASES, MIN_QDQ_ONNX_OPSET, ONNX_WEIGHTS_NAME, WEIGHTS_NAME
 from ..utils.import_utils import (
-    INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR,
-    _intel_extension_for_transformers_version,
+    ITREX_IMPORT_ERROR,
     _ipex_version,
+    _itrex_version,
     _neural_compressor_version,
     _torch_version,
-    is_intel_extension_for_transformers_available,
-    is_intel_extension_for_transformers_version,
     is_ipex_version,
+    is_itrex_available,
+    is_itrex_version,
     is_neural_compressor_version,
     is_torch_version,
 )
@@ -69,16 +68,21 @@
     INCModelForTokenClassification,
     INCModelForVision2Seq,
 )
-from .utils import INCDataLoader, _cfgs_to_fx_cfgs
-
+from .utils import (
+    IPEX_MINIMUM_VERSION,
+    ITREX_MINIMUM_TORCH_VERSION,
+    ITREX_MINIMUM_VERSION,
+    NEURAL_COMPRESSOR_MINIMUM_VERSION,
+    NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION,
+    INCDataLoader,
+)
 
-INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION = "1.4.0"
 
-if is_intel_extension_for_transformers_available():
-    if is_intel_extension_for_transformers_version("!=", INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION):
+if is_itrex_available():
+    if is_itrex_version("<", ITREX_MINIMUM_VERSION):
         raise ImportError(
-            f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_intel_extension_for_transformers_version}, "
-            f"but only version {INTEL_EXTENSION_FOR_TRANSFORMERS_MINIMUM_VERSION} is supported."
+            f"Found an incompatible version of `intel-extension-for-transformers`. Found version {_itrex_version}, "
+            f"but only version {ITREX_MINIMUM_VERSION} or higher is supported."
         )
     from intel_extension_for_transformers.transformers.llm.quantization.utils import convert_to_quantized_model
     from intel_extension_for_transformers.transformers.modeling.modeling_auto import save_low_bit
@@ -92,10 +96,6 @@
 
 logger = logging.getLogger(__name__)
 
-NEURAL_COMPRESSOR_MINIMUM_VERSION = "2.1.0"
-NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION = "2.3.0"
-IPEX_MINIMUM_VERSION = "2.1.0"
-ITREX_MINIMUM_TORCH_VERSION = "2.2.0"
 
 if is_neural_compressor_version("<", NEURAL_COMPRESSOR_MINIMUM_VERSION):
     raise ImportError(
@@ -231,8 +231,8 @@ def quantize(
                     f"Found an incompatible version of neural-compressor. Found version {_neural_compressor_version}, "
                     f"but only version {NEURAL_COMPRESSOR_WEIGHT_ONLY_MINIMUM_VERSION} or higher supports weight-only quantization."
                 )
-            if not is_intel_extension_for_transformers_available():
-                raise ImportError(INTEL_EXTENSION_FOR_TRANSFORMERS_IMPORT_ERROR.format("Weight only quantization"))
+            if not is_itrex_available():
+                raise ImportError(ITREX_IMPORT_ERROR.format("Weight only quantization"))
 
             if is_torch_version("<", ITREX_MINIMUM_TORCH_VERSION):
                 raise ImportError(
@@ -516,70 +516,3 @@ def _get_calibration_dataloader(
     def _remove_unused_columns(self, dataset: Dataset):
         ignored_columns = list(set(dataset.column_names) - set(self._signature_columns))
         return dataset.remove_columns(ignored_columns)
-
-
-# Adapted from https://github.com/intel/neural-compressor/blob/master/neural_compressor/utils/pytorch.py#L96
-def _apply_quantization_from_config(q_config: Dict, model: torch.nn.Module) -> torch.nn.Module:
-    """
-    Apply Intel Neural Compressor quantization steps on the given model.
-
-    Arguments:
-        q_config (`Dict`):
-            Dictionary containing all quantization information such as approach, dtype, scheme and granularity.
-        model (`torch.nn.Module`):
-            Model to quantize.
-    Returns:
-        q_model (`torch.nn.Module`):
-            Quantized model.
-    """
-    from torch.quantization import add_observer_, convert
-    from torch.quantization.quantize_fx import convert_fx, prepare_fx, prepare_qat_fx
-
-    approach = q_config.get("approach")
-    framework = q_config.get("framework")
-
-    if approach not in SUPPORTED_QUANT_MODE:
-        raise ValueError(
-            "Unknown quantization approach. Supported approach are " + ", ".join(SUPPORTED_QUANT_MODE.keys())
-        )
-
-    quant_mode = INCQuantizationMode(approach)
-    q_model = copy.deepcopy(model)
-    q_model.eval()
-
-    if framework == "pytorch_fx":
-        op_cfgs = _cfg_to_qconfig(q_config, approach)
-        fx_op_cfgs = _cfgs_to_fx_cfgs(op_cfgs, approach)
-
-        if not q_config["fx_sub_module_list"]:
-            if quant_mode == INCQuantizationMode.AWARE_TRAINING:
-                q_model.train()
-                q_model = prepare_qat_fx(q_model, fx_op_cfgs)
-            else:
-                q_model = prepare_fx(q_model, fx_op_cfgs)
-            q_model = convert_fx(q_model)
-
-        else:
-            sub_module_list = q_config["fx_sub_module_list"]
-            if q_config["approach"] == "quant_aware_training":
-                q_model.train()
-                PyTorch_FXAdaptor.prepare_sub_graph(sub_module_list, fx_op_cfgs, q_model, prefix="", is_qat=True)
-            else:
-                PyTorch_FXAdaptor.prepare_sub_graph(sub_module_list, fx_op_cfgs, q_model, prefix="")
-            PyTorch_FXAdaptor.convert_sub_graph(sub_module_list, q_model, prefix="")
-
-    else:
-        if quant_mode == INCQuantizationMode.DYNAMIC:
-            q_mapping = torch.quantization.quantization_mappings.get_default_dynamic_quant_module_mappings()
-            op_cfgs = _cfg_to_qconfig(q_config, approach)
-        else:
-            q_mapping = torch.quantization.quantization_mappings.get_default_static_quant_module_mappings()
-            op_cfgs = _cfg_to_qconfig(q_config)
-
-        _propagate_qconfig(q_model, op_cfgs, approach=approach)
-
-        if quant_mode != INCQuantizationMode.DYNAMIC:
-            add_observer_(q_model)
-        q_model = convert(q_model, mapping=q_mapping, inplace=True)
-
-    return q_model