
Commit 5489669

Merge branch 'develop' into support_transposed_input
2 parents 8016399 + e9860bb

37 files changed (+917 -171 lines)

.ci/cspell_dict.txt (+1)

```diff
@@ -220,6 +220,7 @@ logit
 loglikelihoods
 lstmsequence
 lstsq
+lspec
 lyalyushkin
 mapillary
 maskrcnn
```

.github/workflows/examples.yml (+2 -2)

```diff
@@ -22,7 +22,7 @@ concurrency:
 
 jobs:
   examples-cpu:
-    name: Test exmaples CPU [${{ matrix.group }}/4]
+    name: Test examples CPU [${{ matrix.group }}/4]
     runs-on: ubuntu-latest-16-cores
     strategy:
       fail-fast: false
@@ -72,7 +72,7 @@ jobs:
         python .github/scripts/pytest_md_summary.py pytest-results.xml >> $GITHUB_STEP_SUMMARY
 
   examples-win-cpu:
-    name: Test exmaples CPU Windows [${{ matrix.group }}/4]
+    name: Test examples CPU Windows [${{ matrix.group }}/4]
     runs-on: windows-2019-16-core
     if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.skip_windows == 'false' }}
     strategy:
```

nncf/__init__.py (+1)

```diff
@@ -34,6 +34,7 @@
 from nncf.errors import UnsupportedVersionError as UnsupportedVersionError
 from nncf.errors import ValidationError as ValidationError
 from nncf.parameters import BackupMode as BackupMode
+from nncf.parameters import CompressionFormat as CompressionFormat
 from nncf.parameters import CompressWeightsMode as CompressWeightsMode
 from nncf.parameters import DropType as DropType
 from nncf.parameters import ModelType as ModelType
```

nncf/common/quantization/structs.py (+12 -4)

```diff
@@ -27,14 +27,22 @@
 @api()
 class QuantizationScheme(StrEnum):
     """
-    Basic enumeration for quantization scheme specification.
-
-    :param SYMMETRIC:
-    :param ASYMMETRIC:
+    Enumeration for specifying quantization schemes.
+
+    :param SYMMETRIC: Symmetric quantization where the range is defined by a single parameter - scale.
+        This range can include both negative and positive values if signed, or only positive values if unsigned.
+    :param ASYMMETRIC: Asymmetric quantization where the range is defined by two parameters - input_low and input_high,
+        representing the lower and upper boundaries of the range, respectively.
+    :param SYMMETRIC_LORA: Symmetric quantization with Low-Rank Adapters (LoRA), involving the sum of weights and
+        the multiplication of low-rank adapters.
+    :param ASYMMETRIC_LORA: Asymmetric quantization with Low-Rank Adapters (LoRA), involving the sum of weights and
+        the multiplication of low-rank adapters.
     """
 
     SYMMETRIC = "symmetric"
     ASYMMETRIC = "asymmetric"
+    SYMMETRIC_LORA = "symmetric_lora"
+    ASYMMETRIC_LORA = "asymmetric_lora"
 
 
 class QuantizerConfig:
```
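
To make the two base schemes concrete, here is a minimal, self-contained NumPy sketch of what the updated docstring describes; it is illustrative only, not NNCF code, and the function names are invented for this example:

```python
import numpy as np

def symmetric_fake_quantize(w: np.ndarray, num_bits: int = 8) -> np.ndarray:
    # Range defined by a single parameter: the scale.
    level_high = 2 ** (num_bits - 1) - 1
    scale = np.abs(w).max() / level_high
    return np.clip(np.round(w / scale), -level_high - 1, level_high) * scale

def asymmetric_fake_quantize(w: np.ndarray, num_bits: int = 8) -> np.ndarray:
    # Range defined by two parameters: input_low and input_high.
    input_low, input_high = w.min(), w.max()
    levels = 2**num_bits - 1
    scale = (input_high - input_low) / levels
    zero_point = np.round(-input_low / scale)
    q = np.clip(np.round(w / scale) + zero_point, 0, levels)
    return (q - zero_point) * scale

w = np.random.randn(4, 4).astype(np.float32)
print(symmetric_fake_quantize(w), asymmetric_fake_quantize(w), sep="\n")
```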

nncf/experimental/torch/fx/quantization/quantize_model.py (+3)

```diff
@@ -31,6 +31,7 @@
 from nncf.experimental.torch.fx.transformations import compress_post_quantize_transformation
 from nncf.experimental.torch.fx.transformations import fq_weights_transformation
 from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import ModelType
 from nncf.parameters import QuantizationMode
@@ -131,6 +132,7 @@ def compress_weights_impl(
     gptq: bool,
     lora_correction: bool,
     backup_mode: BackupMode,
+    compression_format: CompressionFormat,
     advanced_parameters: Optional[AdvancedCompressionParameters] = None,
 ) -> torch.fx.GraphModule:
     """
@@ -149,6 +151,7 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
+        compression_format,
         advanced_parameters,
     )
     graph = NNCFGraphFactory.create(model)
```

nncf/openvino/graph/nncf_graph_builder.py (+3 -1)

```diff
@@ -44,11 +44,13 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype:
     """
     type_name = ov_type.get_type_name()
     conversion_map = {
+        "nf4": "float",
+        "f8e4m3": "float",
+        "f8e5m2": "float",
         "f16": "float",
         "bf16": "float",
         "f32": "float",
         "f64": "float",
-        "nf4": "float",
         "i4": "int",
         "i8": "int",
         "i16": "int",
```

nncf/openvino/quantization/quantize_model.py (+3)

```diff
@@ -32,6 +32,7 @@
 from nncf.openvino.quantization.quantize_ifmodel import apply_algorithm_if_bodies
 from nncf.openvino.rt_info import dump_parameters
 from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import DropType
 from nncf.parameters import ModelType
@@ -376,6 +377,7 @@ def compress_weights_impl(
     gptq: bool,
     lora_correction: bool,
     backup_mode: BackupMode,
+    compression_format: CompressionFormat,
     advanced_parameters: Optional[AdvancedCompressionParameters] = None,
 ) -> ov.Model:
     """
@@ -396,6 +398,7 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
+        compression_format,
         advanced_parameters,
     )
```

nncf/parameters.py (+23)

```diff
@@ -96,6 +96,29 @@ class CompressWeightsMode(StrEnum):
     E2M1 = "e2m1"
 
 
+@api(canonical_alias="nncf.CompressionFormat")
+class CompressionFormat(StrEnum):
+    """
+    Describes the format in which the model is saved after weight compression.
+
+    :param DQ: Represents the 'dequantize' format, where weights are stored in low-bit precision,
+        and a dequantization subgraph is added to the model. This is the default format for post-training weight
+        compression methods.
+    :param FQ: Represents the 'fake_quantize' format, where quantization is simulated by applying
+        quantization and dequantization operations. Weights remain in the same precision. This format is
+        suitable for quantization-aware training (QAT).
+    :param FQ_LORA: Represents the 'fake_quantize_with_lora' format, which combines fake quantization
+        with absorbable low-rank adapters (LoRA). Quantization is applied to the sum of weights and
+        the multiplication of adapters. This makes quantization-aware training (QAT) more efficient in terms of
+        accuracy, as adapters can also be tuned and remain computationally affordable during training due to their
+        small dimensions.
+    """
+
+    DQ = "dequantize"
+    FQ = "fake_quantize"
+    FQ_LORA = "fake_quantize_with_lora"
+
+
 @api(canonical_alias="nncf.BackupMode")
 class BackupMode(StrEnum):
     """
```

nncf/quantization/advanced_parameters.py (+3)

```diff
@@ -384,6 +384,9 @@ class AdvancedCompressionParameters:
     # Advanced Lora Correction algorithm parameters
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
 
+    # rank of lora adapters for FQ_LORA format. Defaults to 256.
+    lora_adapter_rank: int = 256
+
 
 @api()
 @dataclass
```
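
A short sketch of tuning the new field (hedged: this only constructs the dataclass, and assumes compress_weights forwards advanced_parameters as the signatures in this commit indicate):

```python
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters

# A smaller rank means fewer trainable adapter parameters; a larger rank gives
# the adapters more capacity to absorb quantization error. 256 is the default.
advanced = AdvancedCompressionParameters(lora_adapter_rank=32)
```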

nncf/quantization/algorithms/weight_compression/algorithm.py (+20 -2)

```diff
@@ -31,6 +31,7 @@
 from nncf.common.utils.helpers import create_table
 from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic
 from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import SensitivityMetric
 from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
@@ -45,6 +46,7 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
 from nncf.scopes import IgnoredScope
 from nncf.scopes import get_ignored_node_names_from_ignored_scope
+from nncf.tensor.definitions import TensorDataType
 
 TModel = TypeVar("TModel")
 TTensor = TypeVar("TTensor")
@@ -56,6 +58,12 @@
     CompressWeightsMode.NF4,
     CompressWeightsMode.E2M1,
 ]
+SUPPORTED_DATA_TYPES = [
+    TensorDataType.float16,
+    TensorDataType.bfloat16,
+    TensorDataType.float32,
+    TensorDataType.float64,
+]
 
 
 def get_weight_compression_configuration(
@@ -122,6 +130,7 @@ def check_user_compression_configuration(
     ignored_scope: Optional[IgnoredScope],
     sensitivity_metric: Optional[SensitivityMetric],
     backup_mode: Optional[BackupMode],
+    compression_format: Optional[CompressionFormat],
     advanced_parameters: Optional[AdvancedCompressionParameters],
 ) -> None:
     """
@@ -172,6 +181,10 @@ def check_user_compression_configuration(
             requires a dataset, but it's not provided."
         raise nncf.ValidationError(msg)
 
+    if lora_correction and compression_format in [CompressionFormat.FQ, CompressionFormat.FQ_LORA]:
+        msg = "LoRA Correction algorithm is not compatible with FQ and FQ_LORA compression formats."
+        raise nncf.ValidationError(msg)
+
 
 class WeightCompression(Algorithm):
     """
@@ -195,6 +208,7 @@ def __init__(
         gptq: bool,
         lora_correction: bool,
         backup_mode: BackupMode = BackupMode.INT8_ASYM,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
         advanced_parameters: Optional[AdvancedCompressionParameters] = None,
     ):
         """
@@ -233,6 +247,7 @@ def __init__(
             In this mode, weights are retained in their original precision without any quantization.
             INT8_SYM stands for 8-bit integer symmetric quantization without zero point.
             INT8_ASYM stands for 8-bit integer asymmetric quantization with a typical non-fixed zero point.
+        :param compression_format: Describes the format in which the model is saved after weight compression.
         :param advanced_parameters: advanced parameters for algorithms in compression pipeline.
         """
         super().__init__()
@@ -251,6 +266,7 @@ def __init__(
         self._gptq = gptq
         self._lora_correction = lora_correction
         self._backup_mode = backup_mode
+        self._compression_format = compression_format
         self._advanced_parameters = (
             advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
         )
@@ -489,7 +505,7 @@ def _get_ignored_scope_weight_statistics(self, model: TModel, graph: NNCFGraph)
                 continue
             for _, weight_port_id in self._backend_entity.get_weight_names_and_port_ids(node, graph):
                 weight_dtype = self._backend_entity.get_weight_dtype(node, weight_port_id, model, graph)
-                if weight_dtype.is_float():
+                if weight_dtype in SUPPORTED_DATA_TYPES:
                     continue
                 weight_shape = self._backend_entity.get_weight_shape(node, weight_port_id, graph)
                 weight_size = reduce(operator.mul, weight_shape, 1)
@@ -535,7 +551,7 @@ def apply(
                 continue
 
             weight_dtype = self._backend_entity.get_weight_dtype(node, weight_port_id, model, graph)
-            if not weight_dtype.is_float():
+            if weight_dtype not in SUPPORTED_DATA_TYPES:
                 continue
             weight_shape = self._backend_entity.get_weight_shape(node, weight_port_id, graph)
             weight_size = reduce(operator.mul, weight_shape, 1)
@@ -646,6 +662,7 @@ def apply(
                 scales,
                 zero_points,
                 lora_correction_algo,
+                self._compression_format,
             )
 
             self._backend_entity.dump_parameters(
@@ -662,6 +679,7 @@ def apply(
                     "gptq": self._gptq,
                     "lora_correction": self._lora_correction,
                     "backup_mode": self._backup_mode.value,
+                    "compression_format": self._compression_format.value,
                     "advanced_parameters": convert_to_dict_recursively(self._advanced_parameters),
                 },
                 algo_name="weight_compression",
```
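
Why replace is_float() with an explicit allow-list: nncf_graph_builder.py above now buckets nf4, f8e4m3, and f8e5m2 as "float", so a float-ness test alone can no longer distinguish full-precision weights from already-compressed ones. A small sketch of the resulting check (the is_compressible helper is invented for illustration):

```python
from nncf.tensor.definitions import TensorDataType

SUPPORTED_DATA_TYPES = [
    TensorDataType.float16,
    TensorDataType.bfloat16,
    TensorDataType.float32,
    TensorDataType.float64,
]

def is_compressible(weight_dtype: TensorDataType) -> bool:
    # Explicit allow-list: float-like but already low-bit dtypes fall outside
    # it and are left untouched by weight compression.
    return weight_dtype in SUPPORTED_DATA_TYPES

assert is_compressible(TensorDataType.bfloat16)
```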

nncf/quantization/algorithms/weight_compression/backend.py (+14 -3)

Note: the added docstring originally documented a nonexistent `compression_format_params` parameter; it is corrected below to match the `advanced_parameters` argument in the signature.

```diff
@@ -24,7 +24,10 @@
 from nncf.experimental.common.tensor_statistics.collectors import RawReducer
 from nncf.experimental.common.tensor_statistics.collectors import TensorCollector
 from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic
+from nncf.parameters import CompressionFormat
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
@@ -147,15 +150,23 @@ def transform_model(
         weight_compression_parameters: Iterable[WeightCompressionParameters],
         precomputed_scales: Dict[str, Tensor] = None,
         precomputed_zero_points: Dict[str, Tensor] = None,
+        lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
+        advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(),
     ) -> TModel:
         """
         Applies weight compression transformations to the model.
 
         :param model: Model in which the weights will be compressed according to the weight compression description.
         :param graph: The graph associated with the model.
-        :param weight_compression_parameters: List of weight compression parameters.
-        :param precomputed_scales: Precomputed scales for weights compression.
-        :param precomputed_zero_points: Precomputed zero points for weights compression.
+        :param weight_compression_parameters: An iterable of weight compression parameters.
+        :param precomputed_scales: Precomputed scales for weight compression.
+        :param precomputed_zero_points: Precomputed zero points for weight compression.
+        :param lora_correction_algo: An optional algorithm to reduce quantization noise after weight compression by
+            using low-rank adapters. This algorithm not only overrides weights with their quantized counterparts but
+            also expands the model's execution graph following the Low-Rank Adaptation (LoRA) concept.
+        :param compression_format: The format in which the model is saved after weight compression.
+        :param advanced_parameters: Describes advanced parameters of compression formats.
         :return: The transformed model.
         """
```
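
For intuition about what the new lora_correction_algo parameter enables, here is a conceptual NumPy sketch, entirely independent of NNCF internals: the quantization residual of a weight matrix is approximated by a low-rank product A @ B, which the added adapter branches carry at inference time.

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((64, 64)).astype(np.float32)
W_q = np.round(W * 7) / 7            # crude stand-in for a low-bit quantizer
residual = W - W_q                   # the quantization error to absorb

# Best rank-8 approximation of the residual via truncated SVD.
U, S, Vt = np.linalg.svd(residual, full_matrices=False)
rank = 8
A = U[:, :rank] * S[:rank]           # (64, rank)
B = Vt[:rank, :]                     # (rank, 64)

# W_q + A @ B is closer to W than W_q alone, at the cost of two small matmuls.
assert np.linalg.norm(W - (W_q + A @ B)) < np.linalg.norm(W - W_q)
```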

nncf/quantization/algorithms/weight_compression/openvino_backend.py (+4)

```diff
@@ -47,7 +47,9 @@
 from nncf.openvino.statistics.collectors import OVMeanReducer
 from nncf.openvino.statistics.collectors import OVMeanVarianceReducer
 from nncf.openvino.statistics.collectors import OVShapeReducer
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.algorithms.weight_compression.awq_patterns import get_awq_patterns
 from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend
 from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend
@@ -286,6 +288,8 @@ def transform_model(
         precomputed_scales: Dict[str, Tensor] = None,
         precomputed_zero_points: Dict[str, Tensor] = None,
         lora_correction_algo: LoraCorrectionAlgorithm = None,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
+        advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(),
     ) -> ov.Model:
         for wc_params in weight_compression_parameters:
             const_attributes = wc_params.node_with_weight.layer_attributes.constant_attributes[wc_params.weight_port_id]
```

nncf/quantization/algorithms/weight_compression/scale_estimation.py (-1)

```diff
@@ -232,7 +232,6 @@ def calculate_quantization_params(
         X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size)
         best_diffs = None
         result_scale = None
-
         fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X)
         q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X)
```
