
Commit 5489669

Merge branch 'develop' into support_transposed_input
2 parents 8016399 + e9860bb

37 files changed (+917 -171 lines)

.ci/cspell_dict.txt (+1)

```diff
@@ -220,6 +220,7 @@ logit
 loglikelihoods
 lstmsequence
 lstsq
+lspec
 lyalyushkin
 mapillary
 maskrcnn
```

.github/workflows/examples.yml (+2 -2)

```diff
@@ -22,7 +22,7 @@ concurrency:
 
 jobs:
   examples-cpu:
-    name: Test exmaples CPU [${{ matrix.group }}/4]
+    name: Test examples CPU [${{ matrix.group }}/4]
     runs-on: ubuntu-latest-16-cores
     strategy:
       fail-fast: false
@@ -72,7 +72,7 @@ jobs:
         python .github/scripts/pytest_md_summary.py pytest-results.xml >> $GITHUB_STEP_SUMMARY
 
   examples-win-cpu:
-    name: Test exmaples CPU Windows [${{ matrix.group }}/4]
+    name: Test examples CPU Windows [${{ matrix.group }}/4]
     runs-on: windows-2019-16-core
     if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.skip_windows == 'false' }}
     strategy:
```

nncf/__init__.py (+1)

```diff
@@ -34,6 +34,7 @@
 from nncf.errors import UnsupportedVersionError as UnsupportedVersionError
 from nncf.errors import ValidationError as ValidationError
 from nncf.parameters import BackupMode as BackupMode
+from nncf.parameters import CompressionFormat as CompressionFormat
 from nncf.parameters import CompressWeightsMode as CompressWeightsMode
 from nncf.parameters import DropType as DropType
 from nncf.parameters import ModelType as ModelType
```

nncf/common/quantization/structs.py (+12 -4)

```diff
@@ -27,14 +27,22 @@
 @api()
 class QuantizationScheme(StrEnum):
     """
-    Basic enumeration for quantization scheme specification.
-
-    :param SYMMETRIC:
-    :param ASYMMETRIC:
+    Enumeration for specifying quantization schemes.
+
+    :param SYMMETRIC: Symmetric quantization where the range is defined by a single parameter - scale.
+        This range can include both negative and positive values if signed, or only positive values if unsigned.
+    :param ASYMMETRIC: Asymmetric quantization where the range is defined by two parameters - input_low and input_high,
+        representing the lower and upper boundaries of the range, respectively.
+    :param SYMMETRIC_LORA: Symmetric quantization with Low-Rank Adapters (LoRA), involving the sum of weights and
+        the multiplication of low-rank adapters.
+    :param ASYMMETRIC_LORA: Asymmetric quantization with Low-Rank Adapters (LoRA), involving the sum of weights and
+        the multiplication of low-rank adapters.
     """
 
     SYMMETRIC = "symmetric"
     ASYMMETRIC = "asymmetric"
+    SYMMETRIC_LORA = "symmetric_lora"
+    ASYMMETRIC_LORA = "asymmetric_lora"
 
 
 class QuantizerConfig:
```
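
To make the two base schemes concrete, here is a minimal, self-contained NumPy sketch of what the updated docstring describes; it is illustrative only, not NNCF code, and the function names are invented for this example:

```python
import numpy as np

def symmetric_fake_quantize(w: np.ndarray, num_bits: int = 8) -> np.ndarray:
    # Range defined by a single parameter: the scale.
    level_high = 2 ** (num_bits - 1) - 1
    scale = np.abs(w).max() / level_high
    return np.clip(np.round(w / scale), -level_high - 1, level_high) * scale

def asymmetric_fake_quantize(w: np.ndarray, num_bits: int = 8) -> np.ndarray:
    # Range defined by two parameters: input_low and input_high.
    input_low, input_high = w.min(), w.max()
    levels = 2**num_bits - 1
    scale = (input_high - input_low) / levels
    zero_point = np.round(-input_low / scale)
    q = np.clip(np.round(w / scale) + zero_point, 0, levels)
    return (q - zero_point) * scale

w = np.random.randn(4, 4).astype(np.float32)
print(symmetric_fake_quantize(w), asymmetric_fake_quantize(w), sep="\n")
```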

nncf/experimental/torch/fx/quantization/quantize_model.py (+3)

```diff
@@ -31,6 +31,7 @@
 from nncf.experimental.torch.fx.transformations import compress_post_quantize_transformation
 from nncf.experimental.torch.fx.transformations import fq_weights_transformation
 from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import ModelType
 from nncf.parameters import QuantizationMode
@@ -131,6 +132,7 @@ def compress_weights_impl(
     gptq: bool,
     lora_correction: bool,
     backup_mode: BackupMode,
+    compression_format: CompressionFormat,
     advanced_parameters: Optional[AdvancedCompressionParameters] = None,
 ) -> torch.fx.GraphModule:
     """
@@ -149,6 +151,7 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
+        compression_format,
         advanced_parameters,
     )
     graph = NNCFGraphFactory.create(model)
```

nncf/openvino/graph/nncf_graph_builder.py (+3 -1)

```diff
@@ -44,11 +44,13 @@ def convert_to_nncf_dtype(ov_type: ov.Type) -> Dtype:
     """
     type_name = ov_type.get_type_name()
     conversion_map = {
+        "nf4": "float",
+        "f8e4m3": "float",
+        "f8e5m2": "float",
         "f16": "float",
         "bf16": "float",
         "f32": "float",
         "f64": "float",
-        "nf4": "float",
         "i4": "int",
         "i8": "int",
         "i16": "int",
```

nncf/openvino/quantization/quantize_model.py (+3)

```diff
@@ -32,6 +32,7 @@
 from nncf.openvino.quantization.quantize_ifmodel import apply_algorithm_if_bodies
 from nncf.openvino.rt_info import dump_parameters
 from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import DropType
 from nncf.parameters import ModelType
@@ -376,6 +377,7 @@ def compress_weights_impl(
     gptq: bool,
     lora_correction: bool,
     backup_mode: BackupMode,
+    compression_format: CompressionFormat,
     advanced_parameters: Optional[AdvancedCompressionParameters] = None,
 ) -> ov.Model:
     """
@@ -396,6 +398,7 @@ def compress_weights_impl(
         gptq,
         lora_correction,
         backup_mode,
+        compression_format,
         advanced_parameters,
     )
```

nncf/parameters.py (+23)

```diff
@@ -96,6 +96,29 @@ class CompressWeightsMode(StrEnum):
     E2M1 = "e2m1"
 
 
+@api(canonical_alias="nncf.CompressionFormat")
+class CompressionFormat(StrEnum):
+    """
+    Describes the format in which the model is saved after weight compression.
+
+    :param DQ: Represents the 'dequantize' format, where weights are stored in low-bit precision,
+        and a dequantization subgraph is added to the model. This is the default format for post-training weight
+        compression methods.
+    :param FQ: Represents the 'fake_quantize' format, where quantization is simulated by applying
+        quantization and dequantization operations. Weights remain in the same precision. This format is
+        suitable for quantization-aware training (QAT).
+    :param FQ_LORA: Represents the 'fake_quantize_with_lora' format, which combines fake quantization
+        with absorbable low-rank adapters (LoRA). Quantization is applied to the sum of weights and
+        the multiplication of adapters. This makes quantization-aware training (QAT) more efficient in terms of
+        accuracy, as adapters can also be tuned and remain computationally affordable during training due to their
+        small dimensions.
+    """
+
+    DQ = "dequantize"
+    FQ = "fake_quantize"
+    FQ_LORA = "fake_quantize_with_lora"
+
+
 @api(canonical_alias="nncf.BackupMode")
 class BackupMode(StrEnum):
     """
```

nncf/quantization/advanced_parameters.py (+3)

```diff
@@ -384,6 +384,9 @@ class AdvancedCompressionParameters:
     # Advanced Lora Correction algorithm parameters
     lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
 
+    # rank of lora adapters for FQ_LORA format. Defaults to 256.
+    lora_adapter_rank: int = 256
+
 
 @api()
 @dataclass
```
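
A short sketch of tuning the new field (hedged: this only constructs the dataclass, and assumes compress_weights forwards advanced_parameters as the signatures in this commit indicate):

```python
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters

# A smaller rank means fewer trainable adapter parameters; a larger rank gives
# the adapters more capacity to absorb quantization error. 256 is the default.
advanced = AdvancedCompressionParameters(lora_adapter_rank=32)
```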

nncf/quantization/algorithms/weight_compression/algorithm.py (+20 -2)

```diff
@@ -31,6 +31,7 @@
 from nncf.common.utils.helpers import create_table
 from nncf.experimental.common.tensor_statistics.statistics import WCTensorStatistic
 from nncf.parameters import BackupMode
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
 from nncf.parameters import SensitivityMetric
 from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
@@ -45,6 +46,7 @@
 from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
 from nncf.scopes import IgnoredScope
 from nncf.scopes import get_ignored_node_names_from_ignored_scope
+from nncf.tensor.definitions import TensorDataType
 
 TModel = TypeVar("TModel")
 TTensor = TypeVar("TTensor")
@@ -56,6 +58,12 @@
     CompressWeightsMode.NF4,
     CompressWeightsMode.E2M1,
 ]
+SUPPORTED_DATA_TYPES = [
+    TensorDataType.float16,
+    TensorDataType.bfloat16,
+    TensorDataType.float32,
+    TensorDataType.float64,
+]
 
 
 def get_weight_compression_configuration(
@@ -122,6 +130,7 @@ def check_user_compression_configuration(
     ignored_scope: Optional[IgnoredScope],
     sensitivity_metric: Optional[SensitivityMetric],
     backup_mode: Optional[BackupMode],
+    compression_format: Optional[CompressionFormat],
     advanced_parameters: Optional[AdvancedCompressionParameters],
 ) -> None:
     """
@@ -172,6 +181,10 @@ def check_user_compression_configuration(
             requires a dataset, but it's not provided."
         raise nncf.ValidationError(msg)
 
+    if lora_correction and compression_format in [CompressionFormat.FQ, CompressionFormat.FQ_LORA]:
+        msg = "LoRA Correction algorithm is not compatible with FQ and FQ_LORA compression formats."
+        raise nncf.ValidationError(msg)
+
 
 class WeightCompression(Algorithm):
     """
@@ -195,6 +208,7 @@ def __init__(
         gptq: bool,
         lora_correction: bool,
         backup_mode: BackupMode = BackupMode.INT8_ASYM,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
         advanced_parameters: Optional[AdvancedCompressionParameters] = None,
     ):
         """
@@ -233,6 +247,7 @@ def __init__(
             In this mode, weights are retained in their original precision without any quantization.
             INT8_SYM stands for 8-bit integer symmetric quantization without zero point.
             INT8_ASYM stands for 8-bit integer asymmetric quantization with a typical non-fixed zero point.
+        :param compression_format: Describes the format in which the model is saved after weight compression.
         :param advanced_parameters: advanced parameters for algorithms in compression pipeline.
         """
         super().__init__()
@@ -251,6 +266,7 @@ def __init__(
         self._gptq = gptq
         self._lora_correction = lora_correction
         self._backup_mode = backup_mode
+        self._compression_format = compression_format
         self._advanced_parameters = (
             advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
         )
@@ -489,7 +505,7 @@ def _get_ignored_scope_weight_statistics(self, model: TModel, graph: NNCFGraph)
                 continue
             for _, weight_port_id in self._backend_entity.get_weight_names_and_port_ids(node, graph):
                 weight_dtype = self._backend_entity.get_weight_dtype(node, weight_port_id, model, graph)
-                if weight_dtype.is_float():
+                if weight_dtype in SUPPORTED_DATA_TYPES:
                     continue
                 weight_shape = self._backend_entity.get_weight_shape(node, weight_port_id, graph)
                 weight_size = reduce(operator.mul, weight_shape, 1)
@@ -535,7 +551,7 @@ def apply(
                 continue
 
             weight_dtype = self._backend_entity.get_weight_dtype(node, weight_port_id, model, graph)
-            if not weight_dtype.is_float():
+            if weight_dtype not in SUPPORTED_DATA_TYPES:
                 continue
             weight_shape = self._backend_entity.get_weight_shape(node, weight_port_id, graph)
             weight_size = reduce(operator.mul, weight_shape, 1)
@@ -646,6 +662,7 @@ def apply(
                 scales,
                 zero_points,
                 lora_correction_algo,
+                self._compression_format,
             )
 
             self._backend_entity.dump_parameters(
@@ -662,6 +679,7 @@ def apply(
                     "gptq": self._gptq,
                     "lora_correction": self._lora_correction,
                     "backup_mode": self._backup_mode.value,
+                    "compression_format": self._compression_format.value,
                     "advanced_parameters": convert_to_dict_recursively(self._advanced_parameters),
                 },
                 algo_name="weight_compression",
```
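
Why replace is_float() with an explicit allow-list: nncf_graph_builder.py above now buckets nf4, f8e4m3, and f8e5m2 as "float", so a float-ness test alone can no longer distinguish full-precision weights from already-compressed ones. A small sketch of the resulting check (the is_compressible helper is invented for illustration):

```python
from nncf.tensor.definitions import TensorDataType

SUPPORTED_DATA_TYPES = [
    TensorDataType.float16,
    TensorDataType.bfloat16,
    TensorDataType.float32,
    TensorDataType.float64,
]

def is_compressible(weight_dtype: TensorDataType) -> bool:
    # Explicit allow-list: float-like but already low-bit dtypes fall outside
    # it and are left untouched by weight compression.
    return weight_dtype in SUPPORTED_DATA_TYPES

assert is_compressible(TensorDataType.bfloat16)
```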

nncf/quantization/algorithms/weight_compression/backend.py (+14 -3)

Note: the added docstring originally documented a nonexistent `compression_format_params` parameter; it is corrected below to match the `advanced_parameters` argument in the signature.

```diff
@@ -24,7 +24,10 @@
 from nncf.experimental.common.tensor_statistics.collectors import RawReducer
 from nncf.experimental.common.tensor_statistics.collectors import TensorCollector
 from nncf.experimental.common.tensor_statistics.statistics import HessianTensorStatistic
+from nncf.parameters import CompressionFormat
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
+from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
 from nncf.tensor import Tensor
 from nncf.tensor import TensorDataType
@@ -147,15 +150,23 @@ def transform_model(
         weight_compression_parameters: Iterable[WeightCompressionParameters],
         precomputed_scales: Dict[str, Tensor] = None,
         precomputed_zero_points: Dict[str, Tensor] = None,
+        lora_correction_algo: Optional[LoraCorrectionAlgorithm] = None,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
+        advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(),
     ) -> TModel:
         """
         Applies weight compression transformations to the model.
 
         :param model: Model in which the weights will be compressed according to the weight compression description.
         :param graph: The graph associated with the model.
-        :param weight_compression_parameters: List of weight compression parameters.
-        :param precomputed_scales: Precomputed scales for weights compression.
-        :param precomputed_zero_points: Precomputed zero points for weights compression.
+        :param weight_compression_parameters: An iterable of weight compression parameters.
+        :param precomputed_scales: Precomputed scales for weight compression.
+        :param precomputed_zero_points: Precomputed zero points for weight compression.
+        :param lora_correction_algo: An optional algorithm to reduce quantization noise after weight compression by
+            using low-rank adapters. This algorithm not only overrides weights with their quantized counterparts but
+            also expands the model's execution graph following the Low-Rank Adaptation (LoRA) concept.
+        :param compression_format: The format in which the model is saved after weight compression.
+        :param advanced_parameters: Describes advanced parameters of compression formats.
         :return: The transformed model.
         """
```
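
For intuition about what the new lora_correction_algo parameter enables, here is a conceptual NumPy sketch, entirely independent of NNCF internals: the quantization residual of a weight matrix is approximated by a low-rank product A @ B, which the added adapter branches carry at inference time.

```python
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((64, 64)).astype(np.float32)
W_q = np.round(W * 7) / 7            # crude stand-in for a low-bit quantizer
residual = W - W_q                   # the quantization error to absorb

# Best rank-8 approximation of the residual via truncated SVD.
U, S, Vt = np.linalg.svd(residual, full_matrices=False)
rank = 8
A = U[:, :rank] * S[:rank]           # (64, rank)
B = Vt[:rank, :]                     # (rank, 64)

# W_q + A @ B is closer to W than W_q alone, at the cost of two small matmuls.
assert np.linalg.norm(W - (W_q + A @ B)) < np.linalg.norm(W - W_q)
```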

nncf/quantization/algorithms/weight_compression/openvino_backend.py (+4)

```diff
@@ -47,7 +47,9 @@
 from nncf.openvino.statistics.collectors import OVMeanReducer
 from nncf.openvino.statistics.collectors import OVMeanVarianceReducer
 from nncf.openvino.statistics.collectors import OVShapeReducer
+from nncf.parameters import CompressionFormat
 from nncf.parameters import CompressWeightsMode
+from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
 from nncf.quantization.algorithms.weight_compression.awq_patterns import get_awq_patterns
 from nncf.quantization.algorithms.weight_compression.backend import AWQAlgoBackend
 from nncf.quantization.algorithms.weight_compression.backend import MixedPrecisionAlgoBackend
@@ -286,6 +288,8 @@ def transform_model(
         precomputed_scales: Dict[str, Tensor] = None,
         precomputed_zero_points: Dict[str, Tensor] = None,
         lora_correction_algo: LoraCorrectionAlgorithm = None,
+        compression_format: CompressionFormat = CompressionFormat.DQ,
+        advanced_parameters: AdvancedCompressionParameters = AdvancedCompressionParameters(),
     ) -> ov.Model:
         for wc_params in weight_compression_parameters:
             const_attributes = wc_params.node_with_weight.layer_attributes.constant_attributes[wc_params.weight_port_id]
```

nncf/quantization/algorithms/weight_compression/scale_estimation.py (-1)

```diff
@@ -232,7 +232,6 @@ def calculate_quantization_params(
         X, _ = reshape_weight_for_grouped_quantization(X, 0, group_size)
         best_diffs = None
         result_scale = None
-
         fp_outs = fns.matmul(fns.transpose(original_weight, (1, 0, 2)), X)
         q_outs = fns.matmul(fns.transpose(q_weights, (1, 0, 2)), X)
```
