
Commit 14158ce

Data-aware compression fixes (#3019)

Fixes after #3003.

### Changes
1. Convert raw activations to WC statistics for the GPTQ + SE scenario.
2. Allow 2D tensor inputs for data-aware mixed precision. 2D activations arise in `opt`-like models, e.g. `opt-125m`, where LayerNorm reshapes activations from [B, L, D] to [B*L, D] (see the sketch below).

### Tests
1. Added a unit test for GPTQ + SE.
2. Modified a test for 2D activations and mixed precision.
3. Compressed tiny-llama to int4_asym with SQ + GPTQ both before #3003 and with this PR; got the same PPL value of 15.739704794594019.
1 parent e6a4752 commit 14158ce
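
For change 2 above, the key observation is that flattening the batch and sequence dimensions does not change the per-channel statistic: reducing over every axis except the last yields the same [HiddenDim] mean for a [B, L, D] activation and for its [B*L, D] reshape. A minimal NumPy sketch of that equivalence (illustrative code, not part of the commit; shapes are arbitrary):

import numpy as np

rng = np.random.default_rng(0)
act_3d = rng.standard_normal((2, 4, 8))        # [B, L, D] activation
act_2d = act_3d.reshape(-1, act_3d.shape[-1])  # [B*L, D], as after the LayerNorm reshape

mean_3d = act_3d.mean(axis=tuple(range(act_3d.ndim - 1)))  # reduce over axes (0, 1)
mean_2d = act_2d.mean(axis=tuple(range(act_2d.ndim - 1)))  # reduce over axis (0,)

assert mean_3d.shape == mean_2d.shape == (8,)
assert np.allclose(mean_3d, mean_2d)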

File tree

4 files changed: +54 -26 lines changed


nncf/common/tensor_statistics/statistics.py (+1 -1)

@@ -126,7 +126,7 @@ class WCTensorStatistic(TensorStatistic):
     MEAN_STAT = "mean_values"
     SHAPE_STAT = "shape_values"
 
-    def __init__(self, mean_values: List[Tensor], shapes: List[Tuple[int]]):
+    def __init__(self, mean_values: List[Tensor], shapes: List[Tuple[int, ...]]):
         """
         :param mean_values: List of N tensors of shape [HiddenDim] obtained by reducing activations along batch and
             sequence length dimensions.
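
The hint change matters because `Tuple[int]` denotes a tuple with exactly one int, while `Tuple[int, ...]` denotes an int tuple of any length, so both 2D and 3D activation shapes now satisfy the annotation. A small illustration of the difference (hypothetical functions, not NNCF code; only a static checker such as mypy enforces this):

from typing import List, Tuple

def takes_fixed(shapes: List[Tuple[int]]) -> None: ...          # each tuple must have exactly one int
def takes_variadic(shapes: List[Tuple[int, ...]]) -> None: ...  # tuples of any length are allowed

shapes_2d = [(16, 8)]     # [B*L, D] activation shape
shapes_3d = [(1, 16, 8)]  # [B, L, D] activation shape

takes_variadic(shapes_2d)  # accepted
takes_variadic(shapes_3d)  # accepted
takes_fixed(shapes_2d)     # a checker like mypy rejects this: 2-tuple vs Tuple[int]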

nncf/quantization/algorithms/weight_compression/gptq.py (+15 -1)

@@ -18,6 +18,7 @@
 from nncf.common.graph import NNCFNode
 from nncf.common.logging.track_progress import track
 from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer
+from nncf.common.tensor_statistics.statistics import WCTensorStatistic
 from nncf.common.utils.backend import BackendType
 from nncf.common.utils.backend import get_backend
 from nncf.parameters import CompressWeightsMode
@@ -265,9 +266,10 @@ def _quantize_weights(
             else:
                 if self._scale_estimation and block_compression_config.num_bits == 4:
                     activations = [inp.squeeze()[:, (i1 + i) : (i1 + i + group_size)] for inp in inputs]
+                    wc_statistics = self._activations_to_wc_statistics(activations)
                     scale, zero_point = ScaleEstimation.calculate_quantization_params(
                         self._backend_entity,
-                        activations,
+                        wc_statistics,
                         weight_tensor[:, (i1 + i) : (i1 + i + group_size)],
                         reduction_axes,
                         wc_params.compression_config,
@@ -325,3 +327,15 @@ def _quantize_weights(
         else:
             zero_points = None
         return scales, zero_points
+
+    @staticmethod
+    def _activations_to_wc_statistics(activations: List[Tensor]) -> WCTensorStatistic:
+        # The code below mimics the logic from WeightCompression.get_statistic_points
+        mean_values = []
+        shapes = []
+        for act in activations:
+            shapes.append(act.shape)
+            reduction_shape = tuple(range(act.ndim - 1))
+            mean_values.append(fns.mean(act, axis=reduction_shape))
+        wc_statistics = WCTensorStatistic(mean_values, shapes)
+        return wc_statistics
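
For context, after `inp.squeeze()` each entry of `activations` is typically a 2D slice of shape [SeqLen, GroupSize], so `tuple(range(act.ndim - 1))` reduces every axis except the last. A NumPy re-expression of the helper's logic (an illustrative sketch, not NNCF code; NumPy stands in for `fns`, and plain arrays/tuples stand in for NNCF tensors):

import numpy as np

# Two samples' activation slices for one weight group, shaped [SeqLen, GroupSize].
activations = [np.ones((10, 4)), np.ones((12, 4))]

mean_values, shapes = [], []
for act in activations:
    shapes.append(act.shape)
    reduction_shape = tuple(range(act.ndim - 1))      # (0,) for these 2D slices
    mean_values.append(act.mean(axis=reduction_shape))

assert all(m.shape == (4,) for m in mean_values)      # one [GroupSize] mean vector per sample
# mean_values and shapes mirror the arguments of WCTensorStatistic(mean_values, shapes)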

nncf/quantization/algorithms/weight_compression/mixed_precision.py (+2 -2)

@@ -237,9 +237,9 @@ def get_statistic_points(
         statistic_container = StatisticPointsContainer()
         for act_node, output_port_id in nodes_and_port_ids:
             n_dims = len(graph.get_output_edges_by_port_id(act_node, output_port_id)[0].tensor_shape)
-            if n_dims < 3:
+            if n_dims < 2:
                 raise RuntimeError(
-                    f"Data-aware mixed precision criteria are not supported for MatMuls with 1D/2D activations. "
+                    f"Data-aware mixed precision criteria are not supported for MatMuls with 1D inputs. "
                     f"Node: {act_node.node_name}, number of dimensions: {n_dims}."
                 )
             statistic_point = self._backend_entity.target_point(
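
The relaxed guard now rejects only genuinely 1D activations; 2D and 3D activations proceed to statistic collection. An illustrative snippet of the new condition (not NNCF code):

# Illustrative check of the relaxed dimensionality guard.
for tensor_shape in [(8,), (16, 8), (1, 16, 8)]:
    n_dims = len(tensor_shape)
    if n_dims < 2:  # previously n_dims < 3, which also rejected 2D activations
        print(f"{tensor_shape}: rejected, 1D input")
    else:
        print(f"{tensor_shape}: statistics are collected")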

tests/openvino/native/quantization/test_weights_compression.py (+36 -22)

@@ -30,6 +30,7 @@
 from nncf.parameters import BackupMode
 from nncf.quantization import compress_weights
 from nncf.quantization.advanced_parameters import AdvancedCompressionParameters as CompressionParams
+from nncf.quantization.advanced_parameters import AdvancedGPTQParameters as GPTQParams
 from nncf.quantization.advanced_parameters import AdvancedLoraCorrectionParameters as LoraParams
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
 from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
@@ -1377,29 +1378,42 @@ def test_data_aware_algo_with_different_activation_dimensions(n_extra_dims):
         group_size=-1,
         dataset=dataset,
         awq=True,
+        ratio=0.5,
+        sensitivity_metric=SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE,
     )
 
 
-@pytest.mark.parametrize("n_extra_dims,raises", ([0, True], (1, False), (2, False)))
-def test_data_aware_mixed_precision_with_different_activation_dimensions(n_extra_dims, raises):
-    model = AWQMatmulModel(n_extra_dims=n_extra_dims).ov_model
-    dataset = Dataset([np.ones([1] * n_extra_dims + [8, 8])])
-
-    def call_compression():
-        compress_weights(
-            model,
-            mode=CompressWeightsMode.INT4_ASYM,
-            ratio=0.5,
-            sensitivity_metric=SensitivityMetric.MEAN_ACTIVATION_MAGNITUDE,
-            group_size=-1,
-            dataset=dataset,
-        )
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        dict(scale_estimation=True),
+        dict(lora_correction=True),
+        dict(
+            gptq=True,
+            scale_estimation=True,
+            advanced_parameters=CompressionParams(gptq_params=GPTQParams(subset_size=2)),
+        ),
+        dict(
+            awq=True,
+            gptq=True,
+            scale_estimation=True,
+            advanced_parameters=CompressionParams(gptq_params=GPTQParams(subset_size=2)),
+        ),
+    ],
+)
+def test_compression_with_different_algo_combinations(kwargs):
+    dataset_size = 4
+    model = LMLinearModel().ov_model
+    input_data = [np.ones(inp.shape) for inp in model.inputs] * dataset_size
+    dataset = Dataset(input_data)
 
-    if raises:
-        with pytest.raises(RuntimeError) as exc_info:
-            call_compression()
-        assert "Data-aware mixed precision criteria are not supported for MatMuls with 1D/2D activations." in str(
-            exc_info.value
-        )
-    else:
-        call_compression()
+    compress_weights(
+        model,
+        mode=CompressWeightsMode.INT4_SYM,
+        ratio=1.0,
+        group_size=8,
+        subset_size=2,
+        dataset=dataset,
+        all_layers=True,
+        **kwargs,
+    )
