Optimized weights compression for NF4 data type #3369

Draft · wants to merge 11 commits into develop
2 changes: 2 additions & 0 deletions .github/workflows/call_precommit.yml
@@ -91,6 +91,8 @@ jobs:
shell: bash
- name: Install NNCF and test requirements
run: pip install . -r tests/openvino/requirements.txt
- name: Install OV RC
run: pip install -U --pre openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
- name: Print installed modules
run: pip list
- name: Run OV precommit test scope
3 changes: 3 additions & 0 deletions .github/workflows/conformance_weight_compression.yml
@@ -2,6 +2,7 @@ name: Weight compression
permissions: read-all

on:
pull_request:
workflow_call:
workflow_dispatch:
inputs:
@@ -41,6 +42,8 @@ jobs:
run: cat /proc/cpuinfo
- name: Install NNCF and test requirements
run: pip install -e . -r tests/post_training/requirements.txt
- name: Install OV RC
run: pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
- name: Print installed modules
run: pip list
- name: Run examples test scope
10 changes: 8 additions & 2 deletions nncf/openvino/optimized_functions/__init__.py
@@ -10,8 +10,14 @@
# limitations under the License.

from nncf.openvino.optimized_functions.functions import astype as astype
from nncf.openvino.optimized_functions.functions import do_int_quantization as do_int_quantization
from nncf.openvino.optimized_functions.functions import do_float_quantization as do_float_quantization
from nncf.openvino.optimized_functions.functions import do_integer_quantization as do_integer_quantization
from nncf.openvino.optimized_functions.functions import (
float_quantize_dequantize_weight as float_quantize_dequantize_weight,
)
from nncf.openvino.optimized_functions.functions import get_integer_quantization_error as get_integer_quantization_error
from nncf.openvino.optimized_functions.functions import quantize_dequantize_weight as quantize_dequantize_weight
from nncf.openvino.optimized_functions.functions import (
integer_quantize_dequantize_weight as integer_quantize_dequantize_weight,
)
from nncf.openvino.optimized_functions.models import OVModelParameters as OVModelParameters
from nncf.openvino.optimized_functions.models import clear_ov_model_cache as clear_ov_model_cache
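
The re-exports above rename the integer path and add a parallel float path. Below is a minimal sketch of how callers pick up the renamed API on this branch; the old-name mapping in the comments is inferred from the renames visible in this diff:

# Sketch: updated public names of the optimized functions (this branch only).
from nncf.openvino.optimized_functions import (
    do_float_quantization,  # new float (NF4/E2M1) counterpart of the integer path
    do_integer_quantization,  # formerly do_int_quantization
    float_quantize_dequantize_weight,
    integer_quantize_dequantize_weight,  # formerly quantize_dequantize_weight
    get_integer_quantization_error,
)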
103 changes: 95 additions & 8 deletions nncf/openvino/optimized_functions/functions.py
@@ -11,13 +11,16 @@

from typing import Optional, Tuple, Union

from nncf import CompressWeightsMode
from nncf.common.utils.caching import disable_results_caching
from nncf.openvino.optimized_functions.models import OV_MODEL_CACHE
from nncf.openvino.optimized_functions.models import OVModelParameters
from nncf.openvino.optimized_functions.models import get_astype_model
from nncf.openvino.optimized_functions.models import get_compress_decompress_weight_model
from nncf.openvino.optimized_functions.models import get_compress_weight_model
from nncf.openvino.optimized_functions.models import get_quantization_error_model
from nncf.openvino.optimized_functions.models import get_float_quantization_model
from nncf.openvino.optimized_functions.models import get_float_quantize_dequantize_weight_model
from nncf.openvino.optimized_functions.models import get_integer_quantization_error_model
from nncf.openvino.optimized_functions.models import get_integer_quantization_model
from nncf.openvino.optimized_functions.models import get_integer_quantize_dequantize_weight_model
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import reshape_weight_for_grouped_quantization
from nncf.tensor import Tensor
@@ -27,7 +30,7 @@
ReductionAxes = Union[int, Tuple[int, ...]]


def do_int_quantization(
def do_integer_quantization(
weight: Tensor,
config: WeightCompressionConfig,
reduction_axes: Optional[ReductionAxes] = None,
@@ -63,7 +66,7 @@ def do_int_quantization(
{"compressed_weight": compressed_weight_dtype, "zero_point": compressed_weight_dtype}
)

model = get_compress_weight_model(
model = get_integer_quantization_model(
ov_model_params,
config,
weight_shape,
@@ -97,7 +100,49 @@ def do_int_quantization(
return compressed_weight, scale, zero_point


def quantize_dequantize_weight(
def do_float_quantization(
    weight: Tensor,
    config: WeightCompressionConfig,
    reduction_axes: Optional[ReductionAxes] = None,
    precomputed_scale: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor]:
    """
    Quantizes the given weight into a float data type (NF4 or E2M1) using an optimized OpenVINO model.

    :param weight: The weight to quantize.
    :param config: Weight compression configuration.
    :param reduction_axes: Axes along which the quantization scale is computed.
    :param precomputed_scale: Optional precomputed scale to reuse instead of computing one.
    :return: A tuple of the compressed weight and the scale.
    """
    weight_shape = weight.shape
    scale_shape = None if precomputed_scale is None else precomputed_scale.shape

    ov_model_params = OVModelParameters()
    ov_model_params.input_dtypes["weight"] = weight.dtype
    if precomputed_scale is not None:
        ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype
    if config.num_bits == 4 and weight.backend == TensorBackend.ov:
        # Return OV tensors in the target precision so that they can later be inserted
        # into the OpenVINO model seamlessly
        ov_model_params.return_ov_tensors = True
        dtype = TensorDataType.nf4 if config.mode == CompressWeightsMode.NF4 else TensorDataType.f4e2m1
        ov_model_params.output_dtypes.update({"compressed_weight": dtype})

model = get_float_quantization_model(
ov_model_params,
config,
weight_shape,
scale_shape,
reduction_axes,
)

if precomputed_scale is None:
# weight -> compressed_weight, scale
compressed_weight, scale = model([weight])

# The scale is always fp32, so there is no need to keep it as an ov.Tensor
if scale.backend == TensorBackend.ov:
scale = scale.as_numpy_tensor()
else:
# weight, scale -> compressed_weight
compressed_weight = model([weight, precomputed_scale])[0]
scale = precomputed_scale

return compressed_weight, scale
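
For context, a minimal usage sketch of the new float path; the WeightCompressionConfig arguments and the weight shape are illustrative assumptions, not taken from this diff:

# Hypothetical example: per-channel NF4 compression of a 2-D weight.
import numpy as np

from nncf import CompressWeightsMode
from nncf.openvino.optimized_functions import do_float_quantization
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

weight = Tensor(np.random.rand(64, 128).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.NF4)  # assumed default group_size=-1, i.e. per-channel
compressed_weight, scale = do_float_quantization(weight, config, reduction_axes=1)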


def integer_quantize_dequantize_weight(
weight: Tensor,
config: WeightCompressionConfig,
reduction_axes: Optional[ReductionAxes] = None,
@@ -135,7 +180,7 @@ def quantize_dequantize_weight(
if precomputed_zero_point is not None:
ov_model_params.input_dtypes["zero_point"] = precomputed_zero_point.dtype

model = get_compress_decompress_weight_model(
model = get_integer_quantize_dequantize_weight_model(
ov_model_params, config, weight_shape, scale_shape, zero_point_shape, reduction_axes, return_compressed_weight
)

@@ -161,6 +206,48 @@
return decompressed_weight


def float_quantize_dequantize_weight(
    weight: Tensor,
    config: WeightCompressionConfig,
    reduction_axes: Optional[ReductionAxes] = None,
    precomputed_scale: Optional[Tensor] = None,
    return_compressed_weight: Optional[bool] = False,
) -> Union[Tensor, Tuple[Tensor, Tensor, Tensor]]:
    """
    Quantizes the given weight into a float data type (NF4 or E2M1) and dequantizes it back, using an
    optimized OpenVINO model. Optionally also returns the intermediate compressed weight and scale.
    """
    # When reduction axes are not provided, it is assumed that the weights are already reshaped
    if config.group_size != -1 and reduction_axes is not None:
        # Weights are reshaped from [a1, r, a2] to [a1, r // gs, gs, a2], where gs is the group size
        weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, config.group_size)

weight_shape = weight.shape
scale_shape = precomputed_scale.shape if precomputed_scale is not None else None

ov_model_params = OVModelParameters()
ov_model_params.input_dtypes["weight"] = weight.dtype
if precomputed_scale is not None:
ov_model_params.input_dtypes["scale"] = precomputed_scale.dtype

model = get_float_quantize_dequantize_weight_model(
ov_model_params, config, weight_shape, scale_shape, reduction_axes, return_compressed_weight
)

inputs = [weight]
if precomputed_scale is not None:
inputs.append(precomputed_scale)

    compressed_weight, scale = None, precomputed_scale
    results = model(inputs)
    # The model returns one to three outputs, depending on whether the compressed weight and
    # a freshly computed scale are requested in addition to the decompressed weight
    if len(results) == 1:
        decompressed_weight = results[0]
    elif len(results) == 2:
        decompressed_weight, compressed_weight = results
    else:
        decompressed_weight, compressed_weight, scale = results
if return_compressed_weight:
return decompressed_weight, compressed_weight, scale
else:
return decompressed_weight
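
To make the grouped path concrete, here is a hypothetical round-trip sketch with explicit shapes; the shapes and config arguments are assumptions for illustration:

# Hypothetical example: group-wise NF4 quantize-dequantize round trip.
# A [128, 256] weight with reduction_axes=1 and group_size=64 is reshaped
# to [128, 4, 64] internally, so one scale is computed per group of 64 values.
import numpy as np

from nncf import CompressWeightsMode
from nncf.openvino.optimized_functions import float_quantize_dequantize_weight
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.tensor import Tensor

weight = Tensor(np.random.rand(128, 256).astype(np.float32))
config = WeightCompressionConfig(mode=CompressWeightsMode.NF4, group_size=64)
decompressed_weight, compressed_weight, scale = float_quantize_dequantize_weight(
    weight, config, reduction_axes=1, return_compressed_weight=True
)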


def get_integer_quantization_error(
weight: Tensor,
reduction_axes: ReductionAxes,
@@ -188,7 +275,7 @@ def get_integer_quantization_error(

ov_model_params = OVModelParameters()
ov_model_params.input_dtypes["weight"] = weight.dtype
model = get_quantization_error_model(
model = get_integer_quantization_error_model(
ov_model_params, config, original_weight_shape, weight.shape, original_reduction_axes, reduction_axes
)
