 from openvino.runtime import Node
 from openvino.runtime import opset13 as opset

+from nncf import CompressWeightsMode
 from nncf.common.utils.backend import is_openvino_at_least
 from nncf.common.utils.caching import ResultsCache
 from nncf.common.utils.caching import cache_results
@@ -233,6 +234,26 @@ def get_integer_quantization_model(
     )


+def get_float_quantization_model(
+    ov_model_params: OVModelParameters,
+    config: WeightCompressionConfig,
+    weight_shape: Tuple,
+    scale_shape: Optional[Tuple] = None,
+    reduction_axes: Optional[ReductionAxes] = None,
+) -> Union[ModelCallable, ModelAsNodes]:
+    weight_shape, scale_shape, _ = _prepare_quantization_model_inputs(
+        ov_model_params, weight_shape, scale_shape, zero_point_shape=None, reduction_axes=reduction_axes
+    )
+
+    return _build_float_quantization_model(
+        config,
+        ov_model_params,
+        weight_shape,
+        scale_shape,
+        reduction_axes,
+    )
+
+
 def get_integer_quantize_dequantize_weight_model(
     ov_model_params: OVModelParameters,
     config: WeightCompressionConfig,
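For reference, a minimal usage sketch of the new entry point. This is not part of the PR: the `WeightCompressionConfig` and `OVModelParameters` constructor arguments, the `Tensor` import path, and the call convention of the returned `ModelCallable` are assumptions here, not taken from this diff.

```python
import numpy as np

from nncf import CompressWeightsMode
from nncf.tensor import Tensor  # assumed import path

# Hypothetical instantiation; the real classes may take different arguments.
config = WeightCompressionConfig(mode=CompressWeightsMode.NF4)
ov_model_params = OVModelParameters(input_dtypes={"weight": TensorDataType.float32})

# With scale_shape omitted, the built graph computes the scale itself and
# returns it alongside the compressed weight (see the len(ov_parameters) == 1
# branch in _build_float_quantization_model below).
quantize = get_float_quantization_model(
    ov_model_params,
    config,
    weight_shape=(128, 256),
    reduction_axes=(1,),
)
compressed_weight, scale = quantize([Tensor(np.random.rand(128, 256).astype(np.float32))])
```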
@@ -453,6 +474,97 @@ def _build_integer_quantization_model(
     return partial(_infer_ov_model, ov_model_params, compiled_model)


+@cache_results(OV_MODEL_CACHE)
+def _build_float_quantization_model(
+    config: WeightCompressionConfig,
+    ov_model_params: OVModelParameters,
+    weight_shape: Tuple,
+    scale_shape: Optional[Tuple] = None,
+    reduction_axes: Optional[ReductionAxes] = None,
+    return_nodes: bool = False,
+) -> Union[ModelCallable, ModelAsNodes]:
+    default_input_dtypes = {"scale": TensorDataType.float32}
+    default_output_dtypes = {"compressed_weight": TensorDataType.float32, "scale": TensorDataType.float32}
+
+    # Update input and output dtypes with the default values
+    ov_model_params = copy.deepcopy(ov_model_params)
+    ov_model_params.input_dtypes = {**default_input_dtypes, **ov_model_params.input_dtypes}
+    ov_model_params.output_dtypes = {**default_output_dtypes, **ov_model_params.output_dtypes}
+
+    if "weight" not in ov_model_params.input_dtypes:
+        msg = "Input weight dtype is required!"
+        raise ValueError(msg)
+
+    weight_dtype = ov_model_params.input_dtypes["weight"]
+    input_scale_dtype = ov_model_params.input_dtypes["scale"]
+    compressed_weight_dtype = ov_model_params.output_dtypes["compressed_weight"]
+    output_scale_dtype = ov_model_params.output_dtypes["scale"]
+
+    # Validate input dtypes
+    valid_weight_dtypes = [TensorDataType.float32, TensorDataType.float16, TensorDataType.bfloat16]
+    if weight_dtype not in valid_weight_dtypes:
+        msg = f"Weight must be one of the following data types: {valid_weight_dtypes}. But found: {weight_dtype}."
+        raise ValueError(msg)
+    if scale_shape is not None and input_scale_dtype != TensorDataType.float32:
+        msg = f"Input scale must be of float32 data type. But found: {input_scale_dtype}."
+        raise ValueError(msg)
+
+    # Validate output dtypes
+    # TODO: Enable f4e2m1
+    valid_compressed_weight_dtypes = [TensorDataType.float32, TensorDataType.nf4]
+    if compressed_weight_dtype not in valid_compressed_weight_dtypes:
+        msg = (
+            f"Compressed weight must be one of the following data types: {valid_compressed_weight_dtypes}. "
+            f"But found: {compressed_weight_dtype}."
+        )
+        raise ValueError(msg)
+    if scale_shape is None and output_scale_dtype != TensorDataType.float32:
+        msg = f"Output scale must be of float32 data type. But found: {output_scale_dtype}."
+        raise ValueError(msg)
+
+    # Build OV model
+    weight = opset.parameter(weight_shape, name="weight", dtype=DTYPE_MAP_OV[weight_dtype])
+    ov_parameters = [weight]
+    weight = convert_op(weight, ov.Type.f32)
+
+    divide_op = opset.divide if ov_model_params.convertable_division else non_convertable_divide_op
+    if scale_shape is not None:
+        # Scale is given as an input
+        scale = opset.parameter(scale_shape, name="scale", dtype=DTYPE_MAP_OV[input_scale_dtype])
+        ov_parameters.append(scale)
+    else:
+        # Compute scale
+        scale = opset.reduce_max(opset.abs(weight), reduction_axes=reduction_axes, keep_dims=True)
+        # NOTE: adding machine epsilon to avoid division by zero
+        eps = np.finfo(np.float32).eps
+        scale = opset.select(opset.less(opset.abs(scale), eps), eps, scale)
+
+        if config.mode == CompressWeightsMode.E2M1:
+            max_val = opset.constant(6, ov.Type.f32)  # Maximal value of e2m1 type.
+            constant_2 = opset.constant(2, ov.Type.f32)
+            scale = divide_op(scale, max_val)
+            scale = opset.log(scale) / opset.log(constant_2)
+            scale = opset.ceil(scale)
+            scale = opset.clamp(scale, -127, 127)
+            scale = opset.power(constant_2, scale)
+
+    compressed_weight = divide_op(weight, scale)
+    compressed_weight = convert_op(compressed_weight, ov.Type.nf4)
+    compressed_weight = convert_op(compressed_weight, DTYPE_MAP_OV[compressed_weight_dtype])
+
+    ov_results = [compressed_weight]
+    if len(ov_parameters) == 1:
+        ov_results.append(scale)
+
+    if return_nodes:
+        return ov_parameters, ov_results, ov_model_params
+
+    model = ov.Model(ov_results, ov_parameters)
+    compiled_model = _compile_ov_model(model, device_name="CPU", config={inference_precision(): ov.Type.f32})
+
+    return partial(_infer_ov_model, ov_model_params, compiled_model)
+
+
 @cache_results(OV_MODEL_CACHE)
 def _build_integer_quantize_dequantize_weight_model(
     config: WeightCompressionConfig,
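As a worked re-statement of the E2M1 branch above: the scale starts as the per-channel absolute maximum, is divided by 6 (the largest value representable in e2m1), and is then rounded up to the next power of two in log2 space, with the exponent clamped to [-127, 127]. A plain-NumPy sketch of the same arithmetic (the helper below is illustrative, not part of the PR):

```python
import numpy as np

def e2m1_scale(weight: np.ndarray, reduction_axes=(1,)) -> np.ndarray:
    """NumPy equivalent of the power-of-two scale built by the E2M1 branch."""
    absmax = np.max(np.abs(weight), axis=reduction_axes, keepdims=True)
    # Mirror the opset.select(...) guard against division by zero.
    eps = np.finfo(np.float32).eps
    absmax = np.where(np.abs(absmax) < eps, eps, absmax)
    # scale = 2 ** clamp(ceil(log2(absmax / 6)), -127, 127)
    exponent = np.clip(np.ceil(np.log2(absmax / 6.0)), -127, 127)
    return np.power(2.0, exponent).astype(np.float32)

w = np.random.randn(4, 16).astype(np.float32)
s = e2m1_scale(w)                     # shape (4, 1); every entry a power of two
assert np.all(np.abs(w / s) <= 6.0)   # scaled weights fit the e2m1 range
```

Rounding the exponent up (rather than to nearest) guarantees that `|weight / scale|` never exceeds 6, at the cost of a slightly coarser quantization grid.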