Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

load_in_4bit option for OVModelForCausalLM #538

Merged
merged 31 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
7051462
Initial code for load_in_4_bit
AlexKoff88 Jan 29, 2024
491f25a
Dataset does not work
AlexKoff88 Jan 30, 2024
a08a16a
Intermediate changes
AlexKoff88 Jan 30, 2024
3ceea1d
Make it working with dataset
AlexKoff88 Jan 30, 2024
68d4f2d
Style
AlexKoff88 Jan 30, 2024
8b403da
Fixed small issue
AlexKoff88 Jan 30, 2024
0410b42
Fixed failed tests
AlexKoff88 Jan 30, 2024
7edffc8
Style
AlexKoff88 Jan 30, 2024
829cc6d
Comment failed tests due to NNCF 2.8
AlexKoff88 Jan 31, 2024
1e87775
Commented failed tests until new NNCF release
AlexKoff88 Jan 31, 2024
efe85a2
Added tests for load_in_4bit
AlexKoff88 Jan 31, 2024
6768527
Added awq option. Included NNCF package into openvino extra.
AlexKoff88 Feb 1, 2024
54f8fe0
Rolled back including nncf into openvino extra
AlexKoff88 Feb 1, 2024
2ec2a54
Style
AlexKoff88 Feb 1, 2024
c2f373f
Fixed tests
AlexKoff88 Feb 1, 2024
4c821ad
Fixed issues with models larger than 1B. Added tests.
AlexKoff88 Feb 2, 2024
9943624
Style
AlexKoff88 Feb 2, 2024
b555a67
Fixed issues. Applied comments.
AlexKoff88 Feb 5, 2024
9e108d7
Merge branch 'main' into ak/load_in_4bit_alt
AlexKoff88 Feb 5, 2024
55a673b
Removed unnecessary exception
AlexKoff88 Feb 5, 2024
374b1fc
Merged with main
AlexKoff88 Feb 5, 2024
f67e802
Applied more comments
AlexKoff88 Feb 5, 2024
de4d192
Fixed issue
AlexKoff88 Feb 5, 2024
277d39a
Make quantization_config a part of OVConfig in OVQuantizer
AlexKoff88 Feb 6, 2024
4707914
Fixed issue with Transformers
AlexKoff88 Feb 6, 2024
ae1da0f
Fixed test
AlexKoff88 Feb 7, 2024
1275d0a
Changed the naming. Added additional tests
AlexKoff88 Feb 8, 2024
ed69ff1
Fixed tests
AlexKoff88 Feb 8, 2024
c0e5a1a
Fixed tests
AlexKoff88 Feb 8, 2024
2922841
Applied more comments
AlexKoff88 Feb 8, 2024
a7eeeb2
Style
AlexKoff88 Feb 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions optimum/intel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,12 @@
"OVQuantizer",
"OVTrainer",
"OVTrainingArguments",
"OVWeightQuantizationConfig",
]
else:
_import_structure["openvino"].extend(["OVConfig", "OVQuantizer", "OVTrainer", "OVTrainingArguments"])
_import_structure["openvino"].extend(
["OVConfig", "OVQuantizer", "OVTrainer", "OVTrainingArguments", "OVWeightQuantizationConfig"]
)

try:
if not (is_openvino_available() and is_diffusers_available()):
Expand Down Expand Up @@ -171,9 +174,15 @@
if not (is_openvino_available() and is_nncf_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_openvino_and_nncf_objects import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments
from .utils.dummy_openvino_and_nncf_objects import (
OVConfig,
OVQuantizer,
OVTrainer,
OVTrainingArguments,
OVWeightQuantizationConfig,
)
else:
from .openvino import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments
from .openvino import OVConfig, OVQuantizer, OVTrainer, OVTrainingArguments, OVWeightQuantizationConfig

try:
if not (is_openvino_available() and is_diffusers_available()):
Expand Down
1 change: 1 addition & 0 deletions optimum/intel/openvino/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from .quantization import OVQuantizer
from .trainer import OVTrainer
from .training_args import OVTrainingArguments
from .weight_quantization import OVWeightQuantizationConfig

from .modeling import (
OVModelForAudioClassification,
Expand Down
6 changes: 4 additions & 2 deletions optimum/intel/openvino/modeling_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,13 @@ def _from_pretrained(
force_download (`bool`, defaults to `False`):
Whether or not to force the (re-)download of the model weights and configuration files, overriding the
cached versions if they exist.
file_name(`str`, *optional*):
file_name (`str`, *optional*):
The file name of the model to load. Overwrites the default file name and allows one to load the model
with a different name.
local_files_only(`bool`, *optional*, defaults to `False`):
local_files_only (`bool`, *optional*, defaults to `False`):
Whether or not to only look at local files (i.e., do not try to download the model).
load_in_8bit (`bool`, *optional*, defaults to `False`):
Whether or not to apply 8-bit weight quantization.
"""

model_path = Path(model_id)
Expand Down
47 changes: 41 additions & 6 deletions optimum/intel/openvino/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE
from .weight_quantization import OVWeightQuantizationConfig, compress_decoder_weights


if is_transformers_version("<", "4.25.0"):
Expand Down Expand Up @@ -244,6 +245,8 @@ def _from_transformers(
use_cache: bool = True,
trust_remote_code: bool = False,
load_in_8bit: Optional[bool] = None,
load_in_4bit: Optional[bool] = None,
quantization_config: Optional[Union[OVWeightQuantizationConfig, Dict]] = None,
**kwargs,
):
if config.model_type.replace("_", "-") not in _SUPPORTED_ARCHITECTURES:
Expand All @@ -261,7 +264,7 @@ def _from_transformers(
task = task + "-with-past"

compression_option = None
if load_in_8bit is not None:
if load_in_8bit is not None and not load_in_4bit:
compression_option = "int8" if load_in_8bit else "fp32"
stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache)
main_export(
Expand All @@ -283,7 +286,14 @@ def _from_transformers(
config.is_encoder_decoder = False
config.save_pretrained(save_dir_path)
return cls._from_pretrained(
model_id=save_dir_path, config=config, use_cache=use_cache, load_in_8bit=False, stateful=None, **kwargs
model_id=save_dir_path,
config=config,
use_cache=use_cache,
load_in_8bit=False,
stateful=None,
load_in_4bit=load_in_4bit,
quantization_config=quantization_config,
**kwargs,
)

def _reshape(
Expand Down Expand Up @@ -350,15 +360,14 @@ class OVModelForCausalLM(OVBaseDecoderModel, GenerationMixin):
checkpoint="gpt2",
)
)
def forward(
def prepare_forward_inputs(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
position_ids: Optional[torch.LongTensor] = None,
**kwargs,
) -> CausalLMOutputWithPast:
self.compile()
) -> Dict:
if self.use_cache and past_key_values is not None:
input_ids = input_ids[:, -1:]

Expand Down Expand Up @@ -443,6 +452,26 @@ def forward(
self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)
)

return inputs

def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.LongTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
position_ids: Optional[torch.LongTensor] = None,
**kwargs,
) -> CausalLMOutputWithPast:
self.compile()

inputs = self.prepare_forward_inputs(
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
position_ids=position_ids,
**kwargs,
)

# Run inference
self.request.start_async(inputs, share_inputs=True)
self.request.wait()
Expand Down Expand Up @@ -526,6 +555,8 @@ def _from_pretrained(
from_onnx: bool = False,
local_files_only: bool = False,
load_in_8bit: bool = False,
load_in_4bit: bool = False,
quantization_config: Union[OVWeightQuantizationConfig, Dict] = None,
**kwargs,
):
model_path = Path(model_id)
Expand Down Expand Up @@ -557,7 +588,11 @@ def _from_pretrained(
else:
init_cls = cls

return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)
causal_model = init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

if load_in_4bit:
compress_decoder_weights(causal_model, quantization_config)
return causal_model


class OVBloomForCausalLM(OVModelForCausalLM):
Expand Down
46 changes: 30 additions & 16 deletions optimum/intel/openvino/quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from torch.utils.data import DataLoader, RandomSampler
from transformers import DataCollator, PreTrainedModel, default_data_collator
from transformers.pytorch_utils import Conv1D
from transformers.utils.quantization_config import QuantizationConfigMixin

from optimum.exporters.tasks import TasksManager
from optimum.quantization_base import OptimumQuantizer
Expand All @@ -49,6 +50,7 @@
ONNX_WEIGHTS_NAME,
OV_XML_FILE_NAME,
)
from .weight_quantization import compress_decoder_weights


COMPRESSION_OPTIONS = {
Expand Down Expand Up @@ -119,7 +121,8 @@ def quantize(
self,
calibration_dataset: Dataset = None,
save_directory: Union[str, Path] = None,
quantization_config: OVConfig = None,
quantization_config: QuantizationConfigMixin = None,
ov_config: OVConfig = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure we need both an ov_config and quantization_config, would prefer to keep only one if that's feasible

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@echarlaix, how about deprecating ov_config here and migrating to different types of QuantizationConfigMixin so that there will be only quantization_config parameter?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

was actually thinking about the opposite, as ov_config has a quantization section; need to think a bit about this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@echarlaix, I've changed the code according to your suggestion.

file_name: Optional[str] = None,
batch_size: int = 1,
data_collator: Optional[DataCollator] = None,
Expand Down Expand Up @@ -210,7 +213,7 @@ def quantize(
self._quantize_torchmodel(
calibration_dataset,
save_directory,
quantization_config,
ov_config,
file_name,
batch_size,
data_collator,
Expand Down Expand Up @@ -272,15 +275,26 @@ def _quantize_ovcausallm(
data_collator: Optional[DataCollator] = None,
remove_unused_columns: bool = True,
weights_only: bool = False,
quantization_config: OVConfig = None,
quantization_config: QuantizationConfigMixin = None,
**kwargs,
):
if self.model.stateful and not weights_only:
raise Exception(
"Full quantization for stateful OVModelForCausalLM is currently broken. Possible options:\n"
"1. Quantize AutoModelForCausalLM\n"
"2. Use weight only quantization\n"
"3. Use stateful=False to export stateless model"
)

save_directory = Path(save_directory)
save_directory.mkdir(parents=True, exist_ok=True)

if weights_only:
options = self._get_compression_options(quantization_config)
self.model.model = nncf.compress_weights(self.model.model, **options)
if quantization_config is None:
# Use default 8-bit compression
self.model.model = nncf.compress_weights(self.model.model)
else:
compress_decoder_weights(self.model, quantization_config)
self.model.save_pretrained(save_directory)
return

Expand Down Expand Up @@ -356,7 +370,7 @@ def _quantize_torchmodel(
self,
calibration_dataset: Dataset,
save_directory: Union[str, Path],
quantization_config: OVConfig = None,
ov_config: OVConfig = None,
file_name: Optional[str] = None,
batch_size: int = 1,
data_collator: Optional[DataCollator] = None,
Expand All @@ -378,14 +392,14 @@ def _quantize_torchmodel(
model_type=model_type,
)

if quantization_config is None:
if ov_config is None:
logger.info(
"No configuration describing the quantization process was provided, a default OVConfig will be generated."
)
quantization_config = OVConfig()
ov_config = OVConfig()
onnx_file_name = (
ONNX_WEIGHTS_NAME
if file_name is None and quantization_config.save_onnx_model
if file_name is None and ov_config.save_onnx_model
else Path(ov_file_name).with_suffix(".onnx")
)
if weights_only:
Expand All @@ -403,8 +417,8 @@ def _quantize_torchmodel(
)

model_inputs = next(iter(calibration_dataloader))
quantization_config.add_input_info(model_inputs)
nncf_config = NNCFConfig.from_dict(quantization_config.__dict__)
ov_config.add_input_info(model_inputs)
nncf_config = NNCFConfig.from_dict(ov_config.__dict__)
nncf_config = register_default_init_args(nncf_config, calibration_dataloader)
controller, compressed_model = create_compressed_model(
self.model, nncf_config, wrap_inputs_fn=wrap_nncf_model_inputs_with_objwalk
Expand All @@ -423,13 +437,13 @@ def _quantize_torchmodel(
else:
onnx_config = onnx_config_class(model.config)

model_path = save_directory / (onnx_file_name if quantization_config.save_onnx_model else ov_file_name)
model_path = save_directory / (onnx_file_name if ov_config.save_onnx_model else ov_file_name)
onnx_path = save_directory / onnx_file_name
export_fn = export if not quantization_config.save_onnx_model else export_pytorch_via_onnx
export_fn = export if not ov_config.save_onnx_model else export_pytorch_via_onnx
opset = min(onnx_config.DEFAULT_ONNX_OPSET, MAX_ONNX_OPSET)
opset = max(opset, MIN_ONNX_QDQ_OPSET)
kwargs = {}
if not quantization_config.save_onnx_model:
if not ov_config.save_onnx_model:
kwargs = {"stateful": ensure_export_task_support_stateful(task)}
_, _, is_onnx = export_fn(model=model, config=onnx_config, output=model_path, opset=opset, **kwargs)
if is_onnx:
Expand All @@ -438,14 +452,14 @@ def _quantize_torchmodel(
# Model requires second saving for applying weights compression transformations
self._save_pretrained(model, output_path)
# if onnx conversion happens as fallback for pytorch conversion, remove onnx model
if not quantization_config.save_onnx_model:
if not ov_config.save_onnx_model:
os.remove(onnx_path)
try:
os.remove(f"{onnx_path}_data")
except FileNotFoundError:
pass

quantization_config.save_pretrained(save_directory)
ov_config.save_pretrained(save_directory)

@staticmethod
def _save_pretrained(model: openvino.runtime.Model, output_path: str):
Expand Down
Loading
Loading