@@ -12,15 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Dict, List, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union

 import torch
+from transformers import PretrainedConfig
 from transformers.utils.quantization_config import QuantizationConfigMixin

 from optimum.configuration_utils import BaseConfig

-from .weight_quantization import OVWeightQuantizationConfig
-

 DEFAULT_QUANTIZATION_CONFIG = {
     "algorithm": "quantization",
@@ -77,6 +77,28 @@
 }


+DEFAULT_4BIT_CONFIGS = {
+    "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.5},
+    "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64},
+    "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "bigscience/bloomz-7b1": {"bits": 4, "sym": False, "group_size": 32, "ratio": 0.6},
+    "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128},
+    "HuggingFaceH4/zephyr-7b-beta": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.6},
+    "meta-llama/Llama-2-7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "meta-llama/Llama-2-7b-chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "meta-llama/Llama-2-13b-chat": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stabilityai/stablelm-3b-4e1t": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stablelm-epoch-3b-preview": {"bits": 4, "sym": True, "group_size": 64, "ratio": 0.8},
+    "stable-zephyr-3b-dpo": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8},
+    "pansophic/rocket-3B": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8},
+    "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
+    "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
+    "openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "tiiuae/falcon-7b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "psmathur/orca_mini_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+}
+
+
 class OVConfig(BaseConfig):
     CONFIG_NAME = "openvino_config.json"
     FULL_CONFIGURATION_FILE = "openvino_config.json"
@@ -127,3 +149,91 @@ def _enable_standard_onnx_export_option(self):
         for i, algo_config in enumerate(self.compression):
             if algo_config["algorithm"] == "quantization":
                 self.compression[i]["export_to_onnx_standard_ops"] = self.save_onnx_model
+
+
+@dataclass
+class OVWeightQuantizationConfig(QuantizationConfigMixin):
+    """
+    This is a wrapper class covering all the attributes and features used to control weight quantization with NNCF
+    for a model that has been loaded through the `optimum-intel` API.
+
+    Args:
+        bits (`int`, defaults to 8):
+            The number of bits to quantize to.
+        sym (`bool`, *optional*, defaults to `False`):
+            Whether to use symmetric quantization.
+        tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
+            The tokenizer used to process the dataset. You can pass either:
+                - A custom tokenizer object.
+                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                  user or organization name, like `dbmdz/bert-base-german-cased`.
+                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+                  using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
+        dataset (`str` or `List[str]`, *optional*):
+            The dataset used for data-aware compression. You can provide your own dataset as a list of strings, or
+            use one of ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'].
+        group_size (`int`, *optional*, defaults to 128):
+            The group size to use for quantization. The recommended value is 128; -1 results in per-column
+            quantization.
+        ratio (`float`, *optional*, defaults to 1.0):
+            The ratio between baseline and backup precisions (e.g. 0.9 means 90% of the layers are quantized to
+            INT4_ASYM and the rest to INT8_ASYM).
+        all_layers (`bool`, *optional*):
+            Defines how many layers are compressed to 4 bits while the rest are kept in 8-bit precision.
+        sensitivity_metric (`nncf.SensitivityMetric`, *optional*):
+            The sensitivity metric used to assign quantization precision to layers. In order to preserve the
+            accuracy of the model, the more sensitive layers receive a higher precision.
+        awq (`bool`, *optional*):
+            Enables the AWQ method to unify weight ranges and improve overall model accuracy.
+        ignored_scope (`nncf.IgnoredScope`, *optional*):
+            An ignored scope that defines a list of model control flow graph nodes to be ignored during quantization.
+    """
+
+    def __init__(
+        self,
+        bits: int = 8,
+        sym: bool = False,
+        tokenizer: Any = None,
+        dataset: Optional[Union[str, List[str]]] = None,
+        ratio: Optional[float] = None,
+        group_size: Optional[int] = None,
+        all_layers: Optional[bool] = None,
+        sensitivity_metric: Optional[str] = None,
+        ignored_scope: Optional[dict] = None,
+        **kwargs,
+    ):
+        self.bits = bits
+        self.sym = sym
+        self.tokenizer = tokenizer
+        self.dataset = dataset
+        self.group_size = group_size
+        self.ratio = ratio
+        self.all_layers = all_layers
+        self.sensitivity_metric = sensitivity_metric
+        self.ignored_scope = ignored_scope
+        self.quant_method = "default"  # TODO: enable AWQ after the NNCF v2.9.0 release
+        self.post_init()
+
+    def post_init(self):
+        r"""
+        Safety checker that the arguments are correct.
+        """
+        if self.ratio is not None and not (0 <= self.ratio <= 1):
+            raise ValueError("`ratio` must be between 0 and 1.")
+        if self.group_size is not None and self.group_size != -1 and self.group_size <= 0:
+            raise ValueError("`group_size` must be greater than 0 or equal to -1.")
+        if self.dataset is not None and isinstance(self.dataset, str):
+            if self.dataset not in ["wikitext2", "c4", "c4-new", "ptb", "ptb-new"]:
+                raise ValueError(
+                    f"""You have entered a string value for dataset. You can only choose between
+                    ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'], but we found {self.dataset}."""
+                )
+
+        if self.bits not in [4, 8]:
+            raise ValueError(f"Only quantization to 4 or 8 bits is supported, but found bits={self.bits}.")
+
+
+def _check_default_4bit_configs(config: PretrainedConfig):
+    return DEFAULT_4BIT_CONFIGS.get(config.name_or_path, None)
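
For context, here is a minimal usage sketch of the class added above. It assumes `OVWeightQuantizationConfig` is re-exported from the top-level `optimum.intel` namespace like the package's other public configs; the exact import path is an assumption, not something this diff establishes.

    from optimum.intel import OVWeightQuantizationConfig  # import path assumed, see note above

    # 4-bit asymmetric weight quantization with 64 weights per quantization group;
    # roughly 80% of the layers are compressed to INT4 and the rest fall back to INT8.
    wq_config = OVWeightQuantizationConfig(bits=4, sym=False, group_size=64, ratio=0.8)

    # post_init() runs automatically from __init__ and rejects out-of-range values, e.g.:
    # OVWeightQuantizationConfig(bits=4, ratio=1.5)  # raises ValueError

Models whose `name_or_path` appears in DEFAULT_4BIT_CONFIGS pick up the corresponding tuned recipe via `_check_default_4bit_configs`; for any other model the helper returns `None` and the caller falls back to the user-supplied or default settings.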