Commit 0eced14

Enhance INC WOQ model loading & support Huggingface WOQ model loading (#1826)
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
1 parent 6733dab commit 0eced14

15 files changed: +762, -52 lines

docs/3x/PT_WeightOnlyQuant.md (+8, -8)
@@ -31,13 +31,13 @@ Theoretically, round-to-nearest (RTN) is the most straightforward way to quantiz
 
 ## Supported Matrix
 
-| Algorithms/Backend | PyTorch eager mode |
+| Algorithms/Backend | PyTorch eager mode |
 |--------------|----------|
 | RTN | &#10004; |
 | GPTQ | &#10004; |
 | AutoRound| &#10004; |
 | AWQ | &#10004; |
-| TEQ | &#10004; |
+| TEQ | &#10004; |
 | HQQ | &#10004; |
 > **RTN:** A quantification method that we can think of very intuitively. It does not require additional datasets and is a very fast quantization method. Generally speaking, RTN will convert the weight into a uniformly distributed integer data type, but some algorithms, such as Qlora, propose a non-uniform NF4 data type and prove its theoretical optimality.

@@ -64,8 +64,8 @@ WeightOnlyQuant quantization for PyTorch is using prepare and convert [APIs](./P
 | bits (int)| [1, ..., 8] |
 | group_size (int)| [-1, 1, ..., $C_{in}$] |
 | use_sym (bool)| [True, False] |
-| use_double_quant (bool) | [True, False] |
-| double_quant_dtype (str) | ['int'] |
+| use_double_quant (bool) | [True, False] |
+| double_quant_dtype (str) | ['int'] |
 | double_quant_bits (int) | [1, ..., bits] |
 | double_quant_use_sym (bool) | [True, False] |
 | double_quant_group_size (int) | [-1, 1, ..., $C_{in}$] |

@@ -98,7 +98,7 @@ model = convert(model)
 #### GPTQ
 | gptq_args | comments | default value |
 |----------|-------------|-------------------------------------------------------------------|
-| use_mse_search (bool) | Enables mean squared error (MSE) search | False
+| use_mse_search (bool) | Enables mean squared error (MSE) search | False
 | use_layer_wise (bool) | Enables quantize model per layer | False |
 | model_path (str) | Model path that is used to load state_dict per layer | |
 | use_double_quant (bool) | Enables double quantization | False |

@@ -120,7 +120,7 @@ model = convert(model)
 #### AutoRound
 | autoround_args | comments | default value |
 |----------|-------------|-------------------------------------------------------------------|
-| enable_full_range (bool) | Whether to enable full range quantization | False
+| enable_full_range (bool) | Whether to enable full range quantization | False
 | batch_size (int) | Batch size for training | 8 |
 | lr_scheduler | The learning rate scheduler to be used | None |
 | enable_quanted_input (bool) | Whether to use quantized input data | True |

@@ -251,8 +251,8 @@ from neural_compressor.torch.quantization import load
 
 orig_model = YOURMODEL()
 loaded_model = load(
-    "saved_results", model=orig_model
-)  # Please note that the model parameter passes the original model.
+    "saved_results", original_model=orig_model
+)  # Please note that the original_model parameter passes the original model.
 ```
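
The last hunk is the substantive documentation change: the keyword that hands the original model to `load` is renamed from `model` to `original_model` (the remaining hunks leave the visible text unchanged). Below is a minimal sketch of the updated call, assuming a WOQ model was previously saved to "saved_results"; `build_fp32_model` and `example_input` are placeholders for your own model constructor and input. The commit's new Huggingface WOQ loading path is not shown in these hunks, so it is not sketched here.

import torch
from neural_compressor.torch.quantization import load

# Rebuild the original (unquantized) model; the saved WOQ weights are
# restored into this structure.
fp32_model = build_fp32_model()  # placeholder for your own constructor

# The keyword is now original_model (previously model).
loaded_model = load("saved_results", original_model=fp32_model)

with torch.no_grad():
    output = loaded_model(example_input)  # placeholder input tensor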

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py (+6, -6)
@@ -63,7 +63,7 @@
 parser.add_argument("--calib_iters", default=100, type=int,
                     help="calibration iters.")
 parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], \
-                    type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa",
+                    type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa",
                                        "rte", "openbookqa", "lambada_standard", "wikitext"],
                     help="tasks list for accuracy validation")
 parser.add_argument("--limit", default=None, type=int,

@@ -117,10 +117,10 @@
 for examples in calib_dataset:
     calib_data.append(
         tokenizer(
-            examples["text"],
-            return_tensors="pt",
-            max_length=64,
-            padding="max_length",
+            examples["text"],
+            return_tensors="pt",
+            max_length=64,
+            padding="max_length",
             truncation=True
         )
     )

@@ -154,7 +154,7 @@ def calib_func(model):
 
 
 
-# If torch.matmul and torch.bmm are not replaced by INC module,
+# If torch.matmul and torch.bmm are not replaced by INC module,
 # Below codes can make torch.matmul and torch.bmm run on fp8 by injection.
 if not args.skip_fp8_mm and args.precision in ['fp8_e4m3', 'fp8_e5m2']:
     def replace_torch_mm_bmm():

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py (+3, -2)
@@ -367,7 +367,7 @@ def run_fn(model):
     user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
     run_fn(user_model)
     user_model = convert(user_model)
-
+
     user_model.save(args.output_dir)
 
 

@@ -377,9 +377,10 @@ def run_fn(model):
     print("load int8 model")
 
     from neural_compressor.torch.quantization import load
+    user_model, _ = get_user_model()
     tokenizer = AutoTokenizer.from_pretrained(args.model)
     config = AutoConfig.from_pretrained(args.model)
-    user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)))
+    user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model)
     setattr(user_model, "config", config)
 else:
     user_model, tokenizer = get_user_model()
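
Taken together, the two hunks show the round trip the example script now follows: quantize, calibrate, and save the WOQ model, then rebuild the FP32 model and pass it to `load` alongside the saved directory. A condensed sketch of that flow, with `get_user_model`, `run_fn`, `quant_config`, `example_inputs`, and `output_dir` standing in for the script's own helpers and arguments:

import os
from neural_compressor.torch.quantization import prepare, convert, load

# Quantization pass: calibrate, convert, and save the weight-only-quantized model.
user_model, tokenizer = get_user_model()   # script helper returning the FP32 model and tokenizer
user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(user_model)                         # calibration loop defined in the script
user_model = convert(user_model)
user_model.save(output_dir)

# Loading pass: load() now receives a freshly built FP32 model as its second
# argument and restores the saved WOQ weights into it.
fp32_model, _ = get_user_model()
user_model = load(os.path.abspath(os.path.expanduser(output_dir)), fp32_model)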

neural_compressor/torch/algorithms/weight_only/__init__.py (+3)
@@ -11,3 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+
+from .save_load import save, load
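
With this re-export in place, the WOQ serialization helpers can be imported directly from the algorithm package. The one-liner below only demonstrates the import path added above; the helpers' signatures live in save_load.py, which is not part of this excerpt.

# Import path made available by the __init__.py change above.
from neural_compressor.torch.algorithms.weight_only import save, load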
