Commit 48c5e3a

Modify WOQ examples structure (#1866)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
1 parent 498af74 commit 48c5e3a

11 files changed: +233 −305 lines changed

.azure-pipelines/scripts/models/run_pytorch_models_trigger.sh (+3 −3)

@@ -53,15 +53,15 @@ elif [ "${model}" == "resnet18_fx" ]; then
     tuning_cmd="bash run_quant.sh --topology=resnet18 --dataset_location=${dataset_location} --input_model=${input_model}"
     benchmark_cmd="bash run_benchmark.sh --topology=resnet18 --dataset_location=${dataset_location} --mode=performance --batch_size=${batch_size} --iters=500"
 elif [ "${model}" == "opt_125m_woq_gptq_int4" ]; then
-    model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
+    model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
     inc_new_api=3x_pt
     tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4"
 elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_bnb" ]; then
-    model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
+    model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
     inc_new_api=3x_pt
     tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_bnb"
 elif [ "${model}" == "opt_125m_woq_gptq_int4_dq_ggml" ]; then
-    model_src_dir="nlp/huggingface_models/language-modeling/quantization/llm"
+    model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
     inc_new_api=3x_pt
     tuning_cmd="bash run_quant.sh --topology=opt_125m_woq_gptq_int4_dq_ggml"
 fi
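
For orientation, here is a minimal sketch of what the updated branch amounts to once CI picks one of these topologies. The `examples/3.x_api/pytorch/` prefix is inferred from the renamed README path later in this commit, and the real trigger script also wires in dataset, input-model, and benchmark plumbing not shown here:

```bash
# Hypothetical resolution of the opt_125m_woq_gptq_int4 branch after this change.
# model_src_dir comes straight from the diff above; the repo-relative prefix is
# an assumption based on the example paths elsewhere in this commit.
model_src_dir="nlp/huggingface_models/language-modeling/quantization/weight_only"
cd "examples/3.x_api/pytorch/${model_src_dir}"
bash run_quant.sh --topology=opt_125m_woq_gptq_int4
```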

docs/3x/PT_WeightOnlyQuant.md (+1 −1)

@@ -258,7 +258,7 @@ loaded_model = load(

 ## Examples

-Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm) on how to quantize a model with WeightOnlyQuant.
+Users can also refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only) on how to quantize a model with WeightOnlyQuant.

 ## Reference
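
For readers following the updated link, a sketch of a weight-only GPTQ run from the relocated directory, assembled from flags that appear in the README diff later in this commit (the complete commands there carry additional WOQ options, so treat this as illustrative rather than canonical):

```bash
# Illustrative INT4 GPTQ weight-only quantization of facebook/opt-125m.
# Every flag below appears in the README diff in this commit; the full
# README command includes further options (group size, calibration, etc.).
cd examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only
python run_clm_no_trainer.py \
    --model facebook/opt-125m \
    --dataset NeelNanda/pile-10k \
    --quantize \
    --woq_algo GPTQ \
    --woq_bits 4 \
    --woq_scheme asym \
    --output_dir saved_results
```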

+151 −67

@@ -1,67 +1,151 @@
-{
-    "pytorch": {
-        "gpt_j_ipex":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 1
-        },
-        "gpt_j_ipex_sq":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 1
-        },
-        "llama2_7b_ipex":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 1
-        },
-        "llama2_7b_ipex_sq":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 1
-        },
-        "opt_125m_ipex":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 8
-        },
-        "opt_125m_ipex_sq":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 8
-        },
-        "dlrm_ipex": {
-            "model_src_dir": "recommendation/dlrm/static_quant/ipex",
-            "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
-            "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
-            "main_script": "dlrm_s_pytorch.py",
-            "batch_size": 16384
-        },
-        "resnet18_pt2e_static":{
-            "model_src_dir": "cv/static_quant",
-            "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
-            "input_model": "",
-            "main_script": "main.py",
-            "batch_size": 1
-        },
-        "opt_125m_pt2e_static":{
-            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
-            "dataset_location": "",
-            "input_model": "",
-            "main_script": "run_clm_no_trainer.py",
-            "batch_size": 1
-        }
-    }
-}
+{
+    "pytorch": {
+        "opt_125m_woq_gptq_int4":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        },
+        "opt_125m_woq_gptq_int4_dq_bnb":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        },
+        "opt_125m_woq_gptq_int4_dq_ggml":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "llama2_7b_gptq_int4":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "llama2_7b_gptq_int4_dq_bnb":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "llama2_7b_gptq_int4_dq_ggml":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_woq_rtn_int4":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_woq_rtn_int4_dq_bnb":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_woq_rtn_int4_dq_ggml":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_woq_gptq_int4":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_woq_gptq_int4_dq_bnb":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_woq_gptq_int4_dq_ggml":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/weight_only",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "gpt_j_ipex":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        },
+        "gpt_j_ipex_sq":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        },
+        "llama2_7b_ipex":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        },
+        "llama2_7b_ipex_sq":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        },
+        "opt_125m_ipex":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "opt_125m_ipex_sq":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/smooth_quant",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 8
+        },
+        "dlrm_ipex": {
+            "model_src_dir": "recommendation/dlrm/static_quant/ipex",
+            "dataset_location": "/mnt/local_disk3/dataset/dlrm/dlrm/input",
+            "input_model": "/mnt/local_disk3/dataset/dlrm/dlrm/dlrm_weight/tb00_40M.pt",
+            "main_script": "dlrm_s_pytorch.py",
+            "batch_size": 16384
+        },
+        "resnet18_pt2e_static":{
+            "model_src_dir": "cv/static_quant",
+            "dataset_location": "/tf_dataset/pytorch/ImageNet/raw",
+            "input_model": "",
+            "main_script": "main.py",
+            "batch_size": 1
+        },
+        "opt_125m_pt2e_static":{
+            "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e",
+            "dataset_location": "",
+            "input_model": "",
+            "main_script": "run_clm_no_trainer.py",
+            "batch_size": 1
+        }
+    }
+}
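
As a hedged sketch of how a CI driver might consume one entry from this config: the actual pipeline scripts are not part of this diff, and the config's file path is not visible in this capture, so both the filename and the lookup below are assumptions for illustration only.

```bash
# Hypothetical lookup of one model entry with jq; the config filename is assumed.
model="opt_125m_woq_gptq_int4"
config="model_params_pytorch_3x.json"
src_dir=$(jq -r ".pytorch.\"${model}\".model_src_dir" "${config}")
batch_size=$(jq -r ".pytorch.\"${model}\".batch_size" "${config}")
echo "model=${model} src_dir=${src_dir} batch_size=${batch_size}"
```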

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/README.md → examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only/README.md (renamed, −38)

@@ -21,27 +21,14 @@ Here is how to run the scripts:
 ### GPT-J-6b

 #### Quantization
-```bash
-# "--sq" is used to enable smooth quant
-python run_clm_no_trainer.py \
-    --model EleutherAI/gpt-j-6B \
-    --quantize \
-    --sq \
-    --alpha 1.0 \
-    --ipex \
-    --output_dir "saved_results"
-```
-**Notes**: Smooth quantization here is based on torch.jit. Without past key value in example_inputs, the quantized model cannot be used for text-generation.

 ```bash
-# "--approach weight_only" is used to enable weight only quantization.
 # "--woq_algo GPTQ" is used to enable GPTQ algorithms
 # "--double_quant_type BNB_NF4" is used to enable double quant algorithms
 python run_clm_no_trainer.py \
     --model EleutherAI/gpt-j-6B \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo GPTQ \
     --woq_bits 4 \
     --woq_scheme asym \

@@ -57,7 +44,6 @@ python run_clm_no_trainer.py \
     --model EleutherAI/gpt-j-6B \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo RTN \
     --woq_bits 4 \
     --woq_scheme asym \

@@ -74,23 +60,12 @@ python run_clm_no_trainer.py \
 #### Quantization

 ```bash
-# "--sq" is used to enable smooth quant
-python run_clm_no_trainer.py \
-    --model facebook/opt-125m \
-    --quantize \
-    --sq \
-    --alpha 0.5 \
-    --ipex \
-    --output_dir "saved_results"
-
-# "--approach weight_only" is used to enable weight only quantization.
 # "--woq_algo GPTQ" is used to enable GPTQ algorithms
 # "--double_quant_type BNB_NF4" is used to enable double quant algorithms
 python run_clm_no_trainer.py \
     --model facebook/opt-125m \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo GPTQ \
     --woq_bits 4 \
     --woq_scheme asym \

@@ -106,7 +81,6 @@ python run_clm_no_trainer.py \
     --model facebook/opt-125m \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo RTN \
     --woq_bits 4 \
     --woq_scheme asym \

@@ -121,23 +95,12 @@ python run_clm_no_trainer.py \
 #### Quantization

 ```bash
-# "--sq" is used to enable smooth quant
-python run_clm_no_trainer.py \
-    --model meta-llama/Llama-2-7b-hf \
-    --quantize \
-    --sq \
-    --alpha 0.8 \
-    --ipex \
-    --output_dir "saved_results"
-
-# "--approach weight_only" is used to enable weight only quantization.
 # "--double_quant_type BNB_NF4" is used to enable double quant algorithms
 # "--woq_algo GPTQ" is used to enable GPTQ algorithms
 python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo GPTQ \
     --woq_bits 4 \
     --woq_scheme asym \

@@ -153,7 +116,6 @@ python run_clm_no_trainer.py \
     --model meta-llama/Llama-2-7b-hf \
     --dataset NeelNanda/pile-10k \
     --quantize \
-    --approach weight_only \
     --woq_algo RTN \
     --woq_bits 4 \
     --woq_scheme asym \
