@@ -1,7 +1,7 @@
 import os
 os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "False"
 os.environ["USE_GAUDI2_SCALE"] = "True"
-os.environ.pop("USE_GAUDI2_SCALE")  # gaudi2 scale does not work
+os.environ.pop("USE_GAUDI2_SCALE")  # gaudi scale works
 # os.environ["GRAPH_VISUALIZATION"] = "True"
 import shutil
 shutil.rmtree(".graph_dumps", ignore_errors=True)
@@ -14,12 +14,13 @@
 import torch.nn.functional as F
 import deepspeed
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import habana_frameworks.torch.core as htcore
 import numpy as np
 import lm_eval
 import lm_eval.tasks
 import lm_eval.evaluator
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch


 torch.set_grad_enabled(False)
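
Aside (not part of the diff): the newly imported accelerate helpers are what the load path below relies on. A minimal sketch of what init_empty_weights does, assuming a placeholder model id: parameters are created on the "meta" device, so no real weight memory is allocated.

    from accelerate import init_empty_weights
    from transformers import AutoConfig, AutoModelForCausalLM

    config = AutoConfig.from_pretrained("facebook/opt-125m")    # placeholder model id
    with init_empty_weights():
        skeleton = AutoModelForCausalLM.from_config(config)     # weights live on the meta device
    print(next(skeleton.parameters()).device)                    # prints "meta": no memory backing yet
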
@@ -110,11 +111,16 @@ def itrex_bootstrap_stderr(f, xs, iters):
             token=None,
         )
     else:
-        user_model = AutoModelForCausalLM.from_pretrained(
-            args.model,
-            device_map='hpu',
-            torch_dtype=model_dtype,
-        )
+        if args.load:
+            config = AutoConfig.from_pretrained(args.model, torch_dtype=model_dtype)
+            with init_empty_weights():
+                user_model = AutoModelForCausalLM.from_config(config)
+        else:
+            user_model = AutoModelForCausalLM.from_pretrained(
+                args.model,
+                device_map='hpu',
+                torch_dtype=model_dtype,
+            )
 elif re.search("chatglm", args.model.lower()):
     from models.modeling_chatglm import ChatGLMForConditionalGeneration
     user_model = ChatGLMForConditionalGeneration.from_pretrained(
@@ -126,13 +132,18 @@ def itrex_bootstrap_stderr(f, xs, iters):
     # print(user_model.transformer.output_layer.weight.dtype)  # always fp16
     user_model.float()  # static fp8 needs float32 for the graph compiler
 else:
-    user_model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
-        device_map='hpu',
-        torch_dtype=model_dtype,
-    )
+    if args.load:
+        config = AutoConfig.from_pretrained(args.model, torch_dtype=model_dtype)
+        with init_empty_weights():
+            user_model = AutoModelForCausalLM.from_config(config)
+    else:
+        user_model = AutoModelForCausalLM.from_pretrained(
+            args.model,
+            trust_remote_code=args.trust_remote_code,
+            revision=args.revision,
+            device_map='hpu',
+            torch_dtype=model_dtype,
+        )

 # tokenizer
 if re.search("baichuan", args.model.lower()):
@@ -219,11 +230,40 @@ def replace_torch_mm_bmm():
     _check_params_as_const(user_model)
     # saving
     user_model.save("saved_results")
-    print(user_model, flush=True)
+    # print(user_model, flush=True)
+    def show_msg():
+        import numpy as np
+        import glob
+        from habana_frameworks.torch.hpu import memory_stats
+        print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*")))
+        mem_stats = memory_stats()
+        mem_dict = {
+            "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2),
+            "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2),
+            "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2),
+        }
+        for k, v in mem_dict.items():
+            print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
+    show_msg()

 if args.load:
+    def show_msg():
+        import numpy as np
+        import glob
+        from habana_frameworks.torch.hpu import memory_stats
+        print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*")))
+        mem_stats = memory_stats()
+        mem_dict = {
+            "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2),
+            "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2),
+            "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2),
+        }
+        for k, v in mem_dict.items():
+            print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
+    show_msg()
     from neural_compressor.torch.quantization import load
     user_model = load(user_model, "saved_results")
+    show_msg()
     # replace torch.matmul and torch.bmm by injection
     def replace_torch_mm_bmm():
         from neural_compressor.torch.amp.fp8.functions import fp8_matmul
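
Aside (not part of the diff): show_msg() reads the habana_frameworks memory_stats() dictionary (byte counts under "InUse", "MaxInUse" and "Limit") and counts dumped HPU graph files under .graph_dumps. A minimal standalone sketch of the same reporting, assuming an HPU device is available:

    import glob
    from habana_frameworks.torch.hpu import memory_stats

    def to_gb(n_bytes):
        return round(n_bytes / 1024**3, 2)                       # bytes -> GiB

    stats = memory_stats()
    print("HPU graphs dumped:", len(glob.glob(".graph_dumps/*PreGraph*")))
    print("memory in use    :", to_gb(stats["InUse"]), "GB")
    print("peak memory      :", to_gb(stats["MaxInUse"]), "GB")
    print("device capacity  :", to_gb(stats["Limit"]), "GB")
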
@@ -235,7 +275,8 @@ def replace_torch_mm_bmm():
     from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const
     _mark_params_as_const(user_model)  # can reduce allocated memory and speed things up
     _check_params_as_const(user_model)
-    print(user_model, flush=True)
+    # print(user_model, flush=True)
+    show_msg()

 if args.to_graph:
     import habana_frameworks.torch.hpu.graphs as htgraphs