
Commit cfc90e6

add greedy_pipeline
1 parent 93c6a88 commit cfc90e6

3 files changed: +168 −2 lines changed

.gitignore (+4 −2)

@@ -30,6 +30,8 @@
 *.exe
 *.out
 *.app
-
 build
-.cache
+.cache
+# Vscode
+.env
+.vscode

greedy_pipeline.py (+153, new file)

@@ -0,0 +1,153 @@
import argparse
import json
import time

import numpy as np
from openvino.runtime import Core
from transformers import AutoTokenizer

checkpoint = "llama-7b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
eos_token_id = tokenizer.eos_token_id

def tokenize(text):
    """
    Tokenize input text with the model tokenizer.

    Parameters:
      text: str - input text
    Returns:
      input_ids - np.array with input token ids
      attention_mask - np.array with 1 where the original tokens are located
        and 0 where padding should be; the attention mask for the model
    """
    inputs = tokenizer(text, return_tensors="np")
    return inputs["input_ids"], inputs["attention_mask"]
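
# Example (hypothetical input): tokenize("Hello world") returns input_ids of
# shape (1, n_tokens) and an attention_mask of the same shape filled with
# ones, since a single unpadded sequence needs no masking.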

def softmax(x):
    # subtract the per-row max before exponentiating for numerical stability
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    summation = e_x.sum(axis=-1, keepdims=True)
    return e_x / summation
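
# Quick sanity check (hypothetical values): softmax(np.array([[0., 0.]]))
# gives [[0.5, 0.5]], and every row sums to 1 regardless of the logit scale.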

def process_logits(cur_length, scores, eos_token_id, min_length=0):
    """
    Suppress the end-of-string token while the sequence is below min_length.

    Parameters:
      cur_length - current length of the input sequence
      scores - model output logits
      eos_token_id - index of the end-of-string token in the model vocab
      min_length - minimum length for applying the postprocessing
    """
    if cur_length < min_length:
        scores[:, eos_token_id] = -float("inf")
    return scores
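
# Example (hypothetical values): with min_length=4 and cur_length=2 the EOS
# column is forced to -inf, so the pipeline cannot emit end-of-text before
# the sequence reaches the minimum length.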

def get_top_k_logits(scores, top_k):
    """
    Perform top-k filtering of logits.

    Parameters:
      scores - model output logits
      top_k - number of elements with the highest probability to keep
    """
    filter_value = -float("inf")
    top_k = min(max(top_k, 1), scores.shape[-1])
    top_k_scores = -np.sort(-scores)[:, :top_k]
    indices_to_remove = scores < np.min(top_k_scores)
    filtered_scores = np.ma.array(scores, mask=indices_to_remove,
                                  fill_value=filter_value).filled()
    return filtered_scores
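
# Example (hypothetical values): for scores = np.array([[1., 2., 3.]]) and
# top_k=2, every logit below the 2nd-largest is masked to -inf, yielding
# [[-inf, 2., 3.]]; only the top-k candidates remain selectable.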

def prepare_next_input(model_inputs, next_tokens):
    # feed only the newly generated token on the next iteration
    model_inputs['input_ids'] = next_tokens[:, None]

    if 'attention_mask' in model_inputs:
        attention_mask = model_inputs['attention_mask']
        # extend the mask by one position for the token just generated
        model_inputs['attention_mask'] = np.concatenate(
            [attention_mask,
             np.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype)],
            axis=-1)
    return model_inputs
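
# Example (hypothetical shapes): after one generated token, an initial
# attention_mask of shape (1, 8) grows to (1, 9) while input_ids shrinks to
# the single new token, the usual layout when the model keeps a KV cache.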

def generate_greedy(input_ids, attention_mask, max_sequence_length=128,
                    eos_token_id=eos_token_id, dynamic_shapes=True, engine="OV"):
    first_iteration = True
    model_inputs = {}
    while True:
        cur_input_len = len(input_ids[0])

        if first_iteration:
            first_input = {"input_ids": input_ids,
                           "attention_mask": attention_mask}
            outputs = compiled_model(first_input)
            # carry the initial mask forward so it can be extended each step
            model_inputs = {"attention_mask": attention_mask}
            first_iteration = False
        else:
            outputs = compiled_model(model_inputs)
        logits = outputs['logits']
        next_token_logits = logits[:, -1, :]
        # pre-process distribution
        next_tokens_scores = process_logits(cur_input_len, next_token_logits,
                                            eos_token_id)
        # get next token id
        next_tokens = np.argmax(next_tokens_scores, axis=-1)
        # break the loop if max length or end-of-text token is reached
        if cur_input_len == max_sequence_length or next_tokens[0] == eos_token_id:
            break
        input_ids = np.concatenate((input_ids, next_tokens[:, None]), axis=-1)
        model_inputs = prepare_next_input(model_inputs, next_tokens)

    return input_ids

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Add the arguments
    parser.add_argument('-m', '--model', type=str, required=True,
                        help="path to model file")
    parser.add_argument('-pl', '--prompt-length', type=int, default=32, required=False,
                        help="prompt length")
    parser.add_argument('-al', '--answer-length', type=int,
                        default=32, help="generated token length")
    parser.add_argument('-e', '--engine', type=str, default="OV",
                        help="inference engine")
    # Parse the arguments
    args = parser.parse_args()

    # initialize openvino core
    read_model_start = time.time()
    core = Core()
    print("Init OpenVINO model ...")
    # read the model and corresponding weights from file
    ov_model = core.read_model(args.model)

    compiled_model = core.compile_model(ov_model, "CPU")
    print("Model ready in {0:.2f} s".format(time.time() - read_model_start))

    prompts = {}
    with open("prompts.json") as f:
        prompts = json.load(f)
    if str(args.prompt_length) not in prompts:
        print("Prompt with length {0} is not provided in prompts.json".format(
            args.prompt_length))
        exit(-1)

    text = prompts[str(args.prompt_length)]
    print("Input text: ", text)
    input_ids, attention_mask = tokenize(text)

    gen_sequence_start = time.time()
    print("Start generating sequence ...")

    output_ids = generate_greedy(input_ids, attention_mask,
                                 max_sequence_length=args.prompt_length + args.answer_length,
                                 eos_token_id=eos_token_id,
                                 dynamic_shapes=True, engine=args.engine)
    gen_sequence_end = time.time()
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    gen_sequence_length = len(output_ids[0]) - len(input_ids[0])
    gen_latency = gen_sequence_end - gen_sequence_start
    print("Generated {0} tokens in {1:.2f} s".format(gen_sequence_length, gen_latency))
    print("Output text: ", output_text)
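
For reference, a hypothetical invocation (the IR path and the contents of prompts.json are assumptions, not part of this commit):

    python greedy_pipeline.py -m llama-7b/openvino_model.xml -pl 32 -al 32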
