import argparse
import json
import time

import numpy as np
from openvino.runtime import Core
from transformers import AutoTokenizer

checkpoint = "llama-7b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
eos_token_id = tokenizer.eos_token_id


# this function converts text to tokens
def tokenize(text):
    """
    Tokenize input text with the model tokenizer.

    Parameters:
      text, str - input text
    Returns:
      input_ids - np.array with input token ids
      attention_mask - np.array with 1 at positions of original tokens and 0 at padding positions
    """
    inputs = tokenizer(text, return_tensors="np")
    return inputs["input_ids"], inputs["attention_mask"]

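# Usage sketch (illustrative): for an unpadded prompt the tokenizer returns
# batched arrays, e.g.
#   input_ids, attention_mask = tokenize("OpenVINO greedy generation demo")
#   input_ids.shape       -> (1, n_tokens)
#   attention_mask.shape  -> (1, n_tokens), all ones since nothing is padded
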
def softmax(x):
    e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    summation = e_x.sum(axis=-1, keepdims=True)
    return e_x / summation

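# Example (sketch): softmax(np.array([[0.0, 1.0, 2.0]])) -> approx [[0.090, 0.245, 0.665]];
# each row sums to 1, and subtracting the row max first keeps np.exp from overflowing.
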
def process_logits(cur_length, scores, eos_token_id, min_length=0):
    """
    Suppress the end-of-sequence token while the sequence is shorter than min_length.

    Parameters:
      cur_length - current length of input sequence
      scores - model output logits
      eos_token_id - index of end of sequence token in model vocab
      min_length - minimum length for applying postprocessing
    """
    if cur_length < min_length:
        scores[:, eos_token_id] = -float("inf")
    return scores

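# Example (sketch): with min_length=8 and a current length of 3, the EOS logit is
# forced to -inf so greedy argmax cannot stop generation early:
#   scores = process_logits(3, logits[:, -1, :], eos_token_id, min_length=8)
#   scores[0, eos_token_id] -> -inf
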
def get_top_k_logits(scores, top_k):
    """
    Perform top-k filtering: keep the top_k highest logits per row and set the rest to -inf.

    Parameters:
      scores - model output logits
      top_k - number of elements with highest probability to select
    """
    filter_value = -float("inf")
    top_k = min(max(top_k, 1), scores.shape[-1])
    top_k_scores = -np.sort(-scores)[:, :top_k]
    indices_to_remove = scores < np.min(top_k_scores, axis=-1, keepdims=True)
    filtered_scores = np.ma.array(scores, mask=indices_to_remove,
                                  fill_value=filter_value).filled()
    return filtered_scores

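# Example (sketch): get_top_k_logits(np.array([[1.0, 3.0, 2.0, 0.5]]), top_k=2)
# -> [[-inf, 3.0, 2.0, -inf]]; applying softmax afterwards spreads all probability
# over the two kept tokens.
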
def prepare_next_input(model_inputs, next_tokens):
    # on the next iteration only the newly generated token is fed to the model
    model_inputs['input_ids'] = np.asarray(next_tokens).reshape(1, 1)

    if 'attention_mask' in model_inputs:
        attention_mask = model_inputs['attention_mask']
        model_inputs['attention_mask'] = np.concatenate(
            [attention_mask, np.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype)],
            axis=-1)
    return model_inputs

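# Example (sketch): if model_inputs["attention_mask"] has shape (1, 12) and the new
# token id array is [42], prepare_next_input returns input_ids of shape (1, 1)
# ([[42]]) and an attention_mask of shape (1, 13) with a 1 appended.
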
def generate_greedy(input_ids, attention_mask, max_sequence_length=128,
                    eos_token_id=eos_token_id, dynamic_shapes=True, engine="OV"):
    first_iteration = True
    model_inputs = {}
    while True:
        cur_input_len = len(input_ids[0])
        model_input_ids = input_ids
        model_input_attention_mask = attention_mask

        if first_iteration:
            # first pass: feed the whole prompt to the model
            first_input = {"input_ids": model_input_ids,
                           "attention_mask": model_input_attention_mask
                           }
            outputs = compiled_model(first_input)
            # keep the attention mask so it can be extended on the following iterations
            model_inputs = {"attention_mask": model_input_attention_mask}
            first_iteration = False
        else:
            # following passes: feed only the last generated token and the extended mask
            outputs = compiled_model(model_inputs)
        logits = outputs['logits']
        next_token_logits = logits[:, -1, :]
        # greedy search uses the raw logits; process_logits / get_top_k_logits can be
        # applied here to pre-process the distribution before selecting a token
        next_tokens_scores = next_token_logits
        # get next token id
        next_tokens = np.argmax(next_tokens_scores, axis=-1)
        # break the loop if max length or end of text token is reached
        if cur_input_len == max_sequence_length or next_tokens[0] == eos_token_id:
            break
        else:
            input_ids = np.concatenate((input_ids, next_tokens[:, None]), axis=-1)
            model_inputs = prepare_next_input(model_inputs, next_tokens)

    return input_ids

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Add the arguments
    parser.add_argument('-m', '--model', type=str, required=True,
                        help="path to model file")
    parser.add_argument('-pl', '--prompt-length', type=int, default=32, required=False,
                        help="prompt length")
    parser.add_argument('-al', '--answer-length', type=int,
                        default=32, help="generated token length")
    parser.add_argument('-e', '--engine', type=str, default="OV", required=False,
                        help="inference engine")
    # Parse the arguments
    args = parser.parse_args()

    # initialize OpenVINO Core
    read_model_start = time.time()
    core = Core()
    print("Init OpenVINO model ...")
    # read the model and corresponding weights from file
    ov_model = core.read_model(args.model)

    compiled_model = core.compile_model(ov_model, "CPU")
    prompts = {}
    with open("prompts.json") as f:
        prompts = json.load(f)
    if str(args.prompt_length) not in prompts:
        print("Prompt with length {0} is not provided in prompts.json".format(
            args.prompt_length))
        exit(-1)

    text = prompts[str(args.prompt_length)]
    print("Input text: ", text)
    input_ids, attention_mask = tokenize(text)

    gen_sequence_start = time.time()
    print("Start generate sequence ...")

    output_ids = generate_greedy(input_ids, attention_mask,
                                 max_sequence_length=args.prompt_length + args.answer_length,
                                 eos_token_id=eos_token_id, dynamic_shapes=True, engine=args.engine)
    gen_sequence_end = time.time()
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    gen_sequence_length = len(output_ids[0]) - len(input_ids[0])
    gen_latency = gen_sequence_end - gen_sequence_start