@@ -17,6 +17,7 @@ def get_greedy() -> GenerationConfig:
     generation_config.num_return_sequences = 1
     return generation_config
 
+
 def get_beam_search() -> GenerationConfig:
     generation_config = GenerationConfig()
     generation_config.num_groups = 3
@@ -25,6 +26,7 @@ def get_beam_search() -> GenerationConfig:
     generation_config.num_return_sequences = generation_config.num_groups * generation_config.group_size
     return generation_config
 
+
 def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
     prompts = [
         "What is OpenVINO?",
@@ -40,6 +42,7 @@ def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]:
     ]
     return (prompts, generation_configs)
 
+
 def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
     scheduler_config = SchedulerConfig()
     if scheduler_params is None:
@@ -54,6 +57,7 @@ def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig:
 
     return scheduler_config
 
+
 def convert_to_hf(
     default_generation_config: HFGenerationConfig,
     generation_config: GenerationConfig
@@ -91,6 +95,7 @@ def convert_to_hf(
     hf_generation_config = HFGenerationConfig(**kwargs)
     return hf_generation_config
 
+
 def run_hugging_face(
     model_id: str,
     prompts: List[str],
@@ -117,7 +122,7 @@ def run_hugging_face(
         inputs = hf_tokenizer(prompt, return_tensors="pt")
         prompt_len = len(inputs['input_ids'][0])
         generate_outputs = model.generate(**inputs, generation_config=convert_to_hf(model.generation_config, generation_config), return_dict_in_generate=True)
-        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences])
+        all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
 
         generation_result = GenerationResult()
         generation_result.m_generation_ids = all_text_batch
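
Aside: the `skip_special_tokens=True` added above keeps the HF reference strings free of end-of-text markers, so they can be compared verbatim with the pipeline's detokenized output. A minimal sketch of the difference (the `gpt2` checkpoint is an illustrative assumption, not something this patch uses):

```python
# Minimal sketch (not part of the patch): decoding with and without
# skip_special_tokens. The gpt2 checkpoint is an illustrative assumption.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
ids = tok("What is OpenVINO?")["input_ids"] + [tok.eos_token_id]

print(tok.decode(ids))                            # What is OpenVINO?<|endoftext|>
print(tok.decode(ids, skip_special_tokens=True))  # What is OpenVINO?
```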
@@ -126,16 +131,23 @@ def run_hugging_face(
             generation_result.m_scores = [score for score in generate_outputs.sequences_scores]
         generation_results.append(generation_result)
 
+    del hf_tokenizer
+    del model
+
     return (generation_results, model_path)
 
+
 def run_continuous_batching(
     model_path: Path,
     scheduler_config: SchedulerConfig,
     prompts: List[str],
     generation_configs: List[GenerationConfig]
 ) -> List[GenerationResult]:
     pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
-    return pipe.generate(prompts, generation_configs)
+    output = pipe.generate(prompts, generation_configs)
+    del pipe
+    return output
+
 
 def get_models_list(file_name: str):
     models = []
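
The new `del` statements presumably drop the tokenizer, model, and pipeline as soon as their results are collected, so at most one model instance stays resident between parameterized test runs. A minimal sketch of that release-before-return pattern, with a hypothetical `LargeModel` standing in for the heavyweight objects:

```python
import gc

class LargeModel:
    """Hypothetical stand-in for a memory-heavy model or pipeline."""
    def generate(self, prompt: str) -> str:
        return prompt.upper()

def run_model(prompt: str) -> str:
    model = LargeModel()
    output = model.generate(prompt)  # finish all work that needs the model
    del model                        # drop the last reference; refcounting frees it now
    gc.collect()                     # optional: also sweep any reference cycles
    return output

print(run_model("what is openvino?"))  # -> WHAT IS OPENVINO?
```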
@@ -148,6 +160,7 @@ def get_models_list(file_name: str):
         models.append(model_name)
     return models
 
+
 def compare_results(hf_result, ov_result, generation_config):
     if generation_config.is_beam_search:
         assert len(hf_result.m_scores) == len(ov_result.m_scores)
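
For context, these helpers are presumably composed in a test along the lines of the sketch below. This is hypothetical glue code, not part of the patch; `run_hugging_face` takes additional arguments this diff does not show, and the model id is an illustrative placeholder:

```python
# Hypothetical composition of the helpers above (not from this patch).
prompts, generation_configs = get_test_dataset()
scheduler_config = get_scheduler_config()

# run_hugging_face takes further arguments not visible in this diff.
hf_results, model_path = run_hugging_face("facebook/opt-125m", prompts, generation_configs)
ov_results = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs)

for hf_result, ov_result, generation_config in zip(hf_results, ov_results, generation_configs):
    compare_results(hf_result, ov_result, generation_config)
```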