Skip to content

Commit b5b3b54

Browse files
committed
Set Left Padding For Text Gen Task
1 parent 819c513 commit b5b3b54

File tree

4 files changed

+18
-6
lines changed

4 files changed

+18
-6
lines changed

optimum/commands/export/openvino.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -315,7 +315,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
315315
model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config)
316316
model.save_pretrained(self.args.output)
317317
if not self.args.disable_convert_tokenizer:
318-
maybe_convert_tokenizers(library_name, self.args.output, model)
318+
maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
319319
elif task.startswith("text-generation") and quantize_with_dataset:
320320
from optimum.intel import OVModelForCausalLM
321321

@@ -334,7 +334,7 @@ def _get_default_int4_config(model_id_or_path, library_name):
334334
preprocessors = maybe_load_preprocessors(
335335
self.args.model, trust_remote_code=self.args.trust_remote_code
336336
)
337-
maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors)
337+
maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task)
338338
else:
339339
# TODO : add input shapes
340340
main_export(

optimum/exporters/openvino/__main__.py

+6 -4
Original file line number | Diff line number | Diff line change
@@ -387,7 +387,7 @@ class StoreAttr(object):
387387
)
388388

389389
if convert_tokenizer:
390-
maybe_convert_tokenizers(library_name, output, model, preprocessors)
390+
maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
391391

392392
clear_class_registry()
393393
del model
@@ -399,7 +399,7 @@ class StoreAttr(object):
399399
GPTQQuantizer.post_init_model = orig_post_init_model
400400

401401

402-
def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None):
402+
def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
403403
"""
404404
Tries to convert tokenizers to OV format and export them to disk.
405405
@@ -412,6 +412,8 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro
412412
Model instance.
413413
preprocessors (`Iterable`, *optional*, defaults to None):
414414
Iterable possibly containing tokenizers to be converted.
415+
task (`str`, *optional*, defaults to None):
416+
The task to export the model for. Affects tokenizer conversion parameters.
415417
"""
416418
from optimum.exporters.openvino.convert import export_tokenizer
417419

@@ -420,7 +422,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro
420422
tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None)
421423
if tokenizer:
422424
try:
423-
export_tokenizer(tokenizer, output)
425+
export_tokenizer(tokenizer, output, task=task)
424426
except Exception as exception:
425427
logger.warning(
426428
"Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
@@ -430,6 +432,6 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro
430432
for tokenizer_name in ("tokenizer", "tokenizer_2"):
431433
tokenizer = getattr(model, tokenizer_name, None)
432434
if tokenizer:
433-
export_tokenizer(tokenizer, output / tokenizer_name)
435+
export_tokenizer(tokenizer, output / tokenizer_name, task=task)
434436
else:
435437
logger.warning("Tokenizer won't be converted.")

optimum/exporters/openvino/convert.py

+6
Original file line number | Diff line number | Diff line change
@@ -706,6 +706,7 @@ def export_tokenizer(
706706
tokenizer,
707707
output: Union[str, Path],
708708
suffix: Optional[str] = "",
709+
task: Optional[str] = None,
709710
):
710711
# avoid circular imports
711712
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME
@@ -722,6 +723,11 @@ def export_tokenizer(
722723
if output.exists():
723724
tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output)
724725

726+
if task is not None and task.startswith("text-generation"):
727+
logger.info(f"Set padding side to left for `{task}` task.")
728+
tokenizer.padding_side = "left"
729+
tokenizer.truncation_side = "left"
730+
725731
try:
726732
converted = convert_tokenizer(tokenizer, with_detokenizer=True)
727733
except NotImplementedError:

tests/openvino/test_exporters_cli.py

+4
Original file line number | Diff line number | Diff line change
@@ -171,6 +171,10 @@ def test_exporters_cli_tokenizers(self, task: str, model_type: str):
171171
if number_of_tokenizers == 1:
172172
self.assertTrue("Detokenizer is not supported, convert tokenizer only." in output, output)
173173

174+
if task.startswith("text-generation"):
175+
self.assertTrue("Set padding side to left" in output, output)
176+
177+
174178
@parameterized.expand(SUPPORTED_ARCHITECTURES)
175179
def test_exporters_cli_fp16(self, task: str, model_type: str):
176180
with TemporaryDirectory() as tmpdir:

0 commit comments

Comments
 (0)