Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert Tokenizers By Default #580

Merged
merged 26 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
6d72fe0
Convert Tokenizers By Default
apaniukov Feb 29, 2024
ffcae81
Add Warning to Deprecated Option
apaniukov Feb 29, 2024
eadb210
Update OV Tokenizers Availability Check
apaniukov Mar 8, 2024
8a22002
Move openvino-tokenizers to openvino dependencies
apaniukov Mar 13, 2024
40f227d
Make Style
apaniukov Mar 13, 2024
d9e3a0f
Merge branch 'main' into del-convert-tokenizer-flag
apaniukov Mar 13, 2024
52e24b5
Change Imports to Absolute
apaniukov Mar 13, 2024
fd1887f
Check openvino-nightly compatibility
apaniukov Mar 14, 2024
92d42f4
Change model skip explanation
apaniukov Mar 14, 2024
feb12dd
Merge branch 'main' into del-convert-tokenizer-flag
apaniukov Mar 14, 2024
86a1eca
Update OV Tokenizers Availability Check
apaniukov Mar 22, 2024
9041fe5
Add Check for OpenVINO Nightly and Archive
apaniukov Mar 25, 2024
277225c
Merge branch 'main' into del-convert-tokenizer-flag
apaniukov Mar 28, 2024
3dcaf9e
Add linux distros compatibility message
apaniukov Mar 28, 2024
241d265
Address Review Comments
apaniukov Mar 28, 2024
4d3df41
Address Review Comments
apaniukov Mar 28, 2024
efd6638
Merge branch 'main' into del-convert-tokenizer-flag
apaniukov Apr 3, 2024
eb05594
Address Review Comments
apaniukov Apr 4, 2024
4ed432d
Fix Style
apaniukov Apr 4, 2024
3710884
Change Warnings to Debug Level
apaniukov Apr 17, 2024
a5ef7b1
Merge branch 'main' into del-convert-tokenizer-flag
apaniukov Apr 17, 2024
cb2b26f
Merge branch 'main' into del-convert-tokenizer-flag
apaniukov Apr 17, 2024
80d4c1d
Fix Tests Debug Message
apaniukov Apr 18, 2024
85c925a
Merge branch 'main' into del-convert-tokenizer-flag
echarlaix Apr 18, 2024
f4b3301
Fix Style
apaniukov Apr 18, 2024
934ea22
Fix Style
apaniukov Apr 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
"OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
),
)
optional_group.add_argument(
"--disable-convert-tokenizer",
action="store_true",
help="Do not add converted tokenizer and detokenizer OpenVINO models.",
)
optional_group.add_argument(
"--convert-tokenizer",
action="store_true",
help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers",
help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.",
)

optional_group.add_argument(
Expand Down Expand Up @@ -247,6 +252,9 @@ def run(self):
model.save_pretrained(self.args.output)

else:
if self.args.convert_tokenizer:
logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")

# TODO : add input shapes
main_export(
model_name_or_path=self.args.model,
Expand All @@ -258,7 +266,7 @@ def run(self):
pad_token_id=self.args.pad_token_id,
ov_config=ov_config,
stateful=not self.args.disable_stateful,
convert_tokenizer=self.args.convert_tokenizer,
convert_tokenizer=not self.args.disable_convert_tokenizer,
library_name=library_name,
# **input_shapes,
)
15 changes: 5 additions & 10 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@
from optimum.exporters import TasksManager
from optimum.exporters.onnx.base import OnnxConfig
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
from optimum.exporters.openvino.convert import export_from_model, export_tokenizer
from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
from optimum.utils.save_utils import maybe_load_preprocessors

from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
from .convert import export_from_model, export_tokenizer


if TYPE_CHECKING:
from optimum.intel.openvino.configuration import OVConfig
Expand Down Expand Up @@ -187,12 +186,6 @@ def main_export(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)

if convert_tokenizer and not is_openvino_tokenizers_available():
logger.warning(
"`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
)
convert_tokenizer = False

do_gptq_patching = False
custom_architecture = False
loading_kwargs = {}
Expand Down Expand Up @@ -348,7 +341,7 @@ class StoreAttr(object):
**kwargs_shapes,
)

if convert_tokenizer:
if convert_tokenizer and is_openvino_tokenizers_available():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tokenizer export results in additional files being included which could be confusing for users, can this be moved to to an openvino_tokenizers directory ?

Suggested change
if convert_tokenizer and is_openvino_tokenizers_available():
if convert_tokenizer and is_openvino_tokenizers_available():
output = Path(output) / "openvino_tokenizers"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is similar to the original tokenizer, so we replicate an existing pattern here.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking about something similar to what's done for SD models — is this something that sounds reasonable to you? I'm fine with adding it in a follow-up PR if you prefer not to include it in this one.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I can do that in a separate PR.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks a lot @apaniukov !

if library_name != "diffusers":
tokenizer = next(
(preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
Expand All @@ -371,6 +364,8 @@ class StoreAttr(object):
tokenizer_2 = getattr(model, "tokenizer_2", None)
if tokenizer_2 is not None:
export_tokenizer(tokenizer_2, output, suffix="_2")
elif convert_tokenizer and not is_openvino_tokenizers_available():
logger.warning("Tokenizer won't be converted.")

# Unpatch modules after GPTQ export
if do_gptq_patching:
Expand Down
14 changes: 3 additions & 11 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from transformers import T5Tokenizer, T5TokenizerFast
from transformers.utils import is_tf_available, is_torch_available

from openvino.runtime import PartialShape, save_model
Expand Down Expand Up @@ -49,9 +48,6 @@
)


UNSUPPORTED_TOKENIZER_CLASSES = (T5Tokenizer, T5TokenizerFast)


logger = logging.getLogger(__name__)

if is_torch_available():
Expand Down Expand Up @@ -662,10 +658,6 @@ def export_tokenizer(
):
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports

if isinstance(tokenizer, UNSUPPORTED_TOKENIZER_CLASSES):
logger.info(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
return

try:
from openvino_tokenizers import convert_tokenizer
except ModuleNotFoundError:
Expand All @@ -681,13 +673,13 @@ def export_tokenizer(
try:
converted = convert_tokenizer(tokenizer, with_detokenizer=True)
except NotImplementedError:
logger.warning("Detokenizer is not supported, convert tokenizer only.")
logger.info("Detokenizer is not supported, convert tokenizer only.")
converted = convert_tokenizer(tokenizer, with_detokenizer=False)
except OVTypeError:
logger.warning(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
logger.debug(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
return
except Exception as exception:
logger.warning(
logger.debug(
f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported. Exception: {exception}"
)
return
Expand Down
103 changes: 75 additions & 28 deletions optimum/intel/utils/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import importlib.util
import logging
import operator as op
Expand Down Expand Up @@ -95,32 +95,6 @@
except ImportError:
_openvino_available = False

_openvino_tokenizers_available = importlib.util.find_spec("openvino_tokenizers") is not None and _openvino_available
_openvino_tokenizers_version = "N/A"
if _openvino_tokenizers_available:
try:
_openvino_tokenizers_version = importlib_metadata.version("openvino_tokenizers")
except importlib_metadata.PackageNotFoundError:
_openvino_tokenizers_available = False

if _openvino_tokenizers_available and _openvino_tokenizers_version != "N/A":
_compatible_openvino_version = next(
(
requirement.split("==")[-1]
for requirement in importlib_metadata.requires("openvino-tokenizers")
if requirement.startswith("openvino==")
),
"",
)
_openvino_tokenizers_available = _compatible_openvino_version == ov_major_version
if not _openvino_tokenizers_available:
logger.warning(
"OpenVINO Tokenizer version is not compatible with OpenVINO version. "
f"Installed OpenVINO version: {ov_major_version},"
f"OpenVINO Tokenizers requires {_compatible_openvino_version}. "
f"OpenVINO Tokenizers models will not be added during export."
)

_nncf_available = importlib.util.find_spec("nncf") is not None
_nncf_version = "N/A"
if _nncf_available:
Expand Down Expand Up @@ -196,8 +170,81 @@ def is_openvino_available():
return _openvino_available


@functools.lru_cache(1)
def is_openvino_tokenizers_available():
    """Return True if openvino_tokenizers is installed and binary compatible with OpenVINO.

    The result is cached for the lifetime of the process (lru_cache), since the
    installed-package environment does not change at runtime. On failure, an
    explanatory message with installation/upgrade instructions is logged.
    """
    # Tokenizers support is meaningless without the OpenVINO runtime itself.
    if not is_openvino_available():
        return False

    if importlib.util.find_spec("openvino_tokenizers") is None:
        logger.info(
            "OpenVINO Tokenizers is not available. To deploy models in production "
            "with C++ code, please follow installation instructions: "
            "https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#installation\n"
        )
        return False

    # Determine how OpenVINO was installed: pip release package, pip nightly
    # package, or neither (e.g. an archive installation). The nightly probe
    # deliberately overwrites the release result when both packages are present.
    try:
        pip_metadata_version = importlib_metadata.version("openvino")
    except importlib_metadata.PackageNotFoundError:
        pip_metadata_version = False
    try:
        pip_metadata_version = importlib_metadata.version("openvino-nightly")
        is_nightly = True
    except importlib_metadata.PackageNotFoundError:
        is_nightly = False

    try:
        import openvino_tokenizers

        # Loading the extension factory raises RuntimeError when the installed
        # openvino_tokenizers build is not binary compatible with the runtime.
        openvino_tokenizers._get_factory()
    except RuntimeError:
        tokenizers_version = openvino_tokenizers.__version__

        if tokenizers_version == "0.0.0.0":
            # Placeholder version in the module; fall back to pip metadata if available.
            try:
                tokenizers_version = importlib_metadata.version("openvino_tokenizers") or tokenizers_version
            except importlib_metadata.PackageNotFoundError:
                pass
        message = (
            "OpenVINO and OpenVINO Tokenizers versions are not binary compatible.\n"
            f"OpenVINO version: {_openvino_version}\n"
            f"OpenVINO Tokenizers version: {tokenizers_version}\n"
            "First 3 numbers should be the same. Update OpenVINO Tokenizers to compatible version. "
        )
        if not pip_metadata_version:
            # No pip-installed OpenVINO found: presumably an archive install,
            # so suggest building tokenizers from source.
            message += (
                "For archive installation of OpenVINO try to build OpenVINO Tokenizers from source: "
                "https://github.com/openvinotoolkit/openvino_tokenizers/tree/master?tab=readme-ov-file"
                "#build-and-install-from-source"
            )
            if sys.platform == "linux":
                message += (
                    "\nThe PyPI version of OpenVINO Tokenizers is built on CentOS and may not be compatible with other "
                    "Linux distributions; rebuild OpenVINO Tokenizers from source."
                )
        else:
            # pip-managed install: give copy-pasteable reinstall commands,
            # prepending a nightly uninstall where the deprecated
            # openvino-nightly package is present.
            message += (
                "It is recommended to use the same day builds for pre-release version. "
                "To install both OpenVINO and OpenVINO Tokenizers release version perform:\n"
            )
            if is_nightly:
                message += "pip uninstall -y openvino-nightly && "
            message += "pip install --force-reinstall openvino openvino-tokenizers\n"
            if is_nightly:
                message += (
                    "openvino-nightly package will be deprecated in the future - use pre-release drops instead. "
                )
            message += "To update both OpenVINO and OpenVINO Tokenizers to the latest pre-release version perform:\n"
            if is_nightly:
                message += "pip uninstall -y openvino-nightly && "
            message += (
                "pip install --pre -U openvino openvino-tokenizers "
                "--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
            )
        logger.warning(message)
        return False

    return True


def is_nncf_available():
Expand Down
9 changes: 2 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,8 @@
QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]

EXTRAS_REQUIRE = {
"neural-compressor": [
"neural-compressor>=2.2.0",
"onnxruntime<1.15.0",
"accelerate",
],
"openvino": ["openvino>=2023.3", "nncf>=2.8.1"],
"openvino-tokenizers": ["openvino-tokenizers[transformers]"],
"neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
"openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"],
"nncf": ["nncf>=2.8.1"],
"ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
"diffusers": ["diffusers"],
Expand Down
24 changes: 12 additions & 12 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class OVCLIExportTestCase(unittest.TestCase):
)
EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
"gpt2": 2,
"t5": 0, # failed internal sentencepiece check - no <s> token in the vocab
"t5": 0, # no .model file in the repository
"albert": 0, # not supported yet
"distilbert": 1, # no detokenizer
"roberta": 2,
Expand Down Expand Up @@ -125,26 +125,26 @@ def test_exporters_cli(self, task: str, model_type: str):
for arch in SUPPORTED_ARCHITECTURES
if not arch[0].endswith("-with-past") and not arch[1].endswith("-refiner")
)
@unittest.skipIf(not is_openvino_tokenizers_available(), reason="OpenVINO Tokenizers not available")
def test_exporters_cli_tokenizers(self, task: str, model_type: str):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should remove:

- @unittest.skipIf(not is_openvino_tokenizers_available(), reason="OpenVINO Tokenizers not available")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed it and added logic checks for the "Test openvino-nightly" stage, where the libraries are not compatible.

with TemporaryDirectory() as tmpdir:
output = subprocess.check_output(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should also check whether no error was raised here

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With

optimum-cli export openvino --model hf-internal-testing/tiny-random-t5 --task text2text-generation ov_model

I'm getting the following error :

OpenVINO Tokenizer export for T5TokenizerFast is not supported. Exception: [Errno 2] No such file or directory: '/tmp/tmprj8zsg44/spiece.model'

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would like to have this fixed before making the tokenizer export default / merging this PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not a bug, we don't support the Unigram model from tokenizer.json yet, only from sentencepiece model file. If you use a repository that has such a file, the tokenizer will be converted.

I added one more check for the vocab file here: openvinotoolkit/openvino_tokenizers#116
This will transform

OpenVINO Tokenizer export for T5TokenizerFast is not supported. Exception: [Errno 2] No such file or directory: '/tmp/tmprj8zsg44/spiece.model'

into:

OpenVINO Tokenizer export for T5TokenizerFast is not supported. Exception: Cannot convert tokenizer of this type without `.model` file.

The issue is that the original tokenizer has info about .model file in the tokenizer object, but it does not have it in practice:
image

I would argue that this is an edge case of a test repo and most tokenizers with info about the .model file have it, so it should not block the merge.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's reasonable for not all cases to be supported, but for these cases we should disable the tokenizer export and not throw an error as it won't be expected by users and could be a bit confusing in my opinion

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we either disable this warning or disable export for unsupported cases ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This warning is for any exception that appeared during conversion. After seeing this message, the user can create an issue for a particular model/tokenizer support.
I also prefer to tell the user that the tokenizer is not converted, rather than silently not including it without warning, so the lack of a (de)tokenizer model won't be a surprise.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd agree if the export of the tokenizer were an explicit choice of the user (the current --convert-tokenizer), but as this PR makes it the default, I think such a warning can be an issue, as it makes it look like the export failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the log level to debug, the message won't be visible by default.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks @apaniukov

f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --convert-tokenizer --task {task} {tmpdir}",
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}",
shell=True,
stderr=subprocess.STDOUT,
).decode()
save_dir = Path(tmpdir)
number_of_tokenizers = sum("tokenizer" in file for file in map(str, save_dir.rglob("*.xml")))
self.assertEqual(
self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type],
number_of_tokenizers,
f"OVT: {is_openvino_tokenizers_available() }",
)
if not is_openvino_tokenizers_available():
self.assertTrue(
"OpenVINO Tokenizers is not available." in output
or "OpenVINO and OpenVINO Tokenizers versions are not binary compatible." in output,
msg=output,
)
return
Comment on lines +135 to +141
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would prefer to have it removed to make sure this is always tested (we should always have a compatible version of openvino / openvino-tokenizer when testing)

Suggested change
if not is_openvino_tokenizers_available():
self.assertTrue(
"OpenVINO Tokenizers is not available." in output
or "OpenVINO and OpenVINO Tokenizers versions are not binary compatible." in output,
msg=output,
)
return

Copy link
Contributor Author

@apaniukov apaniukov Apr 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is added because of this test:

- name: Test openvino-nightly

It deletes openvino and installs openvino-nightly, which should be incompatible with the installed tokenizer version, replicating the incompatibility scenario.


number_of_tokenizers = sum("tokenizer" in file for file in map(str, Path(tmpdir).rglob("*.xml")))
self.assertEqual(self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type], number_of_tokenizers, output)

if number_of_tokenizers == 1:
self.assertTrue("Detokenizer is not supported, convert tokenizer only." in output, output)
elif number_of_tokenizers == 0 and task not in ("image-classification", "audio-classification"):
self.assertTrue(("OpenVINO Tokenizer export for" in output and "is not supported." in output), output)

@parameterized.expand(SUPPORTED_ARCHITECTURES)
def test_exporters_cli_fp16(self, task: str, model_type: str):
Expand Down
Loading