feat: Async tokenizer (#86)
* feat: support async, wip

* feat: fix and add tests, examples, update readme

* fix: poetry lock

* fix: anyio -> aiofiles

* fix: try 3.8

* fix: remove 3.7 from tests

* fix: poetry lock

* fix: add 3.7 back

* fix: poetry lock

* fix: poetry.lock

* ci: pipenv

* fix: pipenv

* fix: pipenv

* fix: pyproject

* fix: lock

* fix: version

* fix: Removed aiofiles

* ci: update python version,

* fix: switch from aiofiles to anyio, remove redundant comments

* chore: poetry lock

* fix: disable initializing async classes directly, cr comments

* test: fix import

* ci: add asyncio-mode to test workflow

* fix: to_thread -> run_in_executor

* ci: add asyncio

* fix: cr comments

* fix: cr comments

---------

Co-authored-by: asafg <asafg@ai21.com>
miri-bar and asafgardin authored Jun 18, 2024
1 parent 1178ba7 commit 3006cda
Showing 20 changed files with 1,209 additions and 289 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml

@@ -65,7 +65,7 @@ jobs:
           poetry install --no-root --without dev
       - name: Run Tests
         run: |
-          poetry run pytest
+          poetry run pytest --asyncio-mode=auto
       - name: Upload pytest test results
         uses: actions/upload-artifact@v3
         with:
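For context, `--asyncio-mode=auto` tells pytest-asyncio to collect plain `async def` tests as asyncio tests without per-test markers. A minimal sketch of such a test, using a hypothetical file and test name that are not part of this commit, might look like:

```python
# test_async_roundtrip.py: hypothetical illustration of a test picked up under --asyncio-mode=auto
from ai21_tokenizer import PreTrainedTokenizers, Tokenizer


async def test_async_encode_decode_roundtrip():
    # No @pytest.mark.asyncio marker is needed in auto mode.
    # Factory call as documented in the README below; add `await` if your
    # version exposes it as a coroutine.
    tokenizer = Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)

    ids = await tokenizer.encode("apple orange banana")
    assert isinstance(ids, list)
    assert await tokenizer.decode(ids)
```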
82 changes: 82 additions & 0 deletions README.md

@@ -35,6 +35,46 @@ poetry add ai21-tokenizer

### Tokenizer Creation

### Jamba Tokenizer

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
# Your code here
```

Another way would be to use our Jamba tokenizer directly:

```python
from ai21_tokenizer import JambaInstructTokenizer

model_path = "<Path to your vocabs file>"
tokenizer = JambaInstructTokenizer(model_path=model_path)
# Your code here
```

#### Async usage

```python
from ai21_tokenizer import Tokenizer, PreTrainedTokenizers

tokenizer = Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)
# Your code here
```

Another way would be to use our async Jamba tokenizer's `create` class method:

```python
from ai21_tokenizer import AsyncJambaInstructTokenizer

model_path = "<Path to your vocabs file>"
tokenizer = AsyncJambaInstructTokenizer.create(model_path=model_path)
# Your code here
```
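Either creation path can then be driven from an event loop. Below is a minimal end-to-end sketch, assuming the factory call behaves as in the snippet above and that `encode`/`decode` are awaitable as documented later in this README:

```python
import asyncio

from ai21_tokenizer import PreTrainedTokenizers, Tokenizer


async def main() -> None:
    # Factory call as shown above; add `await` if your version returns a coroutine here.
    tokenizer = Tokenizer.get_async_tokenizer(PreTrainedTokenizers.JAMBA_INSTRUCT_TOKENIZER)

    encoded = await tokenizer.encode("apple orange banana")
    decoded = await tokenizer.decode(encoded)
    print(f"Encoded: {encoded}")
    print(f"Decoded: {decoded}")


asyncio.run(main())
```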

### J2 Tokenizer

```python
from ai21_tokenizer import Tokenizer

@@ -52,6 +92,26 @@
config = {} # "dictionary object of your config.json file"
tokenizer = JurassicTokenizer(model_path=model_path, config=config)
```

#### Async usage

```python
from ai21_tokenizer import Tokenizer

tokenizer = Tokenizer.get_async_tokenizer()
# Your code here
```

Another way would be to use our async Jurassic tokenizer's `create` class method:

```python
from ai21_tokenizer import AsyncJurassicTokenizer

model_path = "<Path to your vocabs file. This is usually a binary file that ends with .model>"
config = {} # "dictionary object of your config.json file"
tokenizer = AsyncJurassicTokenizer.create(model_path=model_path, config=config)
# Your code here
```

### Functions

#### Encode and Decode

@@ -67,6 +127,18 @@

```python
decoded_text = tokenizer.decode(encoded_text)
print(f"Decoded text: {decoded_text}")
```

#### Async

```python
# Assuming you have created an async tokenizer
text_to_encode = "apple orange banana"
encoded_text = await tokenizer.encode(text_to_encode)
print(f"Encoded text: {encoded_text}")

decoded_text = await tokenizer.decode(encoded_text)
print(f"Decoded text: {decoded_text}")
```

#### What if you want to convert your tokens to ids or vice versa?

@@ -76,4 +148,14 @@

```python
print(f"IDs corresponds to Tokens: {tokens}")

ids = tokenizer.convert_tokens_to_ids(tokens)
```

#### Async

```python
# Assuming you have created an async tokenizer
tokens = await tokenizer.convert_ids_to_tokens(encoded_text)
print(f"IDs corresponds to Tokens: {tokens}")

ids = await tokenizer.convert_tokens_to_ids(tokens)
```

**For more examples, please see our [examples](examples) folder.**
9 changes: 6 additions & 3 deletions ai21_tokenizer/__init__.py

@@ -1,6 +1,6 @@
-from ai21_tokenizer.base_tokenizer import BaseTokenizer
-from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer
-from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer
+from ai21_tokenizer.base_tokenizer import BaseTokenizer, AsyncBaseTokenizer
+from ai21_tokenizer.jamba_instruct_tokenizer import JambaInstructTokenizer, AsyncJambaInstructTokenizer
+from ai21_tokenizer.jurassic_tokenizer import JurassicTokenizer, AsyncJurassicTokenizer
 from ai21_tokenizer.tokenizer_factory import TokenizerFactory as Tokenizer, PreTrainedTokenizers
 from .version import VERSION

@@ -9,8 +9,11 @@
 __all__ = [
     "Tokenizer",
     "JurassicTokenizer",
+    "AsyncJurassicTokenizer",
     "BaseTokenizer",
+    "AsyncBaseTokenizer",
     "__version__",
     "PreTrainedTokenizers",
     "JambaInstructTokenizer",
+    "AsyncJambaInstructTokenizer",
 ]
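With these exports in place, the new async classes can be imported directly from the package root, for example:

```python
from ai21_tokenizer import (
    AsyncBaseTokenizer,
    AsyncJambaInstructTokenizer,
    AsyncJurassicTokenizer,
)
```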
48 changes: 48 additions & 0 deletions ai21_tokenizer/base_jamba_instruct_tokenizer.py

@@ -0,0 +1,48 @@
from __future__ import annotations

import os
import tempfile
from pathlib import Path
from typing import List, Union, Optional
from abc import ABC, abstractmethod

from tokenizers import Tokenizer

from ai21_tokenizer.file_utils import PathLike

_TOKENIZER_FILE = "tokenizer.json"
_DEFAULT_MODEL_CACHE_DIR = Path(tempfile.gettempdir()) / "jamba_instruct"


class BaseJambaInstructTokenizer(ABC):
    _tokenizer: Optional[Tokenizer] = None

    @abstractmethod
    def _load_from_cache(self, cache_file: Path) -> Tokenizer:
        pass

    def _is_cached(self, cache_dir: PathLike) -> bool:
        return Path(cache_dir).exists() and _TOKENIZER_FILE in os.listdir(cache_dir)

    def _cache_tokenizer(self, tokenizer: Tokenizer, cache_dir: PathLike) -> None:
        # create cache directory for caching the tokenizer and save it
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(cache_dir / _TOKENIZER_FILE))

    def _encode(self, text: str, **kwargs) -> List[int]:
        return self._tokenizer.encode(text, **kwargs).ids

    def _decode(self, token_ids: List[int], **kwargs) -> str:
        return self._tokenizer.decode(token_ids, **kwargs)

    def _convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if isinstance(tokens, str):
            return self._tokenizer.token_to_id(tokens)

        return [self._tokenizer.token_to_id(token) for token in tokens]

    def _convert_ids_to_tokens(self, token_ids: Union[int, List[int]]) -> Union[str, List[str]]:
        if isinstance(token_ids, int):
            return self._tokenizer.id_to_token(token_ids)

        return [self._tokenizer.id_to_token(token_id) for token_id in token_ids]
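The base class leaves only `_load_from_cache` abstract and keeps the synchronous helpers shared by the concrete tokenizers. Purely as an illustration, and not code from this commit, a hypothetical subclass could wire caching and loading together like this:

```python
from pathlib import Path
from typing import List

from tokenizers import Tokenizer

from ai21_tokenizer.base_jamba_instruct_tokenizer import (
    _DEFAULT_MODEL_CACHE_DIR,
    _TOKENIZER_FILE,
    BaseJambaInstructTokenizer,
)


class LocalJambaTokenizer(BaseJambaInstructTokenizer):
    """Hypothetical subclass: reuse the cached tokenizer if present, otherwise load a local tokenizer.json."""

    def __init__(self, model_path: str, cache_dir: Path = _DEFAULT_MODEL_CACHE_DIR):
        if self._is_cached(cache_dir):
            self._tokenizer = self._load_from_cache(cache_dir / _TOKENIZER_FILE)
        else:
            self._tokenizer = Tokenizer.from_file(model_path)
            self._cache_tokenizer(self._tokenizer, cache_dir)

    def _load_from_cache(self, cache_file: Path) -> Tokenizer:
        return Tokenizer.from_file(str(cache_file))

    def encode(self, text: str) -> List[int]:
        return self._encode(text)

    def decode(self, token_ids: List[int]) -> str:
        return self._decode(token_ids)
```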
