diff --git a/README.md b/README.md
index 34e180a..dec6cad 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,7 @@
 # AI21 Labs Tokenizer
 
----
-
 ## Installation
 
----
-
 ### pip
 
 ```bash
@@ -20,8 +16,6 @@ poetry add ai21-tokenizer
 
 ## Usage
 
----
-
 ### Tokenizer Creation
 
 ```python
@@ -41,4 +35,28 @@ config = {} # "dictionary object of your config.json file"
 tokenizer = JurassicTokenizer(model_path=model_path, config=config)
 ```
 
+### Functions
+
+#### Encode and Decode
+
+These functions allow you to encode your text to a list of token ids and back to plaintext
+
+```python
+text_to_encode = "apple orange banana"
+encoded_text = tokenizer.encode(text_to_encode)
+print(f"Encoded text: {encoded_text}")
+
+decoded_text = tokenizer.decode(encoded_text)
+print(f"Decoded text: {decoded_text}")
+```
+
+#### What if you want to convert your tokens to ids or vice versa?
+
+```python
+tokens = tokenizer.convert_ids_to_tokens(encoded_text)
+print(f"IDs correspond to Tokens: {tokens}")
+
+ids = tokenizer.convert_tokens_to_ids(tokens)
+```
+
 **For more examples, please see our [examples](examples) folder.**
diff --git a/ai21_tokenizer/base_tokenizer.py b/ai21_tokenizer/base_tokenizer.py
index 78c4514..6cb57fd 100644
--- a/ai21_tokenizer/base_tokenizer.py
+++ b/ai21_tokenizer/base_tokenizer.py
@@ -4,23 +4,75 @@
 
 
 class BaseTokenizer(ABC):
+    """
+    Base class for tokenizers.
+
+    This class defines the interface for tokenization operations such as encoding, decoding,
+    converting tokens to IDs, and converting IDs to tokens.
+    """
+
     @abstractmethod
     def encode(self, text: str, **kwargs) -> List[int]:
+        """
+        Encodes the given text into a list of token IDs.
+
+        Args:
+            text (str): The input text to be encoded.
+            **kwargs: Additional keyword arguments for encoding.
+
+        Returns:
+            List[int]: The list of token IDs representing the encoded text.
+        """
         pass
 
     @abstractmethod
     def decode(self, token_ids: List[int], **kwargs) -> str:
+        """
+        Decodes the given list of token IDs into a string.
+
+        Args:
+            token_ids (List[int]): The list of token IDs to be decoded.
+            **kwargs: Additional keyword arguments for decoding.
+
+        Returns:
+            str: The decoded string.
+        """
         pass
 
     @abstractmethod
     def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
+        """
+        Converts the given tokens into token IDs.
+
+        Args:
+            tokens (Union[str, List[str]]): The input tokens to be converted.
+
+        Returns:
+            Union[int, List[int]]: The token IDs representing the input tokens.
+        """
         pass
 
     @abstractmethod
     def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
+        """
+        Converts the given token IDs into tokens.
+
+        Args:
+            token_ids (Union[int, List[int]]): The input token IDs to be converted.
+            **kwargs: Additional keyword arguments for conversion.
+
+        Returns:
+            Union[str, List[str]]: The tokens representing the input token IDs.
+        """
         pass
 
     @property
     @abstractmethod
     def vocab_size(self) -> int:
+        """
+        Returns the size of the vocabulary.
+
+        Returns:
+            int: The size of the vocabulary.
+        """
         pass