fix: Examples in readme (#55)
* ci: workflow dispatch for release

* docs: Updated readme with more examples

* docs: Added docs to base class
asafgardin authored Nov 23, 2023
1 parent dbf5609 commit 94f3a3c
Showing 2 changed files with 76 additions and 6 deletions.
README.md (24 additions, 6 deletions)
# AI21 Labs Tokenizer

## Installation

### pip

```bash
pip install ai21-tokenizer
```

### poetry

```bash
poetry add ai21-tokenizer
```

## Usage

### Tokenizer Creation

```python
from ai21_tokenizer import JurassicTokenizer

model_path = "<path-to-your-model-files>"  # placeholder: point this at your local model directory
config = {}  # dictionary object of your config.json file
tokenizer = JurassicTokenizer(model_path=model_path, config=config)
```
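
If you do not need to point at your own model files, the library can also set itself up with its defaults. A minimal sketch, assuming the `Tokenizer.get_tokenizer()` factory exported by `ai21_tokenizer`:

```python
from ai21_tokenizer import Tokenizer

# Create a tokenizer backed by the library's bundled default model
tokenizer = Tokenizer.get_tokenizer()
```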

### Functions

#### Encode and Decode

These functions let you encode your text into a list of token IDs and decode it back into plaintext.

```python
text_to_encode = "apple orange banana"
encoded_text = tokenizer.encode(text_to_encode)
print(f"Encoded text: {encoded_text}")

decoded_text = tokenizer.decode(encoded_text)
print(f"Decoded text: {encoded_text}")
```

#### Converting Tokens to IDs and Vice Versa

```python
tokens = tokenizer.convert_ids_to_tokens(encoded_text)
print(f"IDs corresponds to Tokens: {tokens}")

ids = tokenizer.convert_tokens_to_ids(tokens)
print(f"Tokens correspond to IDs: {ids}")
```
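
The two conversions are inverses of each other. As a quick sanity check (a sketch building on the variables above, assuming the ID-to-token mapping round-trips exactly), the recovered IDs should match the original encoding:

```python
# ids -> tokens -> ids should land back on the original token IDs
assert ids == encoded_text
```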

**For more examples, please see our [examples](examples) folder.**
ai21_tokenizer/base_tokenizer.py (52 additions)
from abc import ABC, abstractmethod
from typing import List, Union


class BaseTokenizer(ABC):
    """
    Base class for tokenizers.

    This class defines the interface for tokenization operations such as encoding, decoding,
    converting tokens to IDs, and converting IDs to tokens.
    """

    @abstractmethod
    def encode(self, text: str, **kwargs) -> List[int]:
        """
        Encodes the given text into a list of token IDs.

        Args:
            text (str): The input text to be encoded.
            **kwargs: Additional keyword arguments for encoding.

        Returns:
            List[int]: The list of token IDs representing the encoded text.
        """
        pass

    @abstractmethod
    def decode(self, token_ids: List[int], **kwargs) -> str:
        """
        Decodes the given list of token IDs into a string.

        Args:
            token_ids (List[int]): The list of token IDs to be decoded.
            **kwargs: Additional keyword arguments for decoding.

        Returns:
            str: The decoded string.
        """
        pass

    @abstractmethod
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts the given tokens into token IDs.

        Args:
            tokens (Union[str, List[str]]): The input tokens to be converted.

        Returns:
            Union[int, List[int]]: The token IDs representing the input tokens.
        """
        pass

    @abstractmethod
    def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
        """
        Converts the given token IDs into tokens.

        Args:
            token_ids (Union[int, List[int]]): The input token IDs to be converted.
            **kwargs: Additional keyword arguments for conversion.

        Returns:
            Union[str, List[str]]: The tokens representing the input token IDs.
        """
        pass

    @property
    @abstractmethod
    def vocab_size(self) -> int:
        """
        Returns the size of the vocabulary.

        Returns:
            int: The number of tokens in the vocabulary.
        """
        pass
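
To make the contract above concrete, here is a minimal sketch of a toy subclass. `WhitespaceTokenizer` is hypothetical (not part of the library) and simply splits on whitespace over a fixed vocabulary:

```python
from typing import Dict, List, Union


class WhitespaceTokenizer(BaseTokenizer):
    """Toy tokenizer: splits on whitespace over a fixed, known vocabulary."""

    def __init__(self, vocab: List[str]):
        self._token_to_id: Dict[str, int] = {tok: i for i, tok in enumerate(vocab)}
        self._id_to_token: Dict[int, str] = {i: tok for tok, i in self._token_to_id.items()}

    def encode(self, text: str, **kwargs) -> List[int]:
        # Split on whitespace and look each token up in the vocabulary
        return [self._token_to_id[tok] for tok in text.split()]

    def decode(self, token_ids: List[int], **kwargs) -> str:
        return " ".join(self._id_to_token[i] for i in token_ids)

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        if isinstance(tokens, str):
            return self._token_to_id[tokens]
        return [self._token_to_id[tok] for tok in tokens]

    def convert_ids_to_tokens(self, token_ids: Union[int, List[int]], **kwargs) -> Union[str, List[str]]:
        if isinstance(token_ids, int):
            return self._id_to_token[token_ids]
        return [self._id_to_token[i] for i in token_ids]

    @property
    def vocab_size(self) -> int:
        return len(self._token_to_id)


# Usage (hypothetical):
#   tok = WhitespaceTokenizer(["apple", "orange", "banana"])
#   tok.encode("apple banana")           -> [0, 2]
#   tok.convert_ids_to_tokens([0, 2])    -> ["apple", "banana"]
```

Any real subclass, such as the library's `JurassicTokenizer`, implements these same five members against an actual trained vocabulary.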
