📝 overhaul of the documentation, now 4.5x bigger (better?) #144

Merged
merged 9 commits · Jan 21, 2025
4 changes: 3 additions & 1 deletion .github/workflows/doc-build.yml
@@ -53,7 +53,9 @@ jobs:
- name: Make documentation
shell: bash
run: |
doc-builder notebook-to-mdx examples/ --output_dir docs/source/howto/ --open_notebook_prefix https://colab.research.google.com/github/huggingface/optimum-tpu/blob/main
python docs/scripts/auto-generate-examples.py
doc-builder build optimum.tpu docs/source/ --repo_name optimum-tpu --build_dir tpu-doc-build/ --version ${{ env.VERSION }} --version_tag_suffix "" --html --clean
cd tpu-doc-build/
mv optimum.tpu optimum-tpu
doc-builder push optimum-tpu --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit $COMMIT_SHA See: https://github.com/huggingface/optimum-tpu/commit/$COMMIT_SHA" --n_retries 5
2 changes: 2 additions & 0 deletions .github/workflows/doc-pr-build.yml
@@ -38,6 +38,8 @@ jobs:
- name: Make documentation
shell: bash
run: |
doc-builder notebook-to-mdx examples/ --output_dir docs/source/howto/ --open_notebook_prefix https://colab.research.google.com/github/huggingface/optimum-tpu/blob/main
python docs/scripts/auto-generate-examples.py
doc-builder build optimum.tpu docs/source/ --repo_name optimum-tpu --build_dir tpu-doc-build/ --version pr_${{ env.PR_NUMBER }} --version_tag_suffix "" --html --clean

- name: Save commit_sha & pr_number
5 changes: 4 additions & 1 deletion .gitignore
@@ -135,4 +135,7 @@ dmypy.json
.vscode
.idea/

jetstream-pt-deps

# Optimum TPU artifacts
tpu-doc-build/
3 changes: 3 additions & 0 deletions Makefile
@@ -117,3 +117,6 @@ tgi_test: test_installs tgi_server
tgi_docker_test:
python -m pip install -r text-generation-inference/integration-tests/requirements.txt
python -m pytest -sv text-generation-inference/integration-tests

preview_doc:
doc-builder preview optimum-tpu docs/source --not_python_module
48 changes: 48 additions & 0 deletions docs/scripts/auto-generate-examples.py
@@ -0,0 +1,48 @@
import os
import sys
import yaml

# Check that both files exist
examples_file = 'docs/scripts/examples_list.yml'
toctree_file = 'docs/source/_toctree.yml'

if not os.path.exists(examples_file):
print(f"Error: {examples_file} does not exist")
sys.exit(1)

if not os.path.exists(toctree_file):
print(f"Error: {toctree_file} does not exist")
sys.exit(1)

# Read the examples list
with open(examples_file, 'r') as f:
examples = yaml.safe_load(f)

# Read the main toctree
with open(toctree_file, 'r') as f:
toc = yaml.safe_load(f)

# Find the howto section and insert before more_examples
# Iterate through the list to find the sections with howto
for item in toc:
if isinstance(item, dict) and 'sections' in item:
for section in item['sections']:
if isinstance(section, dict) and 'sections' in section:
howto_items = section['sections']
for i, subitem in enumerate(howto_items):
if subitem.get('local') == 'howto/more_examples':
# Insert the new examples before this position
for example in reversed(examples):
howto_items.insert(i, example)
break

# Write back the modified toctree
with open(toctree_file, 'w') as f:
yaml.dump(toc, f, sort_keys=False, allow_unicode=True, default_flow_style=False)

print("Added examples to the howto section of the toctree")

# Print the updated toctree contents
with open(toctree_file, 'r') as f:
print("\nUpdated _toctree.yml contents:")
print(f.read())
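The splice performed by the loop above can be checked on a small in-memory example. This is an illustrative sketch with hypothetical toctree data (no YAML files needed), not part of the script itself:

```python
# Minimal in-memory model of the _toctree.yml structure the script walks.
toc = [
    {
        "title": "Optimum-TPU",
        "sections": [
            {
                "title": "How-To Guides",
                "sections": [
                    {"local": "howto/overview", "title": "Overview"},
                    {"local": "howto/more_examples", "title": "More Examples"},
                ],
            }
        ],
    }
]
examples = [
    {"local": "howto/gemma_tuning", "title": "Gemma Fine-Tuning Example"},
    {"local": "howto/llama_tuning", "title": "Llama Fine-Tuning Example"},
]

# Same insertion logic as the script: splice the examples immediately
# before the 'howto/more_examples' entry, preserving their order.
for item in toc:
    if isinstance(item, dict) and "sections" in item:
        for section in item["sections"]:
            if isinstance(section, dict) and "sections" in section:
                howto_items = section["sections"]
                for i, subitem in enumerate(howto_items):
                    if subitem.get("local") == "howto/more_examples":
                        for example in reversed(examples):
                            howto_items.insert(i, example)
                        break

locals_in_order = [e["local"] for e in toc[0]["sections"][0]["sections"]]
# → ['howto/overview', 'howto/gemma_tuning', 'howto/llama_tuning', 'howto/more_examples']
```

Iterating the examples in reverse while inserting at a fixed index is what keeps them in their original order in the final toctree.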
4 changes: 4 additions & 0 deletions docs/scripts/examples_list.yml
@@ -0,0 +1,4 @@
- local: howto/gemma_tuning
title: Gemma Fine-Tuning Example
- local: howto/llama_tuning
title: Llama Fine-Tuning Example
42 changes: 36 additions & 6 deletions docs/source/_toctree.yml
@@ -3,19 +3,49 @@
title: 🤗 Optimum-TPU
- local: supported-architectures
title: Supported Models
- local: installation
title: Installation
- local: optimum_container
title: Optimum TPU Containers
- sections:
- local: tutorials/overview
title: Overview
- local: tutorials/tpu_setup
title: First TPU Setup on Google Cloud
- local: tutorials/inference_on_tpu
title: First TPU Inference on Google Cloud
- local: tutorials/training_on_tpu
title: First TPU Training on Google Cloud
title: Tutorials
- sections:
- local: howto/overview
title: Overview
- local: howto/deploy
title: Deploying a Google Cloud TPU instance
- local: howto/gcloud_cli
title: Deploying and Connecting to Google TPU Instances via GCloud CLI
- local: howto/serving
title: Deploying a TGI server on a Google Cloud TPU instance
- local: howto/training
title: Training on a Google Cloud TPU instance
- local: howto/deploy_instance_on_ie
title: How to Deploy a Model on Inference Endpoint for Serving using TPUs
- local: howto/advanced-tgi-serving
title: Advanced TGI Server Configuration
- local: howto/installation_inside_a_container
title: Installing Optimum-TPU inside a Docker Container
- local: howto/more_examples
title: Find More Examples on the Optimum-TPU GitHub Repository
title: How-To Guides
- sections:
- local: conceptual_guides/tpu_hardware_support
title: TPU Hardware Support
- local: conceptual_guides/difference_between_jetstream_and_xla
title: Difference between Jetstream Pytorch and Pytorch XLA
title: Conceptual Guides
- sections:
- local: reference/fsdp_v2
title: FSDPv2
- local: reference/tgi_advanced_options
title: TGI Configuration Reference Guide
title: Reference
- sections:
- local: contributing
title: Contributing to Optimum TPU
title: Contributing
title: Optimum-TPU
isExpanded: true
@@ -0,0 +1,23 @@
# Differences between Jetstream Pytorch and PyTorch XLA

This guide explains the difference between Jetstream Pytorch and PyTorch XLA, the two backends available for TGI in optimum-tpu.

JetStream PyTorch is a high-performance inference engine built on top of PyTorch XLA. It is optimized for throughput and memory efficiency when running Large Language Models (LLMs) on TPUs.

| Feature | Jetstream Pytorch | PyTorch XLA |
|---------|-----------|-------------|
| Training | ❌ | ✅ |
| Serving | ✅ | ✅ |
| Performance | Higher serving performance | Standard performance |
| Flexibility | Limited to serving | Full PyTorch ecosystem |
| Use Case | Production inference | Development and training |
| Integration | Optimized for deployment | Standard PyTorch workflow |

**Notes:**
By default, optimum-tpu uses PyTorch XLA for training and Jetstream Pytorch for serving.

You can configure optimum-tpu to use either backend for serving with TGI: to use the PyTorch XLA backend, set `-e JETSTREAM_PT_DISABLE=1` in your `docker run` arguments.
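The backend switch controlled by `JETSTREAM_PT_DISABLE` can be sketched as follows. This is an illustrative reimplementation of the selection rule described above, not optimum-tpu's actual code:

```python
# Sketch of the serving-backend selection: Jetstream Pytorch by default,
# PyTorch XLA when JETSTREAM_PT_DISABLE=1 is present in the environment.
def select_backend(env: dict) -> str:
    if env.get("JETSTREAM_PT_DISABLE") == "1":
        return "pytorch-xla"
    return "jetstream-pytorch"

print(select_backend({}))                             # jetstream-pytorch
print(select_backend({"JETSTREAM_PT_DISABLE": "1"}))  # pytorch-xla
```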

You can find more information about:
- PyTorch XLA: https://pytorch.org/xla/ and https://github.com/pytorch/xla
- Jetstream Pytorch: https://github.com/AI-Hypercomputer/jetstream-pytorch
36 changes: 36 additions & 0 deletions docs/source/conceptual_guides/tpu_hardware_support.mdx
@@ -0,0 +1,36 @@
# TPU hardware support
Optimum-TPU supports and is optimized for v5e and v6e TPUs.

## TPU naming convention
The TPU naming follows this format: `<tpu_version>-<number_of_tpus>`

TPU version:
- v5litepod (v5e)
- v6e

For example, a v5litepod-8 is a v5e TPU with 8 TPU chips.

## Memory on TPU
The HBM (High Bandwidth Memory) capacity per chip is 16GB for v5e and 32GB for v6e. So a v5e-8 (v5litepod-8) has 16GB × 8 = 128GB of HBM memory.
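The per-pod arithmetic above can be captured in a small helper. This is a back-of-the-envelope sketch using the per-chip figures stated in this guide (the function name is illustrative, not an optimum-tpu API):

```python
# Per-chip HBM capacity in GB, as stated above.
HBM_PER_CHIP_GB = {"v5e": 16, "v6e": 32}

def pod_hbm_gb(tpu_version: str, num_chips: int) -> int:
    """Total HBM for a pod slice, e.g. a v5litepod-8 -> 128 GB."""
    return HBM_PER_CHIP_GB[tpu_version] * num_chips

print(pod_hbm_gb("v5e", 8))  # 128, the v5litepod-8 example above
print(pod_hbm_gb("v6e", 8))  # 256
```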


## Recommended Runtime for TPU

When creating the TPU VM, use one of the following base images for optimum-tpu:
- v2-alpha-tpuv6e (TPU v6e) (recommended)
- v2-alpha-tpuv5 (TPU v5p) (recommended)
- v2-alpha-tpuv5-lite (TPU v5e) (recommended)
- tpu-ubuntu2204-base (default)

For installation instructions, refer to our [TPU setup tutorial](../tutorials/tpu_setup). We recommend using the *alpha* images with optimum-tpu, as optimum-tpu is tested and optimized on those.

More information at https://cloud.google.com/tpu/docs/runtimes#pytorch_and_jax

## Next steps
For more information on the different TPU versions, see:
- https://cloud.google.com/tpu/docs/v6e
- https://cloud.google.com/tpu/docs/v5p
- https://cloud.google.com/tpu/docs/v5e

Pricing information can be found at https://cloud.google.com/tpu/pricing, and TPU availability at https://cloud.google.com/tpu/docs/regions-zones.
86 changes: 86 additions & 0 deletions docs/source/contributing.mdx
@@ -0,0 +1,86 @@
# Contributing to Optimum TPU

We're excited that you're interested in contributing to Optimum TPU! Whether you're fixing bugs, adding new features, improving documentation, or sharing your experiences, your contributions are highly valued 😄

## Getting Started

1. [Fork](https://github.com/huggingface/optimum-tpu/fork) and clone the repository:
```bash
git clone https://github.com/YOUR_USERNAME/optimum-tpu.git
cd optimum-tpu
```

2. Install the package locally:
```bash
python -m venv .venv
source .venv/bin/activate
python -m pip install . -f https://storage.googleapis.com/libtpu-releases/index.html
```

## Development Tools

The project includes a comprehensive Makefile with commands for various development tasks:

### Testing
```bash
make tests # Run all the non-TGI-related tests
make tgi_test # Run TGI tests with PyTorch/XLA
make tgi_test_jetstream # Run TGI tests with Jetstream backend
make tgi_docker_test # Run TGI integration tests in Docker
```

### Code Quality
```bash
make style # Auto-fix code style issues
make style_check # Check code style without fixing
```

### Documentation
```bash
make preview_doc # Preview documentation locally
```

### Docker Images
```bash
make tpu-tgi # Build TGI Docker image
make tpu-tgi-ie # Build TGI inference endpoint image
make tpu-tgi-gcp # Build TGI Google Cloud image
```

### TGI Development
When working on Text Generation Inference (`/text-generation-inference` folder), you might also want to build a TGI image from scratch. To do this, refer to the manual image building section of the [serving how-to guide](./howto/serving).

1. Build the standalone server:
```bash
make tgi_server
```

## Pull Request Process

1. Create a new branch:
```bash
git checkout -b your-feature-name
```

2. Make your changes

3. Run tests:
```bash
make tests
# Run more specialized tests if needed, e.g. make tgi_test, make tgi_test_jetstream, or make tgi_docker_test
make style_check
```

4. Submit your PR with:
- Clear description of changes
- Test results
- Documentation updates if needed

## Need Help?

- Check the [documentation](https://huggingface.co/docs/optimum/tpu/overview)
- Open an issue for bugs or feature requests

## License

By contributing to Optimum TPU, you agree that your contributions will be licensed under the Apache License, Version 2.0.
60 changes: 60 additions & 0 deletions docs/source/howto/advanced-tgi-serving.mdx
@@ -0,0 +1,60 @@
# Advanced TGI Server Configuration

## Jetstream Pytorch and Pytorch XLA backends

[Jetstream Pytorch](https://github.com/AI-Hypercomputer/jetstream-pytorch) is a highly optimized Pytorch engine for serving LLMs on Cloud TPU. This engine is selected by default if the dependency is available.

We recommend using Jetstream with TGI for the best performance. If for some reason you want to use the Pytorch/XLA backend instead, you can set the `JETSTREAM_PT_DISABLE=1` environment variable.

For more information, see our discussion on the [difference between Jetstream and PyTorch XLA](../conceptual_guides/difference_between_jetstream_and_xla).

## Quantization
When using the Jetstream Pytorch engine, you can enable quantization to reduce the memory footprint and increase throughput. To enable it, set the `QUANTIZATION=1` environment variable. For instance, on a 2x4 TPU v5e (16GB per chip × 8 = 128GB per pod), you can serve models of up to 70B parameters, such as Llama 3.3-70B. Quantization is done in `int8` on the fly as the weights load. As with any quantization option, you can expect a small drop in model accuracy. Without quantization enabled, the model is served in bf16.
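A rough estimate of weight memory shows why int8 makes a ~70B-parameter model fit on that 128GB pod. This is back-of-the-envelope arithmetic for illustration only (weights only, ignoring KV cache and activations), not an optimum-tpu utility:

```python
GIB = 1024 ** 3

def weight_memory_gib(n_params: float, bytes_per_param: int) -> float:
    """Memory needed just to hold the model weights, in GiB."""
    return n_params * bytes_per_param / GIB

bf16 = weight_memory_gib(70e9, 2)  # ~130 GiB: bf16 weights alone exceed 128 GB of HBM
int8 = weight_memory_gib(70e9, 1)  # ~65 GiB: fits, leaving room for the KV cache
```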

## How to solve memory requirements

If you encounter `Backend(NotEnoughMemory(2048))`, here are some solutions that could help with reducing memory usage in TGI:

**Optimum-TPU specific arguments:**
- `-e QUANTIZATION=1`: To enable quantization. This should reduce memory requirements by almost half
- `-e MAX_BATCH_SIZE=n`: You can manually reduce the batch size

**TGI specific arguments:**
- `--max-input-length`: Maximum input sequence length
- `--max-total-tokens`: Maximum combined input and output tokens
- `--max-batch-prefill-tokens`: Maximum tokens for batch processing
- `--max-batch-total-tokens`: Maximum total tokens in a batch

To reduce memory usage, you can try smaller values for `--max-input-length`, `--max-total-tokens`, `--max-batch-prefill-tokens`, and `--max-batch-total-tokens`.

<Tip warning={true}>
Make sure that `max-batch-prefill-tokens ≤ max-input-length * max-batch-size`; otherwise the configuration is inconsistent and the server will error, since a batch could never contain that many prefill tokens.
</Tip>
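The constraint in the tip above can be expressed as a small sanity check. This is an illustrative helper (not part of TGI or optimum-tpu) for validating your launch arguments before starting the server:

```python
def prefill_config_ok(max_batch_prefill_tokens: int,
                      max_input_length: int,
                      max_batch_size: int) -> bool:
    """True when a full batch of maximum-length inputs can cover the prefill budget."""
    return max_batch_prefill_tokens <= max_input_length * max_batch_size

print(prefill_config_ok(4096, 1024, 4))  # True: 4096 <= 1024 * 4
print(prefill_config_ok(8192, 1024, 4))  # False: no request mix could fill 8192 prefill tokens
```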

## Sharding
Sharding is done automatically by the TGI server, so your model uses all the TPUs that are available. We use tensor parallelism, so the layers are automatically split across all available TPUs; the TGI router, however, only sees one shard.

More information on tensor parallelism can be found at https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism.
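The idea behind tensor parallelism can be shown with a toy, pure-Python example (no TPU required): the weight matrix is split across "devices", each computes only its rows of the output, and concatenating the partial results recovers the full matrix-vector product.

```python
def matvec(w_rows, x):
    # Plain matrix-vector product over a list-of-rows matrix.
    return [sum(wi * xi for wi, xi in zip(row, x)) for row in w_rows]

def split_rows(w, n_shards):
    # Give each "device" an equal contiguous slice of output rows.
    k = len(w) // n_shards
    return [w[i * k:(i + 1) * k] for i in range(n_shards)]

w = [[1, 0], [0, 1], [2, 2], [3, -1]]  # 4x2 weight matrix
x = [5, 7]

full = matvec(w, x)
sharded = []
for shard in split_rows(w, 2):   # 2 "devices", 2 rows each
    sharded += matvec(shard, x)  # each device computes only its rows

assert full == sharded  # concatenation recovers the full result
```

Real tensor parallelism also splits along other dimensions and adds collective communication, but the principle — each device holding and computing only a slice of each layer — is the same.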

## Understanding the configuration

Key parameters explained:

**Required parameters**
- `--shm-size 16GB`: Increase default shared memory allocation.
- `--privileged`: Required for TPU access.
- `--net host`: Uses host network mode.
Those are needed to run a TPU container so that the container can properly access the TPU hardware.

**Optional parameters**
- `-v ~/hf_data:/data`: Volume mount for model storage, so model weights do not have to be re-downloaded on each startup. You can use any host folder you like, as long as it maps to `/data`.
- `-e SKIP_WARMUP=1`: Disables warmup for quick testing (not recommended for production).
Those are parameters used by TGI and optimum-TPU to configure the server behavior.


<Tip warning={true}>
`--privileged --shm-size 16GB --net host` is required, as specified in https://github.com/pytorch/xla
</Tip>

## Next steps
Please check the [TGI docs](https://huggingface.co/docs/text-generation-inference) for more TGI server configuration options.