---
title: Pre/post-process methods
description: "Deploy a model that makes use of pre-process"
---

Out of the box, Truss limits the number of concurrent predictions that happen on a
single container. This ensures that the CPU, and for many models the GPU, do not get
overloaded, and that the model can continue to respond to requests in periods of high load.

However, many models, in addition to having compute components, also have
IO requirements. For example, a model that classifies images may need to download
the image from a URL before it can classify it.

Truss provides a way to separate the IO component from the compute component, to
ensure that any IO does not prevent utilization of the compute on your pod.

To do this, you can use the pre/post-process methods on a Truss. These methods
can be defined like this:

```python
class Model:
    def __init__(self, **kwargs): ...
    def load(self, **kwargs) -> None: ...

    def preprocess(self, request):
        # Include any IO logic that happens _before_ predict here
        ...

    def predict(self, request):
        # Include the actual predict here
        ...

    def postprocess(self, response):
        # Include any IO logic that happens _after_ predict here
        ...
```

When the model is invoked, any logic defined in the pre- or post-process
methods runs on a separate thread and is not subject to the same concurrency limits as
predict. So, let's say you have a model that can handle 5 concurrent requests:

```yaml config.yaml
...
runtime:
  predict_concurrency: 5
...
```
| 48 | + |
| 49 | +If you hit it with 10 requests, they will _all_ begin pre-processing, but then when the |
| 50 | +the 6th request is ready to begin the predict method, it will have to wait for one of the |
| 51 | +first 5 requests to finish. This ensures that the GPU is not overloaded, while also ensuring |
| 52 | +that the compute logic does not get blocked by IO, thereby ensuring that you can achieve |
| 53 | +maximum throughput. |
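
To see this behavior from the client side, here is a minimal sketch that fires 10
simultaneous requests. The endpoint and image URLs below are placeholders, assuming a
model served locally at Truss's KServe-style predict route; substitute your actual
deployment URL:

```python
from concurrent.futures import ThreadPoolExecutor

import requests

# Hypothetical local endpoint; substitute your deployment's invocation URL.
ENDPOINT = "http://localhost:8080/v1/models/model:predict"


def classify(image_url: str):
    # Each request body carries an image URL for preprocess to download.
    return requests.post(ENDPOINT, json={"url": image_url}).json()


# All 10 requests begin pre-processing immediately; predict itself runs
# at most 5 at a time under predict_concurrency: 5.
urls = ["https://example.com/cat.jpg"] * 10  # placeholder image URLs

with ThreadPoolExecutor(max_workers=10) as pool:
    results = list(pool.map(classify, urls))
```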

<RequestExample>

```python model/model.py
import requests
from typing import Dict, List
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

CHECKPOINT = "openai/clip-vit-base-patch32"


class Model:
    """
    This is a simple example of using CLIP to classify images.
    It outputs the probability of the image being a cat or a dog.
    """
    def __init__(self, **kwargs) -> None:
        self._processor = None
        self._model = None

    def load(self):
        """
        Loads the CLIP model and processor checkpoints.
        """
        self._model = CLIPModel.from_pretrained(CHECKPOINT)
        self._processor = CLIPProcessor.from_pretrained(CHECKPOINT)

    def preprocess(self, request: Dict) -> Dict:
        """
        This method downloads the image from the URL and preprocesses it.
        The preprocess method is used for any logic that involves IO, in this
        case downloading the image. It is called before the predict method
        in a separate thread and is not subject to the same concurrency
        limits as the predict method, so it can be called many times in parallel.
        """
        image = Image.open(requests.get(request.pop("url"), stream=True).raw)
        request["inputs"] = self._processor(
            text=["a photo of a cat", "a photo of a dog"],
            images=image,
            return_tensors="pt",
            padding=True
        )
        return request

    def predict(self, request: Dict) -> List:
        """
        This performs the actual classification. The predict method is subject to
        the predict concurrency constraints.
        """
        outputs = self._model(**request["inputs"])
        logits_per_image = outputs.logits_per_image
        return logits_per_image.softmax(dim=1).tolist()
```

```yaml config.yaml
model_name: clip-example
requirements:
- transformers==4.32.0
- pillow==10.0.0
- torch==2.0.1
resources:
  cpu: "3"
  memory: 14Gi
  use_gpu: true
  accelerator: A10G
```

</RequestExample>
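
Once deployed, the model can be invoked with a JSON body containing the image URL. A
minimal sketch, assuming the same placeholder endpoint as above:

```python
import requests

# Hypothetical local endpoint; substitute your deployment's invocation URL.
resp = requests.post(
    "http://localhost:8080/v1/models/model:predict",
    json={"url": "https://example.com/cat.jpg"},  # placeholder image URL
)
print(resp.json())  # e.g. [[0.92, 0.08]] -> [cat, dog] probabilities
```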