basetenlabs
diff --git a/‎docs/examples/performance/tgi-server.mdx
+12-12 b/‎docs/examples/performance/tgi-server.mdx
+12-12
diff --git a/‎docs/guides/concurrency.mdx
+117 b/‎docs/guides/concurrency.mdx
+117
diff --git a/‎docs/images/concurrency-flow-chart-high-level.png
111 KB b/‎docs/images/concurrency-flow-chart-high-level.png
111 KB
diff --git a/‎docs/images/concurrency-flow-model-pod.png
127 KB b/‎docs/images/concurrency-flow-model-pod.png
127 KB
diff --git a/‎docs/images/concurrency-target-picture.png
180 KB b/‎docs/images/concurrency-target-picture.png
180 KB
diff --git a/‎docs/mint.json
+2-1 b/‎docs/mint.json
+2-1
diff --git a/‎docs/reference/cli/init.mdx
+4-9 b/‎docs/reference/cli/init.mdx
+4-9
diff --git a/‎docs/reference/cli/predict.mdx
+12-7 b/‎docs/reference/cli/predict.mdx
+12-7
diff --git a/‎docs/reference/cli/push.mdx
+6-6 b/‎docs/reference/cli/push.mdx
+6-6
diff --git a/‎docs/reference/cli/watch.mdx
+4-2 b/‎docs/reference/cli/watch.mdx
+4-2
diff --git a/‎poetry.lock
+26-1 b/‎poetry.lock
+26-1
diff --git a/‎pyproject.toml
+2-1 b/‎pyproject.toml
+2-1
@@ -20,43 +20,43 @@ This example will cover:
 Get started by creating a new Truss:
 
 ```sh
-truss init --backend TGI opt125
+truss init --backend TGI falcon-7b
 ```
 
 You're going to see a couple of prompts. Follow along with the instructions below:
-1. Type `facebook/opt-125M` when prompted for `model`.
+1. Type `tiiuae/falcon-7b` when prompted for `model`.
 2. Press the `tab` key when prompted for `endpoint`. Select the `generate_stream` endpoint.
-3. Give your model a name like `OPT-125M`.
+3. Give your model a name like `Falcon 7B`.
 
 Finally, navigate to the directory:
 
 ```sh
-cd opt125
+cd falcon-7b
 ```
 
 ### Step 2: Setting resources and other arguments
 
 You'll notice that there's a `config.yaml` in the new directory. This is where we'll set the resources and other arguments for the model. Open the file in your favorite editor.
 
-OPT-125M will need a GPU so let's set the correct resources. Update the `resources` key with the following:
+Falcon 7B will need a GPU so let's set the correct resources. Update the `resources` key with the following:
 
 ```yaml config.yaml
 resources:
-  accelerator: T4
+  accelerator: A10G
   cpu: "4"
   memory: 16Gi
   use_gpu: true
 ```
 
-Also notice the `build` key which contains the `model_server` we're using as well as other arguments. These arguments are passed to the underlying vLLM server which you can find [here](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py).
+Also notice the `build` key which contains the `model_server` we're using as well as other arguments. These arguments are passed to the underlying TGI server.
 
 ### Step 3: Deploy the model
 
 <Note>
 You'll need a [Baseten API key](https://app.baseten.co/settings/account/api_keys) for this step.
 </Note>
 
-Let's deploy our OPT-125M vLLM model.
+Let's deploy our Falcon 7B TGI model.
 
 ```sh
 truss push
@@ -65,7 +65,7 @@ truss push
 You can invoke the model with:
 
 ```sh
-truss predict -d '{"inputs": "What is a large language model?", "parameters": {"max_new_tokens": 128, "sample": true}} --published'
+truss predict -d '{"inputs": "What is a large language model?", "parameters": {"max_new_tokens": 128, "sample": true}}' --published
 ```
 
 <RequestExample>
@@ -74,16 +74,16 @@ truss predict -d '{"inputs": "What is a large language model?", "parameters": {"
 build:
   arguments:
     endpoint: generate_stream
-    model: facebook/opt-125M
+    model: tiiuae/falcon-7b
   model_server: TGI
 environment_variables: {}
 external_package_dirs: []
 model_metadata: {}
-model_name: OPT-125M
+model_name: Falcon 7B
 python_version: py39
 requirements: []
 resources:
-  accelerator: T4
+  accelerator: A10G
   cpu: "4"
   memory: 16Gi
   use_gpu: true
 
@@ -0,0 +1,117 @@
+---
+title: "How to configure concurrency"
+description: "A guide to setting concurrency for your model"
+---
+
+Configuring concurrency is one of the major knobs available for getting the most performance
+out of your model. In this doc, we'll cover the options that are available to you.
+
+# What is concurrency, and why configure it?
+
+At a very high level, "concurrency" in this context refers to how many requests a single replica can
+process at the same time. There are no right answers to what this number ought to be -- the specifics
+of your model and the metrics you are optimizing for (throughput? latency?) matter a lot for determining this.
+
+In Baseten & Truss, there are two notions of concurrency:
+* **Concurrency Target** -- the number of requests that will be sent to a model at the same time
+* **Predict Concurrency** -- once requests have made it onto the model container, the "predict concurrency" governs how many
+requests can go through the `predict` function on your Truss at once.
+
+# Concurrency Target
+
+The concurrency target is set in the Baseten UI and, to reiterate, governs the maximum number of requests that will be sent
+to a single model replica.
+
+<Frame>
+  <img src="/images/concurrency-target-picture.png" />
+</Frame>
+
+An important note about this setting is that it is also used as a part of the autoscaling parameters. If all replicas have
+hit their Concurrency Target, this triggers Baseten's autoscaling.
+
+Let's dive into a concrete example:
+
+<Frame>
+  <img src="/images/concurrency-flow-chart-high-level.png" />
+</Frame>
+
+Let's say that there is a single replica of a model, and the concurrency target is 2. If 5 requests come in, the first 2 will
+be sent to the replica, and the other 3 get queued up. Once the requests on the container complete, the queued up
+requests will make it to the model container.
+
+<Note>
+Remember that if all replicas have hit their concurrency target, this will trigger autoscaling. So in this specific example,
+the queuing of requests 3-5 will trigger another replica to come up, if the model has not hit its max replicas yet.
+</Note>
+
+
+# Predict Concurrency
+
+Alright, so we've talked about the **Concurrency Target** feature that governs how many requests will be sent to a model at once.
+**Predict Concurrency** is a bit different -- it operates on the level of the model container and governs how many requests will go
+through the `predict` function concurrently.
+
+To get a sense for why this matters, let's recap the structure of a Truss:
+
+```python model.py
+class Model:
+
+    def __init__(self):
+        ...
+
+    def preprocess(self, request):
+        ...
+
+    def predict(self, request):
+        ...
+
+    def postprocess(self, response):
+        ...
+```
+
+In this Truss model, there are three functions that are called in order to serve a request:
+* **preprocess** -- this function is used to perform any prework / modifications on the request before the `predict` function
+runs. For instance, if you are running an image classification model, and need to download images from S3, this is a good place
+to do it.
+* **predict** -- this function is where the actual inference happens. It is likely where the logic that runs GPU code lives.
+* **postprocess** -- this function is used to perform any postwork / modifications on the response before it is returned to the
+user. For instance, if you are running a text-to-image model, this is a good place to implement the logic for uploading an image
+to S3.
+
+Given what each of these three functions is used for, you can see why you might want a different level of concurrency
+for the `predict` function than for the other two. The most common need here is to limit access to the GPU, since multiple
+requests running on the GPU at the same time could cause serious degradation in performance.
+
+Unlike **Concurrency Target**, which is configured in the Baseten UI, the **Predict Concurrency** is configured as a part
+of the Truss Config (in the `config.yaml` file).
+
+```yaml config.yaml
+model_name: "My model with concurrency limits"
+...
+runtime:
+    predict_concurrency: 2 # the default is 1
+...
+```
+
+To better understand this, let's use a specific example:
+
+<Frame>
+  <img src="/images/concurrency-flow-model-pod.png" />
+</Frame>
+
+Let's say predict concurrency is 1.
+1. Two requests come in to the pod.
+2. Both requests will begin preprocessing immediately (let's say,
+downloading images from S3).
+3. Once the first request finishes preprocessing, it will begin running on the GPU. The second request
+will then remain queued until the first request finishes running on the GPU in predict.
+4. After the first request finishes, the second request will begin being processed on the GPU.
+5. Once the second request finishes, it will begin postprocessing, even if the first request is not done postprocessing.
+
+To reiterate, predict concurrency is really great to use if you want to protect your GPU resource on your model pod,
+while still allowing for high concurrency for the pre and post-process steps.
+
+<Note>
+Remember that to actually achieve the predict concurrency you desire, the Concurrency Target must be at least that amount,
+so that the requests make it to the model container.
+</Note>
@@ -68,7 +68,8 @@
       "group": "Guides",
       "pages": [
         "guides/secrets",
-        "guides/base-images"
+        "guides/base-images",
+        "guides/concurrency"
       ]
     },
     {
 
@@ -7,28 +7,23 @@ description: "Create a new Truss."
 truss init [OPTIONS] TARGET_DIRECTORY
 ```
 
-### Options
-
-<ParamField body="-t, --trainable">
-Create a trainable truss. Deprecated.
-</ParamField>
+## Options
 
 <ParamField body="-b, --backend" type="TrussServer|TGI|VLLM">
 What type of server to create. Default: `TrussServer`.
 </ParamField>
-
 <ParamField body="--help">
 Show help message and exit.
 </ParamField>
 
-### Arguments
+## Arguments
 
 <ParamField body="TARGET_DIRECTORY" type="str">
-A Truss is created in this directory
+A Truss is created in this directory.
 </ParamField>
 
 
-### Example
+## Example
 
 ```
 truss init whisper-truss
 
@@ -7,11 +7,8 @@ description: "Invokes the packaged model."
 truss predict [OPTIONS]
 ```
 
-### Options
+## Options
 
-<ParamField body="--target_directory" type="TEXT">
-A Truss directory. If none, use current directory.
-</ParamField>
 <ParamField body="--remote" type="TEXT">
 Name of the remote in .trussrc to patch changes to.
 </ParamField>
@@ -21,15 +18,23 @@ String formatted as json that represents request.
 <ParamField body="-f, --file" type="PATH">
 Path to json file containing the request.
 </ParamField>
-<ParamField body="--published">
-Invoked the published model version.
+<ParamField body="--model_version" type="TEXT">
+ID of model version to invoke.
+</ParamField>
+<ParamField body="--model" type="TEXT">
+ID of model to invoke.
 </ParamField>
 <ParamField body="--help">
 Show help message and exit.
 </ParamField>
 
+## Arguments
+
+<ParamField body="TARGET_DIRECTORY" type="Optional">
+A Truss directory. If none, use current directory.
+</ParamField>
 
-### Examples
+## Examples
 
 ```
 truss predict -d '{"prompt": "What is the meaning of life?"}'
 
@@ -7,29 +7,29 @@ description: "Pushes a truss to a TrussRemote."
 truss push [OPTIONS] [TARGET_DIRECTORY]
 ```
 
-### Options
+## Options
 
 <ParamField body="--remote" type="TEXT">
-Name of the remote in .trussrc to patch changes to
+Name of the remote in .trussrc to patch changes to.
 </ParamField>
-<ParamField body="--publish">
+<ParamField body="--publish" type="BOOL">
 Publish truss as production deployment.
 </ParamField>
-<ParamField body="--trusted">
+<ParamField body="--trusted" type="BOOL">
 Give Truss access to secrets on remote host.
 </ParamField>
 <ParamField body="--help">
 Show help message and exit.
 </ParamField>
 
-### Arguments
+## Arguments
 
 <ParamField body="TARGET_DIRECTORY" type="Optional">
 A Truss directory. If none, use current directory.
 </ParamField>
 
 
-### Examples
+## Examples
 
 ```
 truss push
 
@@ -10,9 +10,11 @@ truss watch [OPTIONS] [TARGET_DIRECTORY]
 ### Options
 
 <ParamField body="--remote" type="TEXT">
-Name of the remote in .trussrc to patch changes to
+Name of the remote in .trussrc to patch changes to.
+</ParamField>
+<ParamField body="--logs" type="BOOL">
+Automatically open remote logs tab.
 </ParamField>
-
 <ParamField body="--help">
 Show help message and exit.
 </ParamField>
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "truss"
-version = "0.7.8"
+version = "0.7.9"
 description = "A seamless bridge from model development to model delivery"
 license = "MIT"
 readme = "README.md"
@@ -82,6 +82,7 @@ pytest-split = "^0.8.1"
 httpx = {extras = ["cli"], version = "^0.24.1"}
 requests-mock = "^1.11.0"
 flask = "^2.3.3"
+types-requests = "2.31.0.2"
 
 [build-system]
 requires = ["poetry-core>=1.2.1"]
Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,8 @@`
`68`	`68`	`"group": "Guides",`
`69`	`69`	`"pages": [`
`70`	`70`	`"guides/secrets",`
`71`		`- "guides/base-images"`
	`71`	`+ "guides/base-images",`
	`72`	`+ "guides/concurrency"`
`72`	`73`	`]`
`73`	`74`	`},`
`74`	`75`	`{`