Skip to content

Commit 53db85f

Browse files
authored
update: max num tokens for bei (#1357)
* update: n-gram deployment * add BEI_MINIUMUM_MAX_NUM_TOKENS * updated rc release version * fmt serving image builder * update sha of image * fix: imports * rename env variable
1 parent 2b96fcb commit 53db85f

File tree

3 files changed

+11
-3
lines changed

3 files changed

+11
-3
lines changed

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "truss"
3-
version = "0.9.60rc003"
3+
version = "0.9.60rc004"
44
description = "A seamless bridge from model development to model delivery"
55
license = "MIT"
66
readme = "README.md"

truss/base/constants.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
TRTLLM_PREDICT_CONCURRENCY = 512
3232
BEI_TRTLLM_CLIENT_BATCH_SIZE = 128
3333
BEI_MAX_CONCURRENCY_TARGET_REQUESTS = 2048
34+
BEI_REQUIRED_MAX_NUM_TOKENS = 16384
3435

3536
TRTLLM_MIN_MEMORY_REQUEST_GI = 24
3637
HF_MODELS_API_URL = "https://huggingface.co/api/models"
@@ -104,7 +105,7 @@
104105
TRTLLM_BASE_IMAGE = "baseten/briton-server:v0.16.0-5be7b58"
105106
TRTLLM_PYTHON_EXECUTABLE = "/usr/local/briton/venv/bin/python"
106107
BASE_TRTLLM_REQUIREMENTS = ["briton==0.4.2"]
107-
BEI_TRTLLM_BASE_IMAGE = "baseten/bei:0.0.16@sha256:51e7ab169ffc2fa9e809a2e34d2f767277ba0c67e01c63fbca842992bb6402fc"
108+
BEI_TRTLLM_BASE_IMAGE = "baseten/bei:0.0.17@sha256:9c3577f6ec672d6da5aca18e9c0ebdddd65ed80c8858e757fbde7e9cf48de01d"
108109

109110
BEI_TRTLLM_PYTHON_EXECUTABLE = "/usr/bin/python3"
110111

truss/contexts/image_builder/serving_image_builder.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
BASE_SERVER_REQUIREMENTS_TXT_FILENAME,
1818
BASE_TRTLLM_REQUIREMENTS,
1919
BEI_MAX_CONCURRENCY_TARGET_REQUESTS,
20+
BEI_REQUIRED_MAX_NUM_TOKENS,
2021
BEI_TRTLLM_BASE_IMAGE,
2122
BEI_TRTLLM_CLIENT_BATCH_SIZE,
2223
BEI_TRTLLM_PYTHON_EXECUTABLE,
@@ -384,13 +385,19 @@ def prepare_trtllm_bei_encoder_build_dir(self, build_dir: Path):
384385
# runtime batch size may not be higher than what the build settings of the model allow
385386
# to 32 even if the engine.rank0 allows for higher batch_size
386387
runtime_max_batch_size = min(config.trt_llm.build.max_batch_size, 32)
387-
388+
# make sure the user gets good performance, enforcing max_num_tokens here and in engine-builder
389+
runtime_max_batch_tokens = max(
390+
config.trt_llm.build.max_num_tokens, BEI_REQUIRED_MAX_NUM_TOKENS
391+
)
388392
port = 7997
389393
start_command = " ".join(
390394
[
391395
"truss-transfer-cli && text-embeddings-router",
392396
f"--port {port}",
397+
# assert the max_batch_size is within trt-engine limits
393398
f"--max-batch-requests {runtime_max_batch_size}",
399+
# assert the max_num_tokens is within trt-engine limits
400+
f"--max-batch-tokens {runtime_max_batch_tokens}",
394401
# how many sentences can be in a single json payload.
395402
# limited default to improve request based autoscaling.
396403
f"--max-client-batch-size {BEI_TRTLLM_CLIENT_BATCH_SIZE}",

0 commit comments

Comments (0)