|
17 | 17 | BASE_SERVER_REQUIREMENTS_TXT_FILENAME,
|
18 | 18 | BASE_TRTLLM_REQUIREMENTS,
|
19 | 19 | BEI_MAX_CONCURRENCY_TARGET_REQUESTS,
|
| 20 | + BEI_REQUIRED_MAX_NUM_TOKENS, |
20 | 21 | BEI_TRTLLM_BASE_IMAGE,
|
21 | 22 | BEI_TRTLLM_CLIENT_BATCH_SIZE,
|
22 | 23 | BEI_TRTLLM_PYTHON_EXECUTABLE,
|
@@ -384,13 +385,19 @@ def prepare_trtllm_bei_encoder_build_dir(self, build_dir: Path):
|
384 | 385 | # runtime batch size may not be higher than what the build settings of the model allow
|
385 | 386 | # and is additionally capped at 32, even if the engine.rank0 allows for a higher batch_size
|
386 | 387 | runtime_max_batch_size = min(config.trt_llm.build.max_batch_size, 32)
|
387 |
| - |
| 388 | + # make sure the user gets good performance, enforcing max_num_tokens here and in engine-builder |
| 389 | + runtime_max_batch_tokens = max( |
| 390 | + config.trt_llm.build.max_num_tokens, BEI_REQUIRED_MAX_NUM_TOKENS |
| 391 | + ) |
388 | 392 | port = 7997
|
389 | 393 | start_command = " ".join(
|
390 | 394 | [
|
391 | 395 | "truss-transfer-cli && text-embeddings-router",
|
392 | 396 | f"--port {port}",
|
| 397 | + # assert the max_batch_size is within trt-engine limits |
393 | 398 | f"--max-batch-requests {runtime_max_batch_size}",
|
| 399 | + # assert the max_num_tokens is within trt-engine limits |
| 400 | + f"--max-batch-tokens {runtime_max_batch_tokens}", |
394 | 401 | # how many sentences can be in a single json payload.
|
395 | 402 | # limited default to improve request based autoscaling.
|
396 | 403 | f"--max-client-batch-size {BEI_TRTLLM_CLIENT_BATCH_SIZE}",
|
|
0 commit comments