@@ -1,16 +1,26 @@
 import asyncio
-from typing import Any, Dict
+import logging
+from typing import Any, Callable, Dict

 import httpx
-from fastapi import APIRouter
+from fastapi import APIRouter, WebSocket
 from fastapi.responses import JSONResponse, StreamingResponse
 from helpers.errors import ModelLoadFailed, ModelNotReady
-from httpx import URL, ConnectError, RemoteProtocolError
+from httpx_ws import aconnect_ws
 from starlette.requests import ClientDisconnect, Request
 from starlette.responses import Response
 from tenacity import RetryCallState, Retrying, retry_if_exception_type, wait_fixed
+from wsproto.events import BytesMessage, TextMessage

 INFERENCE_SERVER_START_WAIT_SECS = 60
+BASE_RETRY_EXCEPTIONS = (
+    retry_if_exception_type(httpx.ConnectError)
+    | retry_if_exception_type(httpx.RemoteProtocolError)
+    | retry_if_exception_type(httpx.ReadError)
+    | retry_if_exception_type(httpx.ReadTimeout)
+    | retry_if_exception_type(httpx.ConnectTimeout)
+    | retry_if_exception_type(ModelNotReady)
+)


 control_app = APIRouter()
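For reference, tenacity's `retry_if_exception_type` conditions OR-compose with `|`, which is how `BASE_RETRY_EXCEPTIONS` above is built; a minimal standalone sketch of the same pattern (the endpoint URL is a hypothetical stand-in, not part of this diff):

import httpx
from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, wait_fixed

# `|` yields a combined condition that retries when either exception is raised.
either = retry_if_exception_type(httpx.ConnectError) | retry_if_exception_type(
    httpx.ReadTimeout
)
for attempt in Retrying(retry=either, stop=stop_after_attempt(3), wait=wait_fixed(1)):
    with attempt:
        # Hypothetical endpoint; a refused connection triggers a retry,
        # and tenacity raises RetryError once the attempts are exhausted.
        httpx.get("http://localhost:9000/health")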
@@ -20,14 +30,14 @@ def index():
     return {}


-async def proxy(request: Request):
+async def proxy_http(request: Request):
     inference_server_process_controller = (
         request.app.state.inference_server_process_controller
     )
     client: httpx.AsyncClient = request.app.state.proxy_client

     path = _reroute_if_health_check(request.url.path)
-    url = URL(path=path, query=request.url.query.encode("utf-8"))
+    url = httpx.URL(path=path, query=request.url.query.encode("utf-8"))

     # 2 min connect timeouts, no timeout for requests.
     # We don't want requests to fail due to timeout on the proxy
@@ -47,19 +57,7 @@ async def proxy(request: Request):
     )

     # Wait a bit for inference server to start
-    for attempt in Retrying(
-        retry=(
-            retry_if_exception_type(ConnectError)
-            | retry_if_exception_type(ModelNotReady)
-            | retry_if_exception_type(RemoteProtocolError)
-            | retry_if_exception_type(httpx.ReadError)
-            | retry_if_exception_type(httpx.ReadTimeout)
-            | retry_if_exception_type(httpx.ConnectTimeout)
-        ),
-        stop=_custom_stop_strategy,
-        wait=wait_fixed(1),
-        reraise=False,
-    ):
+    for attempt in inference_retries():
         with attempt:
             try:
                 if inference_server_process_controller.is_inference_server_intentionally_stopped():
@@ -68,7 +66,7 @@ async def proxy(request: Request):

                 if await _is_model_not_ready(resp):
                     raise ModelNotReady("Model has started running, but not ready yet.")
-            except (RemoteProtocolError, ConnectError) as exp:
+            except (httpx.RemoteProtocolError, httpx.ConnectError) as exp:
                 # This check is a bit expensive so we don't do it before every request, we
                 # do it only if request fails with connection error. If the inference server
                 # process is running then we continue waiting for it to start (by retrying),
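The `inference_retries()` call above is defined in the final hunk below: it wraps `Retrying` in a generator, so call sites keep tenacity's `for attempt: ... with attempt:` protocol. A minimal self-contained sketch of that wrapper pattern, with hypothetical names standing in for the real ones:

import random

from tenacity import Retrying, stop_after_attempt, wait_fixed


def flaky_retries():  # hypothetical stand-in for inference_retries()
    # Yielding tenacity's attempt contexts keeps the caller's loop unchanged.
    for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(0.1), reraise=False):
        yield attempt


for attempt in flaky_retries():
    with attempt:
        # By default tenacity retries on any exception; with reraise=False it
        # raises RetryError once the stop condition is reached.
        if random.random() < 0.5:  # stand-in for a not-yet-ready server
            raise ConnectionError("server not up yet")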
@@ -94,7 +92,59 @@ async def proxy(request: Request):
     return response


-control_app.add_route("/v1/{path:path}", proxy, ["GET", "POST"])
+def inference_retries(
+    retry_condition: Callable[[RetryCallState], bool] = BASE_RETRY_EXCEPTIONS,
+):
+    for attempt in Retrying(
+        retry=retry_condition,
+        stop=_custom_stop_strategy,
+        wait=wait_fixed(1),
+        reraise=False,
+    ):
+        yield attempt
+
+
+async def _safe_close_ws(ws: WebSocket, logger: logging.Logger):
+    try:
+        await ws.close()
+    except RuntimeError as close_error:
+        logger.debug(f"Duplicate close of websocket: `{close_error}`.")
+
+
+async def proxy_ws(client_ws: WebSocket):
+    await client_ws.accept()
+    proxy_client: httpx.AsyncClient = client_ws.app.state.proxy_client
+    logger = client_ws.app.state.logger
+
+    for attempt in inference_retries():
+        with attempt:
+            async with aconnect_ws("/v1/websocket", proxy_client) as server_ws:  # type: ignore
+                # Unfortunate, but FastAPI and httpx-ws have slightly different abstractions
+                # for sending data, so it's not easy to create a unified wrapper.
+                async def forward_to_server():
+                    while True:
+                        message = await client_ws.receive()
+                        if "text" in message:
+                            await server_ws.send_text(message["text"])
+                        elif "bytes" in message:
+                            await server_ws.send_bytes(message["bytes"])
+
+                async def forward_to_client():
+                    while True:
+                        message = await server_ws.receive()
+                        if isinstance(message, TextMessage):
+                            await client_ws.send_text(message.data)
+                        elif isinstance(message, BytesMessage):
+                            await client_ws.send_bytes(message.data)
+
+                try:
+                    await asyncio.gather(forward_to_client(), forward_to_server())
+                finally:
+                    await _safe_close_ws(client_ws, logger)
+
+
+control_app.add_websocket_route("/v1/websocket", proxy_ws)
+control_app.add_route("/v1/{path:path}", proxy_http, ["GET", "POST"])


 @control_app.post("/control/patch")
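End to end, `proxy_ws` relays frames in both directions between the client and the inference server. A hypothetical smoke test for the new route (the `websockets` package and the localhost address are assumptions, not part of this diff):

import asyncio

import websockets


async def main():
    # Connect through the control server; proxy_ws relays to the inference server.
    async with websockets.connect("ws://localhost:8080/v1/websocket") as ws:
        await ws.send("ping")    # relayed by forward_to_server()
        print(await ws.recv())   # reply relayed back by forward_to_client()


asyncio.run(main())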