|
 import datetime
+import json
 import os
 import subprocess
 import sys
 from pathlib import Path
 from typing import Optional
 
+import boto3
+from botocore.client import Config
 from google.cloud import storage
 from huggingface_hub import hf_hub_download
 
@@ -32,52 +35,105 @@ def _download_from_url_using_b10cp(
     )
 
 
-def split_gs_path(gs_path):
+def split_path(path, prefix="gs://"):
     # Remove the 'gs://' prefix
-    path = gs_path.replace("gs://", "")
+    path = path.replace(prefix, "")
 
     # Split on the first slash
     parts = path.split("/", 1)
 
     bucket_name = parts[0]
-    prefix = parts[1] if len(parts) > 1 else ""
+    path = parts[1] if len(parts) > 1 else ""
 
-    return bucket_name, prefix
+    return bucket_name, path
+
+
+def parse_s3_service_account_file(file_path):
+    # open the json file
+    with open(file_path, "r") as f:
+        data = json.load(f)
+
+    # validate the data
+    if "aws_access_key_id" not in data or "aws_secret_access_key" not in data:
+        raise ValueError("Invalid AWS credentials file")
+
+    # parse the data
+    aws_access_key_id = data["aws_access_key_id"]
+    aws_secret_access_key = data["aws_secret_access_key"]
+    aws_region = data["aws_region"]
+
+    return aws_access_key_id, aws_secret_access_key, aws_region
 
 
 def download_file(
     repo_name, file_name, revision_name=None, key_file="/app/data/service_account.json"
 ):
     # Check if repo_name starts with "gs://"
-    if "gs://" in repo_name:
+    if repo_name.startswith(("gs://", "s3://")):
+        prefix = repo_name[:5]
+
         # Create directory if not exist
-        bucket_name, _ = split_gs_path(repo_name)
-        repo_name = repo_name.replace("gs://", "")
+        bucket_name, _ = split_path(repo_name, prefix=prefix)
+        repo_name = repo_name.replace(prefix, "")
         cache_dir = Path(f"/app/hf_cache/{bucket_name}")
         cache_dir.mkdir(parents=True, exist_ok=True)
 
-        # Connect to GCS storage
-        storage_client = storage.Client.from_service_account_json(key_file)
-        bucket = storage_client.bucket(bucket_name)
-        blob = bucket.blob(file_name)
+        if prefix == "gs://":
+            # Connect to GCS storage
+            storage_client = storage.Client.from_service_account_json(key_file)
+            bucket = storage_client.bucket(bucket_name)
+            blob = bucket.blob(file_name)
 
-        dst_file = Path(f"{cache_dir}/{file_name}")
-        if not dst_file.parent.exists():
-            dst_file.parent.mkdir(parents=True)
+            dst_file = Path(f"{cache_dir}/{file_name}")
+            if not dst_file.parent.exists():
+                dst_file.parent.mkdir(parents=True)
 
-        if not blob.exists(storage_client):
-            raise RuntimeError(f"File not found on GCS bucket: {blob.name}")
+            if not blob.exists(storage_client):
+                raise RuntimeError(f"File not found on GCS bucket: {blob.name}")
 
-        url = blob.generate_signed_url(
-            version="v4",
-            expiration=datetime.timedelta(minutes=15),
-            method="GET",
-        )
-        try:
-            proc = _download_from_url_using_b10cp(_b10cp_path(), url, dst_file)
-            proc.wait()
-        except Exception as e:
-            raise RuntimeError(f"Failure downloading file from GCS: {e}")
+            url = blob.generate_signed_url(
+                version="v4",
+                expiration=datetime.timedelta(minutes=15),
+                method="GET",
+            )
+            try:
+                proc = _download_from_url_using_b10cp(_b10cp_path(), url, dst_file)
+                proc.wait()
+            except Exception as e:
+                raise RuntimeError(f"Failure downloading file from GCS: {e}")
+        elif prefix == "s3://":
+            (
+                AWS_ACCESS_KEY_ID,
+                AWS_SECRET_ACCESS_KEY,
+                AWS_REGION,
+            ) = parse_s3_service_account_file(key_file)
+            client = boto3.client(
+                "s3",
+                aws_access_key_id=AWS_ACCESS_KEY_ID,
+                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+                region_name=AWS_REGION,
+                config=Config(signature_version="s3v4"),
+            )
+            bucket_name, _ = split_path(bucket_name, prefix="s3://")
+
+            dst_file = Path(f"{cache_dir}/{file_name}")
+            if not dst_file.parent.exists():
+                dst_file.parent.mkdir(parents=True)
+
+            try:
+                url = client.generate_presigned_url(
+                    "get_object",
+                    Params={"Bucket": bucket_name, "Key": file_name},
+                    ExpiresIn=3600,
+                )
+            except Exception:
+                raise RuntimeError(f"File not found on S3 bucket: {file_name}")
+
+            try:
+                proc = _download_from_url_using_b10cp(_b10cp_path(), url, dst_file)
+                proc.wait()
+            except Exception as e:
+                raise RuntimeError(f"Failure downloading file from S3: {e}")
     else:
         secret_path = Path("/etc/secrets/hf-access-token")
         secret = secret_path.read_text().strip() if secret_path.exists() else None
|