
Commit 242f5f0

bobbywlindsey authored
Amazon SageMaker Script Mode Examples (aws#2217)
* Add SageMaker Script Mode examples
* No need for README
* Add docstrings
* Add CloudFormation script
* Add index into README and links for getting the notebook to appear on the website
* Replace Boston Housing with synthetic housing data; add intro and prereq sections; format code
* Add more explanations of code cells
* Add check for existing endpoints
* Clean up some code
* Automate deleting of resources
* Fix stuff

Co-authored-by: bobbywlindsey <contact@bobbywlindsey.com>
1 parent 3889f90 commit 242f5f0

14 files changed (+1267, -28 lines)

README.md

+30-28
Large diffs are not rendered by default.

sagemaker-script-mode/.gitignore

+4
@@ -0,0 +1,4 @@
.DS_Store
.ipynb_checkpoints/
data/
.vscode/
@@ -0,0 +1,103 @@
from random import choice
import numpy as np
import pandas as pd


NUM_HOUSES_PER_LOCATION = 1000
LOCATIONS = [
    "NewYork_NY",
    "LosAngeles_CA",
    "Chicago_IL",
    "Houston_TX",
    "Dallas_TX",
    "Phoenix_AZ",
    "Philadelphia_PA",
    "SanAntonio_TX",
    "SanDiego_CA",
    "SanFrancisco_CA",
]
MAX_YEAR = 2019


def generate_price(house):
    """Generate price based on features of the house"""

    # generate_yes_no() returns 1/0 rather than "y"/"n", so compare against the
    # numeric value. The original duplicated the FRONT_PORCH check and never
    # used DECK, so the second bonus flag is assumed to come from DECK here.
    deck = 1 if house["DECK"] == 1 else 0
    front_porch = 1 if house["FRONT_PORCH"] == 1 else 0

    price = int(
        150 * house["SQUARE_FEET"]
        + 10000 * house["NUM_BEDROOMS"]
        + 15000 * house["NUM_BATHROOMS"]
        + 15000 * house["LOT_ACRES"]
        + 10000 * deck
        + 10000 * front_porch
        + 15000 * house["GARAGE_SPACES"]
        - 5000 * (MAX_YEAR - house["YEAR_BUILT"])
    )
    return price


def generate_yes_no():
    """Generate 1/0 values for the categorical (yes/no) features"""
    answer = choice([1, 0])
    return answer


def generate_random_house():
    """Generate a row of data (single house information)"""
    house = {
        "SQUARE_FEET": np.random.normal(3000, 750),
        "NUM_BEDROOMS": np.random.randint(2, 7),
        "NUM_BATHROOMS": np.random.randint(2, 7) / 2,
        "LOT_ACRES": round(np.random.normal(1.0, 0.25), 2),
        "GARAGE_SPACES": np.random.randint(0, 4),
        "YEAR_BUILT": min(MAX_YEAR, int(np.random.normal(1995, 10))),
        "FRONT_PORCH": generate_yes_no(),
        "DECK": generate_yes_no(),
    }

    price = generate_price(house)

    return [
        house["YEAR_BUILT"],
        house["SQUARE_FEET"],
        house["NUM_BEDROOMS"],
        house["NUM_BATHROOMS"],
        house["LOT_ACRES"],
        house["GARAGE_SPACES"],
        house["FRONT_PORCH"],
        house["DECK"],
        price,
    ]


def generate_houses(num_houses):
    """Generate housing dataset"""
    house_list = []

    for _ in range(num_houses):
        house_list.append(generate_random_house())

    df = pd.DataFrame(
        house_list,
        columns=[
            "YEAR_BUILT",
            "SQUARE_FEET",
            "NUM_BEDROOMS",
            "NUM_BATHROOMS",
            "LOT_ACRES",
            "GARAGE_SPACES",
            "FRONT_PORCH",
            "DECK",
            "PRICE",
        ],
    )
    return df
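
For context, a small usage sketch of the generator above (the house count and the train/test split are illustrative, not taken from the notebook):

# Illustrative usage; the number of houses and the split fraction are assumptions
df = generate_houses(NUM_HOUSES_PER_LOCATION * len(LOCATIONS))  # 10,000 synthetic rows
print(df.shape)  # (10000, 9): 8 features plus PRICE
train_df = df.sample(frac=0.8, random_state=0)
test_df = df.drop(train_df.index)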

sagemaker-script-mode/index.rst

+14
@@ -0,0 +1,14 @@
SageMaker Script Mode Examples
==============================

Use your own custom training and inference scripts, similar to those you would use outside of SageMaker, to bring your own model, leveraging SageMaker's prebuilt containers for frameworks such as Scikit-learn, PyTorch, and XGBoost.

Understand how you can bring your own dependencies and custom libraries to both training and inference.

SageMaker Script Mode at Increasing Levels of Customization
------------------------------------------------------------

.. toctree::
   :maxdepth: 1

   sagemaker-script-mode
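
The two paragraphs above summarize what Script Mode offers: you hand a plain training script to one of SageMaker's framework estimators, and source_dir/dependencies carry any custom code along. A minimal sketch, assuming a hypothetical entry point named train.py, a Scikit-learn container version, and placeholder S3 paths:

# Hypothetical sketch; entry point name, framework version, and S3 paths are assumptions
import sagemaker
from sagemaker.sklearn import SKLearn

estimator = SKLearn(
    entry_point="train.py",              # your own script, written as it would be outside SageMaker
    source_dir="src",                    # custom modules and a requirements.txt can ride along here
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version="0.23-1",
    hyperparameters={"max_depth": 5},
)
estimator.fit({"train": "s3://<bucket>/train"})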
@@ -0,0 +1 @@
from .cross_validation_xgboost import *
@@ -0,0 +1,34 @@
import xgboost as xgb
import numpy as np


def cross_validation(df, K, hyperparameters):
    """
    Perform cross validation on a dataset.

    :param df: pandas.DataFrame with the target in the first column
    :param K: int, number of folds
    :param hyperparameters: dict of XGBoost training parameters
    """
    train_indices = list(df.sample(frac=1).index)
    # Ensure at least two folds; this guard must run before the split to take effect
    if K == 1:
        K = 2
    k_folds = np.array_split(train_indices, K)

    rmse_list = []
    for i in range(len(k_folds)):
        # Train on all folds except fold i, validate on fold i
        training_folds = [fold for j, fold in enumerate(k_folds) if j != i]
        training_indices = np.concatenate(training_folds)
        x_train, y_train = df.iloc[training_indices, 1:], df.iloc[training_indices, :1]
        x_validation, y_validation = df.iloc[k_folds[i], 1:], df.iloc[k_folds[i], :1]
        dtrain = xgb.DMatrix(data=x_train, label=y_train)
        dvalidation = xgb.DMatrix(data=x_validation, label=y_validation)

        model = xgb.train(
            params=hyperparameters,
            dtrain=dtrain,
            evals=[(dtrain, "train"), (dvalidation, "validation")],
        )
        eval_results = model.eval(dvalidation)
        rmse_list.append(float(eval_results.split("eval-rmse:")[1]))
    return rmse_list, model
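
A hedged sketch of how cross_validation might be invoked on the synthetic housing data above (the column reordering and hyperparameter values are assumptions; the function expects the target in the first column):

# Illustrative only; hyperparameters and fold count are assumptions
df = generate_houses(1000)
df = df[["PRICE"] + [c for c in df.columns if c != "PRICE"]]  # target must be the first column

hyperparameters = {"max_depth": 5, "eta": 0.2, "objective": "reg:squarederror"}
rmse_per_fold, last_model = cross_validation(df, K=5, hyperparameters=hyperparameters)
print(rmse_per_fold)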
@@ -0,0 +1,22 @@
import torch
import torch.nn as nn


class NeuralNet(nn.Module):
    # Small fully connected regression network: 8 input features -> 1 predicted price
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(8, 8)
        self.fc2 = nn.Linear(8, 6)
        self.fc3 = nn.Linear(6, 1)

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        x = self.fc3(x)
        return x


def get_model():
    model = NeuralNet()
    return model
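
A quick local sanity check of the network above (shapes only, random input; the module name pytorch_model_def is the one the training script below imports):

# Shape check only; input values are random
import torch
from pytorch_model_def import get_model

model = get_model()
dummy_batch = torch.randn(4, 8)   # 4 houses x 8 features
print(model(dummy_batch).shape)   # torch.Size([4, 1])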
@@ -0,0 +1,161 @@
import argparse
import numpy as np
import os
import sys
import logging
import json
import shutil
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pytorch_model_def import get_model


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


def parse_args():
    """
    Parse arguments passed from the SageMaker API
    to the container
    """

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--learning_rate", type=float, default=0.1)

    # Data directories
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))

    # Model directory: we will use the default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))

    return parser.parse_known_args()


def get_train_data(train_dir):
    """
    Get the training data and convert to tensors
    """

    x_train = np.load(os.path.join(train_dir, "x_train.npy"))
    y_train = np.load(os.path.join(train_dir, "y_train.npy"))
    logger.info(f"x train: {x_train.shape}, y train: {y_train.shape}")

    return torch.from_numpy(x_train), torch.from_numpy(y_train)


def get_test_data(test_dir):
    """
    Get the testing data and convert to tensors
    """

    x_test = np.load(os.path.join(test_dir, "x_test.npy"))
    y_test = np.load(os.path.join(test_dir, "y_test.npy"))
    logger.info(f"x test: {x_test.shape}, y test: {y_test.shape}")

    return torch.from_numpy(x_test), torch.from_numpy(y_test)


def model_fn(model_dir):
    """
    Load the model for inference
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = get_model()
    # map_location lets a model trained on GPU be loaded on a CPU-only endpoint
    model.load_state_dict(torch.load(model_dir + "/model.pth", map_location=device))
    model.eval()
    return model.to(device)


def input_fn(request_body, request_content_type):
    """
    Deserialize and prepare the prediction input
    """

    if request_content_type == "application/json":
        request = json.loads(request_body)
        train_inputs = torch.tensor(request)
        return train_inputs
    raise ValueError(f"Unsupported content type: {request_content_type}")


def predict_fn(input_data, model):
    """
    Apply model to the incoming request
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        # move the input to the model's device and bring the result back to CPU
        return model(input_data.float().to(device)).cpu().numpy()[0]


def train():
    """
    Train the PyTorch model
    """

    x_train, y_train = get_train_data(args.train)
    x_test, y_test = get_test_data(args.test)
    train_ds = TensorDataset(x_train, y_train)

    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    logger.info(
        "batch_size = {}, epochs = {}, learning rate = {}".format(
            batch_size, epochs, learning_rate
        )
    )

    train_dl = DataLoader(train_ds, batch_size, shuffle=True)

    model = get_model()
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        for x_train_batch, y_train_batch in train_dl:
            # move each batch to the same device as the model
            x_train_batch = x_train_batch.float().to(device)
            y_train_batch = y_train_batch.float().to(device)
            y = model(x_train_batch)
            loss = criterion(y.flatten(), y_train_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logger.info(f"epoch: {epoch + 1} -> loss: {loss}")

    # evaluate on the test set
    with torch.no_grad():
        y = model(x_test.float().to(device)).cpu().flatten()
        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
    print("\nTest MSE:", mse.numpy())

    torch.save(model.state_dict(), args.model_dir + "/model.pth")
    # The SageMaker PyTorch container requires the inference script to be inside
    # the .tar.gz model file, and the Step Functions SDK doesn't add it automatically.
    inference_code_path = args.model_dir + "/code/"

    if not os.path.exists(inference_code_path):
        os.mkdir(inference_code_path)
        logger.info("Created a folder at {}!".format(inference_code_path))

    shutil.copy("train_deploy_pytorch_without_dependencies.py", inference_code_path)
    shutil.copy("pytorch_model_def.py", inference_code_path)
    logger.info("Saving model files to {}".format(inference_code_path))


if __name__ == "__main__":

    args, _ = parse_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train()
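
To tie the pieces together, a hedged sketch (bucket, instance type, and framework/Python versions are placeholders) of how a script like this is typically launched through the SageMaker Python SDK: the train and test channels surface inside the container as SM_CHANNEL_TRAIN and SM_CHANNEL_TEST, and the hyperparameters arrive as the command-line arguments parsed above.

# Hypothetical launcher; S3 paths, instance type, and versions are assumptions
import sagemaker
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="train_deploy_pytorch_without_dependencies.py",
    source_dir=".",                      # must also include pytorch_model_def.py
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    framework_version="1.8.1",
    py_version="py3",
    hyperparameters={"epochs": 25, "batch_size": 128, "learning_rate": 0.01},
)

estimator.fit({"train": "s3://<bucket>/train", "test": "s3://<bucket>/test"})
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.m5.large")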

0 commit comments
