xin3he
diff --git a/‎README.md
+30-28 b/‎README.md
+30-28
diff --git a/‎sagemaker-script-mode/.gitignore
+4 b/‎sagemaker-script-mode/.gitignore
+4
diff --git a/‎sagemaker-script-mode/generate_synthetic_housing_data.py
+103 b/‎sagemaker-script-mode/generate_synthetic_housing_data.py
+103
diff --git a/‎sagemaker-script-mode/index.rst
+14 b/‎sagemaker-script-mode/index.rst
+14
diff --git a/‎sagemaker-script-mode/my_custom_library/__init__.py
+1 b/‎sagemaker-script-mode/my_custom_library/__init__.py
+1
diff --git a/‎sagemaker-script-mode/my_custom_library/cross_validation_xgboost.py
+34 b/‎sagemaker-script-mode/my_custom_library/cross_validation_xgboost.py
+34
diff --git a/‎sagemaker-script-mode/pytorch_script/pytorch_model_def.py
+22 b/‎sagemaker-script-mode/pytorch_script/pytorch_model_def.py
+22
diff --git a/‎sagemaker-script-mode/pytorch_script/train_deploy_pytorch_without_dependencies.py
+161 b/‎sagemaker-script-mode/pytorch_script/train_deploy_pytorch_without_dependencies.py
+161
@@ -0,0 +1,4 @@
+.DS_Store
+.ipynb_checkpoints/
+data/
+.vscode/
@@ -0,0 +1,103 @@
+from random import choice
+import numpy as np
+import pandas as pd
+
+
+# Number of houses per location. Not referenced by the functions visible in
+# this file; presumably consumed by the caller — TODO confirm.
+NUM_HOUSES_PER_LOCATION = 1000
+# Location labels. Also not referenced by the functions visible in this file.
+LOCATIONS = [
+    "NewYork_NY",
+    "LosAngeles_CA",
+    "Chicago_IL",
+    "Houston_TX",
+    "Dallas_TX",
+    "Phoenix_AZ",
+    "Philadelphia_PA",
+    "SanAntonio_TX",
+    "SanDiego_CA",
+    "SanFrancisco_CA",
+]
+# Upper bound applied to YEAR_BUILT in generate_random_house() and the
+# reference year for the age discount in generate_price().
+MAX_YEAR = 2019
+
+
+def generate_price(house):
+    """Generate price based on features of the house
+
+    :param house: dict with the keys SQUARE_FEET, NUM_BEDROOMS,
+        NUM_BATHROOMS, LOT_ACRES, GARAGE_SPACES, YEAR_BUILT and FRONT_PORCH
+    :return: int, the weighted feature sum truncated by ``int()``
+    """
+
+    # generate_yes_no() encodes the flag as 1/0, so the previous comparison
+    # against "y" alone never matched and the premium was silently dropped.
+    # Both encodings are accepted here.
+    front_porch = 1 if house["FRONT_PORCH"] in (1, "y") else 0
+
+    # NOTE(review): the original derived `garage` from FRONT_PORCH as well
+    # (the house dict has no garage yes/no key, and DECK is not priced).
+    # That is kept unchanged here; confirm which feature was intended.
+    garage = front_porch
+
+    price = int(
+        150 * house["SQUARE_FEET"]
+        + 10000 * house["NUM_BEDROOMS"]
+        + 15000 * house["NUM_BATHROOMS"]
+        + 15000 * house["LOT_ACRES"]
+        + 10000 * garage
+        + 10000 * front_porch
+        + 15000 * house["GARAGE_SPACES"]
+        # Older houses are discounted by 5000 per year of age.
+        - 5000 * (MAX_YEAR - house["YEAR_BUILT"])
+    )
+    return price
+
+
+def generate_yes_no():
+    """Pick a random binary flag (1 or 0) for a yes/no categorical feature"""
+    return choice([1, 0])
+
+
+def generate_random_house():
+    """Build one synthetic house and return it as a flat row ending in its price"""
+    # The draws happen in this fixed order so a seeded run reproduces the
+    # same sequence of values.
+    square_feet = np.random.normal(3000, 750)
+    bedrooms = np.random.randint(2, 7)
+    bathrooms = np.random.randint(2, 7) / 2
+    lot_acres = round(np.random.normal(1.0, 0.25), 2)
+    garage_spaces = np.random.randint(0, 4)
+    year_built = min(MAX_YEAR, int(np.random.normal(1995, 10)))
+    front_porch = generate_yes_no()
+    deck = generate_yes_no()
+
+    house = {
+        "SQUARE_FEET": square_feet,
+        "NUM_BEDROOMS": bedrooms,
+        "NUM_BATHROOMS": bathrooms,
+        "LOT_ACRES": lot_acres,
+        "GARAGE_SPACES": garage_spaces,
+        "YEAR_BUILT": year_built,
+        "FRONT_PORCH": front_porch,
+        "DECK": deck,
+    }
+
+    # Row layout: year first, then the remaining features, then the price.
+    row_order = (
+        "YEAR_BUILT",
+        "SQUARE_FEET",
+        "NUM_BEDROOMS",
+        "NUM_BATHROOMS",
+        "LOT_ACRES",
+        "GARAGE_SPACES",
+        "FRONT_PORCH",
+        "DECK",
+    )
+    row = [house[name] for name in row_order]
+    row.append(generate_price(house))
+    return row
+
+
+def generate_houses(num_houses):
+    """Assemble a DataFrame holding ``num_houses`` synthetic house rows"""
+    column_names = [
+        "YEAR_BUILT",
+        "SQUARE_FEET",
+        "NUM_BEDROOMS",
+        "NUM_BATHROOMS",
+        "LOT_ACRES",
+        "GARAGE_SPACES",
+        "FRONT_PORCH",
+        "DECK",
+        "PRICE",
+    ]
+    rows = [generate_random_house() for _ in range(num_houses)]
+    return pd.DataFrame(rows, columns=column_names)
@@ -0,0 +1,14 @@
+SageMaker Script Mode Examples
+==============================
+
+Use your own custom training and inference scripts, similar to those you would use outside of SageMaker, to bring your own model leveraging SageMaker's prebuilt containers for various frameworks like Scikit-learn, PyTorch, and XGBoost.
+
+Understand how you can bring your own dependencies and custom libraries to both training and inference.
+
+SageMaker Script Mode at Increasing Levels of Customization
+-----------------------------------------------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   sagemaker-script-mode
@@ -0,0 +1 @@
+from .cross_validation_xgboost import *
@@ -0,0 +1,34 @@
+import xgboost as xgb
+import numpy as np
+
+
+def cross_validation(df, K, hyperparameters):
+    """
+    Perform K-fold cross validation on a dataset.
+
+    The first column of ``df`` is used as the label and the remaining
+    columns as the features.
+
+    :param df: pandas.DataFrame
+    :param K: int, number of folds; 1 is promoted to 2 because a single
+        fold would leave no rows to train on
+    :param hyperparameters: dict, passed to ``xgb.train`` as ``params``
+    :return: tuple ``(rmse_list, model)`` with one validation RMSE per fold
+        and the booster trained on the last fold
+    """
+    # The guard has to run before the split. It used to run afterwards, had
+    # no effect, and K == 1 crashed in np.concatenate on an empty list.
+    if K == 1:
+        K = 2
+
+    # Shuffle row positions rather than index labels: the folds are consumed
+    # by df.iloc, which is positional, and labels only coincide with
+    # positions for a default RangeIndex.
+    shuffled_positions = np.random.permutation(len(df))
+    k_folds = np.array_split(shuffled_positions, K)
+
+    rmse_list = []
+    for i, validation_fold in enumerate(k_folds):
+        training_indices = np.concatenate(
+            [fold for j, fold in enumerate(k_folds) if j != i]
+        )
+        x_train, y_train = df.iloc[training_indices, 1:], df.iloc[training_indices, :1]
+        x_validation, y_validation = (
+            df.iloc[validation_fold, 1:],
+            df.iloc[validation_fold, :1],
+        )
+        dtrain = xgb.DMatrix(data=x_train, label=y_train)
+        dvalidation = xgb.DMatrix(data=x_validation, label=y_validation)
+
+        model = xgb.train(
+            params=hyperparameters,
+            dtrain=dtrain,
+            evals=[(dtrain, "train"), (dvalidation, "validation")],
+        )
+        eval_results = model.eval(dvalidation)
+        # The parse below relies on the result string ending in
+        # "eval-rmse:<value>", i.e. rmse being the reported metric.
+        # NOTE(review): confirm for non-default eval_metric settings.
+        rmse_list.append(float(eval_results.split("eval-rmse:")[1]))
+    return rmse_list, model
@@ -0,0 +1,22 @@
+import torch
+import torch.nn as nn
+
+
+class NeuralNet(nn.Module):
+    """Fully connected network with layer widths 8 -> 8 -> 6 -> 1."""
+
+    def __init__(self):
+        super().__init__()
+        # Attribute names fc1/fc2/fc3 are the state_dict keys, so they are
+        # part of the saved-model format.
+        self.fc1 = nn.Linear(8, 8)
+        self.fc2 = nn.Linear(8, 6)
+        self.fc3 = nn.Linear(6, 1)
+
+    def forward(self, x):
+        """Apply tanh after fc1, sigmoid after fc2, and return the raw fc3 output."""
+        hidden = self.fc1(x).tanh()
+        hidden = self.fc2(hidden).sigmoid()
+        return self.fc3(hidden)
+
+
+def get_model():
+    """Return a freshly constructed NeuralNet."""
+    return NeuralNet()
@@ -0,0 +1,161 @@
+import argparse
+import numpy as np
+import os
+import sys
+import logging
+import json
+import shutil
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+from pytorch_model_def import get_model
+
+
+# Module-level logger: accepts DEBUG and above and writes records to stdout
+# through a dedicated stream handler.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+
+
+def parse_args():
+    """
+    Parse the arguments that the SageMaker API passes
+    to the container
+
+    Returns the ``(namespace, unknown_args)`` pair produced by
+    ``parse_known_args``, so unrecognised options are tolerated.
+    """
+
+    # (flag, type, default) for every supported option.
+    option_specs = (
+        # Hyperparameters sent by the client arrive as command-line arguments
+        ("--epochs", int, 1),
+        ("--batch_size", int, 64),
+        ("--learning_rate", float, 0.1),
+        # Data directories
+        ("--train", str, os.environ.get("SM_CHANNEL_TRAIN")),
+        ("--test", str, os.environ.get("SM_CHANNEL_TEST")),
+        # Model directory: defaults to the value SageMaker sets, /opt/ml/model
+        ("--model_dir", str, os.environ.get("SM_MODEL_DIR")),
+    )
+
+    arg_parser = argparse.ArgumentParser()
+    for flag, value_type, fallback in option_specs:
+        arg_parser.add_argument(flag, type=value_type, default=fallback)
+
+    return arg_parser.parse_known_args()
+
+
+def get_train_data(train_dir):
+    """
+    Load x_train.npy and y_train.npy from ``train_dir`` and return them
+    as a pair of tensors
+    """
+
+    # Features are loaded first, then targets.
+    features, targets = (
+        np.load(os.path.join(train_dir, file_name))
+        for file_name in ("x_train.npy", "y_train.npy")
+    )
+    logger.info(f"x train: {features.shape}, y train: {targets.shape}")
+
+    return torch.from_numpy(features), torch.from_numpy(targets)
+
+
+def get_test_data(test_dir):
+    """
+    Load x_test.npy and y_test.npy from ``test_dir`` and return them
+    as a pair of tensors
+    """
+
+    # Features are loaded first, then targets.
+    features, targets = (
+        np.load(os.path.join(test_dir, file_name))
+        for file_name in ("x_test.npy", "y_test.npy")
+    )
+    logger.info(f"x test: {features.shape}, y test: {targets.shape}")
+
+    return torch.from_numpy(features), torch.from_numpy(targets)
+
+
+def model_fn(model_dir):
+    """
+    Load the model for inference
+
+    :param model_dir: directory that contains ``model.pth``
+    :return: the model in eval mode, placed on CUDA when available, else CPU
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = get_model()
+    # map_location remaps tensors that were saved from a GPU, so the
+    # checkpoint still loads on a CPU-only inference host.
+    state_dict = torch.load(os.path.join(model_dir, "model.pth"), map_location=device)
+    model.load_state_dict(state_dict)
+    model.eval()
+    return model.to(device)
+
+
+def input_fn(request_body, request_content_type):
+    """
+    Deserialize and prepare the prediction input
+
+    :param request_body: JSON-encoded payload
+    :param request_content_type: must be "application/json"
+    :return: torch.Tensor built from the decoded payload
+    :raises ValueError: for any other content type (previously the function
+        fell through and returned None, which only failed later in predict_fn)
+    """
+
+    if request_content_type != "application/json":
+        raise ValueError(
+            "Unsupported content type: {}".format(request_content_type)
+        )
+
+    return torch.tensor(json.loads(request_body))
+
+
+def predict_fn(input_data, model):
+    """
+    Apply model to the incoming request
+
+    :param input_data: torch.Tensor of inputs (cast to float before the call)
+    :param model: torch.nn.Module used for the forward pass
+    :return: numpy array holding the model output for the first input row
+    """
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()
+    with torch.no_grad():
+        # The input has to be on the same device as the model.
+        output = model(input_data.to(device).float())
+    # .numpy() only works on CPU tensors, so copy back first.
+    return output.cpu().numpy()[0]
+
+
+def train():
+    """
+    Train the PyTorch model
+
+    Reads the module-level ``args`` and ``device`` that the ``__main__``
+    block sets. Trains with SGD on an MSE loss, prints the test-set MSE,
+    saves ``model.pth`` into ``args.model_dir`` and copies the inference
+    scripts into ``<model_dir>/code/``.
+    """
+
+    x_train, y_train = get_train_data(args.train)
+    x_test, y_test = get_test_data(args.test)
+    train_ds = TensorDataset(x_train, y_train)
+
+    batch_size = args.batch_size
+    epochs = args.epochs
+    learning_rate = args.learning_rate
+    logger.info(
+        "batch_size = {}, epochs = {}, learning rate = {}".format(
+            batch_size, epochs, learning_rate
+        )
+    )
+
+    train_dl = DataLoader(train_ds, batch_size, shuffle=True)
+
+    model = get_model()
+    model = model.to(device)
+    criterion = nn.MSELoss()
+    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+    # Epochs are numbered from 1 for logging; the loop variable is no longer
+    # mutated inside the loop.
+    for epoch in range(1, epochs + 1):
+        for x_train_batch, y_train_batch in train_dl:
+            # Batches must be on the same device as the model, otherwise the
+            # forward pass fails on a GPU instance.
+            x_train_batch = x_train_batch.to(device)
+            y_train_batch = y_train_batch.to(device)
+            y = model(x_train_batch.float())
+            loss = criterion(y.flatten(), y_train_batch.float())
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+        logger.info(f"epoch: {epoch} -> loss: {loss}")
+
+    # evaluate on test set
+    with torch.no_grad():
+        x_test, y_test = x_test.to(device), y_test.to(device)
+        y = model(x_test.float()).flatten()
+        mse = ((y - y_test) ** 2).sum() / y_test.shape[0]
+    # .numpy() requires a CPU tensor.
+    print("\nTest MSE:", mse.cpu().numpy())
+
+    torch.save(model.state_dict(), args.model_dir + "/model.pth")
+    # PyTorch requires that the inference script must
+    # be in the .tar.gz model file and Step Functions SDK doesn't do this.
+    inference_code_path = args.model_dir + "/code/"
+
+    if not os.path.exists(inference_code_path):
+        # makedirs also creates missing parents and tolerates a concurrent create.
+        os.makedirs(inference_code_path, exist_ok=True)
+        logger.info("Created a folder at {}!".format(inference_code_path))
+
+    # NOTE(review): both paths are relative, so this depends on the working
+    # directory being the script's own directory — confirm for the container.
+    shutil.copy("train_deploy_pytorch_without_dependencies.py", inference_code_path)
+    shutil.copy("pytorch_model_def.py", inference_code_path)
+    logger.info("Saving models files to {}".format(inference_code_path))
+
+
+# Script entry point: runs only when the file is executed directly.
+if __name__ == "__main__":
+
+    # args and device are bound at module level because train() reads them
+    # as globals; the list of unrecognised arguments is discarded.
+    args, _ = parse_args()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    train()
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .cross_validation_xgboost import *`