67 | 67 | "import numpy as np\n",
68 | 68 | "import re\n",
69 | 69 | "from sagemaker import get_execution_role\n",
| 70 | + "import sagemaker\n",
70 | 71 | "\n",
71 | 72 | "region = boto3.Session().region_name\n",
72 | 73 | "\n",
73 | 74 | "role = get_execution_role()\n",
74 | 75 | "\n",
75 | | - "kms_key_arn = \"<your-kms-key-arn>\"\n",
| 76 | + "kms_key = \"<your-kms-key-arn>\"\n",
76 | 77 | "\n",
77 | | - "bucket = \"<s3-bucket>\" # put your s3 bucket name here, and create s3 bucket\n",
| 78 | + "bucket = sagemaker.Session().default_bucket()\n",
78 | 79 | "prefix = \"sagemaker/DEMO-kms\"\n",
79 | 80 | "# customize to your bucket where you have stored the data\n",
80 | 81 | "bucket_path = \"s3://{}\".format(bucket)"

90 | 91 | "\n",
91 | 92 | "### Data ingestion\n",
92 | 93 | "\n",
93 | | - "We first read the dataset from an existing repository into memory. This processing could be done *in situ* by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. The next step is then to transfer the data to S3 for use in training. For small datasets, such as the one used below, reading into memory isn't onerous, though it would be for larger datasets."
| 94 | + "We first read the dataset from an existing repository into memory. This processing could be done *in situ* by Amazon Athena, Apache Spark in Amazon EMR, Amazon Redshift, etc., assuming the dataset is present in the appropriate location. The next step is then to transfer the data to S3 for use in training. For small datasets, such as the one used below, reading into memory isn't onerous, though it would be for larger datasets.\n",
| 95 | + "\n",
| 96 | + "This example uses the California Housing dataset, originally published in:\n",
| 97 | + "\n",
| 98 | + "> Pace, R. Kelley, and Ronald Barry. \"Sparse spatial autoregressions.\" Statistics & Probability Letters 33.3 (1997): 291-297."
94 | 99 | ]
95 | 100 | },
96 | 101 | {

99 | 104 | "metadata": {},
100 | 105 | "outputs": [],
101 | 106 | "source": [
102 | | - "from sklearn.datasets import load_boston\n",
| 107 | + "from sklearn.datasets import fetch_california_housing\n",
103 | 108 | "\n",
104 | | - "boston = load_boston()\n",
105 | | - "X = boston[\"data\"]\n",
106 | | - "y = boston[\"target\"]\n",
107 | | - "feature_names = boston[\"feature_names\"]\n",
| 109 | + "california = fetch_california_housing()\n",
| 110 | + "X = california[\"data\"]\n",
| 111 | + "y = california[\"target\"]\n",
| 112 | + "feature_names = california[\"feature_names\"]\n",
108 | 113 | "data = pd.DataFrame(X, columns=feature_names)\n",
109 | 114 | "target = pd.DataFrame(y, columns={\"MEDV\"})\n",
110 | 115 | "data[\"MEDV\"] = y\n",
111 | | - "local_file_name = \"boston.csv\"\n",
| 116 | + "local_file_name = \"california_housing.csv\"\n",
112 | 117 | "data.to_csv(local_file_name, header=False, index=False)"
113 | 118 | ]
114 | 119 | },

140 | 145 | "outputs": [],
141 | 146 | "source": [
142 | 147 | "def write_file(X, y, fname, include_labels=True):\n",
143 | | - "    feature_names = boston[\"feature_names\"]\n",
| 148 | + "    feature_names = california[\"feature_names\"]\n",
144 | 149 | "    data = pd.DataFrame(X, columns=feature_names)\n",
145 | 150 | "    if include_labels:\n",
146 | 151 | "        data.insert(0, \"MEDV\", y)\n",

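The cells that split the data and call `write_file` are elided from this diff. As orientation only, here is a minimal sketch of that step; the split ratio, the random seed, and the file names (`train.csv`, `validation.csv`) are assumptions, with only the `write_file` signature and the `train_file` variable taken from the visible diff.

```python
# Hypothetical sketch of the elided split-and-write step; ratio and file names are assumed.
import numpy as np

np.random.seed(0)
indices = np.random.permutation(X.shape[0])
split = int(0.8 * X.shape[0])  # assumed 80/20 train/validation split

train_file = "train.csv"
validation_file = "validation.csv"
write_file(X[indices[:split]], y[indices[:split]], train_file)
write_file(X[indices[split:]], y[indices[split:]], validation_file)
```
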
180 | 185 | "\n",
181 | 186 | "data_train = open(train_file, \"rb\")\n",
182 | 187 | "key_train = \"{}/train/{}\".format(prefix, train_file)\n",
183 | | - "kms_key_id = kms_key_arn.split(\":key/\")[1]\n",
| 188 | + "kms_key_id = kms_key.split(\":key/\")[1]\n",
184 | 189 | "\n",
185 | 190 | "print(\"Put object...\")\n",
186 | 191 | "s3.put_object(\n",

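The `s3.put_object` call is cut off by the diff. A minimal sketch of an SSE-KMS upload with boto3 follows; only `bucket`, `key_train`, `data_train`, and `kms_key_id` come from the visible diff, and the exact arguments the notebook passes are an assumption.

```python
import boto3

# Hypothetical sketch of the truncated upload: server-side encryption with the
# customer-managed KMS key (SSE-KMS). The notebook's exact arguments are elided.
s3 = boto3.client("s3")
s3.put_object(
    Bucket=bucket,
    Key=key_train,
    Body=data_train,
    ServerSideEncryption="aws:kms",  # request SSE-KMS instead of the S3-managed default
    SSEKMSKeyId=kms_key_id,          # key ID parsed from the key ARN above
)
```
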
227 | 232 | "source": [
228 | 233 | "## Training the SageMaker XGBoost model\n",
229 | 234 | "\n",
230 | | - "Now that we have our data in S3, we can begin training. We'll use the Amazon SageMaker XGBoost algorithm as an example to demonstrate model training. Note that nothing needs to be changed in the way you'd call the training algorithm. The only requirement for training to succeed is that the IAM role (`role`) used for S3 access has permissions to encrypt and decrypt data with the KMS key (`kms_key_arn`). You can set these permissions using the instructions [here](http://docs.aws.amazon.com/kms/latest/developerguide/key-policies.html#key-policy-default-allow-users). If the permissions aren't set, you'll get a `Data download failed` error. Specify a `VolumeKmsKeyId` in the training job parameters to have the volume attached to the ML compute instance encrypted using the key provided."
| 235 | + "Now that we have our data in S3, we can begin training. We'll use the Amazon SageMaker XGBoost algorithm as an example to demonstrate model training. Note that nothing needs to be changed in the way you'd call the training algorithm. The only requirement for training to succeed is that the IAM role (`role`) used for S3 access has permissions to encrypt and decrypt data with the KMS key (`kms_key`). You can set these permissions using the instructions [here](http://docs.aws.amazon.com/kms/latest/developerguide/key-policies.html#key-policy-default-allow-users). If the permissions aren't set, you'll get a `Data download failed` error. Specify a `VolumeKmsKeyId` in the training job parameters to have the volume attached to the ML compute instance encrypted using the key provided."
231 | 236 | ]
232 | 237 | },
233 | 238 | {

236 | 241 | "metadata": {},
237 | 242 | "outputs": [],
238 | 243 | "source": [
239 | | - "from sagemaker.amazon.amazon_estimator import get_image_uri\n",
| 244 | + "from sagemaker import image_uris\n",
240 | 245 | "\n",
241 | | - "container = get_image_uri(boto3.Session().region_name, \"xgboost\")"
| 246 | + "container = image_uris.retrieve(\n",
| 247 | + "    region=boto3.Session().region_name, framework=\"xgboost\", version=\"latest\"\n",
| 248 | + ")"
242 | 249 | ]
243 | 250 | },
244 | 251 | {

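For the permission requirement described in the cell above, the usual approach is to add the execution role as a key user in the KMS key policy. A minimal sketch is below; it follows the standard AWS "key users" statement, and the exact policy attached to `kms_key` is not part of this diff.

```python
import json

import boto3

# Hypothetical sketch: add the SageMaker execution role as a user of the KMS key.
# Requires credentials that are allowed to call kms:GetKeyPolicy / kms:PutKeyPolicy.
key_user_statement = {
    "Sid": "Allow use of the key by the SageMaker execution role",
    "Effect": "Allow",
    "Principal": {"AWS": role},  # role ARN returned by get_execution_role()
    "Action": [
        "kms:Encrypt",
        "kms:Decrypt",
        "kms:ReEncrypt*",
        "kms:GenerateDataKey*",
        "kms:DescribeKey",
    ],
    "Resource": "*",  # in a key policy, "*" refers to this key
}

kms = boto3.client("kms")
policy = json.loads(kms.get_key_policy(KeyId=kms_key, PolicyName="default")["Policy"])
policy["Statement"].append(key_user_statement)
kms.put_key_policy(KeyId=kms_key, PolicyName="default", Policy=json.dumps(policy))
```
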
262 | 269 | "        \"InstanceCount\": 1,\n",
263 | 270 | "        \"InstanceType\": \"ml.m4.4xlarge\",\n",
264 | 271 | "        \"VolumeSizeInGB\": 5,\n",
265 | | - "        \"VolumeKmsKeyId\": kms_key_arn,\n",
| 272 | + "        \"VolumeKmsKeyId\": kms_key,\n",
266 | 273 | "    },\n",
267 | 274 | "    \"TrainingJobName\": job_name,\n",
268 | 275 | "    \"HyperParameters\": {\n",

379 | 386 | "print(endpoint_config_name)\n",
380 | 387 | "create_endpoint_config_response = client.create_endpoint_config(\n",
381 | 388 | "    EndpointConfigName=endpoint_config_name,\n",
382 | | - "    KmsKeyId=kms_key_arn,\n",
| 389 | + "    KmsKeyId=kms_key,\n",
383 | 390 | "    ProductionVariants=[\n",
384 | 391 | "        {\n",
385 | 392 | "            \"InstanceType\": \"ml.m4.xlarge\",\n",

509 | 516 | "metadata": {},
510 | 517 | "source": [
511 | 518 | "## Run batch prediction using batch transform\n",
512 | | - "Create a transform job to run batch predictions using the trained model. As in the training section above, the execution role assumed by this notebook must have permissions to encrypt and decrypt data with the KMS key (`kms_key_arn`) used for S3 server-side encryption. As with training, specify a `VolumeKmsKeyId` so that the volume attached to the transform instance is encrypted using the key provided."
| 519 | + "Create a transform job to run batch predictions using the trained model. As in the training section above, the execution role assumed by this notebook must have permissions to encrypt and decrypt data with the KMS key (`kms_key`) used for S3 server-side encryption. As with training, specify a `VolumeKmsKeyId` so that the volume attached to the transform instance is encrypted using the key provided."
513 | 520 | ]
514 | 521 | },
515 | 522 | {

542 | 549 | "    \"TransformResources\": {\n",
543 | 550 | "        \"InstanceCount\": 1,\n",
544 | 551 | "        \"InstanceType\": \"ml.c4.xlarge\",\n",
545 | | - "        \"VolumeKmsKeyId\": kms_key_arn,\n",
| 552 | + "        \"VolumeKmsKeyId\": kms_key,\n",
546 | 553 | "    },\n",
547 | 554 | "}\n",
548 | 555 | "\n",

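The rest of the transform request is elided by the diff. For completeness, the output side of a KMS-encrypted batch transform typically looks like the sketch below; the S3 path, `Accept`, and `AssembleWith` values are assumptions, and only the use of `kms_key` is grounded in the diff above.

```python
# Hypothetical sketch: the TransformOutput section of the create_transform_job request
# can also encrypt the batch results written to S3 with the same customer-managed key.
transform_output = {
    "S3OutputPath": "{}/{}/batch-output".format(bucket_path, prefix),  # assumed path
    "Accept": "text/csv",
    "AssembleWith": "Line",
    "KmsKeyId": kms_key,  # server-side encryption for the transform results
}
# transform_output would be passed as the "TransformOutput" field of the request dict.
```
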
605 | 612 | "name": "python",
606 | 613 | "nbconvert_exporter": "python",
607 | 614 | "pygments_lexer": "ipython3",
608 | | - "version": "3.6.2"
| 615 | + "version": "3.6.13"
609 | 616 | },
610 | 617 | "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
611 | 618 | },