|
105 | 105 | "algorithm_arn = AlgorithmArnProvider.get_algorithm_arn(region)"
|
106 | 106 | ]
|
107 | 107 | },
|
108 |
| - { |
109 |
| - "cell_type": "code", |
110 |
| - "execution_count": null, |
111 |
| - "metadata": {}, |
112 |
| - "outputs": [], |
113 |
| - "source": [ |
114 |
| - "import subprocess\n", |
115 |
| - "\n", |
116 |
| - "subprocess.run(\"apt-get update -y\", shell=True)\n", |
117 |
| - "subprocess.run(\"apt install unzip\", shell=True)" |
118 |
| - ] |
119 |
| - }, |
120 | 108 | {
|
121 | 109 | "cell_type": "markdown",
|
122 | 110 | "metadata": {},
|
123 | 111 | "source": [
|
124 | 112 | "### Step 3: Get the data\n",
|
125 | 113 | "\n",
|
126 |
| - "In this example we'll use the direct-marketing dataset to build a binary classification model that predicts whether customers will accept or decline a marketing offer. \n", |
127 |
| - "First we'll download the data and split it into train and test sets. AutoGluon does not require a separate validation set (it uses bagged k-fold cross-validation)." |
| 114 | + "In this example we'll use the [UCI Machine Learning Repository: Adult Data Set](https://archive.ics.uci.edu/ml/datasets/adult) [1] to build a binary classification model that predicts whether a person's annual income exceeds 50K USD. \n", |
| 115 | + "First we'll download the data and split it into train and test sets. AutoGluon does not require a separate validation set (it uses bagged k-fold cross-validation).\n", |
| 116 | + "\n", |
| 117 | + "[1] Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). Irvine, CA: University of California, School of Information and Computer Science." |
128 | 118 | ]
|
129 | 119 | },
|
130 | 120 | {
|
|
133 | 123 | "metadata": {},
|
134 | 124 | "outputs": [],
|
135 | 125 | "source": [
|
136 |
| - "# Download and unzip the data\n", |
137 |
| - "subprocess.run(\n", |
138 |
| - " f\"aws s3 cp --region {region} s3://sagemaker-sample-data-{region}/autopilot/direct_marketing/bank-additional.zip .\",\n", |
139 |
| - " shell=True,\n", |
140 |
| - ")\n", |
141 |
| - "subprocess.run(\"unzip -qq -o bank-additional.zip\", shell=True)\n", |
142 |
| - "subprocess.run(\"rm bank-additional.zip\", shell=True)\n", |
143 |
| - "\n", |
144 |
| - "local_data_path = \"./bank-additional/bank-additional-full.csv\"\n", |
145 |
| - "data = pd.read_csv(local_data_path)\n", |
| 126 | + "# Download the data\n", |
| 127 | + "s3 = boto3.client(\"s3\")\n", |
| 128 | + "s3.download_file(\"autogluon\", \"datasets/Inc/train.csv\", \"train.csv\")\n", |
| 129 | + "s3.download_file(\"autogluon\", \"datasets/Inc/test.csv\", \"test.csv\")\n", |
146 | 130 | "\n",
|
147 | 131 | "# Split train/test data\n",
|
148 |
| - "train = data.sample(frac=0.7, random_state=42)\n", |
149 |
| - "test = data.drop(train.index)\n", |
| 132 | + "train = pd.read_csv('train.csv')\n", |
| 133 | + "test = pd.read_csv('test.csv')\n", |
150 | 134 | "\n",
|
151 | 135 | "# Split test X/y\n",
|
152 |
| - "label = \"y\"\n", |
| 136 | + "label = \"class\"\n", |
153 | 137 | "y_test = test[label]\n",
|
154 | 138 | "X_test = test.drop(columns=[label])"
|
155 | 139 | ]
|
|
220 | 204 | "outputs": [],
|
221 | 205 | "source": [
|
222 | 206 | "# Define required label and optional additional parameters\n",
|
223 |
| - "init_args = {\"label\": \"y\"}\n", |
| 207 | + "init_args = {\"label\": \"class\"}\n", |
224 | 208 | "\n",
|
225 | 209 | "# Define additional parameters\n",
|
226 | 210 | "fit_args = {\n",
|
|
434 | 418 | "name": "python",
|
435 | 419 | "nbconvert_exporter": "python",
|
436 | 420 | "pygments_lexer": "ipython3",
|
437 |
| - "version": "3.6.10" |
| 421 | + "version": "3.6.13" |
438 | 422 | }
|
439 | 423 | },
|
440 | 424 | "nbformat": 4,
|
|
0 commit comments