diff --git a/Technocollab_Mini_Project.ipynb b/Technocollab_Mini_Project.ipynb new file mode 100644 index 0000000..12f6fa6 --- /dev/null +++ b/Technocollab_Mini_Project.ipynb @@ -0,0 +1,1438 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Technocollab Mini Project.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 474 + }, + "id": "7DvkOHqVAwuT", + "outputId": "cf04fefd-41a9-48b9-ac21-59222705ba81" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "There are 51 rows and 5 columns.\n", + "\n", + "\n", + "RangeIndex: 51 entries, 0 to 50\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 state 51 non-null object \n", + " 1 drvr_fatl_col_bmiles 51 non-null float64\n", + " 2 perc_fatl_speed 51 non-null int64 \n", + " 3 perc_fatl_alcohol 51 non-null int64 \n", + " 4 perc_fatl_1st_time 51 non-null int64 \n", + "dtypes: float64(1), int64(3), object(1)\n", + "memory usage: 2.1+ KB\n", + "None\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " state drvr_fatl_col_bmiles perc_fatl_speed perc_fatl_alcohol \\\n", + "46 Virginia 12.7 19 27 \n", + "47 Washington 10.6 42 33 \n", + "48 West Virginia 23.8 34 28 \n", + "49 Wisconsin 13.8 36 33 \n", + "50 Wyoming 17.4 42 32 \n", + "\n", + " perc_fatl_1st_time \n", + "46 88 \n", + "47 86 \n", + "48 87 \n", + "49 84 \n", + "50 90 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statedrvr_fatl_col_bmilesperc_fatl_speedperc_fatl_alcoholperc_fatl_1st_time
46Virginia12.7192788
47Washington10.6423386
48West Virginia23.8342887
49Wisconsin13.8363384
50Wyoming17.4423290
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
# %% Imports
# (No `%matplotlib` magic needed: current Jupyter/Colab kernels render
# figures inline by default.)
import pandas as pd
import seaborn as sns

# %% Load the road-accident data
# Read in `road-accidents.csv`; the file is pipe-delimited and contains
# `#`-prefixed comment lines that must be skipped.
car_acc = pd.read_csv('road-accidents.csv', comment='#', sep='|')

# Save the number of rows and columns as a tuple
rows_and_cols = car_acc.shape
print('There are {} rows and {} columns.\n'.format(
    rows_and_cols[0], rows_and_cols[1]))

# Generate an overview of the DataFrame.
# `DataFrame.info()` writes its report to stdout itself and returns None, so
# its result is neither captured nor printed (doing so only emitted a stray
# "None" after the report).
car_acc.info()

# %% Peek at the data
# Display the last five rows of the DataFrame
car_acc.tail()

# %% [markdown]
# Graphical Summary

# %% Summary statistics
# Compute the summary statistics of all numeric columns in `car_acc`
sum_stat_car = car_acc.describe()
print("Summary: ",sum_stat_car)

# %% Pairwise scatter plots
# Explore the observations themselves. Passing `sum_stat_car` here would plot
# the eight summary rows (count, mean, std, ...) against each other rather
# than the per-state measurements.
sns.pairplot(car_acc);
# %% [markdown]
# ***Finding Correlation***

# %% Pairwise Pearson correlations between the numeric measurements
# `state` is a string column. Older pandas silently dropped it from `corr()`,
# but pandas >= 2.0 raises instead, so the numeric columns are selected
# explicitly. `select_dtypes` behaves the same on every pandas version.
corr_columns = car_acc.select_dtypes(include='number').corr()
corr_columns
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
drvr_fatl_col_bmilesperc_fatl_speedperc_fatl_alcoholperc_fatl_1st_time
drvr_fatl_col_bmiles1.000000-0.0290800.199426-0.017942
perc_fatl_speed-0.0290801.0000000.2862440.014066
perc_fatl_alcohol0.1994260.2862441.000000-0.245455
perc_fatl_1st_time-0.0179420.014066-0.2454551.000000
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
# %% [markdown]
# ***Applying Linear Regression Model***

# %% Imports for modelling and plotting (gathered in one place)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# %% Multivariate linear regression
# Create the features and target DataFrames
features = car_acc[['perc_fatl_speed', 'perc_fatl_alcohol', 'perc_fatl_1st_time']]
target = car_acc['drvr_fatl_col_bmiles']

# Create a linear regression object and fit it on all three features at once
reg = linear_model.LinearRegression()
reg.fit(features, target)

# Retrieve the regression coefficients (one per feature, in column order)
fit_coef = reg.coef_
fit_coef

# %% [markdown]
# ***Perform PCA on standardized data***

# %% Standardize the features and fit a full PCA
# Standardize and center the feature columns so that no feature dominates the
# decomposition merely because of its scale.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Fit the standardized data to a PCA that keeps every component
pca = PCA()
pca.fit(features_scaled)

# %% Scree plot: proportion of variance explained per component
# Derive the tick positions from the fitted model instead of hard-coding
# [1, 2, 3], so the plot stays correct if the feature set changes.
component_numbers = list(range(1, pca.n_components_ + 1))
plt.bar(component_numbers, pca.explained_variance_ratio_)
plt.xlabel('Principal component #')
plt.ylabel('Proportion of variance explained')
plt.xticks(component_numbers)

# Cumulative proportion of variance explained by the first two principal
# components: index 1 of the running sum is ratio[0] + ratio[1].
two_first_comp_var_exp = pca.explained_variance_ratio_.cumsum()[1]
print("The cumulative variance of the first two principal components is {}".format(
    round(two_first_comp_var_exp, 5)))

# %% [markdown]
# ***Visualize first two principal components***

# %% Project the scaled features onto the first two principal components
# A separate name is used for the 2-component model so the full `pca` fitted
# above is not overwritten; the scree-plot cell therefore gives the same
# result no matter which cells were run before it.
pca_2d = PCA(n_components=2)
p_comps = pca_2d.fit_transform(features_scaled)

# Extract the first and second component to use for the scatter plot
p_comp1 = p_comps[:, 0]
p_comp2 = p_comps[:, 1]

# Plot the first two principal components in a scatter plot
plt.scatter(p_comp1, p_comp2)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2');
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAWpUlEQVR4nO3df6zddX3H8edrterNdLvbeif0tl3Z1tQxqnSeIKbLAgq2EAO1yoQlKk7TZJGoi+tCR4I//mk3Ejc3idgAURcnmtGWKt0qWAxbJo5bWiyl1HVERw9sXMGihLtJ4b0/zim9vZxz7z3nfM/31+f1SG56vt/z7fl+zv3x/n7O+/P+fL6KCMzMrP5+oegGmJlZPhzwzcwS4YBvZpYIB3wzs0Q44JuZJeIVRTdgNosWLYrly5cX3Qwzs8rYt2/fjyNirNNzpQ74y5cvZ2JiouhmmJlVhqQfdXvOKR0zs0Q44JuZJcIB38wsEQ74ZmaJcMA3M0tEqat0bH527m9yw54jPH58isWjI2xau5L1q8eLbpaZlYwDfsXt3N9k8/aDTD3/AgDN41Ns3n4QIPOg7wuLWbU5pVNxN+w58lKwP2nq+Re4Yc+RTM9z8sLSPD5FcOrCsnN/M9PzmNnwOOBX3OPHp3ra36+8LixmNjwO+BW3eHSkp/39yuvCYmbD44BfcZvWrmRk4YLT9o0sXMCmtSszPU9eFxYzGx4H/Ipbv3qcLRtWMT46goDx0RG2bFiV+WBqXhcWMxseV+nUwPrV40Ovljn5+q7SMasuB3ybtzwuLGY2PE7pmJklwgHfzCwRDvhmZolwwDczS4QDvplZIgYO+JKWSrpH0sOSDkn6aIdjJOlvJR2V9H1Jvzfoec3MrDdZlGWeAD4eEQ9Iei2wT9JdEfHwtGMuAVa0v94MfL79r5mZ5WTgHn5EPBERD7Qf/ww4DMws1r4c+HK03AeMSjpz0HObmdn8ZZrDl7QcWA18b8ZT48Bj07aP8fKLwsnX2ChpQtLE5ORkls0zM0taZgFf0muA24GPRcRP+32diNgWEY2IaIyNjWXVPDOz5GUS8CUtpBXsvxIR2zsc0gSWTtte0t5nZmY5yaJKR8AtwOGI+EyXw3YB72tX65wPPBMRTwx6bjMzm78sqnTWAO8FDko60N73F8AygIi4CdgNXAocBZ4DPpDBec3MrAcDB/yI+FdAcxwTwIcHPZeZmfXPyyNbpezc3/Sa/GZ9csC3yti5v8nm7Qdfupl68/gUm7cfBHDQN5sHr6VjlXHDniMvBfuTpp5/gRv2HCmoRWbV4oBvlfH48ame9pvZ6RzwrTIWj470tN/MTueAb5Wxae1KRhYuOG3fyMIFbFq7sqAWmVWLB22tMk4OzLpKx6w/DvhWKetXjzvAm/XJKR0zs0Q44JuZJcIB38wsEQ74ZmaJ8KCt5cJr4JgVzwHfhs5r4LT4omdFc0rHhs5r4Jy66DWPTxGcuujt3O8bv1l+HPBt6LwGji96Vg5Z3dP2VklPSnqoy/MXSHpG0oH21/VZnNeqwWvg+KJn5ZBVD/+LwLo5jvmXiDi3/fXpjM5rFeA1cHzRs3LIJOBHxL3A01m8ltXP+tXjbNmwivHREQSMj46wZcOqpAYsfdGzMsizSuctkh4EHgf+LCIOdTpI0kZgI8CyZctybJ4NU+pr4HjhNysDte4vnsELScuBb0bEOR2e+yXgxYh4VtKlwGcjYsVcr9loNGJiYiKT9pmZpUDSvohodHoulyqdiPhpRDzbfrwbWChpUR7nNjOzllwCvqQzJKn9+Lz2eZ/K49xmZtaSSQ5f0leBC4BFko4BnwAWAkTETcC7gT+RdAKYAq6MrHJJZmY2L5kE/Ii4ao7nPwd8LotzmZlZfzzT1swsEQ74ZmaJcMA3M0uEA76ZWSIc8M3MEuGAb2aWCAd8M7NE+BaHffCt6sysihzwe+T7s5pZV
Tml0yPfqs7Mqso9/B75VnVOaZlVlXv4PUr9VnUnU1rN41MEp1JaO/c3i26amc3BAb9Hqd+qzikts+pySqdHqd+qzikts+pywO9DyvdnXTw6QrNDcE8lpWVWZU7pWE9ST2mZVZl7+NaT1FNaZlWW1S0ObwXeATwZEed0eF7AZ4FLgeeAqyPigSzObflLOaU1KJe0WpGySul8EVg3y/OXACvaXxuBz2d0XrPKcEmrFS2TgB8R9wJPz3LI5cCXo+U+YFTSmVmc26wqXNJqRcsrhz8OPDZt+1h73xMzD5S0kdanAJYtW5ZL48rCH/fLYxg/C5e0WtFKN2gbEduAbQCNRiMKbk5uvChbeQzrZ+GS1tm5wzN8eZVlNoGl07aXtPdZmz/ul8ewfhZlLWndub/Jmq17OevaO1mzdW8hYwoe38hHXgF/F/A+tZwPPBMRL0vnpMwf98tjWD+L9avH2bJhFeOjIwgYHx1hy4ZVhfZiyxJo3eHJR1ZlmV8FLgAWSToGfAJYCBARNwG7aZVkHqVVlvmBLM5bJ/64Xx7D/FmUraR1tkCbZzvd4clHVlU6V0XEmRGxMCKWRMQtEXFTO9jTrs75cET8VkSsioiJLM5bJ2X9uJ+ilH4WZQm0qa9CmxcvrVASZfy4n6qUfhZlCbQpXWSLpIjyFsI0Go2YmPCHAbNhmVmRBK1AW8QFzlU62ZC0LyIanZ4rXVmmmeWnTGsjlW18o44c8M0S50CbDufwzcwS4R5+DTj3aWbz4YBfcV6SwczmyymdivMMRTObL/fwKy7LiTNODZnVm3v4FZfVxJmyrKliZsPjgF9xWc1QdGrIrP6c0qm4rCbOlGVNFTMbHgf8Gshi4oxX6zSrP6d0DPDiVWYpcA/fgHKtqWJmw+GAby/xmipm9eaUjplZIjIJ+JLWSToi6aikazs8f7WkSUkH2l8fyuK8ZmY2fwOndCQtAG4ELgaOAfdL2hURD8849GsRcc2g5zMzs/5kkcM/DzgaEY8CSLoNuByYGfDNrEde7sKylEXAHwcem7Z9DHhzh+PeJekPgB8AfxoRj3U4BkkbgY0Ay5Yty6B5ZtXklVDzV/cLbF6Dtt8AlkfEG4C7gC91OzAitkVEIyIaY2NjOTXPrHy83EW+UlhPKosefhNYOm17SXvfSyLiqWmbNwN/lcF5e1b3q7flI6/fIy93ka/ZLrB1iRNZ9PDvB1ZIOkvSK4ErgV3TD5B05rTNy4DDGZy3JylcvW348vw9ymolVJufFC6wAwf8iDgBXAPsoRXIvx4RhyR9WtJl7cM+IumQpAeBjwBXD3reXvnjsWUhz9+jbstdXPj6MdZs3ctZ197Jmq173WnJSAoX2Exm2kbEbmD3jH3XT3u8Gdicxbn6lcLV24av0wJzs+0fRKflLi58/Ri372t6IHcINq1dedogOdRvPalkllbwapCWhQUSL0R03D8MM5e7WLN1b+3zzEVJYT2pZAJ+CldvG75OwX62/VnzJ9Xhqvt6UsmspbN+9ThbNqxifHQEAeOjI2zZsKrWP1zL3niXT4Td9mcthTyzDU8yPXyo/9Xbhq/oT4pFn9+qrfYB37X3lqWi87xFn9+qTZFT7rEfjUYjJiYm+v7/M6emQ6s35FSOVYE7K9YPSfsiotHpuVrn8F17b1XliYI2DLUO+K5osKpyZ8WGodYB3xUNVlXurNgw1Drgd5uaXsaKhp37m54uby9xZ8WGodYBvyq1987X2kxV6qxYddS+LLMKtfcpLMtqvXH5pQ1D7QN+FThfWw9Zl1FWobNi1VLrlE5VOF9bfU7LWRU44JeA87XV5zJKqwKndErA+dqXq9osU6flrAoyCfiS1gGfBRYAN0fE1hnPvwr4MvAm4CngPRHxwyzOXRfO154yc0mMKtzkw/dbsCoYOKUjaQFwI3AJcDZwlaSzZxz2QeAnEfHbwF8Dfznoea2+qpgecVrOqiCLHP55wNGIeDQifg7cBlw+45jLgS+1H/8j8DZpS
LcIssqrYnqkKnM+LG1ZpHTGgcembR8D3tztmIg4IekZ4NeAH898MUkbgY0Ay5Yty6B5VjVVTY84LWdlV7oqnYjYFhGNiGiMjY0V3RwrgNMjVjVVWRolix5+E1g6bXtJe1+nY45JegXwy7QGb81exlVLViVVKjLIIuDfD6yQdBatwH4l8EczjtkFvB/4LvBuYG+U+M4rVSsJrCOnR6wqqrQ0ysABv52TvwbYQ6ss89aIOCTp08BEROwCbgH+XtJR4GlaF4VSqtLV2syKV6Uig0zq8CNiN7B7xr7rpz3+X+CKLM7Vq15761W6WptZ8apUZFC6Qdss9bO+SZWu1mZWvCoVGdQ64PczgccLmZlZL6o0B6PWa+n001vftHblaTl8KO/V2szKoSpFBrUO+P3k1qpQEjjMKiJXKFlV+He1d7UO+P321st8tR5mFZErlKybsgVX/672p9Y5/Crl1uZrmAuLFbVoWVVmKaaqjDd3qeICe2VQ6x4+lLu33o9hVhEVUaHknlr5lbFU2dV0/al1D7+OhllFVESFkntq5VfG4Opquv444FfMMGt+i6gnLmMwsdOVMbhWqfa9TBzwK2aY4xJFjHmUMZjY6coYXOs4PpcHlXgNMxqNRkxMTBTdDBuimTl8aAUT//GWS69VOmWr6kmJpH0R0ej0XO0Hba3cqjDvwXorfvBAfHk54Fvh6lZJVbSie9dlrOqxFgd8sxopQ+/aA/Hl5UFbsxopQ5mrB+LLywHfrEbK0LsuY1WPtTjgm9VIGXrXLpksr4Fy+JJ+FfgasBz4IfCHEfGTDse9ABxsb/5XRFw2yHnNrLOyLO/tgfhyGrSHfy3w7YhYAXy7vd3JVESc2/5ysDcbEveubTYDTbySdAS4ICKekHQm8J2IeFlXQtKzEfGaXl/fE6/MzHoz28SrQXv4r4uIJ9qP/xt4XZfjXi1pQtJ9ktbP9oKSNraPnZicnByweWZmdtKcOXxJdwNndHjquukbERGSun1c+I2IaEr6TWCvpIMR8Z+dDoyIbcA2aPXw52qfDVfRk3jMLDtzBvyIuKjbc5L+R9KZ01I6T3Z5jWb730clfQdYDXQM+FYenSbxfOxrB/jkrkN88rLfdeA3q5hBUzq7gPe3H78fuGPmAZJ+RdKr2o8XAWuAhwc8r+Wg0yQegONTzxd+xyMz692gAX8rcLGk/wAuam8jqSHp5vYxvwNMSHoQuAfYGhEO+BUw22Qd36TErHoGqsOPiKeAt3XYPwF8qP3434BVg5zHirF4dITmLEHfa6OYVYtn2lpXnabIT+e1UcyqxatlWlcnB2U/9Y1D/OS55097zmujmFWPe/g2q/Wrx9l//dv5m/ec69mbZhXnHr7Ni9dGMas+9/DNzBLhgG9mlggHfDOzRDjgm5klwgHfzCwRrtIxy4lXHrWiOeCb5aDTyqObt7fu+umgb3lxSscsB51WHvUCdJY3B3yzHHRbaM4L0FmeHPDNctBtoTkvQGd5csA3y0GnlUe9AJ3lzYO2Zjk4OTDrKh0rkgO+WU68AJ0VbaCUjqQrJB2S9KKkxizHrZN0RNJRSdcOck4zM+vPoD38h4ANwBe6HSBpAXAjcDFwDLhf0i7f17acPDnIrL4GvaftYQBJsx12HnA0Ih5tH3sbcDnggF8ynhxkVm95VOmMA49N2z7W3teRpI2SJiRNTE5ODr1xdkqvk4N27m+yZutezrr2TtZs3cvO/c08mmlmfZqzhy/pbuCMDk9dFxF3ZN2giNgGbANoNBqR9etbd71MDqr6pwGnrixFcwb8iLhowHM0gaXTtpe091nJLB4dodkhuHeaHDTbp4GyB86qX6zM+pVHSud+YIWksyS9ErgS2JXDea1HvUwOqvJSAV7XxlI1aFnmOyUdA94C3ClpT3v/Ykm7ASLiBHANsAc4DHw9Ig4N1mwbhvWrx9myYRXjoyMIGB8dYcuGVR17vVVeKqDKFyuzQQxapbMD2NFh/+PApdO2dwO7BzmX5
# %% [markdown]
# ***Finding Clusters***

# %% Elbow plot: KMeans inertia for a range of cluster counts
from sklearn.cluster import KMeans

# Candidate cluster counts are 1 through 9 (`range` excludes its upper bound).
ks = range(1, 10)
inertias = []
for k in ks:
    # `n_init` is pinned because its default changed across scikit-learn
    # releases; pinning keeps the inertias reproducible between versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=8)
    # Fit the scaled features to the KMeans object
    km.fit(features_scaled)
    # Record the within-cluster sum of squares for this k
    inertias.append(km.inertia_)

# Plot the results in a line plot
plt.plot(ks, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia');

# %% [markdown]
# ***KMeans to visualize clusters in the PCA scatter plot***

# %% Three-cluster KMeans shown on the PCA projection
# Create a KMeans object with 3 clusters, use random_state=8
km = KMeans(n_clusters=3, n_init=10, random_state=8)

# Fit the data to the `km` object
km.fit(features_scaled)

# Scatter plot of the first two principal components, coloured according to
# the KMeans cluster assignment
plt.scatter(p_comps[:, 0], p_comps[:, 1], c=km.labels_)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2');

# %% [markdown]
# ***Visualize the feature differences between the clusters***

# %% Violin plots of each feature, split by cluster
# Attach the cluster labels on a new frame rather than mutating `car_acc` in
# place, so cells that already used `car_acc` give the same result on re-run.
car_acc_clustered = car_acc.assign(cluster=km.labels_)

# Reshape the DataFrame to the long format
melt_car = pd.melt(
    car_acc_clustered,
    id_vars=['cluster'],
    var_name='measurement',
    value_name='percent',
    value_vars=['perc_fatl_speed', 'perc_fatl_alcohol', 'perc_fatl_1st_time'],
)

# Violin plot split and coloured by cluster. x/y are passed by keyword with
# `data=`: positional x/y triggers a FutureWarning in seaborn 0.11 and is an
# error from 0.12 on.
sns.violinplot(data=melt_car, x='percent', y='measurement', hue='cluster');

# %% [markdown]
# ***Compute the number of accidents within each cluster***

# %% Combine with miles driven and total the fatal collisions per cluster
# Read in the new dataset
miles_driven = pd.read_csv('miles-driven.csv', sep='|')

display(miles_driven.head())

# Merge the clustered accident data with the `miles_driven` DataFrame.
# `validate` rejects duplicated state keys, and the assertion guards against
# the inner join silently dropping states that lack a mileage record.
car_acc_miles = car_acc_clustered.merge(miles_driven, on='state', validate='one_to_one')
assert len(car_acc_miles) == len(car_acc_clustered), 'some states have no mileage record'

# Create a new column for the number of drivers involved in fatal accidents.
# Dividing by 1000 presumably converts million miles to billion miles to match
# the per-billion-miles rate -- inferred from the column names; confirm against
# the data dictionary.
car_acc_miles['num_drvr_fatl_col'] = (car_acc_miles['drvr_fatl_col_bmiles'] * car_acc_miles['million_miles_annually']) / 1000

display(car_acc_miles.head())

# Create a barplot of the total number of accidents per cluster
sns.barplot(x='cluster', y='num_drvr_fatl_col', data=car_acc_miles, estimator=sum, ci=None)

# Calculate the number of states in each cluster and their 'num_drvr_fatl_col' mean and sum.
count_mean_sum = car_acc_miles.groupby('cluster')['num_drvr_fatl_col'].agg(['count', 'mean', 'sum'])
count_mean_sum
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statemillion_miles_annually
0Alabama64914
1Alaska4593
2Arizona59575
3Arkansas32953
4California320784
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + " state drvr_fatl_col_bmiles perc_fatl_speed perc_fatl_alcohol \\\n", + "0 Alabama 18.8 39 30 \n", + "1 Alaska 18.1 41 25 \n", + "2 Arizona 18.6 35 28 \n", + "3 Arkansas 22.4 18 26 \n", + "4 California 12.0 35 28 \n", + "\n", + " perc_fatl_1st_time cluster million_miles_annually num_drvr_fatl_col \n", + "0 80 0 64914 1220.3832 \n", + "1 94 2 4593 83.1333 \n", + "2 96 2 59575 1108.0950 \n", + "3 95 2 32953 738.1472 \n", + "4 89 2 320784 3849.4080 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statedrvr_fatl_col_bmilesperc_fatl_speedperc_fatl_alcoholperc_fatl_1st_timeclustermillion_miles_annuallynum_drvr_fatl_col
0Alabama18.83930800649141220.3832
1Alaska18.14125942459383.1333
2Arizona18.63528962595751108.0950
3Arkansas22.4182695232953738.1472
4California12.035288923207843849.4080
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " count mean sum\n", + "cluster \n", + "0 18 911.406439 16405.3159\n", + "1 11 860.505945 9465.5654\n", + "2 22 898.378595 19764.3291" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeansum
cluster
018911.40643916405.3159
111860.5059459465.5654
222898.37859519764.3291
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 13 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## *I would choose cluster 2...*" + ], + "metadata": { + "id": "a90qB4aVEqqa" + } + } + ] +} \ No newline at end of file