diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..89e4415 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: chris1610 diff --git a/Monte_Carlo_Simulationv2.ipynb b/Monte_Carlo_Simulationv2.ipynb new file mode 100644 index 0000000..6499910 --- /dev/null +++ b/Monte_Carlo_Simulationv2.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monte Carlo Simulation with Python\n", + "\n", + "Notebook to accompany article on [Practical Business Python](https://pbpython.com/monte-carlo.html)\n", + "\n", + "Update to use numpy for faster loops based on comments [here](https://www.reddit.com/r/Python/comments/arxwkm/monte_carlo_simulation_with_python/)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_style('whitegrid')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the variables for the Percent to target based on historical results\n", + "avg = 1\n", + "std_dev = .1\n", + "num_reps = 500\n", + "num_simulations = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Show an example of calculating the percent to target\n", + "pct_to_target = np.random.normal(\n", + " avg,\n", + " std_dev,\n", + " size=(num_reps, num_simulations)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.79328531, 0.99211018, 1.14343423, ..., 0.83737887, 0.93507967,\n", + " 0.86079851],\n", + " [1.03126742, 1.04414961, 1.08119495, ..., 0.98607625, 
1.01161899,\n", + " 0.96872644],\n", + " [1.08616345, 0.93970666, 1.07594111, ..., 0.94057821, 1.00399945,\n", + " 1.05325946],\n", + " ...,\n", + " [1.10388204, 0.90397305, 0.96005999, ..., 0.88810244, 1.18064642,\n", + " 0.94066897],\n", + " [1.07581302, 0.92552317, 1.08256074, ..., 0.91934988, 1.06668758,\n", + " 1.05969099],\n", + " [1.12755095, 0.95080038, 0.978849 , ..., 1.0094155 , 0.94359533,\n", + " 1.06332923]])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pct_to_target[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Another example for the sales target distribution\n", + "sales_target_values = [75_000, 100_000, 200_000, 300_000, 400_000, 500_000]\n", + "sales_target_prob = [.3, .3, .2, .1, .05, .05]\n", + "sales_target = np.random.choice(sales_target_values, p=sales_target_prob, \n", + " size=(num_reps, num_simulations))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 75000, 200000, 75000, ..., 75000, 100000, 200000],\n", + " [200000, 75000, 100000, ..., 200000, 100000, 100000],\n", + " [400000, 75000, 100000, ..., 500000, 200000, 75000],\n", + " ...,\n", + " [500000, 75000, 500000, ..., 75000, 75000, 75000],\n", + " [ 75000, 100000, 75000, ..., 75000, 500000, 100000],\n", + " [100000, 75000, 75000, ..., 100000, 100000, 75000]])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sales_target[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "commission_percentages = np.take(\n", + " np.array([0.04, 0.03, 0.02]),\n", + " np.digitize(pct_to_target, bins=[.9, .99, 10])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "2 26992625\n", + "1 15075317\n", + "0 7932058\n", + "dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(np.digitize(pct_to_target, bins=[.9, .99, 10]).flatten()).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.02 26992625\n", + "0.03 15075317\n", + "0.04 7932058\n", + "dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# frequencies\n", + "pd.DataFrame(commission_percentages.flatten()).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.04, 0.02, 0.02, ..., 0.04, 0.03, 0.04],\n", + " [0.02, 0.02, 0.02, ..., 0.03, 0.02, 0.03],\n", + " [0.02, 0.03, 0.02, ..., 0.03, 0.02, 0.02],\n", + " ...,\n", + " [0.02, 0.03, 0.03, ..., 0.04, 0.02, 0.03],\n", + " [0.02, 0.03, 0.02, ..., 0.03, 0.02, 0.02],\n", + " [0.02, 0.03, 0.03, ..., 0.02, 0.03, 0.02]])" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commission_percentages[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "total_commissions = (commission_percentages * sales_target).sum(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "96546.42131435724" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_commissions.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_Commissions
02838250.0
12786750.0
22795500.0
33054750.0
42831750.0
\n", + "
" + ], + "text/plain": [ + " Total_Commissions\n", + "0 2838250.0\n", + "1 2786750.0\n", + "2 2795500.0\n", + "3 3054750.0\n", + "4 2831750.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Show how to create the dataframe\n", + "df = pd.DataFrame(data={'Total_Commissions': total_commissions})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAERCAYAAACHA/vpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAueElEQVR4nO3de1hU9aLG8e8wDEjMkJFlTyneMdONN7adEq9pWGreUUhKLdvaUbemblDxlve87LO1sDTr9GiCpGbu7OZtaxZ5klKLUIssRU3xspNBmUFY548OcyQVlikw6vt5np6HWfObtd65NK/rNstiGIaBiIiICT4VHUBERG4cKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaUq4KCgp488036dmzJ926dePxxx9n7ty5uN3ucs/SrVs3zp49e9WP27x5M9OnTy+DRMXVr1+frl270q1bN5544gl69uzJ22+/7bk/KSmJJUuWlDiPd955p9hjLnbx49u3b88333xzVfkOHz7M8OHDATh+/Dj9+vW7qsfLjcm3ogPIrWXKlCn8+uuvvPXWWzgcDs6dO8eYMWOYMGECc+fOLdcs77333h963COPPMIjjzxyndNc3ltvvUVwcDAAp0+fZsiQIbhcLgYNGkR0dHSpj09LS6NevXqXvc/M40ty9OhRDh48CEDVqlVJTk6+pvnJjcGik/ukvGRlZdGlSxd27NiB3W73TM/Ozuarr74iMjKSnJwcpk6dyr59+7BYLLRq1YoXXngBX19f/vSnPzFw4EA+//xzzp07x7Bhw/joo484cOAAd999N6+++iq33Xab6XH169cnNTWVgoIC4uLiOHPmDABt2rRh5MiRZGdnX3b62rVr+fjjj3nttdf45ZdfmDJlCkeOHMEwDLp3786zzz5LVlYWAwYMoE2bNuzZs4ezZ88yduxYOnbsSGZmJhMmTMDtdmMYBr179+bJJ5+85PUqyldUGgBfffUVI0aM4NNPP+Xll1/mzJkzTJo0iZUrV5KcnIzNZsPf358XX3yRgwcPMmHCBPz9/RkyZAinT59m9+7dnDhxgvr161OjRg3P49u3b8+DDz7Ivn37cLvdDBw4kN69e7Nz506mTZvG+++/D+C5/d5779GpUyeOHz/On//8Z6ZOnUrXrl35+uuvyc/PZ/bs2aSmpmK1WgkLC2PcuHHY7Xbat29Pjx49SE1N5dixY3Tr1o2RI0eW4adOrjdtnpJyk56eTt26dYsVBsBdd91FZGQkANOnT6dy5cr885//ZM2aNezfv5833ngDALfbTZUqVVi9ejXdu3cnISGBCRMm8MEHH+B0Otm8efNVjSuSkpJCtWrVePfdd3n77bf5
+eefycnJueL0i40ZM4YHH3yQf/7znyQlJbF+/Xo2bNgA/Lb5JiIigtWrVzN69GhmzpwJwLJly2jfvj1r165lyZIl7Nq1i8LCQlOv4f333092dranyOC3TX4zZ87k9ddfZ82aNURFRZGWlkbHjh1p3749AwYM8JTSkSNHePfdd5k3b94l8/b39+fdd9/ljTfeYMGCBXz//fdXzGG1Wpk+fTohISEsW7as2H2LFy/mxIkTvPfee7z33nsUFhby0ksvee4/d+6cp+TeeOMNDh8+bOq5i3dQaUi58fHxKfXLcfv27fTv3x+LxYKfnx/9+vVj+/btnvuLyiUkJITQ0FCqVq2Kj48P1apV49dff73qcQCtWrXik08+YfDgwaxatYrRo0fjcDiuOL3IuXPn+OqrrzxfyA6Hg549e3ry2mw22rRpA8ADDzzAv//9bwA6duzI66+/zrBhw/jkk09ISEjAx8fc/4oWiwX47Qu+iNVqpVOnTvTr148XX3yRoKAgevfufdnHN2nSBF/fy2+VLtonUbVqVVq2bElqaqqpTL+3fft2+vXrh81mw8fHh9jYWD799FPP/UWb9qpWrcqdd955yfsh3k2lIeUmLCyMH3/8EafTWWz68ePHee6558jLy6OwsNDzxQhQWFjIhQsXPLdtNttl//49s+OKcm3evJm+ffty5MgR+vTpw7fffnvF6Rdn+/3W3YvzFn1pAsWeU7t27fj444957LHHyMjIoGvXrvzyyy8lZizyzTffUK1aNQIDA4tNnzdvHq+++iohISEsWbKEF1544bKPv+22264474uLq7CwEF9fXywWS7HnmJ+fX2rGy72HFz/u4sL7/fzF+6k0pNxUrVqVrl27Mn78eE9xOJ1OpkyZQuXKlalUqRIRERGsWLECwzBwu92kpKTw8MMPl2muefPmkZiYSIcOHZgwYQJ169bl+++/v+L0Ina7ncaNG3uOTsrJyWHdunWl5h09ejQffPABnTt3ZvLkydjtdg4dOlRqzuPHjzNv3jwGDRpUbPrp06dp06YNlStXZsCAAYwcOdJzJJTVai1WuiV59913gd92cKempvLQQw8RHBzM0aNHOXXqFIZheDa9Fc37ciXSqlUrkpKSyM/Pp7CwkLfffpuWLVuayiDeT0dPSbmaPHkyiYmJ9OvXD6vVitvtpkOHDp5DNxMSEpg+fTpdu3YlPz+fVq1aMWTIkDLN9PTTTxMfH0+XLl3w8/Ojfv36dO7cmV9//fWy04t2CsNvhfPiiy+ydu1a3G43Xbt2pWfPnhw5cuSKy3v++eeZMGECq1atwmq10qFDB/785z9fMZuPjw9WqxWAXr16XbLTPDg4mKFDhzJgwAAqVark2d8A0Lp1a2bPnm3qdXC5XPTo0YP8/HwSEhKoVasW8Ntmq169enHXXXfRtm1bTyHVrVsXf39/evfuzd///nfPfIYOHcqcOXPo3r07Fy5cICwsjIkTJ5rKIN5PR0+JiIhp2jwlIiKmqTRERMQ0lYaIiJim0hAREdNu6qOndu/eXeyY8LLicrnKZTlXyxtzeWMmUK6r5Y25vDET3Ji5XC4XTZo0uex9N3Vp+Pv706BBgzJfTkZGRrks52p5Yy5vzATKdbW8MZc3ZoIbM1dGRsYVH6fNUyIiYppKQ0RETFNpiIiIaTf1Pg0R+WPy8/PJysoiLy/viveXtN27InhjJvDuXAcPHqRatWql/qjnxVQaInKJrKwsHA4HNWvWLPaLtUXOnz9PQEBABSS7Mm/MBN6b69y5c5w7d46srCzP74yZoc1TInKJvLw87rzzzssWhtwcLBYLd9555xXXJq9EpSEil6XCuPn9kfdYpSEipcrLLyh2+1o3t/x+fnLj0D4NESlVJZuVmvEbSh9o0k+zO1+3eUn5UmnILS0vv4BKNmuFnLFbtGy51OzZs0lPTyc7O5u8vDyqV6/OHXfcwcKFC4uN279/P2fPnr3iRax27txJcnJysYtE/d7333/P3Llz
OX/+POfOnaNNmzYMHz68TDfPjRo1ijlz5uDn52dq/Pbt2zl27Bh9+/Yts0xmqTTklna9/wV9NfSv7SuLj48HYO3atfz444+MGTPmsuM++eQTqlSpcsXSKM3Zs2d54YUXWLRoETVr1qSgoIC//vWvJCcnEx0d/Yfzl6akEruc1q1bl1GSq6fSEBGvl5+fz/jx4zl8+DAFBQUMHDiQ5s2b8+6772Kz2WjYsCE//fQT77zzjucx//jHP0qd7+bNm3nwwQepWbMm8Nt1z+fMmeM5b2H27NmkpaUB0KVLF8+lgX19fTl69Chut5vHH3+crVu3cuzYMRITEzl27BhLlizBZrPxyy+/0KtXL9LS0ti3bx9PPfUUMTExtG/fng8//JBt27axdOlSfH19ue+++3jppZf4+uuvmTNnDr6+vgQFBTFv3jw++eQTT3m+8cYbbNiwAV9fX8LDwxk7diyLFi0iKyuLU6dOcfToUcaNG0erVq34+9//zhdffEFhYSGdO3dmwIAB1/xeqDRExOutWrWKO+64g7lz5+J0OunZsyfJycn06NGDKlWqEBYWxrZt21iyZAkBAQFMmjSJHTt2ULVq1RLne+LECapXr15sWmBgIABbt24lKyuLlJQULly4QExMDP/xH/8BwH333cf06dOZNGkSWVlZLF26lIULF7JlyxYaNGjAL7/8wrp160hPT2fEiBFs2rSJ48ePM2zYMGJiYjzLev/99xkwYACdO3dm3bp1OJ1ONm3aRMeOHXnmmWfYsmULZ8+e9Yzfv38/H374IcnJyfj6+jJ8+HC2bt0KgJ+fH6+//jqfffYZb7zxBq1atWLdunWsWLGCqlWrsnbt2uvyXujoKRHxepmZmZ5NUHa7nTp16nD48OFiY4KDg4mLi2PcuHHs37+fCxculDrfe++9l19++aXYtMOHD/Pll1+SmZlJeHg4FosFm81G48aNyczMBOCBBx4AICgoiLp163r+drvdANSrVw+bzYbD4aBatWr4+flx++2343K5ii1r3LhxfPnll/Tv35+vvvoKHx8fhgwZwunTp3n66af56KOP8PX9/3/b//jjjzRu3BibzYbFYiE8PJzvv/8ewLNf7p577vHkWLBgAQsWLOCZZ54pVj7XQmsaIlKqvPyC67oP5moPAqhTpw67du2iY8eOOJ1ODhw4QLVq1bBYLBQWFpKTk8PixYvZtm0bAAMHDsQwjFLn265dO1577TWio6MJCQkhPz+f2bNn8/DDD1OnTh3Wrl3LgAEDyM/P5+uvv6ZHjx5A6ec3mN2JvmrVKoYPH86dd97JpEmT2LhxI7m5ufTo0YO4uDhee+01UlJSuPfeewGoXbs2b775JhcuXMBqtfLll1/SvXt39u3bd8ky3W43H330EQsWLMAwDDp37kznzp257777TGW7EpWGiJTq91/w1/rTGFd71FhUVBQTJ04kOjoal8vFsGHDuPPOO2nUqBEvvfQSderUoUmTJvTo0YPbbruNoKAgTpw4QbVq1Uqcr91uZ/bs2SQkJGAYBrm5ubRr146YmBgsFgv/8z//Q9++fcnPz6dTp040bNjwDz/nywkLC2PgwIFUrlyZwMBA2rZty6FDh4iPj+e2227DZrPx4osv8uWXXwJQv359HnvsMaKjoyksLKR58+Z06NCBffv2XTLvorWbbt26cfvtt9OyZUtP+VwLi2Gmjm9Q5XXxkxvxIisVxRszefPRUxX1epW2XG/8PSVvzATen+ty73VJ77/WNETkpjdlyhTP/oiLLV26lEqVKlVAohuXSkNELsswjJvm96emTJlS0RG80h/Z0KSjp0TkEpUqVeLUqVN/6EtFbgyGYXDq1KmrXtPSmoaIXKJatWpkZWWRnZ192fvz8/Ov6sI95cEbM4F35yo6JPhqlElpFBQUkJCQwMGDB7FarcyaNQvDMIiPj8disVCvXj0mT56Mj48PKSkpnhNVhg4dSrt27cjLy2Ps2LGcOnWKwMBA5syZQ3Bw
MLt372bGjBlYrVYiIiIYNmxYWcQXueXZbLYSL8zjjQc0eGMm8O5cV3PxpSJlsnmq6AzF5ORkRowYwaxZs5g1axYjR45k5cqVGIbB5s2byc7OZvny5SQnJ7Ns2TIWLFiA2+0mKSmJ0NBQVq5cSffu3UlMTARg8uTJzJ8/n6SkJPbs2UN6enpZxBcRkSsokzWNDh060LZtWwCOHj1KlSpV+Ne//kWLFi2A335867PPPsPHx4emTZvi5+eHn58fISEh7Nu3j7S0NJ599lnP2MTERJxOJ263m5CQEAAiIiJITU0t8bhpl8tVLtfmzcvL88prAHtjLm/LVNH/AizttfC216uIN+byxkxw8+Uqs30avr6+xMXFsXHjRhYuXMjWrVs9R2IEBgaSk5OD0+nE4XB4HhMYGIjT6Sw2/eKxdru92Njf/4zA7/n7++s8DS/L5Y2ZKlJpr4W3vl7emMsbM8GNmaukMinTo6fmzJnDxx9/zMSJE4v95kpubi5BQUHY7XZyc3OLTXc4HMWmlzQ2KCioLOOLiMjvlElprFu3jtdeew347bKQFouFRo0asXPnTuC3C4qEh4cTFhZGWloaLpeLnJwcMjMzCQ0NpVmzZp7fkNm+fTvNmzfHbrdjs9k4dOgQhmGwY8cOwsPDyyK+iIhcQZlsnnr00UcZN24cTz75JBcuXGD8+PHUqVOHiRMnsmDBAmrXrk1kZCRWq5XY2FhiYmIwDINRo0bh7+9PdHQ0cXFxREdHY7PZmD9/PgBTp05lzJgxFBQUEBERQePGjcsivoiIXEGZlMZtt9122QugrFix4pJpUVFRREVFFZsWEBBwyWUdAZo0aUJKSsr1CyoiIldFZ4SLiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERM873eM8zPz2f8+PEcOXIEt9vN0KFDueeeexgyZAg1a9YEIDo6mscff5yUlBSSk5Px9fVl6NChtGvXjry8PMaOHcupU6cIDAxkzpw5BAcHs3v3bmbMmIHVaiUiIoJhw4Zd7+giIlKK614a69evp3LlysydO5czZ87Qo0cP/vM//5OBAwcyaNAgz7js7GyWL1/OmjVrcLlcxMTE0LJlS5KSkggNDWX48OFs2LCBxMREEhISmDx5MosWLaJ69eo899xzpKen07Bhw+sdX0RESnDdS6NTp05ERkZ6blutVr799lsOHjzI5s2bqVGjBuPHj2fv3r00bdoUPz8//Pz8CAkJYd++faSlpfHss88C0Lp1axITE3E6nbjdbkJCQgCIiIggNTW11NJwuVxkZGRc76d4iby8vHJZztXyxlzelqlBgwYVuvzSXgtve72KeGMub8wEN1+u614agYGBADidTkaMGMHIkSNxu9306dOHRo0asXjxYl555RXuv/9+HA5Hscc5nU6cTqdnemBgIDk5OTidTux2e7Gxhw8fLjWLv79/uXwpZGRkVPiXz+V4Yy5vzFSRSnstvPX18sZc3pgJbsxcJZVJmewIP3bsGE899RTdunWja9eudOzYkUaNGgHQsWNHvvvuO+x2O7m5uZ7H5Obm4nA4ik3Pzc0lKCjosmODgoLKIrqIiJTgupfGyZMnGTRoEGPHjqV3794APPPMM+zduxfAs1kpLCyMtLQ0XC4XOTk5ZGZmEhoaSrNmzdi2bRsA27dvp3nz5tjtdmw2G4cOHcIwDHbs2EF4ePj1ji4iIqW47punXn31Vc6ePUtiYiKJiYkAxMfHM3PmTGw2G1WqVGHatGnY7XZiY2OJiYnBMAxGjRqFv78/0dHRxMXFER0djc1mY/78+QBMnTqVMWPGUFBQQEREBI0bN77e0aUC5eUXUMlmregYIlKK614aCQkJJCQk
XDI9OTn5kmlRUVFERUUVmxYQEMDChQsvGdukSRNSUlKuX1DxKpVsVmrGbyj35f40u3O5L1PkRqaT+0RExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg2RCpKXX1DqmLK4TKiZ5YpcyXW/noaImKNriMiNSGsaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER0677yX35+fmMHz+eI0eO4Ha7GTp0KHXr1iU+Ph6LxUK9evWYPHkyPj4+pKSkkJycjK+vL0OHDqVdu3bk5eUxduxYTp06RWBgIHPmzCE4OJjdu3czY8YMrFYrERERDBs27HpHFxGRUpha0zh58qTpGa5fv57KlSuzcuVKli5dyrRp05g1axYjR45k5cqVGIbB5s2byc7OZvny5SQnJ7Ns2TIWLFiA2+0mKSmJ0NBQVq5cSffu3UlMTARg8uTJzJ8/n6SkJPbs2UN6evofe8YiIvKHmVrTGD58OMHBwfTu3Zs2bdrg43PlrunUqRORkZGe21arlfT0dFq0aAFA69at+eyzz/Dx8aFp06b4+fnh5+dHSEgI+/btIy0tjWeffdYzNjExEafTidvtJiQkBICIiAhSU1Np2LBhibldLhcZGRlmnuI1ycvLK5flXC1vzHWlTGXxG0tyZdf6ubiRPlsV7WbLZao0kpKSyMzMZPXq1SxevJiHHnqI3r17U7169UvGBgYGAuB0OhkxYgQjR45kzpw5WCwWz/05OTk4nU4cDkexxzmdzmLTLx5rt9uLjT18+HCpuf39/cvlyygjI8Mrv/S8MZc3ZroVXet74I3vozdmghszV0llYnpH+N1330316tWpVKkSBw4cYMaMGfzjH/+47Nhjx47x1FNP0a1bN7p27VpszSQ3N5egoCDsdju5ubnFpjscjmLTSxobFBRkNrqIiFwnpkrjr3/9K3379uXs2bPMnTuXxYsX8+qrr7Jt27ZLxp48eZJBgwYxduxYevfuDcADDzzAzp07Adi+fTvh4eGEhYWRlpaGy+UiJyeHzMxMQkNDadasmWe+27dvp3nz5tjtdmw2G4cOHcIwDHbs2EF4ePj1eg1ERMQkU5unoqKiaNKkCYGBgZw4ccIzPSkp6ZKxr776KmfPniUxMdGzE3vChAlMnz6dBQsWULt2bSIjI7FarcTGxhITE4NhGIwaNQp/f3+io6OJi4sjOjoam83G/PnzAZg6dSpjxoyhoKCAiIgIGjdufD2ev4iIXAVTpfH111/z6aefEh8fz/Tp02nUqBHPPfcc/v7+l4xNSEggISHhkukrVqy4ZFpUVBRRUVHFpgUEBLBw4cJLxjZp0oSUlBQzcUVEpIyY2jy1ZcsW4uPjAVi4cCFbtmwp01AiIuKdTJWGxWLB7XYDv528ZxhGmYYSERHvZGrzVL9+/ejatSuhoaH8+OOPnvMoRETk1mKqNPr06cMjjzzC4cOHqV69OsHBwWWdS0REvJCp0sjIyGDVqlW4XC7PtFmzZpVZKBER8U6mSiM+Pp7+/ftzzz33lHUeERHxYqZKo0qVKvTp06ess4iIiJczVRr33XcfS5YsoUGDBp7fkIqIiCjTYCIi4n1MlUZ+fj4HDx7k4MGDnmkqDRGRW4+p0pg1axYHDx7k0KFD1K9fn7vvvrusc4mIiBcyVRorVqxg48aN/Prrr/To0YOff/6ZSZMmlXU2ERHxMqbOCN+wYQP//d//jcPh4Omnn2bPnj1lnUtERLyQqdIo+tmQop3gfn5+ZZdIRES8lqnNU126dOHJJ5/k6NGjDB48mA4dOpR1LhER8UKmSqN///489NBDHDhwgFq1anH//feXdS4REfFCpkrj5Zdf9vydmZnJpk2bGDZsWJmFEhER72T6jHD4bd/Gd999R2FhYZmGEhER72T6p9Evpp9GFxG5NZkq
jYvPBM/OzubYsWNlFkhERLyXqdK4+EQ+f39//va3v5VZIBER8V6mSmP58uVlnUNERG4ApkrjiSeeIDc3F39/f8+FmAzDwGKxsHnz5jINKCIi3sNUaTRt2pTu3bvTtGlT9u/fz7Jly5g+fXpZZxMRES9jqjQyMzNp2rQpAPXr1+fYsWP6KRERkVuQqd+ecjgc/Nd//Rdbtmxh7ty53HvvvaU+Zs+ePcTGxgKQnp5Oq1atiI2NJTY2lg8++ACAlJQUevbsSVRUFFu3bgUgLy+P4cOHExMTw+DBgzl9+jQAu3fvpk+fPvTr16/YyYYiIlJ+TK1pzJ8/n5UrV/Lpp59Sv359Ro0aVeL4pUuXsn79egICAgD47rvvGDhwIIMGDfKMyc7OZvny5axZswaXy0VMTAwtW7YkKSmJ0NBQhg8fzoYNG0hMTCQhIYHJkyezaNEiqlevznPPPUd6ejoNGza8hqcuIiJXy9Sahr+/P7fffjt33HEHtWrV4uzZsyWODwkJYdGiRZ7b3377Lf/617948sknGT9+PE6nk71799K0aVP8/PxwOByEhISwb98+0tLSaNWqFQCtW7cmNTUVp9OJ2+0mJCQEi8VCREQEqamp1/C0RUTkjzB9nsbdd9/N559/TqNGjYiLi2Pp0qVXHB8ZGUlWVpbndlhYGH369KFRo0YsXryYV155hfvvvx+Hw+EZExgYiNPpxOl0eqYHBgaSk5OD0+nEbrcXG3v48OFSc7tcLjIyMsw8xWuSl5dXLsu5Wt6Y60qZGjRoUAFpbl3X+rm4kT5bFe1my2WqNA4dOsSMGTPYtWsX7du3Z8mSJVe1kI4dOxIUFOT5e9q0aYSHh5Obm+sZk5ubi8PhwG63e6bn5uYSFBRUbNrF00vj7+9fLl9GGRkZXvml5425vDHTreha3wNvfB+9MRPcmLlKKhNTm6cKCgo4ffo0FosFp9OJj4+ph3k888wz7N27F4DU1FQaNmxIWFgYaWlpuFwucnJyyMzMJDQ0lGbNmrFt2zYAtm/fTvPmzbHb7dhsNg4dOoRhGOzYsYPw8PCryiAiItfO1JrGqFGjiI6OJjs7m759+zJhwoSrWsiUKVOYNm0aNpuNKlWqMG3aNOx2O7GxscTExGAYBqNGjcLf35/o6Gji4uKIjo7GZrMxf/58AKZOncqYMWMoKCggIiKCxo0bX/2zFRGRa2KqNI4dO8bHH3/M6dOnueOOOzyXfS1JtWrVSElJAaBhw4YkJydfMiYqKoqoqKhi0wICAli4cOElY5s0aeKZn4iIVAxT25mKvqyDg4NNFYaIiNycTK1puN1uunfvTq1atTz7M4o2G4mIyK2jxNJITEzk+eefZ8yYMRw/fpyqVauWVy4REfFCJW6e+uKLLwBo0aIF77zzDi1atPD8JyIit54SS8MwjMv+LSIit6YSS+Pind7aAS4iIiXu00hPT6dfv34YhsEPP/zg+dtisVz2EFoREbm5lVga69evL68cIiJyAyixNO67777yyiEiIjeAq/sRKRERuaWpNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpZVYae/bsITY2FoCff/6Z6OhoYmJimDx5MoWFhQCkpKTQs2dPoqKi2Lp1KwB5eXkMHz6cmJgYBg8ezOnTpwHYvXs3ffr0oV+/frz88stlFVtEREpQJqWxdOlSEhIScLlcAMyaNYuRI0eycuVKDMNg8+bNZGdns3z5cpKTk1m2bBkLFizA7XaTlJREaGgoK1eupHv37iQmJgIwefJk5s+fT1JSEnv27CE9Pb0soouISAnKpDRCQkJYtGiR53Z6ejotWrQAoHXr1nz++efs3buXpk2b4ufnh8PhICQkhH379pGWlkarVq08Y1NTU3E6nbjdbkJCQrBYLERERJCamloW0UVEpAQlXu71j4qMjCQrK8tz
2zAMLBYLAIGBgeTk5OB0OnE4HJ4xgYGBOJ3OYtMvHmu324uNPXz4cKk5XC4XGRkZ1+tpXVFeXl65LOdqeWOuK2Vq0KBBBaS5dV3r5+JG+mxVtJstV5mUxu/5+Pz/Ck1ubi5BQUHY7XZyc3OLTXc4HMWmlzQ2KCio1OX6+/uXy5dRRkaGV37peWMub8x0K7rW98Ab30dvzAQ3Zq6SyqRcjp564IEH2LlzJwDbt28nPDycsLAw0tLScLlc5OTkkJmZSWhoKM2aNWPbtm2esc2bN8dut2Oz2Th06BCGYbBjxw7Cw8PLI7qIiFykXNY04uLimDhxIgsWLKB27dpERkZitVqJjY0lJiYGwzAYNWoU/v7+REdHExcXR3R0NDabjfnz5wMwdepUxowZQ0FBARERETRu3Lg8oouIyEXKrDSqVatGSkoKALVq1WLFihWXjImKiiIqKqrYtICAABYuXHjJ2CZNmnjmJ2UjL7+ASjZrmS7DG1fTRcS8clnTkBtDJZuVmvEbKmTZP83uXCHLFZGrozPCRUTENJWGiIiYptIQERHTVBoiImKaSkNERExTaYjcYvLyC655Hn/00OnrsWypWDrkVuQWo0Or5VpoTUNERExTaYiIiGkqDRERMU2lISIipqk0RETENJWGiIiYptIQERHTVBoiImKaSkNERExTaYiIiGkqDRERMU2lISIipqk0RETENJWGiIiYptIQERHTVBoiImJauV6EqXv37jgcDgCqVavGkCFDiI+Px2KxUK9ePSZPnoyPjw8pKSkkJyfj6+vL0KFDadeuHXl5eYwdO5ZTp04RGBjInDlzCA4OLs/4IiK3vHIrDZfLBcDy5cs904YMGcLIkSN58MEHmTRpEps3b6ZJkyYsX76cNWvW4HK5iImJoWXLliQlJREaGsrw4cPZsGEDiYmJJCQklFd8ERGhHDdP7du3j/PnzzNo0CCeeuopdu/eTXp6Oi1atACgdevWfP755+zdu5emTZvi5+eHw+EgJCSEffv2kZaWRqtWrTxjU1NTyyu6iIj8n3Jb06hUqRLPPPMMffr04aeffmLw4MEYhoHFYgEgMDCQnJwcnE6nZxNW0XSn01lsetHY0rhcLjIyMsrmCV0kLy+vXJZzta42V4MGDcowjchvyur/lZvl/8Py8kdzlVtp1KpVixo1amCxWKhVqxaVK1cmPT3dc39ubi5BQUHY7XZyc3OLTXc4HMWmF40tjb+/f7l8EWZkZHjlF6635pJbW1l9Jr31834j5iqpTMpt89Tq1auZPXs2AMePH8fpdNKyZUt27twJwPbt2wkPDycsLIy0tDRcLhc5OTlkZmYSGhpKs2bN2LZtm2ds8+bNyyu6iIj8n3Jb0+jduzfjxo0jOjoai8XCzJkzueOOO5g4cSILFiygdu3aREZGYrVaiY2NJSYmBsMwGDVqFP7+/kRHRxMXF0d0dDQ2m4358+eXV3QREfk/5VYafn5+l/2iX7FixSXToqKiiIqKKjYtICCAhQsXllk+EREpnU7uExER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbS8EJ5+QXXZT7eeF1iubVdr8/25ZT0eS/L5d5qyu3KfWJeJZuVmvEbyn25P83uXO7LlFuLPts3Pq1piIiIaSoNERExTaUhIiKmqTRERMQ0lYaIiJim0hAREdNuqENuCwsLmTJlCvv378fPz4/p06dTo0aNio4lInLLuKHWNDZt2oTb7WbVqlWMHj2a2bNnV3QkEZFbyg1VGmlpabRq1QqAJk2a8O2335bp8syeRaozr0W8W0WdEX4znoluMQzDqOgQZk2YMIFHH32UNm3aANC2bVs2bdqEr+/lt7Lt3r0bf3//8owoInLDc7lcNGnS5LL33VD7NOx2O7m5uZ7bhYWFVywM4IpPWkRE/pgb
avNUs2bN2L59O/DbWkRoaGgFJxIRubXcUJunio6eOnDgAIZhMHPmTOrUqVPRsUREbhk3VGmIiEjFuqE2T4mISMVSaYiIiGkqDRERMe2GOuS2IuXn5zN+/HiOHDmC2+1m6NChPPLII57733zzTVavXk1wcDAAU6dOpXbt2hWea+/evcyePRvDMLjrrruYO3duuZy7UlKu7OxsXnjhBc/YjIwMRo8eTXR0dIVlAli/fj1vvvkmPj4+9OrVi5iYmDLNYzbXunXrWLZsGQ6Hgx49etCnT59yyVVQUEBCQgIHDx7EarUya9YsQkJCPPdv2bKFV155BV9fX3r16kVUVJRX5AI4f/48AwcOZMaMGeV2sExpud5//33eeustrFYroaGhTJkyBR+fsv13e2mZPv74Y5YsWYLFYqFv377mPluGmLJ69Wpj+vTphmEYxunTp402bdoUu3/06NHGN99841W5CgsLjSeeeML46aefDMMwjJSUFCMzM7PCc13sq6++MmJjY40LFy5UeKaWLVsaZ86cMVwul9GhQwfj3//+d5lnKi3XqVOnjLZt2xpnzpwxCgoKjNjYWOPw4cPlkmvjxo1GfHy8YRiG8cUXXxhDhgzx3Od2uz2vkcvlMnr27GmcOHGiwnMZhmHs3bvX6NGjh/Hwww8bP/zwQ7lkKi3X+fPnjUceecQ4d+6cYRiGMWrUKGPTpk0VmunChQtGx44djbNnzxoXLlwwHn30UePUqVOlzlNrGiZ16tSJyMhIz22r1Vrs/vT0dJYsWUJ2djZt27blL3/5S4XnOnjwIJUrV+att97iwIEDtGnTplzWfkrLVcQwDKZNm8a8efMue395Z6pfvz45OTn4+vpiGAYWi6XMM5WWKysri/vvv5/KlSsD8Kc//Yk9e/ZQrVq1Ms/VoUMH2rZtC8DRo0epUqWK577MzExCQkK4/fbbAWjevDm7du3iscceq9BcAG63m1deeYW//e1vZZ7FbC4/Pz+Sk5MJCAgA4MKFC+Wyxl9SJqvVygcffICvry+nTp0CIDAwsNR5qjRMKnoxnU4nI0aMYOTIkcXu79y5MzExMdjtdoYNG8bWrVtp165dheY6c+YMX3/9NRMnTqRGjRoMGTKERo0a8dBDD1VoriJbtmyhXr165VZkpWWqV68evXr1IiAggI4dOxIUFFThuWrUqMEPP/zAyZMnCQwMJDU1lZo1a5ZLLgBfX1/i4uLYuHEjCxcu9Ex3Op04HA7P7cDAQJxOZ4Xngt8KrKJcKZePj4/nC3v58uWcO3eOli1bVmimovs++eQTXnzxRdq0aVPiL2x4lNFa0U3p6NGjRo8ePYx33nmn2PTCwkLj7NmzntsrVqwwXn755QrP9cMPPxhdunTx3H7zzTeNJUuWVHiuIiNGjDB27dpVbnlKypSRkWFERkZ6VtVHjRplfPDBBxWeyzAMY/PmzUa/fv2MkSNHGgkJCcbGjRvLLVeREydOGG3btjVyc3MNw/jt9Xr22Wc998+YMcP48MMPKzzXxfr371+um6cudrlcBQUFxuzZs42//OUvns1UFZ3p4mxjx441Vq9eXep8dPSUSSdPnmTQoEGMHTuW3r17F7vP6XTSpUsXcnNzMQyDnTt30qhRowrPVb16dXJzc/n5558B2LVrF/Xq1avwXEXS09Np1qxZueQpLZPD4aBSpUr4+/tjtVoJDg7m7NmzFZ7rwoUL7Nmzh7fffps5c+bw448/lttrtm7dOl577TUAAgICsFgsnk1nderU4eeff+bf//43brebXbt20bRp0wrPVZFKyzVp0iRcLheJiYmezVQVmcnpdNK/f3/cbjc+Pj4EBASY2jGvM8JNmj59Oh9++GGxTSl9+vTh/Pnz9O3bl3Xr1rF8+XL8/Px46KGHGDFihFfkSk1NZf78+RiGQdOmTUlISPCKXKdPn2bgwIG899575ZLHTKakpCTWrFmDzWYjJCSEadOm4efnV+G5Xn75ZTZt2oS/vz8DBw6k
U6dOZZ4J4Ny5c4wbN46TJ09y4cIFBg8ezPnz5zl37hx9+/b1HD1lGAa9evXiySef9IpcRWJjY5kyZUq5HT1VUq5GjRrRq1cvwsPDPfvKnnrqKTp27Fhhmfr27cuqVatYvXo1vr6+1K9fn4kTJ5ZawCoNERExTZunRETENJWGiIiYptIQERHTVBoiImKaSkNE5Ca0Z88eYmNjSxyzdu1a+vTpQ8+ePXnllVdMzVdnhIuI3GSWLl3K+vXrSzwf5NChQyQlJXlOFVi4cCH5+fnYbLYS5601DRGRm0xISAiLFi3y3N6/fz+xsbHExsYyfPhwcnJy+Pzzz2nUqBFxcXH079+fZs2alVoYoDUNEZGbTmRkJFlZWZ7bEydOZObMmdStW5d33nmH119/nUqVKrFr1y6SkpJwuVxER0ezevXqUn9zTaUhInKTy8zMZOrUqcBv12+pVasWYWFhtGjRArvdjt1up06dOvz000+EhYWVOC+VhojITa5WrVrMmTOHe++9l7S0NLKzs6lVqxYrV67E5XJRUFDg+bn70qg0RERuclOmTCEuLo6CggIAZsyYQa1atejVqxfR0dEYhsHzzz/vuW5LSfTbUyIiYpqOnhIREdNUGiIiYppKQ0RETFNpiIiIaSoNERExTaUhIiKmqTRERMS0/wVf3ZYTm53/1wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot(kind='hist', title='Commissions Distribution')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_Commissions
count1.000000e+05
mean2.831683e+06
std9.654690e+04
min2.472750e+06
25%2.765750e+06
50%2.830250e+06
75%2.896500e+06
max3.278500e+06
\n", + "
" + ], + "text/plain": [ + " Total_Commissions\n", + "count 1.000000e+05\n", + "mean 2.831683e+06\n", + "std 9.654690e+04\n", + "min 2.472750e+06\n", + "25% 2.765750e+06\n", + "50% 2.830250e+06\n", + "75% 2.896500e+06\n", + "max 3.278500e+06" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/Example4.xlsx b/data/Example4.xlsx new file mode 100644 index 0000000..307ffc3 Binary files /dev/null and b/data/Example4.xlsx differ diff --git a/data/cereal_data.csv b/data/cereal_data.csv new file mode 100644 index 0000000..6a907de --- /dev/null +++ b/data/cereal_data.csv @@ -0,0 +1,78 @@ +name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,cereal +100% Bran,Nabisco,Cold,70,4,1,130,10.0,5.0,6,280,25,Top,1.0,0.33,68.4,1 +100% Natural Bran,Quaker Oats,Cold,120,3,5,15,2.0,8.0,8,135,0,Top,1.0,1.0,33.98,1 +All-Bran,Kellogs,Cold,70,4,1,260,9.0,7.0,5,320,25,Top,1.0,0.33,59.43,1 +All-Bran with Extra Fiber,Kellogs,Cold,50,4,0,140,14.0,8.0,0,330,25,Top,1.0,0.5,93.7,1 +Almond Delight,Ralston Purina,Cold,110,2,2,200,1.0,14.0,8,-1,25,Top,1.0,0.75,34.38,1 +Apple Cinnamon Cheerios,General Mills,Cold,110,2,2,180,1.5,10.5,10,70,25,Bottom,1.0,0.75,29.51,1 +Apple Jacks,Kellogs,Cold,110,2,0,125,1.0,11.0,14,30,25,Middle,1.0,1.0,33.17,1 +Basic 4,General Mills,Cold,130,3,2,210,2.0,18.0,8,100,25,Top,1.33,0.75,37.04,1 +Bran 
Chex,Ralston Purina,Cold,90,2,1,200,4.0,15.0,6,125,25,Bottom,1.0,0.67,49.12,1 +Bran Flakes,Post,Cold,90,3,0,210,5.0,13.0,5,190,25,Top,1.0,0.67,53.31,1 +Cap'n'Crunch,Quaker Oats,Cold,120,1,2,220,0.0,12.0,12,35,25,Middle,1.0,0.75,18.04,1 +Cheerios,General Mills,Cold,110,6,2,290,2.0,17.0,1,105,25,Bottom,1.0,1.25,50.76,1 +Cinnamon Toast Crunch,General Mills,Cold,120,1,3,210,0.0,13.0,9,45,25,Middle,1.0,0.75,19.82,1 +Clusters,General Mills,Cold,110,3,2,140,2.0,13.0,7,105,25,Top,1.0,0.5,40.4,1 +Cocoa Puffs,General Mills,Cold,110,1,1,180,0.0,12.0,13,55,25,Middle,1.0,1.0,22.74,1 +Corn Chex,Ralston Purina,Cold,110,2,0,280,0.0,22.0,3,25,25,Bottom,1.0,1.0,41.45,1 +Corn Flakes,Kellogs,Cold,100,2,0,290,1.0,21.0,2,35,25,Bottom,1.0,1.0,45.86,1 +Corn Pops,Kellogs,Cold,110,1,0,90,1.0,13.0,12,20,25,Middle,1.0,1.0,35.78,1 +Count Chocula,General Mills,Cold,110,1,1,180,0.0,12.0,13,65,25,Middle,1.0,1.0,22.4,1 +Cracklin' Oat Bran,Kellogs,Cold,110,3,3,140,4.0,10.0,7,160,25,Top,1.0,0.5,40.45,1 +Cream of Wheat (Quick),Nabisco,Hot,100,3,0,80,1.0,21.0,0,-1,0,Middle,1.0,1.0,64.53,1 +Crispix,Kellogs,Cold,110,2,0,220,1.0,21.0,3,30,25,Top,1.0,1.0,46.9,1 +Crispy Wheat & Raisins,General Mills,Cold,100,2,1,140,2.0,11.0,10,120,25,Top,1.0,0.75,36.18,1 +Double Chex,Ralston Purina,Cold,100,2,0,190,1.0,18.0,5,80,25,Top,1.0,0.75,44.33,1 +Froot Loops,Kellogs,Cold,110,2,1,125,1.0,11.0,13,30,25,Middle,1.0,1.0,32.21,1 +Frosted Flakes,Kellogs,Cold,110,1,0,200,1.0,14.0,11,25,25,Bottom,1.0,0.75,31.44,1 +Frosted Mini-Wheats,Kellogs,Cold,100,3,0,0,3.0,14.0,7,100,25,Middle,1.0,0.8,58.35,1 +"Fruit & Fibre Dates, Walnuts, and Oats",Post,Cold,120,3,2,160,5.0,12.0,10,200,25,Top,1.25,0.67,40.92,1 +Fruitful Bran,Kellogs,Cold,120,3,0,240,5.0,14.0,12,190,25,Top,1.33,0.67,41.02,1 +Fruity Pebbles,Post,Cold,110,1,1,135,0.0,13.0,12,25,25,Middle,1.0,0.75,28.03,1 +Golden Crisp,Post,Cold,100,2,0,45,0.0,11.0,15,40,25,Bottom,1.0,0.88,35.25,1 +Golden Grahams,General Mills,Cold,110,1,1,280,0.0,15.0,9,45,25,Middle,1.0,0.75,23.8,1 
+Grape Nuts Flakes,Post,Cold,100,3,1,140,3.0,15.0,5,85,25,Top,1.0,0.88,52.08,1 +Grape-Nuts,Post,Cold,110,3,0,170,3.0,17.0,3,90,25,Top,1.0,0.25,53.37,1 +Great Grains Pecan,Post,Cold,120,3,3,75,3.0,13.0,4,100,25,Top,1.0,0.33,45.81,1 +Honey Graham Ohs,Quaker Oats,Cold,120,1,2,220,1.0,12.0,11,45,25,Middle,1.0,1.0,21.87,1 +Honey Nut Cheerios,General Mills,Cold,110,3,1,250,1.5,11.5,10,90,25,Bottom,1.0,0.75,31.07,1 +Honey-comb,Post,Cold,110,1,0,180,0.0,14.0,11,35,25,Bottom,1.0,1.33,28.74,1 +Just Right Crunchy Nuggets,Kellogs,Cold,110,2,1,170,1.0,17.0,6,60,100,Top,1.0,1.0,36.52,1 +Just Right Fruit & Nut,Kellogs,Cold,140,3,1,170,2.0,20.0,9,95,100,Top,1.3,0.75,36.47,1 +Kix,General Mills,Cold,110,2,1,260,0.0,21.0,3,40,25,Middle,1.0,1.5,39.24,1 +Life,Quaker Oats,Cold,100,4,2,150,2.0,12.0,6,95,25,Middle,1.0,0.67,45.33,1 +Lucky Charms,General Mills,Cold,110,2,1,180,0.0,12.0,12,55,25,Middle,1.0,1.0,26.73,1 +Maypo,AM Home Food,Hot,100,4,1,0,0.0,16.0,3,95,25,Middle,1.0,1.0,54.85,1 +"Muesli Raisins, Dates, & Almonds",Ralston Purina,Cold,150,4,3,95,3.0,16.0,11,170,25,Top,1.0,1.0,37.14,1 +"Muesli Raisins, Peaches, & Pecans",Ralston Purina,Cold,150,4,3,150,3.0,16.0,11,170,25,Top,1.0,1.0,34.14,1 +Mueslix Crispy Blend,Kellogs,Cold,160,3,2,150,3.0,17.0,13,160,25,Top,1.5,0.67,30.31,1 +Multi-Grain Cheerios,General Mills,Cold,100,2,1,220,2.0,15.0,6,90,25,Bottom,1.0,1.0,40.11,1 +Nut&Honey Crunch,Kellogs,Cold,120,2,1,190,0.0,15.0,9,40,25,Middle,1.0,0.67,29.92,1 +Nutri-Grain Almond-Raisin,Kellogs,Cold,140,3,2,220,3.0,21.0,7,130,25,Top,1.33,0.67,40.69,1 +Nutri-grain Wheat,Kellogs,Cold,90,3,0,170,3.0,18.0,2,90,25,Top,1.0,1.0,59.64,1 +Oatmeal Raisin Crisp,General Mills,Cold,130,3,2,170,1.5,13.5,10,120,25,Top,1.25,0.5,30.45,1 +Post Nat. 
Raisin Bran,Post,Cold,120,3,1,200,6.0,11.0,14,260,25,Top,1.33,0.67,37.84,1 +Product 19,Kellogs,Cold,100,3,0,320,1.0,20.0,3,45,100,Top,1.0,1.0,41.5,1 +Puffed Rice,Quaker Oats,Cold,50,1,0,0,0.0,13.0,0,15,0,Top,0.5,1.0,60.76,1 +Puffed Wheat,Quaker Oats,Cold,50,2,0,0,1.0,10.0,0,50,0,Top,0.5,1.0,63.01,1 +Quaker Oat Squares,Quaker Oats,Cold,100,4,1,135,2.0,14.0,6,110,25,Top,1.0,0.5,49.51,1 +Quaker Oatmeal,Quaker Oats,Hot,100,5,2,0,2.7,-1.0,-1,110,0,Bottom,1.0,0.67,50.83,1 +Raisin Bran,Kellogs,Cold,120,3,1,210,5.0,14.0,12,240,25,Middle,1.33,0.75,39.26,1 +Raisin Nut Bran,General Mills,Cold,100,3,2,140,2.5,10.5,8,140,25,Top,1.0,0.5,39.7,1 +Raisin Squares,Kellogs,Cold,90,2,0,0,2.0,15.0,6,110,25,Top,1.0,0.5,55.33,1 +Rice Chex,Ralston Purina,Cold,110,1,0,240,0.0,23.0,2,30,25,Bottom,1.0,1.13,42.0,1 +Rice Krispies,Kellogs,Cold,110,2,0,290,0.0,22.0,3,35,25,Bottom,1.0,1.0,40.56,1 +Shredded Wheat,Nabisco,Cold,80,2,0,0,3.0,16.0,0,95,0,Bottom,0.83,1.0,68.24,1 +Shredded Wheat 'n'Bran,Nabisco,Cold,90,3,0,0,4.0,19.0,0,140,0,Bottom,1.0,0.67,74.47,1 +Shredded Wheat spoon size,Nabisco,Cold,90,3,0,0,3.0,20.0,0,120,0,Bottom,1.0,0.67,72.8,1 +Smacks,Kellogs,Cold,110,2,1,70,1.0,9.0,15,40,25,Middle,1.0,0.75,31.23,1 +Special K,Kellogs,Cold,110,6,0,230,1.0,16.0,3,55,25,Bottom,1.0,1.0,53.13,1 +Strawberry Fruit Wheats,Nabisco,Cold,90,2,0,15,3.0,15.0,5,90,25,Middle,1.0,1.0,59.36,1 +Total Corn Flakes,General Mills,Cold,110,2,1,200,0.0,21.0,3,35,100,Top,1.0,1.0,38.84,1 +Total Raisin Bran,General Mills,Cold,140,3,1,190,4.0,15.0,14,230,100,Top,1.5,1.0,28.59,1 +Total Whole Grain,General Mills,Cold,100,3,1,200,3.0,16.0,3,110,100,Top,1.0,1.0,46.66,1 +Triples,General Mills,Cold,110,2,1,250,0.0,21.0,3,60,25,Top,1.0,0.75,39.11,1 +Trix,General Mills,Cold,110,1,1,140,0.0,13.0,12,25,25,Middle,1.0,1.0,27.75,1 +Wheat Chex,Ralston Purina,Cold,100,3,1,230,3.0,17.0,3,115,25,Bottom,1.0,0.67,49.79,1 +Wheaties,General Mills,Cold,100,3,1,200,3.0,17.0,3,110,25,Bottom,1.0,1.0,51.59,1 +Wheaties Honey Gold,General 
Mills,Cold,110,2,1,200,1.0,16.0,8,60,25,Bottom,1.0,0.75,36.19,1 diff --git a/data/sales_9_2022.xlsx b/data/sales_9_2022.xlsx new file mode 100644 index 0000000..9b6845c Binary files /dev/null and b/data/sales_9_2022.xlsx differ diff --git a/data/shipping_tables.xlsx b/data/shipping_tables.xlsx new file mode 100644 index 0000000..aad8f1b Binary files /dev/null and b/data/shipping_tables.xlsx differ diff --git a/notebooks/Category-Encoding-Article.ipynb b/notebooks/Category-Encoding-Article.ipynb index e54c5c8..25a601c 100644 --- a/notebooks/Category-Encoding-Article.ipynb +++ b/notebooks/Category-Encoding-Article.ipynb @@ -12,21 +12,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Import the pandas, scikit-learn, numpy and [category_encoder](http://contrib.scikit-learn.org/categorical-encoding/) libraries." + "Import the pandas, scikit-learn, numpy and [category_encoder](https://github.com/scikit-learn-contrib/category_encoders) libraries." ] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", - "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n", + "from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.model_selection import cross_val_score\n", "\n", "import category_encoders as ce" ] @@ -41,9 +43,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "headers = [\"symboling\", \"normalized_losses\", \"make\", \"fuel_type\", \"aspiration\", \"num_doors\", \"body_style\",\n", @@ -62,26 +62,35 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "df = 
pd.read_csv(\"http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", + "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", " header=None, names=headers, na_values=\"?\" )" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -243,12 +252,12 @@ "3 2 164.0 audi gas std four \n", "4 2 164.0 audi gas std four \n", "\n", - " body_style drive_wheels engine_location wheel_base ... engine_size \\\n", - "0 convertible rwd front 88.6 ... 130 \n", - "1 convertible rwd front 88.6 ... 130 \n", - "2 hatchback rwd front 94.5 ... 152 \n", - "3 sedan fwd front 99.8 ... 109 \n", - "4 sedan 4wd front 99.4 ... 136 \n", + " body_style drive_wheels engine_location wheel_base ... engine_size \\\n", + "0 convertible rwd front 88.6 ... 130 \n", + "1 convertible rwd front 88.6 ... 130 \n", + "2 hatchback rwd front 94.5 ... 152 \n", + "3 sedan fwd front 99.8 ... 109 \n", + "4 sedan 4wd front 99.4 ... 136 \n", "\n", " fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n", "0 mpfi 3.47 2.68 9.0 111.0 5000.0 21 \n", @@ -286,9 +295,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -341,9 +348,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "obj_df = df.select_dtypes(include=['object']).copy()" @@ -352,14 +357,25 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -481,14 +497,25 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -565,9 +592,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -596,9 +621,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "obj_df = obj_df.fillna({\"num_doors\": \"four\"})" @@ -607,14 +630,25 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -668,9 +702,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -680,8 +712,8 @@ "five 11\n", "eight 5\n", "two 4\n", - "twelve 1\n", "three 1\n", + "twelve 1\n", "Name: num_cylinders, dtype: int64" ] }, @@ -697,9 +729,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "cleanup_nums = {\"num_doors\": {\"four\": 4, \"two\": 2},\n", @@ -710,25 +740,34 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "obj_df.replace(cleanup_nums, inplace=True)" + "obj_df = obj_df.replace(cleanup_nums)" ] }, { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -850,9 +889,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -889,9 +926,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -916,9 +951,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "obj_df[\"body_style\"] = obj_df[\"body_style\"].astype('category')" @@ -927,9 +960,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -966,9 +997,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "obj_df[\"body_style_cat\"] = obj_df[\"body_style\"].cat.codes" @@ -977,14 +1006,25 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1105,9 +1145,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1145,14 +1183,25 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1185,9 +1234,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1201,9 +1250,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1217,9 +1266,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1233,9 +1282,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1249,9 +1298,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
4mpfi00.00.01.0001
14mpfi00.00.01.0001
26mpfi20.00.01.0001
34mpfi30.01.00.0010
45mpfi31.00.00.0100
\n", @@ -1266,18 +1315,18 @@ "4 audi gas std 4 sedan front \n", "\n", " engine_type num_cylinders fuel_system body_style_cat drive_wheels_4wd \\\n", - "0 dohc 4 mpfi 0 0.0 \n", - "1 dohc 4 mpfi 0 0.0 \n", - "2 ohcv 6 mpfi 2 0.0 \n", - "3 ohc 4 mpfi 3 0.0 \n", - "4 ohc 5 mpfi 3 1.0 \n", + "0 dohc 4 mpfi 0 0 \n", + "1 dohc 4 mpfi 0 0 \n", + "2 ohcv 6 mpfi 2 0 \n", + "3 ohc 4 mpfi 3 0 \n", + "4 ohc 5 mpfi 3 1 \n", "\n", " drive_wheels_fwd drive_wheels_rwd \n", - "0 0.0 1.0 \n", - "1 0.0 1.0 \n", - "2 0.0 1.0 \n", - "3 1.0 0.0 \n", - "4 0.0 0.0 " + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 1 0 \n", + "4 0 0 " ] }, "execution_count": 23, @@ -1299,14 +1348,25 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1342,14 +1402,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1362,14 +1422,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1382,14 +1442,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1402,14 +1462,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1422,14 +1482,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
4mpfi01.00.00.00.00.00.00.01.010000001
14mpfi01.00.00.00.00.00.00.01.010000001
26mpfi20.00.01.00.00.00.00.01.000100001
34mpfi30.00.00.01.00.00.01.00.000010010
45mpfi30.00.00.01.00.01.00.00.000010100
\n", @@ -1444,18 +1504,18 @@ "4 audi gas std 4 front ohc \n", "\n", " num_cylinders fuel_system body_style_cat body_convertible body_hardtop \\\n", - "0 4 mpfi 0 1.0 0.0 \n", - "1 4 mpfi 0 1.0 0.0 \n", - "2 6 mpfi 2 0.0 0.0 \n", - "3 4 mpfi 3 0.0 0.0 \n", - "4 5 mpfi 3 0.0 0.0 \n", + "0 4 mpfi 0 1 0 \n", + "1 4 mpfi 0 1 0 \n", + "2 6 mpfi 2 0 0 \n", + "3 4 mpfi 3 0 0 \n", + "4 5 mpfi 3 0 0 \n", "\n", " body_hatchback body_sedan body_wagon drive_4wd drive_fwd drive_rwd \n", - "0 0.0 0.0 0.0 0.0 0.0 1.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 1.0 \n", - "2 1.0 0.0 0.0 0.0 0.0 1.0 \n", - "3 0.0 1.0 0.0 0.0 1.0 0.0 \n", - "4 0.0 1.0 0.0 1.0 0.0 0.0 " + "0 0 0 0 0 0 1 \n", + "1 0 0 0 0 0 1 \n", + "2 1 0 0 0 0 1 \n", + "3 0 1 0 0 1 0 \n", + "4 0 1 0 1 0 0 " ] }, "execution_count": 24, @@ -1478,9 +1538,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1488,8 +1546,8 @@ "ohc 148\n", "ohcf 15\n", "ohcv 13\n", - "dohc 12\n", "l 12\n", + "dohc 12\n", "rotor 4\n", "dohcv 1\n", "Name: engine_type, dtype: int64" @@ -1514,9 +1572,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "obj_df[\"OHC_Code\"] = np.where(obj_df[\"engine_type\"].str.contains(\"ohc\"), 1, 0)" @@ -1525,14 +1581,25 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1717,36 +1784,43 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "lb_make = LabelEncoder()" + "ord_enc = OrdinalEncoder()" ] }, { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "obj_df[\"make_code\"] = lb_make.fit_transform(obj_df[\"make\"])" + "obj_df[\"make_code\"] = ord_enc.fit_transform(obj_df[[\"make\"]])" ] }, { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1759,57 +1833,57 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
0alfa-romero00.0
1alfa-romero00.0
2alfa-romero00.0
3audi11.0
4audi11.0
5audi11.0
6audi11.0
7audi11.0
8audi11.0
9audi11.0
10bmw22.0
\n", @@ -1817,17 +1891,17 @@ ], "text/plain": [ " make make_code\n", - "0 alfa-romero 0\n", - "1 alfa-romero 0\n", - "2 alfa-romero 0\n", - "3 audi 1\n", - "4 audi 1\n", - "5 audi 1\n", - "6 audi 1\n", - "7 audi 1\n", - "8 audi 1\n", - "9 audi 1\n", - "10 bmw 2" + "0 alfa-romero 0.0\n", + "1 alfa-romero 0.0\n", + "2 alfa-romero 0.0\n", + "3 audi 1.0\n", + "4 audi 1.0\n", + "5 audi 1.0\n", + "6 audi 1.0\n", + "7 audi 1.0\n", + "8 audi 1.0\n", + "9 audi 1.0\n", + "10 bmw 2.0" ] }, "execution_count": 30, @@ -1849,13 +1923,11 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "lb_style = LabelBinarizer()\n", - "lb_results = lb_style.fit_transform(obj_df[\"body_style\"])" + "oe_style = OneHotEncoder()\n", + "oe_results = oe_style.fit_transform(obj_df[[\"body_style\"]])" ] }, { @@ -1868,20 +1940,18 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[1, 0, 0, 0, 0],\n", - " [1, 0, 0, 0, 0],\n", - " [0, 0, 1, 0, 0],\n", - " ..., \n", - " [0, 0, 0, 1, 0],\n", - " [0, 0, 0, 1, 0],\n", - " [0, 0, 0, 1, 0]])" + "array([[1., 0., 0., 0., 0.],\n", + " [1., 0., 0., 0., 0.],\n", + " [0., 0., 1., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., 1., 0.],\n", + " [0., 0., 0., 1., 0.],\n", + " [0., 0., 0., 1., 0.]])" ] }, "execution_count": 32, @@ -1890,23 +1960,34 @@ } ], "source": [ - "lb_results" + "oe_results.toarray()" ] }, { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1918,55 +1999,55 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
convertiblehardtop
0100001.00.00.00.00.0
1100001.00.00.00.00.0
2001000.00.01.00.00.0
3000100.00.00.01.00.0
4000100.00.00.01.00.0
\n", "
" ], "text/plain": [ - " convertible hardtop hatchback sedan wagon\n", - "0 1 0 0 0 0\n", - "1 1 0 0 0 0\n", - "2 0 0 1 0 0\n", - "3 0 0 0 1 0\n", - "4 0 0 0 1 0" + " convertible hardtop hatchback sedan wagon\n", + "0 1.0 0.0 0.0 0.0 0.0\n", + "1 1.0 0.0 0.0 0.0 0.0\n", + "2 0.0 0.0 1.0 0.0 0.0\n", + "3 0.0 0.0 0.0 1.0 0.0\n", + "4 0.0 0.0 0.0 1.0 0.0" ] }, "execution_count": 33, @@ -1975,7 +2056,7 @@ } ], "source": [ - "pd.DataFrame(lb_results, columns=lb_style.classes_).head()" + "pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).head()" ] }, { @@ -1983,15 +2064,13 @@ "metadata": {}, "source": [ "### Advanced Encoding\n", - "[category_encoder](http://contrib.scikit-learn.org/categorical-encoding/) library" + "[category_encoder](https://github.com/scikit-learn-contrib/category_encoders) library" ] }, { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Get a new clean dataframe\n", @@ -2001,14 +2080,25 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -2130,15 +2220,42 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n" + ] + }, { "data": { "text/plain": [ - "BackwardDifferenceEncoder(cols=['engine_type'], drop_invariant=False,\n", - " return_df=True, verbose=0)" + "BackwardDifferenceEncoder(cols=['engine_type'],\n", + " mapping=[{'col': 'engine_type',\n", + " 'mapping': engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", + " 1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", + " 2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n", + " 3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", + " 4 0.142857 0.285714 0.428571 -0.428571 -0.285714 \n", + " 5 0.142857 0.285714 0.428571 0.571429 -0.285714 \n", + " 6 0.142857 0.285714 0.428571 0.571429 0.714286 \n", + " 7 0.142857 0.285714 0.428571 0.571429 0.714286 \n", + "-1 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "-2 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "\n", + " engine_type_5 \n", + " 1 -0.142857 \n", + " 2 -0.142857 \n", + " 3 -0.142857 \n", + " 4 -0.142857 \n", + " 5 -0.142857 \n", + " 6 -0.142857 \n", + " 7 0.857143 \n", + "-1 0.000000 \n", + "-2 0.000000 }])" ] }, "execution_count": 36, @@ -2147,70 +2264,85 @@ } ], "source": [ - "encoder = ce.backward_difference.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n", + "# Specify the columns to encode then fit and transform\n", + "encoder = ce.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n", "encoder.fit(obj_df, verbose=1)" ] }, { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false - }, + 
"metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2218,9 +2350,8 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2231,19 +2362,19 @@ "" ], "text/plain": [ - " col_engine_type_0 col_engine_type_1 col_engine_type_2 col_engine_type_3 \\\n", - "0 1.0 0.142857 0.285714 0.428571 \n", - "1 1.0 0.142857 0.285714 0.428571 \n", - "2 1.0 0.142857 0.285714 0.428571 \n", - "3 1.0 0.142857 -0.714286 -0.571429 \n", - "4 1.0 0.142857 -0.714286 -0.571429 \n", + " engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", + "0 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", + "1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", + "2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n", + "3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", + "4 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", "\n", - " col_engine_type_4 col_engine_type_5 col_engine_type_6 \n", - "0 0.571429 0.714286 -0.142857 \n", - "1 0.571429 0.714286 -0.142857 \n", - "2 0.571429 0.714286 0.857143 \n", - "3 -0.428571 -0.285714 -0.142857 \n", - "4 -0.428571 -0.285714 -0.142857 " + " engine_type_5 \n", + "0 -0.142857 \n", + "1 -0.142857 \n", + "2 -0.142857 \n", + "3 -0.142857 \n", + "4 -0.142857 " ] }, "execution_count": 37, @@ -2252,7 +2383,7 @@ } ], "source": [ - "encoder.transform(obj_df).iloc[:,0:7].head()" + "encoder.fit_transform(obj_df).iloc[:,8:14].head()" ] }, { @@ -2265,137 
+2396,217 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "PolynomialEncoder(cols=['engine_type'], drop_invariant=False, return_df=True,\n", - " verbose=0)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n", - "encoder.fit(obj_df, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
col_engine_type_0col_engine_type_1col_engine_type_2col_engine_type_3col_engine_type_4col_engine_type_5col_engine_type_6engine_type_0engine_type_1engine_type_2engine_type_3engine_type_4engine_type_5
01.00.1428570.2857140.4285710.5714290.714286-0.857143-0.714286-0.571429-0.428571-0.285714-0.142857
11.00.1428570.2857140.4285710.5714290.714286-0.857143-0.714286-0.571429-0.428571-0.285714-0.142857
21.00.1428570.2857140.4285710.5714290.7142860.857143-0.714286-0.571429-0.428571-0.285714-0.142857
31.00.142857-0.7142860.285714-0.571429-0.428571-0.285714
41.00.142857-0.7142860.285714-0.571429-0.428571-0.285714
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
col_engine_type_0col_engine_type_1col_engine_type_2col_engine_type_3col_engine_type_4col_engine_type_5col_engine_type_6engine_type_0engine_type_1engine_type_2engine_type_3engine_type_4engine_type_5
01.0-5.669467e-015.455447e-01-4.082483e-01-0.5669470.545545-0.4082480.241747-1.091089e-01-0.1091090.032898
11.0-5.669467e-015.455447e-01-4.082483e-01-0.5669470.545545-0.4082480.241747-1.091089e-01-0.1091090.032898
21.03.779645e-013.970680e-17-4.082483e-01-0.3779640.0000000.408248-0.564076-4.364358e-010.436436-0.197386
31.01.347755e-17-4.364358e-011.528598e-170.4834948.990141e-18-0.657952-0.188982-0.3273270.4082480.080582-0.5455450.493464
41.01.347755e-17-4.364358e-011.528598e-170.4834948.990141e-18-0.657952-0.188982-0.3273270.4082480.080582-0.5455450.493464
\n", "
" ], "text/plain": [ - " col_engine_type_0 col_engine_type_1 col_engine_type_2 col_engine_type_3 \\\n", - "0 1.0 -5.669467e-01 5.455447e-01 -4.082483e-01 \n", - "1 1.0 -5.669467e-01 5.455447e-01 -4.082483e-01 \n", - "2 1.0 3.779645e-01 3.970680e-17 -4.082483e-01 \n", - "3 1.0 1.347755e-17 -4.364358e-01 1.528598e-17 \n", - "4 1.0 1.347755e-17 -4.364358e-01 1.528598e-17 \n", + " engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", + "0 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n", + "1 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n", + "2 -0.377964 0.000000 0.408248 -0.564076 0.436436 \n", + "3 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n", + "4 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n", "\n", - " col_engine_type_4 col_engine_type_5 col_engine_type_6 \n", - "0 0.241747 -1.091089e-01 0.032898 \n", - "1 0.241747 -1.091089e-01 0.032898 \n", - "2 -0.564076 -4.364358e-01 -0.197386 \n", - "3 0.483494 8.990141e-18 -0.657952 \n", - "4 0.483494 8.990141e-18 -0.657952 " + " engine_type_5 \n", + "0 0.032898 \n", + "1 0.032898 \n", + "2 -0.197386 \n", + "3 0.493464 \n", + "4 0.493464 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n", + "encoder.fit_transform(obj_df, verbose=1).iloc[:,8:14].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scikit-learn pipeline\n", + "Show an example of how to incorporate the encoding strategies into a scikit-learn pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# for the purposes of this analysis, only use a small subset of features\n", + "feature_cols = [\n", + " 'fuel_type', 'make', 'aspiration', 'highway_mpg', 'city_mpg',\n", + " 'curb_weight', 'drive_wheels'\n", + "]\n", + "\n", + "# Remove the empty price rows\n", + "df_ml = 
df.dropna(subset=['price'])\n", + "\n", + "X = df_ml[feature_cols]\n", + "y = df_ml['price']" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),\n", + " ['fuel_type', 'make', 'drive_wheels']),\n", + " (OrdinalEncoder(), ['aspiration']),\n", + " remainder='passthrough')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "linreg = LinearRegression()\n", + "pipe = make_pipeline(column_trans, linreg)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-4476.0937653 , -1014.54842052, -4227.68553953, -4936.79899194,\n", + " -1591.8291911 , -3716.06617255, -4293.79197464, -1390.00486495,\n", + " -1600.57946369, -2124.30041954])" ] }, - "execution_count": 39, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "encoder.transform(obj_df).iloc[:,0:7].head()" + "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-2937.17" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the average of the errors after 10 iterations\n", + "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error').mean().round(2)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -2416,9 +2627,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.8.5" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/notebooks/Common-Excel-Part-2.ipynb 
b/notebooks/Common-Excel-Part-2.ipynb index 8d3c24d..e6a5c34 100644 --- a/notebooks/Common-Excel-Part-2.ipynb +++ b/notebooks/Common-Excel-Part-2.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_excel(\"../data/sample-salesv3.xlsx\")" + "df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/sample-salesv3.xlsx?raw=true')" ] }, { @@ -117,18 +117,18 @@ "data": { "text/html": [ "
\n", - "\n", "\n", " \n", @@ -284,18 +284,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -406,18 +406,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -528,18 +528,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -650,18 +650,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -774,18 +774,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -898,18 +898,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1034,18 +1034,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1157,18 +1157,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1279,18 +1279,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1401,18 +1401,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1523,18 +1523,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1638,18 +1638,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1762,18 +1762,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1890,18 +1890,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2017,18 +2017,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2137,18 +2137,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2257,18 +2257,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2377,18 +2377,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2511,18 +2511,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2640,18 +2640,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2826,18 +2826,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2948,18 +2948,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", " \n", @@ -3124,7 +3124,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [default]", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -3138,7 +3138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.8.5" } }, "nbformat": 4, diff --git a/notebooks/case_study_weather/1-dwd_konverter_download.ipynb b/notebooks/case_study_weather/1-dwd_konverter_download.ipynb new file mode 100644 index 0000000..e3659cf --- /dev/null +++ b/notebooks/case_study_weather/1-dwd_konverter_download.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.) Download files from the DWD-API\n", + "Here we download all relevant files from the DWD Server. The DWD Server is http-based, so we scrape the download page for all links that match 'stundenwerte_TU_.\\*_hist.zip' and download them to the folder 'download'. 
\n", + "\n", + "Link to the relevant DWD-page: https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done\n" + ] + } + ], + "source": [ + "import requests\n", + "import re\n", + "from bs4 import BeautifulSoup\n", + "from pathlib import Path\n", + "\n", + "# Set base values\n", + "download_folder = Path.cwd() / 'download'\n", + "base_url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/'\n", + "\n", + "\n", + "# Initiate Session and get the Index-Page\n", + "with requests.Session() as s:\n", + " resp = s.get(base_url)\n", + "\n", + "# Parse the Index-Page for all relevant \n", + "soup = BeautifulSoup(resp.content)\n", + "links = soup.findAll(\"a\", href=re.compile(\"stundenwerte_TU_.*_hist.zip\"))\n", + "\n", + "# For testing, only download 10 files\n", + "file_max = 10\n", + "dl_count = 0\n", + "\n", + "#Download the .zip files to the download_folder\n", + "for link in links:\n", + " zip_response = requests.get(base_url + link['href'], stream=True)\n", + " # Limit the downloads while testing\n", + " dl_count += 1\n", + " if dl_count > file_max:\n", + " break\n", + " with open(Path(download_folder) / link['href'], 'wb') as file:\n", + " for chunk in zip_response.iter_content(chunk_size=128):\n", + " file.write(chunk) \n", + " \n", + "print('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/2-dwd_konverter_extract.ipynb b/notebooks/case_study_weather/2-dwd_konverter_extract.ipynb new file mode 100644 index 0000000..ac8d1c4 --- /dev/null +++ b/notebooks/case_study_weather/2-dwd_konverter_extract.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.) Extract all .zip-archives\n", + "In this next step, we extract a single file from all the downloaded .zip files and save them to the 'import' folder. 
Beware, there is going to be a lot of data (~6 GB of .csv files)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Done'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pathlib import Path\n", + "import glob\n", + "import re\n", + "from zipfile import ZipFile\n", + "\n", + "# Folder definitions\n", + "download_folder = Path.cwd() / 'download'\n", + "import_folder = Path.cwd() / 'import'\n", + "\n", + "# Find all .zip files and generate a list\n", + "unzip_files = glob.glob('download/stundenwerte_TU_*_hist.zip')\n", + "\n", + "# Set the name pattern of the file we need\n", + "regex_name = re.compile('produkt.*')\n", + "\n", + "# Open all files, look for files that match the regex pattern, extract to 'import'\n", + "for file in unzip_files:\n", + " with ZipFile(file, 'r') as zipObj:\n", + " list_of_filenames = zipObj.namelist()\n", + " extract_filename = list(filter(regex_name.match, list_of_filenames))[0]\n", + " zipObj.extract(extract_filename, import_folder)\n", + "\n", + "display('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb b/notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb new file mode 100644 index 0000000..accf54e --- /dev/null +++ b/notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.) Import the .csv files into pandas and concat into a single df\n", + "Now we need to import everything that we have extracted. This operation is going to take some time (approx. 20 mins). If you want to save time, you can just delete a few of the .csv-files in the 'import' folder. The script works just as well with only a few files. \n", + "\n", + "### Process individual files\n", + "Each file is imported into a df, stripped of unnecessary columns and filtered by date. Then we set an index of MESS_DATUM and STATIONS_ID and concatenate it into the main_df. Because the loop takes a long time, we output some status messages, to ensure the process is still running. \n", + "### Process the concatenated main_df\n", + "Then we display some info about the main_df so we can ensure that there are no errors, mainly to ensure all data-types are recognized correctly. Also, we drop duplicate entries, in case some of the .csv files were copied.\n", + "### Unstack and export\n", + "For the final step, we unstack the main_df and save it to a .csv and a .pkl file for the next step. Also, we display some output to get a grasp of what is going on. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Finished file: import/produkt_tu_stunde_20041101_20191231_00078.txt'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'This is file 10'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Shape of the main_df is: (771356, 1)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "float 771356\n", + "Name: TT_TU, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Shape of the main_df is: (113952, 9)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
MESS_DATUM
2007-01-01 00:00:0011.4NaNNaNNaN11.09.4NaN9.7NaN
2007-01-01 01:00:0012.0NaNNaNNaN11.49.6NaN10.4NaN
2007-01-01 02:00:0012.3NaNNaNNaN9.410.0NaN9.9NaN
2007-01-01 03:00:0011.5NaNNaNNaN9.39.7NaN9.5NaN
2007-01-01 04:00:009.6NaNNaNNaN8.610.2NaN8.9NaN
\n", + "
" + ], + "text/plain": [ + " TT_TU \n", + "STATIONS_ID 3 44 71 73 78 91 96 102 125\n", + "MESS_DATUM \n", + "2007-01-01 00:00:00 11.4 NaN NaN NaN 11.0 9.4 NaN 9.7 NaN\n", + "2007-01-01 01:00:00 12.0 NaN NaN NaN 11.4 9.6 NaN 10.4 NaN\n", + "2007-01-01 02:00:00 12.3 NaN NaN NaN 9.4 10.0 NaN 9.9 NaN\n", + "2007-01-01 03:00:00 11.5 NaN NaN NaN 9.3 9.7 NaN 9.5 NaN\n", + "2007-01-01 04:00:00 9.6 NaN NaN NaN 8.6 10.2 NaN 8.9 NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
count37224.000000111003.00000088391.000000111471.000000113950.000000113950.0000006399.000000106379.00000082589.000000
mean10.1039229.9332138.3997647.5014869.8722689.19986912.73025510.1499911.045942
std7.20000114.4459738.77976647.5371127.2812158.40071323.18955510.72803086.520406
min-13.600000-999.000000-999.000000-999.000000-16.200000-999.000000-999.000000-999.000000-999.000000
25%5.0000004.9000002.2000002.8000004.7000003.4000007.2500005.7000001.800000
50%9.90000010.0000008.3000009.3000009.7000008.90000013.20000010.2000008.200000
75%15.30000015.20000014.20000015.80000015.00000014.70000018.50000015.20000014.500000
max36.20000037.00000033.70000036.70000039.00000036.90000037.90000033.40000033.700000
\n", + "
" + ], + "text/plain": [ + " TT_TU \\\n", + "STATIONS_ID 3 44 71 73 \n", + "count 37224.000000 111003.000000 88391.000000 111471.000000 \n", + "mean 10.103922 9.933213 8.399764 7.501486 \n", + "std 7.200001 14.445973 8.779766 47.537112 \n", + "min -13.600000 -999.000000 -999.000000 -999.000000 \n", + "25% 5.000000 4.900000 2.200000 2.800000 \n", + "50% 9.900000 10.000000 8.300000 9.300000 \n", + "75% 15.300000 15.200000 14.200000 15.800000 \n", + "max 36.200000 37.000000 33.700000 36.700000 \n", + "\n", + " \\\n", + "STATIONS_ID 78 91 96 102 \n", + "count 113950.000000 113950.000000 6399.000000 106379.000000 \n", + "mean 9.872268 9.199869 12.730255 10.149991 \n", + "std 7.281215 8.400713 23.189555 10.728030 \n", + "min -16.200000 -999.000000 -999.000000 -999.000000 \n", + "25% 4.700000 3.400000 7.250000 5.700000 \n", + "50% 9.700000 8.900000 13.200000 10.200000 \n", + "75% 15.000000 14.700000 18.500000 15.200000 \n", + "max 39.000000 36.900000 37.900000 33.400000 \n", + "\n", + " \n", + "STATIONS_ID 125 \n", + "count 82589.000000 \n", + "mean 1.045942 \n", + "std 86.520406 \n", + "min -999.000000 \n", + "25% 1.800000 \n", + "50% 8.200000 \n", + "75% 14.500000 \n", + "max 33.700000 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from IPython.display import clear_output\n", + "\n", + "from pathlib import Path\n", + "import glob\n", + "\n", + "\n", + "import_files = glob.glob('import/*')\n", + "out_file = Path.cwd() / \"export_uncleaned\" / \"to_clean\"\n", + "#msum_file= Path.cwd() / \"export\" / \"monatssumme.csv\"\n", + "\n", + "obsolete_columns = [\n", + " 'QN_9',\n", + " 'RF_TU',\n", + " 'eor'\n", + "]\n", + "\n", + "main_df = pd.DataFrame()\n", + "i = 1\n", + "\n", + "for file in import_files:\n", + "\n", + " # Read in the next file\n", + " df = pd.read_csv(file, delimiter=\";\")\n", + " # Prepare the df befor merging (Drop obsolete, convert to datetime, filter to date, 
set index)\n", + " df.drop(columns=obsolete_columns, inplace=True)\n", + " df[\"MESS_DATUM\"] = pd.to_datetime(df[\"MESS_DATUM\"], format=\"%Y%m%d%H\")\n", + " df = df[df['MESS_DATUM']>= \"2007-01-01\"]\n", + " df.set_index(['MESS_DATUM', 'STATIONS_ID'], inplace=True)\n", + " \n", + " # Merge to the main_df\n", + " main_df = pd.concat([main_df, df])\n", + " \n", + " # Display some status messages\n", + " clear_output(wait=True)\n", + " display('Finished file: {}'.format(file), 'This is file {}'.format(i))\n", + " display('Shape of the main_df is: {}'.format(main_df.shape))\n", + " i+=1\n", + "\n", + "# Check if all types are correct\n", + "display(main_df['TT_TU'].apply(lambda x: type(x).__name__).value_counts())\n", + " \n", + "# Make sure that no files or observations are duplicated, i.e. scan the index for duplicate entries.\n", + "# The ~ inverts the boolean mask, so only the non-duplicated entries are kept. \n", + "main_df = main_df[~main_df.index.duplicated(keep='last')]\n", + "\n", + "\n", + "# Unstack the main_df\n", + "main_df = main_df.unstack('STATIONS_ID')\n", + "display('Shape of the main_df is: {}'.format(main_df.shape))\n", + "\n", + "# Save main_df to a .csv file and a pickle to continue working in the next notebook. 
\n", + "main_df.to_pickle(Path(out_file).with_suffix('.pkl'))\n", + "main_df.to_csv(Path(out_file).with_suffix('.csv'), sep=\";\")\n", + "\n", + "display(main_df.head())\n", + "display(main_df.describe())\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb b/notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb new file mode 100644 index 0000000..13bb693 --- /dev/null +++ b/notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb @@ -0,0 +1,601 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.) 
Final data processing\n", + "We load in the data that has been saved in the last step, so we don't need to calculate everything again if we pause the project and come back later. \n", + "### Data Cleaning\n", + "The data contains some errors, which need to be cleaned. You can see, by looking at the output of main_df.describe() at the end of the previous notebook, that the minimum temperature on some stations is -999. That means that there is no plausible measurement for this particular hour. We change this to np.nan, so that we can safely calculate the average values. \n", + "### Change the frequency\n", + "Finally we resample the data to daily means." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
MESS_DATUM
2011-12-31NaN3.882.761.194.302.43NaN3.80NaN
2012-01-01NaN10.908.144.0310.9610.27NaN9.01NaN
2012-01-02NaN7.416.184.777.577.77NaN6.484.66
2012-01-03NaN6.143.614.466.385.28NaN5.633.51
2012-01-04NaN5.802.484.455.464.57NaN5.851.94
\n", + "
" + ], + "text/plain": [ + " TT_TU \n", + "STATIONS_ID 3 44 71 73 78 91 96 102 125\n", + "MESS_DATUM \n", + "2011-12-31 NaN 3.88 2.76 1.19 4.30 2.43 NaN 3.80 NaN\n", + "2012-01-01 NaN 10.90 8.14 4.03 10.96 10.27 NaN 9.01 NaN\n", + "2012-01-02 NaN 7.41 6.18 4.77 7.57 7.77 NaN 6.48 4.66\n", + "2012-01-03 NaN 6.14 3.61 4.46 6.38 5.28 NaN 5.63 3.51\n", + "2012-01-04 NaN 5.80 2.48 4.45 5.46 4.57 NaN 5.85 1.94" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
count1551.0000004629.0000003683.0000004652.0000004748.0000004748.000000267.0000004490.0000003935.000000
mean10.10393910.0881538.4112449.6868559.8723429.20883713.19363310.2203458.466612
std6.7424606.6539837.5117087.8497766.6583997.1243246.7623276.0766497.711229
min-10.870000-10.710000-14.940000-14.320000-12.390000-15.710000-0.970000-8.170000-16.420000
25%5.4100005.2500002.6200003.3975005.0900003.8700007.5750005.7900002.365000
50%10.14000010.3200008.5700009.9000009.9000009.23000013.77000010.2000008.540000
75%15.35000015.38000014.07000016.08000015.12250014.82000018.19500015.26000014.545000
max28.41000028.45000027.19000026.94000029.89000027.55000026.98000027.33000028.030000
\n", + "
" + ], + "text/plain": [ + " TT_TU \\\n", + "STATIONS_ID 3 44 71 73 78 \n", + "count 1551.000000 4629.000000 3683.000000 4652.000000 4748.000000 \n", + "mean 10.103939 10.088153 8.411244 9.686855 9.872342 \n", + "std 6.742460 6.653983 7.511708 7.849776 6.658399 \n", + "min -10.870000 -10.710000 -14.940000 -14.320000 -12.390000 \n", + "25% 5.410000 5.250000 2.620000 3.397500 5.090000 \n", + "50% 10.140000 10.320000 8.570000 9.900000 9.900000 \n", + "75% 15.350000 15.380000 14.070000 16.080000 15.122500 \n", + "max 28.410000 28.450000 27.190000 26.940000 29.890000 \n", + "\n", + " \n", + "STATIONS_ID 91 96 102 125 \n", + "count 4748.000000 267.000000 4490.000000 3935.000000 \n", + "mean 9.208837 13.193633 10.220345 8.466612 \n", + "std 7.124324 6.762327 6.076649 7.711229 \n", + "min -15.710000 -0.970000 -8.170000 -16.420000 \n", + "25% 3.870000 7.575000 5.790000 2.365000 \n", + "50% 9.230000 13.770000 10.200000 8.540000 \n", + "75% 14.820000 18.195000 15.260000 14.545000 \n", + "max 27.550000 26.980000 27.330000 28.030000 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
MESS_DATUM
2007-01-017.38NaNNaNNaN7.426.55NaN8.32NaN
2007-01-024.67NaNNaNNaN4.492.88NaN6.730.51
2007-01-036.19NaNNaNNaN4.874.25NaN7.120.91
2007-01-047.69NaNNaNNaN7.825.85NaN8.344.43
2007-01-057.78NaNNaNNaN7.476.03NaN8.203.92
..............................
2019-12-27NaN2.033.952.272.361.412.213.792.78
2019-12-28NaN0.38-0.59-0.27-0.07-2.10-0.052.32-1.29
2019-12-29NaN0.68-2.04-3.630.07-2.41-0.972.81-4.40
2019-12-30NaN5.921.88-2.465.57-1.263.785.97-1.32
2019-12-31NaN5.541.92-0.414.05-0.465.567.661.91
\n", + "

4748 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " TT_TU \n", + "STATIONS_ID 3 44 71 73 78 91 96 102 125\n", + "MESS_DATUM \n", + "2007-01-01 7.38 NaN NaN NaN 7.42 6.55 NaN 8.32 NaN\n", + "2007-01-02 4.67 NaN NaN NaN 4.49 2.88 NaN 6.73 0.51\n", + "2007-01-03 6.19 NaN NaN NaN 4.87 4.25 NaN 7.12 0.91\n", + "2007-01-04 7.69 NaN NaN NaN 7.82 5.85 NaN 8.34 4.43\n", + "2007-01-05 7.78 NaN NaN NaN 7.47 6.03 NaN 8.20 3.92\n", + "... ... ... ... ... ... ... ... ... ...\n", + "2019-12-27 NaN 2.03 3.95 2.27 2.36 1.41 2.21 3.79 2.78\n", + "2019-12-28 NaN 0.38 -0.59 -0.27 -0.07 -2.10 -0.05 2.32 -1.29\n", + "2019-12-29 NaN 0.68 -2.04 -3.63 0.07 -2.41 -0.97 2.81 -4.40\n", + "2019-12-30 NaN 5.92 1.88 -2.46 5.57 -1.26 3.78 5.97 -1.32\n", + "2019-12-31 NaN 5.54 1.92 -0.41 4.05 -0.46 5.56 7.66 1.91\n", + "\n", + "[4748 rows x 9 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "# Import and export paths\n", + "pkl_file = Path.cwd() / \"export_uncleaned\" / \"to_clean.pkl\"\n", + "cleaned_file = Path.cwd() / \"export_cleaned\" / \"cleaned.csv\"\n", + "\n", + "# Read in the pickle file from the last cell\n", + "cleaning_df = pd.read_pickle(pkl_file)\n", + "\n", + "\n", + "# Replace all values with \"-999\", which indicate missing data\n", + "cleaning_df.replace(to_replace=-999, value=np.nan, inplace=True)\n", + "\n", + "# Resample to daily frequency\n", + "cleaning_df = cleaning_df.resample('D').mean().round(decimals=2)\n", + "\n", + "# Save as .csv\n", + "cleaning_df.to_csv(cleaned_file, sep=\";\", decimal=\",\")\n", + "\n", + "display(cleaning_df.loc['2011-12-31':'2012-01-04'])\n", + "display(cleaning_df.describe())\n", + "display(cleaning_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" 
+ }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/download/zipfile.txt b/notebooks/case_study_weather/download/zipfile.txt new file mode 100644 index 0000000..767ede5 --- /dev/null +++ b/notebooks/case_study_weather/download/zipfile.txt @@ -0,0 +1 @@ +Zip files will be stored here. diff --git a/notebooks/case_study_weather/export_cleaned/clean_data.txt b/notebooks/case_study_weather/export_cleaned/clean_data.txt new file mode 100644 index 0000000..2326a9c --- /dev/null +++ b/notebooks/case_study_weather/export_cleaned/clean_data.txt @@ -0,0 +1 @@ +File csv file for analysis diff --git a/notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt b/notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt new file mode 100644 index 0000000..168c7fa --- /dev/null +++ b/notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt @@ -0,0 +1 @@ +csv and pickle files stored here diff --git a/notebooks/case_study_weather/import/text_files.txt b/notebooks/case_study_weather/import/text_files.txt new file mode 100644 index 0000000..1db36b9 --- /dev/null +++ b/notebooks/case_study_weather/import/text_files.txt @@ -0,0 +1 @@ +Raw text files with temp measurements.