From 817c12c8a15f4b90e552fcf98d75df414d70c7df Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sat, 15 Aug 2020 13:59:07 -0500 Subject: [PATCH 01/10] Adding cereal data --- data/cereal_data.csv | 78 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 data/cereal_data.csv diff --git a/data/cereal_data.csv b/data/cereal_data.csv new file mode 100644 index 0000000..271adf5 --- /dev/null +++ b/data/cereal_data.csv @@ -0,0 +1,78 @@ +name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,cereal +100% Bran,Nabisco,Cold,70,4,1,130,10.0,5.0,6,280,25,Top,1.0,0.33,68.4,1 +100% Natural Bran,Quaker Oats,Cold,120,3,5,15,2.0,8.0,8,135,0,Top,1.0,1.0,33.98,1 +All-Bran,Kellogs,Cold,70,4,1,260,9.0,7.0,5,320,25,Top,1.0,0.33,59.43,1 +All-Bran with Extra Fiber,Kellogs,Cold,50,4,0,140,14.0,8.0,0,330,25,Top,1.0,0.5,93.7,1 +Almond Delight,Ralston Puring,Cold,110,2,2,200,1.0,14.0,8,-1,25,Top,1.0,0.75,34.38,1 +Apple Cinnamon Cheerios,General Mills,Cold,110,2,2,180,1.5,10.5,10,70,25,Bottom,1.0,0.75,29.51,1 +Apple Jacks,Kellogs,Cold,110,2,0,125,1.0,11.0,14,30,25,Middle,1.0,1.0,33.17,1 +Basic 4,General Mills,Cold,130,3,2,210,2.0,18.0,8,100,25,Top,1.33,0.75,37.04,1 +Bran Chex,Ralston Puring,Cold,90,2,1,200,4.0,15.0,6,125,25,Bottom,1.0,0.67,49.12,1 +Bran Flakes,Post,Cold,90,3,0,210,5.0,13.0,5,190,25,Top,1.0,0.67,53.31,1 +Cap'n'Crunch,Quaker Oats,Cold,120,1,2,220,0.0,12.0,12,35,25,Middle,1.0,0.75,18.04,1 +Cheerios,General Mills,Cold,110,6,2,290,2.0,17.0,1,105,25,Bottom,1.0,1.25,50.76,1 +Cinnamon Toast Crunch,General Mills,Cold,120,1,3,210,0.0,13.0,9,45,25,Middle,1.0,0.75,19.82,1 +Clusters,General Mills,Cold,110,3,2,140,2.0,13.0,7,105,25,Top,1.0,0.5,40.4,1 +Cocoa Puffs,General Mills,Cold,110,1,1,180,0.0,12.0,13,55,25,Middle,1.0,1.0,22.74,1 +Corn Chex,Ralston Puring,Cold,110,2,0,280,0.0,22.0,3,25,25,Bottom,1.0,1.0,41.45,1 +Corn Flakes,Kellogs,Cold,100,2,0,290,1.0,21.0,2,35,25,Bottom,1.0,1.0,45.86,1 +Corn 
Pops,Kellogs,Cold,110,1,0,90,1.0,13.0,12,20,25,Middle,1.0,1.0,35.78,1 +Count Chocula,General Mills,Cold,110,1,1,180,0.0,12.0,13,65,25,Middle,1.0,1.0,22.4,1 +Cracklin' Oat Bran,Kellogs,Cold,110,3,3,140,4.0,10.0,7,160,25,Top,1.0,0.5,40.45,1 +Cream of Wheat (Quick),Nabisco,Hot,100,3,0,80,1.0,21.0,0,-1,0,Middle,1.0,1.0,64.53,1 +Crispix,Kellogs,Cold,110,2,0,220,1.0,21.0,3,30,25,Top,1.0,1.0,46.9,1 +Crispy Wheat & Raisins,General Mills,Cold,100,2,1,140,2.0,11.0,10,120,25,Top,1.0,0.75,36.18,1 +Double Chex,Ralston Puring,Cold,100,2,0,190,1.0,18.0,5,80,25,Top,1.0,0.75,44.33,1 +Froot Loops,Kellogs,Cold,110,2,1,125,1.0,11.0,13,30,25,Middle,1.0,1.0,32.21,1 +Frosted Flakes,Kellogs,Cold,110,1,0,200,1.0,14.0,11,25,25,Bottom,1.0,0.75,31.44,1 +Frosted Mini-Wheats,Kellogs,Cold,100,3,0,0,3.0,14.0,7,100,25,Middle,1.0,0.8,58.35,1 +"Fruit & Fibre Dates, Walnuts, and Oats",Post,Cold,120,3,2,160,5.0,12.0,10,200,25,Top,1.25,0.67,40.92,1 +Fruitful Bran,Kellogs,Cold,120,3,0,240,5.0,14.0,12,190,25,Top,1.33,0.67,41.02,1 +Fruity Pebbles,Post,Cold,110,1,1,135,0.0,13.0,12,25,25,Middle,1.0,0.75,28.03,1 +Golden Crisp,Post,Cold,100,2,0,45,0.0,11.0,15,40,25,Bottom,1.0,0.88,35.25,1 +Golden Grahams,General Mills,Cold,110,1,1,280,0.0,15.0,9,45,25,Middle,1.0,0.75,23.8,1 +Grape Nuts Flakes,Post,Cold,100,3,1,140,3.0,15.0,5,85,25,Top,1.0,0.88,52.08,1 +Grape-Nuts,Post,Cold,110,3,0,170,3.0,17.0,3,90,25,Top,1.0,0.25,53.37,1 +Great Grains Pecan,Post,Cold,120,3,3,75,3.0,13.0,4,100,25,Top,1.0,0.33,45.81,1 +Honey Graham Ohs,Quaker Oats,Cold,120,1,2,220,1.0,12.0,11,45,25,Middle,1.0,1.0,21.87,1 +Honey Nut Cheerios,General Mills,Cold,110,3,1,250,1.5,11.5,10,90,25,Bottom,1.0,0.75,31.07,1 +Honey-comb,Post,Cold,110,1,0,180,0.0,14.0,11,35,25,Bottom,1.0,1.33,28.74,1 +Just Right Crunchy Nuggets,Kellogs,Cold,110,2,1,170,1.0,17.0,6,60,100,Top,1.0,1.0,36.52,1 +Just Right Fruit & Nut,Kellogs,Cold,140,3,1,170,2.0,20.0,9,95,100,Top,1.3,0.75,36.47,1 +Kix,General Mills,Cold,110,2,1,260,0.0,21.0,3,40,25,Middle,1.0,1.5,39.24,1 
+Life,Quaker Oats,Cold,100,4,2,150,2.0,12.0,6,95,25,Middle,1.0,0.67,45.33,1 +Lucky Charms,General Mills,Cold,110,2,1,180,0.0,12.0,12,55,25,Middle,1.0,1.0,26.73,1 +Maypo,AM Home Food,Hot,100,4,1,0,0.0,16.0,3,95,25,Middle,1.0,1.0,54.85,1 +"Muesli Raisins, Dates, & Almonds",Ralston Puring,Cold,150,4,3,95,3.0,16.0,11,170,25,Top,1.0,1.0,37.14,1 +"Muesli Raisins, Peaches, & Pecans",Ralston Puring,Cold,150,4,3,150,3.0,16.0,11,170,25,Top,1.0,1.0,34.14,1 +Mueslix Crispy Blend,Kellogs,Cold,160,3,2,150,3.0,17.0,13,160,25,Top,1.5,0.67,30.31,1 +Multi-Grain Cheerios,General Mills,Cold,100,2,1,220,2.0,15.0,6,90,25,Bottom,1.0,1.0,40.11,1 +Nut&Honey Crunch,Kellogs,Cold,120,2,1,190,0.0,15.0,9,40,25,Middle,1.0,0.67,29.92,1 +Nutri-Grain Almond-Raisin,Kellogs,Cold,140,3,2,220,3.0,21.0,7,130,25,Top,1.33,0.67,40.69,1 +Nutri-grain Wheat,Kellogs,Cold,90,3,0,170,3.0,18.0,2,90,25,Top,1.0,1.0,59.64,1 +Oatmeal Raisin Crisp,General Mills,Cold,130,3,2,170,1.5,13.5,10,120,25,Top,1.25,0.5,30.45,1 +Post Nat. Raisin Bran,Post,Cold,120,3,1,200,6.0,11.0,14,260,25,Top,1.33,0.67,37.84,1 +Product 19,Kellogs,Cold,100,3,0,320,1.0,20.0,3,45,100,Top,1.0,1.0,41.5,1 +Puffed Rice,Quaker Oats,Cold,50,1,0,0,0.0,13.0,0,15,0,Top,0.5,1.0,60.76,1 +Puffed Wheat,Quaker Oats,Cold,50,2,0,0,1.0,10.0,0,50,0,Top,0.5,1.0,63.01,1 +Quaker Oat Squares,Quaker Oats,Cold,100,4,1,135,2.0,14.0,6,110,25,Top,1.0,0.5,49.51,1 +Quaker Oatmeal,Quaker Oats,Hot,100,5,2,0,2.7,-1.0,-1,110,0,Bottom,1.0,0.67,50.83,1 +Raisin Bran,Kellogs,Cold,120,3,1,210,5.0,14.0,12,240,25,Middle,1.33,0.75,39.26,1 +Raisin Nut Bran,General Mills,Cold,100,3,2,140,2.5,10.5,8,140,25,Top,1.0,0.5,39.7,1 +Raisin Squares,Kellogs,Cold,90,2,0,0,2.0,15.0,6,110,25,Top,1.0,0.5,55.33,1 +Rice Chex,Ralston Puring,Cold,110,1,0,240,0.0,23.0,2,30,25,Bottom,1.0,1.13,42.0,1 +Rice Krispies,Kellogs,Cold,110,2,0,290,0.0,22.0,3,35,25,Bottom,1.0,1.0,40.56,1 +Shredded Wheat,Nabisco,Cold,80,2,0,0,3.0,16.0,0,95,0,Bottom,0.83,1.0,68.24,1 +Shredded Wheat 
'n'Bran,Nabisco,Cold,90,3,0,0,4.0,19.0,0,140,0,Bottom,1.0,0.67,74.47,1 +Shredded Wheat spoon size,Nabisco,Cold,90,3,0,0,3.0,20.0,0,120,0,Bottom,1.0,0.67,72.8,1 +Smacks,Kellogs,Cold,110,2,1,70,1.0,9.0,15,40,25,Middle,1.0,0.75,31.23,1 +Special K,Kellogs,Cold,110,6,0,230,1.0,16.0,3,55,25,Bottom,1.0,1.0,53.13,1 +Strawberry Fruit Wheats,Nabisco,Cold,90,2,0,15,3.0,15.0,5,90,25,Middle,1.0,1.0,59.36,1 +Total Corn Flakes,General Mills,Cold,110,2,1,200,0.0,21.0,3,35,100,Top,1.0,1.0,38.84,1 +Total Raisin Bran,General Mills,Cold,140,3,1,190,4.0,15.0,14,230,100,Top,1.5,1.0,28.59,1 +Total Whole Grain,General Mills,Cold,100,3,1,200,3.0,16.0,3,110,100,Top,1.0,1.0,46.66,1 +Triples,General Mills,Cold,110,2,1,250,0.0,21.0,3,60,25,Top,1.0,0.75,39.11,1 +Trix,General Mills,Cold,110,1,1,140,0.0,13.0,12,25,25,Middle,1.0,1.0,27.75,1 +Wheat Chex,Ralston Puring,Cold,100,3,1,230,3.0,17.0,3,115,25,Bottom,1.0,0.67,49.79,1 +Wheaties,General Mills,Cold,100,3,1,200,3.0,17.0,3,110,25,Bottom,1.0,1.0,51.59,1 +Wheaties Honey Gold,General Mills,Cold,110,2,1,200,1.0,16.0,8,60,25,Bottom,1.0,0.75,36.19,1 From 8f0e8225e66eca163a043b7dba31bbd1e4abe4da Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sat, 15 Aug 2020 14:58:40 -0500 Subject: [PATCH 02/10] Fixing typo in cereal data --- data/cereal_data.csv | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/data/cereal_data.csv b/data/cereal_data.csv index 271adf5..6a907de 100644 --- a/data/cereal_data.csv +++ b/data/cereal_data.csv @@ -3,18 +3,18 @@ name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,she 100% Natural Bran,Quaker Oats,Cold,120,3,5,15,2.0,8.0,8,135,0,Top,1.0,1.0,33.98,1 All-Bran,Kellogs,Cold,70,4,1,260,9.0,7.0,5,320,25,Top,1.0,0.33,59.43,1 All-Bran with Extra Fiber,Kellogs,Cold,50,4,0,140,14.0,8.0,0,330,25,Top,1.0,0.5,93.7,1 -Almond Delight,Ralston Puring,Cold,110,2,2,200,1.0,14.0,8,-1,25,Top,1.0,0.75,34.38,1 +Almond Delight,Ralston 
Purina,Cold,110,2,2,200,1.0,14.0,8,-1,25,Top,1.0,0.75,34.38,1 Apple Cinnamon Cheerios,General Mills,Cold,110,2,2,180,1.5,10.5,10,70,25,Bottom,1.0,0.75,29.51,1 Apple Jacks,Kellogs,Cold,110,2,0,125,1.0,11.0,14,30,25,Middle,1.0,1.0,33.17,1 Basic 4,General Mills,Cold,130,3,2,210,2.0,18.0,8,100,25,Top,1.33,0.75,37.04,1 -Bran Chex,Ralston Puring,Cold,90,2,1,200,4.0,15.0,6,125,25,Bottom,1.0,0.67,49.12,1 +Bran Chex,Ralston Purina,Cold,90,2,1,200,4.0,15.0,6,125,25,Bottom,1.0,0.67,49.12,1 Bran Flakes,Post,Cold,90,3,0,210,5.0,13.0,5,190,25,Top,1.0,0.67,53.31,1 Cap'n'Crunch,Quaker Oats,Cold,120,1,2,220,0.0,12.0,12,35,25,Middle,1.0,0.75,18.04,1 Cheerios,General Mills,Cold,110,6,2,290,2.0,17.0,1,105,25,Bottom,1.0,1.25,50.76,1 Cinnamon Toast Crunch,General Mills,Cold,120,1,3,210,0.0,13.0,9,45,25,Middle,1.0,0.75,19.82,1 Clusters,General Mills,Cold,110,3,2,140,2.0,13.0,7,105,25,Top,1.0,0.5,40.4,1 Cocoa Puffs,General Mills,Cold,110,1,1,180,0.0,12.0,13,55,25,Middle,1.0,1.0,22.74,1 -Corn Chex,Ralston Puring,Cold,110,2,0,280,0.0,22.0,3,25,25,Bottom,1.0,1.0,41.45,1 +Corn Chex,Ralston Purina,Cold,110,2,0,280,0.0,22.0,3,25,25,Bottom,1.0,1.0,41.45,1 Corn Flakes,Kellogs,Cold,100,2,0,290,1.0,21.0,2,35,25,Bottom,1.0,1.0,45.86,1 Corn Pops,Kellogs,Cold,110,1,0,90,1.0,13.0,12,20,25,Middle,1.0,1.0,35.78,1 Count Chocula,General Mills,Cold,110,1,1,180,0.0,12.0,13,65,25,Middle,1.0,1.0,22.4,1 @@ -22,7 +22,7 @@ Cracklin' Oat Bran,Kellogs,Cold,110,3,3,140,4.0,10.0,7,160,25,Top,1.0,0.5,40.45, Cream of Wheat (Quick),Nabisco,Hot,100,3,0,80,1.0,21.0,0,-1,0,Middle,1.0,1.0,64.53,1 Crispix,Kellogs,Cold,110,2,0,220,1.0,21.0,3,30,25,Top,1.0,1.0,46.9,1 Crispy Wheat & Raisins,General Mills,Cold,100,2,1,140,2.0,11.0,10,120,25,Top,1.0,0.75,36.18,1 -Double Chex,Ralston Puring,Cold,100,2,0,190,1.0,18.0,5,80,25,Top,1.0,0.75,44.33,1 +Double Chex,Ralston Purina,Cold,100,2,0,190,1.0,18.0,5,80,25,Top,1.0,0.75,44.33,1 Froot Loops,Kellogs,Cold,110,2,1,125,1.0,11.0,13,30,25,Middle,1.0,1.0,32.21,1 Frosted 
Flakes,Kellogs,Cold,110,1,0,200,1.0,14.0,11,25,25,Bottom,1.0,0.75,31.44,1 Frosted Mini-Wheats,Kellogs,Cold,100,3,0,0,3.0,14.0,7,100,25,Middle,1.0,0.8,58.35,1 @@ -43,8 +43,8 @@ Kix,General Mills,Cold,110,2,1,260,0.0,21.0,3,40,25,Middle,1.0,1.5,39.24,1 Life,Quaker Oats,Cold,100,4,2,150,2.0,12.0,6,95,25,Middle,1.0,0.67,45.33,1 Lucky Charms,General Mills,Cold,110,2,1,180,0.0,12.0,12,55,25,Middle,1.0,1.0,26.73,1 Maypo,AM Home Food,Hot,100,4,1,0,0.0,16.0,3,95,25,Middle,1.0,1.0,54.85,1 -"Muesli Raisins, Dates, & Almonds",Ralston Puring,Cold,150,4,3,95,3.0,16.0,11,170,25,Top,1.0,1.0,37.14,1 -"Muesli Raisins, Peaches, & Pecans",Ralston Puring,Cold,150,4,3,150,3.0,16.0,11,170,25,Top,1.0,1.0,34.14,1 +"Muesli Raisins, Dates, & Almonds",Ralston Purina,Cold,150,4,3,95,3.0,16.0,11,170,25,Top,1.0,1.0,37.14,1 +"Muesli Raisins, Peaches, & Pecans",Ralston Purina,Cold,150,4,3,150,3.0,16.0,11,170,25,Top,1.0,1.0,34.14,1 Mueslix Crispy Blend,Kellogs,Cold,160,3,2,150,3.0,17.0,13,160,25,Top,1.5,0.67,30.31,1 Multi-Grain Cheerios,General Mills,Cold,100,2,1,220,2.0,15.0,6,90,25,Bottom,1.0,1.0,40.11,1 Nut&Honey Crunch,Kellogs,Cold,120,2,1,190,0.0,15.0,9,40,25,Middle,1.0,0.67,29.92,1 @@ -60,7 +60,7 @@ Quaker Oatmeal,Quaker Oats,Hot,100,5,2,0,2.7,-1.0,-1,110,0,Bottom,1.0,0.67,50.83 Raisin Bran,Kellogs,Cold,120,3,1,210,5.0,14.0,12,240,25,Middle,1.33,0.75,39.26,1 Raisin Nut Bran,General Mills,Cold,100,3,2,140,2.5,10.5,8,140,25,Top,1.0,0.5,39.7,1 Raisin Squares,Kellogs,Cold,90,2,0,0,2.0,15.0,6,110,25,Top,1.0,0.5,55.33,1 -Rice Chex,Ralston Puring,Cold,110,1,0,240,0.0,23.0,2,30,25,Bottom,1.0,1.13,42.0,1 +Rice Chex,Ralston Purina,Cold,110,1,0,240,0.0,23.0,2,30,25,Bottom,1.0,1.13,42.0,1 Rice Krispies,Kellogs,Cold,110,2,0,290,0.0,22.0,3,35,25,Bottom,1.0,1.0,40.56,1 Shredded Wheat,Nabisco,Cold,80,2,0,0,3.0,16.0,0,95,0,Bottom,0.83,1.0,68.24,1 Shredded Wheat 'n'Bran,Nabisco,Cold,90,3,0,0,4.0,19.0,0,140,0,Bottom,1.0,0.67,74.47,1 @@ -73,6 +73,6 @@ Total Raisin Bran,General 
Mills,Cold,140,3,1,190,4.0,15.0,14,230,100,Top,1.5,1.0 Total Whole Grain,General Mills,Cold,100,3,1,200,3.0,16.0,3,110,100,Top,1.0,1.0,46.66,1 Triples,General Mills,Cold,110,2,1,250,0.0,21.0,3,60,25,Top,1.0,0.75,39.11,1 Trix,General Mills,Cold,110,1,1,140,0.0,13.0,12,25,25,Middle,1.0,1.0,27.75,1 -Wheat Chex,Ralston Puring,Cold,100,3,1,230,3.0,17.0,3,115,25,Bottom,1.0,0.67,49.79,1 +Wheat Chex,Ralston Purina,Cold,100,3,1,230,3.0,17.0,3,115,25,Bottom,1.0,0.67,49.79,1 Wheaties,General Mills,Cold,100,3,1,200,3.0,17.0,3,110,25,Bottom,1.0,1.0,51.59,1 Wheaties Honey Gold,General Mills,Cold,110,2,1,200,1.0,16.0,8,60,25,Bottom,1.0,0.75,36.19,1 From f90969dc64bb0edf48f6a52c8063573205109f81 Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sun, 11 Oct 2020 11:01:48 -0500 Subject: [PATCH 03/10] Adding files for new Case Study post --- .../1-dwd_konverter_download.ipynb | 109 ++++ .../2-dwd_konverter_extract.ipynb | 98 +++ .../3-dwd_konverter_build_df.ipynb | 488 ++++++++++++++ .../4-dwd_konverter_final_processing.ipynb | 601 ++++++++++++++++++ .../case_study_weather/download/zipfile.txt | 1 + .../export_cleaned/clean_data.txt | 1 + .../export_uncleaned/csv_pickle_file.txt | 1 + .../case_study_weather/import/text_files.txt | 1 + 8 files changed, 1300 insertions(+) create mode 100644 notebooks/case_study_weather/1-dwd_konverter_download.ipynb create mode 100644 notebooks/case_study_weather/2-dwd_konverter_extract.ipynb create mode 100644 notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb create mode 100644 notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb create mode 100644 notebooks/case_study_weather/download/zipfile.txt create mode 100644 notebooks/case_study_weather/export_cleaned/clean_data.txt create mode 100644 notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt create mode 100644 notebooks/case_study_weather/import/text_files.txt diff --git a/notebooks/case_study_weather/1-dwd_konverter_download.ipynb 
b/notebooks/case_study_weather/1-dwd_konverter_download.ipynb new file mode 100644 index 0000000..e3659cf --- /dev/null +++ b/notebooks/case_study_weather/1-dwd_konverter_download.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.) Download files from the DWD-API\n", + "Here we download all relevant files from the DWD Server. The DWD Server is http-based, so we scrape the download page for all links that match 'stundenwerte_TU_.\\*_hist.zip' and download them to the folder 'download'. 
\n", + "\n", + "Link to the relevant DWD-page: https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done\n" + ] + } + ], + "source": [ + "import requests\n", + "import re\n", + "from bs4 import BeautifulSoup\n", + "from pathlib import Path\n", + "\n", + "# Set base values\n", + "download_folder = Path.cwd() / 'download'\n", + "base_url = 'https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/hourly/air_temperature/historical/'\n", + "\n", + "\n", + "# Initiate Session and get the Index-Page\n", + "with requests.Session() as s:\n", + " resp = s.get(base_url)\n", + "\n", + "# Parse the Index-Page for all relevant \n", + "soup = BeautifulSoup(resp.content)\n", + "links = soup.findAll(\"a\", href=re.compile(\"stundenwerte_TU_.*_hist.zip\"))\n", + "\n", + "# For testing, only download 10 files\n", + "file_max = 10\n", + "dl_count = 0\n", + "\n", + "#Download the .zip files to the download_folder\n", + "for link in links:\n", + " zip_response = requests.get(base_url + link['href'], stream=True)\n", + " # Limit the downloads while testing\n", + " dl_count += 1\n", + " if dl_count > file_max:\n", + " break\n", + " with open(Path(download_folder) / link['href'], 'wb') as file:\n", + " for chunk in zip_response.iter_content(chunk_size=128):\n", + " file.write(chunk) \n", + " \n", + "print('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/2-dwd_konverter_extract.ipynb b/notebooks/case_study_weather/2-dwd_konverter_extract.ipynb new file mode 100644 index 0000000..ac8d1c4 --- /dev/null +++ b/notebooks/case_study_weather/2-dwd_konverter_extract.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.) Extract all .zip-archives\n", + "In this next step, we extract a single file from each of the downloaded .zip files and save it to the 'import' folder. 
Beware, there is going to be a lot of data (~6 GB of .csv files)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Done'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pathlib import Path\n", + "import glob\n", + "import re\n", + "from zipfile import ZipFile\n", + "\n", + "# Folder definitions\n", + "download_folder = Path.cwd() / 'download'\n", + "import_folder = Path.cwd() / 'import'\n", + "\n", + "# Find all .zip files and generate a list\n", + "unzip_files = glob.glob('download/stundenwerte_TU_*_hist.zip')\n", + "\n", + "# Set the name pattern of the file we need\n", + "regex_name = re.compile('produkt.*')\n", + "\n", + "# Open all files, look for files that match the regex pattern, extract to 'import'\n", + "for file in unzip_files:\n", + " with ZipFile(file, 'r') as zipObj:\n", + " list_of_filenames = zipObj.namelist()\n", + " extract_filename = list(filter(regex_name.match, list_of_filenames))[0]\n", + " zipObj.extract(extract_filename, import_folder)\n", + "\n", + "display('Done')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb b/notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb new file mode 100644 index 0000000..accf54e --- /dev/null +++ b/notebooks/case_study_weather/3-dwd_konverter_build_df.ipynb @@ -0,0 +1,488 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3.) Import the .csv files into pandas and concat into a single df\n", + "Now we need to import everything that we have extracted. This operation is going to take some time (approx. 20 mins). If you want to save time, you can just delete a few of the .csv-files in the 'import' folder. The script works just as well with only a few files. \n", + "\n", + "### Process individual files\n", + "The files are imported into a single df, stripped of unnecessary columns and filtered by date. Then we set a DateTimeIndex and concatenate them into the main_df. Because the loop takes a long time, we output some status messages, to ensure the process is still running. \n", + "### Process the concatenated main_df\n", + "Then we display some information about the main_df so we can ensure that there are no errors, mainly to ensure all data-types are recognized correctly. Also, we drop duplicate entries, in case some of the .csv files were copied.\n", + "### Unstack and export\n", + "For the final step, we unstack the main_df and save it to a .csv and a .pkl file for the next step. Also, we display some output to get a grasp of what is going on. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Finished file: import/produkt_tu_stunde_20041101_20191231_00078.txt'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'This is file 10'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Shape of the main_df is: (771356, 1)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "float 771356\n", + "Name: TT_TU, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'Shape of the main_df is: (113952, 9)'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
MESS_DATUM
2007-01-01 00:00:0011.4NaNNaNNaN11.09.4NaN9.7NaN
2007-01-01 01:00:0012.0NaNNaNNaN11.49.6NaN10.4NaN
2007-01-01 02:00:0012.3NaNNaNNaN9.410.0NaN9.9NaN
2007-01-01 03:00:0011.5NaNNaNNaN9.39.7NaN9.5NaN
2007-01-01 04:00:009.6NaNNaNNaN8.610.2NaN8.9NaN
\n", + "
" + ], + "text/plain": [ + " TT_TU \n", + "STATIONS_ID 3 44 71 73 78 91 96 102 125\n", + "MESS_DATUM \n", + "2007-01-01 00:00:00 11.4 NaN NaN NaN 11.0 9.4 NaN 9.7 NaN\n", + "2007-01-01 01:00:00 12.0 NaN NaN NaN 11.4 9.6 NaN 10.4 NaN\n", + "2007-01-01 02:00:00 12.3 NaN NaN NaN 9.4 10.0 NaN 9.9 NaN\n", + "2007-01-01 03:00:00 11.5 NaN NaN NaN 9.3 9.7 NaN 9.5 NaN\n", + "2007-01-01 04:00:00 9.6 NaN NaN NaN 8.6 10.2 NaN 8.9 NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
count37224.000000111003.00000088391.000000111471.000000113950.000000113950.0000006399.000000106379.00000082589.000000
mean10.1039229.9332138.3997647.5014869.8722689.19986912.73025510.1499911.045942
std7.20000114.4459738.77976647.5371127.2812158.40071323.18955510.72803086.520406
min-13.600000-999.000000-999.000000-999.000000-16.200000-999.000000-999.000000-999.000000-999.000000
25%5.0000004.9000002.2000002.8000004.7000003.4000007.2500005.7000001.800000
50%9.90000010.0000008.3000009.3000009.7000008.90000013.20000010.2000008.200000
75%15.30000015.20000014.20000015.80000015.00000014.70000018.50000015.20000014.500000
max36.20000037.00000033.70000036.70000039.00000036.90000037.90000033.40000033.700000
\n", + "
" + ], + "text/plain": [ + " TT_TU \\\n", + "STATIONS_ID 3 44 71 73 \n", + "count 37224.000000 111003.000000 88391.000000 111471.000000 \n", + "mean 10.103922 9.933213 8.399764 7.501486 \n", + "std 7.200001 14.445973 8.779766 47.537112 \n", + "min -13.600000 -999.000000 -999.000000 -999.000000 \n", + "25% 5.000000 4.900000 2.200000 2.800000 \n", + "50% 9.900000 10.000000 8.300000 9.300000 \n", + "75% 15.300000 15.200000 14.200000 15.800000 \n", + "max 36.200000 37.000000 33.700000 36.700000 \n", + "\n", + " \\\n", + "STATIONS_ID 78 91 96 102 \n", + "count 113950.000000 113950.000000 6399.000000 106379.000000 \n", + "mean 9.872268 9.199869 12.730255 10.149991 \n", + "std 7.281215 8.400713 23.189555 10.728030 \n", + "min -16.200000 -999.000000 -999.000000 -999.000000 \n", + "25% 4.700000 3.400000 7.250000 5.700000 \n", + "50% 9.700000 8.900000 13.200000 10.200000 \n", + "75% 15.000000 14.700000 18.500000 15.200000 \n", + "max 39.000000 36.900000 37.900000 33.400000 \n", + "\n", + " \n", + "STATIONS_ID 125 \n", + "count 82589.000000 \n", + "mean 1.045942 \n", + "std 86.520406 \n", + "min -999.000000 \n", + "25% 1.800000 \n", + "50% 8.200000 \n", + "75% 14.500000 \n", + "max 33.700000 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from IPython.display import clear_output\n", + "\n", + "from pathlib import Path\n", + "import glob\n", + "\n", + "\n", + "import_files = glob.glob('import/*')\n", + "out_file = Path.cwd() / \"export_uncleaned\" / \"to_clean\"\n", + "#msum_file= Path.cwd() / \"export\" / \"monatssumme.csv\"\n", + "\n", + "obsolete_columns = [\n", + " 'QN_9',\n", + " 'RF_TU',\n", + " 'eor'\n", + "]\n", + "\n", + "main_df = pd.DataFrame()\n", + "i = 1\n", + "\n", + "for file in import_files:\n", + "\n", + " # Read in the next file\n", + " df = pd.read_csv(file, delimiter=\";\")\n", + " # Prepare the df befor merging (Drop obsolete, convert to datetime, filter to date, 
set index)\n", + " df.drop(columns=obsolete_columns, inplace=True)\n", + " df[\"MESS_DATUM\"] = pd.to_datetime(df[\"MESS_DATUM\"], format=\"%Y%m%d%H\")\n", + " df = df[df['MESS_DATUM']>= \"2007-01-01\"]\n", + " df.set_index(['MESS_DATUM', 'STATIONS_ID'], inplace=True)\n", + " \n", + " # Merge to the main_df\n", + " main_df = pd.concat([main_df, df])\n", + " \n", + " # Display some status messages\n", + " clear_output(wait=True)\n", + " display('Finished file: {}'.format(file), 'This is file {}'.format(i))\n", + " display('Shape of the main_df is: {}'.format(main_df.shape))\n", + " i+=1\n", + "\n", + "# Check if all types are correct\n", + "display(main_df['TT_TU'].apply(lambda x: type(x).__name__).value_counts())\n", + " \n", + "# Make sure that no files or observations are duplicates, i.e. scan the index for duplicate entries.\n", + "# The ~ is a bitwise operation, meaning it flips all bits. \n", + "main_df = main_df[~main_df.index.duplicated(keep='last')]\n", + "\n", + "\n", + "# Unstack the main_df\n", + "main_df = main_df.unstack('STATIONS_ID')\n", + "display('Shape of the main_df is: {}'.format(main_df.shape))\n", + "\n", + "# Save main_df to a .csv file and a pickle to continue working in the next cell. 
\n", + "main_df.to_pickle(Path(out_file).with_suffix('.pkl'))\n", + "main_df.to_csv(Path(out_file).with_suffix('.csv'), sep=\";\")\n", + "\n", + "display(main_df.head())\n", + "display(main_df.describe())\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb b/notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb new file mode 100644 index 0000000..13bb693 --- /dev/null +++ b/notebooks/case_study_weather/4-dwd_konverter_final_processing.ipynb @@ -0,0 +1,601 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Import temperature data from the DWD and process it\n", + "\n", + "This notebook pulls historical temperature data from the DWD server and formats it for future use in other projects. The data is delivered at an hourly frequency in a .zip file for each of the available weather stations. To use the data, we need everything in a single .csv-file, all stations side-by-side. Also, we need the daily average.\n", + "\n", + "To reduce computing time, we also crop all data earlier than 2007. \n", + "\n", + "Files should be executed in the following pipeline:\n", + "* 1-dwd_konverter_download\n", + "* 2-dwd_konverter_extract\n", + "* 3-dwd_konverter_build_df\n", + "* 4-dwd_konverter_final_processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4.) 
Final data processing\n", + "We load in the data that has been saved in the last step, so we don't need to calculate everything again if we pause the project and come back later. \n", + "### Data Cleaning\n", + "The data contains some errors, which need to be cleaned. You can see, by looking at the output of main_df.describe() in the last cell, that the minimum temperature on some stations is -999. That means that there is no plausible measurement for this particular hour. We change this to np.nan, so that we can safely calculate the average values. \n", + "### Change the frequency\n", + "Finally we resample the data to daily means." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
MESS_DATUM
2011-12-31NaN3.882.761.194.302.43NaN3.80NaN
2012-01-01NaN10.908.144.0310.9610.27NaN9.01NaN
2012-01-02NaN7.416.184.777.577.77NaN6.484.66
2012-01-03NaN6.143.614.466.385.28NaN5.633.51
2012-01-04NaN5.802.484.455.464.57NaN5.851.94
\n", + "
" + ], + "text/plain": [ + " TT_TU \n", + "STATIONS_ID 3 44 71 73 78 91 96 102 125\n", + "MESS_DATUM \n", + "2011-12-31 NaN 3.88 2.76 1.19 4.30 2.43 NaN 3.80 NaN\n", + "2012-01-01 NaN 10.90 8.14 4.03 10.96 10.27 NaN 9.01 NaN\n", + "2012-01-02 NaN 7.41 6.18 4.77 7.57 7.77 NaN 6.48 4.66\n", + "2012-01-03 NaN 6.14 3.61 4.46 6.38 5.28 NaN 5.63 3.51\n", + "2012-01-04 NaN 5.80 2.48 4.45 5.46 4.57 NaN 5.85 1.94" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
count1551.0000004629.0000003683.0000004652.0000004748.0000004748.000000267.0000004490.0000003935.000000
mean10.10393910.0881538.4112449.6868559.8723429.20883713.19363310.2203458.466612
std6.7424606.6539837.5117087.8497766.6583997.1243246.7623276.0766497.711229
min-10.870000-10.710000-14.940000-14.320000-12.390000-15.710000-0.970000-8.170000-16.420000
25%5.4100005.2500002.6200003.3975005.0900003.8700007.5750005.7900002.365000
50%10.14000010.3200008.5700009.9000009.9000009.23000013.77000010.2000008.540000
75%15.35000015.38000014.07000016.08000015.12250014.82000018.19500015.26000014.545000
max28.41000028.45000027.19000026.94000029.89000027.55000026.98000027.33000028.030000
\n", + "
" + ], + "text/plain": [ + " TT_TU \\\n", + "STATIONS_ID 3 44 71 73 78 \n", + "count 1551.000000 4629.000000 3683.000000 4652.000000 4748.000000 \n", + "mean 10.103939 10.088153 8.411244 9.686855 9.872342 \n", + "std 6.742460 6.653983 7.511708 7.849776 6.658399 \n", + "min -10.870000 -10.710000 -14.940000 -14.320000 -12.390000 \n", + "25% 5.410000 5.250000 2.620000 3.397500 5.090000 \n", + "50% 10.140000 10.320000 8.570000 9.900000 9.900000 \n", + "75% 15.350000 15.380000 14.070000 16.080000 15.122500 \n", + "max 28.410000 28.450000 27.190000 26.940000 29.890000 \n", + "\n", + " \n", + "STATIONS_ID 91 96 102 125 \n", + "count 4748.000000 267.000000 4490.000000 3935.000000 \n", + "mean 9.208837 13.193633 10.220345 8.466612 \n", + "std 7.124324 6.762327 6.076649 7.711229 \n", + "min -15.710000 -0.970000 -8.170000 -16.420000 \n", + "25% 3.870000 7.575000 5.790000 2.365000 \n", + "50% 9.230000 13.770000 10.200000 8.540000 \n", + "75% 14.820000 18.195000 15.260000 14.545000 \n", + "max 27.550000 26.980000 27.330000 28.030000 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TT_TU
STATIONS_ID3447173789196102125
MESS_DATUM
2007-01-017.38NaNNaNNaN7.426.55NaN8.32NaN
2007-01-024.67NaNNaNNaN4.492.88NaN6.730.51
2007-01-036.19NaNNaNNaN4.874.25NaN7.120.91
2007-01-047.69NaNNaNNaN7.825.85NaN8.344.43
2007-01-057.78NaNNaNNaN7.476.03NaN8.203.92
..............................
2019-12-27NaN2.033.952.272.361.412.213.792.78
2019-12-28NaN0.38-0.59-0.27-0.07-2.10-0.052.32-1.29
2019-12-29NaN0.68-2.04-3.630.07-2.41-0.972.81-4.40
2019-12-30NaN5.921.88-2.465.57-1.263.785.97-1.32
2019-12-31NaN5.541.92-0.414.05-0.465.567.661.91
\n", + "

4748 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " TT_TU \n", + "STATIONS_ID 3 44 71 73 78 91 96 102 125\n", + "MESS_DATUM \n", + "2007-01-01 7.38 NaN NaN NaN 7.42 6.55 NaN 8.32 NaN\n", + "2007-01-02 4.67 NaN NaN NaN 4.49 2.88 NaN 6.73 0.51\n", + "2007-01-03 6.19 NaN NaN NaN 4.87 4.25 NaN 7.12 0.91\n", + "2007-01-04 7.69 NaN NaN NaN 7.82 5.85 NaN 8.34 4.43\n", + "2007-01-05 7.78 NaN NaN NaN 7.47 6.03 NaN 8.20 3.92\n", + "... ... ... ... ... ... ... ... ... ...\n", + "2019-12-27 NaN 2.03 3.95 2.27 2.36 1.41 2.21 3.79 2.78\n", + "2019-12-28 NaN 0.38 -0.59 -0.27 -0.07 -2.10 -0.05 2.32 -1.29\n", + "2019-12-29 NaN 0.68 -2.04 -3.63 0.07 -2.41 -0.97 2.81 -4.40\n", + "2019-12-30 NaN 5.92 1.88 -2.46 5.57 -1.26 3.78 5.97 -1.32\n", + "2019-12-31 NaN 5.54 1.92 -0.41 4.05 -0.46 5.56 7.66 1.91\n", + "\n", + "[4748 rows x 9 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "# Import and export paths\n", + "pkl_file = Path.cwd() / \"export_uncleaned\" / \"to_clean.pkl\"\n", + "cleaned_file = Path.cwd() / \"export_cleaned\" / \"cleaned.csv\"\n", + "\n", + "# Read in the pickle file from the last cell\n", + "cleaning_df = pd.read_pickle(pkl_file)\n", + "\n", + "\n", + "# Replace all values with \"-999\", which indicate missing data\n", + "cleaning_df.replace(to_replace=-999, value=np.nan, inplace=True)\n", + "\n", + "# Resample to daily frequency\n", + "cleaning_df = cleaning_df.resample('D').mean().round(decimals=2)\n", + "\n", + "# Save as .csv\n", + "cleaning_df.to_csv(cleaned_file, sep=\";\", decimal=\",\")\n", + "\n", + "display(cleaning_df.loc['2011-12-31':'2012-01-04'])\n", + "display(cleaning_df.describe())\n", + "display(cleaning_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" 
+ }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/case_study_weather/download/zipfile.txt b/notebooks/case_study_weather/download/zipfile.txt new file mode 100644 index 0000000..767ede5 --- /dev/null +++ b/notebooks/case_study_weather/download/zipfile.txt @@ -0,0 +1 @@ +Zip files will be stored here. diff --git a/notebooks/case_study_weather/export_cleaned/clean_data.txt b/notebooks/case_study_weather/export_cleaned/clean_data.txt new file mode 100644 index 0000000..2326a9c --- /dev/null +++ b/notebooks/case_study_weather/export_cleaned/clean_data.txt @@ -0,0 +1 @@ +File csv file for analysis diff --git a/notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt b/notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt new file mode 100644 index 0000000..168c7fa --- /dev/null +++ b/notebooks/case_study_weather/export_uncleaned/csv_pickle_file.txt @@ -0,0 +1 @@ +csv and pickle files stored here diff --git a/notebooks/case_study_weather/import/text_files.txt b/notebooks/case_study_weather/import/text_files.txt new file mode 100644 index 0000000..1db36b9 --- /dev/null +++ b/notebooks/case_study_weather/import/text_files.txt @@ -0,0 +1 @@ +Raw text files with temp measurements. 
From 3e5f71f9299350159ef5e8fd3da26bae0d6fbfe3 Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sun, 18 Oct 2020 22:11:16 -0500 Subject: [PATCH 04/10] Adding Excel file for next article --- data/shipping_tables.xlsx | Bin 0 -> 16307 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/shipping_tables.xlsx diff --git a/data/shipping_tables.xlsx b/data/shipping_tables.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..aad8f1be1aaff811557fd2939520ca62dc6b8fc9 GIT binary patch literal 16307 zcmeHuWmH|swl3}-+=IIZ*8qXw?(R--cXxMpcXxMp*8suY_0c)!-gKw?zBk_Qdlrnn z*<-Ewv8!rY&2N^JC@=^z02lxS006*80K2dT+9*H(fJiU^fDZr=K&t$f7Pk5pwwiKI z-}G(NX&lYX@UuaH$T9(d-k<-^^}l!qdgVn$f$0(2pA5uz-(?pceqKr0Rm zmSM+2M&(vK&dipBh4HAU9ls8eTyc@aEOo7QFKvH7O5UWjQoa=gY=>LEHzr;el5E40 z#wV|cgT=YQIMu~uaZIxkQ)47<(1N?Hk zNGDEi6pTGjtK2+L&YK&W zypP{1$OSkxdf<&hz=L3(Ya`4jB3D#N_TZ8;U5L$=<+0X6*3g??aMhhBZyjKrp&Lfu zVzyY`mGJEi7(nVTmRu)KM|k~yN9^5Dq2Dc8(^}umhL+}!*Z)}X|6;TK)6h$!#U*>_ zVEoU7p92Q(=GGz+K8rf=3AcQdbNBqZ3SSqNLxi=~L5htahv@?>>+-;JOh!dH8TLeW(V_}x)&<5d` znG1m`gb5l&Ovn#;*#4+oNnR@b;>znf_vL_dpJh+V{3{z+G7n=WQal#o^ABKo0@=il zr;|_yZFCHm%3TM|KHfZF%F7zD8CUA0TeA~5Y3iAEUJ0ahe0XuCmPj3xAz(tfXP6Kg zAkMt;P^o3Q7)f*LWr6G|89X2I4#6v2e-CPZHxkplHs5$)0D!jl=m-BkGmd7o4wlxY zI+m8Ee}uYHd3DPiR%AC#<2T^#G@XA9g=+9(3>R2>vLQ?cmc5m4HJb;0RmJ|~q3{qA<_k~;*!i98%`sUs_Y8dnEzGoRaX!ka z%0*Z0w1?fp2_71j&)l?rj?4g=2Rb!!#bIVGBccj3r=M#}z2f2`KgrX@O4=`odAi|nn!SEI$wwrsD~oY+d=8Hf-`yejNO<#{G+|jTDk$C zY6nG;7P!!O*2(r|7UF9HN@H7JdR5%{{rLa{rZp>MJU6!dm0GYw=`1Mk3dc0w^T1&L zJrT%C1U)PKP#v+VO-eSZDmY3 z-^wjznO#1`OrcRhKB!`8;20uIpMRRzV0+qitGE0%0(i>pobcG) zMmrvoq^dVvf1FNrRxok4bY@SRW=BaAAO|@p*=nPq$DJAj8Pz1(JrwmAVe|26Aw`&m zLRFH$)frAB&xGhu%Y#2lhNt(pZak*Suputh+8$WJ2e{%Tr=Q#%aZA&M+FLaf+fWiWZN zBg7c>-g-zX8p#F{yw6jt7%u>Zxp=$Yx5cX^y!-?CM=v@D@fPru=z?6d>qf2FFB8)* zl4IH@0rpA(H(oX3cll^sO!X11-!)TQ#T*6hB$o!f-8au_KRn;+uY?%Z3#ihV3_2$^v;f z(A*Y4?%qJ(WpDF2@dAI`+L12O>8BeW4!NNRlZ;yN~vyODZsJ`J$ST`}WvNmV-`xgFvn-`MA#v)VX&W$Cfp 
zbn6>u8pj=H5^G-{x0tAHOBzRwSi;ZMbZ!~(w*ZG-3c%KQc{kL*n{b5tybHp+anHTq z#RPx=d^h2Lg}Of*@_&Ur!1u)U{n`KBS8I%?S&}Ln%T^)?qwGQ3A5T zE0~99m<;aYBEFpu14dDt3YTn&$K7-_DazE4x^TVg?E@U4Xd*~#&DHv~HB5J+(^5IW z@fFzHl#@Y5IumOL7Siar&G$~P3@n*wgaV}^&Bp;!5w zxXfuxYJyo&M~%u`_2WfAA7w@Q(FffyFc|%|SxIr5^b~`%bfu_SFrFHuNZ^X#T(63` zsnQ7)(c8-`0v7X;%Tawj>DIwP6@fq3b|JSxc_&LB zn|8)5pKmBf!VADK{V{2bKJ?LVbPSYAWKncAZZ&$d?6O66dXxf4~kkAs8%%&_BwFmT!><#Zl2H^%&7-8I42o`}w#sGRS4655rH zI||r9L`dtH709O<4)Iko_naLHodaqcT^q|g2sx7H05p^zo5TMM1W|$E0Xeiyd*a-D zB5o?fi2Ybt>sWX{ZTy3h&I*NR$OKr_HOfaC=TDt1(x6_SP23t4kgH%;pwC8#4@^bW zU0T*t5FO z6n3l~PwwhmDvY8|1n!}+;67quUK_%iPLB-RqwYbMG8(h&7T5<<^Bsb~VSMM&9mi?E zfNsB_Y`-X6rQ=)7gc~+&ZmvtDCS(Y#>Tmzjz!;f)Z~ujEZ*g_Ja)7bEfl*4V!`%nC zsoAt((fUP^ABfsYNEkbbc>+41KyJS$d=?Rz3>FHEG>BYicvS_3 z0pYazS=8c<+}vcRLt(y*^gbIOuaRd6@3*4js4MzS#W}Ku6vV*928#V~{WIqhC5%!E z2Qr&Hh5#fH*X_r`R?vj(QK8p9PtCY2oy9`@@q%C{VI@msXrI1)IPG-gJ*30M%A=e@ zA#YI>{}$Bq5<0nW?|8EL-b%`rD3l*)jbav7>uBQ;w`CTtAQea&<@6yU0N<37ptfU+ z&{5~H`nr3KO`0n+#W8MrCU3~A5g}>h(J30x-;rji{)N;y^2hgTMOe$CGLzlwJUw1^ z3=0NHD2AC+g}EOX{(`UY-~Y|bLD=U&CjnmTCR1X63q_#1XgdgyRBWMCP_ptgmJpbqmQ9m@XZoJ3U z9N7QS7c%_xg^DZTIjjh7c*bvl4_qI?tUp3w*zv2(z!QeqI~Zi(Y!fK}8<>+G%p1MB zjj-~1<|l;4!X;2-u$NP0l$UaM%vK(CpyB}8f7+1D0jTY6YWz&Pdj@KxsB3itBt>3}U0ugXrv=~z=A!=w=g!KPC8Tdqbu(Z} z=aLD}$;M%?>6KH(>{-*OZJEBXay^cCpPLunv8jtuuR>7}_(kiQ!19z@u-;?Jw?FtZ zLMYW*hU&Ox?RUo3WPn^MY$l>06pool5=i`s%)Ylze8B+rDB5ksXCp?Ef78@ zc-7kj=#Lk<>186S-j0#ueED%`4a$HZl?wz^o>1b(?MRQDrv$TLebIK&MhT);f$-{h z$o6VRkJk=)PUYBHw{y`EVjeS+Erlqp*M{`0(*^16I+SX})jhJV!i!jY=3`G2&zntr zJ>V3EM)?FIA5lRQp^j!aZ40@@2#0i)wXHP-Z12Y~@ zUR*{qEan5?c+bPCMJ38*Rn>!in{BLu`=P(d`PA^I+7uv)~Cg>d5`f?*g`9F9!U683}*I(2u!q$4x5gGO9+%;*=xgJqW&Vt27 z>9YN+VlZ;F@ES(BmQl9hdnrxOTaxH6^#^C!XIQuIakrrA9O4q=tp#CqN$I~d!z8rd zwu#ni?L2volvSl9%A8+!SwI&i9gXfs#sI&F^m^$}i|n^2E^37$85E|3ZfRVoO<48G z^`M06?#>{(Pe~;bYxVQbr`yPLdk%LCCr?@mkQw(q=aTZl5XiRG^~%8aZaW1)ijMxm+l>raZeGG?ORm^yUSQa^>j+4q+5@~P z(y>LtvR$GSGZ7Vg*bt%Dwt|AYba&?~+l%S8L}#Hy=-=p##F* 
zgc+y<*f>7xiKvu$GG*w5(f1nOI#Mof3?B)PVdF$RT#8yHx9Foj!-kS=C>B={XH|Xg zss0@3w=Z*w^Kto7JDr{bciUz9jtmLO-ROb%6Vj2eCC4(dVoC%oyDFil4BP;oKr%#Mw}<**Bqt%N z8$-z0HhWiyXCRO){gFtt8s*4gMda@*xV>KU34`nD7;@v(wStteWNsaGdWXd7pD8ov zSnz4nxm;e)53T|_xIC`+A9^&ao9ZY;Ex?IoWl!fKG~Zrc(AG6OUe6CNSPo`MFS>g> zo*zaMI^HhWZz{J}QE1!UE-w#a&^Fv|en>{(EU#eWB%Xf^syZ*<(mf*y(e{FSx&Y)! z>!thL8_kQjZ(n#iG6bX7Oh9^HX` zvp9L={QO(G*jmh0Q&3CLWR&SCjYuiL-4Di0#ci3TuuArD+Jw$JZ!P=ush@_mBNWeLm{OVDc%2#xcJ>UkDuLBC^JKPhl zdM|UdsP+DIQwTa^6qo&Cm=veYvRL$BuR96cN5LoupRZ`H5lb8bT>RKl)%2?kkoDa! z1_X_uJ0{7oq=Aq6<2Y5Vp-F9cXtv&!A^OXl^^0|5L5v*iS4DBlE$Fhgq5bGC4csvAQOSZ3H*y~^ z1=d8=S$9+AoYTVL#5X?y?c(i(%#bgD?$brLbbLJugHSe1a7I0KRsTlmZdo&+rMb>^ z(hh>*YJ!1(+i*LEAZ_Z+sqGH|1%tNc9f{tp4K8pu05DpO~p^wrL9(WN%?PVzPybN6pg#RfAP z+!gxcrb$;S{WrHad?rb)u3p0=XP5iwlC4#{j2~l9M?m{pK0CO9v`&SB(-p2rHufNur8Z34r=g3Hsshb;MM~RX#S@tAxv+in2gGM~QzcI*#bl!Dr%*3fdO@uV-# zF^ok?fKocNjQA0@vP;cYP9}E7#chJQ)P$k7HDPqqblzxNR2*aRPs;txNddDAl1(~I zSk{b(Vep8if&`?EUL+*4lf4`k4|OIG4HSe-rVk(X(R&sg!I}tXRnf9#8|^W^rd!-Q zXvbrqRR;*fva7{N41>`}CA!4+QfpFs#KAI6VP~6DDtaPtt9(!9rNm3ILa^l+bY$ov zH1=OZmrj?!^V*Uv|Mptkw}Deiw1nN=h8Gsm2F(701u<7iiIx&PJjgS|$NF%L-YcaN zC2IG>(;DW2yho2b;c-z zxN@sRPjF3*^hzxw6$FgrEc3m&-4|A_i*iz)!L(5oJ6@_Ik+l2A;nmhT18@f~PUmkP ztOYXrQlBjo2+1C&u?AzAwL(^7l*1r4TG>dc-`vT~z}G|;sbev(JOZ@E7 z6-Hhv)eh|_;5W6Rq}AYNjvfKkv?m4ajVlHgXimTHQjhr|hJ=V|j-u6VJ#M$FExB5q zA>yY!)eUDVo7T2zhyXmz9+?ll0zx`v$I^(Na_~Uy{4*#B$zyJPG zpRloYGSj#DBN5+G+^}3>MsUZgeB+by&!W!X51^i~C)TgS0R(L(;APN)2L)2eHX}ej zkd~vcTzqRMBww+eVM+?HfOn}HBs{xcTXPU%Z`wCGT-BXI7?)w;gjWNv@lNn9letSs zS;cCRAVj1@+=VGPMP)e^yggz0O4xv`SwSBUm#8IvSe~~k=GH+z-}q&WZ9@x;5(Vv3 zG#v_M#}Y*7@E7vA*$8i~orOTU1*OFKKAX^4;ng~~{2SEjntM z#Gr-htO9;BeuRi@X^S}7dGO;!Uq3qbP&&ccyJ^VhcEWE-JO@Pt`w&;UCg;)J`s@$h zNu(MX??byv$~~@9X`81lR9iuo=c=VDV&%Zc=B1NQ&UXm) z&}&j*sGKXOGIn=3MFjm?Xoj3>y(OH9);W*lZ(?RYibyQVH(L)i=W7whDg!h&=&mX5 ziH8}m&ol@494s?E^L) zLEfgI{$t^Au%x_t5%xvYsUoo4e^Z!;>2?(47>_(DnoeaaM+I)#Y+u1jZiShVb(1c| zcmV#2CAL&4%-oN%i8!o-5+nP_xr<9*ymzfB<@d-Ag5C{WY(;oKMA=Ke-mW)yN@L_ 
zcjp=?nF+r}iSgu6$ito+N;e8ONh#v6^z&?Sho)AV0d-Za_Z8{B{*=XWTE62PcSm7m zTL4h51eH;Vp%uGkmDRmk>p|u;lU@f`)gOkuQ*JruT(w!+TUsU97?T(mk7P+qJi0zz zDm}!bqN>8UZpZ3)F07vVrBwh{0rgt{Vs53T39>Bx)4ipIUK$(+=_Kl#Wz%`x+acOL z#axT@2HjILU~od?XX-eXr@mRPP>+6MW7GY-mvTz$i9F`0=70+hyEQ*TL0bH?7BCIw zb2}A2nY)OQrHGh5eO#ySTFkfpz}PM^j`io;Tdws=J2CiBdmbtk3m)h8a}nl^Qr%m% zS?Ap-Z8co?M?O?`WkFsQw3MFax)}mFo8s7D8r%?{Em=S7T5@9cYT&+a5Sdcd`<=|R zBP3nFgSn$8FH{5JNfH;?wc+-3ROLkyCAxP(dL%%;olBBDLH|<`6MGgNLGImsD8D-j zR!O=LJEI2Kn2mtD>nW#fowztxi@!~aO`11)#5BbC*g)a`^DY}m(fM=L# zxJ7R~>fNA%Xn+}0X4R>BnP-_fqLAK(Po?{E4~S^9=C^up^Hw6oH;xe$!(vj$v9n%m z7PRoNh-}}+rlxGLCO`VQ^Ss>NyoM?>VLz5zF~;TzEv+)alpcnAE-@8}XU?+3##%R85UQAo=E_BUiVO z|D=b7sW$+v?;|Jvf6c(pGSYv|08}>}g20*g8^W!{^m|)Lep}#V$v$}LK|O5!*TFAS z9W0xIeCJHZa8CvWCb(Bbr^rjr1zEA65Z3(;^RSrX1wAIv-XEIbNY8|OV$hSFn+n63 zw1X5*FDXe@kochxr z_OJ%348K(xeS_;;VqVAZjnoBiaSTHQS!YzaQx;dXi9;&Vo(#Gh^mxYMaeIRThR1d1 ztdkUZ{K0y#Knk2u5*Y_ucXV;Ly(VqzK6&Ex-QOB{Cq{Eg5P?=x=S2NU_5RxY=_rGp zr$yA>^=)^9)9cCQCbB~FL#j}pRe+n9%^9@8;sCnKEIl0k7G?4@)F@lZ`6tLP0Rm$# zvyoB2{q{vq*Sf6)ub#Kjp$V`5m?B+9X*BWCTq^@D=wI>xoKb}XX1B28)@3J_?5 zRf7mcG9x-QGJ_$;RQXA~)692bmvWFU+TYB0axcY*%jMvl2*t_OgL;kiF_(-?BT1B_ z#C^PR0*^4(>#L{7z^d%<^!=L3D3-{GtRjXalRg=aVuDVUQ`CJ)?Om-YHlMrAC4jyC zlJiMO(Og|m)Q^yq%3=U zv_Iee#T3PgTBgz=``yJo;zi!U!2<*5&>_-_MlI7M+v2ckF(?2Mt`IR6Uv9XJaZYoh zN;}*yzq*!B&?hLiWI5nAO@9u@xTNLWIj@Q;L&sydK8oK4VOy@O(r?3d^1oHfDH&U! 
zP4Hj4TlHGY8K0a_G(Kr=Ub^%;fM&ZuG+-|RGter;RpTNbS83Z_g=xcb)HsF{3*F~c zf*&))2bS&l0m)*Wn230njb_kM8Og|WbQB;>Q9{Abv3rcn*5z=B* zK6UPZ`7$a0EWKFb#259Z*Jp^jk7Vu-q*Z)l#`CtbMR94ItiwY}4;RwfZ!Cw*nk$$d z<0&VN$U|tT2h3E^pwn!c%RFf8m+oBuc2QsMn2^Pwyx<;+O$d8*Bl%!k6DO&~pwtRV z+vqJ*cT~>j8Nr-j#vC2I{%yB)*&S86?=`p=-#pG25>7Yzdo~rnT0i2Ah>BuKGAD4M zDt_Ma4{ck_r4``gq-^sCrO|$>ul%P2lrNGMzR=H}2kqBYY`)i-0zR=$Oc(_CR*+}X zS)Tyl{93WcGOfg8;)JZ^rU0l4ky3NrL)%!Pa;EQkax)DXeYbkjmLykiuTWc*T)P$Q zkdWPhZXbDJt^zkX15HwQe5Y;YJIx&XB}9Zl4_*UJY}`;&5(HI*q&?dz30TF9!ei)2 z6G8a49%6Tb652(^L1>?1aSayf0X8L}bE5BF!j@J@<;F$2GPI;OUuP1z>zk-ra5eFfYIOP49i15rr~4!E#ZoK420N!*(ap@>jQ8v@LhOrGWd)?D6ht z{_cY+{EZ^v-kV#j-z#+=0LTDz?QCo<&6Ui}XpL?4&3~#Vel`&RStkCwgZjT;JBdry z$wCNzKcv;#?fO1bq2{(Cw~P|p3a&y%B0wMj`FaLOUZ&xq_7N-Uk+zFSpTE`EWt?)) z%=Bbt`IB=+JCB4CzjU;G;D+8=C-BgrT))DeW)xcR zK7(c4NZEt5(TsjYZM*0p_1cTALn_}kISsO3)-NSB?f`bT5iVy~{-(Z;DcsI~w2x2R zuR_bX)qZI|&f>LN4&5C1(cH_pZK1$`ny4yV#|xkRb2Q`cd; zsqPZM%8yd9$&Ji{foHg70;MWDw9wLfSo{<0B+5w6Y?~bsVU?L;)zXP(!~qOdPaSW|1^3V?Gm8QC6!tuf?Y`OegbtZpY-M%D_V&b zq~-f+V+Q##4=nssjY?&e-~Q)?!dg+d;;;k3rbSgq_TYY^db>xJ)h2frgRqD(FaW^7l3?cs5 znq>HbtS%daooCs9kC`raZWEh^T7c4nrTxXbv?7ysNC^<*Be%`okFXc*?CCF}=u4tk z@~CgZ18f;2=_Q-0ZZh~Y5(k7i!^c41mpgPLefP1Z7P>n0M--^P)Ys;fLA?r@!6waq zVnkbPVzvxwE5J8nWPvCL;3W5yAB?H5h%fEXjYsF?H;AxZcz4HZ=)#*NTE4ciS|*x@;gpcutL2qQcTf2a_ynD9`xcNnx+%eXHS z-vCpy`<`Rni}FJMEX{;V6J(lxmPfB02$m7xK>|%uRqdOPDQx3dY~&mxe6|grMm9a| znEFq{%rq>=c9+>E(`_BWL5J6Z`D zLjEJ)Q?z{c#4j2y8}2dc?WCFF!r(7k&LVx&rSk6e|FHOQ%>Qd znWJczJKj{by9@J*QV#Ywys4pEbE^oladVMkg1MU)`q1DdDPc@GVd=QwR7ycHc-m6u zT6$G&_9QPKGJZ~fLgao=y91?(`fOf3hmKtgkGB7H#r?x6wt3ojlx^BnHV)lG1wOel z0<=?R-R=ZIjs8nfu%|hTXuutCdF<`xW4Sf9dK;oa*@yjbyL8EEMuYkdMWJ!hNvsY1 z3`W*Bh>ekZ{M*v-I1l2E033~Sisq~f5;GTJ`S^Q=D$Yf~iN(XgvN72_@)W3ZeXWZX z0xl8Cko2fNtC!F1a@2({1RnAiK9}9CEza3^VoTvWa6r~soaXo3C;x_pmrOW+S)DT-P>!O znn#GVz546U!kr^pBU|0lZ=7W8`mNrW?0+QOMvSj9r(QokH|4)VI5SL~lfg%rYd&7> z7Lz-qQJSef_0eX);=|M57f?mA&0xMosqP*so?ld@4{j_FxI9M%BEdEua2v{G&GFz 
zMA)zk=xEzrg0hYWU2>E}*|bvL9TK?$cvZ)|Iyo}LS9E5p#_MD)?|Ul@vARSs&yhVv zoWc*pSFn7yzuF)?$Wd(A17xt1-SFmZD8OXpR-%6Llvw*nI~3DQJQAVY(&d_`Za4-x z+T_nK+V5LtJz4a08jveg&Q+HZ7dz4R?&4{h zd>e!7(rkJ8xN0l`crLV3WrloJTdX7h(EfBMxn}5d!hnIpP}E9#Gsp4C0@X;1jQ6hh zk|)%T+M&o(#vO;CO&so!Z25we*P^C9YT#*x!J9cq`A<~Ftx^K206d^&m(%WO4S1~7 zaW&nJc@vI-H&{$X^Ja`HTX^M?&+xaG9`YG$n(s-%KcpF5dBl6~ci7*P5cI#L8B*4k z-)#QBat*x*anW8n1poI<;KxxeaoPH%v1xnhk#5}9K<|af{1x69@~;%$_Z#9Dwa%F7 z7<(^95*ZdGcq9TL?VuX5yrB!MN*?B0CpTj7ygD-OQjE?s-a@Q)cMf9+5k4XmBrto? zp@dG_SFP97u3wd}gr*d0xrO;0wp$CLLr7K5no8~EA11FJv4rbyl}YO9@@la$LYx^S z`hd{BUs?5T$_=BJ!wgKB%;adKT?EPNlP7A_w7EDi+N!2+oS`xHtFC63b-mhHDS(Hv z=6}x7b+foC-L9qSZ1#VdTDi}+ynC<8{Cy4$+h>Nv-{;TteLI2p-pHtDsrz%{Xmu^E z_5UdDzPIiFr+@KX$$X3Squ5I9-|Y z?rw1mf=W4C(kxK5C|aCC(f)LCS#5Q}T4sy5CMG~XA|)MbytodZzx&uqb2;d!9|xZ~sxj zU;Crf7)jGry7v}ZKX>pP7lrY9-|9iA_a;kzQ8fD)+z^V01%hfKrVlkMk{cJvtoANs z*UTQc6Fgi)hqtp2*#V(6{vv7>BGSB-RrXs-K>d&&l_>lp&k&*NjIh@O0q zncvWL??&~M`idIbBg(cgZhEItK?_6^-`%AJvek!lu{j9rZfZ|5(-BAuLwFYGgWU^c z3Dh^A*UR@j=pP>qI)bOU(NB}~&0w(Tal88icP@XcZcajH0)L&QxPS7QQFEkm<|*6J zF?fu*X*c2_+So?OmJ?_An`e8wa@?6n`@@ph(85Bw$3J>fTOKy&?)Swad)H*#cP;+4 z>HSyD^}n~i|7i>VrP2Vt6J5bT^e|wXEt?#%t7+qo2vG-)%BZbDjhl7042*rovdLtR zx2TeoRLAs?53jjvJl&N_A1 z^?MoapAWPCX{bN{{D-5hQlfta`0E+EKLZTEE8d?@)cr2_`?;cDqG#_1Ykxas^ts*Zw=o?>(TuP~edM5#?to|96z%o5p^j z5TgD>`MrhgcYxn%?_U7fI6nb?k>J0J{!WJd5*_&XFVWwrvELDXCvtuvh!g%k2meUx z{0{g#FY*gelIZ^e_=_d^9q@Oq;TND7@lU|tn1+939)1V?y}11gl=Iqd!acvHp1o{}h{~L_y#E1polj`_J?{#W2GD$EW`TD2!7G literal 0 HcmV?d00001 From f4b5b81617fe61f5e323d91c6a0f14d8c91539e8 Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sat, 28 Nov 2020 10:38:51 -0600 Subject: [PATCH 05/10] Updating article to fix code errors and add a pipeline example --- notebooks/Category-Encoding-Article.ipynb | 995 +++++++++++++--------- 1 file changed, 603 insertions(+), 392 deletions(-) diff --git a/notebooks/Category-Encoding-Article.ipynb b/notebooks/Category-Encoding-Article.ipynb index e54c5c8..25a601c 100644 --- 
a/notebooks/Category-Encoding-Article.ipynb +++ b/notebooks/Category-Encoding-Article.ipynb @@ -12,21 +12,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Import the pandas, scikit-learn, numpy and [category_encoder](http://contrib.scikit-learn.org/categorical-encoding/) libraries." + "Import the pandas, scikit-learn, numpy and [category_encoder](https://github.com/scikit-learn-contrib/category_encoders) libraries." ] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", - "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n", + "from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n", + "from sklearn.compose import make_column_transformer\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.model_selection import cross_val_score\n", "\n", "import category_encoders as ce" ] @@ -41,9 +43,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "headers = [\"symboling\", \"normalized_losses\", \"make\", \"fuel_type\", \"aspiration\", \"num_doors\", \"body_style\",\n", @@ -62,26 +62,35 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(\"http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", + "df = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data\",\n", " header=None, names=headers, na_values=\"?\" )" ] }, { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -243,12 +252,12 @@ "3 2 164.0 audi gas std four \n", "4 2 164.0 audi gas std four \n", "\n", - " body_style drive_wheels engine_location wheel_base ... engine_size \\\n", - "0 convertible rwd front 88.6 ... 130 \n", - "1 convertible rwd front 88.6 ... 130 \n", - "2 hatchback rwd front 94.5 ... 152 \n", - "3 sedan fwd front 99.8 ... 109 \n", - "4 sedan 4wd front 99.4 ... 136 \n", + " body_style drive_wheels engine_location wheel_base ... engine_size \\\n", + "0 convertible rwd front 88.6 ... 130 \n", + "1 convertible rwd front 88.6 ... 130 \n", + "2 hatchback rwd front 94.5 ... 152 \n", + "3 sedan fwd front 99.8 ... 109 \n", + "4 sedan 4wd front 99.4 ... 136 \n", "\n", " fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n", "0 mpfi 3.47 2.68 9.0 111.0 5000.0 21 \n", @@ -286,9 +295,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -341,9 +348,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "obj_df = df.select_dtypes(include=['object']).copy()" @@ -352,14 +357,25 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -481,14 +497,25 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -565,9 +592,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -596,9 +621,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "obj_df = obj_df.fillna({\"num_doors\": \"four\"})" @@ -607,14 +630,25 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -668,9 +702,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -680,8 +712,8 @@ "five 11\n", "eight 5\n", "two 4\n", - "twelve 1\n", "three 1\n", + "twelve 1\n", "Name: num_cylinders, dtype: int64" ] }, @@ -697,9 +729,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "cleanup_nums = {\"num_doors\": {\"four\": 4, \"two\": 2},\n", @@ -710,25 +740,34 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "obj_df.replace(cleanup_nums, inplace=True)" + "obj_df = obj_df.replace(cleanup_nums)" ] }, { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -850,9 +889,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -889,9 +926,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -916,9 +951,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "obj_df[\"body_style\"] = obj_df[\"body_style\"].astype('category')" @@ -927,9 +960,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -966,9 +997,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "obj_df[\"body_style_cat\"] = obj_df[\"body_style\"].cat.codes" @@ -977,14 +1006,25 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1105,9 +1145,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1145,14 +1183,25 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1185,9 +1234,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1201,9 +1250,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1217,9 +1266,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1233,9 +1282,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1249,9 +1298,9 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", "
4mpfi00.00.01.0001
14mpfi00.00.01.0001
26mpfi20.00.01.0001
34mpfi30.01.00.0010
45mpfi31.00.00.0100
\n", @@ -1266,18 +1315,18 @@ "4 audi gas std 4 sedan front \n", "\n", " engine_type num_cylinders fuel_system body_style_cat drive_wheels_4wd \\\n", - "0 dohc 4 mpfi 0 0.0 \n", - "1 dohc 4 mpfi 0 0.0 \n", - "2 ohcv 6 mpfi 2 0.0 \n", - "3 ohc 4 mpfi 3 0.0 \n", - "4 ohc 5 mpfi 3 1.0 \n", + "0 dohc 4 mpfi 0 0 \n", + "1 dohc 4 mpfi 0 0 \n", + "2 ohcv 6 mpfi 2 0 \n", + "3 ohc 4 mpfi 3 0 \n", + "4 ohc 5 mpfi 3 1 \n", "\n", " drive_wheels_fwd drive_wheels_rwd \n", - "0 0.0 1.0 \n", - "1 0.0 1.0 \n", - "2 0.0 1.0 \n", - "3 1.0 0.0 \n", - "4 0.0 0.0 " + "0 0 1 \n", + "1 0 1 \n", + "2 0 1 \n", + "3 1 0 \n", + "4 0 0 " ] }, "execution_count": 23, @@ -1299,14 +1348,25 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1342,14 +1402,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1362,14 +1422,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1382,14 +1442,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1402,14 +1462,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1422,14 +1482,14 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
4mpfi01.00.00.00.00.00.00.01.010000001
14mpfi01.00.00.00.00.00.00.01.010000001
26mpfi20.00.01.00.00.00.00.01.000100001
34mpfi30.00.00.01.00.00.01.00.000010010
45mpfi30.00.00.01.00.01.00.00.000010100
\n", @@ -1444,18 +1504,18 @@ "4 audi gas std 4 front ohc \n", "\n", " num_cylinders fuel_system body_style_cat body_convertible body_hardtop \\\n", - "0 4 mpfi 0 1.0 0.0 \n", - "1 4 mpfi 0 1.0 0.0 \n", - "2 6 mpfi 2 0.0 0.0 \n", - "3 4 mpfi 3 0.0 0.0 \n", - "4 5 mpfi 3 0.0 0.0 \n", + "0 4 mpfi 0 1 0 \n", + "1 4 mpfi 0 1 0 \n", + "2 6 mpfi 2 0 0 \n", + "3 4 mpfi 3 0 0 \n", + "4 5 mpfi 3 0 0 \n", "\n", " body_hatchback body_sedan body_wagon drive_4wd drive_fwd drive_rwd \n", - "0 0.0 0.0 0.0 0.0 0.0 1.0 \n", - "1 0.0 0.0 0.0 0.0 0.0 1.0 \n", - "2 1.0 0.0 0.0 0.0 0.0 1.0 \n", - "3 0.0 1.0 0.0 0.0 1.0 0.0 \n", - "4 0.0 1.0 0.0 1.0 0.0 0.0 " + "0 0 0 0 0 0 1 \n", + "1 0 0 0 0 0 1 \n", + "2 1 0 0 0 0 1 \n", + "3 0 1 0 0 1 0 \n", + "4 0 1 0 1 0 0 " ] }, "execution_count": 24, @@ -1478,9 +1538,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -1488,8 +1546,8 @@ "ohc 148\n", "ohcf 15\n", "ohcv 13\n", - "dohc 12\n", "l 12\n", + "dohc 12\n", "rotor 4\n", "dohcv 1\n", "Name: engine_type, dtype: int64" @@ -1514,9 +1572,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "obj_df[\"OHC_Code\"] = np.where(obj_df[\"engine_type\"].str.contains(\"ohc\"), 1, 0)" @@ -1525,14 +1581,25 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -1717,36 +1784,43 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "lb_make = LabelEncoder()" + "ord_enc = OrdinalEncoder()" ] }, { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "obj_df[\"make_code\"] = lb_make.fit_transform(obj_df[\"make\"])" + "obj_df[\"make_code\"] = ord_enc.fit_transform(obj_df[[\"make\"]])" ] }, { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", @@ -1759,57 +1833,57 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
0alfa-romero00.0
1alfa-romero00.0
2alfa-romero00.0
3audi11.0
4audi11.0
5audi11.0
6audi11.0
7audi11.0
8audi11.0
9audi11.0
10bmw22.0
\n", @@ -1817,17 +1891,17 @@ ], "text/plain": [ " make make_code\n", - "0 alfa-romero 0\n", - "1 alfa-romero 0\n", - "2 alfa-romero 0\n", - "3 audi 1\n", - "4 audi 1\n", - "5 audi 1\n", - "6 audi 1\n", - "7 audi 1\n", - "8 audi 1\n", - "9 audi 1\n", - "10 bmw 2" + "0 alfa-romero 0.0\n", + "1 alfa-romero 0.0\n", + "2 alfa-romero 0.0\n", + "3 audi 1.0\n", + "4 audi 1.0\n", + "5 audi 1.0\n", + "6 audi 1.0\n", + "7 audi 1.0\n", + "8 audi 1.0\n", + "9 audi 1.0\n", + "10 bmw 2.0" ] }, "execution_count": 30, @@ -1849,13 +1923,11 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "lb_style = LabelBinarizer()\n", - "lb_results = lb_style.fit_transform(obj_df[\"body_style\"])" + "oe_style = OneHotEncoder()\n", + "oe_results = oe_style.fit_transform(obj_df[[\"body_style\"]])" ] }, { @@ -1868,20 +1940,18 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[1, 0, 0, 0, 0],\n", - " [1, 0, 0, 0, 0],\n", - " [0, 0, 1, 0, 0],\n", - " ..., \n", - " [0, 0, 0, 1, 0],\n", - " [0, 0, 0, 1, 0],\n", - " [0, 0, 0, 1, 0]])" + "array([[1., 0., 0., 0., 0.],\n", + " [1., 0., 0., 0., 0.],\n", + " [0., 0., 1., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., 1., 0.],\n", + " [0., 0., 0., 1., 0.],\n", + " [0., 0., 0., 1., 0.]])" ] }, "execution_count": 32, @@ -1890,23 +1960,34 @@ } ], "source": [ - "lb_results" + "oe_results.toarray()" ] }, { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1918,55 +1999,55 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
convertiblehardtop
0100001.00.00.00.00.0
1100001.00.00.00.00.0
2001000.00.01.00.00.0
3000100.00.00.01.00.0
4000100.00.00.01.00.0
\n", "
" ], "text/plain": [ - " convertible hardtop hatchback sedan wagon\n", - "0 1 0 0 0 0\n", - "1 1 0 0 0 0\n", - "2 0 0 1 0 0\n", - "3 0 0 0 1 0\n", - "4 0 0 0 1 0" + " convertible hardtop hatchback sedan wagon\n", + "0 1.0 0.0 0.0 0.0 0.0\n", + "1 1.0 0.0 0.0 0.0 0.0\n", + "2 0.0 0.0 1.0 0.0 0.0\n", + "3 0.0 0.0 0.0 1.0 0.0\n", + "4 0.0 0.0 0.0 1.0 0.0" ] }, "execution_count": 33, @@ -1975,7 +2056,7 @@ } ], "source": [ - "pd.DataFrame(lb_results, columns=lb_style.classes_).head()" + "pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).head()" ] }, { @@ -1983,15 +2064,13 @@ "metadata": {}, "source": [ "### Advanced Encoding\n", - "[category_encoder](http://contrib.scikit-learn.org/categorical-encoding/) library" + "[category_encoder](https://github.com/scikit-learn-contrib/category_encoders) library" ] }, { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Get a new clean dataframe\n", @@ -2001,14 +2080,25 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", + "\n", "\n", " \n", " \n", @@ -2130,15 +2220,42 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n" + ] + }, { "data": { "text/plain": [ - "BackwardDifferenceEncoder(cols=['engine_type'], drop_invariant=False,\n", - " return_df=True, verbose=0)" + "BackwardDifferenceEncoder(cols=['engine_type'],\n", + " mapping=[{'col': 'engine_type',\n", + " 'mapping': engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", + " 1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", + " 2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n", + " 3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", + " 4 0.142857 0.285714 0.428571 -0.428571 -0.285714 \n", + " 5 0.142857 0.285714 0.428571 0.571429 -0.285714 \n", + " 6 0.142857 0.285714 0.428571 0.571429 0.714286 \n", + " 7 0.142857 0.285714 0.428571 0.571429 0.714286 \n", + "-1 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "-2 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "\n", + " engine_type_5 \n", + " 1 -0.142857 \n", + " 2 -0.142857 \n", + " 3 -0.142857 \n", + " 4 -0.142857 \n", + " 5 -0.142857 \n", + " 6 -0.142857 \n", + " 7 0.857143 \n", + "-1 0.000000 \n", + "-2 0.000000 }])" ] }, "execution_count": 36, @@ -2147,70 +2264,85 @@ } ], "source": [ - "encoder = ce.backward_difference.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n", + "# Specify the columns to encode then fit and transform\n", + "encoder = ce.BackwardDifferenceEncoder(cols=[\"engine_type\"])\n", "encoder.fit(obj_df, verbose=1)" ] }, { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false - }, + 
"metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2218,9 +2350,8 @@ " \n", " \n", " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2231,19 +2362,19 @@ "" ], "text/plain": [ - " col_engine_type_0 col_engine_type_1 col_engine_type_2 col_engine_type_3 \\\n", - "0 1.0 0.142857 0.285714 0.428571 \n", - "1 1.0 0.142857 0.285714 0.428571 \n", - "2 1.0 0.142857 0.285714 0.428571 \n", - "3 1.0 0.142857 -0.714286 -0.571429 \n", - "4 1.0 0.142857 -0.714286 -0.571429 \n", + " engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", + "0 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", + "1 -0.857143 -0.714286 -0.571429 -0.428571 -0.285714 \n", + "2 0.142857 -0.714286 -0.571429 -0.428571 -0.285714 \n", + "3 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", + "4 0.142857 0.285714 -0.571429 -0.428571 -0.285714 \n", "\n", - " col_engine_type_4 col_engine_type_5 col_engine_type_6 \n", - "0 0.571429 0.714286 -0.142857 \n", - "1 0.571429 0.714286 -0.142857 \n", - "2 0.571429 0.714286 0.857143 \n", - "3 -0.428571 -0.285714 -0.142857 \n", - "4 -0.428571 -0.285714 -0.142857 " + " engine_type_5 \n", + "0 -0.142857 \n", + "1 -0.142857 \n", + "2 -0.142857 \n", + "3 -0.142857 \n", + "4 -0.142857 " ] }, "execution_count": 37, @@ -2252,7 +2383,7 @@ } ], "source": [ - "encoder.transform(obj_df).iloc[:,0:7].head()" + "encoder.fit_transform(obj_df).iloc[:,8:14].head()" ] }, { @@ -2265,137 
+2396,217 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "PolynomialEncoder(cols=['engine_type'], drop_invariant=False, return_df=True,\n", - " verbose=0)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n", - "encoder.fit(obj_df, verbose=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false - }, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chris/miniconda3/envs/pbpcode/lib/python3.8/site-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n" + ] + }, { "data": { "text/html": [ "
\n", + "\n", "
col_engine_type_0col_engine_type_1col_engine_type_2col_engine_type_3col_engine_type_4col_engine_type_5col_engine_type_6engine_type_0engine_type_1engine_type_2engine_type_3engine_type_4engine_type_5
01.00.1428570.2857140.4285710.5714290.714286-0.857143-0.714286-0.571429-0.428571-0.285714-0.142857
11.00.1428570.2857140.4285710.5714290.714286-0.857143-0.714286-0.571429-0.428571-0.285714-0.142857
21.00.1428570.2857140.4285710.5714290.7142860.857143-0.714286-0.571429-0.428571-0.285714-0.142857
31.00.142857-0.7142860.285714-0.571429-0.428571-0.285714
41.00.142857-0.7142860.285714-0.571429-0.428571-0.285714
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
col_engine_type_0col_engine_type_1col_engine_type_2col_engine_type_3col_engine_type_4col_engine_type_5col_engine_type_6engine_type_0engine_type_1engine_type_2engine_type_3engine_type_4engine_type_5
01.0-5.669467e-015.455447e-01-4.082483e-01-0.5669470.545545-0.4082480.241747-1.091089e-01-0.1091090.032898
11.0-5.669467e-015.455447e-01-4.082483e-01-0.5669470.545545-0.4082480.241747-1.091089e-01-0.1091090.032898
21.03.779645e-013.970680e-17-4.082483e-01-0.3779640.0000000.408248-0.564076-4.364358e-010.436436-0.197386
31.01.347755e-17-4.364358e-011.528598e-170.4834948.990141e-18-0.657952-0.188982-0.3273270.4082480.080582-0.5455450.493464
41.01.347755e-17-4.364358e-011.528598e-170.4834948.990141e-18-0.657952-0.188982-0.3273270.4082480.080582-0.5455450.493464
\n", "
" ], "text/plain": [ - " col_engine_type_0 col_engine_type_1 col_engine_type_2 col_engine_type_3 \\\n", - "0 1.0 -5.669467e-01 5.455447e-01 -4.082483e-01 \n", - "1 1.0 -5.669467e-01 5.455447e-01 -4.082483e-01 \n", - "2 1.0 3.779645e-01 3.970680e-17 -4.082483e-01 \n", - "3 1.0 1.347755e-17 -4.364358e-01 1.528598e-17 \n", - "4 1.0 1.347755e-17 -4.364358e-01 1.528598e-17 \n", + " engine_type_0 engine_type_1 engine_type_2 engine_type_3 engine_type_4 \\\n", + "0 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n", + "1 -0.566947 0.545545 -0.408248 0.241747 -0.109109 \n", + "2 -0.377964 0.000000 0.408248 -0.564076 0.436436 \n", + "3 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n", + "4 -0.188982 -0.327327 0.408248 0.080582 -0.545545 \n", "\n", - " col_engine_type_4 col_engine_type_5 col_engine_type_6 \n", - "0 0.241747 -1.091089e-01 0.032898 \n", - "1 0.241747 -1.091089e-01 0.032898 \n", - "2 -0.564076 -4.364358e-01 -0.197386 \n", - "3 0.483494 8.990141e-18 -0.657952 \n", - "4 0.483494 8.990141e-18 -0.657952 " + " engine_type_5 \n", + "0 0.032898 \n", + "1 0.032898 \n", + "2 -0.197386 \n", + "3 0.493464 \n", + "4 0.493464 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoder = ce.polynomial.PolynomialEncoder(cols=[\"engine_type\"])\n", + "encoder.fit_transform(obj_df, verbose=1).iloc[:,8:14].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scikit-learn pipeline\n", + "Show an example of how to incorporate the encoding strategies into a scikit-learn pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# for the purposes of this analysis, only use a small subset of features\n", + "feature_cols = [\n", + " 'fuel_type', 'make', 'aspiration', 'highway_mpg', 'city_mpg',\n", + " 'curb_weight', 'drive_wheels'\n", + "]\n", + "\n", + "# Remove the empty price rows\n", + "df_ml = 
df.dropna(subset=['price'])\n", + "\n", + "X = df_ml[feature_cols]\n", + "y = df_ml['price']" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "column_trans = make_column_transformer((OneHotEncoder(handle_unknown='ignore'),\n", + " ['fuel_type', 'make', 'drive_wheels']),\n", + " (OrdinalEncoder(), ['aspiration']),\n", + " remainder='passthrough')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "linreg = LinearRegression()\n", + "pipe = make_pipeline(column_trans, linreg)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-4476.0937653 , -1014.54842052, -4227.68553953, -4936.79899194,\n", + " -1591.8291911 , -3716.06617255, -4293.79197464, -1390.00486495,\n", + " -1600.57946369, -2124.30041954])" ] }, - "execution_count": 39, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "encoder.transform(obj_df).iloc[:,0:7].head()" + "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error')" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "-2937.17" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the average of the errors after 10 iterations\n", + "cross_val_score(pipe, X, y, cv=10, scoring='neg_mean_absolute_error').mean().round(2)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [] } @@ -2416,9 +2627,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.8.5" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 4eb1fc51a84ee070fb89c6f006a9833baabb4224 Mon Sep 17 00:00:00 2001 From: Chris 
Moffitt Date: Sun, 29 Nov 2020 13:49:45 -0600 Subject: [PATCH 06/10] Cleaning up notebook --- notebooks/Common-Excel-Part-2.ipynb | 328 ++++++++++++++-------------- 1 file changed, 164 insertions(+), 164 deletions(-) diff --git a/notebooks/Common-Excel-Part-2.ipynb b/notebooks/Common-Excel-Part-2.ipynb index 8d3c24d..e6a5c34 100644 --- a/notebooks/Common-Excel-Part-2.ipynb +++ b/notebooks/Common-Excel-Part-2.ipynb @@ -55,7 +55,7 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_excel(\"../data/sample-salesv3.xlsx\")" + "df = pd.read_excel('https://github.com/chris1610/pbpython/blob/master/data/sample-salesv3.xlsx?raw=true')" ] }, { @@ -117,18 +117,18 @@ "data": { "text/html": [ "
\n", - "\n", "\n", " \n", @@ -284,18 +284,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -406,18 +406,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -528,18 +528,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -650,18 +650,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -774,18 +774,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -898,18 +898,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1034,18 +1034,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1157,18 +1157,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1279,18 +1279,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1401,18 +1401,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1523,18 +1523,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1638,18 +1638,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1762,18 +1762,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -1890,18 +1890,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2017,18 +2017,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2137,18 +2137,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2257,18 +2257,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2377,18 +2377,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2511,18 +2511,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2640,18 +2640,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2826,18 +2826,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -2948,18 +2948,18 @@ "data": { "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -3124,7 +3124,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [default]", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -3138,7 +3138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.8.5" } }, "nbformat": 4, From 3ec12bbfd45287310ba8bc8f2f72d2a52891eb06 Mon Sep 17 00:00:00 2001 From: marcopolo88 <14142694+marcopolo88@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:48:07 +0100 Subject: [PATCH 07/10] corrected commission distribution was supposed to be like this: """ Return the commission rate based on the table: 0-90% = 2% 91-99% = 3% >= 100 = 4% """ --- Monte_Carlo_Simulationv2.ipynb | 468 +++++++++++++++++++++++++++++++++ 1 file changed, 468 insertions(+) create mode 100644 Monte_Carlo_Simulationv2.ipynb diff --git a/Monte_Carlo_Simulationv2.ipynb b/Monte_Carlo_Simulationv2.ipynb new file mode 100644 index 0000000..6499910 --- /dev/null +++ b/Monte_Carlo_Simulationv2.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Monte Carlo Simulation with Python\n", + "\n", + "Notebook to accompany article on [Practical Business Python](https://pbpython.com/monte-carlo.html)\n", + "\n", + "Update to use numpy for faster loops based on comments [here](https://www.reddit.com/r/Python/comments/arxwkm/monte_carlo_simulation_with_python/)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sns.set_style('whitegrid')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the variables for the Percent to target based on historical results\n", + "avg = 1\n", + "std_dev = .1\n", + 
"num_reps = 500\n", + "num_simulations = 100000" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Show an example of calculating the percent to target\n", + "pct_to_target = np.random.normal(\n", + " avg,\n", + " std_dev,\n", + " size=(num_reps, num_simulations)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.79328531, 0.99211018, 1.14343423, ..., 0.83737887, 0.93507967,\n", + " 0.86079851],\n", + " [1.03126742, 1.04414961, 1.08119495, ..., 0.98607625, 1.01161899,\n", + " 0.96872644],\n", + " [1.08616345, 0.93970666, 1.07594111, ..., 0.94057821, 1.00399945,\n", + " 1.05325946],\n", + " ...,\n", + " [1.10388204, 0.90397305, 0.96005999, ..., 0.88810244, 1.18064642,\n", + " 0.94066897],\n", + " [1.07581302, 0.92552317, 1.08256074, ..., 0.91934988, 1.06668758,\n", + " 1.05969099],\n", + " [1.12755095, 0.95080038, 0.978849 , ..., 1.0094155 , 0.94359533,\n", + " 1.06332923]])" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pct_to_target[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# Another example for the sales target distribution\n", + "sales_target_values = [75_000, 100_000, 200_000, 300_000, 400_000, 500_000]\n", + "sales_target_prob = [.3, .3, .2, .1, .05, .05]\n", + "sales_target = np.random.choice(sales_target_values, p=sales_target_prob, \n", + " size=(num_reps, num_simulations))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 75000, 200000, 75000, ..., 75000, 100000, 200000],\n", + " [200000, 75000, 100000, ..., 200000, 100000, 100000],\n", + " [400000, 75000, 100000, ..., 500000, 200000, 75000],\n", + " ...,\n", + " [500000, 75000, 500000, ..., 75000, 75000, 75000],\n", 
+ " [ 75000, 100000, 75000, ..., 75000, 500000, 100000],\n", + " [100000, 75000, 75000, ..., 100000, 100000, 75000]])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sales_target[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "commission_percentages = np.take(\n", + " np.array([0.04, 0.03, 0.02]),\n", + " np.digitize(pct_to_target, bins=[.9, .99, 10])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 26992625\n", + "1 15075317\n", + "0 7932058\n", + "dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(np.digitize(pct_to_target, bins=[.9, .99, 10]).flatten()).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.02 26992625\n", + "0.03 15075317\n", + "0.04 7932058\n", + "dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# frequencies\n", + "pd.DataFrame(commission_percentages.flatten()).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.04, 0.02, 0.02, ..., 0.04, 0.03, 0.04],\n", + " [0.02, 0.02, 0.02, ..., 0.03, 0.02, 0.03],\n", + " [0.02, 0.03, 0.02, ..., 0.03, 0.02, 0.02],\n", + " ...,\n", + " [0.02, 0.03, 0.03, ..., 0.04, 0.02, 0.03],\n", + " [0.02, 0.03, 0.02, ..., 0.03, 0.02, 0.02],\n", + " [0.02, 0.03, 0.03, ..., 0.02, 0.03, 0.02]])" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "commission_percentages[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + 
"outputs": [], + "source": [ + "total_commissions = (commission_percentages * sales_target).sum(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "96546.42131435724" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_commissions.std()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_Commissions
02838250.0
12786750.0
22795500.0
33054750.0
42831750.0
\n", + "
" + ], + "text/plain": [ + " Total_Commissions\n", + "0 2838250.0\n", + "1 2786750.0\n", + "2 2795500.0\n", + "3 3054750.0\n", + "4 2831750.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Show how to create the dataframe\n", + "df = pd.DataFrame(data={'Total_Commissions': total_commissions})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAERCAYAAACHA/vpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAueElEQVR4nO3de1hU9aLG8e8wDEjMkJFlTyneMdONN7adEq9pWGreUUhKLdvaUbemblDxlve87LO1sDTr9GiCpGbu7OZtaxZ5klKLUIssRU3xspNBmUFY548OcyQVlikw6vt5np6HWfObtd65NK/rNstiGIaBiIiICT4VHUBERG4cKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaUq4KCgp488036dmzJ926dePxxx9n7ty5uN3ucs/SrVs3zp49e9WP27x5M9OnTy+DRMXVr1+frl270q1bN5544gl69uzJ22+/7bk/KSmJJUuWlDiPd955p9hjLnbx49u3b88333xzVfkOHz7M8OHDATh+/Dj9+vW7qsfLjcm3ogPIrWXKlCn8+uuvvPXWWzgcDs6dO8eYMWOYMGECc+fOLdcs77333h963COPPMIjjzxyndNc3ltvvUVwcDAAp0+fZsiQIbhcLgYNGkR0dHSpj09LS6NevXqXvc/M40ty9OhRDh48CEDVqlVJTk6+pvnJjcGik/ukvGRlZdGlSxd27NiB3W73TM/Ozuarr74iMjKSnJwcpk6dyr59+7BYLLRq1YoXXngBX19f/vSnPzFw4EA+//xzzp07x7Bhw/joo484cOAAd999N6+++iq33Xab6XH169cnNTWVgoIC4uLiOHPmDABt2rRh5MiRZGdnX3b62rVr+fjjj3nttdf45ZdfmDJlCkeOHMEwDLp3786zzz5LVlYWAwYMoE2bNuzZs4ezZ88yduxYOnbsSGZmJhMmTMDtdmMYBr179+bJJ5+85PUqyldUGgBfffUVI0aM4NNPP+Xll1/mzJkzTJo0iZUrV5KcnIzNZsPf358XX3yRgwcPMmHCBPz9/RkyZAinT59m9+7dnDhxgvr161OjRg3P49u3b8+DDz7Ivn37cLvdDBw4kN69e7Nz506mTZvG+++/D+C5/d5779GpUyeOHz/On//8Z6ZOnUrXrl35+uuvyc/PZ/bs2aSmpmK1WgkLC2PcuHHY7Xbat29Pjx49SE1N5dixY3Tr1o2RI0eW4adOrjdtnpJyk56eTt26dYsVBsBdd91FZGQkANOnT6dy5cr885//ZM2aNezfv5833ngDALfbTZUqVVi9ejXdu3cnISGBCRMm8MEHH+B0Otm8efNVjSuSkpJCtWrVePfdd3n77bf5
+eefycnJueL0i40ZM4YHH3yQf/7znyQlJbF+/Xo2bNgA/Lb5JiIigtWrVzN69GhmzpwJwLJly2jfvj1r165lyZIl7Nq1i8LCQlOv4f333092dranyOC3TX4zZ87k9ddfZ82aNURFRZGWlkbHjh1p3749AwYM8JTSkSNHePfdd5k3b94l8/b39+fdd9/ljTfeYMGCBXz//fdXzGG1Wpk+fTohISEsW7as2H2LFy/mxIkTvPfee7z33nsUFhby0ksvee4/d+6cp+TeeOMNDh8+bOq5i3dQaUi58fHxKfXLcfv27fTv3x+LxYKfnx/9+vVj+/btnvuLyiUkJITQ0FCqVq2Kj48P1apV49dff73qcQCtWrXik08+YfDgwaxatYrRo0fjcDiuOL3IuXPn+OqrrzxfyA6Hg549e3ry2mw22rRpA8ADDzzAv//9bwA6duzI66+/zrBhw/jkk09ISEjAx8fc/4oWiwX47Qu+iNVqpVOnTvTr148XX3yRoKAgevfufdnHN2nSBF/fy2+VLtonUbVqVVq2bElqaqqpTL+3fft2+vXrh81mw8fHh9jYWD799FPP/UWb9qpWrcqdd955yfsh3k2lIeUmLCyMH3/8EafTWWz68ePHee6558jLy6OwsNDzxQhQWFjIhQsXPLdtNttl//49s+OKcm3evJm+ffty5MgR+vTpw7fffnvF6Rdn+/3W3YvzFn1pAsWeU7t27fj444957LHHyMjIoGvXrvzyyy8lZizyzTffUK1aNQIDA4tNnzdvHq+++iohISEsWbKEF1544bKPv+22264474uLq7CwEF9fXywWS7HnmJ+fX2rGy72HFz/u4sL7/fzF+6k0pNxUrVqVrl27Mn78eE9xOJ1OpkyZQuXKlalUqRIRERGsWLECwzBwu92kpKTw8MMPl2muefPmkZiYSIcOHZgwYQJ169bl+++/v+L0Ina7ncaNG3uOTsrJyWHdunWl5h09ejQffPABnTt3ZvLkydjtdg4dOlRqzuPHjzNv3jwGDRpUbPrp06dp06YNlStXZsCAAYwcOdJzJJTVai1WuiV59913gd92cKempvLQQw8RHBzM0aNHOXXqFIZheDa9Fc37ciXSqlUrkpKSyM/Pp7CwkLfffpuWLVuayiDeT0dPSbmaPHkyiYmJ9OvXD6vVitvtpkOHDp5DNxMSEpg+fTpdu3YlPz+fVq1aMWTIkDLN9PTTTxMfH0+XLl3w8/Ojfv36dO7cmV9//fWy04t2CsNvhfPiiy+ydu1a3G43Xbt2pWfPnhw5cuSKy3v++eeZMGECq1atwmq10qFDB/785z9fMZuPjw9WqxWAXr16XbLTPDg4mKFDhzJgwAAqVark2d8A0Lp1a2bPnm3qdXC5XPTo0YP8/HwSEhKoVasW8Ntmq169enHXXXfRtm1bTyHVrVsXf39/evfuzd///nfPfIYOHcqcOXPo3r07Fy5cICwsjIkTJ5rKIN5PR0+JiIhp2jwlIiKmqTRERMQ0lYaIiJim0hAREdNu6qOndu/eXeyY8LLicrnKZTlXyxtzeWMmUK6r5Y25vDET3Ji5XC4XTZo0uex9N3Vp+Pv706BBgzJfTkZGRrks52p5Yy5vzATKdbW8MZc3ZoIbM1dGRsYVH6fNUyIiYppKQ0RETFNpiIiIaTf1Pg0R+WPy8/PJysoiLy/viveXtN27InhjJvDuXAcPHqRatWql/qjnxVQaInKJrKwsHA4HNWvWLPaLtUXOnz9PQEBABSS7Mm/MBN6b69y5c5w7d46srCzP74yZoc1TInKJvLw87rzzzssWhtwcLBYLd9555xXXJq9EpSEil6XCuPn9kfdYpSEipcrLLyh2+1o3t/x+fnLj0D4NESlVJZuVmvEbSh9o0k+zO1+3eUn5UmnILS0vv4BKNmuFnLFbtGy51OzZs0lPTyc7O5u8vDyqV6/OHXfcwcKFC4uN279/P2fPnr3iRax27txJcnJysYtE/d7333/P3Llz
OX/+POfOnaNNmzYMHz68TDfPjRo1ijlz5uDn52dq/Pbt2zl27Bh9+/Yts0xmqTTklna9/wV9NfSv7SuLj48HYO3atfz444+MGTPmsuM++eQTqlSpcsXSKM3Zs2d54YUXWLRoETVr1qSgoIC//vWvJCcnEx0d/Yfzl6akEruc1q1bl1GSq6fSEBGvl5+fz/jx4zl8+DAFBQUMHDiQ5s2b8+6772Kz2WjYsCE//fQT77zzjucx//jHP0qd7+bNm3nwwQepWbMm8Nt1z+fMmeM5b2H27NmkpaUB0KVLF8+lgX19fTl69Chut5vHH3+crVu3cuzYMRITEzl27BhLlizBZrPxyy+/0KtXL9LS0ti3bx9PPfUUMTExtG/fng8//JBt27axdOlSfH19ue+++3jppZf4+uuvmTNnDr6+vgQFBTFv3jw++eQTT3m+8cYbbNiwAV9fX8LDwxk7diyLFi0iKyuLU6dOcfToUcaNG0erVq34+9//zhdffEFhYSGdO3dmwIAB1/xeqDRExOutWrWKO+64g7lz5+J0OunZsyfJycn06NGDKlWqEBYWxrZt21iyZAkBAQFMmjSJHTt2ULVq1RLne+LECapXr15sWmBgIABbt24lKyuLlJQULly4QExMDP/xH/8BwH333cf06dOZNGkSWVlZLF26lIULF7JlyxYaNGjAL7/8wrp160hPT2fEiBFs2rSJ48ePM2zYMGJiYjzLev/99xkwYACdO3dm3bp1OJ1ONm3aRMeOHXnmmWfYsmULZ8+e9Yzfv38/H374IcnJyfj6+jJ8+HC2bt0KgJ+fH6+//jqfffYZb7zxBq1atWLdunWsWLGCqlWrsnbt2uvyXujoKRHxepmZmZ5NUHa7nTp16nD48OFiY4KDg4mLi2PcuHHs37+fCxculDrfe++9l19++aXYtMOHD/Pll1+SmZlJeHg4FosFm81G48aNyczMBOCBBx4AICgoiLp163r+drvdANSrVw+bzYbD4aBatWr4+flx++2343K5ii1r3LhxfPnll/Tv35+vvvoKHx8fhgwZwunTp3n66af56KOP8PX9/3/b//jjjzRu3BibzYbFYiE8PJzvv/8ewLNf7p577vHkWLBgAQsWLOCZZ54pVj7XQmsaIlKqvPyC67oP5moPAqhTpw67du2iY8eOOJ1ODhw4QLVq1bBYLBQWFpKTk8PixYvZtm0bAAMHDsQwjFLn265dO1577TWio6MJCQkhPz+f2bNn8/DDD1OnTh3Wrl3LgAEDyM/P5+uvv6ZHjx5A6ec3mN2JvmrVKoYPH86dd97JpEmT2LhxI7m5ufTo0YO4uDhee+01UlJSuPfeewGoXbs2b775JhcuXMBqtfLll1/SvXt39u3bd8ky3W43H330EQsWLMAwDDp37kznzp257777TGW7EpWGiJTq91/w1/rTGFd71FhUVBQTJ04kOjoal8vFsGHDuPPOO2nUqBEvvfQSderUoUmTJvTo0YPbbruNoKAgTpw4QbVq1Uqcr91uZ/bs2SQkJGAYBrm5ubRr146YmBgsFgv/8z//Q9++fcnPz6dTp040bNjwDz/nywkLC2PgwIFUrlyZwMBA2rZty6FDh4iPj+e2227DZrPx4osv8uWXXwJQv359HnvsMaKjoyksLKR58+Z06NCBffv2XTLvorWbbt26cfvtt9OyZUtP+VwLi2Gmjm9Q5XXxkxvxIisVxRszefPRUxX1epW2XG/8PSVvzATen+ty73VJ77/WNETkpjdlyhTP/oiLLV26lEqVKlVAohuXSkNELsswjJvm96emTJlS0RG80h/Z0KSjp0TkEpUqVeLUqVN/6EtFbgyGYXDq1KmrXtPSmoaIXKJatWpkZWWRnZ192fvz8/Ov6sI95cEbM4F35yo6JPhqlElpFBQUkJCQwMGDB7FarcyaNQvDMIiPj8disVCvXj0mT56Mj48PKSkpnhNVhg4dSrt27cjLy2Ps2LGcOnWKwMBA5syZQ3Bw
MLt372bGjBlYrVYiIiIYNmxYWcQXueXZbLYSL8zjjQc0eGMm8O5cV3PxpSJlsnmq6AzF5ORkRowYwaxZs5g1axYjR45k5cqVGIbB5s2byc7OZvny5SQnJ7Ns2TIWLFiA2+0mKSmJ0NBQVq5cSffu3UlMTARg8uTJzJ8/n6SkJPbs2UN6enpZxBcRkSsokzWNDh060LZtWwCOHj1KlSpV+Ne//kWLFi2A335867PPPsPHx4emTZvi5+eHn58fISEh7Nu3j7S0NJ599lnP2MTERJxOJ263m5CQEAAiIiJITU0t8bhpl8tVLtfmzcvL88prAHtjLm/LVNH/AizttfC216uIN+byxkxw8+Uqs30avr6+xMXFsXHjRhYuXMjWrVs9R2IEBgaSk5OD0+nE4XB4HhMYGIjT6Sw2/eKxdru92Njf/4zA7/n7++s8DS/L5Y2ZKlJpr4W3vl7emMsbM8GNmaukMinTo6fmzJnDxx9/zMSJE4v95kpubi5BQUHY7XZyc3OLTXc4HMWmlzQ2KCioLOOLiMjvlElprFu3jtdeew347bKQFouFRo0asXPnTuC3C4qEh4cTFhZGWloaLpeLnJwcMjMzCQ0NpVmzZp7fkNm+fTvNmzfHbrdjs9k4dOgQhmGwY8cOwsPDyyK+iIhcQZlsnnr00UcZN24cTz75JBcuXGD8+PHUqVOHiRMnsmDBAmrXrk1kZCRWq5XY2FhiYmIwDINRo0bh7+9PdHQ0cXFxREdHY7PZmD9/PgBTp05lzJgxFBQUEBERQePGjcsivoiIXEGZlMZtt9122QugrFix4pJpUVFRREVFFZsWEBBwyWUdAZo0aUJKSsr1CyoiIldFZ4SLiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERM873eM8zPz2f8+PEcOXIEt9vN0KFDueeeexgyZAg1a9YEIDo6mscff5yUlBSSk5Px9fVl6NChtGvXjry8PMaOHcupU6cIDAxkzpw5BAcHs3v3bmbMmIHVaiUiIoJhw4Zd7+giIlKK614a69evp3LlysydO5czZ87Qo0cP/vM//5OBAwcyaNAgz7js7GyWL1/OmjVrcLlcxMTE0LJlS5KSkggNDWX48OFs2LCBxMREEhISmDx5MosWLaJ69eo899xzpKen07Bhw+sdX0RESnDdS6NTp05ERkZ6blutVr799lsOHjzI5s2bqVGjBuPHj2fv3r00bdoUPz8//Pz8CAkJYd++faSlpfHss88C0Lp1axITE3E6nbjdbkJCQgCIiIggNTW11NJwuVxkZGRc76d4iby8vHJZztXyxlzelqlBgwYVuvzSXgtve72KeGMub8wEN1+u614agYGBADidTkaMGMHIkSNxu9306dOHRo0asXjxYl555RXuv/9+HA5Hscc5nU6cTqdnemBgIDk5OTidTux2e7Gxhw8fLjWLv79/uXwpZGRkVPiXz+V4Yy5vzFSRSnstvPX18sZc3pgJbsxcJZVJmewIP3bsGE899RTdunWja9eudOzYkUaNGgHQsWNHvvvuO+x2O7m5uZ7H5Obm4nA4ik3Pzc0lKCjosmODgoLKIrqIiJTgupfGyZMnGTRoEGPHjqV3794APPPMM+zduxfAs1kpLCyMtLQ0XC4XOTk5ZGZmEhoaSrNmzdi2bRsA27dvp3nz5tjtdmw2G4cOHcIwDHbs2EF4ePj1ji4iIqW47punXn31Vc6ePUtiYiKJiYkAxMfHM3PmTGw2G1WqVGHatGnY7XZiY2OJiYnBMAxGjRqFv78/0dHRxMXFER0djc1mY/78+QBMnTqVMWPGUFBQQEREBI0bN77e0aUC5eUXUMlmregYIlKK614aCQkJJCQk
XDI9OTn5kmlRUVFERUUVmxYQEMDChQsvGdukSRNSUlKuX1DxKpVsVmrGbyj35f40u3O5L1PkRqaT+0RExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg2RCpKXX1DqmLK4TKiZ5YpcyXW/noaImKNriMiNSGsaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER0677yX35+fmMHz+eI0eO4Ha7GTp0KHXr1iU+Ph6LxUK9evWYPHkyPj4+pKSkkJycjK+vL0OHDqVdu3bk5eUxduxYTp06RWBgIHPmzCE4OJjdu3czY8YMrFYrERERDBs27HpHFxGRUpha0zh58qTpGa5fv57KlSuzcuVKli5dyrRp05g1axYjR45k5cqVGIbB5s2byc7OZvny5SQnJ7Ns2TIWLFiA2+0mKSmJ0NBQVq5cSffu3UlMTARg8uTJzJ8/n6SkJPbs2UN6evofe8YiIvKHmVrTGD58OMHBwfTu3Zs2bdrg43PlrunUqRORkZGe21arlfT0dFq0aAFA69at+eyzz/Dx8aFp06b4+fnh5+dHSEgI+/btIy0tjWeffdYzNjExEafTidvtJiQkBICIiAhSU1Np2LBhibldLhcZGRlmnuI1ycvLK5flXC1vzHWlTGXxG0tyZdf6ubiRPlsV7WbLZao0kpKSyMzMZPXq1SxevJiHHnqI3r17U7169UvGBgYGAuB0OhkxYgQjR45kzpw5WCwWz/05OTk4nU4cDkexxzmdzmLTLx5rt9uLjT18+HCpuf39/cvlyygjI8Mrv/S8MZc3ZroVXet74I3vozdmghszV0llYnpH+N1330316tWpVKkSBw4cYMaMGfzjH/+47Nhjx47x1FNP0a1bN7p27VpszSQ3N5egoCDsdju5ubnFpjscjmLTSxobFBRkNrqIiFwnpkrjr3/9K3379uXs2bPMnTuXxYsX8+qrr7Jt27ZLxp48eZJBgwYxduxYevfuDcADDzzAzp07Adi+fTvh4eGEhYWRlpaGy+UiJyeHzMxMQkNDadasmWe+27dvp3nz5tjtdmw2G4cOHcIwDHbs2EF4ePj1eg1ERMQkU5unoqKiaNKkCYGBgZw4ccIzPSkp6ZKxr776KmfPniUxMdGzE3vChAlMnz6dBQsWULt2bSIjI7FarcTGxhITE4NhGIwaNQp/f3+io6OJi4sjOjoam83G/PnzAZg6dSpjxoyhoKCAiIgIGjdufD2ev4iIXAVTpfH111/z6aefEh8fz/Tp02nUqBHPPfcc/v7+l4xNSEggISHhkukrVqy4ZFpUVBRRUVHFpgUEBLBw4cJLxjZp0oSUlBQzcUVEpIyY2jy1ZcsW4uPjAVi4cCFbtmwp01AiIuKdTJWGxWLB7XYDv528ZxhGmYYSERHvZGrzVL9+/ejatSuhoaH8+OOPnvMoRETk1mKqNPr06cMjjzzC4cOHqV69OsHBwWWdS0REvJCp0sjIyGDVqlW4XC7PtFmzZpVZKBER8U6mSiM+Pp7+/ftzzz33lHUeERHxYqZKo0qVKvTp06ess4iIiJczVRr33XcfS5YsoUGDBp7fkIqIiCjTYCIi4n1MlUZ+fj4HDx7k4MGDnmkqDRGRW4+p0pg1axYHDx7k0KFD1K9fn7vvvrusc4mIiBcyVRorVqxg48aN/Prrr/To0YOff/6ZSZMmlXU2ERHxMqbOCN+wYQP//d//jcPh4Omnn2bPnj1lnUtERLyQqdIo+tmQop3gfn5+ZZdIRES8lqnNU126dOHJJ5/k6NGjDB48mA4dOpR1LhER8UKmSqN///489NBDHDhwgFq1anH//feXdS4REfFCpkrj5Zdf9vydmZnJpk2bGDZsWJmFEhER72T6jHD4bd/Gd999R2FhYZmGEhER72T6p9Evpp9GFxG5NZkq
jYvPBM/OzubYsWNlFkhERLyXqdK4+EQ+f39//va3v5VZIBER8V6mSmP58uVlnUNERG4ApkrjiSeeIDc3F39/f8+FmAzDwGKxsHnz5jINKCIi3sNUaTRt2pTu3bvTtGlT9u/fz7Jly5g+fXpZZxMRES9jqjQyMzNp2rQpAPXr1+fYsWP6KRERkVuQqd+ecjgc/Nd//Rdbtmxh7ty53HvvvaU+Zs+ePcTGxgKQnp5Oq1atiI2NJTY2lg8++ACAlJQUevbsSVRUFFu3bgUgLy+P4cOHExMTw+DBgzl9+jQAu3fvpk+fPvTr16/YyYYiIlJ+TK1pzJ8/n5UrV/Lpp59Sv359Ro0aVeL4pUuXsn79egICAgD47rvvGDhwIIMGDfKMyc7OZvny5axZswaXy0VMTAwtW7YkKSmJ0NBQhg8fzoYNG0hMTCQhIYHJkyezaNEiqlevznPPPUd6ejoNGza8hqcuIiJXy9Sahr+/P7fffjt33HEHtWrV4uzZsyWODwkJYdGiRZ7b3377Lf/617948sknGT9+PE6nk71799K0aVP8/PxwOByEhISwb98+0tLSaNWqFQCtW7cmNTUVp9OJ2+0mJCQEi8VCREQEqamp1/C0RUTkjzB9nsbdd9/N559/TqNGjYiLi2Pp0qVXHB8ZGUlWVpbndlhYGH369KFRo0YsXryYV155hfvvvx+Hw+EZExgYiNPpxOl0eqYHBgaSk5OD0+nEbrcXG3v48OFSc7tcLjIyMsw8xWuSl5dXLsu5Wt6Y60qZGjRoUAFpbl3X+rm4kT5bFe1my2WqNA4dOsSMGTPYtWsX7du3Z8mSJVe1kI4dOxIUFOT5e9q0aYSHh5Obm+sZk5ubi8PhwG63e6bn5uYSFBRUbNrF00vj7+9fLl9GGRkZXvml5425vDHTreha3wNvfB+9MRPcmLlKKhNTm6cKCgo4ffo0FosFp9OJj4+ph3k888wz7N27F4DU1FQaNmxIWFgYaWlpuFwucnJyyMzMJDQ0lGbNmrFt2zYAtm/fTvPmzbHb7dhsNg4dOoRhGOzYsYPw8PCryiAiItfO1JrGqFGjiI6OJjs7m759+zJhwoSrWsiUKVOYNm0aNpuNKlWqMG3aNOx2O7GxscTExGAYBqNGjcLf35/o6Gji4uKIjo7GZrMxf/58AKZOncqYMWMoKCggIiKCxo0bX/2zFRGRa2KqNI4dO8bHH3/M6dOnueOOOzyXfS1JtWrVSElJAaBhw4YkJydfMiYqKoqoqKhi0wICAli4cOElY5s0aeKZn4iIVAxT25mKvqyDg4NNFYaIiNycTK1puN1uunfvTq1atTz7M4o2G4mIyK2jxNJITEzk+eefZ8yYMRw/fpyqVauWVy4REfFCJW6e+uKLLwBo0aIF77zzDi1atPD8JyIit54SS8MwjMv+LSIit6YSS+Pind7aAS4iIiXu00hPT6dfv34YhsEPP/zg+dtisVz2EFoREbm5lVga69evL68cIiJyAyixNO67777yyiEiIjeAq/sRKRERuaWpNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpZVYae/bsITY2FoCff/6Z6OhoYmJimDx5MoWFhQCkpKTQs2dPoqKi2Lp1KwB5eXkMHz6cmJgYBg8ezOnTpwHYvXs3ffr0oV+/frz88stlFVtEREpQJqWxdOlSEhIScLlcAMyaNYuRI0eycuVKDMNg8+bNZGdns3z5cpKTk1m2bBkLFizA7XaTlJREaGgoK1eupHv37iQmJgIwefJk5s+fT1JSEnv27CE9Pb0soouISAnKpDRCQkJYtGiR53Z6ejotWrQAoHXr1nz++efs3buXpk2b4ufnh8PhICQkhH379pGWlkarVq08Y1NTU3E6nbjdbkJCQrBYLERERJCamloW0UVEpAQlXu71j4qMjCQrK8tz
2zAMLBYLAIGBgeTk5OB0OnE4HJ4xgYGBOJ3OYtMvHmu324uNPXz4cKk5XC4XGRkZ1+tpXVFeXl65LOdqeWOuK2Vq0KBBBaS5dV3r5+JG+mxVtJstV5mUxu/5+Pz/Ck1ubi5BQUHY7XZyc3OLTXc4HMWmlzQ2KCio1OX6+/uXy5dRRkaGV37peWMub8x0K7rW98Ab30dvzAQ3Zq6SyqRcjp564IEH2LlzJwDbt28nPDycsLAw0tLScLlc5OTkkJmZSWhoKM2aNWPbtm2esc2bN8dut2Oz2Th06BCGYbBjxw7Cw8PLI7qIiFykXNY04uLimDhxIgsWLKB27dpERkZitVqJjY0lJiYGwzAYNWoU/v7+REdHExcXR3R0NDabjfnz5wMwdepUxowZQ0FBARERETRu3Lg8oouIyEXKrDSqVatGSkoKALVq1WLFihWXjImKiiIqKqrYtICAABYuXHjJ2CZNmnjmJ2UjL7+ASjZrmS7DG1fTRcS8clnTkBtDJZuVmvEbKmTZP83uXCHLFZGrozPCRUTENJWGiIiYptIQERHTVBoiImKaSkNERExTaYjcYvLyC655Hn/00OnrsWypWDrkVuQWo0Or5VpoTUNERExTaYiIiGkqDRERMU2lISIipqk0RETENJWGiIiYptIQERHTVBoiImKaSkNERExTaYiIiGkqDRERMU2lISIipqk0RETENJWGiIiYptIQERHTVBoiImJauV6EqXv37jgcDgCqVavGkCFDiI+Px2KxUK9ePSZPnoyPjw8pKSkkJyfj6+vL0KFDadeuHXl5eYwdO5ZTp04RGBjInDlzCA4OLs/4IiK3vHIrDZfLBcDy5cs904YMGcLIkSN58MEHmTRpEps3b6ZJkyYsX76cNWvW4HK5iImJoWXLliQlJREaGsrw4cPZsGEDiYmJJCQklFd8ERGhHDdP7du3j/PnzzNo0CCeeuopdu/eTXp6Oi1atACgdevWfP755+zdu5emTZvi5+eHw+EgJCSEffv2kZaWRqtWrTxjU1NTyyu6iIj8n3Jb06hUqRLPPPMMffr04aeffmLw4MEYhoHFYgEgMDCQnJwcnE6nZxNW0XSn01lsetHY0rhcLjIyMsrmCV0kLy+vXJZzta42V4MGDcowjchvyur/lZvl/8Py8kdzlVtp1KpVixo1amCxWKhVqxaVK1cmPT3dc39ubi5BQUHY7XZyc3OLTXc4HMWmF40tjb+/f7l8EWZkZHjlF6635pJbW1l9Jr31834j5iqpTMpt89Tq1auZPXs2AMePH8fpdNKyZUt27twJwPbt2wkPDycsLIy0tDRcLhc5OTlkZmYSGhpKs2bN2LZtm2ds8+bNyyu6iIj8n3Jb0+jduzfjxo0jOjoai8XCzJkzueOOO5g4cSILFiygdu3aREZGYrVaiY2NJSYmBsMwGDVqFP7+/kRHRxMXF0d0dDQ2m4358+eXV3QREfk/5VYafn5+l/2iX7FixSXToqKiiIqKKjYtICCAhQsXllk+EREpnU7uExER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbSEBER01QaIiJimkpDRERMU2mIiIhpKg0RETFNpSEiIqapNERExDSVhoiImKbS8EJ5+QXXZT7eeF1iubVdr8/25ZT0eS/L5d5qyu3KfWJeJZuVmvEbyn25P83uXO7LlFuLPts3Pq1piIiIaSoNERExTaUhIiKmqTRERMQ0lYaIiJim0hAREdNuqENuCwsLmTJlCvv378fPz4/p06dTo0aNio4lInLLuKHWNDZt2oTb7WbVqlWMHj2a2bNnV3QkEZFbyg1VGmlpabRq1QqAJk2a8O2335bp8syeRaozr0W8W0WdEX4znoluMQzDqOgQZk2YMIFHH32UNm3aANC2bVs2bdqEr+/lt7Lt3r0bf3//8owoInLDc7lcNGnS5LL33VD7NOx2O7m5uZ7bhYWFVywM4IpPWkRE/pgb
avNUs2bN2L59O/DbWkRoaGgFJxIRubXcUJunio6eOnDgAIZhMHPmTOrUqVPRsUREbhk3VGmIiEjFuqE2T4mISMVSaYiIiGkqDRERMe2GOuS2IuXn5zN+/HiOHDmC2+1m6NChPPLII57733zzTVavXk1wcDAAU6dOpXbt2hWea+/evcyePRvDMLjrrruYO3duuZy7UlKu7OxsXnjhBc/YjIwMRo8eTXR0dIVlAli/fj1vvvkmPj4+9OrVi5iYmDLNYzbXunXrWLZsGQ6Hgx49etCnT59yyVVQUEBCQgIHDx7EarUya9YsQkJCPPdv2bKFV155BV9fX3r16kVUVJRX5AI4f/48AwcOZMaMGeV2sExpud5//33eeustrFYroaGhTJkyBR+fsv13e2mZPv74Y5YsWYLFYqFv377mPluGmLJ69Wpj+vTphmEYxunTp402bdoUu3/06NHGN99841W5CgsLjSeeeML46aefDMMwjJSUFCMzM7PCc13sq6++MmJjY40LFy5UeKaWLVsaZ86cMVwul9GhQwfj3//+d5lnKi3XqVOnjLZt2xpnzpwxCgoKjNjYWOPw4cPlkmvjxo1GfHy8YRiG8cUXXxhDhgzx3Od2uz2vkcvlMnr27GmcOHGiwnMZhmHs3bvX6NGjh/Hwww8bP/zwQ7lkKi3X+fPnjUceecQ4d+6cYRiGMWrUKGPTpk0VmunChQtGx44djbNnzxoXLlwwHn30UePUqVOlzlNrGiZ16tSJyMhIz22r1Vrs/vT0dJYsWUJ2djZt27blL3/5S4XnOnjwIJUrV+att97iwIEDtGnTplzWfkrLVcQwDKZNm8a8efMue395Z6pfvz45OTn4+vpiGAYWi6XMM5WWKysri/vvv5/KlSsD8Kc//Yk9e/ZQrVq1Ms/VoUMH2rZtC8DRo0epUqWK577MzExCQkK4/fbbAWjevDm7du3iscceq9BcAG63m1deeYW//e1vZZ7FbC4/Pz+Sk5MJCAgA4MKFC+Wyxl9SJqvVygcffICvry+nTp0CIDAwsNR5qjRMKnoxnU4nI0aMYOTIkcXu79y5MzExMdjtdoYNG8bWrVtp165dheY6c+YMX3/9NRMnTqRGjRoMGTKERo0a8dBDD1VoriJbtmyhXr165VZkpWWqV68evXr1IiAggI4dOxIUFFThuWrUqMEPP/zAyZMnCQwMJDU1lZo1a5ZLLgBfX1/i4uLYuHEjCxcu9Ex3Op04HA7P7cDAQJxOZ4Xngt8KrKJcKZePj4/nC3v58uWcO3eOli1bVmimovs++eQTXnzxRdq0aVPiL2x4lNFa0U3p6NGjRo8ePYx33nmn2PTCwkLj7NmzntsrVqwwXn755QrP9cMPPxhdunTx3H7zzTeNJUuWVHiuIiNGjDB27dpVbnlKypSRkWFERkZ6VtVHjRplfPDBBxWeyzAMY/PmzUa/fv2MkSNHGgkJCcbGjRvLLVeREydOGG3btjVyc3MNw/jt9Xr22Wc998+YMcP48MMPKzzXxfr371+um6cudrlcBQUFxuzZs42//OUvns1UFZ3p4mxjx441Vq9eXep8dPSUSSdPnmTQoEGMHTuW3r17F7vP6XTSpUsXcnNzMQyDnTt30qhRowrPVb16dXJzc/n5558B2LVrF/Xq1avwXEXS09Np1qxZueQpLZPD4aBSpUr4+/tjtVoJDg7m7NmzFZ7rwoUL7Nmzh7fffps5c+bw448/lttrtm7dOl577TUAAgICsFgsnk1nderU4eeff+bf//43brebXbt20bRp0wrPVZFKyzVp0iRcLheJiYmezVQVmcnpdNK/f3/cbjc+Pj4EBASY2jGvM8JNmj59Oh9++GGxTSl9+vTh/Pnz9O3bl3Xr1rF8+XL8/Px46KGHGDFihFfkSk1NZf78+RiGQdOmTUlISPCKXKdPn2bgwIG899575ZLHTKakpCTWrFmDzWYjJCSEadOm4efnV+G5Xn75ZTZt2oS/vz8DBw6k
U6dOZZ4J4Ny5c4wbN46TJ09y4cIFBg8ezPnz5zl37hx9+/b1HD1lGAa9evXiySef9IpcRWJjY5kyZUq5HT1VUq5GjRrRq1cvwsPDPfvKnnrqKTp27Fhhmfr27cuqVatYvXo1vr6+1K9fn4kTJ5ZawCoNERExTZunRETENJWGiIiYptIQERHTVBoiImKaSkNE5Ca0Z88eYmNjSxyzdu1a+vTpQ8+ePXnllVdMzVdnhIuI3GSWLl3K+vXrSzwf5NChQyQlJXlOFVi4cCH5+fnYbLYS5601DRGRm0xISAiLFi3y3N6/fz+xsbHExsYyfPhwcnJy+Pzzz2nUqBFxcXH079+fZs2alVoYoDUNEZGbTmRkJFlZWZ7bEydOZObMmdStW5d33nmH119/nUqVKrFr1y6SkpJwuVxER0ezevXqUn9zTaUhInKTy8zMZOrUqcBv12+pVasWYWFhtGjRArvdjt1up06dOvz000+EhYWVOC+VhojITa5WrVrMmTOHe++9l7S0NLKzs6lVqxYrV67E5XJRUFDg+bn70qg0RERuclOmTCEuLo6CggIAZsyYQa1atejVqxfR0dEYhsHzzz/vuW5LSfTbUyIiYpqOnhIREdNUGiIiYppKQ0RETFNpiIiIaSoNERExTaUhIiKmqTRERMS0/wVf3ZYTm53/1wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot(kind='hist', title='Commissions Distribution')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_Commissions
count1.000000e+05
mean2.831683e+06
std9.654690e+04
min2.472750e+06
25%2.765750e+06
50%2.830250e+06
75%2.896500e+06
max3.278500e+06
\n", + "
" + ], + "text/plain": [ + " Total_Commissions\n", + "count 1.000000e+05\n", + "mean 2.831683e+06\n", + "std 9.654690e+04\n", + "min 2.472750e+06\n", + "25% 2.765750e+06\n", + "50% 2.830250e+06\n", + "75% 2.896500e+06\n", + "max 3.278500e+06" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3bce001c92bd184636f45f4a00538cb063831586 Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sun, 27 Jun 2021 11:40:34 -0500 Subject: [PATCH 08/10] Create FUNDING.yml Add a sponsorship link --- .github/FUNDING.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..89e4415 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: chris1610 From 0e7af2c36da8d5d518b8c1f0ce84458e38c5d345 Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sat, 14 Aug 2021 14:35:58 -0500 Subject: [PATCH 09/10] Adding missing file --- data/Example4.xlsx | Bin 0 -> 13509 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/Example4.xlsx diff --git a/data/Example4.xlsx b/data/Example4.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..307ffc300095a447ea6343084885347c18014fa4 GIT binary patch literal 13509 zcmeIZg;yM3@;*GcI|Kp*f)5ZRNN_@s;0#W1cXxM!J40}SySqz(;O_1o-2FFMc|W_^ 
zegA^*_RQ%%bLLj{?W%gJs&79nD+vRO3xEe80ssIqfZlPMi53(95Dg0e-~bSz)rG9B z91N`-v=v=#4DB@;T`Vo$X2C)OGXT(#@&DWY4_lxFJSfw}j45&>_9(Q)AhA>>fXII2 z*NaUj$KTN%_p?Y}J>AswB|YQ;Q!o?HjH42Jbjh9NY}lmA%CgQc_-C^`YM5_dhmsZ% z2UAb)FRFI@*RghLswWw^L;~+$Yimasr2rCb>$~LHgx5bwln|QWz2f)TOk3zx##;T^ zh+3ppz}2_dxhA8$g2g#dIElGAjx%LuO6#t5vB@9~8%FNF^vX#qSC;IH0kYP&Fl<~NiaO`gz=V5SM+7=*yKkRuOV+!woTjg)uaiV2T!BulQFO%( z1(}lls}!Fn&x9*6M;TQJk#KT}17Gna`KtYtR$bA%ErXgCR5&UPtf=S6IEWif@t*yb zw~zKQm`m!{L=s+~y`J%WnMa=m+0`APl7b1BX@y?89XGkFwt+>*g>Xtc&XWg&OlqGz zIUDvZ%cy8CRmPRKS`FLTV47<;2U1sY-)X;J7-_)@q(c2CNH*_-4#6-0fDWXTVL&Y7 zV!`BWZD+1$ZEgO$Zj~r$T4!_OdTN`#z?fgf`ys8Gv20M7&<&Rr$>d+i7ut2uDg&y2 zvK1EHow%csN#|wDd1Tf}cuP8Ae7&yL7RVwa0qQ1~EkQ|mQMi7dS zZ4O5zq9RXsI8Y%q$!_+8kc~WGgMq-K)rd>lb!E0Dh9WovFGir9UUgC0#PRu6Lvpb6 z>H$}|y3d!&(EfZVTvJC9Eq*qQXeGER=ecnw=r+d33Zw}3BUs7ux*}CZm)_ZP;*5xx zOZuldHfDaL&SOW^l|Ta`KYo_@E*Sdtwzzor4m<6Rin~Pah*I zVLCS+H5CfW2aH0cJh~jEoVYyADgUd6ye_;1LBIay?})GaY$@A!v zCVFzoQv7LRs`cdr^sUw5jBg&3sHd6|%@JEi%P}1Nhl00o&aoQe9|SJ?%pYG`y)tNi z^!uvFo1DI|Ji)5oN|BJ!MBnLYS0!LN@HSi9VQoC@&N=(PeXsACloMj*J}4YDHCA8s z?)qCKDP=zgCdCr-Zcf5B|1z>$q{S|Rq<7qMjo)yEaSuv_vHi=q(u*c*si%V9bf?qc zUY+~z7`w(HkSHoJFU#!%CcaH(1-VEMP6gC@H~fQE(syZUI{B3~Jp=Nu(00aqU6STMFp_ zGYnhRHL{RN^EK<4F~N8gAL58>*_-B~!qF4fJdJd0apj!sqEI_|v_+tctXx%A#=(R@ znC(sbNERz`x+iZJbsKY}PbYhR3bZ_Wd|s1pag;Q@Eakq8*TL49fB$P?#6_d6B;V4M z!KCa(FP;51zUs?vnbO=%hDpO;z_0>!;N!=Ujm()J`)&&FB+0iT-rm78_Lw(Ck`YVP z(xI-oOMP&9oJ)lg;ps}U+U%IxoU3)D88(Uxnl#uqU^0k*yy5`~t`e-|318bt6u{^a z1QiWmR|oZ9aaN9=h~EXx7GhUe)%$Od9<%=ZR4a5HPU{x-0)gj$0@#-yoNxj8 zrwLN!2my#t5CH$Z(ETfn|5@~)AU!A~4gBBUO5~*_yO}XtQJ#aCol~6fu;v|@$bTsv zVj%X{&@NJwv3gyu645njewmSEg0T*CJsIwGx?xA(fWf)yq%R1?hIPZSIN^b_9=;fc zLDV}cCGiu5!@>S_a9DhVf}U)T)5ss&_bwfmbLSRF^*Sg!k6ie&aUzh-Y=qg7#ms)} z%>X3n57un?+y^N9D)p_|5QzL>%{@AC#{T^ z&Qs?_a@qqd%EpFJie^$|Tu(1$VlJ#Q%h$2z4Z!===BStkpNt;PkE@zUG_^{!N$S3HFln!7J|BJ`ES}%)d9HA~YjIm>D_PZi zZd=iQK7Stdc6-ooez1ITY~|PB#9(@GPSC~(n&sf&cnB&#+?{@QduZ~q?b~#^Y2j{Y zKWOlA5@j%04f!7kf9U(sfe-y*|`Fz{3S>4mk?&hhQu$+X^ 
z>v+lA{HA?pPWsmQVBhtkV)3{>D#(2Ij#`U#_4(1rtJ;mzb>o3_E01vX*UT!%GyYs- z(aRi_(ow(4GMD*tMsq#57q7FS!=wU<6dcuW$RHKRh^*Dv?%UTq#MPOhIUlHME+CzK=X<~SxC zd7oJGzj!sTd~EQsw<LJ)>=qu?R6GrQT8 zHd5rbXv0BV*8PObbmYiqUR)Ot7Q$)!F-oIhiY>aYkj?31R7Bc$-w4egC(PI zosFwI#J;tiH{<*`a*GQ?H-b<5`SDiE?N)qol+>UvW0YkrZ# z1-)Lw_4Qwzn^pmZ-7txSe54*seItqy0qQ2@UX!GYoHc60GM6c&YFIqz5?M4~Qylg_ zoi@1Ya`KJLB&nbT9w4$7}K(6Wf!Em9}be~9vTzZ<#Bi%FgXPuhl4H)rkcgx5Ad`lq_@P?<+reg zP0HtrDqqr9X2(uz!j^Vt%Th4#>~p7*7<**pPsGx|(Rz4r@U6ofIo=LYc_=oieQv+2 z#Cy-`Aw}?6W&6^?m(pS8gV2gfLeeE73g4AY012Bo`juKulBwa|NoSaP)B!5xvKnn) zKuj*nIx2Kx4y%|BgAd)R!l>BA2hDGj3N1o>7$v^lj$X0&W6Ga&bOtku#3bDh8PjVL z;lY6#dxcEt0woH({d1<%OuDpX5k3;zcJT+Dz~CHPBMm(~XXXrDA@T8oH0&lKFO7KC-^47iM;NdOi)iP%@3sc85@^x3Nzo1k9Xc=~9c z&ex{0IX(24n@z#6K;r@^J7T`EjxfJe_!Ta`W4@h>C`5nV&=yAdt(7Jepr#}=*aVsH z3JAQ+FtsxoTPh9R%54~-2SN>_8g726Wx(EM4?Zs}LIX%pjAnbjM&=#svp4bD3Ng>N zAbNGjzuZvt<><~pjDqD4wtocPhH8=mU>!^@VFB*mh&pie@nKMRHxj$hUSO zkmuc_1=Nyj@SR3^a8GT_sCz&o+cD0LV+`}nOuns(xh^^o0T;!-mBJ2%gfk;t&7}dM z7?&zTX^c%b^(xfCHSJ(W?a)pjUxL@;v_Pw^;BNO7UxDjJwvayOzGHgBH*q=39c1A8 zj~~P~32h~hij+f#iixapvcxo5UlKtfz*7NX5#6%wymlZDPcGakosdFk{mzJnX3jvX zBgetzJ)CU%5qyE&r9ky^rId`_MJgkPb7Coqu%^{!1C&NajW*^_ zJ&<|F%UE{dsv)3|u*iw6lru=gIh#4_>LVX`ZlWD1h~c)z#fB|{0H1Z}u`O`V&WgKs zJ!T$C39)iuy4F6_KrbDEW(X_r&Cr53l$b(WDF5D%79> z^ZdPcXw95f98QO&rWrG8;bG@i985xmo9p?v?>^6J0k&QA#z*$)uSR)A##agy+fZ+G z4gzbG(~)RUeo`cfgnFjZIq4zE@_*|OP5T0k;lZphQdeMGds#a4#tEuaheg+*P^>I! zy`VOdN+2m0%FyAo<6Q`eN|DvR`uw|3cxf0FYRpxNIn%v`%DZ2<)%kBM9|m^UzN>~p zz0dq2QM~I$0=63dA^~bmymQycnQC=RNN^~{k&VoSl6b?*EGf@TIvz^B^ff*Swnmwa zI(%F)XR^0Y#@e9}M|Q{5M+RaCg{ggy^dSz`T`s7jVFG4O`;s=Xf+z*DQsY#(3`NH? 
z6YXg@zP+L_p(jFBm9uW!wjJ^eRr^X?)B^G(ewnYFlMc#x4|T2kv{xNac!u{H(j<~+ z7N_8W%SNxQJ*@?a1}ew}P_v=y_sN9=SI9yc$_Z~oVZFMF7*by}MkE-C3(dX4 z-A6W;jLWSt0h~kVvow$_v|3X~0NJNn5j9ALmJhv)aVkZ&17C5f5uv8xwxp%!>>GCg9ir`J& z<-15}&9@oSifSA|z9GVRg=ep(8p=kY23$P^=81W2NTSSABR}6Me*l@PrXI>ne8cbW zlaC>=R;c8i!NdCO+0^U2%%8G zl`A9}MQy?5$B~1~UmEV_3uz=Y80QYa6-gpSuoac~@Y%^t;&6rQ$zUTogIdH6V$t;+ z6$YOPq=>P&*rabAHsa(-&<3ZspHv6uuO*^%Mfp|)52N(ADY%7@DQ1b!YT^4# zqBpbi!X;fBNr!Kj;g_oqaZekZ_Y`2LJ4hZrt z8QnKvc6*zZ#lO3qZtX-KDpQL{CpDf<)1k66za}0da5J+!Mv)~C4jGslq|NGp!7^<1 zrCRFV7f52PMKr^XpP4LIv){wpzYs4PrD^g306|`_J!Rswe+E*w>rZPqwqGtgH@gYr z@7c6EIxs~T>b;SUEyNSmiIMr9!|j=Mi0H$jT{5|!YrY~>{)Ll-Zx-}ZY?BiXTeFbl zrgz{}F38>VoyWX{gBs;rYb_rpRIV??PFN^=pD=`jc1Z6rXJcr&D}V*;m{4sj7;mM* zC?QEu2p8ObMGP57fy|33cL&|X`6_v8t_^1@hnAs2TPaUD@l~Ot_3~NHG9gvfam~R*D^;w$sG$H}f#kf6ApfEdw{Qk;4Dk!7!%C~z=ffQ7TutQsP>87|p z{r&cXi-e&|APvgr^ibh@Tw(4W95IWH5E1S-wJ?Z3OahWT&0w3GC zZITEmnC1X}hS)tw+S+m(C6G-&g9nd6fMeLo_>{C?lCK&=c{B3{HYBqJOYkv+df^aFrO(p7U%;}tN|^8F1t#N}+F5%B=2s{4 zUQYE+s!AX826P!5H^WE}1zY>)UnFZRutX4mM3Oze6j7Q_6qZjd`KLIJ2D%WWo=Ve(ePwLrFhq&tlB~P z%9Ot+mMjo^H~|?KMSUv3_NI8}vTeV{tgP5^qFh>$uAiuyfR}y((sEKrkP80pmIbDX z`7q^Xh}3>a6X)}224Ig-)0HNF4=ErQ=l3syGzwXYaMzYs=(MdC;=6tGL8wlBXuytpM)N?8^KT}|_ z-HHvT-Ar&!%u7-K4BTrG%YrzMv2_auya#D~Q&;=uO`1}r!7dFda~(i<;=A@iYD@RAOdkdtPM#vB=i5EG@hLGn>Eb2rAZbmp&0?LM5 zZMAY1Asr#p!7n)A{`iRNrV|o zzr&L5#s+@|GD5O3g_*j9e48I(yR7)T4+{vUXn)Pum77F#Hg<}Ghr#Bu@(UD&^t)xK zI&-B9^8Mq|W|ypxQww&!%PEd?`QHWP*kephgNTADey5SHvgRAu+N;N0D`)xl8Oyw& z2e8;P(jlE5?P2LIB!N=wLnmX!OWZV>?#%Wd6QZ|~U-5&k8lTsGLd)iv34DTdEK28P zuVR(#f-nK5^Qcca4AEQhJ4f{XL z9B*E4ueJ9dT0h>X+_-Kh90eVPc?2$ar289vQU9}n&rdY{#JIY6;DIo;Xfe;guu z_Lyr6^mdH9%|Gh&RJggVFF&T3G{1R0Jzk4b6}5P@evAL|v^r^Ay!t}=-`3nZ@iE(E z5CDLmZ+~+H{}U-Iu*s5zm9 zi9WaH=ij7zxm`G%O|&Mui6)}_i2bJBCVS488a|guIr+`ooD<0A0TKg;V;Le@-J!gD zlwADoWrt&(HVc@K-&rc0o~Z78Z%$B%YFmRGyZt>z`1E>H4BSoI!m|>t+`0jupV#b1FC< z?1R=)e|zx5vEt~*BJr1>a4o#;=JQRYXN{DRN#&m_;gpH@mDEDjV|i5fz3zXrb64d7PaS3hk9b;ctqYg@U(d`B*ljDnc%g 
zG0XjfC#i(&bg1MRYlQTI2Y(p84OloIUa>6iDlnhs^Dv;$h0KH5E4Xm`=b#Cr+%I`9 z;rHxs+QJOBSnRwzMXAe}D23fZX)o^e-vvnSa0guxeriWX-9&#N(d_;ZD`~ek(HKU+ z8pG!_7a_}QzaSOc*X>1(P9_rL>@SY*5jD>v%qK)FTlIda9;xoflM#6X+?H7~m?rq% zaO6#8OL$T%DZYbWMVR3NZ{1w&a0n|8_eEj+LNkGaL-AkrJ&oPjMXjIC|`TwVb~uvT;K2!)p=GZ72VPz z(WTeep|?r5!X{~F;r5tgo7=??BM?=M6Wm@MduZCwds$ca>S(X<9ks!}_Aq<>_PYLh z7*o#Nk5@Mk5g8SK*)JIHSwlFjSk2-XoHCd40Rsm2vvMm=+*U--Xh;>!qw@fXGn73% zT$K9R1DHDnXt*ovTa9!>`p(?VBz>vFCbqXY-mbWls>HY*wUco43qSb3zw*fNg3nDlBVQB6u8(s z7g~)7p#+J%JvF|QmC~t0uRTR))dYCp{3L;{lo64fBY~VYx*)EWae&r*(wA|EFXOTY z37Kl2ZMTt`E%ESNwy)PjTDOBfEqn?zA_nj3RwS<=;k0gC|i(@TJf|Au^VtpIs zpfKNL>uTm`TGT3%!$1-4P#wWM!R&{>{*Gt(+k@)Qrlg=L7Eq&JBat2JK?DYtxd=H; zgD*9;!dN$t)m^O_Vm%!NoB18i9zoZv3w$HRlsbNvLW9$5@pP+OXWehF@vDM_!Q2{g zG6V4MV-nrL-3;0c-tlN`{A!!Y0`#OQwwMk)eJ(7W6sCd81aj#zq`vD4 zWj4=6J*#g@DCdcPw30?dwZd@ka$x0r`NBkx5E1 zbb<>UNx>Z^>y(vfkp6=`q-G+wG@2rgGV_({knxnwGLy5H8`+=pFX(I$@UtTvGZ_a%AJy}8P1Pb8& z1W_OOgHG5|oN5H(Y6}+)!;2TK1;Vn#90Qn8w;fZ_Up^D7uK24Hf%4!}g~ov+9mbkY zjGPAg5-6Rfvs548gss+LY> zUyW+{fUvDH*<-(ExW8(Zs-K{_Emw>a^PaX+prJ2JTge`BAlKT}t0S6G8G2=_79hk+ z@D$V&J!L$-rPtTlru8&v1KgN3*S-8#nuDOVi^nqL326$X1f%`E{MtLXS{T~@e$G{+ zq;8$UjO+Ei=0)c&jS$g599B_MAxA=;y)dmt1wl%08s8u_(qh4T^?h_e$>dD!f@H-g z*Yk*TnK+|5?HC0s+ALj9=hrAKg|Cc_3puwJp2x$?X77}!@YE`CPgMPxs_U&|e;&g% zFR~b62^3KGsGA?ckx5q?#(uzWGD2pH{}8Lzkc?(()@3r^ASxd{c9SnGUuF|k5#3`i zK*^5Lyg*-5Qn+o-*s$#Ug`i*0wpwmstQJsYi8&5k+c_0bz>YwZb{Rt2kCIKKIgipo z)l}rtnLd#g$CcTGgcPkEh}+$bF{)=hj1ohxH%H;T-K8pTYDlEGmsUlK?pq+)x$$Yg zoX(l9;8Z7`YMA3v3{DJMsw!tpkjNm@A-Yi}tH7YO2M^D~vVpa@U`uZ`Oo(Q$h;L3D zz0p~4!{yRK#>+_N!fgz_8OnyS8#tvEO*Mw|?RC8WE41AQPqg>6CadX01T12D@?qaL}-sqse-RWbGhlK-f#C^xrAD4+CYD{ z5hGGNX{gY6$`kWZt;?@^C_5}!)YTLsZWRDf74ON+J!G@=x{cR?ezsWe_T%8f3(_3%n*jyKf{e5S9q?+EP-SsJNQiQey&|^nfp2N;? 
zUfkT^=35Qv>}|)?UW(&0l$%nJU&iWEq(@7I221C3rg9K+M6N7l!!sG8T>&iDQ?b#< z|B3^)aN#C=2o8*p007*7;J`%J&d@-?!Oqmm_-}9s_%T>@>E@RW1dvJe} zPWr)3SLTtng0hPC$DX0dynT#e(>NN10A6n?Jt*QbRZ+YlvT<@6Cy~;17m6Vw_U0P|2m+M8e{iFUdN&-VdB5SZ&zH+Bpfp?#b$Ba3GH^PeT_7Q3`VP`2(CV?V#K!pdnxG zx6s5?(8Oqs0o-=${Isvpny){N5OYhwI`Hjc4}lR+eb8a-URA)5vysng;dD-Sh~sM@ zE25$)E`}C{2*2}=w(`{h$ex$^3ASV$>11N@N&I z%n%YxSPbsw%Rrvq_>DIv42^7O+et*9p!| zn`x6aG}Tz5LX93chD4m4aO2Fy0X9b|S0<`xgT{x8wF9R>AGLDe>i6EfO_KBzg0n3y zHQ*;cLMQr(VS&I&v4tghD#bi%gAzV0AZ(no1~cI?-vomgnF)_Hz>`W%`+VhAG4bS@ zsqk4w^)3!6ZN#?MqSM{hhE$WCASF*cSsb5Y3v_05bv~mMK|g&Q3PWe#Nn791QnxKQ zdnHt(cOAVs-k_;FoT~lR(wZInml?0U>InzXYJ4gUui&T5|UVtjn zXXCTBhkOb#j)9#IoG5IGuq0SnKFczbwLyogVxoKL^~Q%2LkmLV^$!aP=+OPeRW09a z-XgM3CG6t@^SYHlq-dqtZ*EU*ouyDqrt85i)q%I)?(?g;^ehT)y(-q?Q^P{{>vXiM zva=cEQ|rylBZAlw4u879Qmw)tkPKDSMA*KrJusd2&rL*e@OH6O$-1?~gin>Yq~^g1 zu7qEl3a>T}(UnBRyniNfIT{v^|Z)?FvI-R*;UHv2$5$c?v1S)@M6Fp@e#ONZgMvFanL~cG zDL!kK2Qq`@5Fz|-sjEBuB&r5IWNXLO=DK7)#s{blz*m9C};*qz5VYmjQrDG z|2+Pe*GFU}|6ReqQ``S3`1_au!Nk81-TxH)lkWSMXbXfx{|EW^PvL*38vZ2;0C*z* zP5A$Zc=)HDKlx{WX{tc||0eMtY_vbM{JEm`mln#m|8;%sPX&M86a1who91r{{<=f> zQ}oX^>Mzk^y1$G5*;M_h;os}$U-AIJ0iYEkq~7i;UL}x R0FWUcA4tJwV)^~+{{h-3vWfrz literal 0 HcmV?d00001 From 8a3206ccb06a058deda1fc27b207937e16e443de Mon Sep 17 00:00:00 2001 From: Chris Moffitt Date: Sun, 25 Sep 2022 12:29:16 -0500 Subject: [PATCH 10/10] Adding new data file for groupby warning article --- data/sales_9_2022.xlsx | Bin 0 -> 10186 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 data/sales_9_2022.xlsx diff --git a/data/sales_9_2022.xlsx b/data/sales_9_2022.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9b6845c47baa4dc2bfeeae85ff26451a144d1deb GIT binary patch literal 10186 zcmeHt1y`KO)^+0^+}#`3pn(Kv+}#}lH0~PQ-5r7?gkZtlJ-7yUcXt9WGxyFnGr9Bq zf_tlbRjpp#`>dy{&fcf$oKld6g2n>C0^k7v06D*ZzxVpd@inu8S2-^hV-QWRpp1vGN`K>m&a@ z40=Vu_U?q9R1;*&2C;Qp3d8LhYt$%25lM-@-Uw^x*9w9eN 
zPv0SR8!mplgO=t=4i@1%Hhg{ksP`FwRQtLvB~H+qkW>kQIW~@qpBnl~Zv%3X zRv~ZyLdU9{+A=!Nz}HE%jd9E=umzo`-o*x!ByY;%1k z?d*4cHWRG^nb7CX_r4TCi1c=7>C=07QKDOx<=n?yl59j``%7Y^&#YFwSSM_^KfiS zJK*>%dC!Oa2^Ii&eue@l{Ee11s;rckFTN)8!aAfETIxHPS~;>X|JMJHj{n6R{L5c2 ziI zdK0zZPkz15SrUPP#ZOu5S{j^kZ|@98Pvej*W?!<_jqWmiHGQ2XF69n%X^EjPt}o1$ z{=PyfHhBiBKpSONCqTgbOcabGkmje=E33I|bXx{7Ev$T08eCq-owJ`Xn&C5(T(F1o zHk4Q9a3T%6-_giyuFR|7iu~%1KvmhC*P`4g+kub5UEjp2{Q{KHhWXQrNiMTriGma3 zmVH#bk2>edN2{9iY%t5cn;W64xc~IKe*|ga@=JvJPm-J}lXMG9zzXBh)7!K(v!S~$vdV-kDiXa56=9>X7Gt%m)RrH6w^Q70f(pp7 zEu@t$lB0*!>4)WUeV*)Fx7Y9DF3g|+TD)8gpVca?2lMh!;>^56waXbB%tQS@yH##U zDJTtuD@{+;2E^KO2i9!kkmGzYINgfJ=G+pU8B?qX=zZ@?Z zPD)9*aiUbtJJMkxQO2~3H2He53gZb*^Ujm;84x_ql5ZHR_ALo7{ziz?c66AHWhAAf zz*T#Xz@+A0hCui0JSUh!RgGPAumZn5{@b2m*K3sV()j3U1LrrShLf$cgLh(h)VlFy zYK~~<0Mx9thqw(3d9>b7YAV&BaXMS5_=*dgRTMu=pUj zObiAc9-jNG0GjBi9?goKzRxLp*i)Oy&F_NV!EKfKvE)L#U5?>U^rG5YgmPq>93SXi4G(C1;3Y(q`s#_w6!3Q$E(Oeo~$;yCv|jYE;;xoP0fpj1##Yodic|=y6>#ffL6X-rKCz$%?f&GX#;u| z`LbV&`5hy!nS+flm=Fpy11IBx0+2-({VZi4f#sDzqzSSsVcJj+scU|@pHYa5_jBeC z%=OcCjKRg)qaTumt||mYbdp8ND=CDBVd%2gU|#1B21+PQ zagnV)WLvEVArL@_a`!cCt;1_dt~Vv8Y*Vtd?{Qw7LunE}od?t@ncv7O=ke`oES3D^ z`(fO=uw=_zN7%&oVaDr>aoT-F;HOV?9oh$3d8d&RIj2nT)@=QcyPiqIq{kP^S+p~f ze7s9O>>098X=XmC;*!^U<&*kRzt92;Ms}6?3a13YO&XrxRsaIqXTy*S2uZICy#Te)D_Z^vcUh|EJO$luRdf z!TuaCDmyD4~jo(?XpdvwC&g2=o!9HFq zZjO;9#5HcNeV;vjo2tdiSxki)tSRk z>nH=!Hpyu!WdBqk4dEL!B}Y_|l+T++rFTMMy(nmu;B7K*?e9PI%)?g#l!p#cOZ#fT z5EV%^PCF#waqD}NW#hrtmi-SP%HHws?U)UK!rP7@fCC zP8rX-cm(K_*{164xGJ};vAdT8i24QzuAS=)CH%|@BqnidZ@)7laDQh!H76-+0S-8_ zXbwA`u<40*gwsg!ggvcq<;5zjF*BHe($>V&x5;}<8g>bHzhH?!42VZtw}olh(Y^~b zn!4IiJNQs{iVzI>Zr)oc#ANZi*_q^&bMOV8)}t#6Br|+$ai`^TUhN+LeS}aDa4y$H z|LUwNwrRl6YEzzkvP^Yiy%{H2B*wq`ZJLWtnWb+#F5R3i*Djk2fqzm^aeKaC7WJ7I zY+`A-4JAKN?9m`gt*uv-672gf%>#aRgO!xVn=@(=&m{9G1ByKFqqKp~(z ziA#w>@%7L^Zz?$nJet}7yag0(UE3W4vz=xFWhLs`Wf+4W-vWs+im+WmW8cYP=JiDt z8WL_b$HDPu8Sfa>QX21TH8%o=T>N+A6x3$Pg&}!m`%TO)$!@wwHfMQxX6eA%49haO zzH?>hC$A97rOEQdf@Pz#0`6kz7Qo{TjQ|j_3YP>Z 
zCj>fhW$LosH4-L`OT4#(F+T(G-Vi{6&xYy&dPh?cZWW4oey=-6Bmf5|!0ue!L$nWx zwukUZ9QqQYxIdbQT+cRlMLBJQdqSw3M9T=NYrd3ry8$?xt)A8fqaedp$>Q-oz9gCI z0z)zsa%0TKQafUP4~VX{UCp|kF4&P{VnnR@#XD;zA*0g&yTpalq#7al7#sR@(GU#+ zEH&mHNUd0Z$GIxE#EX4Grz~oM6+bnU$`mBYs+G9T)&T_0@O);Bf*S~@qZHsKuB@@* zlL5uEf2G1p>@I<%9Q#(4r&Be7*kj)VG)5SB9VfQAxvH8>fC|ES{)*j`mZSxYW=aA} zNf^k07Lq0@dioE9!u{60`kAF$I2oE1GWEdu=~4x_G!&9}cns&#M#PLGJd=2p@*A|K zLFsYZj7$5KNf3!)5*V<(;W`N#aL2;g2s1%D>H-7dT$IC5`QZLe6&Whr7w`ni^XJ_` ze}|}xD8*4~Rkc=p3lhAo^77^jHgs#bSCLpy>esOeJhl(=Ev?-}C7c`+Z%Dw_5_%J9 zp^2Mr60EEw7|2*(%y5n?7w(=V|D~zF)4?jL1F@484dg?82&bWqE)*}!;t$6{(%knx z0d~?#rq=vfkKuCLdbvF#PC#^HPI+n#mrL%dU)iF2y~MxY?q@R^NOs}wJSYs;@eU{ zJtjvK14<#vc907hBl@@U#}?q*0pI>3xPQ|94qUWWxtDKx$$raD{jU2?=BC!BEWh`^ zOa7tea0DSYRvYg9TVzMq$JZM%)XS@*RtZZ~CRvF@HMNI|Y8>qG&4iec)Vx9C*DD!v|Pp@P*v}dj?)g$QkHH}WmrjRxu8UdKec@Ma+B@tap80}(UR&Ro{H2N zpRC`ia4wJ;Ih#r~ne1c91LX9Qmq?66H$||yLt=cCUij$kfNuRAzHb)l4rG;^Oe)E{ zK#$V{9~@WT{+`Egg(k93qLcC?P9G6*z;Y@M!)K*ZJj(c6J%1ww&kyY6l!quuQa+wv zo8{No;)GEmfh+1o71X&E8(o#cp+UP!$He6G=Z4vA{3Pq16E{E%3~%#0>YqEfbu2jJ zYBZoNSWOX|oM-k183QkkamVw$4)6pSe1MkcXMLzboxZDspIU9T6JH?AIsKCgYG?Tl zl5AoPQ55c{bDPvpR0naE$Mkip8ir(WWE1lb}*BJ zrLWyJBYmW&PQ~St}GtEuLk}cr*bh>vD(k9?@xqH{8U)fMYFKq)yt*m@J9i#vJ z^C#}Ie%sUO{u%e)6zy4OciZFLU`pHb8Q)d;`XV+<>xc96y#(Bq4_7<#F~su=M8v75 zRcew=!I#1(`Gr> zNP5;l6`l4?tO91yIMM~rm2b5ueem?uE?Z_P;i4g|DQqmx@|anw1aPy%k)yt@G#6EF zx{VT!5mvy{F`?X6AreVk+$0jZH6y^jE>j+Md5rcgAas&i#KC}CiMd+DD0HyXlq_)m ze0bUVTUVjww1AfhrQyqSuzH7n-Rd1QXI4B^;svq2CT@)|*<^R{=@6$WW1#|hgwtKz z8#4wU1Wv9qqJkL5oM~yKm;6JqpL9T3#adR! 
zi5asQT&Pf3Vrngz37b(pqhXb_kFEnGt`Lr}FfrO^4~cvkK+_$NUlqa)DWj!_dG=zv zU3`h-4|i2Zz3JQMhobkBdV=3fM=aAWw0f_uuY|y9&0c;3G$-e~+49X5TO8zx$Ad6E zO~S4ppqs}d;aCe76dP$9zz`!8x9P7_85a{NnKtZc-qNzq$BZFsiI^_6XNJpMskm{_ z`#YT$;wXlg7BheoC&EnEe(at2H*Z+t&4}Xv0Y0Z&kh$CrY9tj*@(;yU#SUAXHoRsGp?cKr77DX+hpO`2kovNZ(Cs_S) z*BJSSlOHsD8`DCj*yS6H8VDUY_M=eIEk!A4>-}hGl*hXHZSHEo@OAW*oR)W(yLeqQ zZmK~El_+r+~}V) zG*}qmqQiV60vz^-+59rfvE#PxT-OZ-%x9XzCwLR1DfuE4T=G&)vO8agl?{!{GmzrR z&!u;on3lJ{Os)r2lO5Q;x}@%#yj` z6~?r?RP)-@hX|++R;G_b?6R88a$Vt7Rf-GM9E|YD@>5*5*3M!)0%v8kZ~L=`w4C2D z9!O=~-VZD`PrrwA^?T!C=fm?wX;(qmHiZ&+KS9`^$Yl_*n4lR2ztYT0%k=C`X9c$; zHOrJpVAzJH5}=feURxaGf~Pk4vs`E2nI3i3AWl)|)#SlFgpT2usEft7z8U7@jV-3( zAoPd`8T}#Lnzj4&R-HL7`xA7stcRL`98GH%aM1f20U&gXT>jYM556G1QtcBIkEQZ) zqBo~4lqj#}r)?AtxDjg{>a^j()TZ9~nuTylQv8K=gg002<_6@DF^+^tL-b6t0~$paG`mVmOsA(c8$GK2GYe_HpN%Y zNBcr9DUjiphd@FwDZWw+ccSA<;6H3Tk&vogtkzD4^sI2BIyrv0C619}X$sEwEYhM8 z-45FE3n>Je_H@%+c(>6#ehtx*1&P*Uab8?kUol^tv`K5X>|EYQD@Bn7nsc1U9UKvD z)Hdg%Sd4BZBWis83=V}=wzg|hO;jnh2lGVx8~77;2JTP{Wt_7K5#%kO)OyopFpXOc z^GtdY4vNU~`m~@c#l40r)Wu7{a9#t67AlKi@IR2rcS%ueYaTd;MwseW#R~@VcX7xJ z7*mBlR!bQA2kZuVe#yq8>kVJ#p!}Hf#b^dA%0`jSRX89sglEW;smH(^$)2iUx+^UbGBmNg;^&g(jJLa3ry7rq!OB&%@A9VEQ0d0$dH+I?UBwE=& zl!(y9>{7)F?mWq#v4S)m%7@F|SijBl>@=j|WRv>-S(sn|$zTCrh+yA^5y8x|XQ(!v zGxB7~$%MvA^!kAS_R0{V0r^>-#L5DSr}s|iBm5;BraR*2lCKyrc!i`Zhjs6i+e;no zj#{5*TNF)5#lH7LECjkYCKvf@3zi{sCwf}-6zsUw-Mi4TDdZrEcIJa7ErBrE4Q<; zF+UOpQE3phceTEKEjkJ@f3vF6q|(AGyxA(kNe3$NMpeSAQ;uXjeK2ZC^#(4+mOj)gqx#uQWZrC*cv~0aV3x(N06N z79G1OMmiZZCG%8s*JzsQDGnt=youQmyR!J`%ipLuA3fWI)pOo+KF}CBkD#MH!&xqC z>J1UhCI6|VK4b&1iCy|V`XHmrXq&jNZ6f@Lz2%?n2_QkdV{+lA`pk?=rRK* zoyTiju43rn0DNXt+elC0L%>+z?Incilgy~*jlqIjtjdOzXK;73oBI@OHvja^HxG5g zeQ=FUU0{yn@yee0LM%AIb`;GZreZWt@{}F{s;wSg_-%IV*SA?ZRNirc<6sPyvUm!7 zL;396OgJ9oB~mCT!8FSs3M1V)(>Y^qI3UOieTh{e1h2=*tK5|8HBYkJrMkgpKBxLw4 zQbc1woy=Xa%|{&_rlUndnm3U31GDr#Yz)gbkYLDE$JbU;6KAkm?Dy53Z&kd#|CAVN;3=OZiaT5cvs zH{ZDo8H}t33xsI{k`jeyWH@Uz+#m*c(%lBTvS=D(Mp4*D>CVpp^UTFk!$@a6m!hP~ 
ztJiTlKVI&0p9)}UThpA*HAV$kU~uQkH*x8~T^rIIQ=q5T?J*qAt>}b{b@__&TM((% z0)5xuGuKqxe2}A`xXKRO<64u-VkC z-4<}Th`~gS7u;^-Kw&)}4vP2^%|A&D7`{Ik!Ow46!}=_)wt>}p{pxk)lKkpNFVH3t zDUt16jMLq`wrTcKM{ydD6Uk%+Nn+M)RHX6cPs?#oEjN=uHJyXIXM}BOGUy`usFG*MvTVn+WTRTSqgmwSDgy2$w7s@_cpq)~U7>OQ@27`{7a&Sbnpb*Rq!AZ%qh=pGjyfL~J|clDZf~^r1Di@%MJs+Ezf%eC>AkbUj0aY=?ciW8Vt=uwCCxD^R-Y!X;pYm6J@! zJ*0}NYef%Lt?@0Khv|A^=ZWmDQz*ZwBUfnt!(`a_Y_0Xg$UPppPq6-vo0t#N8ih9M z$O@`AJ+`s^Ht?`!7|zMplKl z-sP)FnGxZ8wFdf?`T5LAnRQ^xsF2ri2fc34)GM(2BtzxZQTF&X`xesypHq>XeB7+n z^KPxtU^AsIY4|ZiD_|FJIjd^cIEN?_v#G!C^ddUqS4cCb5 zclx28z+5zw{N01RXZ@^aay$!uwtS8w+UO(dNb~V zDhoeM3h!X#4g^uA1Z>$K$W4~g%R}y`FWOk&JZ)TdU7dRbbuuU39d?)_;c8{;2SeZLy`^?359 z0b+vR4g7gn`4#%>#^+Dy6Y;;Gzixwmwea@<`X?R$xFrPu{v(|J3jezV{uS;_@fY~N Z1yMm7=4EvO0K}J