# %%
# Notebook: Chapter3-6_Solution.ipynb
# Analysis / plotting libraries used by the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy.stats as stats

# Visualization options
from IPython import get_ipython

# `IPython.display.set_matplotlib_formats` is deprecated (IPython >= 7.23), so
# retina-resolution inline figures are requested through the InlineBackend
# configuration instead.
_ipython = get_ipython()
if _ipython is not None:  # get_ipython() returns None outside an IPython kernel
    _ipython.run_line_magic('config', "InlineBackend.figure_format = 'retina'")

# Font with Hangul glyphs for plot labels.
# NOTE(review): 'Malgun Gothic' ships with Windows; confirm it is installed on
# the machine that runs this notebook.
mpl.rc('font', family='Malgun Gothic')

# %%
# Raw contract data; the path is relative to the notebook's working directory.
df1 = pd.read_csv('01_Contract_Data.csv')
# %%
# Column dtypes and non-null counts of the raw contract data.
df1.info()

# %%
df1.head()

# %%
# Parse the contract date string so calendar parts can be extracted from it.
df1['Datetime(dt)'] = pd.to_datetime(df1['Datetime'])

# %%
# Calendar features derived from the parsed contract date.
df1['Year'] = df1['Datetime(dt)'].dt.year
df1['Month'] = df1['Datetime(dt)'].dt.month
df1['day_of_week'] = df1['Datetime(dt)'].dt.day_name()
df1['Day'] = df1['Datetime(dt)'].dt.day

# %%
# Missing bank / address values become an explicit category so that one-hot
# encoding keeps them as their own level instead of an all-zero row.
for raw_col in ('Bank', 'Address1', 'Address2'):
    df1[raw_col + '(clean)'] = df1[raw_col].fillna('미확인')

# %%
# Drop identifiers, the raw/parsed date and the uncleaned originals of the
# columns that now have a "(clean)" counterpart.
df2 = df1.drop(columns=['Index', 'Member_ID', 'Datetime', 'Datetime(dt)',
                        'Bank', 'Address1', 'Address2'])

# %%
df2['State'].unique()

# %%
# Binary target: 1 for the two cancellation states, 0 for the other two.
# `.map` is used instead of `.replace`: `.replace` only produces an integer
# Series through object-dtype downcasting, which pandas 2.2 deprecates.
state_to_label = {'계약확정': 0, '기간만료': 0, '해약확정': 1, '해약진행중': 1}
Y = df2['State'].map(state_to_label)
if Y.isna().any():
    # Fail loudly rather than train on a partially labelled target.
    unmapped = sorted(df2.loc[Y.isna(), 'State'].astype(str).unique())
    raise ValueError(f"Unmapped State values: {unmapped}")
Y = Y.astype(int)
Y.value_counts()

# %%
# One-hot encode every remaining object column; numeric columns pass through.
X = pd.get_dummies(df2.drop(columns=['State']))

# %%
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# %%
# Hold out the test set BEFORE resampling. Under-sampling the full data first
# would evaluate on an artificially balanced test set, which does not reflect
# the real class ratio (the positive class is roughly 1% of the rows).
# `stratify=Y` keeps that ratio identical in both parts.
X_train_full, X_test, Y_train_full, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234, stratify=Y)

# %%
# Balance the training part only; the seed makes the drawn subset repeatable.
sampler = RandomUnderSampler(random_state=1234)
X_under, Y_under = sampler.fit_resample(X_train_full, Y_train_full)
X_train, Y_train = X_under, Y_under

# %%
# Decision tree: impute the remaining NaNs (Age and Credit_Rank have missing
# values per df1.info()), then grid-search the tree with 3-fold CV on F1.
pipe_list = [('impute', KNNImputer()),
             ('model', DecisionTreeClassifier(random_state=1234))]
pipe_model = Pipeline(pipe_list)

hyper_parameter = {'model__max_depth': [9],
                   'model__criterion': ['gini', 'entropy'],
                   'model__min_samples_split': [5],
                   'model__min_samples_leaf': [8],
                   'model__class_weight': ['balanced', None]}

grid_model = GridSearchCV(pipe_model, param_grid=hyper_parameter,
                          cv=3, n_jobs=-1, scoring='f1')
grid_model.fit(X_train, Y_train)
# %%
# Pipeline refitted by the grid search with the best-scoring parameters.
best_model = grid_model.best_estimator_
best_model

# %%
# Predict on the data the model was fitted on and on the held-out split.
Y_train_pred, Y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

# %%
# Per-class precision / recall / F1 on the training data.
print(classification_report(y_true=Y_train, y_pred=Y_train_pred))

# %%
# Same report on the held-out data.
print(classification_report(y_true=Y_test, y_pred=Y_test_pred))

# %% [markdown]
# # Random Forest Model
# %%
from sklearn.preprocessing import StandardScaler

# Same seed value the notebook passes to train_test_split.
RANDOM_STATE = 1234


def run_grid_search(steps, param_grid, X_fit, Y_fit):
    """Grid-search a pipeline with 3-fold CV, scored on F1.

    Parameters
    ----------
    steps : list of (name, estimator) tuples
        Passed unchanged to ``Pipeline``.
    param_grid : dict
        Search space, keyed by ``<step name>__<parameter>``.
    X_fit, Y_fit
        Training features and labels.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(Pipeline(steps), param_grid=param_grid,
                          cv=3, n_jobs=-1, scoring='f1')
    search.fit(X_fit, Y_fit)
    return search


def predict_and_report(model, X_fit, Y_fit, X_eval, Y_eval):
    """Print classification reports for the fitted and held-out data.

    Returns
    -------
    tuple
        ``(predictions on X_fit, predictions on X_eval)``.
    """
    fit_pred = model.predict(X_fit)
    eval_pred = model.predict(X_eval)
    print(classification_report(Y_fit, fit_pred))
    print(classification_report(Y_eval, eval_pred))
    return fit_pred, eval_pred


# %%
# Random forest; seeded so repeated runs build the same trees.
pipe_list = [('impute', KNNImputer()),
             ('model', RandomForestClassifier(random_state=RANDOM_STATE))]

hyper_parameter = {'model__max_depth': [9],
                   'model__criterion': ['gini', 'entropy'],
                   'model__min_samples_split': [6],
                   'model__min_samples_leaf': [5],
                   'model__n_estimators': [50, 100, 150, 250],
                   'model__class_weight': ['balanced', None]}

grid_model = run_grid_search(pipe_list, hyper_parameter, X_train, Y_train)
grid_model

# %%
best_model = grid_model.best_estimator_
best_model

# %%
Y_train_pred, Y_test_pred = predict_and_report(
    best_model, X_train, Y_train, X_test, Y_test)

# %% [markdown]
# # Support Vector Machine Model

# %%
# SVMs are not scale invariant, so the features are standardised after
# imputation and before the linear-kernel SVC.
# NOTE(review): the original SVM cells carry no execution count, i.e. they were
# never run to completion in the saved notebook; confirm runtime after scaling.
pipe_list = [('impute', KNNImputer()),
             ('scale', StandardScaler()),
             ('model', SVC())]

hyper_parameter = {'model__C': [1, 10, 50, 100],
                   'model__kernel': ['linear'],
                   'model__class_weight': [None, 'balanced']}

grid_model = run_grid_search(pipe_list, hyper_parameter, X_train, Y_train)
grid_model

# %%
best_model = grid_model.best_estimator_
best_model

# %%
Y_train_pred, Y_test_pred = predict_and_report(
    best_model, X_train, Y_train, X_test, Y_test)