#import python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_columns', None)
print(sns.__version__) #check seaborn version
0.11.0
df=pd.read_csv("data/bank.csv")
df.head()
CHK_ACCT | DURATION | HISTORY | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | AMOUNT | SAV_ACCT | EMPLOYMENT | INSTALL_RATE | MALE_DIV | MALE_SINGLE | MALE_MAR_or_WID | CO-APPLICANT | GUARANTOR | PRESENT_RESIDENT | REAL_ESTATE | PROP_UNKN_NONE | AGE | OTHER_INSTALL | RENT | OWN_RES | NUM_CREDITS | JOB | NUM_DEPENDENTS | TELEPHONE | FOREIGN | RESPONSE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 6 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 1169 | 4 | 4 | 4 | 0 | 1 | 0 | 0 | 0 | 4 | 1 | 0 | 67 | 0 | 0 | 1 | 2 | 2 | 1 | 1 | 0 | 1 |
1 | 1 | 48 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 5951 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 22 | 0 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | 0 |
2 | 3 | 12 | 4 | 0 | 0 | 0 | 0 | 1 | 0 | 2096 | 0 | 3 | 2 | 0 | 1 | 0 | 0 | 0 | 3 | 1 | 0 | 49 | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 1 |
3 | 0 | 42 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 7882 | 0 | 3 | 2 | 0 | 1 | 0 | 0 | 1 | 4 | 0 | 0 | 45 | 0 | 0 | 0 | 1 | 2 | 2 | 0 | 0 | 1 |
4 | 0 | 24 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 4870 | 0 | 2 | 3 | 0 | 1 | 0 | 0 | 0 | 4 | 0 | 1 | 53 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 0 | 0 |
df.describe()
CHK_ACCT | DURATION | HISTORY | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | AMOUNT | SAV_ACCT | EMPLOYMENT | INSTALL_RATE | MALE_DIV | MALE_SINGLE | MALE_MAR_or_WID | CO-APPLICANT | GUARANTOR | PRESENT_RESIDENT | REAL_ESTATE | PROP_UNKN_NONE | AGE | OTHER_INSTALL | RENT | OWN_RES | NUM_CREDITS | JOB | NUM_DEPENDENTS | TELEPHONE | FOREIGN | RESPONSE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.00000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.00000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 1.577000 | 20.903000 | 2.54500 | 0.234000 | 0.103000 | 0.181000 | 0.280000 | 0.050000 | 0.097000 | 3271.258000 | 1.105000 | 2.384000 | 2.973000 | 0.050000 | 0.54800 | 0.092000 | 0.041000 | 0.052000 | 2.845000 | 0.282000 | 0.154000 | 35.546000 | 0.186000 | 0.179000 | 0.713000 | 1.407000 | 1.904000 | 1.155000 | 0.404000 | 0.037000 | 0.700000 |
std | 1.257638 | 12.058814 | 1.08312 | 0.423584 | 0.304111 | 0.385211 | 0.449224 | 0.218054 | 0.296106 | 2822.736876 | 1.580023 | 1.208306 | 1.118715 | 0.218054 | 0.49794 | 0.289171 | 0.198389 | 0.222138 | 1.103718 | 0.450198 | 0.361129 | 11.375469 | 0.389301 | 0.383544 | 0.452588 | 0.577654 | 0.653614 | 0.362086 | 0.490943 | 0.188856 | 0.458487 |
min | 0.000000 | 4.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 250.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 19.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 12.000000 | 2.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1365.500000 | 0.000000 | 2.000000 | 2.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 18.000000 | 2.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2319.500000 | 0.000000 | 2.000000 | 3.000000 | 0.000000 | 1.00000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 | 0.000000 | 0.000000 | 33.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
75% | 3.000000 | 24.000000 | 4.00000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 3972.250000 | 2.000000 | 4.000000 | 4.000000 | 0.000000 | 1.00000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 1.000000 | 0.000000 | 42.000000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 2.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 |
max | 3.000000 | 72.000000 | 4.00000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 18424.000000 | 4.000000 | 4.000000 | 4.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 1.000000 | 1.000000 | 75.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 3.000000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 |
df.sort_values('CHK_ACCT', ascending=False).head(1)
CHK_ACCT | DURATION | HISTORY | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | AMOUNT | SAV_ACCT | EMPLOYMENT | INSTALL_RATE | MALE_DIV | MALE_SINGLE | MALE_MAR_or_WID | CO-APPLICANT | GUARANTOR | PRESENT_RESIDENT | REAL_ESTATE | PROP_UNKN_NONE | AGE | OTHER_INSTALL | RENT | OWN_RES | NUM_CREDITS | JOB | NUM_DEPENDENTS | TELEPHONE | FOREIGN | RESPONSE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
898 | 3 | 15 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 2186 | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | 0 | 33 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 1 |
df['DURATION'].max()
72
df['AMOUNT'].min()
250
# pivot table using response
# 700 borrowers have paid back their loans and 300 borrowers have defaulted on their loans
df.groupby('RESPONSE').size()
RESPONSE 0 300 1 700 dtype: int64
#created a dataframe for RESPONSE
payback=df.groupby('RESPONSE').size().reset_index()
payback
RESPONSE | 0 | |
---|---|---|
0 | 0 | 300 |
1 | 1 | 700 |
# 70 percent of people who took out a loan have paid it back
# 30 percent of people who took out a loan defaulted on their loan
colors = ['red', 'green']
plt.pie(payback[0], labels=payback['RESPONSE'], colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal');
payback.groupby('RESPONSE').sum().plot(kind='bar');
df.groupby("RESPONSE").mean()
CHK_ACCT | DURATION | HISTORY | NEW_CAR | USED_CAR | FURNITURE | RADIO/TV | EDUCATION | RETRAINING | AMOUNT | SAV_ACCT | EMPLOYMENT | INSTALL_RATE | MALE_DIV | MALE_SINGLE | MALE_MAR_or_WID | CO-APPLICANT | GUARANTOR | PRESENT_RESIDENT | REAL_ESTATE | PROP_UNKN_NONE | AGE | OTHER_INSTALL | RENT | OWN_RES | NUM_CREDITS | JOB | NUM_DEPENDENTS | TELEPHONE | FOREIGN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
RESPONSE | ||||||||||||||||||||||||||||||
0 | 0.903333 | 24.860000 | 2.166667 | 0.296667 | 0.056667 | 0.193333 | 0.206667 | 0.073333 | 0.113333 | 3938.126667 | 0.673333 | 2.170000 | 3.096667 | 0.066667 | 0.486667 | 0.083333 | 0.060000 | 0.033333 | 2.850000 | 0.200000 | 0.223333 | 33.963333 | 0.253333 | 0.233333 | 0.620000 | 1.366667 | 1.936667 | 1.153333 | 0.376667 | 0.013333 |
1 | 1.865714 | 19.207143 | 2.707143 | 0.207143 | 0.122857 | 0.175714 | 0.311429 | 0.040000 | 0.090000 | 2985.457143 | 1.290000 | 2.475714 | 2.920000 | 0.042857 | 0.574286 | 0.095714 | 0.032857 | 0.060000 | 2.842857 | 0.317143 | 0.124286 | 36.224286 | 0.157143 | 0.155714 | 0.752857 | 1.424286 | 1.890000 | 1.155714 | 0.415714 | 0.047143 |
# total number of people that asked for a loan in each category of EMPLOYMENT
client_count=df.groupby('EMPLOYMENT').size()
# total number of people that paid back their loan in each category of EMPLOYMENT according to RESPONSE
payback_sum=df.groupby('EMPLOYMENT').RESPONSE.sum()
print(client_count)
print("-----------------------------------------------------------")
print(payback_sum)
EMPLOYMENT 0 62 1 172 2 339 3 174 4 253 dtype: int64 ----------------------------------------------------------- EMPLOYMENT 0 39 1 102 2 235 3 135 4 189 Name: RESPONSE, dtype: int64
client_count.plot(kind='bar')
plt.xlabel('EMPLOYMENT')
plt.ylabel('RESPONSE')
plt.title("RESPONSE by EMPLOYMENT")
Text(0.5, 1.0, 'RESPONSE by EMPLOYMENT')
pd.pivot_table(df, index='RESPONSE', columns='HISTORY', values='AMOUNT', aggfunc='count')
HISTORY | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
RESPONSE | |||||
0 | 25 | 28 | 169 | 28 | 50 |
1 | 15 | 21 | 361 | 60 | 243 |
pd.pivot_table(df, index='RESPONSE', columns='HISTORY', values='AMOUNT', aggfunc='count').plot(kind='bar')
<AxesSubplot:xlabel='RESPONSE'>
sns.boxplot(df.RESPONSE, df.REAL_ESTATE)
<AxesSubplot:xlabel='RESPONSE', ylabel='REAL_ESTATE'>
sns.factorplot("RESPONSE", "EMPLOYMENT", data=df)
C:\Users\patri\anaconda3\lib\site-packages\seaborn\categorical.py:3704: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`. warnings.warn(msg)
<seaborn.axisgrid.FacetGrid at 0x1a1cda27e80>
sns.lmplot("AGE", "RESPONSE", df, y_jitter=0.40, x_jitter=.40)
<seaborn.axisgrid.FacetGrid at 0x1a1cd8c97f0>
sns.catplot("RESPONSE", "DURATION", data=df, kind='point')
<seaborn.axisgrid.FacetGrid at 0x11c975af8e0>
sns.histplot(data=df, x="AMOUNT", hue="RESPONSE")
<AxesSubplot:xlabel='AMOUNT', ylabel='Count'>
sns.kdeplot(df.OWN_RES, df.RESPONSE)
<AxesSubplot:xlabel='RESPONSE', ylabel='OWN_RES'>
df.groupby('RESPONSE')['INSTALL_RATE'].sum().plot(kind='bar');
sns.catplot("RESPONSE", "JOB", "EDUCATION", data=df, kind='point')
<seaborn.axisgrid.FacetGrid at 0x11c94b58490>
sns.boxplot(df.RESPONSE, df.SAV_ACCT)
<AxesSubplot:xlabel='RESPONSE', ylabel='SAV_ACCT'>
sns.scatterplot(data=df, x="RESPONSE", y="AGE")
<AxesSubplot:xlabel='RESPONSE', ylabel='AGE'>
sns.scatterplot(data=df, x="RESPONSE", y="AMOUNT")
<AxesSubplot:xlabel='RESPONSE', ylabel='AMOUNT'>
sns.lineplot(data=df, x="RESPONSE", y="CHK_ACCT")
<AxesSubplot:xlabel='RESPONSE', ylabel='CHK_ACCT'>
sns.lmplot("RESPONSE", "HISTORY", df)
<seaborn.axisgrid.FacetGrid at 0x11c970e2610>
sns.lmplot("RESPONSE", "FURNITURE", df)
<seaborn.axisgrid.FacetGrid at 0x1a1ceee7160>
# shows the number of people from each HISTORY category that took out a loan
response_count = df.groupby('HISTORY').size()
# shows the total number of people that paid back their loan from each HISTORY category according to response
response_sum = df.groupby('HISTORY').RESPONSE.sum()
print(response_count)
print("-----------------------------------------------------------")
print(response_sum)
HISTORY 0 40 1 49 2 530 3 88 4 293 dtype: int64 ----------------------------------------------------------- HISTORY 0 15 1 21 2 361 3 60 4 243 Name: RESPONSE, dtype: int64
response_count.plot(kind='bar')
plt.xlabel('History')
plt.ylabel('Count of people')
plt.title("People by History")
Text(0.5, 1.0, 'People by History')
response_prob=response_sum/response_count
print(response_prob)
HISTORY 0 0.375000 1 0.428571 2 0.681132 3 0.681818 4 0.829352 dtype: float64
response_prob.plot(kind='bar')
<AxesSubplot:xlabel='HISTORY'>
# total number of people that asked for a loan in each category of EMPLOYMENT
client_count=df.groupby('EMPLOYMENT').size()
# total number of people that paid back their loan in each category of EMPLOYMENT according to RESPONSE
payback_sum=df.groupby('EMPLOYMENT').RESPONSE.sum()
print(client_count)
print("-----------------------------------------------------------")
print(payback_sum)
EMPLOYMENT 0 62 1 172 2 339 3 174 4 253 dtype: int64 ----------------------------------------------------------- EMPLOYMENT 0 39 1 102 2 235 3 135 4 189 Name: RESPONSE, dtype: int64
payback_prob=payback_sum/client_count
print(payback_prob)
EMPLOYMENT 0 0.629032 1 0.593023 2 0.693215 3 0.775862 4 0.747036 dtype: float64
payback_prob.plot(kind='bar');
sns.jointplot(data=df, x="DURATION", y="AGE", hue="JOB")
<seaborn.axisgrid.JointGrid at 0x1a1cef10b80>