#import python packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
# Hide FutureWarning messages and let pandas display every column of a frame.
warnings.filterwarnings(action="ignore", category=FutureWarning)
pd.options.display.max_columns = None

# Print the seaborn release this notebook is running against.
print(sns.__version__)

0.11.0


# Read the bank loan data from disk and preview the first five rows.
df = pd.read_csv("data/bank.csv")
df.head(5)


# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()


# Most popular checking-account status = the CHK_ACCT code that occurs most often.
# Sorting the frame by CHK_ACCT would only surface the row with the largest code,
# not the most frequent one; value_counts() orders codes by frequency (descending).
df['CHK_ACCT'].value_counts().head(1)


# Longest loan duration in the data.
df.DURATION.max()

72


# Smallest loan amount in the data.
df.AMOUNT.min()

250


# Number of borrowers per RESPONSE value.
# 700 borrowers have paid back their loans (RESPONSE 1) and
# 300 borrowers have defaulted on their loans (RESPONSE 0).

df.groupby(by='RESPONSE').size()

RESPONSE
0    300
1    700
dtype: int64


# Build a DataFrame of borrower counts per RESPONSE value.
# reset_index() on the unnamed size Series leaves the count column labelled 0.

counts_by_response = df.groupby('RESPONSE').size()
payback = counts_by_response.reset_index()
payback


# Pie chart of the RESPONSE split:
#   70 percent of people who took out a loan have paid it back
#   30 percent of people who took out a loan defaulted on their loan

colors = ['red', 'green']

plt.pie(
    payback[0],
    labels=payback['RESPONSE'],
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal');


# Bar chart of the borrower count for each RESPONSE value.
payback.groupby('RESPONSE').sum().plot.bar();


# Mean of every column within each RESPONSE group.
df.groupby("RESPONSE").mean()


# Applicants per EMPLOYMENT category (group sizes).
client_count = df.groupby('EMPLOYMENT').size()

# Sum of RESPONSE per EMPLOYMENT category, i.e. how many in each category paid back.
payback_sum = df.groupby('EMPLOYMENT')['RESPONSE'].sum()

# Print both tallies, separated by a divider line.
print(
    client_count,
    "-----------------------------------------------------------",
    payback_sum,
    sep="\n",
)

EMPLOYMENT
0     62
1    172
2    339
3    174
4    253
dtype: int64
-----------------------------------------------------------
EMPLOYMENT
0     39
1    102
2    235
3    135
4    189
Name: RESPONSE, dtype: int64


# Bar chart of how many applicants fall in each EMPLOYMENT category.
# The bars are head counts (group sizes), not RESPONSE values, so the y-axis
# and title are labelled as counts, matching the "People by History" chart.
client_count.plot(kind='bar')
plt.xlabel('EMPLOYMENT')
plt.ylabel('Count of people')
plt.title("People by EMPLOYMENT")

Text(0.5, 1.0, 'RESPONSE by EMPLOYMENT')


# Count of loans for each RESPONSE (rows) x HISTORY (columns) combination.
df.pivot_table(index='RESPONSE', columns='HISTORY', values='AMOUNT', aggfunc='count')


# Grouped bar chart of the RESPONSE x HISTORY loan counts.
df.pivot_table(
    index='RESPONSE', columns='HISTORY', values='AMOUNT', aggfunc='count'
).plot(kind='bar')

<AxesSubplot:xlabel='RESPONSE'>


 sns.boxplot(df.RESPONSE, df.REAL_ESTATE)

<AxesSubplot:xlabel='RESPONSE', ylabel='REAL_ESTATE'>


# Point plot of EMPLOYMENT for each RESPONSE value.
# `factorplot` was renamed to `catplot`; kind="point" keeps factorplot's old
# default, because catplot would otherwise draw a strip plot.
sns.catplot(x="RESPONSE", y="EMPLOYMENT", data=df, kind="point")

C:\Users\patri\anaconda3\lib\site-packages\seaborn\categorical.py:3704: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.
  warnings.warn(msg)

<seaborn.axisgrid.FacetGrid at 0x1a1cda27e80>


# Regression plot of RESPONSE against AGE, with jitter on both axes.
# x, y and data are passed by keyword: the positional form is deprecated in seaborn 0.11.
sns.lmplot(x="AGE", y="RESPONSE", data=df, y_jitter=0.40, x_jitter=.40)

<seaborn.axisgrid.FacetGrid at 0x1a1cd8c97f0>


# Point plot of DURATION for each RESPONSE value.
# x and y are passed by keyword: the positional form is deprecated in seaborn 0.11.
sns.catplot(x="RESPONSE", y="DURATION", data=df, kind='point')

<seaborn.axisgrid.FacetGrid at 0x11c975af8e0>


# Histogram of AMOUNT, coloured by RESPONSE.
sns.histplot(x="AMOUNT", hue="RESPONSE", data=df)

<AxesSubplot:xlabel='AMOUNT', ylabel='Count'>


# Bivariate KDE with OWN_RES on x and RESPONSE on y (same order as the original call).
# Both variables are passed by keyword: the positional form is deprecated in seaborn 0.11.
sns.kdeplot(x=df.OWN_RES, y=df.RESPONSE)

<AxesSubplot:xlabel='RESPONSE', ylabel='OWN_RES'>


# Bar chart of the summed INSTALL_RATE within each RESPONSE group.
df.groupby('RESPONSE')['INSTALL_RATE'].sum().plot.bar();


# Point plot of JOB for each RESPONSE value, split by EDUCATION via hue.
# x, y and hue are passed by keyword: the positional form is deprecated in seaborn 0.11.
sns.catplot(x="RESPONSE", y="JOB", hue="EDUCATION", data=df, kind='point')

<seaborn.axisgrid.FacetGrid at 0x11c94b58490>


# Box plot of SAV_ACCT for each RESPONSE value.
# Variables are passed by keyword: positional x/y arguments are deprecated in seaborn 0.11.
sns.boxplot(data=df, x="RESPONSE", y="SAV_ACCT")

<AxesSubplot:xlabel='RESPONSE', ylabel='SAV_ACCT'>


# Scatter plot of AGE against RESPONSE.
sns.scatterplot(x="RESPONSE", y="AGE", data=df)

<AxesSubplot:xlabel='RESPONSE', ylabel='AGE'>


# Scatter plot of AMOUNT against RESPONSE.
sns.scatterplot(x="RESPONSE", y="AMOUNT", data=df)

<AxesSubplot:xlabel='RESPONSE', ylabel='AMOUNT'>


# Line plot of CHK_ACCT against RESPONSE.
sns.lineplot(x="RESPONSE", y="CHK_ACCT", data=df)

<AxesSubplot:xlabel='RESPONSE', ylabel='CHK_ACCT'>


# Regression plot of HISTORY against RESPONSE.
# x, y and data are passed by keyword: the positional form is deprecated in seaborn 0.11.
sns.lmplot(x="RESPONSE", y="HISTORY", data=df)

<seaborn.axisgrid.FacetGrid at 0x11c970e2610>


# Regression plot of FURNITURE against RESPONSE.
# x, y and data are passed by keyword: the positional form is deprecated in seaborn 0.11.
sns.lmplot(x="RESPONSE", y="FURNITURE", data=df)

<seaborn.axisgrid.FacetGrid at 0x1a1ceee7160>


# Number of people in each HISTORY category that took out a loan (group sizes).
response_count = df.groupby('HISTORY').size()

# Sum of RESPONSE per HISTORY category, i.e. how many in each category paid back.
response_sum = df.groupby('HISTORY')['RESPONSE'].sum()

# Print both tallies, separated by a divider line.
print(
    response_count,
    "-----------------------------------------------------------",
    response_sum,
    sep="\n",
)

HISTORY
0     40
1     49
2    530
3     88
4    293
dtype: int64
-----------------------------------------------------------
HISTORY
0     15
1     21
2    361
3     60
4    243
Name: RESPONSE, dtype: int64


# Bar chart of the number of people in each HISTORY category,
# labelled through the Axes object returned by the pandas plot call.
ax = response_count.plot(kind='bar')
ax.set_xlabel('History')
ax.set_ylabel('Count of people')
ax.set_title("People by History")

Text(0.5, 1.0, 'People by History')


# Share of borrowers in each HISTORY category that paid back (paid-back count / total count).
response_prob = response_sum.div(response_count)
print(response_prob)

HISTORY
0    0.375000
1    0.428571
2    0.681132
3    0.681818
4    0.829352
dtype: float64


# Bar chart of the payback share per HISTORY category.
response_prob.plot.bar()

<AxesSubplot:xlabel='HISTORY'>


# Applicants per EMPLOYMENT category (group sizes).
client_count = df.groupby('EMPLOYMENT').size()

# Sum of RESPONSE per EMPLOYMENT category, i.e. how many in each category paid back.
payback_sum = df.groupby('EMPLOYMENT')['RESPONSE'].sum()

# Print both tallies, separated by a divider line.
print(
    client_count,
    "-----------------------------------------------------------",
    payback_sum,
    sep="\n",
)

EMPLOYMENT
0     62
1    172
2    339
3    174
4    253
dtype: int64
-----------------------------------------------------------
EMPLOYMENT
0     39
1    102
2    235
3    135
4    189
Name: RESPONSE, dtype: int64


# Share of borrowers in each EMPLOYMENT category that paid back (paid-back count / total count).
payback_prob = payback_sum.div(client_count)
print(payback_prob)

EMPLOYMENT
0    0.629032
1    0.593023
2    0.693215
3    0.775862
4    0.747036
dtype: float64


# Bar chart of the payback share per EMPLOYMENT category.
payback_prob.plot.bar();


# Joint plot of AGE against DURATION, coloured by JOB.
sns.jointplot(x="DURATION", y="AGE", hue="JOB", data=df)

<seaborn.axisgrid.JointGrid at 0x1a1cef10b80>

	CHK_ACCT	DURATION	HISTORY	NEW_CAR	FURNITURE	RADIO/TV	EDUCATION	AMOUNT	SAV_ACCT	EMPLOYMENT	INSTALL_RATE	MALE_SINGLE	GUARANTOR	PRESENT_RESIDENT	REAL_ESTATE	PROP_UNKN_NONE	AGE	OWN_RES	NUM_CREDITS	JOB	NUM_DEPENDENTS	TELEPHONE	RESPONSE
0	0	6	4	0	0	1	0	1169	4	4	4	1	0	4	1	0	67	1	2	2	1	1	1
1	1	48	2	0	0	1	0	5951	0	2	2	0	0	2	1	0	22	1	1	2	1	0	0
2	3	12	4	0	0	0	1	2096	0	3	2	1	0	3	1	0	49	1	1	1	2	0	1
3	0	42	2	0	1	0	0	7882	0	3	2	1	1	4	0	0	45	0	1	2	2	0	1
4	0	24	3	1	0	0	0	4870	0	2	3	1	0	4	0	1	53	0	2	2	2	0	0

	CHK_ACCT	DURATION	HISTORY	NEW_CAR	USED_CAR	FURNITURE	RADIO/TV	EDUCATION	RETRAINING	AMOUNT	SAV_ACCT	EMPLOYMENT	INSTALL_RATE	MALE_DIV	MALE_SINGLE	MALE_MAR_or_WID	CO-APPLICANT	GUARANTOR	PRESENT_RESIDENT	REAL_ESTATE	PROP_UNKN_NONE	AGE	OTHER_INSTALL	RENT	OWN_RES	NUM_CREDITS	JOB	NUM_DEPENDENTS	TELEPHONE	FOREIGN	RESPONSE
count	1000.000000	1000.000000	1000.00000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.00000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000
mean	1.577000	20.903000	2.54500	0.234000	0.103000	0.181000	0.280000	0.050000	0.097000	3271.258000	1.105000	2.384000	2.973000	0.050000	0.54800	0.092000	0.041000	0.052000	2.845000	0.282000	0.154000	35.546000	0.186000	0.179000	0.713000	1.407000	1.904000	1.155000	0.404000	0.037000	0.700000
std	1.257638	12.058814	1.08312	0.423584	0.304111	0.385211	0.449224	0.218054	0.296106	2822.736876	1.580023	1.208306	1.118715	0.218054	0.49794	0.289171	0.198389	0.222138	1.103718	0.450198	0.361129	11.375469	0.389301	0.383544	0.452588	0.577654	0.653614	0.362086	0.490943	0.188856	0.458487
min	0.000000	4.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	250.000000	0.000000	0.000000	1.000000	0.000000	0.00000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	19.000000	0.000000	0.000000	0.000000	1.000000	0.000000	1.000000	0.000000	0.000000	0.000000
25%	0.000000	12.000000	2.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1365.500000	0.000000	2.000000	2.000000	0.000000	0.00000	0.000000	0.000000	0.000000	2.000000	0.000000	0.000000	27.000000	0.000000	0.000000	0.000000	1.000000	2.000000	1.000000	0.000000	0.000000	0.000000
50%	1.000000	18.000000	2.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2319.500000	0.000000	2.000000	3.000000	0.000000	1.00000	0.000000	0.000000	0.000000	3.000000	0.000000	0.000000	33.000000	0.000000	0.000000	1.000000	1.000000	2.000000	1.000000	0.000000	0.000000	1.000000
75%	3.000000	24.000000	4.00000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	3972.250000	2.000000	4.000000	4.000000	0.000000	1.00000	0.000000	0.000000	0.000000	4.000000	1.000000	0.000000	42.000000	0.000000	0.000000	1.000000	2.000000	2.000000	1.000000	1.000000	0.000000	1.000000
max	3.000000	72.000000	4.00000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	18424.000000	4.000000	4.000000	4.000000	1.000000	1.00000	1.000000	1.000000	1.000000	4.000000	1.000000	1.000000	75.000000	1.000000	1.000000	1.000000	4.000000	3.000000	2.000000	1.000000	1.000000	1.000000

	CHK_ACCT	DURATION	HISTORY	NEW_CAR	USED_CAR	FURNITURE	RADIO/TV	EDUCATION	RETRAINING	AMOUNT	SAV_ACCT	EMPLOYMENT	INSTALL_RATE	MALE_DIV	MALE_SINGLE	MALE_MAR_or_WID	CO-APPLICANT	GUARANTOR	PRESENT_RESIDENT	REAL_ESTATE	PROP_UNKN_NONE	AGE	OTHER_INSTALL	RENT	OWN_RES	NUM_CREDITS	JOB	NUM_DEPENDENTS	TELEPHONE	FOREIGN
RESPONSE
0	0.903333	24.860000	2.166667	0.296667	0.056667	0.193333	0.206667	0.073333	0.113333	3938.126667	0.673333	2.170000	3.096667	0.066667	0.486667	0.083333	0.060000	0.033333	2.850000	0.200000	0.223333	33.963333	0.253333	0.233333	0.620000	1.366667	1.936667	1.153333	0.376667	0.013333
1	1.865714	19.207143	2.707143	0.207143	0.122857	0.175714	0.311429	0.040000	0.090000	2985.457143	1.290000	2.475714	2.920000	0.042857	0.574286	0.095714	0.032857	0.060000	2.842857	0.317143	0.124286	36.224286	0.157143	0.155714	0.752857	1.424286	1.890000	1.155714	0.415714	0.047143

Patrick Mugisha, "On my honor, as a student, I have neither given nor received unauthorized aid on this academic work."¶

Q1. Use some common sense (and/or your business knowledge) to answer the first question. Which columns or (independent) variables would significantly influence the Y value (whether someone fulfills the terms of credit agreement or not)? List at least three and explain why?¶

Name the column (variable) and explain why?¶

Name the column (variable) and explain why?¶

Name the column (variable) and explain why?¶

Q2. What are some general findings from basic statistics (describe)?¶

What is the most popular checking account status in the data?¶

What is the maximum loan duration?¶

What was the minimum loan amount?¶

Q3. What portion of borrowers have paid back? What portion have defaulted on their loans? You need to use a pivot table or charts. Provide a discussion of the findings in a markdown¶

Show at least one pie plot¶

Show at least one bar plot¶

Q4. Demonstrate your skills in groupby to extract business intelligence. The focus of your analysis should be who is likely to pay back/default on the loan. Provide the following:¶

Use groupby for analysis¶

Visualize the outcomes of groupby¶

Provide a short discussion of the findings in a markdown.¶

Q5. Demonstrate your skills in pivot_table to extract business intelligence. The focus of your analysis should be who is likely to pay back/default on the loan. Provide the following:¶

Use pivot_table for analysis¶

Visualize the outcomes of pivot_table¶

Provide a short discussion of the findings in a markdown.¶

Q6. What is the relationship between RESPONSE and the three variables you chose in Question #1? For each variable, you need to show a chart or charts (e.g., matplotlib).¶

The relationship between RESPONSE and variable #1¶

The relationship between RESPONSE and variable #2¶

The relationship between RESPONSE and variable #3¶

Provide a discussion of the findings in a markdown¶

Q7. Visualize the relationship between DURATION and RESPONSE and provide the insights from the chart(s)?¶

Provide a discussion of the findings in a markdown¶

Q8. What variables appear to be highly influential in determining Y value (RESPONSE)? Use seaborn plots to display the interaction of two, three or more variables and how these variables are related to Y value (RESPONSE)¶

Use distribution plots (e.g., histplot): At least two plots¶

Use categorical plots (e.g., catplot, barplot): At least three plots¶

Use relational plots (e.g., scatterplot): At least two plots¶

Use relational/statistical plots (e.g., lmplot): At least two plots¶

Provide a discussion of the findings in a markdown¶

Q9. Credit history could be an important variable predicting whether people will fulfill the credit agreement, so find out any relationship between history and response. You need to develop two plots here.¶

1st plot simply shows how many people per each category;¶

2nd plot shows the "probability" of loan payback in terms of "history".¶

Provide a discussion of the findings in a markdown¶

Q10. Formulate your own question relevant to this dataset and business problem and answer using data visualization.¶

What is the probability that people in each employment category paid back their loans?¶

Make a plot that shows the "probability" of loan payback in terms of "EMPLOYMENT"¶

Q11. Using seaborn, make a joint plot to show the relationship between DURATION, AGE, and the type of JOB loan seekers have.¶

Q12. What are the characteristics of the people who have paid back?¶

What are the characteristics of the people who have defaulted on loans?¶