from IPython.display import YouTubeVideo
# Embed the YouTube video with id 'iANv_0ZQKDY' in the notebook output,
# 800 wide by 500 high.
YouTubeVideo('iANv_0ZQKDY', width=800, height=500)


# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Load the Titanic training set from CSV and print its first five rows.
df = pd.read_csv("data/titanic_train.csv")
print(df.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S


# Drop the columns that are not useful for this analysis:
#   - Name, PassengerId: treated as unimportant here
#   - Ticket: ticket numbers carry no meaning
#   - Cabin: too many missing values
df = df.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


# Count missing values per column: Age has many (177) and Embarked has 2.
# (Cabin also had missing values, but it was dropped with the unused columns.)
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


# Drop every row that contains at least one missing value.
# This removes 179 rows (891 -> 712): the 177 rows with a missing Age
# plus the 2 rows with a missing Embarked.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     712 non-null    int64  
 5   Parch     712 non-null    int64  
 6   Fare      712 non-null    float64
 7   Embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


# Re-count missing values per column to confirm that none remain.
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


# Summary of the cleaned frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Sex       712 non-null    object 
 3   Age       712 non-null    float64
 4   SibSp     712 non-null    int64  
 5   Parch     712 non-null    int64  
 6   Fare      712 non-null    float64
 7   Embarked  712 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 50.1+ KB


# correlation analysis
# Only numeric columns can be correlated, so restrict the frame to them explicitly:
# Sex and Embarked are categorical (object dtype) and are left out of the matrix.
# Selecting the numeric columns up front also avoids the ValueError that
# pandas >= 2.0 raises when corr() is called on a frame with string columns.
df.select_dtypes(include="number").corr()


# Distinct values in the Sex column.
# Alternatives that list the same categories together with their row counts:
#   df['Sex'].value_counts()
#   df.groupby('Sex').size()
df['Sex'].unique()

array(['male', 'female'], dtype=object)


# Count the rows for each distinct value of Embarked (port of embarkation).
# (C = Cherbourg; Q = Queenstown; S = Southampton)
df['Embarked'].value_counts()

S    554
C    130
Q     28
Name: Embarked, dtype: int64


# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

# One-hot encode the categorical columns so they can take part in the
# correlation analysis:
#   Sex      -> Sex_female, Sex_male
#   Embarked -> Embarked_C, Embarked_Q, Embarked_S
# dtype=int forces 0/1 integer indicators; pandas >= 2.0 would otherwise
# produce True/False boolean columns.
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], dtype=int)
df.head()


# Correlation analysis again: with Sex and Embarked replaced by dummy columns,
# every column of the frame now appears in the correlation matrix.
df.corr()


# Draw the pairwise correlations of all columns as a heatmap on an 8x8 figure.
plt.figure(figsize=(8, 8))
corr_matrix = df.corr()
sns.heatmap(corr_matrix)

<AxesSubplot:>

	Survived	Pclass	Age	SibSp	Parch	Fare
Survived	1.000000	-0.356462	-0.082446	-0.015523	0.095265	0.266100
Pclass	-0.356462	1.000000	-0.365902	0.065187	0.023666	-0.552893
Age	-0.082446	-0.365902	1.000000	-0.307351	-0.187896	0.093143
SibSp	-0.015523	0.065187	-0.307351	1.000000	0.383338	0.139860
Parch	0.095265	0.023666	-0.187896	0.383338	1.000000	0.206624
Fare	0.266100	-0.552893	0.093143	0.139860	0.206624	1.000000

	Survived	Pclass	Age	SibSp	Parch	Fare	Sex_female	Sex_male	Embarked_C	Embarked_Q	Embarked_S
Survived	1.000000	-0.356462	-0.082446	-0.015523	0.095265	0.266100	0.536762	-0.536762	0.195673	-0.048966	-0.159015
Pclass	-0.356462	1.000000	-0.365902	0.065187	0.023666	-0.552893	-0.150826	0.150826	-0.279194	0.131989	0.197831
Age	-0.082446	-0.365902	1.000000	-0.307351	-0.187896	0.093143	-0.099037	0.099037	0.038268	-0.021693	-0.025431
SibSp	-0.015523	0.065187	-0.307351	1.000000	0.383338	0.139860	0.106296	-0.106296	-0.046227	0.051331	0.018968
Parch	0.095265	0.023666	-0.187896	0.383338	1.000000	0.206624	0.249543	-0.249543	-0.009523	-0.009417	0.013259
Fare	0.266100	-0.552893	0.093143	0.139860	0.206624	1.000000	0.182457	-0.182457	0.301337	-0.062346	-0.250994
Sex_female	0.536762	-0.150826	-0.099037	0.106296	0.249543	0.182457	1.000000	-1.000000	0.103611	0.027256	-0.109078
Sex_male	-0.536762	0.150826	0.099037	-0.106296	-0.249543	-0.182457	-1.000000	1.000000	-0.103611	-0.027256	0.109078
Embarked_C	0.195673	-0.279194	0.038268	-0.046227	-0.009523	0.301337	0.103611	-0.103611	1.000000	-0.095623	-0.884986
Embarked_Q	-0.048966	0.131989	-0.021693	0.051331	-0.009417	-0.062346	0.027256	-0.027256	-0.095623	1.000000	-0.378859
Embarked_S	-0.159015	0.197831	-0.025431	0.018968	0.013259	-0.250994	-0.109078	0.109078	-0.884986	-0.378859	1.000000

Patrick Mugisha, "On my honor, as a student, I have neither given nor received unauthorized aid on this academic work."¶

Correlation Analysis for Data with Categorical Columns (Variables)

Data description (Titanic)¶

Two variables (Embarked, Sex) are missing from this correlation analysis because they are categorical. This is a problem for us, and we need to handle it by encoding them as dummy variables.

Handling Categorical Columns for Correlation Analysis (and Machine learning in the future)¶

now, everything is clear¶

	Survived	Pclass	Age	SibSp	Fare	Sex_female	Sex_male	Embarked_C	Embarked_S
0	0	3	22.0	1	7.2500	0	1	0	1
1	1	1	38.0	1	71.2833	1	0	1	0
2	1	3	26.0	0	7.9250	1	0	0	1
3	1	1	35.0	1	53.1000	1	0	0	1
4	0	3	35.0	0	8.0500	0	1	0	1