from IPython.display import YouTubeVideo
# Build an 800x500 embedded player for the YouTube video with ID 'iANv_0ZQKDY'.
# NOTE(review): the player is rendered only if this call is the last expression
# of its notebook cell - cell boundaries are not visible in this export.
YouTubeVideo('iANv_0ZQKDY', width=800, height=500)
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Titanic training data from a CSV path that is relative to the
# current working directory, then print the first rows as a quick sanity check.
df=pd.read_csv("data/titanic_train.csv")
print(df.head())
PassengerId Survived Pclass \ 0 1 0 3 1 2 1 1 2 3 1 3 3 4 1 1 4 5 0 3 Name Sex Age SibSp \ 0 Braund, Mr. Owen Harris male 22.0 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 2 Heikkinen, Miss. Laina female 26.0 0 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 4 Allen, Mr. William Henry male 35.0 0 Parch Ticket Fare Cabin Embarked 0 0 A/5 21171 7.2500 NaN S 1 0 PC 17599 71.2833 C85 C 2 0 STON/O2. 3101282 7.9250 NaN S 3 0 113803 53.1000 C123 S 4 0 373450 8.0500 NaN S
# Drop the columns that are not used in the analysis below:
#   - Name, PassengerId: dropped together with the others (no reason was given)
#   - Ticket: ticket numbers carry no meaning for this analysis
#   - Cabin: too many missing values
df = df.drop(['Name', 'PassengerId', 'Ticket','Cabin'], axis=1)
# Show dtypes and non-null counts of the remaining columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null int64 1 Pclass 891 non-null int64 2 Sex 891 non-null object 3 Age 714 non-null float64 4 SibSp 891 non-null int64 5 Parch 891 non-null int64 6 Fare 891 non-null float64 7 Embarked 889 non-null object dtypes: float64(2), int64(4), object(2) memory usage: 55.8+ KB
# Count missing values per column. Cabin has already been dropped at this
# point; Age has many missing values and Embarked has a few.
df.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 177 SibSp 0 Parch 0 Fare 0 Embarked 2 dtype: int64
# Drop every row that has a missing value in ANY column, not only in Age.
# This removes 179 rows (891 -> 712): the 177 rows with a missing Age plus the
# 2 rows with a missing Embarked.
df = df.dropna()
# Confirm the new row count and that every column is fully populated.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 712 entries, 0 to 890 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 712 non-null int64 1 Pclass 712 non-null int64 2 Sex 712 non-null object 3 Age 712 non-null float64 4 SibSp 712 non-null int64 5 Parch 712 non-null int64 6 Fare 712 non-null float64 7 Embarked 712 non-null object dtypes: float64(2), int64(4), object(2) memory usage: 50.1+ KB
# Re-check the per-column missing-value counts; after dropna() every count
# is expected to be zero.
df.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
# Dtypes and non-null counts of the cleaned frame. Sex and Embarked are still
# object (string) columns here. The frame has not been modified since the
# previous df.info() call, so the report is the same.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 712 entries, 0 to 890 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 712 non-null int64 1 Pclass 712 non-null int64 2 Sex 712 non-null object 3 Age 712 non-null float64 4 SibSp 712 non-null int64 5 Parch 712 non-null int64 6 Fare 712 non-null float64 7 Embarked 712 non-null object dtypes: float64(2), int64(4), object(2) memory usage: 50.1+ KB
# Correlation analysis on the numeric columns only.
# Sex and Embarked are still string (object) columns at this point, so they
# are excluded explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() and raises a ValueError instead. Selecting the
# numeric columns first gives the same table on old and new pandas versions.
# (This is an ordinary pairwise correlation over a subset of the columns, not
# a "partial correlation" in the statistical sense.)
df.select_dtypes(include='number').corr()
Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|
Survived | 1.000000 | -0.356462 | -0.082446 | -0.015523 | 0.095265 | 0.266100 |
Pclass | -0.356462 | 1.000000 | -0.365902 | 0.065187 | 0.023666 | -0.552893 |
Age | -0.082446 | -0.365902 | 1.000000 | -0.307351 | -0.187896 | 0.093143 |
SibSp | -0.015523 | 0.065187 | -0.307351 | 1.000000 | 0.383338 | 0.139860 |
Parch | 0.095265 | 0.023666 | -0.187896 | 0.383338 | 1.000000 | 0.206624 |
Fare | 0.266100 | -0.552893 | 0.093143 | 0.139860 | 0.206624 | 1.000000 |
# Distinct values in the Sex column.
# The commented-out alternatives are related but not identical: they list the
# same categories and additionally report how many rows fall into each one.
#df['Sex'].value_counts()
df['Sex'].unique()
#df.groupby('Sex').size()
array(['male', 'female'], dtype=object)
# Count the rows for each distinct Embarked value (uses value_counts, not groupby).
df['Embarked'].value_counts()
# Port codes: C = Cherbourg; Q = Queenstown; S = Southampton
S 554 C 130 Q 28 Name: Embarked, dtype: int64
# Reference: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html
# One-hot encode the categorical columns: Sex and Embarked are each replaced by
# one indicator column per category (Sex_female, Sex_male, Embarked_C, ...).
# NOTE(review): every category is kept (drop_first is not used), so the
# indicator columns of one variable are linearly dependent
# (e.g. Sex_female == 1 - Sex_male).
df = pd.get_dummies(df, columns=['Sex', 'Embarked'])
# Preview the encoded frame.
df.head()
Survived | Pclass | Age | SibSp | Parch | Fare | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | 22.0 | 1 | 0 | 7.2500 | 0 | 1 | 0 | 0 | 1 |
1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 1 | 0 | 1 | 0 | 0 |
2 | 1 | 3 | 26.0 | 0 | 0 | 7.9250 | 1 | 0 | 0 | 0 | 1 |
3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 1 | 0 | 0 | 0 | 1 |
4 | 0 | 3 | 35.0 | 0 | 0 | 8.0500 | 0 | 1 | 0 | 0 | 1 |
# Correlation matrix again, now that Sex and Embarked have been replaced by
# indicator columns and therefore take part in the computation.
df.corr()
Survived | Pclass | Age | SibSp | Parch | Fare | Sex_female | Sex_male | Embarked_C | Embarked_Q | Embarked_S | |
---|---|---|---|---|---|---|---|---|---|---|---|
Survived | 1.000000 | -0.356462 | -0.082446 | -0.015523 | 0.095265 | 0.266100 | 0.536762 | -0.536762 | 0.195673 | -0.048966 | -0.159015 |
Pclass | -0.356462 | 1.000000 | -0.365902 | 0.065187 | 0.023666 | -0.552893 | -0.150826 | 0.150826 | -0.279194 | 0.131989 | 0.197831 |
Age | -0.082446 | -0.365902 | 1.000000 | -0.307351 | -0.187896 | 0.093143 | -0.099037 | 0.099037 | 0.038268 | -0.021693 | -0.025431 |
SibSp | -0.015523 | 0.065187 | -0.307351 | 1.000000 | 0.383338 | 0.139860 | 0.106296 | -0.106296 | -0.046227 | 0.051331 | 0.018968 |
Parch | 0.095265 | 0.023666 | -0.187896 | 0.383338 | 1.000000 | 0.206624 | 0.249543 | -0.249543 | -0.009523 | -0.009417 | 0.013259 |
Fare | 0.266100 | -0.552893 | 0.093143 | 0.139860 | 0.206624 | 1.000000 | 0.182457 | -0.182457 | 0.301337 | -0.062346 | -0.250994 |
Sex_female | 0.536762 | -0.150826 | -0.099037 | 0.106296 | 0.249543 | 0.182457 | 1.000000 | -1.000000 | 0.103611 | 0.027256 | -0.109078 |
Sex_male | -0.536762 | 0.150826 | 0.099037 | -0.106296 | -0.249543 | -0.182457 | -1.000000 | 1.000000 | -0.103611 | -0.027256 | 0.109078 |
Embarked_C | 0.195673 | -0.279194 | 0.038268 | -0.046227 | -0.009523 | 0.301337 | 0.103611 | -0.103611 | 1.000000 | -0.095623 | -0.884986 |
Embarked_Q | -0.048966 | 0.131989 | -0.021693 | 0.051331 | -0.009417 | -0.062346 | 0.027256 | -0.027256 | -0.095623 | 1.000000 | -0.378859 |
Embarked_S | -0.159015 | 0.197831 | -0.025431 | 0.018968 | 0.013259 | -0.250994 | -0.109078 | 0.109078 | -0.884986 | -0.378859 | 1.000000 |
# Plot the correlation matrix of all columns as a heatmap on an 8x8-inch figure.
plt.figure(figsize=(8, 8))
sns.heatmap(df.corr())
<AxesSubplot:>