ETL represents a broad range of data transformation and processing techniques. ETL involves cleaning and transforming the data for data analysis.

# import python packages
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
# import titanic_train.csv
ti=pd.read_csv("data/titanic_train.csv")
ti.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.00 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 350.00 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | hi | 26.00 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | hello | 35.00 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 0.35 | 0 | 0 | 373450 | 8.0500 | NaN | S |
ti.tail()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.00 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.00 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.45 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
#data types & missing values #info is for the description of your data
ti.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
#dtypes is for the type of data yuo have
ti.dtypes
PassengerId int64 Survived int64 Pclass int64 Name object Sex object Age float64 SibSp int64 Parch int64 Ticket object Fare float64 Cabin object Embarked object dtype: object
Numerical columns: PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
Categorial columns: Name, Sex, Ticket, Cabin, Embarked
There are some missing values (Age, Cabin, Embarked)
This dataset needs data cleaning & transformation
#Describe shows a quick statistic summary of your data
#this is a good way to examine the numerical columns
ti.describe()
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 30.087563 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 18.863424 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.350000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.000000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 350.000000 | 8.000000 | 6.000000 | 512.329200 |
# OR use this method if you want to find individual results
print(ti['Age'].mean())
print(ti['Age'].std())
print(ti['Age'].max())
print(ti['Age'].min())
30.08756302521008 18.86342448744758 350.0 0.35
ti.hist(figsize=(8,8));
There are a number of data quality issues
# describe for a single column
ti['Fare'].describe()
count 891.000000 mean 32.204208 std 49.693429 min 0.000000 25% 7.910400 50% 14.454200 75% 31.000000 max 512.329200 Name: Fare, dtype: float64
# do this for categorical columns
ti['Embarked'].describe()
count 889 unique 3 top S freq 644 Name: Embarked, dtype: object
#pivot in excel
ti.groupby('Embarked').size()
Embarked C 168 Q 77 S 644 dtype: int64
# how many unique values in the Sex column
ti.groupby('Sex').size()
Sex female 312 hello 1 hi 1 male 577 dtype: int64
ti.head(2)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 350.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
# remove an unnecessary column (PassengerId)
ti = ti.drop('PassengerId', axis=1)
ti.head()
| Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.00 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 350.00 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 1 | 3 | Heikkinen, Miss. Laina | hi | 26.00 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | hello | 35.00 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 0 | 3 | Allen, Mr. William Henry | male | 0.35 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# remove multiple columns (Name, Ticket, Cabin)
ti = ti.drop(['Name','Ticket'], axis=1)
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | NaN | S |
| 1 | 1 | 1 | female | 350.00 | 1 | 0 | 71.2833 | C85 | C |
| 2 | 1 | 3 | hi | 26.00 | 0 | 0 | 7.9250 | NaN | S |
| 3 | 1 | 1 | hello | 35.00 | 1 | 0 | 53.1000 | C123 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | NaN | S |
# can you remove Cabin column? (this column is not much useful for further analysis - predictive analytics)
ti=ti.drop(['Cabin'], axis=1)
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 350.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | hi | 26.00 | 0 | 0 | 7.9250 | S |
| 3 | 1 | 1 | hello | 35.00 | 1 | 0 | 53.1000 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | S |
ti.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null int64 1 Pclass 891 non-null int64 2 Sex 891 non-null object 3 Age 714 non-null float64 4 SibSp 891 non-null int64 5 Parch 891 non-null int64 6 Fare 891 non-null float64 7 Embarked 889 non-null object dtypes: float64(2), int64(4), object(2) memory usage: 55.8+ KB
# how many missing values in each column or variable
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 177 SibSp 0 Parch 0 Fare 0 Embarked 2 dtype: int64
There are different ways to handle missing values
Recommended heuristics for handling missing values in real-world datasets
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 350.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | hi | 26.00 | 0 | 0 | 7.9250 | S |
| 3 | 1 | 1 | hello | 35.00 | 1 | 0 | 53.1000 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | S |
len(ti)
891
#handling missing value: remove the rows with any missing value
ti_rev = ti.dropna()
ti_rev.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 712 entries, 0 to 890 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 712 non-null int64 1 Pclass 712 non-null int64 2 Sex 712 non-null object 3 Age 712 non-null float64 4 SibSp 712 non-null int64 5 Parch 712 non-null int64 6 Fare 712 non-null float64 7 Embarked 712 non-null object dtypes: float64(2), int64(4), object(2) memory usage: 50.1+ KB
ti_rev.head(2)
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 350.0 | 1 | 0 | 71.2833 | C |
ti_rev.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
Now, the dataset has 712 rows. No rows with missing values. This is an aggressive approach.
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 177 SibSp 0 Parch 0 Fare 0 Embarked 2 dtype: int64
ti['Embarked'].describe()
count 889 unique 3 top S freq 644 Name: Embarked, dtype: object
# replace null value with the most popular value in a categorial columns
ti = ti.fillna({'Embarked': 'S'})
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 177 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
ti['Age'].median()
28.0
# replace null value with the median value
ti = ti.fillna({'Age': 28})
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
#OR MUCH BETTER APROACH (USE THIS)
# replace null value with the median value
ti = ti.fillna({'Age': ti['Age'].mean()})
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
len(ti)
891
# what would you do to replace null values with the mean value
ti=ti.fillna({'Age': ti['Age'].mean()})
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
#we have 4 different types of gender info
ti['Sex'].describe()
count 891 unique 4 top male freq 577 Name: Sex, dtype: object
#hello and hi should not be considered a type of gender
ti['Sex'].value_counts()
male 577 female 312 hi 1 hello 1 Name: Sex, dtype: int64
# hello in Sex is inaccurate ...replacing inaccurate value with null value
ti = ti.replace('hello', np.nan)
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 350.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | hi | 26.00 | 0 | 0 | 7.9250 | S |
| 3 | 1 | 1 | NaN | 35.00 | 1 | 0 | 53.1000 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | S |
ti = ti.dropna()
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 350.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | hi | 26.00 | 0 | 0 | 7.9250 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | S |
| 5 | 0 | 3 | male | 28.00 | 0 | 0 | 8.4583 | Q |
# 350 is an inaccurate value for age and should be replaced by 35
# changinge single value with replace
ti = ti.replace({'Age': 350}, {'Age': 35})
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 35.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | hi | 26.00 | 0 | 0 | 7.9250 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | S |
| 5 | 0 | 3 | male | 28.00 | 0 | 0 | 8.4583 | Q |
# instead of replacing hello with null value and removing the row,
# can you replace hi with the most popular value (male) in the categorical column?
ti=ti.replace('hi', 'male')
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | female | 35.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | male | 26.00 | 0 | 0 | 7.9250 | S |
| 4 | 0 | 3 | male | 0.35 | 0 | 0 | 8.0500 | S |
| 5 | 0 | 3 | male | 28.00 | 0 | 0 | 8.4583 | Q |
ti['Sex'].value_counts()
male 578 female 312 Name: Sex, dtype: int64
#above cell is the same as pivot
ti.groupby('Sex').size()
Sex female 312 male 578 dtype: int64
# Now, there should be male and female in the Sex column
# chaing categorical values with map
ti['Sex'] = ti['Sex'].map({'female': 0, 'male': 1})
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | S |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | C |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | S |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | S |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | Q |
# when you analyze data, it is better to transform categorical values to numerical values
# can you convert S, C, Q (Embarked) to 1, 2, 3
ti['Embarked']=ti['Embarked'].map({'S': 1, 'C': 2, 'Q':3})
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 |
ti.isnull().sum()
Survived 0 Pclass 0 Sex 0 Age 0 SibSp 0 Parch 0 Fare 0 Embarked 0 dtype: int64
After applying multiple techniques of data cleaning, the data looks good now. Perhaps, this cleaned data can be used for analysis & builing predictive model. The predictive model would predict who is likely to survive, while using Survived as y value
# if the data cleaning is done, you may want to export it as csv or excel
ti.to_csv("data/ti_cleaned.csv")
ti.to_excel("data/ti_cleaned.xlsx")
# create new columns
ti['CurrentAge'] = ti['Age'] + 90
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 |
# can you create a new column ("discount") based on Fare?
# discount = Fare * 0.25
ti['discount']=ti['Fare'] * 0.25
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | discount | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 | 1.812500 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 | 17.820825 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 | 1.981250 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 | 2.012500 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 | 2.114575 |
# create a copy of existing column
ti['DeparturePort'] = ti['Embarked']
ti.head()
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | discount | DeparturePort | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 | 1.812500 | 1 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 | 17.820825 | 2 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 | 1.981250 | 1 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 | 2.012500 | 1 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 | 2.114575 | 3 |
# rename columns
ti = ti.rename(columns={'Pclass': 'PassengerClass'})
ti.head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | discount | DeparturePort | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 | 1.812500 | 1 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 | 17.820825 | 2 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 | 1.981250 | 1 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 | 2.012500 | 1 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 | 2.114575 | 3 |
# Can you rename discount as Disc
ti=ti.rename(columns={'discount':'Disc'})
ti.head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 | 1.812500 | 1 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 | 17.820825 | 2 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 | 1.981250 | 1 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 | 2.012500 | 1 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 | 2.114575 | 3 |
# create bins (discretize)
ti['Fare_Category'] = pd.qcut(ti['Fare'], 5)
ti.head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 | 1.812500 | 1 | (-0.001, 7.854] |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 | 17.820825 | 2 | (39.688, 512.329] |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 | 1.981250 | 1 | (7.854, 10.5] |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 | 2.012500 | 1 | (7.854, 10.5] |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 | 2.114575 | 3 | (7.854, 10.5] |
ti['Fare'].describe()
count 890.000000 mean 32.180730 std 49.716425 min 0.000000 25% 7.903100 50% 14.454200 75% 31.000000 max 512.329200 Name: Fare, dtype: float64
ti.groupby('Fare_Category').size()
Fare_Category (-0.001, 7.854] 179 (7.854, 10.5] 184 (10.5, 21.317] 171 (21.317, 39.688] 181 (39.688, 512.329] 175 dtype: int64
#rename the name of those bins to 1, 2, 3, 4, 5
ti['Fare_Category_Labels'] = pd.qcut(ti['Fare'], 5, labels=['1', '2', '3', '4', '5'])
ti.head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 22.00 | 1 | 0 | 7.2500 | 1 | 112.00 | 1.812500 | 1 | (-0.001, 7.854] | 1 |
| 1 | 1 | 1 | 0 | 35.00 | 1 | 0 | 71.2833 | 2 | 125.00 | 17.820825 | 2 | (39.688, 512.329] | 5 |
| 2 | 1 | 3 | 1 | 26.00 | 0 | 0 | 7.9250 | 1 | 116.00 | 1.981250 | 1 | (7.854, 10.5] | 2 |
| 4 | 0 | 3 | 1 | 0.35 | 0 | 0 | 8.0500 | 1 | 90.35 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 5 | 0 | 3 | 1 | 28.00 | 0 | 0 | 8.4583 | 3 | 118.00 | 2.114575 | 3 | (7.854, 10.5] | 2 |
ti.dtypes
Survived int64 PassengerClass int64 Sex int64 Age float64 SibSp int64 Parch int64 Fare float64 Embarked int64 CurrentAge float64 Disc float64 DeparturePort int64 Fare_Category category Fare_Category_Labels category dtype: object
#Survived is categorical data type. What if I want to change it to integer
ti['Survived'] = ti['Survived'].astype(int)
ti.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 890 entries, 0 to 890 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 890 non-null int32 1 PassengerClass 890 non-null int64 2 Sex 890 non-null int64 3 Age 890 non-null float64 4 SibSp 890 non-null int64 5 Parch 890 non-null int64 6 Fare 890 non-null float64 7 Embarked 890 non-null int64 8 CurrentAge 890 non-null float64 9 Disc 890 non-null float64 10 DeparturePort 890 non-null int64 11 Fare_Category 890 non-null category 12 Fare_Category_Labels 890 non-null category dtypes: category(2), float64(4), int32(1), int64(6) memory usage: 82.1 KB
# filtering or selecting columns
#selecting two columns and assigning to a new variable or dataset called "ti_2"
ti_2 = ti[['Survived','Age']]
ti_2.head()
| Survived | Age | |
|---|---|---|
| 0 | 0 | 22.00 |
| 1 | 1 | 35.00 |
| 2 | 1 | 26.00 |
| 4 | 0 | 0.35 |
| 5 | 0 | 28.00 |
ti_2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 890 entries, 0 to 890 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 890 non-null int32 1 Age 890 non-null float64 dtypes: float64(1), int32(1) memory usage: 17.4 KB
# can you select two other columns. use a different variable name (ti_3)
ti_3=ti[['CurrentAge', 'Disc']]
ti_3.head()
| CurrentAge | Disc | |
|---|---|---|
| 0 | 112.00 | 1.812500 |
| 1 | 125.00 | 17.820825 |
| 2 | 116.00 | 1.981250 |
| 4 | 90.35 | 2.012500 |
| 5 | 118.00 | 2.114575 |
# Filtering or Selecting rows
#single criterion
print(len(ti[(ti['Fare'] > 200)]))
20
#if you want to see a list of people
ti[(ti['Fare'] > 200)].head(2)
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27 | 0 | 1 | 1 | 19.0 | 3 | 2 | 263.0 | 1 | 109.0 | 65.75 | 1 | (39.688, 512.329] | 5 |
| 88 | 1 | 1 | 0 | 23.0 | 3 | 2 | 263.0 | 1 | 113.0 | 65.75 | 1 | (39.688, 512.329] | 5 |
#Multiple conditions have to be combined using &
len(ti[(ti['Fare'] > 200) & (ti['Survived'] == 1)])
14
#Multiple conditions have to be combined using &
print(len(ti[(ti['Fare'] < 20)]))
print(len(ti[(ti['Fare'] < 20) & (ti['Survived'] == 1)]))
515 143
(len(ti[(ti['Fare'] < 20) & (ti['Survived'] == 1)])) / (len(ti[(ti['Fare'] < 20)]))
0.27766990291262134
Some passengers in upper class did not survived.
# can you create a filter using a criterion?
# a list of females that survived
ti[(ti['Sex'] == 0) & (ti['Survived'] == 1)].head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | 0 | 35.0 | 1 | 0 | 71.2833 | 2 | 125.0 | 17.820825 | 2 | (39.688, 512.329] | 5 |
| 8 | 1 | 3 | 0 | 27.0 | 0 | 2 | 11.1333 | 1 | 117.0 | 2.783325 | 1 | (10.5, 21.317] | 3 |
| 9 | 1 | 2 | 0 | 14.0 | 1 | 0 | 30.0708 | 2 | 104.0 | 7.517700 | 2 | (21.317, 39.688] | 4 |
| 10 | 1 | 3 | 0 | 4.0 | 1 | 1 | 16.7000 | 1 | 94.0 | 4.175000 | 1 | (10.5, 21.317] | 3 |
| 11 | 1 | 1 | 0 | 58.0 | 0 | 0 | 26.5500 | 1 | 148.0 | 6.637500 | 1 | (21.317, 39.688] | 4 |
len(ti)
890
# sampling data randomly
# sampling 10 rows only (different samples for every time you run this cell)
ti_10 = ti.sample(10)
ti_10
# to have the same samples for every time
# ti.sample(10, random_state = 1)
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 259 | 1 | 2 | 0 | 50.0 | 0 | 1 | 26.0000 | 1 | 140.0 | 6.500000 | 1 | (21.317, 39.688] | 4 |
| 578 | 0 | 3 | 0 | 28.0 | 1 | 0 | 14.4583 | 2 | 118.0 | 3.614575 | 2 | (10.5, 21.317] | 3 |
| 223 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.8958 | 1 | 118.0 | 1.973950 | 1 | (7.854, 10.5] | 2 |
| 598 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.2250 | 2 | 118.0 | 1.806250 | 2 | (-0.001, 7.854] | 1 |
| 356 | 1 | 1 | 0 | 22.0 | 0 | 1 | 55.0000 | 1 | 112.0 | 13.750000 | 1 | (39.688, 512.329] | 5 |
| 770 | 0 | 3 | 1 | 24.0 | 0 | 0 | 9.5000 | 1 | 114.0 | 2.375000 | 1 | (7.854, 10.5] | 2 |
| 270 | 0 | 1 | 1 | 28.0 | 0 | 0 | 31.0000 | 1 | 118.0 | 7.750000 | 1 | (21.317, 39.688] | 4 |
| 112 | 0 | 3 | 1 | 22.0 | 0 | 0 | 8.0500 | 1 | 112.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 807 | 0 | 3 | 0 | 18.0 | 0 | 0 | 7.7750 | 1 | 108.0 | 1.943750 | 1 | (-0.001, 7.854] | 1 |
| 682 | 0 | 3 | 1 | 20.0 | 0 | 0 | 9.2250 | 1 | 110.0 | 2.306250 | 1 | (7.854, 10.5] | 2 |
# sort data by column
#sorting: ascending=True is the default
ti_10.sort_values('Fare')
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 598 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.2250 | 2 | 118.0 | 1.806250 | 2 | (-0.001, 7.854] | 1 |
| 807 | 0 | 3 | 0 | 18.0 | 0 | 0 | 7.7750 | 1 | 108.0 | 1.943750 | 1 | (-0.001, 7.854] | 1 |
| 223 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.8958 | 1 | 118.0 | 1.973950 | 1 | (7.854, 10.5] | 2 |
| 112 | 0 | 3 | 1 | 22.0 | 0 | 0 | 8.0500 | 1 | 112.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 682 | 0 | 3 | 1 | 20.0 | 0 | 0 | 9.2250 | 1 | 110.0 | 2.306250 | 1 | (7.854, 10.5] | 2 |
| 770 | 0 | 3 | 1 | 24.0 | 0 | 0 | 9.5000 | 1 | 114.0 | 2.375000 | 1 | (7.854, 10.5] | 2 |
| 578 | 0 | 3 | 0 | 28.0 | 1 | 0 | 14.4583 | 2 | 118.0 | 3.614575 | 2 | (10.5, 21.317] | 3 |
| 259 | 1 | 2 | 0 | 50.0 | 0 | 1 | 26.0000 | 1 | 140.0 | 6.500000 | 1 | (21.317, 39.688] | 4 |
| 270 | 0 | 1 | 1 | 28.0 | 0 | 0 | 31.0000 | 1 | 118.0 | 7.750000 | 1 | (21.317, 39.688] | 4 |
| 356 | 1 | 1 | 0 | 22.0 | 0 | 1 | 55.0000 | 1 | 112.0 | 13.750000 | 1 | (39.688, 512.329] | 5 |
#by= is optional
ti_10.sort_values(by=['Fare'], ascending=False).head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 356 | 1 | 1 | 0 | 22.0 | 0 | 1 | 55.0000 | 1 | 112.0 | 13.750000 | 1 | (39.688, 512.329] | 5 |
| 270 | 0 | 1 | 1 | 28.0 | 0 | 0 | 31.0000 | 1 | 118.0 | 7.750000 | 1 | (21.317, 39.688] | 4 |
| 259 | 1 | 2 | 0 | 50.0 | 0 | 1 | 26.0000 | 1 | 140.0 | 6.500000 | 1 | (21.317, 39.688] | 4 |
| 578 | 0 | 3 | 0 | 28.0 | 1 | 0 | 14.4583 | 2 | 118.0 | 3.614575 | 2 | (10.5, 21.317] | 3 |
| 770 | 0 | 3 | 1 | 24.0 | 0 | 0 | 9.5000 | 1 | 114.0 | 2.375000 | 1 | (7.854, 10.5] | 2 |
# Ascending by first column, then descending by second column
ti_10.sort_values(by=['Survived','Age'], ascending=[1,0]).head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 578 | 0 | 3 | 0 | 28.0 | 1 | 0 | 14.4583 | 2 | 118.0 | 3.614575 | 2 | (10.5, 21.317] | 3 |
| 223 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.8958 | 1 | 118.0 | 1.973950 | 1 | (7.854, 10.5] | 2 |
| 598 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.2250 | 2 | 118.0 | 1.806250 | 2 | (-0.001, 7.854] | 1 |
| 270 | 0 | 1 | 1 | 28.0 | 0 | 0 | 31.0000 | 1 | 118.0 | 7.750000 | 1 | (21.317, 39.688] | 4 |
| 770 | 0 | 3 | 1 | 24.0 | 0 | 0 | 9.5000 | 1 | 114.0 | 2.375000 | 1 | (7.854, 10.5] | 2 |
# you can expport this sample data
ti_10.to_csv("data/ti_10.csv")
ti_10.to_excel("data/ti_10.xlsx")
# can you sample 50 rows?
ti_50 = ti.sample(50)
ti_50
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 510 | 1 | 3 | 1 | 29.0 | 0 | 0 | 7.7500 | 3 | 119.0 | 1.937500 | 3 | (-0.001, 7.854] | 1 |
| 869 | 1 | 3 | 1 | 4.0 | 1 | 1 | 11.1333 | 1 | 94.0 | 2.783325 | 1 | (10.5, 21.317] | 3 |
| 150 | 0 | 2 | 1 | 51.0 | 0 | 0 | 12.5250 | 1 | 141.0 | 3.131250 | 1 | (10.5, 21.317] | 3 |
| 136 | 1 | 1 | 0 | 19.0 | 0 | 2 | 26.2833 | 1 | 109.0 | 6.570825 | 1 | (21.317, 39.688] | 4 |
| 836 | 0 | 3 | 1 | 21.0 | 0 | 0 | 8.6625 | 1 | 111.0 | 2.165625 | 1 | (7.854, 10.5] | 2 |
| 16 | 0 | 3 | 1 | 2.0 | 4 | 1 | 29.1250 | 3 | 92.0 | 7.281250 | 3 | (21.317, 39.688] | 4 |
| 33 | 0 | 2 | 1 | 66.0 | 0 | 0 | 10.5000 | 1 | 156.0 | 2.625000 | 1 | (7.854, 10.5] | 2 |
| 22 | 1 | 3 | 0 | 15.0 | 0 | 0 | 8.0292 | 3 | 105.0 | 2.007300 | 3 | (7.854, 10.5] | 2 |
| 576 | 1 | 2 | 0 | 34.0 | 0 | 0 | 13.0000 | 1 | 124.0 | 3.250000 | 1 | (10.5, 21.317] | 3 |
| 212 | 0 | 3 | 1 | 22.0 | 0 | 0 | 7.2500 | 1 | 112.0 | 1.812500 | 1 | (-0.001, 7.854] | 1 |
| 28 | 1 | 3 | 0 | 28.0 | 0 | 0 | 7.8792 | 3 | 118.0 | 1.969800 | 3 | (7.854, 10.5] | 2 |
| 399 | 1 | 2 | 0 | 28.0 | 0 | 0 | 12.6500 | 1 | 118.0 | 3.162500 | 1 | (10.5, 21.317] | 3 |
| 574 | 0 | 3 | 1 | 16.0 | 0 | 0 | 8.0500 | 1 | 106.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 453 | 1 | 1 | 1 | 49.0 | 1 | 0 | 89.1042 | 2 | 139.0 | 22.276050 | 2 | (39.688, 512.329] | 5 |
| 889 | 1 | 1 | 1 | 26.0 | 0 | 0 | 30.0000 | 2 | 116.0 | 7.500000 | 2 | (21.317, 39.688] | 4 |
| 274 | 1 | 3 | 0 | 28.0 | 0 | 0 | 7.7500 | 3 | 118.0 | 1.937500 | 3 | (-0.001, 7.854] | 1 |
| 199 | 0 | 2 | 0 | 24.0 | 0 | 0 | 13.0000 | 1 | 114.0 | 3.250000 | 1 | (10.5, 21.317] | 3 |
| 206 | 0 | 3 | 1 | 32.0 | 1 | 0 | 15.8500 | 1 | 122.0 | 3.962500 | 1 | (10.5, 21.317] | 3 |
| 533 | 1 | 3 | 0 | 28.0 | 0 | 2 | 22.3583 | 2 | 118.0 | 5.589575 | 2 | (21.317, 39.688] | 4 |
| 722 | 0 | 2 | 1 | 34.0 | 0 | 0 | 13.0000 | 1 | 124.0 | 3.250000 | 1 | (10.5, 21.317] | 3 |
| 324 | 0 | 3 | 1 | 28.0 | 8 | 2 | 69.5500 | 1 | 118.0 | 17.387500 | 1 | (39.688, 512.329] | 5 |
| 109 | 1 | 3 | 0 | 28.0 | 1 | 0 | 24.1500 | 3 | 118.0 | 6.037500 | 3 | (21.317, 39.688] | 4 |
| 90 | 0 | 3 | 1 | 29.0 | 0 | 0 | 8.0500 | 1 | 119.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 445 | 1 | 1 | 1 | 4.0 | 0 | 2 | 81.8583 | 1 | 94.0 | 20.464575 | 1 | (39.688, 512.329] | 5 |
| 708 | 1 | 1 | 0 | 22.0 | 0 | 0 | 151.5500 | 1 | 112.0 | 37.887500 | 1 | (39.688, 512.329] | 5 |
| 393 | 1 | 1 | 0 | 23.0 | 1 | 0 | 113.2750 | 2 | 113.0 | 28.318750 | 2 | (39.688, 512.329] | 5 |
| 741 | 0 | 1 | 1 | 36.0 | 1 | 0 | 78.8500 | 1 | 126.0 | 19.712500 | 1 | (39.688, 512.329] | 5 |
| 712 | 1 | 1 | 1 | 48.0 | 1 | 0 | 52.0000 | 1 | 138.0 | 13.000000 | 1 | (39.688, 512.329] | 5 |
| 736 | 0 | 3 | 0 | 48.0 | 1 | 3 | 34.3750 | 1 | 138.0 | 8.593750 | 1 | (21.317, 39.688] | 4 |
| 668 | 0 | 3 | 1 | 43.0 | 0 | 0 | 8.0500 | 1 | 133.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 634 | 0 | 3 | 0 | 9.0 | 3 | 2 | 27.9000 | 1 | 99.0 | 6.975000 | 1 | (21.317, 39.688] | 4 |
| 17 | 1 | 2 | 1 | 28.0 | 0 | 0 | 13.0000 | 1 | 118.0 | 3.250000 | 1 | (10.5, 21.317] | 3 |
| 41 | 0 | 2 | 0 | 27.0 | 1 | 0 | 21.0000 | 1 | 117.0 | 5.250000 | 1 | (10.5, 21.317] | 3 |
| 356 | 1 | 1 | 0 | 22.0 | 0 | 1 | 55.0000 | 1 | 112.0 | 13.750000 | 1 | (39.688, 512.329] | 5 |
| 104 | 0 | 3 | 1 | 37.0 | 2 | 0 | 7.9250 | 1 | 127.0 | 1.981250 | 1 | (7.854, 10.5] | 2 |
| 87 | 0 | 3 | 1 | 28.0 | 0 | 0 | 8.0500 | 1 | 118.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 13 | 0 | 3 | 1 | 39.0 | 1 | 5 | 31.2750 | 1 | 129.0 | 7.818750 | 1 | (21.317, 39.688] | 4 |
| 621 | 1 | 1 | 1 | 42.0 | 1 | 0 | 52.5542 | 1 | 132.0 | 13.138550 | 1 | (39.688, 512.329] | 5 |
| 481 | 0 | 2 | 1 | 28.0 | 0 | 0 | 0.0000 | 1 | 118.0 | 0.000000 | 1 | (-0.001, 7.854] | 1 |
| 695 | 0 | 2 | 1 | 52.0 | 0 | 0 | 13.5000 | 1 | 142.0 | 3.375000 | 1 | (10.5, 21.317] | 3 |
| 619 | 0 | 2 | 1 | 26.0 | 0 | 0 | 10.5000 | 1 | 116.0 | 2.625000 | 1 | (7.854, 10.5] | 2 |
| 213 | 0 | 2 | 1 | 30.0 | 0 | 0 | 13.0000 | 1 | 120.0 | 3.250000 | 1 | (10.5, 21.317] | 3 |
| 26 | 0 | 3 | 1 | 28.0 | 0 | 0 | 7.2250 | 2 | 118.0 | 1.806250 | 2 | (-0.001, 7.854] | 1 |
| 441 | 0 | 3 | 1 | 20.0 | 0 | 0 | 9.5000 | 1 | 110.0 | 2.375000 | 1 | (7.854, 10.5] | 2 |
| 550 | 1 | 1 | 1 | 17.0 | 0 | 2 | 110.8833 | 2 | 107.0 | 27.720825 | 2 | (39.688, 512.329] | 5 |
| 624 | 0 | 3 | 1 | 21.0 | 0 | 0 | 16.1000 | 1 | 111.0 | 4.025000 | 1 | (10.5, 21.317] | 3 |
| 349 | 0 | 3 | 1 | 42.0 | 0 | 0 | 8.6625 | 1 | 132.0 | 2.165625 | 1 | (7.854, 10.5] | 2 |
| 617 | 0 | 3 | 0 | 26.0 | 1 | 0 | 16.1000 | 1 | 116.0 | 4.025000 | 1 | (10.5, 21.317] | 3 |
| 482 | 0 | 3 | 1 | 50.0 | 0 | 0 | 8.0500 | 1 | 140.0 | 2.012500 | 1 | (7.854, 10.5] | 2 |
| 795 | 0 | 2 | 1 | 39.0 | 0 | 0 | 13.0000 | 1 | 129.0 | 3.250000 | 1 | (10.5, 21.317] | 3 |
# apply sorting (descending, ascending)
ti_50.sort_values(by=['Survived','CurrentAge'], ascending=[0,1]).head()
| Survived | PassengerClass | Sex | Age | SibSp | Parch | Fare | Embarked | CurrentAge | Disc | DeparturePort | Fare_Category | Fare_Category_Labels | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 869 | 1 | 3 | 1 | 4.0 | 1 | 1 | 11.1333 | 1 | 94.0 | 2.783325 | 1 | (10.5, 21.317] | 3 |
| 445 | 1 | 1 | 1 | 4.0 | 0 | 2 | 81.8583 | 1 | 94.0 | 20.464575 | 1 | (39.688, 512.329] | 5 |
| 22 | 1 | 3 | 0 | 15.0 | 0 | 0 | 8.0292 | 3 | 105.0 | 2.007300 | 3 | (7.854, 10.5] | 2 |
| 550 | 1 | 1 | 1 | 17.0 | 0 | 2 | 110.8833 | 2 | 107.0 | 27.720825 | 2 | (39.688, 512.329] | 5 |
| 136 | 1 | 1 | 0 | 19.0 | 0 | 2 | 26.2833 | 1 | 109.0 | 6.570825 | 1 | (21.317, 39.688] | 4 |