Name: Patrick Mugisha
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px #if you don't have this, install "pip install plotly"
# regression
import sklearn.linear_model as lm
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
#regression analysis (statistics)
import statsmodels.api as sm
from statsmodels.formula.api import ols
# validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# regression feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import RFE
# pca
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
import sklearn
import statsmodels.api as sm
%matplotlib inline
# Classifiers
#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import export_text
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import SVG
#from graphviz import Source
from IPython.display import display
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier
#for validating your classification model
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
# grid search
from sklearn.model_selection import GridSearchCV
#clustering
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import ward_tree
from scipy.cluster.hierarchy import dendrogram, linkage, ward
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist
from scipy import stats
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics
#clustering plot
from pandas.plotting import parallel_coordinates
#3d chart clustering
from mpl_toolkits.mplot3d import Axes3D
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")
The goal of this project is to predict movie success (the column named “imdb_score”).
Data understanding & transformation: data quality issues and your solutions
#opening the csv file
df = pd.read_csv("data/movie_metadata.csv")
df.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | facenumber_in_poster | plot_keywords | movie_imdb_link | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | Avatar | 886204 | 4834 | Wes Studi | 0.0 | avatar|future|marine|native|paraplegic | http://www.imdb.com/title/tt0499549/?ref_=fn_t... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | Pirates of the Caribbean: At World's End | 471220 | 48350 | Jack Davenport | 0.0 | goddess|marriage ceremony|marriage proposal|pi... | http://www.imdb.com/title/tt0449088/?ref_=fn_t... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | Spectre | 275868 | 11700 | Stephanie Sigman | 1.0 | bomb|espionage|sequel|spy|terrorist | http://www.imdb.com/title/tt2379713/?ref_=fn_t... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | The Dark Knight Rises | 1144337 | 106759 | Joseph Gordon-Levitt | 0.0 | deception|imprisonment|lawlessness|police offi... | http://www.imdb.com/title/tt1345836/?ref_=fn_t... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | Doug Walker | Star Wars: Episode VII - The Force Awakens ... | 8 | 143 | NaN | 0.0 | NaN | http://www.imdb.com/title/tt5289954/?ref_=fn_t... | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | NaN | 0 |
#number of rows and columns
df.shape
(5043, 28)
Describe data
The goal of this project is to predict movie success while using "imdb_score" as the y variable and potentially 27 other variables acting as predictors from a total of 5043 movies ranging from the year 1916 to 2016.
#basic statistics
df.describe()
num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_1_facebook_likes | gross | num_voted_users | cast_total_facebook_likes | facenumber_in_poster | num_user_for_reviews | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4993.000000 | 5028.000000 | 4939.000000 | 5020.000000 | 5036.000000 | 4.159000e+03 | 5.043000e+03 | 5043.000000 | 5030.000000 | 5022.000000 | 4.551000e+03 | 4935.000000 | 5030.000000 | 5043.000000 | 4714.000000 | 5043.000000 |
mean | 140.194272 | 107.201074 | 686.509212 | 645.009761 | 6560.047061 | 4.846841e+07 | 8.366816e+04 | 9699.063851 | 1.371173 | 272.770808 | 3.975262e+07 | 2002.470517 | 1651.754473 | 6.442138 | 2.220403 | 7525.964505 |
std | 121.601675 | 25.197441 | 2813.328607 | 1665.041728 | 15020.759120 | 6.845299e+07 | 1.384853e+05 | 18163.799124 | 2.013576 | 377.982886 | 2.061149e+08 | 12.474599 | 4042.438863 | 1.125116 | 1.385113 | 19320.445110 |
min | 1.000000 | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 1.620000e+02 | 5.000000e+00 | 0.000000 | 0.000000 | 1.000000 | 2.180000e+02 | 1916.000000 | 0.000000 | 1.600000 | 1.180000 | 0.000000 |
25% | 50.000000 | 93.000000 | 7.000000 | 133.000000 | 614.000000 | 5.340988e+06 | 8.593500e+03 | 1411.000000 | 0.000000 | 65.000000 | 6.000000e+06 | 1999.000000 | 281.000000 | 5.800000 | 1.850000 | 0.000000 |
50% | 110.000000 | 103.000000 | 49.000000 | 371.500000 | 988.000000 | 2.551750e+07 | 3.435900e+04 | 3090.000000 | 1.000000 | 156.000000 | 2.000000e+07 | 2005.000000 | 595.000000 | 6.600000 | 2.350000 | 166.000000 |
75% | 195.000000 | 118.000000 | 194.500000 | 636.000000 | 11000.000000 | 6.230944e+07 | 9.630900e+04 | 13756.500000 | 2.000000 | 326.000000 | 4.500000e+07 | 2011.000000 | 918.000000 | 7.200000 | 2.350000 | 3000.000000 |
max | 813.000000 | 511.000000 | 23000.000000 | 23000.000000 | 640000.000000 | 7.605058e+08 | 1.689764e+06 | 656730.000000 | 43.000000 | 5060.000000 | 1.221550e+10 | 2016.000000 | 137000.000000 | 9.500000 | 16.000000 | 349000.000000 |
Identifying data quality issues
#number of missing values in each column
df.isnull().sum()
color 19 director_name 104 num_critic_for_reviews 50 duration 15 director_facebook_likes 104 actor_3_facebook_likes 23 actor_2_name 13 actor_1_facebook_likes 7 gross 884 genres 0 actor_1_name 7 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 23 facenumber_in_poster 13 plot_keywords 153 movie_imdb_link 0 num_user_for_reviews 21 language 12 country 5 content_rating 303 budget 492 title_year 108 actor_2_facebook_likes 13 imdb_score 0 aspect_ratio 329 movie_facebook_likes 0 dtype: int64
#finding all rows with null/missing values
df[df.isnull().any(axis=1)]
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | facenumber_in_poster | plot_keywords | movie_imdb_link | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | Doug Walker | Star Wars: Episode VII - The Force Awakens ... | 8 | 143 | NaN | 0.0 | NaN | http://www.imdb.com/title/tt5289954/?ref_=fn_t... | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | NaN | 0 |
55 | Color | Peter Sohn | 298.0 | 93.0 | 113.0 | 113.0 | Jack McGraw | 275.0 | 123070338.0 | Adventure|Animation|Comedy|Family|Fantasy | A.J. Buckley | The Good Dinosaur | 62836 | 696 | Peter Sohn | 0.0 | apatosaurus|asteroid|dinosaur|fear|river | http://www.imdb.com/title/tt1979388/?ref_=fn_t... | 345.0 | English | USA | PG | NaN | 2015.0 | 150.0 | 6.8 | 2.35 | 20000 |
84 | Color | Roland Joffé | 10.0 | 109.0 | 596.0 | 283.0 | Alice Englert | 622.0 | NaN | Action|Adventure|Romance|Sci-Fi | Tamsin Egerton | The Lovers | 2138 | 1982 | Bipasha Basu | 3.0 | 1770s|british india|great barrier reef|india|ring | http://www.imdb.com/title/tt1321869/?ref_=fn_t... | 15.0 | English | Belgium | R | NaN | 2015.0 | 525.0 | 4.5 | NaN | 677 |
98 | Color | Hideaki Anno | 1.0 | 120.0 | 28.0 | 12.0 | Shin'ya Tsukamoto | 544.0 | NaN | Action|Adventure|Drama|Horror|Sci-Fi | Mark Chinnery | Godzilla Resurgence | 374 | 699 | Atsuko Maeda | 0.0 | blood|godzilla|monster|sequel | http://www.imdb.com/title/tt4262980/?ref_=fn_t... | 13.0 | Japanese | Japan | NaN | NaN | 2016.0 | 106.0 | 8.2 | 2.35 | 0 |
99 | Color | Peter Jackson | 645.0 | 182.0 | 0.0 | 773.0 | Adam Brown | 5000.0 | 303001229.0 | Adventure|Fantasy | Aidan Turner | The Hobbit: An Unexpected Journey | 637246 | 9152 | James Nesbitt | NaN | dragon|dwarf|hobbit|orc|wizard | http://www.imdb.com/title/tt0903624/?ref_=fn_t... | 1367.0 | English | USA | PG-13 | 180000000.0 | 2012.0 | 972.0 | 7.9 | 2.35 | 166000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5037 | Color | Edward Burns | 14.0 | 95.0 | 0.0 | 133.0 | Caitlin FitzGerald | 296.0 | 4584.0 | Comedy|Drama | Kerry Bishé | Newlyweds | 1338 | 690 | Daniella Pineda | 1.0 | written and directed by cast member | http://www.imdb.com/title/tt1880418/?ref_=fn_t... | 14.0 | English | USA | Not Rated | 9000.0 | 2011.0 | 205.0 | 6.4 | NaN | 413 |
5038 | Color | Scott Smith | 1.0 | 87.0 | 2.0 | 318.0 | Daphne Zuniga | 637.0 | NaN | Comedy|Drama | Eric Mabius | Signed Sealed Delivered | 629 | 2283 | Crystal Lowe | 2.0 | fraud|postal worker|prison|theft|trial | http://www.imdb.com/title/tt3000844/?ref_=fn_t... | 6.0 | English | Canada | NaN | NaN | 2013.0 | 470.0 | 7.7 | NaN | 84 |
5039 | Color | NaN | 43.0 | 43.0 | NaN | 319.0 | Valorie Curry | 841.0 | NaN | Crime|Drama|Mystery|Thriller | Natalie Zea | The Following | 73839 | 1753 | Sam Underwood | 1.0 | cult|fbi|hideout|prison escape|serial killer | http://www.imdb.com/title/tt2071645/?ref_=fn_t... | 359.0 | English | USA | TV-14 | NaN | NaN | 593.0 | 7.5 | 16.00 | 32000 |
5040 | Color | Benjamin Roberds | 13.0 | 76.0 | 0.0 | 0.0 | Maxwell Moody | 0.0 | NaN | Drama|Horror|Thriller | Eva Boehnke | A Plague So Pleasant | 38 | 0 | David Chandler | 0.0 | NaN | http://www.imdb.com/title/tt2107644/?ref_=fn_t... | 3.0 | English | USA | NaN | 1400.0 | 2013.0 | 0.0 | 6.3 | NaN | 16 |
5041 | Color | Daniel Hsia | 14.0 | 100.0 | 0.0 | 489.0 | Daniel Henney | 946.0 | 10443.0 | Comedy|Drama|Romance | Alan Ruck | Shanghai Calling | 1255 | 2386 | Eliza Coupe | 5.0 | NaN | http://www.imdb.com/title/tt2070597/?ref_=fn_t... | 9.0 | English | USA | PG-13 | NaN | 2012.0 | 719.0 | 6.3 | 2.35 | 660 |
1287 rows × 28 columns
We have 1287 rows that have missing values in them, dropping those rows seems like an aggressive approach instead I will fill some of them, then drop the rest.
Identify data types
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5043 entries, 0 to 5042 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 color 5024 non-null object 1 director_name 4939 non-null object 2 num_critic_for_reviews 4993 non-null float64 3 duration 5028 non-null float64 4 director_facebook_likes 4939 non-null float64 5 actor_3_facebook_likes 5020 non-null float64 6 actor_2_name 5030 non-null object 7 actor_1_facebook_likes 5036 non-null float64 8 gross 4159 non-null float64 9 genres 5043 non-null object 10 actor_1_name 5036 non-null object 11 movie_title 5043 non-null object 12 num_voted_users 5043 non-null int64 13 cast_total_facebook_likes 5043 non-null int64 14 actor_3_name 5020 non-null object 15 facenumber_in_poster 5030 non-null float64 16 plot_keywords 4890 non-null object 17 movie_imdb_link 5043 non-null object 18 num_user_for_reviews 5022 non-null float64 19 language 5031 non-null object 20 country 5038 non-null object 21 content_rating 4740 non-null object 22 budget 4551 non-null float64 23 title_year 4935 non-null float64 24 actor_2_facebook_likes 5030 non-null float64 25 imdb_score 5043 non-null float64 26 aspect_ratio 4714 non-null float64 27 movie_facebook_likes 5043 non-null int64 dtypes: float64(13), int64(3), object(12) memory usage: 1.1+ MB
df.dtypes
color object director_name object num_critic_for_reviews float64 duration float64 director_facebook_likes float64 actor_3_facebook_likes float64 actor_2_name object actor_1_facebook_likes float64 gross float64 genres object actor_1_name object movie_title object num_voted_users int64 cast_total_facebook_likes int64 actor_3_name object facenumber_in_poster float64 plot_keywords object movie_imdb_link object num_user_for_reviews float64 language object country object content_rating object budget float64 title_year float64 actor_2_facebook_likes float64 imdb_score float64 aspect_ratio float64 movie_facebook_likes int64 dtype: object
There is a combination of Numerical and categorical columns in this dataset. some columns have numbers in them and others have strings.
Numerical columns: num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_1_facebook_likes, gross, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes
Categorial columns: color, director_name, actor_2_name, genres, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link, language, country, content_rating
Identify value counts of a selective list of columns considered to be important to predict a movie’s success (imdb_score)
# some columns considered to be important to predict imdb_score
df.corr()['imdb_score'].sort_values(ascending=False)
imdb_score 1.000000 num_voted_users 0.410965 num_critic_for_reviews 0.305303 num_user_for_reviews 0.292475 duration 0.261662 movie_facebook_likes 0.247049 gross 0.198021 director_facebook_likes 0.170802 cast_total_facebook_likes 0.085787 actor_2_facebook_likes 0.083808 actor_1_facebook_likes 0.076099 aspect_ratio 0.059445 actor_3_facebook_likes 0.052633 budget 0.030688 facenumber_in_poster -0.062958 title_year -0.209167 Name: imdb_score, dtype: float64
#top 5 common movie length in minutes
df['duration'].value_counts().head()
90.0 161 100.0 141 101.0 139 98.0 135 97.0 131 Name: duration, dtype: int64
#top 5 directors with the most movies
df['director_name'].value_counts().head()
Steven Spielberg 26 Woody Allen 22 Martin Scorsese 20 Clint Eastwood 20 Ridley Scott 17 Name: director_name, dtype: int64
df[['director_facebook_likes', 'director_name']].value_counts().head()
director_facebook_likes director_name 14000.0 Steven Spielberg 26 11000.0 Woody Allen 22 16000.0 Clint Eastwood 20 17000.0 Martin Scorsese 20 0.0 Ridley Scott 17 dtype: int64
#top 5 primary actors based on the total number of movies they starred in
df['actor_1_name'].value_counts().head()
Robert De Niro 49 Johnny Depp 41 Nicolas Cage 33 J.K. Simmons 31 Bruce Willis 30 Name: actor_1_name, dtype: int64
df['actor_1_facebook_likes'].value_counts().head()
1000.0 449 11000.0 211 2000.0 197 3000.0 155 12000.0 135 Name: actor_1_facebook_likes, dtype: int64
#number of movies other actors starred in
df['actor_2_name'].value_counts().head()
Morgan Freeman 20 Charlize Theron 15 Brad Pitt 14 Meryl Streep 11 James Franco 11 Name: actor_2_name, dtype: int64
df[['actor_2_name', 'actor_2_facebook_likes']].value_counts().head()
actor_2_name actor_2_facebook_likes Morgan Freeman 11000.0 20 Charlize Theron 9000.0 15 Brad Pitt 11000.0 14 Meryl Streep 11000.0 11 James Franco 11000.0 11 dtype: int64
#number of movies other actors starred in
df['actor_3_name'].value_counts().head()
Steve Coogan 8 John Heard 8 Ben Mendelsohn 8 Lois Maxwell 7 Kirsten Dunst 7 Name: actor_3_name, dtype: int64
df[['actor_3_name', 'actor_3_facebook_likes']].value_counts().head()
actor_3_name actor_3_facebook_likes Ben Mendelsohn 748.0 8 Steve Coogan 1000.0 8 John Heard 697.0 8 Robert Duvall 3000.0 7 Stephen Root 939.0 7 dtype: int64
#number of users who gave a review
df[['movie_title', 'num_user_for_reviews']].sort_values(by=['num_user_for_reviews'], ascending=False).head()
movie_title | num_user_for_reviews | |
---|---|---|
270 | The Lord of the Rings: The Fellowship of the R... | 5060.0 |
66 | The Dark Knight | 4667.0 |
1937 | The Shawshank Redemption | 4144.0 |
654 | The Matrix | 3646.0 |
240 | Star Wars: Episode I - The Phantom Menace | 3597.0 |
#number of critical reviews on imdb
df[['movie_title', 'num_critic_for_reviews']].sort_values(by=['num_critic_for_reviews'], ascending=False).head()
movie_title | num_critic_for_reviews | |
---|---|---|
3 | The Dark Knight Rises | 813.0 |
227 | Prometheus | 775.0 |
296 | Django Unchained | 765.0 |
3493 | Skyfall | 750.0 |
30 | Skyfall | 750.0 |
#number of people who voted for the movie
df[['movie_title', 'num_voted_users']].sort_values(by=['num_voted_users'], ascending=False).head()
movie_title | num_voted_users | |
---|---|---|
1937 | The Shawshank Redemption | 1689764 |
66 | The Dark Knight | 1676169 |
97 | Inception | 1468200 |
683 | Fight Club | 1347461 |
3355 | Pulp Fiction | 1324680 |
#Total number of facebook likes of the entire cast of the movie
df[['movie_title', 'cast_total_facebook_likes']].sort_values(by=['cast_total_facebook_likes'], ascending=False).head()
movie_title | cast_total_facebook_likes | |
---|---|---|
1902 | Anchorman: The Legend of Ron Burgundy | 656730 |
1223 | The Final Destination | 303717 |
4704 | Treachery | 283939 |
4592 | Hardflip | 263584 |
4409 | Kickboxer: Vengeance | 261818 |
#number of Facebook likes in the movie page
df[['movie_title', 'movie_facebook_likes']].sort_values(by=['movie_facebook_likes'], ascending=False).head()
movie_title | movie_facebook_likes | |
---|---|---|
96 | Interstellar | 349000 |
296 | Django Unchained | 199000 |
10 | Batman v Superman: Dawn of Justice | 197000 |
128 | Mad Max: Fury Road | 191000 |
179 | The Revenant | 190000 |
#top 5 popular genres
df['genres'].value_counts().head()
Drama 236 Comedy 209 Comedy|Drama 191 Comedy|Drama|Romance 187 Comedy|Romance 158 Name: genres, dtype: int64
df['content_rating'].value_counts().head()
R 2118 PG-13 1461 PG 701 Not Rated 116 G 112 Name: content_rating, dtype: int64
#top 5 movie gross earnings in Dollars
df[['movie_title', 'gross']].sort_values(by=['gross'], ascending=False).head()
movie_title | gross | |
---|---|---|
0 | Avatar | 760505847.0 |
26 | Titanic | 658672302.0 |
29 | Jurassic World | 652177271.0 |
794 | The Avengers | 623279547.0 |
17 | The Avengers | 623279547.0 |
Data preparation
Handling duplicate rows
# check duplicate rows
# display those rows for review
dup_rows = df.drop_duplicates()
dup_rows.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | facenumber_in_poster | plot_keywords | movie_imdb_link | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | Avatar | 886204 | 4834 | Wes Studi | 0.0 | avatar|future|marine|native|paraplegic | http://www.imdb.com/title/tt0499549/?ref_=fn_t... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | Pirates of the Caribbean: At World's End | 471220 | 48350 | Jack Davenport | 0.0 | goddess|marriage ceremony|marriage proposal|pi... | http://www.imdb.com/title/tt0449088/?ref_=fn_t... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | Spectre | 275868 | 11700 | Stephanie Sigman | 1.0 | bomb|espionage|sequel|spy|terrorist | http://www.imdb.com/title/tt2379713/?ref_=fn_t... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | The Dark Knight Rises | 1144337 | 106759 | Joseph Gordon-Levitt | 0.0 | deception|imprisonment|lawlessness|police offi... | http://www.imdb.com/title/tt1345836/?ref_=fn_t... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | Doug Walker | Star Wars: Episode VII - The Force Awakens ... | 8 | 143 | NaN | 0.0 | NaN | http://www.imdb.com/title/tt5289954/?ref_=fn_t... | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | NaN | 0 |
# dropping duplicated rows from dataframe
df = df.drop_duplicates()
df.shape
(4998, 28)
After dropping duplicate rows, now we have 4998 rows left from 5043 rows, meaning there were 45 duplicate rows.
Dropping unnecessary columns
df=df.drop(['plot_keywords', 'facenumber_in_poster', 'aspect_ratio', 'movie_imdb_link'], axis=1)
df.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | Avatar | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | Pirates of the Caribbean: At World's End | 471220 | 48350 | Jack Davenport | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | Spectre | 275868 | 11700 | Stephanie Sigman | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | The Dark Knight Rises | 1144337 | 106759 | Joseph Gordon-Levitt | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 |
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | Doug Walker | Star Wars: Episode VII - The Force Awakens ... | 8 | 143 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | 0 |
Handling null values
# this dataframe will be used for Regression
dfr = df.copy()
dfr.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | Avatar | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | Pirates of the Caribbean: At World's End | 471220 | 48350 | Jack Davenport | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | Spectre | 275868 | 11700 | Stephanie Sigman | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | The Dark Knight Rises | 1144337 | 106759 | Joseph Gordon-Levitt | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 |
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | Doug Walker | Star Wars: Episode VII - The Force Awakens ... | 8 | 143 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | 0 |
# finding null values
dfr.isnull().sum()
color 19 director_name 103 num_critic_for_reviews 49 duration 15 director_facebook_likes 103 actor_3_facebook_likes 23 actor_2_name 13 actor_1_facebook_likes 7 gross 874 genres 0 actor_1_name 7 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 23 num_user_for_reviews 21 language 12 country 5 content_rating 301 budget 487 title_year 107 actor_2_facebook_likes 13 imdb_score 0 movie_facebook_likes 0 dtype: int64
Filling some rows with null values
dfr['color'].describe()
count 4979 unique 2 top Color freq 4772 Name: color, dtype: object
# replace null value with the most popular value in color column
dfr = dfr.fillna({'color': 'Color'})
dfr.isnull().sum()
color 0 director_name 103 num_critic_for_reviews 49 duration 15 director_facebook_likes 103 actor_3_facebook_likes 23 actor_2_name 13 actor_1_facebook_likes 7 gross 874 genres 0 actor_1_name 7 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 23 num_user_for_reviews 21 language 12 country 5 content_rating 301 budget 487 title_year 107 actor_2_facebook_likes 13 imdb_score 0 movie_facebook_likes 0 dtype: int64
dfr['language'].describe()
count 4986 unique 47 top English freq 4662 Name: language, dtype: object
# replace null value with the most popular value in language column
dfr = dfr.fillna({'language': 'English'})
dfr.isnull().sum()
color 0 director_name 103 num_critic_for_reviews 49 duration 15 director_facebook_likes 103 actor_3_facebook_likes 23 actor_2_name 13 actor_1_facebook_likes 7 gross 874 genres 0 actor_1_name 7 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 23 num_user_for_reviews 21 language 0 country 5 content_rating 301 budget 487 title_year 107 actor_2_facebook_likes 13 imdb_score 0 movie_facebook_likes 0 dtype: int64
dfr['country'].describe()
count 4993 unique 65 top USA freq 3773 Name: country, dtype: object
# replace null value with the most popular value in country column
dfr = dfr.fillna({'country': 'USA'})
dfr.isnull().sum()
color 0 director_name 103 num_critic_for_reviews 49 duration 15 director_facebook_likes 103 actor_3_facebook_likes 23 actor_2_name 13 actor_1_facebook_likes 7 gross 874 genres 0 actor_1_name 7 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 23 num_user_for_reviews 21 language 0 country 0 content_rating 301 budget 487 title_year 107 actor_2_facebook_likes 13 imdb_score 0 movie_facebook_likes 0 dtype: int64
#replace null value with the median value
dfr = dfr.fillna({'num_critic_for_reviews': dfr['num_critic_for_reviews'].mean()})
dfr = dfr.fillna({'duration': dfr['duration'].mean()})
dfr = dfr.fillna({'director_facebook_likes': dfr['director_facebook_likes'].mean()})
dfr = dfr.fillna({'actor_3_facebook_likes': dfr['actor_3_facebook_likes'].mean()})
dfr = dfr.fillna({'actor_1_facebook_likes': dfr['actor_1_facebook_likes'].mean()})
dfr = dfr.fillna({'num_user_for_reviews': dfr['num_user_for_reviews'].mean()})
dfr = dfr.fillna({'title_year': dfr['title_year'].mean()})
dfr = dfr.fillna({'actor_2_facebook_likes': dfr['actor_2_facebook_likes'].mean()})
dfr.isnull().sum()
color 0 director_name 103 num_critic_for_reviews 0 duration 0 director_facebook_likes 0 actor_3_facebook_likes 0 actor_2_name 13 actor_1_facebook_likes 0 gross 874 genres 0 actor_1_name 7 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 23 num_user_for_reviews 0 language 0 country 0 content_rating 301 budget 487 title_year 0 actor_2_facebook_likes 0 imdb_score 0 movie_facebook_likes 0 dtype: int64
#dropping all remaining null values and reset the index
dfr = dfr.dropna()
dfr = dfr.reset_index(drop=True)
dfr.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | Avatar | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | Pirates of the Caribbean: At World's End | 471220 | 48350 | Jack Davenport | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | Spectre | 275868 | 11700 | Stephanie Sigman | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | The Dark Knight Rises | 1144337 | 106759 | Joseph Gordon-Levitt | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 |
4 | Color | Andrew Stanton | 462.0 | 132.0 | 475.0 | 530.0 | Samantha Morton | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | Daryl Sabara | John Carter | 212204 | 1873 | Polly Walker | 738.0 | English | USA | PG-13 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 |
#checking to see if there is any null values left
dfr.isnull().sum()
color 0 director_name 0 num_critic_for_reviews 0 duration 0 director_facebook_likes 0 actor_3_facebook_likes 0 actor_2_name 0 actor_1_facebook_likes 0 gross 0 genres 0 actor_1_name 0 movie_title 0 num_voted_users 0 cast_total_facebook_likes 0 actor_3_name 0 num_user_for_reviews 0 language 0 country 0 content_rating 0 budget 0 title_year 0 actor_2_facebook_likes 0 imdb_score 0 movie_facebook_likes 0 dtype: int64
Variables that i think are good predictors for a movie's success are: duration, director_name, director_facebook_likes, actor_1_name, actor_1_facebook_likes, actor_2_name, actor_2_facebook_likes, actor_3_name, actor_3_facebook_likes, num_user_for_reviews, num_critic_for_reviews, num_voted_users, cast_total_facebook_likes, movie_facebook_likes, genres, content_rating, gross
# Who are the top 5 directors in terms of how many movies they made?
df.groupby(['director_name']).size().sort_values(ascending=False).head()
director_name Steven Spielberg 26 Woody Allen 22 Clint Eastwood 20 Martin Scorsese 20 Ridley Scott 17 dtype: int64
#bar graph visualization
df.groupby(['director_name']).size().sort_values(ascending=False).head().plot(kind='bar')
<AxesSubplot:xlabel='director_name'>
#what are the top 5 directors in terms of gross earnings?
df.groupby('director_name')['gross'].sum().sort_values(ascending=False).head()
director_name Steven Spielberg 4.114233e+09 Peter Jackson 2.592969e+09 Michael Bay 2.231243e+09 Tim Burton 2.071275e+09 Sam Raimi 2.049549e+09 Name: gross, dtype: float64
#bar graph
df.groupby('director_name')['gross'].sum().sort_values(ascending=False).head().plot(kind='bar')
<AxesSubplot:xlabel='director_name'>
#Which director has the most facebook likes?
df.groupby('director_name')['director_facebook_likes'].sum().sort_values(ascending=False).head()
director_name Steven Spielberg 364000.0 Martin Scorsese 340000.0 Clint Eastwood 320000.0 Woody Allen 242000.0 David Fincher 210000.0 Name: director_facebook_likes, dtype: float64
# How many movies did the primary actor star in?
df.groupby(['actor_1_name']).size().sort_values(ascending=False).head()
actor_1_name Robert De Niro 49 Johnny Depp 40 Nicolas Cage 32 J.K. Simmons 31 Matt Damon 30 dtype: int64
df.groupby(['actor_1_name']).size().sort_values(ascending=False).head().plot(kind='barh')
<AxesSubplot:ylabel='actor_1_name'>
#Who are the top 5 actors with the highest grossing movies?
df.groupby('actor_1_name')['gross'].sum().sort_values(ascending=False).head(5)
actor_1_name Johnny Depp 3.688020e+09 Harrison Ford 3.391556e+09 Tom Hanks 3.264559e+09 Tom Cruise 2.987622e+09 J.K. Simmons 2.856407e+09 Name: gross, dtype: float64
df.groupby('actor_1_name')['gross'].sum().sort_values(ascending=False).head().plot(kind='bar')
<AxesSubplot:xlabel='actor_1_name'>
#What are the top 5 movies with the most critical reviews?
df.groupby('movie_title')['num_critic_for_reviews'].sum().sort_values(ascending=False).head()
movie_title Skyfall 1500.0 King Kong 1338.0 Oz the Great and Powerful 1050.0 RoboCop 984.0 The Great Gatsby 980.0 Name: num_critic_for_reviews, dtype: float64
df.groupby('movie_title')['num_critic_for_reviews'].sum().sort_values(ascending=False).head().plot(kind='bar')
<AxesSubplot:xlabel='movie_title'>
#Which top 5 movies were voted for most?
df.groupby('movie_title')['num_voted_users'].sum().sort_values(ascending=False).head()
movie_title The Shawshank Redemption 1689764 The Dark Knight 1676169 Inception 1468200 Fight Club 1347461 Pulp Fiction 1324680 Name: num_voted_users, dtype: int64
df.groupby('movie_title')['num_voted_users'].sum().sort_values(ascending=False).head().plot(kind='bar')
<AxesSubplot:xlabel='movie_title'>
gen_gros = df[['genres','gross']]
gen_gros = df.dropna(subset=['genres'])
gen_gros = df.dropna(subset=['gross'])
#now, separate a string of genres into multiple rows
# borrowed the code from https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
gegr = pd.DataFrame(gen_gros.genres.str.split('|').tolist(), index=gen_gros.gross).stack()
gegr = gegr.reset_index()[[0, 'gross']]
gegr.columns = ['genres', 'gross']
gegr.head()
genres | gross | |
---|---|---|
0 | Action | 760505847.0 |
1 | Adventure | 760505847.0 |
2 | Fantasy | 760505847.0 |
3 | Sci-Fi | 760505847.0 |
4 | Action | 309404152.0 |
gegr.groupby('genres')['gross'].sum().sort_values(ascending=False).head()
genres Adventure 7.855534e+10 Comedy 7.719882e+10 Action 7.399143e+10 Drama 7.301086e+10 Thriller 5.447667e+10 Name: gross, dtype: float64
Adventure is the highest grossing genre type followed by comedy.
gegr.groupby('genres')['gross'].sum().sort_values(ascending=False).plot(kind='line')
<AxesSubplot:xlabel='genres'>
gegr.groupby('genres')['gross'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='genres'>
# Which top 5 films with the highest imdb_score?
mov_cat_score2 = df.groupby('movie_title').agg({'imdb_score': ['max']})
mov_cat_score2.columns = ['imdb_score_max']
mov_cat_score2 = mov_cat_score2.reset_index()
mov_cat_score2.sort_values(by='imdb_score_max', ascending=False).head()
movie_title | imdb_score_max | |
---|---|---|
4550 | Towering Inferno | 9.5 |
4299 | The Shawshank Redemption | 9.3 |
3822 | The Godfather | 9.2 |
997 | Dekalog | 9.1 |
2012 | Kickboxer: Vengeance | 9.1 |
#creating a separate dataframe
dfbag1 = df.copy()
dfbag2 = dfbag1[['imdb_score','movie_title']]
dfbag2.head()
imdb_score | movie_title | |
---|---|---|
0 | 7.9 | Avatar |
1 | 7.1 | Pirates of the Caribbean: At World's End |
2 | 6.8 | Spectre |
3 | 8.5 | The Dark Knight Rises |
4 | 7.1 | Star Wars: Episode VII - The Force Awakens ... |
dfbag3 = dfbag2.dropna(subset=['imdb_score'])
dfbag3 = dfbag2.dropna(subset=['movie_title'])
# creating 3 bins from imdb_score column
dfbag3['imdb_category'] = pd.qcut(dfbag3['imdb_score'], 3)
dfbag3.head()
imdb_score | movie_title | imdb_category | |
---|---|---|---|
0 | 7.9 | Avatar | (7.0, 9.5] |
1 | 7.1 | Pirates of the Caribbean: At World's End | (7.0, 9.5] |
2 | 6.8 | Spectre | (6.1, 7.0] |
3 | 8.5 | The Dark Knight Rises | (7.0, 9.5] |
4 | 7.1 | Star Wars: Episode VII - The Force Awakens ... | (7.0, 9.5] |
#rename the name of the bins to bad, average, and good.
dfbag3['imdb_category_labels']=pd.qcut(dfbag3['imdb_score'], 3, labels=['bad', 'average', 'good'])
dfbag3.sort_values(by= 'imdb_score', ascending=False).head()
imdb_score | movie_title | imdb_category | imdb_category_labels | |
---|---|---|---|---|
2765 | 9.5 | Towering Inferno | (7.0, 9.5] | good |
1937 | 9.3 | The Shawshank Redemption | (7.0, 9.5] | good |
3466 | 9.2 | The Godfather | (7.0, 9.5] | good |
3207 | 9.1 | Dekalog | (7.0, 9.5] | good |
2824 | 9.1 | Dekalog | (7.0, 9.5] | good |
dfbag3.groupby('imdb_category_labels').size()
imdb_category_labels bad 1748 average 1673 good 1577 dtype: int64
The movies seem to be evenly distributed between bad, average, and good.
icl = dfbag3.groupby('imdb_category_labels').size().reset_index()
icl
imdb_category_labels | 0 | |
---|---|---|
0 | bad | 1748 |
1 | average | 1673 |
2 | good | 1577 |
colors = ['red', 'yellow', 'lightgreen']
explode = (0.1, 0, 0)
plt.pie(icl[0], labels=icl['imdb_category_labels'], explode=explode, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal');
# Which primary actors starred in the biggest budgeted movie?
df.groupby(['actor_1_name','movie_title'])['budget'].sum().sort_values(ascending=False).head()
actor_1_name movie_title Doona Bae The Host 1.221550e+10 Min-sik Choi Lady Vengeance 4.200000e+09 Marcell Nagy Fateless 2.500000e+09 Minnie Driver Princess Mononoke 2.400000e+09 William Hootkins Steamboy 2.127520e+09 Name: budget, dtype: float64
#What is the most successfull movie both a director and primary actor made together in terms of gross?
df.groupby(['director_name','actor_1_name','movie_title'])['gross'].sum().sort_values(ascending=False).head()
director_name actor_1_name movie_title James Cameron CCH Pounder Avatar 760505847.0 Jon Favreau Scarlett Johansson The Jungle Book 725290282.0 Sam Raimi J.K. Simmons Spider-Man 3 673060606.0 Tim Burton Johnny Depp Alice in Wonderland 668370412.0 James Cameron Leonardo DiCaprio Titanic 658672302.0 Name: gross, dtype: float64
#What the most popular content rating?
imp = df['content_rating'].value_counts()
imp.head()
R 2098 PG-13 1444 PG 698 Not Rated 116 G 112 Name: content_rating, dtype: int64
imp.sort_values(ascending=False).head().plot(kind='barh')
<AxesSubplot:>
Data visualization
dfact1 = df.loc[df['color'] == 'Color']
dfact2 = df.loc[df['color'] == ' Black and White']
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Horizontally stacked subplots')
ax1.plot(dfact1['gross'], 'ro')
ax2.plot(dfact2['gross'], 'ro');
sns.kdeplot(df.title_year)
<AxesSubplot:xlabel='title_year', ylabel='Density'>
sns.catplot("content_rating", "imdb_score", data=df, kind='point')
<seaborn.axisgrid.FacetGrid at 0x198f2ba6610>
sns.jointplot("imdb_score", "gross", df, kind="hex", color="#8855AA")
<seaborn.axisgrid.JointGrid at 0x198f2b29130>
sns.jointplot("gross", "num_critic_for_reviews", df)
<seaborn.axisgrid.JointGrid at 0x198f288ec70>
df[['actor_2_facebook_likes', 'actor_1_facebook_likes']].plot(kind='kde', title='KDE plot')
<AxesSubplot:title={'center':'KDE plot'}, ylabel='Density'>
df[['director_facebook_likes', 'movie_facebook_likes']].plot(kind='box', title='Box plot')
<AxesSubplot:title={'center':'Box plot'}>
sns.lmplot('title_year', 'gross', data=df, fit_reg=True)
<seaborn.axisgrid.FacetGrid at 0x198f42068b0>
sns.lmplot("num_voted_users", "imdb_score", df, hue="color")
<seaborn.axisgrid.FacetGrid at 0x198f41e30a0>
sns.kdeplot(df.num_user_for_reviews, df.num_voted_users)
<AxesSubplot:xlabel='num_user_for_reviews', ylabel='num_voted_users'>
sns.histplot(data=df, x="title_year", hue="color")
<AxesSubplot:xlabel='title_year', ylabel='Count'>
df.corr()
num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_1_facebook_likes | gross | num_voted_users | cast_total_facebook_likes | num_user_for_reviews | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
num_critic_for_reviews | 1.000000 | 0.257529 | 0.181246 | 0.265591 | 0.189533 | 0.477603 | 0.624255 | 0.260753 | 0.608175 | 0.119237 | 0.276845 | 0.280421 | 0.307226 | 0.681723 |
duration | 0.257529 | 1.000000 | 0.173027 | 0.119806 | 0.087567 | 0.247443 | 0.313935 | 0.120975 | 0.328413 | 0.073586 | -0.137076 | 0.129660 | 0.262618 | 0.194401 |
director_facebook_likes | 0.181246 | 0.173027 | 1.000000 | 0.123204 | 0.091374 | 0.146368 | 0.299873 | 0.120781 | 0.223519 | 0.021015 | -0.064856 | 0.121191 | 0.171564 | 0.162588 |
actor_3_facebook_likes | 0.265591 | 0.119806 | 0.123204 | 1.000000 | 0.250000 | 0.289965 | 0.277918 | 0.470036 | 0.225871 | 0.045718 | 0.095383 | 0.556366 | 0.051683 | 0.267775 |
actor_1_facebook_likes | 0.189533 | 0.087567 | 0.091374 | 0.250000 | 1.000000 | 0.153083 | 0.191239 | 0.952760 | 0.144768 | 0.022202 | 0.086286 | 0.389749 | 0.075952 | 0.133604 |
gross | 0.477603 | 0.247443 | 0.146368 | 0.289965 | 0.153083 | 1.000000 | 0.635271 | 0.240049 | 0.561006 | 0.101033 | 0.029110 | 0.254172 | 0.198417 | 0.370140 |
num_voted_users | 0.624255 | 0.313935 | 0.299873 | 0.277918 | 0.191239 | 0.635271 | 1.000000 | 0.261063 | 0.798691 | 0.079069 | 0.007232 | 0.265317 | 0.411299 | 0.535218 |
cast_total_facebook_likes | 0.260753 | 0.120975 | 0.120781 | 0.470036 | 0.952760 | 0.240049 | 0.261063 | 1.000000 | 0.204547 | 0.035525 | 0.109263 | 0.625837 | 0.085091 | 0.204378 |
num_user_for_reviews | 0.608175 | 0.328413 | 0.223519 | 0.225871 | 0.144768 | 0.561006 | 0.798691 | 0.204547 | 1.000000 | 0.084149 | -0.002355 | 0.217153 | 0.292745 | 0.398795 |
budget | 0.119237 | 0.073586 | 0.021015 | 0.045718 | 0.022202 | 0.101033 | 0.079069 | 0.035525 | 0.084149 | 1.000000 | 0.044953 | 0.043152 | 0.031061 | 0.060564 |
title_year | 0.276845 | -0.137076 | -0.064856 | 0.095383 | 0.086286 | 0.029110 | 0.007232 | 0.109263 | -0.002355 | 0.044953 | 1.000000 | 0.102083 | -0.207591 | 0.217227 |
actor_2_facebook_likes | 0.280421 | 0.129660 | 0.121191 | 0.556366 | 0.389749 | 0.254172 | 0.265317 | 0.625837 | 0.217153 | 0.043152 | 0.102083 | 1.000000 | 0.082400 | 0.237699 |
imdb_score | 0.307226 | 0.262618 | 0.171564 | 0.051683 | 0.075952 | 0.198417 | 0.411299 | 0.085091 | 0.292745 | 0.031061 | -0.207591 | 0.082400 | 1.000000 | 0.249604 |
movie_facebook_likes | 0.681723 | 0.194401 | 0.162588 | 0.267775 | 0.133604 | 0.370140 | 0.535218 | 0.204378 | 0.398795 | 0.060564 | 0.217227 | 0.237699 | 0.249604 | 1.000000 |
plt.figure(figsize=(14,10))
sns.heatmap(df.corr(), vmax=.8, square=True, annot=True, fmt=".1f")
<AxesSubplot:>
Variables highly correlated to imdb_score are; movie_facebook_likes, num_user_for_reviews, gross, num_voted_users, director_facebook_likes, duration, num_critic_for_reviews
Key variables that are correlated to each other
# correlation of two columns: num_critic_for_reviews, movie_facebook_likes
df[['num_critic_for_reviews','movie_facebook_likes']].corr()
num_critic_for_reviews | movie_facebook_likes | |
---|---|---|
num_critic_for_reviews | 1.000000 | 0.681723 |
movie_facebook_likes | 0.681723 | 1.000000 |
# correlation of two columns: actor_1_facebook_likes, cast_total_facebook_likes
df[['actor_1_facebook_likes','cast_total_facebook_likes']].corr()
actor_1_facebook_likes | cast_total_facebook_likes | |
---|---|---|
actor_1_facebook_likes | 1.00000 | 0.95276 |
cast_total_facebook_likes | 0.95276 | 1.00000 |
# correlation of two columns: num_voted_users, num_user_for_reviews
df[['num_voted_users','num_user_for_reviews']].corr()
num_voted_users | num_user_for_reviews | |
---|---|---|
num_voted_users | 1.000000 | 0.798691 |
num_user_for_reviews | 0.798691 | 1.000000 |
g = sns.PairGrid(df)
g.map(plt.scatter)
<seaborn.axisgrid.PairGrid at 0x198f4a7f610>
import pingouin as pg
below are the pearson correlation of the highly correlated variables. a high r2 indicates a strong correlation and low r2 indicates a low correlation among then.
pg.corr(x=df['num_voted_users'], y=df['num_user_for_reviews'])
n | r | CI95% | r2 | adj_r2 | p-val | BF10 | power | |
---|---|---|---|---|---|---|---|---|
pearson | 4977 | 0.798691 | [0.79, 0.81] | 0.637907 | 0.637762 | 0.0 | inf | 1.0 |
pg.corr(x=df['actor_1_facebook_likes'], y=df['cast_total_facebook_likes'])
n | r | CI95% | r2 | adj_r2 | p-val | BF10 | power | |
---|---|---|---|---|---|---|---|---|
pearson | 4991 | 0.95276 | [0.95, 0.96] | 0.907752 | 0.907715 | 0.0 | inf | 1.0 |
pg.corr(x=df['num_critic_for_reviews'], y=df['movie_facebook_likes'])
n | r | CI95% | r2 | adj_r2 | p-val | BF10 | power | |
---|---|---|---|---|---|---|---|---|
pearson | 4949 | 0.681723 | [0.67, 0.7] | 0.464746 | 0.464529 | 0.0 | inf | 1.0 |
dfR = dfr.copy()
dfR.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | movie_title | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | Avatar | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | Pirates of the Caribbean: At World's End | 471220 | 48350 | Jack Davenport | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | Spectre | 275868 | 11700 | Stephanie Sigman | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | The Dark Knight Rises | 1144337 | 106759 | Joseph Gordon-Levitt | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 |
4 | Color | Andrew Stanton | 462.0 | 132.0 | 475.0 | 530.0 | Samantha Morton | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | Daryl Sabara | John Carter | 212204 | 1873 | Polly Walker | 738.0 | English | USA | PG-13 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 |
#drop unnecessary coloumn
dfR=dfr.drop(['movie_title'], axis=1)
dfR.head(1)
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 |
#splitting genres
dfR['main_genre'] = dfR.genres.str.split('|').str[0]
dfR.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | Action |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | 471220 | 48350 | Jack Davenport | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | Action |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | 275868 | 11700 | Stephanie Sigman | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | Action |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | 1144337 | 106759 | Joseph Gordon-Levitt | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | Action |
4 | Color | Andrew Stanton | 462.0 | 132.0 | 475.0 | 530.0 | Samantha Morton | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | Daryl Sabara | 212204 | 1873 | Polly Walker | 738.0 | English | USA | PG-13 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | Action |
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dfR['main_genre'] = le.fit_transform(dfR.main_genre)
dfR.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | CCH Pounder | 886204 | 4834 | Wes Studi | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 |
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | Johnny Depp | 471220 | 48350 | Jack Davenport | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 |
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | Christoph Waltz | 275868 | 11700 | Stephanie Sigman | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 |
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | Tom Hardy | 1144337 | 106759 | Joseph Gordon-Levitt | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 |
4 | Color | Andrew Stanton | 462.0 | 132.0 | 475.0 | 530.0 | Samantha Morton | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | Daryl Sabara | 212204 | 1873 | Polly Walker | 738.0 | English | USA | PG-13 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 |
dfR.color = le.fit_transform(dfR.color)
dfR.director_name = le.fit_transform(dfR.director_name)
dfR.actor_2_name = le.fit_transform(dfR.actor_2_name)
dfR.actor_1_name = le.fit_transform(dfR.actor_1_name)
dfR.actor_3_name = le.fit_transform(dfR.actor_3_name)
dfR.language = le.fit_transform(dfR.language)
dfR.country = le.fit_transform(dfR.country)
dfR.content_rating = le.fit_transform(dfR.content_rating)
dfR.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 |
2 | 1 | 1436 | 602.0 | 148.0 | 0.0 | 161.0 | 1844 | 11000.0 | 200074175.0 | Action|Adventure|Thriller | 261 | 275868 | 11700 | 2347 | 994.0 | 9 | 43 | 7 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 |
3 | 1 | 254 | 813.0 | 164.0 | 22000.0 | 23000.0 | 396 | 27000.0 | 448130642.0 | Action|Thriller | 1378 | 1144337 | 106759 | 1295 | 2701.0 | 9 | 44 | 7 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 |
4 | 1 | 64 | 462.0 | 132.0 | 475.0 | 530.0 | 1887 | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | 327 | 212204 | 1873 | 2015 | 738.0 | 9 | 44 | 7 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 |
#make new dataframe for Regression Modeling
dfRM = dfR.copy()
dfRM.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 |
2 | 1 | 1436 | 602.0 | 148.0 | 0.0 | 161.0 | 1844 | 11000.0 | 200074175.0 | Action|Adventure|Thriller | 261 | 275868 | 11700 | 2347 | 994.0 | 9 | 43 | 7 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 |
3 | 1 | 254 | 813.0 | 164.0 | 22000.0 | 23000.0 | 396 | 27000.0 | 448130642.0 | Action|Thriller | 1378 | 1144337 | 106759 | 1295 | 2701.0 | 9 | 44 | 7 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 |
4 | 1 | 64 | 462.0 | 132.0 | 475.0 | 530.0 | 1887 | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | 327 | 212204 | 1873 | 2015 | 738.0 | 9 | 44 | 7 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 |
# remove unnecessary column ('genres')
dfRM = dfRM.drop(['genres'], axis=1)
dfRM.head(1)
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 |
dfRM.corr()['imdb_score'].sort_values()
title_year -0.133390 color -0.118480 country -0.054637 actor_3_name -0.004077 actor_2_name -0.003265 director_name 0.016045 actor_1_name 0.025374 budget 0.030437 main_genre 0.054811 actor_3_facebook_likes 0.064969 content_rating 0.074990 actor_1_facebook_likes 0.093368 actor_2_facebook_likes 0.101208 cast_total_facebook_likes 0.106062 language 0.111172 director_facebook_likes 0.192332 gross 0.217158 movie_facebook_likes 0.283379 num_user_for_reviews 0.325020 num_critic_for_reviews 0.350114 duration 0.367463 num_voted_users 0.479947 imdb_score 1.000000 Name: imdb_score, dtype: float64
plt.figure(figsize=(12, 12))
sns.heatmap(dfRM.corr(), annot=True)
<AxesSubplot:>
Full model
#assigning columns to X and Y variables
X = dfRM.drop(['imdb_score'], axis =1)
y = dfRM['imdb_score']
# Full Model 1 development
# split validation (70% training & 30% testing data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(2660, 22) (2660,) (1140, 22) (1140,)
model1 = lm.LinearRegression()
model1.fit(X_train, y_train)
model1_y = model1.predict(X_test)
print('Coefficients: ', model1.coef_)
print("y-intercept ", model1.intercept_)
Coefficients: [-3.60473287e-01 7.08596471e-07 2.66441067e-03 9.25211727e-03 1.14551964e-06 2.71098754e-05 -1.94933021e-06 4.60424228e-05 -9.24772051e-10 2.47707249e-05 3.51449550e-06 -4.35642069e-05 -1.42679048e-05 -6.23624458e-04 4.13847368e-02 -7.29686643e-03 1.61841498e-02 -2.09896310e-10 -2.20522188e-02 5.20883132e-05 -2.76290152e-06 3.13106567e-02] y-intercept 49.13015384492339
pd.DataFrame(list(zip(X.columns, np.transpose(model1.coef_)))).round(2)
0 | 1 | |
---|---|---|
0 | color | -0.36 |
1 | director_name | 0.00 |
2 | num_critic_for_reviews | 0.00 |
3 | duration | 0.01 |
4 | director_facebook_likes | 0.00 |
5 | actor_3_facebook_likes | 0.00 |
6 | actor_2_name | -0.00 |
7 | actor_1_facebook_likes | 0.00 |
8 | gross | -0.00 |
9 | actor_1_name | 0.00 |
10 | num_voted_users | 0.00 |
11 | cast_total_facebook_likes | -0.00 |
12 | actor_3_name | -0.00 |
13 | num_user_for_reviews | -0.00 |
14 | language | 0.04 |
15 | country | -0.01 |
16 | content_rating | 0.02 |
17 | budget | -0.00 |
18 | title_year | -0.02 |
19 | actor_2_facebook_likes | 0.00 |
20 | movie_facebook_likes | -0.00 |
21 | main_genre | 0.03 |
# full model 1 evaluation
print("mean square error: ", mean_squared_error(y_test, model1_y))
print("variance or r-squared: ", explained_variance_score(y_test, model1_y))
mean square error: 0.6975054276930848 variance or r-squared: 0.36763555494497147
plt.subplots()
plt.scatter(y_test, model1_y)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
Comments about Full Model#1: This full model has MSE=0.698 and r-squared=0.368
Statmodel
Building the full model (with all X variables) using statsmodel. Interpret p value.
X_train.columns
Index(['color', 'director_name', 'num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'actor_1_name', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'movie_facebook_likes', 'main_genre'], dtype='object')
runs_reg_model = ols("imdb_score~color+director_name+num_critic_for_reviews+duration+director_facebook_likes+actor_3_facebook_likes+actor_2_name+actor_1_facebook_likes+gross+actor_1_name+num_voted_users+cast_total_facebook_likes+actor_3_name+num_user_for_reviews+language+country+content_rating+budget+title_year+actor_2_facebook_likes+movie_facebook_likes+main_genre",dfRM)
runs_reg = runs_reg_model.fit()
print(runs_reg.summary())
OLS Regression Results ============================================================================== Dep. Variable: imdb_score R-squared: 0.383 Model: OLS Adj. R-squared: 0.379 Method: Least Squares F-statistic: 106.5 Date: Mon, 10 May 2021 Prob (F-statistic): 0.00 Time: 20:12:08 Log-Likelihood: -4677.2 No. Observations: 3800 AIC: 9400. Df Residuals: 3777 BIC: 9544. Df Model: 22 Covariance Type: nonrobust ============================================================================================= coef std err t P>|t| [0.025 0.975] --------------------------------------------------------------------------------------------- Intercept 48.4620 3.298 14.695 0.000 41.996 54.928 color -0.3642 0.078 -4.687 0.000 -0.517 -0.212 director_name 1.691e-05 2.72e-05 0.621 0.535 -3.65e-05 7.03e-05 num_critic_for_reviews 0.0025 0.000 12.863 0.000 0.002 0.003 duration 0.0096 0.001 14.451 0.000 0.008 0.011 director_facebook_likes 3.721e-06 4.71e-06 0.790 0.430 -5.51e-06 1.3e-05 actor_3_facebook_likes 5.73e-05 2.11e-05 2.719 0.007 1.6e-05 9.86e-05 actor_2_name -9.829e-06 2.07e-05 -0.475 0.635 -5.04e-05 3.08e-05 actor_1_facebook_likes 6.236e-05 1.29e-05 4.823 0.000 3.7e-05 8.77e-05 gross -9.463e-10 2.85e-10 -3.325 0.001 -1.5e-09 -3.88e-10 actor_1_name 4.927e-05 3.22e-05 1.530 0.126 -1.39e-05 0.000 num_voted_users 3.657e-06 1.72e-07 21.280 0.000 3.32e-06 3.99e-06 cast_total_facebook_likes -6.086e-05 1.29e-05 -4.720 0.000 -8.61e-05 -3.56e-05 actor_3_name -1.526e-05 1.76e-05 -0.866 0.387 -4.98e-05 1.93e-05 num_user_for_reviews -0.0007 5.69e-05 -11.511 0.000 -0.001 -0.001 language 0.0407 0.006 7.087 0.000 0.029 0.052 country -0.0061 0.001 -4.318 0.000 -0.009 -0.003 content_rating 0.0175 0.009 2.025 0.043 0.001 0.034 budget -7.436e-11 6.12e-11 -1.215 0.225 -1.94e-10 4.57e-11 title_year -0.0218 0.002 -13.180 0.000 -0.025 -0.019 actor_2_facebook_likes 6.453e-05 1.36e-05 4.746 0.000 3.79e-05 9.12e-05 movie_facebook_likes -1.738e-06 9.34e-07 -1.861 0.063 -3.57e-06 9.27e-08 main_genre 0.0314 0.005 6.692 0.000 0.022 0.041 ============================================================================== Omnibus: 537.600 Durbin-Watson: 1.940 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1060.944 Skew: -0.875 Prob(JB): 4.16e-231 Kurtosis: 4.908 Cond. No. 5.63e+10 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 5.63e+10. This might indicate that there are strong multicollinearity or other numerical problems.
Interpretation of coefficients: some coeficients have a negative value even though they are highly correlated with the Y, but since this is a stats model we are going to focus on what the p-value says.
Interpretation of p value: for every X variable with p-value less than 0.05 we consider it significant. for those X variables with p-value more than 0.05 we considered them not influential at all (e.g; director_name, director_facebook_likes, actor_2_name, actor_1_name, actor_3_name, budget, and movie_facebook_likes).
Interpretation of R-sqaured: measure of how well the linear model fit a set of observation. for this model r-squared is 0.383
Full Model#2 (fewer X variables)
#assigning columns to X and Y variables
X = dfRM.drop(['imdb_score', 'color', 'actor_3_facebook_likes', 'cast_total_facebook_likes', 'actor_3_name', 'language', 'country', 'budget', 'title_year'], axis =1)
y = dfRM['imdb_score']
# Model 2 validation
# split validation (70% training & 30% testing data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(2660, 14) (2660,) (1140, 14) (1140,)
# Model 2 building
# build a multiple regression model below
model2 = lm.LinearRegression()
model2.fit(X_train, y_train)
model2_y = model2.predict(X_test)
pd.DataFrame(list(zip(X.columns, np.transpose(model2.coef_)))).round(2)
0 | 1 | |
---|---|---|
0 | director_name | 0.00 |
1 | num_critic_for_reviews | 0.00 |
2 | duration | 0.01 |
3 | director_facebook_likes | 0.00 |
4 | actor_2_name | -0.00 |
5 | actor_1_facebook_likes | 0.00 |
6 | gross | -0.00 |
7 | actor_1_name | 0.00 |
8 | num_voted_users | 0.00 |
9 | num_user_for_reviews | -0.00 |
10 | content_rating | 0.00 |
11 | actor_2_facebook_likes | -0.00 |
12 | movie_facebook_likes | -0.00 |
13 | main_genre | 0.03 |
# Model 2 evaluation
print("mean square error: ", mean_squared_error(y_test, model2_y))
print("variance or r-squared: ", explained_variance_score(y_test, model2_y))
mean square error: 0.7583978138026021 variance or r-squared: 0.3133516604609867
plt.subplots()
plt.scatter(y_test, model2_y) # showing actual y as X-axis and predicted y as Y-axis
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4) #dotted line represents perfect prediction (actual = predicted)
plt.xlabel('Actual')
plt.ylabel('Predicted') #dashed black line in graph below
plt.show()
Comment on model#2: this model has MSE=0.758 and r-squared=0.313 it is no better than the first model.
Lasso Regression (Regularization)
#assigning columns to X and Y variables
X = dfRM.drop(['imdb_score'], axis =1)
y = dfRM['imdb_score']
# split validation (70% training & 30% testing data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(2660, 22) (2660,) (1140, 22) (1140,)
#Fitting the model
model4 = lm.Lasso(alpha=1)
model4.fit(X_train, y_train)
model4_y = model4.predict(X_test)
print('Coefficients: ', model4.coef_)
print("y-intercept ", model4.intercept_)
Coefficients: [-0.00000000e+00 5.51502947e-06 1.95570275e-03 8.91445510e-03 4.10957034e-06 3.68993754e-05 -1.94287132e-06 5.27624872e-05 -1.67028463e-09 2.08743384e-05 3.75225164e-06 -5.19009239e-05 -6.41584264e-06 -4.92817926e-04 0.00000000e+00 -0.00000000e+00 0.00000000e+00 -2.12880572e-10 -7.46309141e-03 5.88536919e-05 -3.00565679e-06 0.00000000e+00] y-intercept 20.020823435201983
pd.DataFrame(list(zip(X.columns, np.transpose(model4.coef_)))).round(2)
0 | 1 | |
---|---|---|
0 | color | -0.00 |
1 | director_name | 0.00 |
2 | num_critic_for_reviews | 0.00 |
3 | duration | 0.01 |
4 | director_facebook_likes | 0.00 |
5 | actor_3_facebook_likes | 0.00 |
6 | actor_2_name | -0.00 |
7 | actor_1_facebook_likes | 0.00 |
8 | gross | -0.00 |
9 | actor_1_name | 0.00 |
10 | num_voted_users | 0.00 |
11 | cast_total_facebook_likes | -0.00 |
12 | actor_3_name | -0.00 |
13 | num_user_for_reviews | -0.00 |
14 | language | 0.00 |
15 | country | -0.00 |
16 | content_rating | 0.00 |
17 | budget | -0.00 |
18 | title_year | -0.01 |
19 | actor_2_facebook_likes | 0.00 |
20 | movie_facebook_likes | -0.00 |
21 | main_genre | 0.00 |
print("mean square error: ", mean_squared_error(y_test, model4_y))
print("variance or r-squared: ", explained_variance_score(y_test, model4_y))
mean square error: 0.7451581727766546 variance or r-squared: 0.3245674947261781
Comment on Lasso model: This regularized model using Lasso has MSE=0.745 and r-squared=0.3245 still not better than the full model.
Feature Selection model
X_new = SelectKBest(f_regression, k=2).fit_transform(X, y)
X_new
array([[1.78000e+02, 8.86204e+05], [1.69000e+02, 4.71220e+05], [1.48000e+02, 2.75868e+05], ..., [8.10000e+01, 5.20550e+04], [9.50000e+01, 1.33800e+03], [9.00000e+01, 4.28500e+03]])
# what are those two columns?
selector = SelectKBest(f_regression, k=2).fit(X, y)
idxs_selected = selector.get_support(indices=True)
print(idxs_selected)
[ 3 10]
X_train.head(2)
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
529 | 1 | 1239 | 67.0 | 99.0 | 545.0 | 533.0 | 1921 | 933.0 | 14567883.0 | 285 | 44248 | 2542 | 1121 | 373.0 | 9 | 43 | 9 | 75000000.0 | 1998.0 | 722.0 | 0 | 0 |
3709 | 1 | 666 | 220.0 | 92.0 | 100.0 | 482.0 | 578 | 970.0 | 44540956.0 | 710 | 161448 | 3950 | 1267 | 1473.0 | 9 | 44 | 6 | 400000.0 | 2004.0 | 759.0 | 0 | 4 |
The 2 variables f_regression determined are the most important are duration and num_voted_users.
# I Choose 2 variables (duration, num_voted_users) and develop a multiple linear regression model.
y_new = dfRM['imdb_score']
X_new = dfRM[['duration', 'num_voted_users']]
# split validation (using X_new)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(2660, 2) (2660,) (1140, 2) (1140,)
# Model Building
model5 = lm.LinearRegression()
model5.fit(X_train, y_train)
model5_y = model5.predict(X_test)
coef = ["%.3f" % i for i in model5.coef_]
x2columns = [i for i in X.columns]
sorted(zip(coef, x2columns), reverse=True)
[('0.010', 'color'), ('0.000', 'director_name')]
pd.DataFrame(list(zip(X.columns, np.transpose(model5.coef_)))).round(2)
0 | 1 | |
---|---|---|
0 | color | 0.01 |
1 | director_name | 0.00 |
# Model Evaluation
print("mean square error: ", mean_squared_error(y_test, model5_y))
print("variance or r-squared: ", explained_variance_score(y_test, model5_y))
mean square error: 0.8147118108347682 variance or r-squared: 0.26196018554763556
plt.subplots()
plt.scatter(y_test, model5_y)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()
Comments on Feature Selection model#1: this model has MSE=0.815 anf r-squared=0.262 still not better than the full model.
Feature Selection model#2
X_new = SelectKBest(f_regression, k=3).fit_transform(X, y)
X_new
array([[7.23000e+02, 1.78000e+02, 8.86204e+05], [3.02000e+02, 1.69000e+02, 4.71220e+05], [6.02000e+02, 1.48000e+02, 2.75868e+05], ..., [5.60000e+01, 8.10000e+01, 5.20550e+04], [1.40000e+01, 9.50000e+01, 1.33800e+03], [4.30000e+01, 9.00000e+01, 4.28500e+03]])
# what are those three columns?
selector = SelectKBest(f_regression, k=3).fit(X, y)
idxs_selected = selector.get_support(indices=True)
print(idxs_selected)
[ 2 3 10]
# split validation (using X_new)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(2660, 3) (2660,) (1140, 3) (1140,)
# Model Building
model6 = lm.LinearRegression()
model6.fit(X_train, y_train)
model6_y = model6.predict(X_test)
# Model Evaluation
print("mean square error: ", mean_squared_error(y_test, model6_y))
print("variance or r-squared: ", explained_variance_score(y_test, model6_y))
mean square error: 0.8149166305784206 variance or r-squared: 0.26154558745430856
Comment on Feature Selection model#2: Feature Selection model#1 is better than Feature Selection model#2. Feature Selection model#2 has MSE=0.815 and r-squared=0.2615
Recursive Feature Selection (RFE): Another Feature Selection Method
lr = lm.LinearRegression()
rfe = RFE(lr, n_features_to_select=2)
rfe_y = rfe.fit(X,y)
print("Features sorted by their rank:")
print(sorted(zip([x for x in rfe.ranking_], X.columns)))
Features sorted by their rank: [(1, 'color'), (1, 'language'), (2, 'content_rating'), (3, 'main_genre'), (4, 'duration'), (5, 'title_year'), (6, 'country'), (7, 'num_critic_for_reviews'), (8, 'num_user_for_reviews'), (9, 'actor_3_facebook_likes'), (10, 'actor_2_facebook_likes'), (11, 'actor_1_facebook_likes'), (12, 'cast_total_facebook_likes'), (13, 'actor_1_name'), (14, 'director_facebook_likes'), (15, 'actor_3_name'), (16, 'actor_2_name'), (17, 'director_name'), (18, 'num_voted_users'), (19, 'movie_facebook_likes'), (20, 'gross'), (21, 'budget')]
RFE model
# I Choose 2 variables (color, language) and develop a multiple linear regression model (model7).
y_new = dfRM['imdb_score']
X_new = dfRM[['color', 'language']]
# Model validation
# split validation (70% training & 30% testing data)
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.3, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
(2660, 2) (2660,) (1140, 2) (1140,)
#Model building
model7 = lm.LinearRegression()
model7.fit(X_train, y_train)
model7_y = model7.predict(X_test)
pd.DataFrame(list(zip(X_new.columns, np.transpose(model7.coef_)))).round(2)
0 | 1 | |
---|---|---|
0 | color | -0.70 |
1 | language | 0.05 |
print("mean square error: ", mean_squared_error(y_test, model7_y))
print("variance or r-squared: ", explained_variance_score(y_test, model7_y))
mean square error: 1.0755484398602855 variance or r-squared: 0.025182699822159593
Comments on RFE model: this model has MSE=1.076 and r-squared=0.025 which is VERY bad.
RandormForestRegressor for Feature Selection
from sklearn.ensemble import RandomForestRegressor
#assigning columns to X and Y variables
X = dfRM.drop(['imdb_score'], axis =1)
y = dfRM['imdb_score']
# split validation (70% training & 30% testing data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# model building
regr = RandomForestRegressor(random_state=0)
regr.fit(X_train, y_train)
regr_predicted = regr.predict(X_test)
print("mean square error: ", mean_squared_error(y_test, regr_predicted))
print("variance or r-squared: ", explained_variance_score(y_test, regr_predicted))
mean square error: 0.475041150877193 variance or r-squared: 0.5691611729372041
# predictors in order of importance
feature_importances = pd.DataFrame(regr.feature_importances_, index = X_train.columns,
columns=['importance']).sort_values('importance', ascending=False)
feature_importances
importance | |
---|---|
num_voted_users | 0.316322 |
duration | 0.108144 |
budget | 0.092743 |
num_user_for_reviews | 0.060537 |
gross | 0.051325 |
num_critic_for_reviews | 0.043779 |
title_year | 0.034242 |
main_genre | 0.030561 |
actor_3_facebook_likes | 0.029434 |
director_name | 0.027687 |
actor_3_name | 0.025523 |
cast_total_facebook_likes | 0.024581 |
actor_2_name | 0.024325 |
director_facebook_likes | 0.023138 |
actor_1_name | 0.022259 |
movie_facebook_likes | 0.021120 |
actor_2_facebook_likes | 0.020579 |
actor_1_facebook_likes | 0.018579 |
content_rating | 0.011938 |
country | 0.009580 |
language | 0.003261 |
color | 0.000344 |
feature_importances.plot(kind='barh');
Comments about Regression models: of all the regression models I made, the RandomForestRegression was the best. it has MSE=0.475 and r-squared=0.569
The goal is to build a classification model to predict if a movie is good or bad.
#creating new datarfame for Classification model
dfCM = dfRM.copy()
dfCM.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 |
2 | 1 | 1436 | 602.0 | 148.0 | 0.0 | 161.0 | 1844 | 11000.0 | 200074175.0 | 261 | 275868 | 11700 | 2347 | 994.0 | 9 | 43 | 7 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 |
3 | 1 | 254 | 813.0 | 164.0 | 22000.0 | 23000.0 | 396 | 27000.0 | 448130642.0 | 1378 | 1144337 | 106759 | 1295 | 2701.0 | 9 | 44 | 7 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 |
4 | 1 | 64 | 462.0 | 132.0 | 475.0 | 530.0 | 1887 | 640.0 | 73058679.0 | 327 | 212204 | 1873 | 2015 | 738.0 | 9 | 44 | 7 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 |
dfCM['imdb_quality'] = 'bad'
dfCM.loc[dfCM['imdb_score'] > 6,'imdb_quality'] = 'good'
dfCM.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | imdb_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 | good |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 | good |
2 | 1 | 1436 | 602.0 | 148.0 | 0.0 | 161.0 | 1844 | 11000.0 | 200074175.0 | 261 | 275868 | 11700 | 2347 | 994.0 | 9 | 43 | 7 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 | good |
3 | 1 | 254 | 813.0 | 164.0 | 22000.0 | 23000.0 | 396 | 27000.0 | 448130642.0 | 1378 | 1144337 | 106759 | 1295 | 2701.0 | 9 | 44 | 7 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 | good |
4 | 1 | 64 | 462.0 | 132.0 | 475.0 | 530.0 | 1887 | 640.0 | 73058679.0 | 327 | 212204 | 1873 | 2015 | 738.0 | 9 | 44 | 7 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 | good |
# How likely a movie is bad or good?
dfCM['imdb_quality'].value_counts() / len(dfCM)
good 0.693684 bad 0.306316 Name: imdb_quality, dtype: float64
s = (dfCM['imdb_quality'].value_counts() / len(dfCM))
ax = s.plot(kind='barh')
[ax.text(v, i, '{:.2f}%'.format(100*v)) for i, v in enumerate(s)];
#mappling or replacing
dfCM = dfCM.replace({'imdb_quality': 'bad'}, {'imdb_quality': '0'})
dfCM = dfCM.replace({'imdb_quality': 'good'}, {'imdb_quality': '1'})
dfCM.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3800 entries, 0 to 3799 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 color 3800 non-null int32 1 director_name 3800 non-null int32 2 num_critic_for_reviews 3800 non-null float64 3 duration 3800 non-null float64 4 director_facebook_likes 3800 non-null float64 5 actor_3_facebook_likes 3800 non-null float64 6 actor_2_name 3800 non-null int32 7 actor_1_facebook_likes 3800 non-null float64 8 gross 3800 non-null float64 9 actor_1_name 3800 non-null int32 10 num_voted_users 3800 non-null int64 11 cast_total_facebook_likes 3800 non-null int64 12 actor_3_name 3800 non-null int32 13 num_user_for_reviews 3800 non-null float64 14 language 3800 non-null int32 15 country 3800 non-null int32 16 content_rating 3800 non-null int32 17 budget 3800 non-null float64 18 title_year 3800 non-null float64 19 actor_2_facebook_likes 3800 non-null float64 20 imdb_score 3800 non-null float64 21 movie_facebook_likes 3800 non-null int64 22 main_genre 3800 non-null int32 23 imdb_quality 3800 non-null object dtypes: float64(11), int32(9), int64(3), object(1) memory usage: 579.0+ KB
dfCM['imdb_quality'] = dfCM['imdb_quality'].astype(int)
print(dfCM.dtypes)
color int32 director_name int32 num_critic_for_reviews float64 duration float64 director_facebook_likes float64 actor_3_facebook_likes float64 actor_2_name int32 actor_1_facebook_likes float64 gross float64 actor_1_name int32 num_voted_users int64 cast_total_facebook_likes int64 actor_3_name int32 num_user_for_reviews float64 language int32 country int32 content_rating int32 budget float64 title_year float64 actor_2_facebook_likes float64 imdb_score float64 movie_facebook_likes int64 main_genre int32 imdb_quality int32 dtype: object
#Exploratory data analysis
# basic statistics
dfCM.describe()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | imdb_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3.800000e+03 | 3800.000000 | 3.800000e+03 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3.800000e+03 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 |
mean | 0.967368 | 880.653421 | 164.724971 | 110.026053 | 794.400526 | 755.126842 | 1117.593947 | 7648.966842 | 5.164025e+07 | 733.248421 | 1.037115e+05 | 11354.007105 | 1331.273158 | 330.334474 | 9.353421 | 40.514211 | 7.584737 | 4.549443e+07 | 2003.072368 | 1980.765526 | 6.459184 | 9202.434474 | 3.642895 | 0.693684 |
std | 0.177694 | 498.570965 | 123.774159 | 22.611570 | 3047.211325 | 1851.494817 | 653.567933 | 15473.939652 | 6.955357e+07 | 419.280739 | 1.509243e+05 | 18999.250279 | 767.424657 | 409.373833 | 2.437659 | 9.827296 | 1.689875 | 2.247288e+08 | 9.885966 | 4495.328345 | 1.054768 | 21399.154854 | 3.030615 | 0.461023 |
min | 0.000000 | 0.000000 | 1.000000 | 37.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.620000e+02 | 0.000000 | 2.200000e+01 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 2.180000e+02 | 1927.000000 | 0.000000 | 1.600000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.000000 | 450.750000 | 74.000000 | 95.000000 | 10.000000 | 188.000000 | 552.750000 | 730.750000 | 7.421793e+06 | 367.000000 | 1.813225e+04 | 1871.000000 | 657.750000 | 105.000000 | 9.000000 | 44.000000 | 7.000000 | 1.000000e+07 | 1999.000000 | 371.750000 | 5.900000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 893.500000 | 136.000000 | 106.000000 | 59.000000 | 432.000000 | 1103.500000 | 1000.000000 | 2.877222e+07 | 732.000000 | 5.238000e+04 | 3955.000000 | 1325.500000 | 205.000000 | 9.000000 | 44.000000 | 7.000000 | 2.500000e+07 | 2005.000000 | 670.000000 | 6.600000 | 217.000000 | 4.000000 | 1.000000 |
75% | 1.000000 | 1332.250000 | 222.000000 | 120.000000 | 230.000000 | 689.250000 | 1685.250000 | 12000.000000 | 6.628274e+07 | 1086.250000 | 1.253340e+05 | 16095.000000 | 2006.000000 | 393.250000 | 9.000000 | 44.000000 | 9.000000 | 5.000000e+07 | 2010.000000 | 973.000000 | 7.200000 | 11000.000000 | 6.000000 | 1.000000 |
max | 1.000000 | 1706.000000 | 813.000000 | 330.000000 | 23000.000000 | 23000.000000 | 2241.000000 | 640000.000000 | 7.605058e+08 | 1468.000000 | 1.689764e+06 | 656730.000000 | 2645.000000 | 5060.000000 | 33.000000 | 45.000000 | 11.000000 | 1.221550e+10 | 2016.000000 | 137000.000000 | 9.300000 | 349000.000000 | 16.000000 | 1.000000 |
# correlation analysis
dfCM.corr()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | imdb_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
color | 1.000000 | -0.019378 | -0.000504 | -0.050758 | -0.059483 | 0.020773 | 0.004718 | 0.024401 | 0.040809 | -0.014031 | -0.036312 | 0.028273 | -0.003840 | -0.069815 | -0.002538 | 0.048200 | -0.003938 | 0.014707 | 0.159880 | 0.019141 | -0.118480 | 0.023283 | -0.039730 | -0.086702 |
director_name | -0.019378 | 1.000000 | -0.001645 | 0.033452 | 0.050708 | -0.042588 | 0.019431 | -0.028597 | -0.001541 | 0.022334 | -0.009080 | -0.038909 | 0.000361 | 0.015093 | -0.029706 | -0.010025 | -0.010658 | -0.007753 | -0.051667 | -0.042471 | 0.016045 | -0.008192 | -0.009628 | 0.015587 |
num_critic_for_reviews | -0.000504 | -0.001645 | 1.000000 | 0.237456 | 0.177850 | 0.247289 | -0.014900 | 0.168864 | 0.469265 | 0.001681 | 0.596951 | 0.237330 | -0.008414 | 0.568929 | -0.022866 | 0.029091 | 0.066474 | 0.106622 | 0.403101 | 0.253847 | 0.350114 | 0.702770 | -0.077460 | 0.262172 |
duration | -0.050758 | 0.033452 | 0.237456 | 1.000000 | 0.181601 | 0.123003 | 0.023126 | 0.085598 | 0.249981 | 0.010341 | 0.344836 | 0.120807 | 0.033452 | 0.357494 | 0.041872 | 0.000228 | 0.066912 | 0.069450 | -0.131890 | 0.128665 | 0.367463 | 0.219787 | -0.040454 | 0.269486 |
director_facebook_likes | -0.059483 | 0.050708 | 0.177850 | 0.181601 | 1.000000 | 0.121721 | -0.012826 | 0.091510 | 0.141708 | 0.015536 | 0.303774 | 0.121362 | 0.002219 | 0.220750 | -0.021517 | 0.055794 | 0.028269 | 0.018888 | -0.047833 | 0.119176 | 0.192332 | 0.162192 | -0.007688 | 0.120775 |
actor_3_facebook_likes | 0.020773 | -0.042588 | 0.247289 | 0.123003 | 0.121721 | 1.000000 | 0.019008 | 0.252765 | 0.282772 | -0.004719 | 0.259370 | 0.485410 | -0.031515 | 0.202895 | -0.050903 | 0.064960 | -0.022043 | 0.039158 | 0.113030 | 0.550690 | 0.064969 | 0.259299 | -0.082684 | 0.036692 |
actor_2_name | 0.004718 | 0.019431 | -0.014900 | 0.023126 | -0.012826 | 0.019008 | 1.000000 | 0.000731 | 0.002834 | 0.006126 | -0.001605 | -0.003397 | -0.021155 | 0.005996 | 0.027842 | -0.009126 | 0.008143 | 0.016084 | -0.018135 | -0.023477 | -0.003265 | -0.016878 | -0.026834 | -0.000792 |
actor_1_facebook_likes | 0.024401 | -0.028597 | 0.168864 | 0.085598 | 0.091510 | 0.252765 | 0.000731 | 1.000000 | 0.145021 | 0.017845 | 0.180013 | 0.946300 | 0.011333 | 0.124256 | -0.059636 | 0.059861 | 0.022929 | 0.017009 | 0.092108 | 0.390333 | 0.093368 | 0.128746 | -0.049751 | 0.066052 |
gross | 0.040809 | -0.001541 | 0.469265 | 0.249981 | 0.141708 | 0.282772 | 0.002834 | 0.145021 | 1.000000 | 0.004855 | 0.626979 | 0.230466 | 0.006172 | 0.550763 | -0.093999 | 0.138867 | -0.224620 | 0.100619 | 0.047498 | 0.245716 | 0.217158 | 0.365256 | -0.261192 | 0.147358 |
actor_1_name | -0.014031 | 0.022334 | 0.001681 | 0.010341 | 0.015536 | -0.004719 | 0.006126 | 0.017845 | 0.004855 | 1.000000 | 0.001935 | 0.014748 | 0.019359 | 0.006295 | 0.001443 | -0.006240 | 0.007233 | 0.001824 | -0.016186 | 0.003300 | 0.025374 | -0.009251 | -0.019649 | 0.010283 |
num_voted_users | -0.036312 | -0.009080 | 0.596951 | 0.344836 | 0.303774 | 0.259370 | -0.001605 | 0.180013 | 0.626979 | 0.001935 | 1.000000 | 0.246180 | -0.006988 | 0.781581 | -0.033118 | 0.073423 | 0.008597 | 0.067780 | 0.017439 | 0.241157 | 0.479947 | 0.518471 | -0.133259 | 0.279504 |
cast_total_facebook_likes | 0.028273 | -0.038909 | 0.237330 | 0.120807 | 0.121362 | 0.485410 | -0.003397 | 0.946300 | 0.230466 | 0.014748 | 0.246180 | 1.000000 | 0.001557 | 0.179592 | -0.073790 | 0.079102 | 0.019012 | 0.028866 | 0.121891 | 0.639878 | 0.106062 | 0.199670 | -0.076193 | 0.075535 |
actor_3_name | -0.003840 | 0.000361 | -0.008414 | 0.033452 | 0.002219 | -0.031515 | -0.021155 | 0.011333 | 0.006172 | 0.019359 | -0.006988 | 0.001557 | 1.000000 | -0.013753 | 0.002429 | 0.018827 | 0.001797 | -0.021540 | -0.009794 | -0.013690 | -0.004077 | -0.013302 | -0.015384 | -0.000189 |
num_user_for_reviews | -0.069815 | 0.015093 | 0.568929 | 0.357494 | 0.220750 | 0.202895 | 0.005996 | 0.124256 | 0.550763 | 0.006295 | 0.781581 | 0.179592 | -0.013753 | 1.000000 | -0.044113 | 0.044190 | 0.045637 | 0.072663 | 0.013325 | 0.187555 | 0.325020 | 0.372988 | -0.077588 | 0.192536 |
language | -0.002538 | -0.029706 | -0.022866 | 0.041872 | -0.021517 | -0.050903 | 0.027842 | -0.059636 | -0.093999 | 0.001443 | -0.033118 | -0.073790 | 0.002429 | -0.044113 | 1.000000 | -0.195474 | 0.048225 | 0.121470 | 0.021866 | -0.057602 | 0.111172 | -0.016942 | 0.041959 | 0.064502 |
country | 0.048200 | -0.010025 | 0.029091 | 0.000228 | 0.055794 | 0.064960 | -0.009126 | 0.059861 | 0.138867 | -0.006240 | 0.073423 | 0.079102 | 0.018827 | 0.044190 | -0.195474 | 1.000000 | -0.046911 | -0.010849 | -0.039854 | 0.064148 | -0.054637 | 0.030963 | -0.016521 | -0.035642 |
content_rating | -0.003938 | -0.010658 | 0.066474 | 0.066912 | 0.028269 | -0.022043 | 0.008143 | 0.022929 | -0.224620 | 0.007233 | 0.008597 | 0.019012 | 0.001797 | 0.045637 | 0.048225 | -0.046911 | 1.000000 | -0.021612 | 0.116049 | 0.026250 | 0.074990 | 0.020613 | 0.201710 | 0.063734 |
budget | 0.014707 | -0.007753 | 0.106622 | 0.069450 | 0.018888 | 0.039158 | 0.016084 | 0.017009 | 0.100619 | 0.001824 | 0.067780 | 0.028866 | -0.021540 | 0.072663 | 0.121470 | -0.010849 | -0.021612 | 1.000000 | 0.044517 | 0.035664 | 0.030437 | 0.052961 | -0.079110 | 0.023186 |
title_year | 0.159880 | -0.051667 | 0.403101 | -0.131890 | -0.047833 | 0.113030 | -0.018135 | 0.092108 | 0.047498 | -0.016186 | 0.017439 | 0.121891 | -0.009794 | 0.013325 | 0.021866 | -0.039854 | 0.116049 | 0.044517 | 1.000000 | 0.118707 | -0.133390 | 0.299590 | -0.020285 | -0.070852 |
actor_2_facebook_likes | 0.019141 | -0.042471 | 0.253847 | 0.128665 | 0.119176 | 0.550690 | -0.023477 | 0.390333 | 0.245716 | 0.003300 | 0.241157 | 0.639878 | -0.013690 | 0.187555 | -0.057602 | 0.064148 | 0.026250 | 0.035664 | 0.118707 | 1.000000 | 0.101208 | 0.225995 | -0.077700 | 0.079633 |
imdb_score | -0.118480 | 0.016045 | 0.350114 | 0.367463 | 0.192332 | 0.064969 | -0.003265 | 0.093368 | 0.217158 | 0.025374 | 0.479947 | 0.106062 | -0.004077 | 0.325020 | 0.111172 | -0.054637 | 0.074990 | 0.030437 | -0.133390 | 0.101208 | 1.000000 | 0.283379 | 0.054811 | 0.776134 |
movie_facebook_likes | 0.023283 | -0.008192 | 0.702770 | 0.219787 | 0.162192 | 0.259299 | -0.016878 | 0.128746 | 0.365256 | -0.009251 | 0.518471 | 0.199670 | -0.013302 | 0.372988 | -0.016942 | 0.030963 | 0.020613 | 0.052961 | 0.299590 | 0.225995 | 0.283379 | 1.000000 | -0.069939 | 0.172567 |
main_genre | -0.039730 | -0.009628 | -0.077460 | -0.040454 | -0.007688 | -0.082684 | -0.026834 | -0.049751 | -0.261192 | -0.019649 | -0.133259 | -0.076193 | -0.015384 | -0.077588 | 0.041959 | -0.016521 | 0.201710 | -0.079110 | -0.020285 | -0.077700 | 0.054811 | -0.069939 | 1.000000 | 0.040945 |
imdb_quality | -0.086702 | 0.015587 | 0.262172 | 0.269486 | 0.120775 | 0.036692 | -0.000792 | 0.066052 | 0.147358 | 0.010283 | 0.279504 | 0.075535 | -0.000189 | 0.192536 | 0.064502 | -0.035642 | 0.063734 | 0.023186 | -0.070852 | 0.079633 | 0.776134 | 0.172567 | 0.040945 | 1.000000 |
corr = pd.DataFrame(dfCM.corr()['imdb_quality'].drop('imdb_quality'))
corr.sort_values(['imdb_quality'], ascending = False)
imdb_quality | |
---|---|
imdb_score | 0.776134 |
num_voted_users | 0.279504 |
duration | 0.269486 |
num_critic_for_reviews | 0.262172 |
num_user_for_reviews | 0.192536 |
movie_facebook_likes | 0.172567 |
gross | 0.147358 |
director_facebook_likes | 0.120775 |
actor_2_facebook_likes | 0.079633 |
cast_total_facebook_likes | 0.075535 |
actor_1_facebook_likes | 0.066052 |
language | 0.064502 |
content_rating | 0.063734 |
main_genre | 0.040945 |
actor_3_facebook_likes | 0.036692 |
budget | 0.023186 |
director_name | 0.015587 |
actor_1_name | 0.010283 |
actor_3_name | -0.000189 |
actor_2_name | -0.000792 |
country | -0.035642 |
title_year | -0.070852 |
color | -0.086702 |
# heatmap for correlation
plt.figure(figsize=(8,8))
sns.heatmap(dfCM.corr(), annot=True)
<AxesSubplot:>
# cluster map
sns.clustermap(data=dfCM.corr(), annot=True, cmap='Greens')
<seaborn.matrix.ClusterGrid at 0x19884ddd8e0>
dfCM.head(2)
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | imdb_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 | 1 |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 | 1 |
dfCM.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3800 entries, 0 to 3799 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 color 3800 non-null int32 1 director_name 3800 non-null int32 2 num_critic_for_reviews 3800 non-null float64 3 duration 3800 non-null float64 4 director_facebook_likes 3800 non-null float64 5 actor_3_facebook_likes 3800 non-null float64 6 actor_2_name 3800 non-null int32 7 actor_1_facebook_likes 3800 non-null float64 8 gross 3800 non-null float64 9 actor_1_name 3800 non-null int32 10 num_voted_users 3800 non-null int64 11 cast_total_facebook_likes 3800 non-null int64 12 actor_3_name 3800 non-null int32 13 num_user_for_reviews 3800 non-null float64 14 language 3800 non-null int32 15 country 3800 non-null int32 16 content_rating 3800 non-null int32 17 budget 3800 non-null float64 18 title_year 3800 non-null float64 19 actor_2_facebook_likes 3800 non-null float64 20 imdb_score 3800 non-null float64 21 movie_facebook_likes 3800 non-null int64 22 main_genre 3800 non-null int32 23 imdb_quality 3800 non-null int32 dtypes: float64(11), int32(10), int64(3) memory usage: 564.2 KB
# convert categorical variables to dummy variables
dfCM = pd.get_dummies(dfCM, columns=["color", "language", "country", "content_rating", "main_genre"],
prefix=["color", "language", "country", "content_rating", "main_genre"],
drop_first=True)
dfCM.head(2)
director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | imdb_quality | color_1 | language_1 | language_2 | language_3 | language_4 | language_5 | language_6 | language_7 | language_8 | language_9 | language_10 | language_11 | language_12 | language_13 | language_14 | language_15 | language_16 | language_17 | language_18 | language_19 | language_20 | language_21 | language_22 | language_23 | language_24 | language_25 | language_26 | language_27 | language_28 | language_29 | language_30 | language_31 | language_32 | language_33 | country_1 | country_2 | country_3 | country_4 | country_5 | country_6 | country_7 | country_8 | country_9 | country_10 | country_11 | country_12 | country_13 | country_14 | country_15 | country_16 | country_17 | country_18 | country_19 | country_20 | country_21 | country_22 | country_23 | country_24 | country_25 | country_26 | country_27 | country_28 | country_29 | country_30 | country_31 | country_32 | country_33 | country_34 | country_35 | country_36 | country_37 | country_38 | country_39 | country_40 | country_41 | country_42 | country_43 | country_44 | country_45 | content_rating_1 | content_rating_2 | content_rating_3 | content_rating_4 | content_rating_5 | content_rating_6 | content_rating_7 | content_rating_8 | content_rating_9 | content_rating_10 | content_rating_11 | main_genre_1 | main_genre_2 | main_genre_3 | main_genre_4 | main_genre_5 | main_genre_6 | main_genre_7 | main_genre_8 | main_genre_9 | main_genre_10 | main_genre_11 | main_genre_12 | main_genre_13 | main_genre_14 | main_genre_15 | main_genre_16 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | 703 | 471220 | 48350 | 1024 | 1238.0 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
dfCM.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3800 entries, 0 to 3799 Columns: 125 entries, director_name to main_genre_16 dtypes: float64(11), int32(5), int64(3), uint8(106) memory usage: 883.3 KB
Decision tree
# declare X variables and y variable
y = dfCM['imdb_quality']
X = dfCM.drop(['imdb_quality', 'imdb_score'], axis=1)
# split validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Initialize DecisionTreeClassifier() ... name your decision model "dt"
dt = DecisionTreeClassifier()
# Train a decision tree model
dt.fit(X_train, y_train)
DecisionTreeClassifier()
print(len(X_train))
print(len(X_test))
2660 1140
#Model evaluation
# http://scikit-learn.org/stable/modules/model_evaluation.html
print(metrics.accuracy_score(y_test, dt.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, dt.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, dt.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.roc_auc_score(y_test, dt.predict(X_test)))
# y-test is the actual y value in the testing dataset
# dt.predict(X_test) is the predicted y value generated by your model
# If they are same, we can say your model is accurate.
0.7403508771929824 -------------------------------------------------------- [[189 141] [155 655]] -------------------------------------------------------- precision recall f1-score support 0 0.55 0.57 0.56 330 1 0.82 0.81 0.82 810 accuracy 0.74 1140 macro avg 0.69 0.69 0.69 1140 weighted avg 0.74 0.74 0.74 1140 -------------------------------------------------------- 0.6906846240179574
!pip install scikit-plot
Requirement already satisfied: scikit-plot in c:\users\patri\anaconda3\lib\site-packages (0.3.7) Requirement already satisfied: matplotlib>=1.4.0 in c:\users\patri\anaconda3\lib\site-packages (from scikit-plot) (3.3.2) Requirement already satisfied: scikit-learn>=0.18 in c:\users\patri\anaconda3\lib\site-packages (from scikit-plot) (0.23.2) Requirement already satisfied: scipy>=0.9 in c:\users\patri\anaconda3\lib\site-packages (from scikit-plot) (1.5.2) Requirement already satisfied: joblib>=0.10 in c:\users\patri\anaconda3\lib\site-packages (from scikit-plot) (0.17.0) Requirement already satisfied: pillow>=6.2.0 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (8.0.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (1.3.0) Requirement already satisfied: certifi>=2020.06.20 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (2020.6.20) Requirement already satisfied: cycler>=0.10 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (0.10.0) Requirement already satisfied: python-dateutil>=2.1 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (2.8.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (2.4.7) Requirement already satisfied: numpy>=1.15 in c:\users\patri\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (1.19.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\patri\anaconda3\lib\site-packages (from scikit-learn>=0.18->scikit-plot) (2.1.0) Requirement already satisfied: six in c:\users\patri\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib>=1.4.0->scikit-plot) (1.15.0)
skplt.metrics.plot_confusion_matrix(y_true=np.array(y_test), y_pred=dt.predict(X_test))
plt.show()
Confusion Matrix explanation
141 movies were misclassified as bad movies
155 movies were misclassified as good movies
The decision tree model is 74% accurate
Therefore, we expect that the model will be about 74% accurate when the model is applied into a real-world situation
True Positive Rate (Sensitivity) = 655/810 = 0.80
False Positive Rate = 141/330 = 0.427
True Negative Rate (Specificity) = 189/330 = 0.572
False Negatve Rate = 155/810 = 0.191
Visualizing decision tree
text_representation = tree.export_text(dt, feature_names=(list(X.columns)))
print(text_representation)
|--- num_voted_users <= 86645.50 | |--- duration <= 108.50 | | |--- budget <= 15550000.00 | | | |--- main_genre_10 <= 0.50 | | | | |--- num_critic_for_reviews <= 70.50 | | | | | |--- budget <= 2150000.00 | | | | | | |--- num_critic_for_reviews <= 17.00 | | | | | | | |--- actor_3_facebook_likes <= 16.00 | | | | | | | | |--- cast_total_facebook_likes <= 349.00 | | | | | | | | | |--- class: 1 | | | | | | | | |--- cast_total_facebook_likes > 349.00 | | | | | | | | | |--- class: 0 | | | | | | | |--- actor_3_facebook_likes > 16.00 | | | | | | | | |--- movie_facebook_likes <= 84.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- movie_facebook_likes > 84.00 | | | | | | | | | |--- title_year <= 2000.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- title_year > 2000.50 | | | | | | | | | | |--- cast_total_facebook_likes <= 886.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- cast_total_facebook_likes > 886.00 | | | | | | | | | | | |--- class: 0 | | | | | | |--- num_critic_for_reviews > 17.00 | | | | | | | |--- gross <= 4082.00 | | | | | | | | |--- class: 0 | | | | | | | |--- gross > 4082.00 | | | | | | | | |--- title_year <= 1987.50 | | | | | | | | | |--- director_facebook_likes <= 27.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- director_facebook_likes > 27.00 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- title_year > 1987.50 | | | | | | | | | |--- duration <= 84.50 | | | | | | | | | | |--- cast_total_facebook_likes <= 3197.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- cast_total_facebook_likes > 3197.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- duration > 84.50 | | | | | | | | | | |--- title_year <= 2014.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- title_year > 2014.00 | | | | | | | | | | | |--- class: 0 | | | | | |--- budget > 2150000.00 | | | | | | |--- num_voted_users <= 29235.50 | | | | | | | |--- duration <= 96.50 | | | | | | | | |--- main_genre_7 <= 0.50 | | | | | | | | | |--- director_facebook_likes <= 221.00 | | | | | | | | | | |--- actor_3_facebook_likes <= 41.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- actor_3_facebook_likes > 41.00 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | |--- director_facebook_likes > 221.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- main_genre_7 > 0.50 | | | | | | | | | |--- actor_2_name <= 634.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- actor_2_name > 634.50 | | | | | | | | | | |--- title_year <= 2011.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- title_year > 2011.50 | | | | | | | | | | | |--- class: 0 | | | | | | | |--- duration > 96.50 | | | | | | | | |--- actor_2_facebook_likes <= 287.00 | | | | | | | | | |--- actor_2_name <= 2176.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- actor_2_name > 2176.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- actor_2_facebook_likes > 287.00 | | | | | | | | | |--- director_name <= 134.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- director_name > 134.00 | | | | | | | | | | |--- director_name <= 905.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- director_name > 905.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | |--- num_voted_users > 29235.50 | | | | | | | |--- actor_1_name <= 1269.00 | | | | | | | | |--- class: 1 | | | | | | | |--- actor_1_name > 1269.00 | | | | | | | | |--- class: 0 | | | | |--- num_critic_for_reviews > 70.50 | | | | | |--- gross <= 12247547.00 | | | | | | |--- title_year <= 2011.50 | | | | | | | |--- content_rating_11 <= 0.50 | | | | | | | | |--- main_genre_12 <= 0.50 | | | | | | | | | |--- movie_facebook_likes <= 41500.00 | | | | | | | | | | |--- director_name <= 29.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- director_name > 29.50 | | | | | | | | | | | |--- truncated branch of depth 8 | | | | | | | | | |--- movie_facebook_likes > 41500.00 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- main_genre_12 > 0.50 | | | | | | | | | |--- class: 0 | | | | | | | |--- content_rating_11 > 0.50 | | | | | | | | |--- class: 0 | | | | | | |--- title_year > 2011.50 | | | | | | | |--- actor_3_facebook_likes <= 342.00 | | | | | | | | |--- num_critic_for_reviews <= 90.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- num_critic_for_reviews > 90.50 | | | | | | | | | |--- class: 1 | | | | | | | |--- actor_3_facebook_likes > 342.00 | | | | | | | | |--- budget <= 6500000.00 | | | | | | | | | |--- cast_total_facebook_likes <= 47582.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- cast_total_facebook_likes > 47582.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- budget > 6500000.00 | | | | | | | | | |--- gross <= 8660346.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- gross > 8660346.50 | | | | | | | | | | |--- class: 0 | | | | | |--- gross > 12247547.00 | | | | | | |--- num_voted_users <= 37697.50 | | | | | | | |--- duration <= 93.50 | | | | | | | | |--- actor_3_name <= 2022.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- actor_3_name > 2022.00 | | | | | | | | | |--- class: 1 | | | | | | | |--- duration > 93.50 | | | | | | | | |--- actor_1_facebook_likes <= 924.50 | | | | | | | | | |--- movie_facebook_likes <= 5425.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- movie_facebook_likes > 5425.50 | | | | | | | | | | |--- content_rating_7 <= 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- content_rating_7 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_1_facebook_likes > 924.50 | | | | | | | | | |--- num_user_for_reviews <= 148.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- num_user_for_reviews > 148.00 | | | | | | | | | | |--- director_facebook_likes <= 47.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- director_facebook_likes > 47.00 | | | | | | | | | | | |--- class: 1 | | | | | | |--- num_voted_users > 37697.50 | | | | | | | |--- num_user_for_reviews <= 299.00 | | | | | | | | |--- director_facebook_likes <= 521.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- director_facebook_likes > 521.50 | | | | | | | | | |--- director_facebook_likes <= 608.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- director_facebook_likes > 608.00 | | | | | | | | | | |--- class: 1 | | | | | | | |--- num_user_for_reviews > 299.00 | | | | | | | | |--- director_facebook_likes <= 3.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- director_facebook_likes > 3.50 | | | | | | | | | |--- duration <= 100.50 | | | | | | | | | | |--- cast_total_facebook_likes <= 1864.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- cast_total_facebook_likes > 1864.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- duration > 100.50 | | | | | | | | | | |--- class: 1 | | | |--- main_genre_10 > 0.50 | | | | |--- director_facebook_likes <= 120.50 | | | | | |--- movie_facebook_likes <= 25500.00 | | | | | | |--- num_voted_users <= 81258.00 | | | | | | | |--- language_30 <= 0.50 | | | | | | | | |--- actor_1_name <= 155.00 | | | | | | | | | |--- budget <= 7000000.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- budget > 7000000.00 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- actor_1_name > 155.00 | | | | | | | | | |--- class: 0 | | | | | | | |--- language_30 > 0.50 | | | | | | | | |--- class: 1 | | | | | | |--- num_voted_users > 81258.00 | | | | | | | |--- class: 1 | | | | | |--- movie_facebook_likes > 25500.00 | | | | | | |--- class: 1 | | | | |--- director_facebook_likes > 120.50 | | | | | |--- director_name <= 1586.00 | | | | | | |--- class: 1 | | | | | |--- director_name > 1586.00 | | | | | | |--- class: 0 | | |--- budget > 15550000.00 | | | |--- num_voted_users <= 30075.00 | | | | |--- actor_3_facebook_likes <= 105.50 | | | | | |--- actor_2_name <= 306.50 | | | | | | |--- class: 0 | | | | | |--- actor_2_name > 306.50 | | | | | | |--- main_genre_10 <= 0.50 | | | | | | | |--- director_name <= 608.50 | | | | | | | | |--- duration <= 99.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- duration > 99.00 | | | | | | | | | |--- class: 1 | | | | | | | |--- director_name > 608.50 | | | | | | | | |--- main_genre_1 <= 0.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- main_genre_1 > 0.50 | | | | | | | | | |--- actor_3_name <= 1724.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- actor_3_name > 1724.50 | | | | | | | | | | |--- actor_3_name <= 2588.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- actor_3_name > 2588.00 | | | | | | | | | | | |--- class: 1 | | | | | | |--- main_genre_10 > 0.50 | | | | | | | |--- class: 0 | | | | |--- actor_3_facebook_likes > 105.50 | | | | | |--- movie_facebook_likes <= 12500.00 | | | | | | |--- actor_1_name <= 1459.00 | | | | | | | |--- actor_2_facebook_likes <= 6500.00 | | | | | | | | |--- actor_2_name <= 940.50 | | | | | | | | | |--- actor_2_name <= 773.50 | | | | | | | | | | |--- director_name <= 1441.50 | | | | | | | | | | | |--- truncated branch of depth 6 | | | | | | | | | | |--- director_name > 1441.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- actor_2_name > 773.50 | | | | | | | | | | |--- duration <= 97.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- duration > 97.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_2_name > 940.50 | | | | | | | | | |--- duration <= 107.50 | | | | | | | | | | |--- director_facebook_likes <= 386.50 | | | | | | | | | | | |--- truncated branch of depth 9 | | | | | | | | | | |--- director_facebook_likes > 386.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- duration > 107.50 | | | | | | | | | | |--- class: 1 | | | | | | | |--- actor_2_facebook_likes > 6500.00 | | | | | | | | |--- actor_3_name <= 2184.00 | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_3_name > 2184.00 | | | | | | | | | |--- class: 0 | | | | | | |--- actor_1_name > 1459.00 | | | | | | | |--- class: 1 | | | | | |--- movie_facebook_likes > 12500.00 | | | | | | |--- movie_facebook_likes <= 54500.00 | | | | | | | |--- class: 1 | | | | | | |--- movie_facebook_likes > 54500.00 | | | | | | | |--- class: 0 | | | |--- num_voted_users > 30075.00 | | | | |--- num_user_for_reviews <= 178.50 | | | | | |--- budget <= 34500000.00 | | | | | | |--- duration <= 89.50 | | | | | | | |--- director_facebook_likes <= 105.50 | | | | | | | | |--- class: 0 | | | | | | | |--- director_facebook_likes > 105.50 | | | | | | | | |--- class: 1 | | | | | | |--- duration > 89.50 | | | | | | | |--- title_year <= 1987.00 | | | | | | | | |--- class: 0 | | | | | | | |--- title_year > 1987.00 | | | | | | | | |--- num_user_for_reviews <= 133.50 | | | | | | | | | |--- country_6 <= 0.50 | | | | | | | | | | |--- actor_2_name <= 133.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- actor_2_name > 133.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- country_6 > 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- num_user_for_reviews > 133.50 | | | | | | | | | |--- actor_3_name <= 823.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- actor_3_name > 823.00 | | | | | | | | | | |--- director_facebook_likes <= 37.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- director_facebook_likes > 37.00 | | | | | | | | | | | |--- class: 1 | | | | | |--- budget > 34500000.00 | | | | | | |--- main_genre_1 <= 0.50 | | | | | | | |--- main_genre_2 <= 0.50 | | | | | | | | |--- duration <= 102.50 | | | | | | | | | |--- director_name <= 68.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- director_name > 68.50 | | | | | | | | | | |--- country_43 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | | |--- country_43 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- duration > 102.50 | | | | | | | | | |--- actor_1_name <= 1296.50 | | | | | | | | | | |--- director_name <= 205.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- director_name > 205.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- actor_1_name > 1296.50 | | | | | | | | | | |--- class: 0 | | | | | | | |--- main_genre_2 > 0.50 | | | | | | | | |--- class: 1 | | | | | | |--- main_genre_1 > 0.50 | | | | | | | |--- actor_1_facebook_likes <= 935.00 | | | | | | | | |--- actor_3_facebook_likes <= 83.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_3_facebook_likes > 83.50 | | | | | | | | | |--- class: 0 | | | | | | | |--- actor_1_facebook_likes > 935.00 | | | | | | | | |--- class: 1 | | | | |--- num_user_for_reviews > 178.50 | | | | | |--- budget <= 125500000.00 | | | | | | |--- num_voted_users <= 82309.50 | | | | | | | |--- actor_1_name <= 1418.50 | | | | | | | | |--- director_facebook_likes <= 834.50 | | | | | | | | | |--- director_facebook_likes <= 15.50 | | | | | | | | | | |--- actor_2_name <= 1603.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- actor_2_name > 1603.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | |--- director_facebook_likes > 15.50 | | | | | | | | | | |--- title_year <= 2005.50 | | | | | | | | | | | |--- truncated branch of depth 7 | | | | | | | | | | |--- title_year > 2005.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | |--- director_facebook_likes > 834.50 | | | | | | | | | |--- duration <= 100.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- duration > 100.50 | | | | | | | | | | |--- class: 1 | | | | | | | |--- actor_1_name > 1418.50 | | | | | | | | |--- class: 1 | | | | | | |--- num_voted_users > 82309.50 | | | | | | | |--- title_year <= 2006.50 | | | | | | | | |--- class: 1 | | | | | | | |--- title_year > 2006.50 | | | | | | | | |--- actor_2_name <= 1381.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- actor_2_name > 1381.50 | | | | | | | | | |--- budget <= 39950000.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- budget > 39950000.00 | | | | | | | | | | |--- class: 0 | | | | | |--- budget > 125500000.00 | | | | | | |--- actor_1_facebook_likes <= 55000.00 | | | | | | | |--- class: 1 | | | | | | |--- actor_1_facebook_likes > 55000.00 | | | | | | | |--- class: 0 | |--- duration > 108.50 | | |--- budget <= 36500000.00 | | | |--- num_voted_users <= 10991.50 | | | | |--- duration <= 124.50 | | | | | |--- budget <= 16000000.00 | | | | | | |--- actor_2_name <= 190.00 | | | | | | | |--- class: 0 | | | | | | |--- actor_2_name > 190.00 | | | | | | | |--- content_rating_7 <= 0.50 | | | | | | | | |--- num_voted_users <= 3161.00 | | | | | | | | | |--- actor_1_name <= 452.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- actor_1_name > 452.50 | | | | | | | | | | |--- director_name <= 674.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- director_name > 674.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | |--- num_voted_users > 3161.00 | | | | | | | | | |--- num_user_for_reviews <= 91.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- num_user_for_reviews > 91.50 | | | | | | | | | | |--- budget <= 5500000.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- budget > 5500000.00 | | | | | | | | | | | |--- class: 1 | | | | | | | |--- content_rating_7 > 0.50 | | | | | | | | |--- title_year <= 2003.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- title_year > 2003.50 | | | | | | | | | |--- director_name <= 244.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- director_name > 244.00 | | | | | | | | | | |--- num_critic_for_reviews <= 76.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- num_critic_for_reviews > 76.50 | | | | | | | | | | | |--- class: 1 | | | | | |--- budget > 16000000.00 | | | | | | |--- main_genre_3 <= 0.50 | | | | | | | |--- actor_1_name <= 1269.00 | | | | | | | | |--- actor_1_name <= 74.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_1_name > 74.50 | | | | | | | | | |--- language_11 <= 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- language_11 > 0.50 | | | | | | | | | | |--- class: 1 | | | | | | | |--- actor_1_name > 1269.00 | | | | | | | | |--- movie_facebook_likes <= 181.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- movie_facebook_likes > 181.00 | | | | | | | | | |--- class: 1 | | | | | | |--- main_genre_3 > 0.50 | | | | | | | |--- class: 1 | | | | |--- duration > 124.50 | | | | | |--- country_13 <= 0.50 | | | | | | |--- cast_total_facebook_likes <= 22515.50 | | | | | | | |--- class: 1 | | | | | | |--- cast_total_facebook_likes > 22515.50 | | | | | | | |--- class: 0 | | | | | |--- country_13 > 0.50 | | | | | | |--- class: 0 | | | |--- num_voted_users > 10991.50 | | | | |--- duration <= 119.50 | | | | | |--- budget <= 14400000.00 | | | | | | |--- content_rating_1 <= 0.50 | | | | | | | |--- country_3 <= 0.50 | | | | | | | | |--- duration <= 118.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- duration > 118.50 | | | | | | | | | |--- actor_1_name <= 317.00 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- actor_1_name > 317.00 | | | | | | | | | | |--- class: 1 | | | | | | | |--- country_3 > 0.50 | | | | | | | | |--- class: 0 | | | | | | |--- content_rating_1 > 0.50 | | | | | | | |--- class: 0 | | | | | |--- budget > 14400000.00 | | | | | | |--- num_voted_users <= 48941.00 | | | | | | | |--- num_voted_users <= 45238.00 | | | | | | | | |--- num_critic_for_reviews <= 95.50 | | | | | | | | | |--- num_user_for_reviews <= 200.00 | | | | | | | | | | |--- actor_2_facebook_likes <= 615.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- actor_2_facebook_likes > 615.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- num_user_for_reviews > 200.00 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- num_critic_for_reviews > 95.50 | | | | | | | | | |--- num_user_for_reviews <= 345.50 | | | | | | | | | | |--- budget <= 14900000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- budget > 14900000.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | |--- num_user_for_reviews > 345.50 | | | | | | | | | | |--- class: 0 | | | | | | | |--- num_voted_users > 45238.00 | | | | | | | | |--- class: 0 | | | | | | |--- num_voted_users > 48941.00 | | | | | | | |--- num_user_for_reviews <= 94.50 | | | | | | | | |--- class: 0 | | | | | | | |--- num_user_for_reviews > 94.50 | | | | | | | | |--- actor_3_facebook_likes <= 53.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- actor_3_facebook_likes > 53.50 | | | | | | | | | |--- num_critic_for_reviews <= 283.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- num_critic_for_reviews > 283.00 | | | | | | | | | | |--- budget <= 19000000.00 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- budget > 19000000.00 | | | | | | | | | | | |--- class: 1 | | | | |--- duration > 119.50 | | | | | |--- actor_2_name <= 256.00 | | | | | | |--- movie_facebook_likes <= 13000.00 | | | | | | | |--- actor_3_name <= 2499.00 | | | | | | | | |--- class: 1 | | | | | | | |--- actor_3_name > 2499.00 | | | | | | | | |--- class: 0 | | | | | | |--- movie_facebook_likes > 13000.00 | | | | | | | |--- class: 0 | | | | | |--- actor_2_name > 256.00 | | | | | | |--- gross <= 97234792.00 | | | | | | | |--- actor_3_name <= 2591.50 | | | | | | | | |--- country_13 <= 0.50 | | | | | | | | | |--- budget <= 33500000.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- budget > 33500000.00 | | | | | | | | | | |--- actor_3_name <= 513.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- actor_3_name > 513.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- country_13 > 0.50 | | | | | | | | | |--- actor_2_facebook_likes <= 703.00 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- actor_2_facebook_likes > 703.00 | | | | | | | | | | |--- class: 0 | | | | | | | |--- actor_3_name > 2591.50 | | | | | | | | |--- budget <= 18500000.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- budget > 18500000.00 | | | | | | | | | |--- class: 1 | | | | | | |--- gross > 97234792.00 | | | | | | | |--- title_year <= 1987.00 | | | | | | | | |--- class: 0 | | | | | | | |--- title_year > 1987.00 | | | | | | | | |--- class: 1 | | |--- budget > 36500000.00 | | | |--- actor_2_facebook_likes <= 4500.00 | | | | |--- num_voted_users <= 78910.50 | | | | | |--- gross <= 3166911.50 | | | | | | |--- budget <= 39000000.00 | | | | | | | |--- class: 0 | | | | | | |--- budget > 39000000.00 | | | | | | | |--- class: 1 | | | | | |--- gross > 3166911.50 | | | | | | |--- actor_2_name <= 142.50 | | | | | | | |--- num_user_for_reviews <= 286.50 | | | | | | | | |--- class: 1 | | | | | | | |--- num_user_for_reviews > 286.50 | | | | | | | | |--- class: 0 | | | | | | |--- actor_2_name > 142.50 | | | | | | | |--- gross <= 31435074.00 | | | | | | | | |--- actor_2_facebook_likes <= 853.50 | | | | | | | | | |--- actor_2_facebook_likes <= 741.50 | | | | | | | | | | |--- language_9 <= 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- language_9 > 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | |--- actor_2_facebook_likes > 741.50 | | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_2_facebook_likes > 853.50 | | | | | | | | | |--- actor_3_facebook_likes <= 883.00 | | | | | | | | | | |--- director_facebook_likes <= 739.00 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- director_facebook_likes > 739.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- actor_3_facebook_likes > 883.00 | | | | | | | | | | |--- actor_2_name <= 1610.00 | | | | | | | | | | | |--- class: 1 | | | | | | | | | | |--- actor_2_name > 1610.00 | | | | | | | | | | | |--- class: 0 | | | | | | | |--- gross > 31435074.00 | | | | | | | | |--- num_critic_for_reviews <= 39.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- num_critic_for_reviews > 39.50 | | | | | | | | | |--- budget <= 62500000.00 | | | | | | | | | | |--- actor_2_facebook_likes <= 401.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- actor_2_facebook_likes > 401.50 | | | | | | | | | | | |--- truncated branch of depth 5 | | | | | | | | | |--- budget > 62500000.00 | | | | | | | | | | |--- num_voted_users <= 58005.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- num_voted_users > 58005.00 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | |--- num_voted_users > 78910.50 | | | | | |--- class: 1 | | | |--- actor_2_facebook_likes > 4500.00 | | | | |--- class: 1 |--- num_voted_users > 86645.50 | |--- num_voted_users <= 143731.50 | | |--- budget <= 29500000.00 | | | |--- main_genre_10 <= 0.50 | | | | |--- num_user_for_reviews <= 105.00 | | | | | |--- class: 0 | | | | |--- num_user_for_reviews > 105.00 | | | | | |--- actor_2_name <= 135.50 | | | | | | |--- content_rating_7 <= 0.50 | | | | | | | |--- class: 0 | | | | | | |--- content_rating_7 > 0.50 | | | | | | | |--- class: 1 | | | | | |--- actor_2_name > 135.50 | | | | | | |--- duration <= 83.50 | | | | | | | |--- content_rating_7 <= 0.50 | | | | | | | | |--- class: 1 | | | | | | | |--- content_rating_7 > 0.50 | | | | | | | | |--- class: 0 | | | | | | |--- duration > 83.50 | | | | | | | |--- num_user_for_reviews <= 663.00 | | | | | | | | |--- class: 1 | | | | | | | |--- num_user_for_reviews > 663.00 | | | | | | | | |--- num_voted_users <= 91897.50 | | | | | | | | | |--- class: 0 | | | | | | | | |--- num_voted_users > 91897.50 | | | | | | | | | |--- class: 1 | | | |--- main_genre_10 > 0.50 | | | | |--- gross <= 46866564.00 | | | | | |--- class: 1 | | | | |--- gross > 46866564.00 | | | | | |--- actor_3_facebook_likes <= 269.50 | | | | | | |--- class: 1 | | | | | |--- actor_3_facebook_likes > 269.50 | | | | | | |--- class: 0 | | |--- budget > 29500000.00 | | | |--- duration <= 127.50 | | | | |--- num_user_for_reviews <= 677.50 | | | | | |--- actor_3_facebook_likes <= 832.50 | | | | | | |--- cast_total_facebook_likes <= 23119.00 | | | | | | | |--- actor_2_facebook_likes <= 877.50 | | | | | | | | |--- country_3 <= 0.50 | | | | | | | | | |--- main_genre_4 <= 0.50 | | | | | | | | | | |--- director_name <= 84.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- director_name > 84.50 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | |--- main_genre_4 > 0.50 | | | | | | | | | | |--- title_year <= 2003.50 | | | | | | | | | | | |--- truncated branch of depth 2 | | | | | | | | | | |--- title_year > 2003.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | |--- country_3 > 0.50 | | | | | | | | | |--- class: 0 | | | | | | | |--- actor_2_facebook_likes > 877.50 | | | | | | | | |--- gross <= 75334592.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- gross > 75334592.00 | | | | | | | | | |--- num_voted_users <= 107277.00 | | | | | | | | | | |--- content_rating_1 <= 0.50 | | | | | | | | | | | |--- class: 0 | | | | | | | | | | |--- content_rating_1 > 0.50 | | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- num_voted_users > 107277.00 | | | | | | | | | | |--- class: 1 | | | | | | |--- cast_total_facebook_likes > 23119.00 | | | | | | | |--- class: 1 | | | | | |--- actor_3_facebook_likes > 832.50 | | | | | | |--- budget <= 77500000.00 | | | | | | | |--- actor_3_name <= 628.50 | | | | | | | | |--- actor_2_name <= 392.50 | | | | | | | | | |--- class: 1 | | | | | | | | |--- actor_2_name > 392.50 | | | | | | | | | |--- class: 0 | | | | | | | |--- actor_3_name > 628.50 | | | | | | | | |--- director_name <= 396.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- director_name > 396.00 | | | | | | | | | |--- country_44 <= 0.50 | | | | | | | | | | |--- class: 0 | | | | | | | | | |--- country_44 > 0.50 | | | | | | | | | | |--- class: 1 | | | | | | |--- budget > 77500000.00 | | | | | | | |--- class: 0 | | | | |--- num_user_for_reviews > 677.50 | | | | | |--- gross <= 33394617.00 | | | | | | |--- class: 1 | | | | | |--- gross > 33394617.00 | | | | | | |--- class: 0 | | | |--- duration > 127.50 | | | | |--- main_genre_9 <= 0.50 | | | | | |--- actor_1_name <= 1397.00 | | | | | | |--- class: 1 | | | | | |--- actor_1_name > 1397.00 | | | | | | |--- class: 0 | | | | |--- main_genre_9 > 0.50 | | | | | |--- class: 0 | |--- num_voted_users > 143731.50 | | |--- content_rating_7 <= 0.50 | | | |--- duration <= 97.50 | | | | |--- actor_3_name <= 122.50 | | | | | |--- budget <= 115000000.00 | | | | | | |--- class: 1 | | | | | |--- budget > 115000000.00 | | | | | | |--- class: 0 | | | | |--- actor_3_name > 122.50 | | | | | |--- duration <= 96.50 | | | | | | |--- class: 1 | | | | | |--- duration > 96.50 | | | | | | |--- num_user_for_reviews <= 346.50 | | | | | | | |--- class: 0 | | | | | | |--- num_user_for_reviews > 346.50 | | | | | | | |--- class: 1 | | | |--- duration > 97.50 | | | | |--- class: 1 | | |--- content_rating_7 > 0.50 | | | |--- num_voted_users <= 229755.00 | | | | |--- budget <= 112500000.00 | | | | | |--- num_user_for_reviews <= 917.00 | | | | | | |--- actor_2_name <= 2221.00 | | | | | | | |--- actor_3_facebook_likes <= 2000.00 | | | | | | | | |--- actor_1_name <= 15.00 | | | | | | | | | |--- actor_2_name <= 1548.50 | | | | | | | | | | |--- class: 1 | | | | | | | | | |--- actor_2_name > 1548.50 | | | | | | | | | | |--- class: 0 | | | | | | | | |--- actor_1_name > 15.00 | | | | | | | | | |--- class: 1 | | | | | | | |--- actor_3_facebook_likes > 2000.00 | | | | | | | | |--- num_user_for_reviews <= 377.00 | | | | | | | | | |--- class: 0 | | | | | | | | |--- num_user_for_reviews > 377.00 | | | | | | | | | |--- class: 1 | | | | | | |--- actor_2_name > 2221.00 | | | | | | | |--- class: 0 | | | | | |--- num_user_for_reviews > 917.00 | | | | | | |--- gross <= 175348128.00 | | | | | | | |--- class: 1 | | | | | | |--- gross > 175348128.00 | | | | | | | |--- class: 0 | | | | |--- budget > 112500000.00 | | | | | |--- duration <= 125.50 | | | | | | |--- class: 0 | | | | | |--- duration > 125.50 | | | | | | |--- num_user_for_reviews <= 1212.50 | | | | | | | |--- gross <= 76709692.00 | | | | | | | | |--- class: 0 | | | | | | | |--- gross > 76709692.00 | | | | | | | | |--- class: 1 | | | | | | |--- num_user_for_reviews > 1212.50 | | | | | | | |--- class: 0 | | | |--- num_voted_users > 229755.00 | | | | |--- actor_3_name <= 156.00 | | | | | |--- title_year <= 2005.50 | | | | | | |--- class: 1 | | | | | |--- title_year > 2005.50 | | | | | | |--- class: 0 | | | | |--- actor_3_name > 156.00 | | | | | |--- cast_total_facebook_likes <= 3588.00 | | | | | | |--- budget <= 197500000.00 | | | | | | | |--- class: 1 | | | | | | |--- budget > 197500000.00 | | | | | | | |--- country_43 <= 0.50 | | | | | | | | |--- class: 0 | | | | | | | |--- country_43 > 0.50 | | | | | | | | |--- class: 1 | | | | | |--- cast_total_facebook_likes > 3588.00 | | | | | | |--- class: 1
fig = plt.figure(figsize=(25,20))
tree.plot_tree(dt,
feature_names = X.columns,
class_names=['bad','good'],
filled = True);
# left arrow means "True"
# right arrow means "False"
Simpler decision tree with less variables
# developing a very complex ("full-grown") tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
dt_simple = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)
dt_simple.fit(X_train, y_train)
# max_depth : The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
# min_samples_leaf : The minimum number of samples required to be at a leaf node
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)
# Find out the performance of this model & interpret the results
print(metrics.accuracy_score(y_test, dt_simple.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, dt_simple.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, dt_simple.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.roc_auc_score(y_test, dt_simple.predict(X_test)))
0.7587719298245614 -------------------------------------------------------- [[132 198] [ 77 733]] -------------------------------------------------------- precision recall f1-score support 0 0.63 0.40 0.49 330 1 0.79 0.90 0.84 810 accuracy 0.76 1140 macro avg 0.71 0.65 0.67 1140 weighted avg 0.74 0.76 0.74 1140 -------------------------------------------------------- 0.652469135802469
fig = plt.figure(figsize=(25,20))
tree.plot_tree(dt_simple,
feature_names = X.columns,
class_names=['bad','good'],
filled = True);
# left arrow means "True"
# right arrow means "False"
Decision Tree interpretation
All those 202 movies that have num_voted_users = 86645.5, duration <= 108.5, and budget is not <= 15550000 (meaning greater than 15550000) are predicted as bad.
10 fold cross validation: The basic idea is that, rather than testing the model quality only once, cross validation (or 10-fold CV) tests the model 10 times with 10 different testing datasets.
# evaluate the decision tree model using 10-fold cross-validation
# initialize decision tree algorithm (without fitting)
scores = cross_val_score(DecisionTreeClassifier(), X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())
[0.73684211 0.69473684 0.7 0.71842105 0.75526316 0.75789474 0.73947368 0.71578947 0.72368421 0.71052632] 0.7252631578947368
# https://scikit-learn.org/stable/modules/cross_validation.html
#The mean score and the 95% confidence interval of the score estimate are hence given by:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Accuracy: 0.73 (+/- 0.04)
Random Forest (Ensemble model)
Building multiple decision trees (ensembled decision trees) with the purpose to improve the model accuracy RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0) n_estimators = the number of decision trees in the forest Combining different opionions is likely to lead to high accuracy.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=20) #building 20 decision trees
clf=clf.fit(X_train, y_train)
clf.score(X_test, y_test)
0.8070175438596491
# generate evaluation metrics
print(metrics.accuracy_score(y_test, clf.predict(X_test))) #overall accuracy
print(metrics.confusion_matrix(y_test, clf.predict(X_test)))
print(metrics.classification_report(y_test, clf.predict(X_test)))
0.8070175438596491 [[187 143] [ 77 733]] precision recall f1-score support 0 0.71 0.57 0.63 330 1 0.84 0.90 0.87 810 accuracy 0.81 1140 macro avg 0.77 0.74 0.75 1140 weighted avg 0.80 0.81 0.80 1140
The RandomForestClassifier has an accuracy of 80.7%
# important variables in descending order
pd.DataFrame(clf.feature_importances_, index = X.columns,
columns=['importance']).sort_values('importance', ascending=False)
importance | |
---|---|
num_voted_users | 0.090071 |
num_critic_for_reviews | 0.083409 |
duration | 0.075814 |
num_user_for_reviews | 0.061850 |
gross | 0.061517 |
... | ... |
language_29 | 0.000000 |
country_10 | 0.000000 |
language_28 | 0.000000 |
country_45 | 0.000000 |
country_42 | 0.000000 |
123 rows × 1 columns
num_voted_users and num_critic_for_reviews appear to be two most important predictors
#Predict class probabilities for X
clf.predict_proba(X_test)
# 1st column: the probability of bad movie
# 2nd column: the probability of good movie
array([[0.1 , 0.9 ], [0.05, 0.95], [0.45, 0.55], ..., [0.75, 0.25], [0.55, 0.45], [0.15, 0.85]])
# calculate the fpr and tpr for all thresholds of the classification
preds = dt.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
K-Nearest Neighbor (KNN)
# evaluate the model by splitting into train and test sets & develop knn model (name it as knn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
KNeighborsClassifier()
#Model evaluation
# http://scikit-learn.org/stable/modules/model_evaluation.html
print(metrics.accuracy_score(y_test, knn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.confusion_matrix(y_test, knn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.classification_report(y_test, knn.predict(X_test)))
print("--------------------------------------------------------")
print(metrics.roc_auc_score(y_test, knn.predict(X_test)))
0.6526315789473685 -------------------------------------------------------- [[ 90 240] [156 654]] -------------------------------------------------------- precision recall f1-score support 0 0.37 0.27 0.31 330 1 0.73 0.81 0.77 810 accuracy 0.65 1140 macro avg 0.55 0.54 0.54 1140 weighted avg 0.63 0.65 0.64 1140 -------------------------------------------------------- 0.5400673400673401
When using KNN, I only have 65.26% accuracy
Appendix 1: 10 fold cross validation
# evaluate the knn model using 10-fold cross-validation
scores = cross_val_score(KNeighborsClassifier(), X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())
[0.66578947 0.52105263 0.57368421 0.58157895 0.59210526 0.49736842 0.47368421 0.44210526 0.53421053 0.70263158] 0.5584210526315789
Appendix 2: Search for the optimal k value (GridSearch)
#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)
#fit model to training data
knn_gs.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=KNeighborsClassifier(), param_grid={'n_neighbors': array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])})
#save best model
knn_best = knn_gs.best_estimator_
#check best n_neigbors value
print(knn_gs.best_score_)
print(knn_gs.best_params_)
print(knn_gs.best_estimator_)
0.6860902255639098 {'n_neighbors': 22} KNeighborsClassifier(n_neighbors=22)
Appendix 3: Model Evaluation with ROC
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
# https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python
preds = knn.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Logistic Regression
# correlation analysis
dfCM.corr()
director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | imdb_quality | color_1 | language_1 | language_2 | language_3 | language_4 | language_5 | language_6 | language_7 | language_8 | language_9 | language_10 | language_11 | language_12 | language_13 | language_14 | language_15 | language_16 | language_17 | language_18 | language_19 | language_20 | language_21 | language_22 | language_23 | language_24 | language_25 | language_26 | language_27 | language_28 | language_29 | language_30 | language_31 | language_32 | language_33 | country_1 | country_2 | country_3 | country_4 | country_5 | country_6 | country_7 | country_8 | country_9 | country_10 | country_11 | country_12 | country_13 | country_14 | country_15 | country_16 | country_17 | country_18 | country_19 | country_20 | country_21 | country_22 | country_23 | country_24 | country_25 | country_26 | country_27 | country_28 | country_29 | country_30 | country_31 | country_32 | country_33 | country_34 | country_35 | country_36 | country_37 | country_38 | country_39 | country_40 | country_41 | country_42 | country_43 | country_44 | country_45 | content_rating_1 | content_rating_2 | content_rating_3 | content_rating_4 | content_rating_5 | content_rating_6 | content_rating_7 | content_rating_8 | content_rating_9 | content_rating_10 | content_rating_11 | main_genre_1 | main_genre_2 | main_genre_3 | main_genre_4 | main_genre_5 | main_genre_6 | main_genre_7 | main_genre_8 | main_genre_9 | main_genre_10 | main_genre_11 | main_genre_12 | main_genre_13 | main_genre_14 | main_genre_15 | main_genre_16 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
director_name | 1.000000 | -0.001645 | 0.033452 | 0.050708 | -0.042588 | 0.019431 | -0.028597 | -0.001541 | 0.022334 | -0.009080 | -0.038909 | 0.000361 | 0.015093 | -0.007753 | -0.051667 | -0.042471 | 0.016045 | -0.008192 | 0.015587 | -0.019378 | 0.008864 | 0.005870 | -0.026188 | 0.023519 | -0.005001 | 0.032066 | 0.016312 | 0.014454 | 0.000593 | -0.003829 | -0.022857 | 0.034792 | -0.027742 | 0.022510 | 0.001801 | -0.009191 | 0.023753 | -0.022984 | 0.018823 | -0.025853 | 0.010925 | 0.005870 | 0.018823 | 0.017065 | 0.009741 | -0.017329 | -0.006528 | -0.019419 | 0.023151 | -0.036348 | 0.009436 | -0.009850 | -0.011705 | -0.006296 | -0.009752 | -0.004242 | 0.002224 | -0.006528 | -0.035021 | 0.010816 | 0.012372 | -0.000835 | -0.028493 | 0.016118 | -0.028239 | 0.010463 | 0.014201 | 0.040556 | -0.006010 | 0.013051 | 0.017232 | -0.019223 | 0.022510 | -0.012226 | -0.018894 | 0.012809 | -0.027742 | 0.012585 | 0.003462 | -0.037636 | 0.014454 | 0.012151 | 0.026548 | 0.009741 | 0.018953 | 0.003233 | 0.009254 | 0.015731 | -0.024704 | 0.033626 | 0.020939 | -0.027160 | -0.008004 | -0.024934 | 0.000869 | 0.032030 | -0.033709 | 0.026373 | -0.001404 | -0.010013 | 0.001167 | 0.019373 | -0.014230 | 0.004474 | -0.008231 | 0.002576 | -0.000590 | -0.008654 | 0.023684 | 0.004067 | 0.023856 | -0.009812 | -0.026830 | -0.034223 | 0.016883 | 0.009906 | 0.014849 | -0.009780 | 0.014456 | 0.003054 | 0.003807 | -0.011184 | 0.001827 | -0.009536 | 0.003929 |
num_critic_for_reviews | -0.001645 | 1.000000 | 0.237456 | 0.177850 | 0.247289 | -0.014900 | 0.168864 | 0.469265 | 0.001681 | 0.596951 | 0.237330 | -0.008414 | 0.568929 | 0.106622 | 0.403101 | 0.253847 | 0.350114 | 0.702770 | 0.262172 | -0.000504 | -0.007305 | 0.031630 | -0.007174 | -0.001045 | -0.008616 | 0.002485 | -0.002174 | -0.007433 | 0.023230 | -0.010976 | -0.021196 | -0.005342 | -0.004399 | -0.012414 | -0.012025 | 0.037043 | -0.006500 | 0.003269 | -0.017662 | 0.016681 | 0.003294 | 0.015505 | -0.000619 | -0.006519 | 0.016597 | -0.002285 | -0.012648 | 0.008951 | 0.008820 | -0.027013 | -0.018259 | -0.019628 | -0.001668 | -0.004556 | -0.012811 | -0.005938 | 0.000298 | -0.012648 | -0.029976 | -0.005863 | -0.000598 | -0.001275 | -0.006978 | 0.014588 | 0.005280 | -0.002999 | -0.011894 | -0.015429 | 0.012884 | -0.000416 | -0.015061 | -0.013598 | -0.012414 | 0.041463 | -0.012649 | 0.023502 | -0.004399 | -0.013298 | -0.005184 | -0.042388 | -0.007433 | -0.013467 | 0.032535 | 0.016597 | -0.003897 | -0.016613 | -0.017007 | -0.018318 | 0.004779 | 0.011949 | 0.015583 | 0.017406 | 0.013959 | 0.015070 | -0.022289 | 0.046482 | -0.013327 | -0.009010 | -0.059284 | -0.005863 | -0.005234 | -0.000126 | -0.031399 | -0.107759 | 0.108216 | -0.016972 | 0.012399 | -0.055599 | 0.002355 | 0.048493 | 0.002523 | 0.032020 | -0.219093 | 0.010276 | -0.067674 | -0.020865 | -0.016896 | 0.027029 | 0.071957 | -0.015710 | 0.042503 | 0.032548 | 0.012245 | -0.026372 | -0.020606 |
duration | 0.033452 | 0.237456 | 1.000000 | 0.181601 | 0.123003 | 0.023126 | 0.085598 | 0.249981 | 0.010341 | 0.344836 | 0.120807 | 0.033452 | 0.357494 | 0.069450 | -0.131890 | 0.128665 | 0.367463 | 0.219787 | 0.269486 | -0.050758 | -0.010783 | 0.007157 | 0.012181 | -0.013893 | 0.002134 | -0.026970 | -0.004594 | 0.016545 | -0.053022 | 0.015769 | -0.009256 | 0.085300 | -0.020326 | 0.052939 | 0.017204 | -0.011191 | 0.016237 | 0.033339 | 0.001417 | 0.024040 | 0.021639 | 0.020792 | 0.011463 | -0.005760 | -0.020498 | -0.011636 | 0.009912 | 0.002134 | -0.004324 | -0.008761 | 0.079123 | 0.017922 | -0.011501 | 0.007842 | -0.013653 | -0.022017 | -0.000736 | 0.009912 | -0.044066 | 0.012181 | 0.043191 | -0.006477 | 0.008671 | -0.002930 | -0.012218 | 0.006290 | 0.002134 | 0.007256 | 0.001417 | -0.018407 | 0.011646 | -0.007912 | 0.052939 | -0.005760 | -0.003986 | -0.021494 | -0.020326 | 0.054318 | 0.023688 | 0.012891 | 0.016545 | -0.004324 | 0.081400 | -0.020498 | 0.005005 | -0.000019 | -0.021547 | -0.006477 | -0.009669 | 0.002869 | -0.000861 | 0.019755 | -0.015890 | 0.002511 | 0.063497 | 0.038983 | -0.048401 | 0.131305 | -0.080990 | 0.007157 | 0.016213 | -0.012358 | -0.022359 | -0.097385 | 0.066474 | -0.008735 | 0.035778 | -0.010818 | -0.043681 | -0.029551 | -0.107189 | 0.157948 | -0.227021 | 0.085801 | -0.035861 | 0.139705 | -0.006249 | -0.032001 | -0.109319 | -0.005101 | 0.000643 | -0.002889 | -0.003578 | -0.022864 | 0.002040 |
director_facebook_likes | 0.050708 | 0.177850 | 0.181601 | 1.000000 | 0.121721 | -0.012826 | 0.091510 | 0.141708 | 0.015536 | 0.303774 | 0.121362 | 0.002219 | 0.220750 | 0.018888 | -0.047833 | 0.119176 | 0.192332 | 0.162192 | 0.120775 | -0.059483 | -0.004230 | -0.004230 | 0.054345 | -0.010784 | -0.003996 | -0.005115 | -0.004473 | -0.005069 | 0.024028 | -0.003533 | -0.021893 | -0.010634 | -0.005693 | -0.007577 | -0.003991 | -0.004522 | -0.008604 | 0.054339 | -0.003996 | -0.007036 | -0.011096 | -0.004230 | -0.003996 | -0.002473 | -0.007984 | -0.004961 | -0.007429 | -0.003362 | -0.002446 | -0.018644 | -0.007114 | -0.004204 | -0.003426 | -0.006492 | -0.003778 | -0.019115 | -0.003735 | -0.007429 | -0.027121 | -0.004038 | -0.008785 | -0.004161 | -0.006763 | 0.004665 | -0.001078 | -0.021426 | -0.003101 | -0.022429 | -0.002436 | -0.014835 | -0.005757 | -0.004076 | -0.007577 | -0.002430 | -0.006122 | -0.008343 | -0.005693 | -0.008280 | 0.017266 | -0.012488 | -0.005069 | -0.004108 | -0.013282 | -0.007984 | -0.004129 | -0.003969 | -0.004230 | -0.004182 | -0.005128 | -0.006163 | -0.004736 | -0.008857 | 0.010760 | -0.005452 | -0.008269 | -0.048768 | 0.076403 | -0.002904 | -0.031602 | -0.003794 | -0.005441 | -0.007780 | -0.017150 | 0.000997 | 0.006050 | -0.006785 | 0.014096 | -0.018133 | -0.011560 | 0.001275 | -0.003687 | 0.067200 | -0.057268 | 0.049582 | -0.020493 | 0.004912 | 0.036038 | -0.009983 | -0.042278 | -0.005531 | 0.048765 | -0.003948 | -0.010603 | -0.005983 | 0.041893 |
actor_3_facebook_likes | -0.042588 | 0.247289 | 0.123003 | 0.121721 | 1.000000 | 0.019008 | 0.252765 | 0.282772 | -0.004719 | 0.259370 | 0.485410 | -0.031515 | 0.202895 | 0.039158 | 0.113030 | 0.550690 | 0.064969 | 0.259299 | 0.036692 | 0.020773 | -0.006618 | -0.005628 | -0.005601 | -0.014460 | -0.006583 | -0.010094 | -0.008362 | -0.010828 | 0.072820 | -0.004497 | -0.033400 | -0.019316 | -0.009193 | -0.006622 | -0.006618 | -0.008988 | -0.015309 | -0.017118 | -0.001947 | -0.012273 | -0.020210 | -0.006451 | -0.006539 | -0.006618 | -0.012987 | -0.008191 | -0.014496 | -0.006381 | -0.006478 | -0.028048 | -0.009476 | -0.006574 | -0.006364 | -0.011162 | -0.005698 | -0.022407 | -0.002052 | -0.014496 | -0.032884 | -0.003121 | -0.016841 | -0.006565 | -0.007371 | -0.011046 | -0.006302 | -0.011499 | -0.006460 | -0.013775 | -0.004856 | -0.016866 | -0.007191 | -0.006057 | -0.006622 | -0.006092 | -0.008984 | -0.010939 | -0.009193 | -0.016310 | -0.017361 | -0.016127 | -0.010828 | -0.001438 | -0.015414 | -0.012987 | -0.002578 | -0.006443 | -0.006618 | -0.004856 | -0.006943 | -0.010838 | 0.039749 | -0.012644 | -0.020638 | -0.008982 | -0.010181 | -0.030564 | 0.077656 | -0.006460 | -0.029368 | -0.005084 | -0.005375 | -0.009197 | -0.033703 | 0.007164 | 0.094606 | -0.009218 | -0.069808 | -0.026365 | -0.014554 | 0.067475 | 0.008893 | 0.003129 | -0.024819 | -0.028171 | -0.033059 | -0.020658 | -0.004734 | -0.003708 | -0.044258 | -0.003770 | -0.017337 | -0.001421 | -0.005808 | -0.007445 | -0.010043 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
main_genre_12 | 0.003807 | 0.042503 | 0.000643 | 0.048765 | -0.017337 | -0.018693 | 0.008631 | -0.003230 | -0.008621 | 0.040739 | -0.002461 | 0.016764 | 0.046729 | -0.004544 | 0.015211 | -0.025849 | 0.011275 | 0.010223 | 0.016946 | -0.004054 | -0.001293 | -0.001293 | -0.001293 | -0.003425 | -0.001293 | -0.002241 | -0.001829 | -0.002241 | 0.016715 | -0.001293 | -0.007575 | -0.004296 | -0.001829 | -0.002894 | -0.001293 | -0.001829 | -0.003425 | -0.004095 | -0.001293 | -0.002894 | -0.004848 | -0.001293 | -0.001293 | -0.001293 | -0.002588 | -0.002241 | -0.002894 | -0.001293 | -0.001293 | -0.006356 | -0.002241 | -0.001293 | -0.001293 | -0.002241 | -0.001293 | 0.024326 | -0.001293 | -0.002894 | 0.015953 | -0.001293 | -0.004671 | -0.001293 | -0.002241 | -0.003884 | -0.001293 | 0.007313 | -0.001293 | 0.058229 | -0.001293 | -0.004671 | -0.001829 | -0.001293 | -0.002894 | -0.001293 | -0.002588 | -0.003425 | -0.001829 | -0.004296 | -0.005019 | -0.004095 | -0.002241 | -0.001293 | -0.004296 | -0.002588 | -0.001293 | -0.001293 | -0.001293 | -0.001293 | -0.001829 | -0.002241 | -0.002241 | -0.003662 | 0.037701 | -0.001829 | -0.002588 | -0.024010 | -0.017115 | -0.001293 | -0.012346 | -0.001293 | -0.001829 | -0.003170 | -0.008223 | -0.033594 | 0.004941 | -0.002241 | 0.020848 | -0.006356 | 0.060745 | -0.026184 | -0.008728 | -0.019087 | -0.048032 | -0.021382 | -0.007462 | -0.037352 | -0.002241 | -0.007905 | -0.016715 | -0.001829 | 1.000000 | -0.001293 | -0.003425 | -0.001829 | -0.002241 |
main_genre_13 | -0.011184 | 0.032548 | -0.002889 | -0.003948 | -0.001421 | 0.009345 | 0.005611 | 0.002523 | 0.007808 | 0.010356 | 0.002778 | -0.001232 | 0.003039 | 0.000340 | 0.013012 | -0.003674 | 0.009858 | 0.020320 | 0.010781 | 0.002980 | -0.000263 | -0.000263 | -0.000263 | -0.000697 | -0.000263 | -0.000456 | -0.000372 | -0.000456 | 0.003402 | -0.000263 | -0.001542 | -0.000874 | -0.000372 | -0.000589 | -0.000263 | -0.000372 | -0.000697 | -0.000833 | -0.000263 | -0.000589 | -0.000987 | -0.000263 | -0.000263 | -0.000263 | -0.000527 | -0.000456 | -0.000589 | -0.000263 | -0.000263 | -0.001293 | -0.000456 | -0.000263 | -0.000263 | -0.000456 | -0.000263 | -0.001673 | -0.000263 | -0.000589 | -0.002089 | -0.000263 | -0.000951 | -0.000263 | -0.000456 | -0.000791 | -0.000263 | -0.002695 | -0.000263 | -0.002364 | -0.000263 | -0.000951 | -0.000372 | -0.000263 | -0.000589 | -0.000263 | -0.000527 | -0.000697 | -0.000372 | -0.000874 | -0.001021 | -0.000833 | -0.000456 | -0.000263 | -0.000874 | -0.000527 | -0.000263 | -0.000263 | -0.000263 | -0.000263 | -0.000372 | -0.000456 | -0.000456 | -0.000745 | -0.001238 | -0.000372 | -0.000527 | -0.004886 | 0.008239 | -0.000263 | -0.002513 | -0.000263 | -0.000372 | -0.000645 | -0.001673 | -0.006837 | 0.022329 | -0.000456 | -0.014769 | -0.001293 | -0.000833 | -0.005329 | -0.001776 | -0.003884 | -0.009775 | -0.004351 | -0.001519 | -0.007601 | -0.000456 | -0.001609 | -0.003402 | -0.000372 | -0.001293 | 1.000000 | -0.000697 | -0.000372 | -0.000456 |
main_genre_14 | 0.001827 | 0.012245 | -0.003578 | -0.010603 | -0.005808 | -0.003467 | -0.007797 | -0.013495 | 0.008509 | 0.018590 | -0.009649 | -0.012436 | 0.028542 | -0.003397 | -0.007765 | -0.013013 | 0.006900 | -0.002701 | 0.001920 | 0.007890 | -0.000697 | -0.000697 | -0.000697 | -0.001846 | -0.000697 | -0.001208 | -0.000986 | -0.001208 | 0.009007 | -0.000697 | -0.004082 | -0.002315 | -0.000986 | -0.001559 | -0.000697 | -0.000986 | -0.001846 | -0.002207 | -0.000697 | -0.001559 | -0.002612 | -0.000697 | -0.000697 | -0.000697 | -0.001395 | -0.001208 | -0.001559 | -0.000697 | -0.000697 | -0.003425 | -0.001208 | -0.000697 | -0.000697 | -0.001208 | -0.000697 | -0.004431 | -0.000697 | -0.001559 | 0.042910 | -0.000697 | -0.002517 | -0.000697 | -0.001208 | -0.002093 | -0.000697 | -0.007135 | -0.000697 | -0.006260 | -0.000697 | -0.002517 | -0.000986 | -0.000697 | -0.001559 | -0.000697 | -0.001395 | -0.001846 | -0.000986 | -0.002315 | -0.002704 | -0.002207 | -0.001208 | -0.000697 | -0.002315 | -0.001395 | -0.000697 | -0.000697 | -0.000697 | -0.000697 | -0.000986 | -0.001208 | -0.001208 | -0.001973 | -0.003278 | -0.000986 | -0.001395 | -0.012938 | 0.006613 | -0.000697 | -0.006653 | -0.000697 | -0.000986 | -0.001708 | -0.004431 | -0.000952 | -0.005403 | -0.001208 | 0.010207 | -0.003425 | -0.002207 | -0.014109 | -0.004703 | -0.010285 | -0.025882 | -0.011522 | -0.004021 | -0.020127 | -0.001208 | -0.004260 | -0.009007 | -0.000986 | -0.003425 | -0.000697 | 1.000000 | -0.000986 | -0.001208 |
main_genre_15 | -0.009536 | -0.026372 | -0.022864 | -0.005983 | -0.007445 | 0.003280 | -0.010837 | -0.017028 | 0.011290 | -0.015708 | -0.012478 | 0.009487 | -0.017398 | -0.004630 | 0.014922 | -0.008415 | -0.019783 | -0.009690 | -0.009642 | 0.004215 | -0.000372 | -0.000372 | -0.000372 | -0.000986 | -0.000372 | -0.000645 | -0.000527 | -0.000645 | 0.004811 | -0.000372 | -0.002180 | -0.001236 | -0.000527 | -0.000833 | -0.000372 | -0.000527 | -0.000986 | -0.001179 | -0.000372 | -0.000833 | -0.001395 | -0.000372 | -0.000372 | -0.000372 | -0.000745 | -0.000645 | -0.000833 | -0.000372 | -0.000372 | -0.001829 | -0.000645 | -0.000372 | -0.000372 | -0.000645 | -0.000372 | -0.002367 | -0.000372 | -0.000833 | -0.002955 | -0.000372 | -0.001345 | -0.000372 | -0.000645 | -0.001118 | -0.000372 | -0.003811 | -0.000372 | -0.003344 | -0.000372 | -0.001345 | -0.000527 | -0.000372 | -0.000833 | -0.000372 | -0.000745 | -0.000986 | -0.000527 | -0.001236 | -0.001445 | -0.001179 | -0.000645 | -0.000372 | -0.001236 | -0.000745 | -0.000372 | -0.000372 | 0.707014 | -0.000372 | -0.000527 | -0.000645 | -0.000645 | -0.001054 | -0.001751 | -0.000527 | -0.000745 | -0.006911 | -0.016769 | -0.000372 | -0.003554 | -0.000372 | -0.000527 | -0.000913 | 0.110059 | -0.009670 | -0.016674 | -0.000645 | 0.002159 | -0.001829 | -0.001179 | -0.007537 | -0.002512 | -0.005494 | -0.013826 | -0.006155 | -0.002148 | -0.010751 | -0.000645 | -0.002275 | -0.004811 | -0.000527 | -0.001829 | -0.000372 | -0.000986 | 1.000000 | -0.000645 |
main_genre_16 | 0.003929 | -0.020606 | 0.002040 | 0.041893 | -0.010043 | -0.005746 | 0.005502 | -0.014440 | -0.020824 | 0.014403 | -0.000341 | -0.004736 | -0.001465 | -0.005352 | -0.048548 | -0.011332 | 0.008195 | -0.003326 | -0.001647 | 0.005163 | -0.000456 | -0.000456 | -0.000456 | -0.001208 | -0.000456 | -0.000790 | -0.000645 | -0.000790 | -0.040761 | -0.000456 | -0.002671 | -0.001515 | -0.000645 | -0.001020 | -0.000456 | -0.000645 | 0.217298 | -0.001444 | -0.000456 | -0.001020 | -0.001709 | -0.000456 | -0.000456 | -0.000456 | -0.000912 | -0.000790 | -0.001020 | -0.000456 | -0.000456 | -0.002241 | -0.000790 | -0.000456 | -0.000456 | -0.000790 | -0.000456 | -0.002899 | -0.000456 | -0.001020 | -0.003620 | -0.000456 | -0.001647 | -0.000456 | -0.000790 | -0.001370 | -0.000456 | -0.004668 | -0.000456 | -0.004096 | -0.000456 | -0.001647 | -0.000645 | -0.000456 | -0.001020 | -0.000456 | -0.000912 | -0.001208 | -0.000645 | 0.172884 | -0.001770 | -0.001444 | -0.000790 | -0.000456 | -0.001515 | -0.000912 | -0.000456 | -0.000456 | -0.000456 | -0.000456 | -0.000645 | -0.000790 | -0.000790 | -0.001291 | -0.002145 | -0.000645 | -0.000912 | -0.008465 | -0.008936 | -0.000456 | -0.004353 | -0.000456 | -0.000645 | -0.001118 | -0.002899 | -0.011845 | -0.020424 | -0.000790 | 0.012056 | -0.002241 | -0.001444 | -0.009232 | -0.003077 | -0.006730 | -0.016935 | -0.007539 | -0.002631 | -0.013170 | -0.000790 | -0.002787 | -0.005893 | -0.000645 | -0.002241 | -0.000456 | -0.001208 | -0.000645 | 1.000000 |
125 rows × 125 columns
corr = pd.DataFrame(dfCM.corr()['imdb_quality'].drop("imdb_quality"))
corr.sort_values(['imdb_quality'], ascending = False).head()
imdb_quality | |
---|---|
imdb_score | 0.776134 |
num_voted_users | 0.279504 |
duration | 0.269486 |
num_critic_for_reviews | 0.262172 |
num_user_for_reviews | 0.192536 |
# evaluate the model by splitting into train and test sets and build a logistic regression model
# name it as "lr"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression(solver='lbfgs', max_iter=500)
lr.fit(X_train, y_train)
LogisticRegression(max_iter=500)
# print coefficients
coef = pd.DataFrame(list(zip(X.columns, np.transpose(lr.coef_))),
columns=['X variables', 'coef']).sort_values('coef', ascending=False)
coef['coef'] = coef['coef'].str[0]
coef.head()
X variables | coef | |
---|---|---|
9 | num_voted_users | 1.795222e-05 |
16 | movie_facebook_likes | 1.598530e-06 |
10 | cast_total_facebook_likes | 9.627292e-07 |
6 | actor_1_facebook_likes | 6.798224e-07 |
15 | actor_2_facebook_likes | 2.428489e-07 |
#Model evaluation
print(metrics.accuracy_score(y_test, lr.predict(X_test)))
print(metrics.confusion_matrix(y_test, lr.predict(X_test)))
print(metrics.classification_report(y_test, lr.predict(X_test)))
print(metrics.roc_auc_score(y_test, lr.predict(X_test)))
0.7210526315789474 [[ 57 273] [ 45 765]] precision recall f1-score support 0 0.56 0.17 0.26 330 1 0.74 0.94 0.83 810 accuracy 0.72 1140 macro avg 0.65 0.56 0.55 1140 weighted avg 0.69 0.72 0.66 1140 0.5585858585858585
This Logistic Regression model has 72.1% accuracy
# generate class probabilities
probs = lr.predict_proba(X_test)
print(probs)
#1st column: probability of being a bad movie
#2nd column: probabiility of being a good movie
[[0.08744289 0.91255711] [0.17190518 0.82809482] [0.4086413 0.5913587 ] ... [0.45069471 0.54930529] [0.23998017 0.76001983] [0.4913827 0.5086173 ]]
Interpretation:
Comparing Algorithms
#https://stackoverflow.com/questions/42894871/how-to-plot-multiple-roc-curves-in-one-plot-with-legend-and-auc-scores-in-python
plt.figure()
# Add the models to the list that you want to view on the ROC plot
models = [
{
'label': 'Decision Tree',
'model': DecisionTreeClassifier(),
},
{
'label': 'K-nearest neighbors',
'model': KNeighborsClassifier(),
},
{
'label': 'Logistic Regression',
'model': LogisticRegression(solver='lbfgs', max_iter=500),
},
{
'label': 'Random Forest',
'model': RandomForestClassifier(n_estimators=100),
}
]
# Below for loop iterates through your models list
for m in models:
model = m['model'] # select the model
model.fit(X_train, y_train) # train the model
#y_pred=model.predict(X_test) # predict the test data
# Compute False postive rate, and True positive rate
fpr, tpr, thresholds = metrics.roc_curve(y_test, model.predict_proba(X_test)[:,1])
# Calculate Area under the curve to display on the plot
auc = metrics.roc_auc_score(y_test,model.predict(X_test))
# Now, plot the computed values
plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))
# Custom settings for the plot
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show() # Display
Interpretation and conclusion on Classification
Objective: Analyze the data using K-means algorithm. I determine the optimal K value for Kmeans. report the movie “profiles” based on clustering analysis.
#create new dataframe for clustering
dff = dfRM.copy()
dff.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 |
2 | 1 | 1436 | 602.0 | 148.0 | 0.0 | 161.0 | 1844 | 11000.0 | 200074175.0 | 261 | 275868 | 11700 | 2347 | 994.0 | 9 | 43 | 7 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 |
3 | 1 | 254 | 813.0 | 164.0 | 22000.0 | 23000.0 | 396 | 27000.0 | 448130642.0 | 1378 | 1144337 | 106759 | 1295 | 2701.0 | 9 | 44 | 7 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 |
4 | 1 | 64 | 462.0 | 132.0 | 475.0 | 530.0 | 1887 | 640.0 | 73058679.0 | 327 | 212204 | 1873 | 2015 | 738.0 | 9 | 44 | 7 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 |
dff.describe()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3.800000e+03 | 3800.000000 | 3.800000e+03 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3.800000e+03 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 | 3800.000000 |
mean | 0.967368 | 880.653421 | 164.724971 | 110.026053 | 794.400526 | 755.126842 | 1117.593947 | 7648.966842 | 5.164025e+07 | 733.248421 | 1.037115e+05 | 11354.007105 | 1331.273158 | 330.334474 | 9.353421 | 40.514211 | 7.584737 | 4.549443e+07 | 2003.072368 | 1980.765526 | 6.459184 | 9202.434474 | 3.642895 |
std | 0.177694 | 498.570965 | 123.774159 | 22.611570 | 3047.211325 | 1851.494817 | 653.567933 | 15473.939652 | 6.955357e+07 | 419.280739 | 1.509243e+05 | 18999.250279 | 767.424657 | 409.373833 | 2.437659 | 9.827296 | 1.689875 | 2.247288e+08 | 9.885966 | 4495.328345 | 1.054768 | 21399.154854 | 3.030615 |
min | 0.000000 | 0.000000 | 1.000000 | 37.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.620000e+02 | 0.000000 | 2.200000e+01 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 2.180000e+02 | 1927.000000 | 0.000000 | 1.600000 | 0.000000 | 0.000000 |
25% | 1.000000 | 450.750000 | 74.000000 | 95.000000 | 10.000000 | 188.000000 | 552.750000 | 730.750000 | 7.421793e+06 | 367.000000 | 1.813225e+04 | 1871.000000 | 657.750000 | 105.000000 | 9.000000 | 44.000000 | 7.000000 | 1.000000e+07 | 1999.000000 | 371.750000 | 5.900000 | 0.000000 | 0.000000 |
50% | 1.000000 | 893.500000 | 136.000000 | 106.000000 | 59.000000 | 432.000000 | 1103.500000 | 1000.000000 | 2.877222e+07 | 732.000000 | 5.238000e+04 | 3955.000000 | 1325.500000 | 205.000000 | 9.000000 | 44.000000 | 7.000000 | 2.500000e+07 | 2005.000000 | 670.000000 | 6.600000 | 217.000000 | 4.000000 |
75% | 1.000000 | 1332.250000 | 222.000000 | 120.000000 | 230.000000 | 689.250000 | 1685.250000 | 12000.000000 | 6.628274e+07 | 1086.250000 | 1.253340e+05 | 16095.000000 | 2006.000000 | 393.250000 | 9.000000 | 44.000000 | 9.000000 | 5.000000e+07 | 2010.000000 | 973.000000 | 7.200000 | 11000.000000 | 6.000000 |
max | 1.000000 | 1706.000000 | 813.000000 | 330.000000 | 23000.000000 | 23000.000000 | 2241.000000 | 640000.000000 | 7.605058e+08 | 1468.000000 | 1.689764e+06 | 656730.000000 | 2645.000000 | 5060.000000 | 33.000000 | 45.000000 | 11.000000 | 1.221550e+10 | 2016.000000 | 137000.000000 | 9.300000 | 349000.000000 | 16.000000 |
# variance test
dff.var()
color 3.157507e-02 director_name 2.485730e+05 num_critic_for_reviews 1.532004e+04 duration 5.112831e+02 director_facebook_likes 9.285497e+06 actor_3_facebook_likes 3.428033e+06 actor_2_name 4.271510e+05 actor_1_facebook_likes 2.394428e+08 gross 4.837700e+15 actor_1_name 1.757963e+05 num_voted_users 2.277814e+10 cast_total_facebook_likes 3.609715e+08 actor_3_name 5.889406e+05 num_user_for_reviews 1.675869e+05 language 5.942184e+00 country 9.657574e+01 content_rating 2.855676e+00 budget 5.050302e+16 title_year 9.773232e+01 actor_2_facebook_likes 2.020798e+07 imdb_score 1.112535e+00 movie_facebook_likes 4.579238e+08 main_genre 9.184630e+00 dtype: float64
dff_norm = (dff - dff.mean()) / (dff.max() - dff.min())
dff_norm.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.032632 | -0.142235 | 0.687531 | 0.231993 | -0.034539 | 0.004342 | -0.039087 | -0.010389 | 0.932098 | -0.368698 | 0.463084 | -0.009928 | 0.473999 | 0.538380 | -0.01071 | 0.077462 | -0.053158 | 0.015677 | 0.066603 | -0.007626 | 0.187119 | 0.068188 | -0.227681 |
1 | 0.032632 | -0.190887 | 0.169058 | 0.201276 | -0.010061 | 0.010647 | 0.232220 | 0.050548 | 0.338938 | -0.020605 | 0.217494 | 0.056334 | -0.116171 | 0.179416 | -0.01071 | 0.077462 | -0.053158 | 0.020835 | 0.044131 | 0.022038 | 0.083223 | -0.026368 | -0.227681 |
2 | 0.032632 | 0.325526 | 0.538516 | 0.129604 | -0.034539 | -0.025832 | 0.324144 | 0.005236 | 0.195178 | -0.321695 | 0.101883 | 0.000527 | 0.384018 | 0.131185 | -0.01071 | 0.055240 | -0.053158 | 0.016332 | 0.134018 | -0.011590 | 0.044262 | 0.217185 | -0.227681 |
3 | 0.032632 | -0.367323 | 0.798368 | 0.184211 | 0.921983 | 0.967168 | -0.321996 | 0.030236 | 0.521351 | 0.439204 | 0.615849 | 0.145273 | -0.013714 | 0.468604 | -0.01071 | 0.077462 | -0.053158 | 0.016741 | 0.100310 | 0.153425 | 0.265041 | 0.443546 | -0.227681 |
4 | 0.032632 | -0.478695 | 0.366102 | 0.074996 | -0.013887 | -0.009788 | 0.343332 | -0.010952 | 0.028163 | -0.276736 | 0.064207 | -0.014437 | 0.258498 | 0.080582 | -0.01071 | 0.077462 | -0.053158 | 0.017863 | 0.100310 | -0.009845 | 0.018288 | 0.042400 | -0.227681 |
dff_norm.var()
color 0.031575 director_name 0.085407 num_critic_for_reviews 0.023235 duration 0.005956 director_facebook_likes 0.017553 actor_3_facebook_likes 0.006480 actor_2_name 0.085055 actor_1_facebook_likes 0.000585 gross 0.008364 actor_1_name 0.081575 num_voted_users 0.007978 cast_total_facebook_likes 0.000837 actor_3_name 0.084182 num_user_for_reviews 0.006548 language 0.005457 country 0.047692 content_rating 0.023601 budget 0.000338 title_year 0.012338 actor_2_facebook_likes 0.001077 imdb_score 0.018764 movie_facebook_likes 0.003760 main_genre 0.035877 dtype: float64
Clustering analysis (k = 2)
k_means = KMeans(init='k-means++', n_clusters=2, random_state=0)
k_means.fit(dff_norm)
KMeans(n_clusters=2, random_state=0)
# cluster labels
k_means.labels_
array([0, 0, 1, ..., 1, 0, 0])
# find out cluster centers
k_means.cluster_centers_
array([[ 6.94939596e-03, -2.58304102e-01, 1.12999967e-03, -4.30171519e-03, -5.84492018e-03, 2.95938149e-03, -1.61165649e-02, 7.36663132e-04, 5.37842564e-04, -1.59137028e-02, 4.79279330e-04, 1.22021354e-03, -1.93395821e-03, -1.97013193e-03, 7.69428679e-04, 5.78967394e-03, 2.38955407e-03, 1.72928973e-04, 3.12456593e-03, 1.56298982e-03, -3.20438664e-03, 5.54081705e-04, 4.42849575e-03], [-6.72626673e-03, 2.50010547e-01, -1.09371796e-03, 4.16359694e-03, 5.65725314e-03, -2.86436251e-03, 1.55990988e-02, -7.13010561e-04, -5.20573668e-04, 1.54027501e-02, -4.63890765e-04, -1.18103527e-03, 1.87186323e-03, 1.90687549e-03, -7.44724081e-04, -5.60378073e-03, -2.31283095e-03, -1.67376619e-04, -3.02424325e-03, -1.51280578e-03, 3.10150110e-03, -5.36291407e-04, -4.28630686e-03]])
# convert cluster lables to dataframe
dff1 = pd.DataFrame(k_means.labels_, columns = ['cluster'])
dff1.head()
cluster | |
---|---|
0 | 0 |
1 | 0 |
2 | 1 |
3 | 0 |
4 | 0 |
# joining clusters in new dataset
dff2 = dff.join(dff1)
dff2.head()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 638 | 723.0 | 178.0 | 0.0 | 855.0 | 1030 | 1000.0 | 760505847.0 | 192 | 886204 | 4834 | 2585 | 3054.0 | 9 | 44 | 7 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 33000 | 0 | 0 |
1 | 1 | 555 | 302.0 | 169.0 | 563.0 | 1000.0 | 1638 | 40000.0 | 309404152.0 | 703 | 471220 | 48350 | 1024 | 1238.0 | 9 | 44 | 7 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 0 | 0 | 0 |
2 | 1 | 1436 | 602.0 | 148.0 | 0.0 | 161.0 | 1844 | 11000.0 | 200074175.0 | 261 | 275868 | 11700 | 2347 | 994.0 | 9 | 43 | 7 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 85000 | 0 | 1 |
3 | 1 | 254 | 813.0 | 164.0 | 22000.0 | 23000.0 | 396 | 27000.0 | 448130642.0 | 1378 | 1144337 | 106759 | 1295 | 2701.0 | 9 | 44 | 7 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 164000 | 0 | 0 |
4 | 1 | 64 | 462.0 | 132.0 | 475.0 | 530.0 | 1887 | 640.0 | 73058679.0 | 327 | 212204 | 1873 | 2015 | 738.0 | 9 | 44 | 7 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 24000 | 0 | 0 |
# How many observations are there in cluster 1 and cluster 2?
dff2.groupby(['cluster']).count()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cluster | |||||||||||||||||||||||
0 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 | 1871 |
1 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 | 1929 |
Cluster 0 has 1871 observations.
Cluster 1 has 1929 observations.
#The mean values of each cluster in terms of different variables
dff2.groupby(['cluster']).mean()
color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | actor_1_name | num_voted_users | cast_total_facebook_likes | actor_3_name | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | movie_facebook_likes | main_genre | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cluster | |||||||||||||||||||||||
0 | 0.974345 | 440.384821 | 165.672309 | 108.789417 | 659.708712 | 822.840192 | 1082.172100 | 8124.57830 | 5.204926e+07 | 710.248530 | 104505.895243 | 12157.561732 | 1326.888830 | 320.394976 | 9.378407 | 40.778193 | 7.610903 | 4.769759e+07 | 2003.353287 | 2193.219134 | 6.434099 | 9391.644575 | 3.709781 |
1 | 0.960601 | 1307.684292 | 163.806117 | 111.225505 | 925.042509 | 689.449456 | 1151.950752 | 7187.65578 | 5.124354e+07 | 755.556765 | 102940.918611 | 10574.613271 | 1335.525661 | 339.975117 | 9.329186 | 40.258165 | 7.559357 | 4.335751e+07 | 2002.799896 | 1774.699844 | 6.483515 | 9018.913427 | 3.578020 |
Profile of each cluster
movies in cluster 0, on average, is characterized by a lower director_name, director_facebook_likes, actor_2_name, actor_1_name, and high actor_1_facebook_likes, cast_total_facebook_likes, and budget.
movies in cluster 1, on average, are characterized by a high director_name, director_facebook_likes, actor_2_name, actor_1_name, and low cast_total_facebook_likes.
Objective is to develop useful insights from your business intelligence (data visualization, correlation, pivot tables) and models (regression, classification, and clustering).