Name: Patrick Mugisha

FINAL PROJECT

Business understanding

The goal of this project is to predict movie success (the column named “imdb_score”).

Project description & introduction (business problem and context): For this final project, I analyze the IMDB movie dataset using a variety of business intelligence techniques and machine learning models. The goal is to predict movie success, using "imdb_score" as the y variable and potentially 27 other variables as predictors, across a total of 5043 movies released between 1916 and 2016.

Data understanding

Data understanding & transformation: data quality issues and my solutions

Describe data

The dataset contains 5043 movies released between 1916 and 2016, with "imdb_score" as the y variable and 27 other variables available as potential predictors.

Identifying data quality issues

There are 1287 rows with missing values. Dropping all of them would be an aggressive approach, so instead I will fill some of the missing values and drop the remaining rows.
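
A minimal sketch of this check, assuming the raw dataset is loaded with pandas (the file name movie_metadata.csv and the variable name df are illustrative):

```python
import pandas as pd

df = pd.read_csv("movie_metadata.csv")   # illustrative file name

# Nulls per column, largest first
null_counts = df.isnull().sum().sort_values(ascending=False)
print(null_counts[null_counts > 0])

# Number of rows containing at least one null
print("Rows with missing values:", df.isnull().any(axis=1).sum())
```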

Identify data types

The dataset contains a mix of numerical and categorical columns: some columns hold numbers, others hold strings.

Numerical columns: num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_1_facebook_likes, gross, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes

Categorical columns: color, director_name, actor_2_name, genres, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link, language, country, content_rating

Identify value counts for a selected list of columns considered important for predicting a movie's success (imdb_score).
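
A small sketch of this step, continuing with the df loaded above (the column list shown is illustrative):

```python
# Frequency of the most common values in a few candidate predictor columns
for col in ["content_rating", "country", "language", "color", "genres"]:
    print(df[col].value_counts().head(10), "\n")
```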

Data preparation

Handling duplicate rows

After dropping duplicate rows, 4998 of the original 5043 rows remain, meaning there were 45 duplicate rows.
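
A sketch of the duplicate handling, using the same df:

```python
before = len(df)
df = df.drop_duplicates()   # remove exact duplicate rows
print(f"Dropped {before - len(df)} duplicates, {len(df)} rows remain")
```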

Dropping unnecessary columns

Handling null values

Filling some of the null values before dropping the rest
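
A sketch of the fill-then-drop strategy; which columns are filled, and with what statistic, is illustrative rather than the exact choices made in the notebook:

```python
# Fill a few columns where a sensible default exists
df["content_rating"] = df["content_rating"].fillna(df["content_rating"].mode()[0])
df["aspect_ratio"] = df["aspect_ratio"].fillna(df["aspect_ratio"].median())
df["duration"] = df["duration"].fillna(df["duration"].median())

# Drop the remaining rows that still have nulls (e.g. missing gross or budget)
df = df.dropna()
print(len(df), "rows remain after handling nulls")
```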

Business intelligence

Variables that I think are good predictors of a movie's success are: duration, director_name, director_facebook_likes, actor_1_name, actor_1_facebook_likes, actor_2_name, actor_2_facebook_likes, actor_3_name, actor_3_facebook_likes, num_user_for_reviews, num_critic_for_reviews, num_voted_users, cast_total_facebook_likes, movie_facebook_likes, genres, content_rating, gross

What are the top 5 genres in terms of gross?

Adventure is the highest-grossing genre, followed by Comedy.
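
One way to reproduce this ranking is to approximate each movie's genre by the first entry of the pipe-separated genres string (the report later calls this main_genre) and sum gross by genre; the notebook's exact approach may differ. A sketch:

```python
# Take the first genre listed as the movie's main genre, then rank total gross
df["main_genre"] = df["genres"].str.split("|").str[0]
top5 = (df.groupby("main_genre")["gross"]
          .sum()
          .sort_values(ascending=False)
          .head(5))
print(top5)
```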

Classify movies as good, average, or bad according to their imdb_score

The movies seem to be evenly distributed between bad, average, and good.
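
A sketch of the binning; quantile bins give roughly equal thirds like the percentages reported below, though the notebook's exact binning call may differ:

```python
# Split imdb_score into three quantile-based classes
df["imdb_class"] = pd.qcut(df["imdb_score"], q=3, labels=["bad", "average", "good"])
print(df["imdb_class"].value_counts(normalize=True))   # share of bad / average / good
```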

Results from business intelligence

- The director with the most movies is Steven Spielberg, with 26 movies.
- The director whose movies have the highest total gross is Steven Spielberg, with 4.114233e+09.
- The director with the most online likes is Steven Spielberg, with 364000 Facebook likes.
- The most movies a main actor has played in is 49, by Robert De Niro.
- The actor with the highest-grossing movies is Johnny Depp, with 3.688020e+09.
- The movie with the most critic reviews is Skyfall, with 1500.
- The movie with the most votes is The Shawshank Redemption, with 1689764.
- Adventure is the highest-grossing genre, with 7.855534e+10.
- Towering Inferno has the highest imdb_score, with 9.5.
- When using 3 bins for imdb_score, I found 35.0% bad movies, 33.5% average movies, and 33.5% good movies.
- Doona Bae starred in the biggest-budget movie (The Host), with 1.221550e+10.
- The most successful movie a director (James Cameron) and primary actor (CCH Pounder) made together, in terms of gross, is Avatar, with 760505847.0.
- R is the most common rating, with 2098 movies.

Data visualization

Correlation analysis

Variables highly correlated with imdb_score are: movie_facebook_likes, num_user_for_reviews, gross, num_voted_users, director_facebook_likes, duration, and num_critic_for_reviews.

Key variables that are correlated to each other

Below are the Pearson correlations among the highly correlated variables; a high absolute correlation coefficient indicates a strong relationship between two variables, and a low one indicates a weak relationship.
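
A sketch of the computation, using the key variables listed above:

```python
# Pearson correlation matrix of the target and the key numeric predictors
key_vars = ["imdb_score", "movie_facebook_likes", "num_user_for_reviews", "gross",
            "num_voted_users", "director_facebook_likes", "duration",
            "num_critic_for_reviews"]
print(df[key_vars].corr())
```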

Regression

Full model

Comments about Full Model#1: This full model has MSE=0.698 and r-squared=0.368
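
A minimal sketch of the full model, assuming dfRM is the fully numeric regression frame described later in the report (the 70/30 split and random_state are illustrative):

```python
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

X = dfRM.drop(columns=["imdb_score"])
y = dfRM["imdb_score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

full_model = LinearRegression().fit(X_train, y_train)
pred = full_model.predict(X_test)
print("MSE:", mean_squared_error(y_test, pred))
print("r-squared:", r2_score(y_test, pred))
```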

Statsmodels

Building the full model (with all X variables) using statsmodels and interpreting the p-values.
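
A sketch of the statsmodels version, reusing X and y from the sketch above; the summary table reports a p-value for each coefficient, and small p-values suggest the variable contributes to explaining imdb_score:

```python
import statsmodels.api as sm

X_sm = sm.add_constant(X)    # add the intercept term
ols = sm.OLS(y, X_sm).fit()
print(ols.summary())         # coefficients, p-values, r-squared
```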

Full Model#2 (fewer X variables)

Comment on model#2: this model has MSE=0.758 and r-squared=0.313; it is no better than the first model.

Lasso Regression (Regularization)

Comment on the Lasso model: this regularized model has MSE=0.745 and r-squared=0.3245, still not better than the full model.
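
A sketch of the Lasso fit on the same train/test split as the full model; the alpha value here is illustrative, not the one used in the notebook:

```python
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

lasso = Lasso(alpha=0.1).fit(X_train, y_train)
pred = lasso.predict(X_test)
print("MSE:", mean_squared_error(y_test, pred))
print("r-squared:", r2_score(y_test, pred))
```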

Feature Selection model

The two variables that f_regression determined to be the most important are duration and num_voted_users.

Comments on Feature Selection model#1: this model has MSE=0.815 and r-squared=0.262, still not better than the full model.
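
A sketch of the univariate selection with f_regression and k=2, reusing the split from the full-model sketch:

```python
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

selector = SelectKBest(score_func=f_regression, k=2).fit(X_train, y_train)
print("Selected:", list(X_train.columns[selector.get_support()]))

model_k = LinearRegression().fit(selector.transform(X_train), y_train)
pred = model_k.predict(selector.transform(X_test))
print("MSE:", mean_squared_error(y_test, pred))
print("r-squared:", r2_score(y_test, pred))
```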

Feature Selection model#2

Comment on Feature Selection model#2: Feature Selection model#1 is marginally better than model#2; Feature Selection model#2 has MSE=0.815 and r-squared=0.2615.

Recursive Feature Selection (RFE): Another Feature Selection Method

RFE model

Comments on the RFE model: this model has MSE=1.076 and r-squared=0.025, which is VERY bad.
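
A sketch of RFE wrapped around a linear regression; the number of features kept is illustrative:

```python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

rfe = RFE(estimator=LinearRegression(), n_features_to_select=3).fit(X_train, y_train)
print("Kept:", list(X_train.columns[rfe.support_]))

pred = rfe.predict(X_test)    # predicts using only the retained features
print("MSE:", mean_squared_error(y_test, pred))
print("r-squared:", r2_score(y_test, pred))
```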

RandomForestRegressor for Feature Selection

Comments about the regression models: of all the regression models I built, the RandomForestRegressor was the best, with MSE=0.475 and r-squared=0.569.
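
A sketch of the random forest regressor and its feature importances; n_estimators is illustrative:

```python
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

rf_reg = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
pred = rf_reg.predict(X_test)
print("MSE:", mean_squared_error(y_test, pred))
print("r-squared:", r2_score(y_test, pred))

importances = pd.Series(rf_reg.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(5))   # top 5 important variables
```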

**REGRESSION**

- We start by creating a new dataframe (a copy of dfr).
- We remove unnecessary columns.
- We split genres and create a new column (main_genre) so that each movie has one genre type.
- We use a label encoder to assign every genre type a number.
- We use a label encoder on the rest of the categorical variables to make them numerical.
- We make a new dataframe for the regression models (dfRM) and remove genres since we don't need it anymore.

FULL MODEL RESULTS
- Used the dfRM dataframe.
- Full model 1 evaluation: **MSE=0.6975**, **variance or r-squared=0.3676**.
- Statsmodels: **r-squared=0.383**.
- Full model with fewer X variables (13): **MSE=0.758**, **variance or r-squared=0.313**. It is no better than the first model; the fewer X variables used, the lower the r-squared becomes.

LASSO REGRESSION MODEL
- Used the dfRM dataframe.
- **MSE=0.745**, **variance or r-squared=0.325**. Still not better than the full model.

FEATURE SELECTION MODEL
- k=2; the two variables f_regression determined to be the most important are duration and num_voted_users.
- **MSE=0.8147** and **variance or r-squared=0.262**. Still not better than the full model.
- Feature selection model#2 with k=3: **MSE=0.8149** and **variance or r-squared=0.262**. Still not better than the full model.

RFE MODEL
- **MSE=1.075** and **variance or r-squared=0.0252**. A VERY bad model.

RandomForestRegressor for Feature Selection
- **MSE=0.475** and **variance or r-squared=0.569**. The best regression model so far.
- num_voted_users, duration, budget, num_user_for_reviews, and gross are the top 5 important variables.

Classification

The goal is to build a classification model to predict if a movie is good or bad.

Decision tree

Confusion Matrix explanation

141 movies were misclassified as bad movies

155 movies were misclassified as good movies

The decision tree model is 74% accurate

Therefore, we expect the model to be about 74% accurate when applied in a real-world situation.

True Positive Rate (Sensitivity) = 655/810 = 0.80

False Positive Rate = 141/330 = 0.427

True Negative Rate (Specificity) = 189/330 = 0.572

False Negative Rate = 155/810 = 0.191
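
A sketch of how the tree, its accuracy, and the rates above can be produced, assuming dfCM is the classification frame described later in the report (imdb_quality = 0 for bad, 1 for good); the split parameters are illustrative:

```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

Xc = dfCM.drop(columns=["imdb_quality"])
yc = dfCM["imdb_quality"]
Xc_train, Xc_test, yc_train, yc_test = train_test_split(Xc, yc, test_size=0.3, random_state=0)

tree_clf = DecisionTreeClassifier(random_state=0).fit(Xc_train, yc_train)
tree_pred = tree_clf.predict(Xc_test)
print("Accuracy:", accuracy_score(yc_test, tree_pred))

# Derive the four rates from the confusion matrix counts
tn, fp, fn, tp = confusion_matrix(yc_test, tree_pred).ravel()
print("Sensitivity (TPR):", tp / (tp + fn))
print("False Positive Rate:", fp / (fp + tn))
print("Specificity (TNR):", tn / (tn + fp))
print("False Negative Rate:", fn / (fn + tp))
```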

Visualizing decision tree

Simpler decision tree with less variables

Decision Tree interpretation

All 202 of those movies with num_voted_users <= 86645.5, duration <= 108.5, and budget not <= 15550000 (meaning greater than 15550000) are predicted as bad.
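
To read rules like this one directly from a fitted tree, scikit-learn's export_text prints the split paths; a sketch using tree_clf from the sketch above:

```python
from sklearn.tree import export_text

# Print the learned split rules down to a readable depth
print(export_text(tree_clf, feature_names=list(Xc_train.columns), max_depth=3))
```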

10-fold cross validation: the basic idea is that, rather than testing model quality only once, cross validation (here 10-fold CV) tests the model 10 times, each time with a different testing dataset.
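
A sketch of the 10-fold cross validation of the decision tree on the full classification data:

```python
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(random_state=0), Xc, yc, cv=10)
print("Mean accuracy:", scores.mean(), "+/-", scores.std())
```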

Random Forest (Ensemble model)

Building multiple decision trees (an ensemble of decision trees) with the purpose of improving model accuracy: RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0), where n_estimators is the number of decision trees in the forest. Combining different opinions is likely to lead to higher accuracy.

The RandomForestClassifier has an accuracy of 80.7%

num_voted_users and num_critic_for_reviews appear to be the two most important predictors.
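
A sketch of the ensemble fit and its importances; note that recent scikit-learn versions require min_samples_split to be at least 2, so that value differs from the older signature quoted above:

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

forest = RandomForestClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)
forest.fit(Xc_train, yc_train)
print("Accuracy:", accuracy_score(yc_test, forest.predict(Xc_test)))

importances = pd.Series(forest.feature_importances_, index=Xc_train.columns)
print(importances.sort_values(ascending=False).head(5))   # most important predictors
```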

K-Nearest Neighbor (KNN)

When using KNN, I get only 65.26% accuracy.
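
A sketch of the KNN classifier on the same split; k=5 is scikit-learn's default and is illustrative only (Appendix 2 searches for the optimal k with GridSearch):

```python
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5).fit(Xc_train, yc_train)
print("Accuracy:", knn.score(Xc_test, yc_test))
```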

Appendix 1: 10 fold cross validation

Appendix 2: Search for the optimal k value (GridSearch)

Appendix 3: Model Evaluation with ROC

Logistic Regression

This Logistic Regression model has 72.1% accuracy
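
A sketch of the logistic model on the same split; max_iter is raised here because the default solver may not converge on unscaled features such as budget and gross:

```python
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(max_iter=5000).fit(Xc_train, yc_train)
print("Accuracy:", logit.score(Xc_test, yc_test))
```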

Interpretation:

Comparing Algorithms

Interpretation and conclusion on Classification

Results from regression, **classification**, and clustering

**CLASSIFICATION**
- Used the dfCM dataset (a copy of dfRM).
- Created a new categorical column from imdb_score in order to build classification models: I converted imdb_score into 2 categories (or classes), 1~5 and 6~10, representing bad (0) and good (1) respectively.
- Variables correlated with imdb_quality (the new imdb_score) are: num_voted_users, duration, num_critic_for_reviews, num_user_for_reviews, and movie_facebook_likes.

DECISION TREE
- Accuracy = 0.74
- True Positive Rate (Sensitivity) = 655/810 = 0.80
- False Positive Rate = 141/330 = 0.427
- True Negative Rate (Specificity) = 189/330 = 0.572
- False Negative Rate = 155/810 = 0.191

Random Forest (Ensemble model)
- Accuracy = 0.807
- Top 5 important variables: num_voted_users, duration, num_critic_for_reviews, gross, and budget.

K-Nearest Neighbor (KNN)
- Accuracy = 0.653
- Using 10-fold cross validation: accuracy = 0.686

Logistic model
- Accuracy = 0.721

Comparing Algorithms
- The best model to use is the Random Forest; it has the largest area under the ROC curve (0.73).
- The 5 most important variables are num_voted_users, duration, num_critic_for_reviews, gross, and budget.

Clustering

Objective: analyze the data using the K-means algorithm, determine the optimal K value for K-means, and report the movie "profiles" based on the clustering analysis.
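
A sketch of the K-means workflow: the inertia over a range of k values suggests the elbow, then the final model is fit with k=2 and the per-cluster means give the profiles. Scaling and the use of the numeric dfRM frame are assumptions, not necessarily the notebook's exact setup:

```python
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(dfRM)   # standardize so large counts don't dominate

# Inertia for k = 1..10; the "elbow" in this curve suggests the optimal k
inertia = [KMeans(n_clusters=k, n_init=10, random_state=0).fit(X_scaled).inertia_
           for k in range(1, 11)]
print(inertia)

km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_scaled)
clustered = dfRM.assign(cluster=km.labels_)
print(clustered["cluster"].value_counts())        # size of cluster 0 and cluster 1
print(clustered.groupby("cluster").mean())        # profile of each cluster
```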

Clustering analysis (k = 2)

Cluster 0 has 1871 observations.

Cluster 1 has 1929 observations.

Profile of each cluster

**CLUSTERING**

K-MEANS
- K=2
- Cluster 0 has 1871 observations.
- Cluster 1 has 1929 observations.

Profile of each cluster
- Movies in cluster 0, on average, are characterized by lower values of director_name, director_facebook_likes, actor_2_name, and actor_1_name (the name columns are label-encoded), and higher actor_1_facebook_likes, cast_total_facebook_likes, and budget.
- Movies in cluster 1, on average, are characterized by higher values of director_name, director_facebook_likes, actor_2_name, and actor_1_name, and lower cast_total_facebook_likes.

Storytelling

The objective is to develop useful insights from the business intelligence analysis (data visualization, correlation, pivot tables) and the models (regression, classification, and clustering).

- The director with the most movies is Steven Spielberg, with 26 movies.
- The director whose movies have the highest total gross is Steven Spielberg, with 4.114233e+09.
- The director with the most Facebook likes is Steven Spielberg, with 364000.
- The most movies a main actor has played in is 49, by Robert De Niro.
- The actor with the highest-grossing movies is Johnny Depp, with 3.688020e+09.
- The movie with the most critic reviews is Skyfall, with 1500.
- The movie with the most votes is The Shawshank Redemption, with 1689764.
- Adventure is the highest-grossing genre, with 7.855534e+10.
- Towering Inferno has the highest imdb_score, with 9.5.
- Doona Bae starred in the biggest-budget movie (The Host), with 1.221550e+10.
- The most successful movie a director (James Cameron) and primary actor (CCH Pounder) made together, in terms of gross, is Avatar, with 760505847.0.
- R is the most common rating, with 2098 movies.

A box office hit movie in terms of gross would be made by:
- director: Steven Spielberg
- actor_1_name: Johnny Depp
- genre: Adventure
- content_rating: R

**My best regression model**
- RandomForestRegressor for Feature Selection: MSE=0.475 and (variance or r-squared)=0.569

**My best classification model**
- Random Forest (Ensemble model): accuracy = 0.798

**Most important variables**
- num_voted_users, duration, budget, num_user_for_reviews, num_critic_for_reviews, and gross