#draft 1
import re
import json
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.colors
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.neighbors import LocalOutlierFactor
sb.set()

# Load the two TMDB 5000 CSV exports (expected in the working directory).
movies = pd.read_csv("tmdb_5000_movies.csv")
movies.columns
credits = pd.read_csv("tmdb_5000_credits.csv")
credits.columns

# Single-column frames for the fields of interest.
# NOTE(review): the original read these from `moviedata`, which is never
# defined in this file (NameError). `movies` is assumed to be the intended
# frame -- confirm these four columns exist in tmdb_5000_movies.csv.
genre = movies[['genres']]
title = movies[['original_title']]
revenue = movies[['revenue']]
release = movies[['release_date']]
# Add a new column holding the list of actor names for each movie (preferred approach).
def extract_actors(row):
    """Return the cast member names stored in one row's ``'cast'`` field.

    Args:
        row: Mapping-like row (for example a ``pandas.Series``) whose
            ``'cast'`` entry is a JSON string encoding a list of objects.

    Returns:
        A list with the ``'name'`` value of every cast entry, in the order
        they appear. An entry without a ``'name'`` key contributes ``None``.

    Raises:
        json.JSONDecodeError: If the ``'cast'`` string is not valid JSON.
    """
    return [member.get('name') for member in json.loads(row['cast'])]
# Run extract_actors across every credits row (row-wise apply) and keep the
# resulting name lists in a new "actor_names" column.
credits["actor_names"] = credits.apply(extract_actors, axis="columns")
credits
# Alternative: gather every cast name across all movies into one flat list by
# pattern-matching the raw JSON text instead of parsing it.
# The pattern only captures a "name" value that is followed by `",`, i.e. a
# name that is not the last key of its JSON object.
name_pattern = re.compile(r'(?<="name": ")(.*?)(?=",)')
finalNameList = []
for cast_json in credits['cast']:
    finalNameList.extend(name_pattern.findall(cast_json))
# Print once, after all rows have been collected.
print(finalNameList)
# Cleaning data
# Put the movies and credits columns side by side, pairing rows by index.
# NOTE(review): this assumes both CSVs list the same movies in the same row
# order -- verify, or join on id / movie_id instead.
fullmov = pd.concat([movies, credits], axis=1)
fullmov.shape

# Drop columns that are not needed: id and movie_id can be omitted, homepage
# has too many missing values, and title repeats information already present.
# (The original referenced a misspelled `fulllmov`, which raised NameError.)
finalmov = fullmov.drop(
    ['id', 'movie_id', 'overview', 'homepage', 'title', 'status', 'tagline'],
    axis=1,
)
finalmov

# Remove movies with 0 revenue. Filter the trimmed frame: the original
# restarted from `fullmov` here, which silently undid the column drop above.
finalmov = finalmov.loc[finalmov['revenue'] != 0]
finalmov
# Select the movies whose genres text mentions 'Action'.
# NOTE(review): the original iterated over `df`, which is never defined in
# this file, and grew the result with DataFrame.append (removed in pandas
# 2.0). The cleaned `finalmov` frame is assumed to be the intended source --
# confirm.
is_action = finalmov['genres'].str.contains('Action', regex=False, na=False)
# Copy so that columns added to the subset later do not touch `finalmov`.
j = finalmov.loc[is_action].copy()
newdf = pd.DataFrame(j)
newdf
# OR: the same Action-genre selection, kept in `j`.
# NOTE(review): the original looped over `combi`, which is never defined in
# this file, and sliced it with iloc[index:index+1] using index *labels* as
# positions (wrong once rows have been filtered out). `finalmov` is assumed to
# be the intended frame -- confirm.
j = finalmov.loc[
    finalmov['genres'].str.contains('Action', regex=False, na=False)
]
## Derive the release month of every row in newdf from its release_date
release_index = pd.DatetimeIndex(newdf['release_date'])
newdf['month'] = release_index.month
months = newdf['month']
print(months)
# Revenue per release month for the Action movies, full revenue range.
# NOTE(review): the original plotted `action`, which is never defined in this
# file. `newdf` (the Action subset that carries the `month` column) is assumed
# to be the intended data -- confirm.
f = plt.figure(figsize=(24, 18))
f = sb.boxplot(x='revenue', y='month', data=newdf, orient="h")
f = sb.swarmplot(x="revenue", y="month", data=newdf, orient="h", color="black")

# Same plot with the revenue axis limited so the values look less skewed.
f = plt.figure(figsize=(24, 18))
f = sb.boxplot(x="revenue", y="month", data=newdf, orient="h")
f = sb.swarmplot(x="revenue", y="month", data=newdf, orient="h", color="black")
f.set(xlim=(10000, 500000000))
# What areas have the most influence on revenue?
# Correlation between revenue, the numeric movie attributes and release month.
# Nothing in this file adds a `month` column to `finalmov` (only `newdf` gets
# one), so selecting it directly raised KeyError; derive it here instead.
numeric_cols = ["revenue", "budget", "popularity", "runtime", "vote_average", "vote_count"]
data1 = finalmov[numeric_cols].copy()
data1["month"] = pd.to_datetime(finalmov["release_date"]).dt.month
data1_corr = data1.corr()
data1_corr
plt.figure(figsize=(12, 8))
sb.heatmap(data1_corr, annot=True, cmap="YlGnBu")
# What influence does release date have on revenue?
import datetime as dt
# Take an explicit copy: assigning new columns to a plain column selection of
# `finalmov` triggers SettingWithCopyWarning and may not stick.
data2 = finalmov[['release_date', 'revenue']].copy()
data2['release_date'] = pd.to_datetime(data2['release_date'])
data2['quarter'] = data2['release_date'].dt.quarter
# Number of movies by quarter, on its own figure so it is not drawn on top of
# whichever figure happens to be current. Missing dates are skipped.
plt.figure()
plt.hist(data2['quarter'].dropna())
def check_weekday(date):
    """Return 1 if *date* is a business day, otherwise 0.

    The check asks pandas for the business-day range that starts and ends on
    *date*; the range is empty exactly when *date* is not a business day.

    Args:
        date: A date value accepted by ``pandas.bdate_range`` as both start
            and end (for example an ISO date string or a ``Timestamp``).

    Returns:
        ``1`` for a business day, ``0`` otherwise.
    """
    return 1 if len(pd.bdate_range(date, date)) else 0
# Day of week of the release (pandas numbers Monday as 0 and Sunday as 6).
data2['weekday'] = data2['release_date'].dt.dayofweek
data2['isWeekend'] = (data2['weekday'] >= 5).astype(int)  # 0 for weekday and 1 for weekend
data2

# Number of movies released on weekend vs weekday, and per day of week.
# Each histogram gets its own figure; the original drew both on the same axes.
plt.figure()
plt.hist(data2['isWeekend'])
plt.figure()
plt.hist(data2['weekday'].dropna())

# Correlate only the numeric columns: release_date is a datetime column and
# must not be fed to corr().
corr = data2.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sb.heatmap(corr, annot=True, cmap="Blues")
# Visualising data for Action: select the rows whose genres text mentions
# 'Action'.
# NOTE(review): the original looped over `combi`, which is never defined in
# this file, and sliced it with iloc[index:index+1] using index *labels* as
# positions (wrong once rows have been filtered out). `finalmov` is assumed to
# be the intended frame -- confirm.
j = finalmov.loc[
    finalmov['genres'].str.contains('Action', regex=False, na=False)
]
# Correlation heatmap of the numeric attributes for the Action movies.
# NOTE(review): the original read from `action`, which is never defined in
# this file. `newdf` (the Action subset that carries the `month` column) is
# assumed to be the intended data -- confirm.
data3 = newdf[["revenue", "budget", "popularity", "runtime", "vote_average", "vote_count", "month"]]
data3_corr = data3.corr()
data3_corr
plt.figure(figsize=(12, 8))
sb.heatmap(data3_corr, annot=True, cmap="YlGnBu")
import datetime as dt
# Release date and revenue of the Action movies, plus the release quarter.
# NOTE(review): the original read from `action`, which is never defined in
# this file; `newdf` (the Action subset) is assumed to be intended -- confirm.
# An explicit copy avoids assigning columns onto a slice of another frame.
action1 = newdf[['release_date', 'revenue']].copy()
action1['release_date'] = pd.to_datetime(action1['release_date'])
action1['quarter'] = action1['release_date'].dt.quarter
action1
def check_weekday(date):
    """Return 1 if *date* is a business day, otherwise 0.

    The check asks pandas for the business-day range that starts and ends on
    *date*; the range is empty exactly when *date* is not a business day.

    Args:
        date: A date value accepted by ``pandas.bdate_range`` as both start
            and end (for example an ISO date string or a ``Timestamp``).

    Returns:
        ``1`` for a business day, ``0`` otherwise.
    """
    return 1 if len(pd.bdate_range(date, date)) else 0
# Day of week of the release (pandas numbers Monday as 0 and Sunday as 6).
action1['weekday'] = action1['release_date'].dt.dayofweek
action1['isWeekend'] = (action1['weekday'] >= 5).astype(int)  # 0 for weekday and 1 for weekend
action1

# Correlate only the numeric columns: release_date is a datetime column and
# must not be fed to corr().
corr = action1.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sb.heatmap(corr, annot=True, cmap="Blues")
# Rank actors by the number of movies they appear in; the first 10 entries of
# `sorted_val` are the top 10.
actor = finalmov["actor_names"]
from sklearn.preprocessing import MultiLabelBinarizer
# One 0/1 indicator column per distinct actor name, one row per movie.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(actor)
label_classes = mlb.classes_
label_classes  # the original had a `label_classe` typo here (NameError)
label_data = pd.DataFrame(labels, columns=label_classes)
# The column sums of the indicator matrix are each actor's movie count; this
# replaces a per-column value_counts() loop.
val = label_data.sum(axis=0).to_dict()
# (actor, count) pairs, most frequent first.
sorted_val = sorted(val.items(), key=lambda kv: kv[1], reverse=True)
sorted_val
# Samuel L. Jackson was found to be the most frequently appearing actor;
# collect the movies whose cast text mentions him.
# A boolean mask replaces the original row loop, which sliced with
# iloc[index:index+1] using index *labels* as positions -- wrong once rows
# have been filtered out of `finalmov`.
is_sam = finalmov['cast'].str.contains('Samuel L. Jackson', regex=False, na=False)
sam = finalmov.loc[is_sam]
sam

# Correlate only the numeric columns; `sam` also holds text columns such as
# `cast`, which corr() cannot use.
sam_corr = sam.select_dtypes(include='number').corr()
sam_corr
plt.figure(figsize=(12, 8))
sb.heatmap(sam_corr, annot=True, cmap="YlGnBu")
# TODO: probably have to do the same for the other 9 top actors.
# Extract Response and Predictors
y = pd.DataFrame(finalmov['revenue'])
X = pd.DataFrame(finalmov[["budget", "popularity", "vote_average", "vote_count"]])

# Split into train and test sets, holding out 25% of the rows for testing.
# The split must happen before fitting: the original called linreg.fit on
# X_train / y_train before they were defined (NameError).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Linear Regression using Train Data
linreg = LinearRegression()   # create the linear regression object
linreg.fit(X_train, y_train)  # train the linear regression model
# Report the fitted intercept and coefficients of the regression
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Tabulate each predictor next to its fitted coefficient
coef_pairs = list(zip(X_train.columns, linreg.coef_[0]))
coef_table = pd.DataFrame(coef_pairs, columns=["Predictors", "Coefficients"])
print(coef_table)
print()
# Model predictions for the train and test predictors
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Predicted vs true values, one panel per dataset; the line drawn through
# (true, true) marks where a perfect prediction would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
panels = [
    (
        axes[0], y_train, y_train_pred, "blue",
        "True values of the Response Variable (Train)",
        "Predicted values of the Response Variable (Train)",
    ),
    (
        axes[1], y_test, y_test_pred, "green",
        "True values of the Response Variable (Test)",
        "Predicted values of the Response Variable (Test)",
    ),
]
for ax, actual, predicted, point_color, x_label, y_label in panels:
    ax.scatter(actual, predicted, color=point_color)
    ax.plot(actual, actual, 'w-', linewidth=1)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
plt.show()
# Goodness of fit (R^2 and MSE), reported first on the train data and then on
# the test data.
fit_reports = (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train, y_train_pred),
    ("Goodness of Fit of Model \tTest Dataset", X_test, y_test, y_test_pred),
)
for heading, features, target, predicted in fit_reports:
    print(heading)
    print("Explained Variance (R^2) \t:", linreg.score(features, target))
    print("Mean Squared Error (MSE) \t:", mean_squared_error(target, predicted))
    print()
# Predict revenue for a few hand-picked titles and compare with the actuals
titles_to_check = ["Men in Black 3", "The Avengers", "Avatar"]
revenue_pred = finalmov[finalmov["original_title"].isin(titles_to_check)]
revenue_pred

predictor_cols = ["budget", "popularity", "vote_average", "vote_count"]
X_pred = pd.DataFrame(revenue_pred[predictor_cols])
y_pred = linreg.predict(X_pred)
y_pred

# Wrap the raw predictions in a frame that shares the selected movies' index
# so they line up row by row with the true revenue.
y_pred = pd.DataFrame(y_pred, columns=["PredTotal"], index=revenue_pred.index)
actual_cols = revenue_pred[["original_title", "revenue"]]
finalmov_acc = pd.concat([actual_cols, y_pred], axis=1)

# Absolute error of each prediction as a percentage of the true revenue
abs_diff = abs(finalmov_acc["revenue"] - finalmov_acc["PredTotal"])
y_errs = 100 * abs_diff / finalmov_acc["revenue"]
y_errs = pd.DataFrame(y_errs, columns=["Error"], index=revenue_pred.index)
finalmov_acc = pd.concat([finalmov_acc, y_errs], axis=1)
finalmov_acc