Extract Response and Predictors

# draft 1
# Exploratory analysis of the TMDB 5000 movies/credits CSVs: which columns
# relate to revenue, how the release date relates to revenue, and a linear
# regression that predicts revenue from a few numeric columns.
# Bare expressions such as `finalmov` are kept for notebook-style display.
import datetime as dt
import json
import re

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

sb.set()

# Columns used for the revenue correlation heatmaps.
NUMERIC_COLUMNS = ["revenue", "budget", "popularity", "runtime",
                   "vote_average", "vote_count", "month"]
# Predictor columns of the linear regression.
PREDICTORS = ["budget", "popularity", "vote_average", "vote_count"]
# Pulls every "name" value out of the raw JSON text of a `cast` cell.
NAME_PATTERN = re.compile(r'(?<="name": ")(.*?)(?=",)')


def extract_actors(row):
    """Return the list of `name` values from the JSON in ``row['cast']``."""
    return [member.get('name') for member in json.loads(row['cast'])]


def check_weekday(date):
    """Return 1 if *date* is a business day according to pandas, else 0."""
    return 1 if len(pd.bdate_range(date, date)) else 0


def add_release_features(frame):
    """Return release_date/revenue plus quarter, weekday and isWeekend.

    Works on a copy so the column assignments never write into a slice of
    the caller's frame.
    """
    out = frame[['release_date', 'revenue']].copy()
    out['release_date'] = pd.to_datetime(out['release_date'])
    out['quarter'] = out['release_date'].dt.quarter
    out['weekday'] = out['release_date'].dt.dayofweek
    # 0 for weekday and 1 for weekend (dayofweek 5 and 6)
    out['isWeekend'] = (out['weekday'] >= 5).astype(int)
    return out


def plot_corr_heatmap(frame, cmap):
    """Draw an annotated heatmap of the numeric-column correlations."""
    plt.figure(figsize=(12, 8))
    # numeric_only=True: datetime/string columns cannot be correlated.
    sb.heatmap(frame.corr(numeric_only=True), annot=True, cmap=cmap)


def plot_revenue_by_month(data, xlim=None):
    """Box plot with overlaid swarm plot of revenue per release month."""
    plt.figure(figsize=(24, 18))
    ax = sb.boxplot(x="revenue", y="month", data=data, orient="h")
    sb.swarmplot(x="revenue", y="month", data=data, orient="h",
                 color="black")
    if xlim is not None:
        ax.set(xlim=xlim)
    return ax


movies = pd.read_csv("tmdb_5000_movies.csv")
movies.columns
credits = pd.read_csv("tmdb_5000_credits.csv")
credits.columns

# The draft read these from an undefined `moviedata`; `movies` is the frame
# that holds the columns.
genre = movies[['genres']]
title = movies[['original_title']]
revenue = movies[['revenue']]
release = movies[['release_date']]

# add a new column for all the actors name -- USE THIS CODE
credits["actor_names"] = credits.apply(extract_actors, axis=1)
credits

# or: collect every cast name of every movie into one flat list
finalNameList = []
for cast_json in credits['cast']:
    finalNameList.extend(NAME_PATTERN.findall(cast_json))
print(finalNameList)

# Cleaning data
fullmov = pd.concat([movies, credits], axis=1)
fullmov.shape

# dropping useless columns, movie_id and id can be omitted, homepage also
# need to be drop since it has too much missing value. title is repeating
finalmov = fullmov.drop(['id', 'movie_id', 'overview', 'homepage', 'title',
                         'status', 'tagline'], axis=1)
finalmov

# removing data w 0 revenue (filter the cleaned frame, not `fullmov`, so the
# column drop above is kept)
finalmov = finalmov.loc[finalmov['revenue'] != 0].copy()

# new code for dates: release month, needed by the heatmaps and plots below
finalmov['month'] = pd.to_datetime(finalmov['release_date']).dt.month
finalmov

# Movies whose genres text mentions 'Action'. A boolean mask replaces the
# row-by-row append/concat loops (DataFrame.append was removed in pandas 2).
action = finalmov[finalmov['genres'].str.contains('Action', regex=False,
                                                  na=False)]
action
print(action['month'])
months = action['month']

# full data in action
plot_revenue_by_month(action)

# set the limit for the plot so that the value look be less skew
plot_revenue_by_month(action, xlim=(10000, 500000000))

# What areas have the most influence on revenue?
data1 = finalmov[NUMERIC_COLUMNS]
data1.corr()
plot_corr_heatmap(data1, "YlGnBu")

# What influence does release date have on revenue?
data2 = add_release_features(finalmov)
data2

# no of movies by quater
plt.figure()
plt.hist(data2['quarter'])

# no of movies released on weekend vs weekday (one figure per histogram so
# they do not overlap on the same axes)
plt.figure()
plt.hist(data2['isWeekend'])
plt.figure()
plt.hist(data2['weekday'])

corr = data2.corr(numeric_only=True)
plot_corr_heatmap(data2, "Blues")

# visualising data for action
data3 = action[NUMERIC_COLUMNS]
data3.corr()
plot_corr_heatmap(data3, "YlGnBu")

action1 = add_release_features(action)
action1
corr = action1.corr(numeric_only=True)
plot_corr_heatmap(action1, "Blues")

# find the top 10 actors that appear the most often
actor = finalmov["actor_names"]
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(actor)
label_classes = mlb.classes_
label_classes
label_data = pd.DataFrame(labels, columns=label_classes)
# Each column is a 0/1 indicator, so its sum is the number of movies the
# actor appears in.
val = label_data.sum().to_dict()
sorted_val = sorted(val.items(), key=lambda kv: kv[1], reverse=True)
sorted_val

# sam l jackson is the most appeared actor
# Boolean mask instead of iloc[index:index+1]: after the revenue filter the
# index labels no longer match row positions.
sam = finalmov[finalmov['cast'].str.contains('Samuel L. Jackson',
                                             regex=False, na=False)]
sam
sam.corr(numeric_only=True)
plot_corr_heatmap(sam, "YlGnBu")
# prolly have to do the same for the other 9 xx

# Extract Response and Predictors
y = pd.DataFrame(finalmov['revenue'])
X = pd.DataFrame(finalmov[PREDICTORS])

# Split first: the draft fitted the model before X_train/y_train existed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Print the Coefficients against Predictors
print(pd.DataFrame(list(zip(X_train.columns, linreg.coef_[0])),
                   columns=["Predictors", "Coefficients"]))
print()

# Predict Response corresponding to Predictors
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Plot the Predictions vs the True values
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color="blue")
axes[0].plot(y_train, y_train, 'w-', linewidth=1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color="green")
axes[1].plot(y_test, y_test, 'w-', linewidth=1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:",
      mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:",
      mean_squared_error(y_test, y_test_pred))
print()

# Prediction of revenue
revenue_pred = finalmov[finalmov["original_title"].isin(
    ["Men in Black 3", "The Avengers", "Avatar"])]
revenue_pred

X_pred = pd.DataFrame(revenue_pred[PREDICTORS])
y_pred = linreg.predict(X_pred)
y_pred

y_pred = pd.DataFrame(y_pred, columns=["PredTotal"],
                      index=revenue_pred.index)
finalmov_acc = pd.concat([revenue_pred[["original_title", "revenue"]],
                          y_pred], axis=1)

# Absolute percentage error of the prediction against the real revenue.
finalmov_acc["Error"] = (
    100 * abs(finalmov_acc["revenue"] - finalmov_acc["PredTotal"])
    / finalmov_acc["revenue"]
)
finalmov_acc

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
**Bold font**	Bold font
*Italics font*	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19ᵗʰ
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is an alert area. :::	This is an alert area.