Scikit-learn

tags: 數據分析 機器學習

資料科學競賽平台

Kaggle

  • 很多數據集
  • 很多線上比賽可以打

天池

  • 阿里巴巴的平台

工具使用步驟

  1. 用pandas把資料讀進來
  2. 用numpy做資料處理

模板

Import

%matplotlib inline import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from sklearn.decomposition import PCA import numpy as np import pandas as pd from sklearn import preprocessing, linear_model, neighbors, from sklearn.metrics import mean_squared_error, r2_score, accuracy_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from sklearn.cluster import KMeans

匯入檔案

# Check the file format before loading.
# - Not a standard CSV (whitespace-separated): pass sep=r'\s+'
#   (this replaces delim_whitespace=True, deprecated since pandas 2.2).
# - No header row: pass header=None.
# df = pd.read_csv('path', header=None, sep=r'\s+')
df = pd.read_csv('path')

missing value

# Keep only the column(s) selected by `index` (placeholder), then drop every
# row that contains a missing value.
df = df[index].dropna(axis='index', how='any')

分割answer跟data

# General form: x = df.drop(index, axis=1)
#   axis=1 refers to columns
#   axis=0 refers to rows
# Column 13 is taken as the answer; everything else is the data.
y = df[13]
x = df.drop(columns=13)

Poly特有,產feature

# Generate the degree-2 polynomial features and replace x with them.
poly = PolynomialFeatures(degree=2)
x = poly.fit_transform(x)

Split data

# test_size is usually smaller than 0.5.
# random_state=1 seeds the shuffle, so the same split is produced on every
# run (useful while debugging).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=1,
)

Normalization

# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test split.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

Model Select

# Choose exactly ONE estimator. The original assigned both in a row, so the
# first model was silently overwritten by the second.
# Regression (continuous target, as in the df[13] example of this template):
model = linear_model.LinearRegression()
# Classification (discrete labels) -- use this line instead of the one above:
# model = LogisticRegression()

model.fit(x_train, y_train)

Predict

# Linear: predict on the test split, then print the coefficients,
# the mean squared error and the R^2 score.
y_pred = model.predict(x_test)
print(f'Cofficient : {model.coef_}')
print(f'Mean squared error : {mean_squared_error(y_test, y_pred)}')
print(f'Variance score : {r2_score(y_test, y_pred)}')

# Logistic: print the fitted parameters, the predictions and the score
# on the test split.
print(model.coef_)
print(model.intercept_)
y_pred = model.predict(x_test)
print(y_pred)
accuracy = model.score(x_test, y_test)
print(accuracy)

Code

Linear Regression

from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the data: whitespace-separated file without a header row.
# sep=r'\s+' replaces delim_whitespace=True, which is deprecated since pandas 2.2.
df = pd.read_csv('./dataset/housing.csv', header=None, sep=r'\s+')

# Extract the answer (column 13); the remaining columns are the data.
y = df[13]
x = df.drop(13, axis=1)

# Split data: 10% is held out for testing. No random_state is given, so the
# split (and therefore the printed metrics) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

# Normalization: fit the scaler on the training split only, then apply it
# to both splits.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model select
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# Predict and report
y_pred = model.predict(x_test)
print('Cofficient : {}'.format(model.coef_))
print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))
print('Variance score : {}'.format(r2_score(y_test, y_pred)))

Polynomial Regression

# import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Load the file.
df = pd.read_csv('./dataset/winequality-red.csv')

# Separate the answer ('quality') from the data.
y = df['quality']
x = df.drop(columns='quality')

# Generate the degree-2 polynomial features.
poly = PolynomialFeatures(degree=2)
x = poly.fit_transform(x)

# Split data: 30% held out for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

# Normalization: fit on the training split, apply to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Select model
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)

# Inspect the coefficients and the regression metrics.
print(f'The coefficient : {model.coef_}\n')
print(f'Mean squared error : {mean_squared_error(y_test, y_pred)}')
print(f'Variance score : {r2_score(y_test, y_pred)}')

Logistic Regression

import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

pima = pd.read_csv('./dataset/pima-indians-diabetes.csv')

# Alternative: use only a subset of the columns as the data.
# x = pima[['pregnant', 'insulin', 'bmi', 'age']]
y = pima['label']
x = pima.drop(columns=['label'])

# Split data: 30% held out for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

# Normalization: fit on the training split, apply to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

model = LogisticRegression()
model.fit(x_train, y_train)

# Fitted parameters.
print(model.coef_)
print(model.intercept_)

# Predictions and score on the test split.
y_pred = model.predict(x_test)
print(y_pred)
accuracy = model.score(x_test, y_test)
print(accuracy)

Template

Supervised

%matplotlib inline import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from sklearn.decomposition import PCA import numpy as np import pandas as pd from sklearn import preprocessing, linear_model, neighbors from sklearn.metrics import mean_squared_error, r2_score, accuracy_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from sklearn.cluster import KMeans ### 匯入檔案 ## df = df.reav_csv('path') ## df = df.read_csv('path', header = None) ## df = df.read_csv('path', delim_whitspace = true) ## missing value #df = df[index].dropna(axis = 0, how = 'any') ## 分割answer and data ## x = df.drop(index, axis = 1) ## axis = 1 代表直列,column ## axis = 0 代表橫列,row # y = df[index] # x = df.drop(index, axis = 1) ### Poly(視情況使用) ### 產生degree 為 2 的feature ## poly = PolynomialFeatures(degree = 2).fit(x) ## x = poly.transform(x) ## Split data ## test_size通常小於0.5 ## random_state = 1,使shuffle機制停止,固定切割資料,debug可以用 # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1) ## Normalization # scaler = preprocessing.StandardScaler().fit(x_train) # x_train = scaler.transform(x_train) # x_test = scaler.transform(x_test) ## Model Select ### Linear Regression ## model = linear_model.LinearRegression() ## model = LogisticRegression() ### KNN ## model = neighbors.KNeighborsClassifier() ## model = neighbors.KNeighborsClassifier(n_neighbors=3) ### Decision Tree ## model = DecisionTreeClassifier(max_depth=3) ## model = DecisionTreeClassifier() ### Random Forest ## model = RandomForestClassifier(max_depth=7, random_state=0) ### SVC ## model = SVC(kernel='rbf') ### Naive Bayes ## model = GaussianNB() ## Fit # model.fit(x_train, y_train) ## Predict # y_pred = model.predict(x_test) ### Metric ## print('Cofficient : {}'.format(model.coef_)) ## print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred))) ## print('Variance score : 
{}'.format(r2_score(y_test, y_pred))) ## print('Intercept of Model : {}'.format(model.intercept_)) ## print('準確率 : {}'.format(model.score(x_test, y_test))) ## print('accuracy : {}'.format(accuracy_score(y_test, y_pred))) ## print('number of correct sample : {}'.format(accuracy_score(y_test, y_pred, normalize = False))) ## print('con_matrix: {}'.format(confusion_matrix(y_test, y_pred))) ## 匯出CSV # submission = pd.DataFrame({ # "PassengerId": test_df["PassengerId"], # "Survived": Y_pred # }) # submission.to_csv('submission.csv', index=False)

Unsupervised

%matplotlib inline import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from sklearn.decomposition import PCA import numpy as np import pandas as pd from sklearn import preprocessing, linear_model, neighbors from sklearn.metrics import mean_squared_error, r2_score, accuracy_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LogisticRegression from sklearn.cluster import KMeans ### 匯入檔案 ## df = df.reav_csv('path') ## df = df.read_csv('path', header = None) ## df = df.read_csv('path', delim_whitspace = true) ## missing value #df = df[index].dropna(axis = 0, how = 'any') ### 合併資料 ## f1 = data['V1'].values ## f2 = data['V2'].values ## X = np.array(list(zip(f1, f2))) ### Model select ## model = KMeans(n_clusters=3).fit(x) ## model = DBSCAN(eps=0.3, min_samples=5).fit(x) ## model = mixture.GaussianMixture(n_components=3).fit(X) ## Predict # labels = model.predict(x) # labels = model.labels_ # X_pred = gmm.predict(x) #### Metric ### Kmeans ## centroids = kmeans.cluster_centers_ ## print('centroids: {}'.format(centroids)) ## print('prediction on each data: {}'.format(labels)) ## labels = kmeans.predict(np.array([[12.0,14.0]])) ## print('prediction on data point (12.0, 14.0): {}'.format(labels)) ### DBSCAN ### Note that -1 are noisy points ## print('cluster on X {}'.format(labels)) ### Number of clusters in labels, ignoring noise if present. ## n_clusters = len(set(labels)) - (1 if -1 in labels else 0) ## print('number of clusters: {}'.format(n_clusters)) ## 匯出CSV # submission = pd.DataFrame({ # "PassengerId": test_df["PassengerId"], # "Survived": Y_pred # }) # submission.to_csv('submission.csv', index=False)