Scikit-learn

tags: `數據分析` `機器學習`

資料科學競賽平台

Kaggle

很多數據集
很多線上比賽可以打

天池

阿里巴巴的平台

工具使用步驟

用pandas把資料讀進來
用numpy做資料處理

模板

Import














%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, neighbors, 
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

匯入檔案





# Check the file format before loading.
# If the file is whitespace-delimited rather than standard CSV, pass delim_whitespace=True
# (newer pandas versions prefer the equivalent sep=r'\s+').
# If the file has no header row, pass header=None.
# df = pd.read_csv('path', header = None, delim_whitespace=True)
df = pd.read_csv('path')

missing value


# Select `index` (placeholder: fill in the label(s) you need) and discard every
# row that contains at least one missing value.
df = df[index].dropna(axis='index', how='any')

分割answer跟data





# General form: x = df.drop(index, axis=1)
#   axis=1 operates on columns, axis=0 operates on rows.
# Here column 13 holds the answer; everything else is feature data.
x = df.drop(columns=13)
y = df[13]

Poly特有，產feature



# Expand x into polynomial features up to degree 2; `poly` stays available
# as the fitted transformer.
poly = PolynomialFeatures(degree=2)
x = poly.fit_transform(x)

Split data



# Hold out part of the data for testing; test_size is usually below 0.5.
# random_state=1 fixes the shuffle seed so the same split is produced on every
# run (rows are still shuffled) -- handy when debugging.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=1,
)

Normalization



# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to the test split.
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

Model Select





# Pick ONE estimator for the task. As written, both assignments run, so the
# second one (LogisticRegression) replaces the first and is the model that gets fitted.
model = linear_model.LinearRegression()
model = LogisticRegression()


# Train the selected model on the training split.
model.fit(x_train, y_train)

Predict






















# Linear regression: predict on the test split, then report the fitted
# coefficients, the mean squared error and the R^2 score.
y_pred = model.predict(x_test)

print('Coefficient : {}'.format(model.coef_))

print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))

print('Variance score : {}'.format(r2_score(y_test, y_pred)))


# Logistic regression: show the fitted coefficients and intercept, the
# predictions on the test split, and the score on that split.
print(model.coef_)

print(model.intercept_)

y_pred = model.predict(x_test)

print(y_pred)
accuracy = model.score(x_test, y_test)

print(accuracy)

Code

Linear Regression



































from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


# Load the data: the file has no header row and is whitespace-delimited.
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv('./dataset/housing.csv', header=None, sep=r'\s+')
# Column 13 is the answer; the remaining columns are the features.
y = df[13]
x = df.drop(13, axis=1)


# Split data: hold out 10% of the rows for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)


# Normalization: fit the scaler on the training split only and reuse it for
# the test split.
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model select
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(x_test)

print('Coefficient : {}'.format(model.coef_))

print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))

print('Variance score : {}'.format(r2_score(y_test, y_pred)))

Polynomial Regression





































# import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures

# Load the dataset.
df = pd.read_csv('./dataset/winequality-red.csv')

# The 'quality' column is the answer; every other column is a feature.
x = df.drop(columns='quality')
y = df['quality']

# Expand the features into polynomial terms up to degree 2.
poly = PolynomialFeatures(degree=2)
x = poly.fit_transform(x)

# Hold out 30% of the rows for testing; random_state fixes the shuffle seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# Standardize with statistics learned from the training split only.
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Fit a linear regression on the expanded, standardized features.
model = linear_model.LinearRegression().fit(x_train, y_train)

# Predict on the test split.
y_pred = model.predict(x_test)

# Report the coefficients and the two regression metrics.
print('The coefficient : {}\n'.format(model.coef_))
mse = mean_squared_error(y_test, y_pred)
print('Mean squared error : {}'.format(mse))
r2 = r2_score(y_test, y_pred)
print('Variance score : {}'.format(r2))

Logistic Regression
































import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# Load the dataset.
pima = pd.read_csv('./dataset/pima-indians-diabetes.csv')

# Alternative: train on a hand-picked subset of the features.
# x = pima[['pregnant', 'insulin', 'bmi', 'age']]
# The 'label' column is the answer; every other column is a feature.
x = pima.drop(columns=['label'])
y = pima['label']

# Hold out 30% of the rows for testing; random_state fixes the shuffle seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# Standardize with statistics learned from the training split only.
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Fit the classifier.
model = LogisticRegression().fit(x_train, y_train)

# Inspect the fitted parameters.
print(model.coef_)
print(model.intercept_)

# Predict on the test split and report the score on that split.
y_pred = model.predict(x_test)
print(y_pred)

accuracy = model.score(x_test, y_test)
print(accuracy)

Template

Supervised



































































































%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, neighbors
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

### 匯入檔案
## df = pd.read_csv('path')
## df = pd.read_csv('path', header = None)
## df = pd.read_csv('path', delim_whitespace = True)

## missing value
#df = df[index].dropna(axis = 0, how = 'any')

## 分割answer and data
## x = df.drop(index, axis = 1)
## axis = 1 代表直列，column
## axis = 0 代表橫列，row
# y = df[index]
# x = df.drop(index, axis = 1)

### Poly（視情況使用）
### 產生degree 為 2 的feature
## poly = PolynomialFeatures(degree = 2).fit(x)
## x = poly.transform(x)

## Split data
## test_size通常小於0.5
## random_state = 1：固定亂數種子，讓每次切割結果相同（資料仍會 shuffle），debug可以用
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1)


## Normalization
# scaler = preprocessing.StandardScaler().fit(x_train)
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)

## Model Select

### Linear Regression 
## model = linear_model.LinearRegression()
## model = LogisticRegression()

### KNN
## model = neighbors.KNeighborsClassifier()
## model = neighbors.KNeighborsClassifier(n_neighbors=3)

### Decision Tree
## model = DecisionTreeClassifier(max_depth=3)
## model = DecisionTreeClassifier()

### Random Forest
## model = RandomForestClassifier(max_depth=7, random_state=0)

### SVC
## model = SVC(kernel='rbf')

### Naive Bayes
## model = GaussianNB()

## Fit
# model.fit(x_train, y_train)

## Predict
# y_pred = model.predict(x_test)

### Metric
## print('Coefficient : {}'.format(model.coef_))

## print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))

## print('Variance score : {}'.format(r2_score(y_test, y_pred)))

## print('Intercept of Model : {}'.format(model.intercept_))

## print('準確率 : {}'.format(model.score(x_test, y_test)))

## print('accuracy : {}'.format(accuracy_score(y_test, y_pred)))

## print('number of correct sample : {}'.format(accuracy_score(y_test, y_pred, normalize = False)))

## print('con_matrix: {}'.format(confusion_matrix(y_test, y_pred)))


## 匯出CSV
# submission = pd.DataFrame({
#         "PassengerId": test_df["PassengerId"],
#         "Survived": Y_pred
#     })
    
# submission.to_csv('submission.csv', index=False)

Unsupervised

































































%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, neighbors
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

### 匯入檔案
## df = pd.read_csv('path')
## df = pd.read_csv('path', header = None)
## df = pd.read_csv('path', delim_whitespace = True)

## missing value
#df = df[index].dropna(axis = 0, how = 'any')

### 合併資料
## f1 = data['V1'].values
## f2 = data['V2'].values
## X = np.array(list(zip(f1, f2)))

### Model select

## model = KMeans(n_clusters=3).fit(x)
## model = DBSCAN(eps=0.3, min_samples=5).fit(x)
## model = mixture.GaussianMixture(n_components=3).fit(X)

## Predict
# labels = model.predict(x)
# labels = model.labels_
# X_pred = model.predict(x)



#### Metric
### Kmeans
## centroids = model.cluster_centers_

## print('centroids: {}'.format(centroids))
## print('prediction on each data: {}'.format(labels))

## labels = model.predict(np.array([[12.0,14.0]]))
## print('prediction on data point (12.0, 14.0): {}'.format(labels))

### DBSCAN
### Note that -1 are noisy points
## print('cluster on X {}'.format(labels))

### Number of clusters in labels, ignoring noise if present.
## n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
## print('number of clusters: {}'.format(n_clusters))

## 匯出CSV
# submission = pd.DataFrame({
#         "PassengerId": test_df["PassengerId"],
#         "Survived": Y_pred
#     })
    
# submission.to_csv('submission.csv', index=False)

Scikit-learn

tags: 數據分析 機器學習

資料科學競賽平台

Kaggle

天池

工具使用步驟

模板

Import

匯入檔案

missing value

分割answer跟data

Poly特有，產feature

Split data

Normalization

Model Select

Predict

Code

Linear Regression

Polynomial Regression

Logistic Regression

Template

Supervised

Unsupervised

Read more

機器學習

學習資源

資源大禮包

學習程式推薦資源

tags: `數據分析` `機器學習`