# Scikit-learn
###### tags: `Data Analysis` `Machine Learning`
## Data Science Competition Platforms
### Kaggle
* Hosts a large collection of public datasets
* Runs many online competitions you can enter
### Tianchi (天池)
* Alibaba's data science competition platform
## Typical Workflow
1. Read the data with pandas
2. Process the data with numpy (a minimal sketch follows below)
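A minimal sketch of that pandas-to-numpy flow; the file name `data.csv` and the `label` column are placeholders, not names from a real dataset:

```python=
import numpy as np
import pandas as pd

# 1. read the data with pandas ('data.csv' is a placeholder path)
df = pd.read_csv('data.csv')
# 2. hand the values to numpy for further processing
x = df.drop('label', axis=1).to_numpy()  # 'label' is a placeholder column name
y = df['label'].to_numpy()
print(x.shape, y.shape)
```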
## Template (Step by Step)
### Import
```python=
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, neighbors
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
```
### Load the Data
```python=
# mind the file format:
# if the file is not a standard CSV, use delim_whitespace=True
# if there is no header row, use header=None
# df = pd.read_csv('path', header = None, delim_whitespace=True)
df = pd.read_csv('path')
```
### Missing Values
```python=
# drop every row that contains a missing value
df = df.dropna(axis = 0, how = 'any')
```
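Dropping rows throws data away. A common alternative, sketched here under the assumption that the affected columns are numeric, is to fill missing values with the column mean:

```python=
# fill numeric missing values with each column's mean instead of dropping rows
df = df.fillna(df.mean(numeric_only=True))
```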
### Split Label and Features
```python=
# x = df.drop(index, axis = 1)
# axis = 1 refers to columns
# axis = 0 refers to rows
# in the housing example, column 13 holds the label
y = df[13]
x = df.drop(13, axis = 1)
```
### Polynomial Features (Polynomial Regression Only)
```python=
# generate all polynomial features up to degree 2
poly = PolynomialFeatures(degree = 2).fit(x)
x = poly.transform(x)
```
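A worked example of what the degree-2 expansion produces: each sample `[a, b]` becomes `[1, a, b, a², ab, b²]`:

```python=
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

demo = np.array([[2, 3], [1, 4]])
print(PolynomialFeatures(degree=2).fit_transform(demo))
# [[ 1.  2.  3.  4.  6.  9.]
#  [ 1.  1.  4.  1.  4. 16.]]
```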
### Split data
```python=
# test_size is usually below 0.5
# a fixed random_state (e.g. 1) makes the shuffled split reproducible, which helps debugging
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1)
```
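For classification tasks, passing `stratify=y` keeps the class proportions identical in both splits (a sketch, assuming `y` holds the class labels):

```python=
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size = 0.1, random_state = 1, stratify = y)
```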
### Normalization
```python=
# StandardScaler rescales each feature to zero mean and unit variance;
# fit on the training set only so no test-set statistics leak in
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
```
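If a fixed [0, 1] range is preferred over zero mean and unit variance, `MinMaxScaler` follows the same fit/transform pattern (a sketch):

```python=
scaler = preprocessing.MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
```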
### Model Select
```python=
# pick one model: LinearRegression for regression, LogisticRegression for classification
model = linear_model.LinearRegression()
# model = LogisticRegression()
model.fit(x_train, y_train)
```
### Predict
```python=
# Linear
y_pred = model.predict(x_test)
print('Coefficient : {}'.format(model.coef_))
print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))
print('Variance score : {}'.format(r2_score(y_test, y_pred)))
# Logistic
print(model.coef_)
print(model.intercept_)
y_pred = model.predict(x_test)
print(y_pred)
accuracy = model.score(x_test, y_test)
print(accuracy)
```
## Code
### Linear Regression
```python=
from sklearn import preprocessing, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# load the data
df = pd.read_csv('./dataset/housing.csv', header = None, delim_whitespace=True)
# separate the label from the features
y = df[13]
x = df.drop(13, axis = 1)
# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1)
# Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Model Select
model = linear_model.LinearRegression()
model.fit(x_train, y_train)
# Predict
y_pred = model.predict(x_test)
print('Coefficient : {}'.format(model.coef_))
print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))
print('Variance score : {}'.format(r2_score(y_test, y_pred)))
```
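A single 10% test split gives a noisy score. A sketch of 5-fold cross-validation on the same data, using a pipeline so the scaler is re-fit inside every fold:

```python=
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

pipe = make_pipeline(preprocessing.StandardScaler(), linear_model.LinearRegression())
scores = cross_val_score(pipe, x, y, cv=5, scoring='r2')
print('R2 per fold : {}'.format(scores))
print('Mean R2     : {}'.format(scores.mean()))
```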
### Polynomial Regression
```python=
import pandas as pd
from sklearn import linear_model, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
# load the data
df = pd.read_csv('./dataset/winequality-red.csv')
# separate the label from the features
y = df['quality']
x = df.drop('quality', axis = 1)
# generate all polynomial features up to degree 2
poly = PolynomialFeatures(degree = 2).fit(x)
x = poly.transform(x)
# Split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
# Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Select model
model = linear_model.LinearRegression()
model.fit(x_train, y_train)
# Predict
y_pred = model.predict(x_test)
# inspect the coefficients
print('The coefficient : {}\n'.format(model.coef_))
print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))
print('Variance score : {}'.format(r2_score(y_test, y_pred)))
```
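Higher degrees fit the training set better but can overfit. A quick sketch (assuming the same winequality-red.csv layout as above) that compares train and test R² across degrees:

```python=
raw = df.drop('quality', axis = 1)
for degree in (1, 2, 3):
    x_poly = PolynomialFeatures(degree=degree).fit_transform(raw)
    x_tr, x_te, y_tr, y_te = train_test_split(x_poly, y, test_size=0.3, random_state=1)
    sc = preprocessing.StandardScaler().fit(x_tr)
    m = linear_model.LinearRegression().fit(sc.transform(x_tr), y_tr)
    # train R2 vs. test R2: a widening gap signals overfitting
    print(degree, m.score(sc.transform(x_tr), y_tr), m.score(sc.transform(x_te), y_te))
```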
### Logistic Regression
```python=
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
pima = pd.read_csv('./dataset/pima-indians-diabetes.csv')
#x = pima[['pregnant', 'insulin', 'bmi', 'age']]
y = pima['label']
x = pima.drop(['label'], axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)
print(model.coef_)
print(model.intercept_)
y_pred = model.predict(x_test)
print(y_pred)
accuracy = model.score(x_test, y_test)
print(accuracy)
```
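Accuracy alone hides which classes the model confuses. A sketch of richer metrics for the model above, plus `predict_proba` for class probabilities:

```python=
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# probability of each class instead of a hard label
print(model.predict_proba(x_test)[:5])
```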
## Complete Templates
### Supervised
```python=
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, neighbors
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
### Load the data
## df = pd.read_csv('path')
## df = pd.read_csv('path', header = None)
## df = pd.read_csv('path', delim_whitespace = True)
## Missing values
# df = df.dropna(axis = 0, how = 'any')
## Split label and features
## x = df.drop(index, axis = 1)
## axis = 1 refers to columns
## axis = 0 refers to rows
# y = df[index]
# x = df.drop(index, axis = 1)
### Polynomial features (optional)
### generate all polynomial features up to degree 2
## poly = PolynomialFeatures(degree = 2).fit(x)
## x = poly.transform(x)
## Split data
## test_size is usually below 0.5
## a fixed random_state (e.g. 1) makes the shuffled split reproducible, which helps debugging
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1)
## Normalization
# scaler = preprocessing.StandardScaler().fit(x_train)
# x_train = scaler.transform(x_train)
# x_test = scaler.transform(x_test)
## Model Select
### Linear / Logistic Regression
## model = linear_model.LinearRegression()
## model = LogisticRegression()
### KNN
## model = neighbors.KNeighborsClassifier()
## model = neighbors.KNeighborsClassifier(n_neighbors=3)
### Decision Tree
## model = DecisionTreeClassifier(max_depth=3)
## model = DecisionTreeClassifier()
### Random Forest
## model = RandomForestClassifier(max_depth=7, random_state=0)
### SVC
## model = SVC(kernel='rbf')
### Naive Bayes
## model = GaussianNB()
## Fit
# model.fit(x_train, y_train)
## Predict
# y_pred = model.predict(x_test)
### Metric
## print('Coefficient : {}'.format(model.coef_))
## print('Mean squared error : {}'.format(mean_squared_error(y_test, y_pred)))
## print('Variance score : {}'.format(r2_score(y_test, y_pred)))
## print('Intercept of Model : {}'.format(model.intercept_))
## print('mean accuracy : {}'.format(model.score(x_test, y_test)))
## print('accuracy : {}'.format(accuracy_score(y_test, y_pred)))
## print('number of correct sample : {}'.format(accuracy_score(y_test, y_pred, normalize = False)))
## print('con_matrix: {}'.format(confusion_matrix(y_test, y_pred)))
## Export a submission CSV
# submission = pd.DataFrame({
# "PassengerId": test_df["PassengerId"],
# "Survived": Y_pred
# })
# submission.to_csv('submission.csv', index=False)
```
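A minimal concrete instantiation of the template above, using KNN on a synthetic dataset (`make_classification` stands in for a real CSV here):

```python=
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, neighbors

x, y = make_classification(n_samples=200, n_features=4, random_state=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
scaler = preprocessing.StandardScaler().fit(x_train)
model = neighbors.KNeighborsClassifier(n_neighbors=3)
model.fit(scaler.transform(x_train), y_train)
print('accuracy : {}'.format(model.score(scaler.transform(x_test), y_test)))
```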
### Unsupervised
```python=
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model, neighbors
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans, DBSCAN
from sklearn import mixture
### Load the data
## df = pd.read_csv('path')
## df = pd.read_csv('path', header = None)
## df = pd.read_csv('path', delim_whitespace = True)
## Missing values
# df = df.dropna(axis = 0, how = 'any')
### Combine columns into a feature matrix
## f1 = data['V1'].values
## f2 = data['V2'].values
## X = np.array(list(zip(f1, f2)))
### Model select
## model = KMeans(n_clusters=3).fit(x)
## model = DBSCAN(eps=0.3, min_samples=5).fit(x)
## model = mixture.GaussianMixture(n_components=3).fit(X)
## Predict
## KMeans / GaussianMixture expose predict(); DBSCAN only stores labels_
# labels = model.predict(x)
# labels = model.labels_
## Metric
### KMeans
## centroids = model.cluster_centers_
## print('centroids: {}'.format(centroids))
## print('prediction for each data point: {}'.format(labels))
## labels = model.predict(np.array([[12.0, 14.0]]))
## print('prediction on data point (12.0, 14.0): {}'.format(labels))
### DBSCAN
### note: label -1 marks noise points
## print('cluster on X {}'.format(labels))
### Number of clusters in labels, ignoring noise if present.
## n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
## print('number of clusters: {}'.format(n_clusters))
## Export a submission CSV
# submission = pd.DataFrame({
# "PassengerId": test_df["PassengerId"],
# "Survived": Y_pred
# })
# submission.to_csv('submission.csv', index=False)
```
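KMeans needs `n_clusters` up front. A sketch of the elbow method: plot the inertia (within-cluster sum of squares) against k and look for the bend (`make_blobs` stands in for real data):

```python=
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

x, _ = make_blobs(n_samples=300, centers=3, random_state=1)
inertias = [KMeans(n_clusters=k, n_init=10, random_state=1).fit(x).inertia_
            for k in range(1, 8)]
plt.plot(range(1, 8), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()
```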