### Loading a local dataset

```
import pandas as pd

iris = pd.read_csv("iris.csv")  # path to the CSV file
print(iris)
```

#### Analysing the iris dataset with an SVM

```
import numpy as np
import matplotlib.pyplot as plt

# sklearn ships real-world measurements of iris sepal and petal length/width
from sklearn.datasets import load_iris

iris = load_iris()
# Print this to inspect the Iris Plants Database description
# print(iris.DESCR)

# X holds the raw sepal/petal length and width measurements
# Y holds the correct class label for each flower
X = iris.data
Y = iris.target

# Each row is: sepal length, sepal width, petal length, petal width
# Slicing from column 2 onward keeps only petal length and petal width
X = X[:, 2:]

# For machine learning we need to separate training data from test data
from sklearn.model_selection import train_test_split

# train_test_split splits the data into x_train, x_test, y_train and y_test
# The test set is 20% of the data, hence test_size=0.2
# random_state can be any arbitrary number
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=87)

# Petal length: x_train[:, 0]
# Petal width:  x_train[:, 1]
# c = the array of class labels
# The plot shows the petals in three different colours
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train)

# Import SVC, the classifier from the SVM (support vector machine) module
from sklearn.svm import SVC

# Create an SVC instance
clf = SVC()

# Train it: clf.fit(input data, correct answers)
clf.fit(x_train, y_train)

# Feed in the test data and look at the predictions
y_test_predict = clf.predict(x_test)

# Plot the predictions for the test data
# Judging from the plot, the predictions look quite good
plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test_predict)

# A more precise way to see how many points are misclassified: subtract the true labels,
# c = y_test_predict - y_test. Correct predictions give 0; wrong ones give a nonzero value.
# Here only one point is misclassified (its colour differs from all the other points)
plt.scatter(x_test[:, 0], x_test[:, 1], c=y_test_predict - y_test)
```

### Running the same analysis from a locally parsed CSV file

```
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

df = pd.read_csv('iris.csv')
X = df.iloc[:, [3, 4]]  # columns 3 and 4, i.e. 'PetalLengthCm' and 'PetalWidthCm'
y = df.iloc[:, 5]       # the label column, i.e. 'Species'

# Encode the string species names as integers 0/1/2
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

clf = SVC()
clf.fit(x_train, y_train)
y_test_predict = clf.predict(x_test)
print(X)

colormap = np.array(['y', 'g', 'r'])
plt.scatter(x_train['PetalLengthCm'], x_train['PetalWidthCm'], c=y_train)
plt.scatter(x_test['PetalLengthCm'], x_test['PetalWidthCm'], c=y_test_predict)
# y_test_predict - y_test is 0 where the prediction is correct and nonzero where it is not,
# so misclassified points show up in a different colour
plt.scatter(x_test['PetalLengthCm'], x_test['PetalWidthCm'], c=colormap[y_test_predict - y_test])
```

### Using to_numpy so the final plots can index with x_train[:, 0]

```
df = pd.read_csv('iris.csv')
new = df[['PetalLengthCm', 'PetalWidthCm']].to_numpy()
X = new  # data
# Encode the species strings as integers so they can be used as scatter colours
y = preprocessing.LabelEncoder().fit_transform(df['Species'])  # target

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

clf = SVC(kernel='linear', C=1, gamma='auto')
clf.fit(x_train, y_train)
y_test_predict = clf.predict(x_test)
print(x_train[:, 0])

colormap = np.array(['g', 'r'])
plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train)
```

### Plotting the decision boundary

https://pyecontech.com/2020/04/11/python_svm/
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
https://ccliao.github.io/2017/06/24/python-svm/

```
def plot_estimator(estimator, X, y, title):
    # The +/- 1 only adds some margin around the points when plotting
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # meshgrid builds coordinate matrices from the coordinate vectors
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])  # 1-D, 3476 predictions here
    # ravel() returns a contiguous flattened array
    # np.c_ concatenates the two flattened arrays column-wise into (x, y) pairs
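    # For illustration: np.c_[np.array([1, 2]), np.array([3, 4])] gives [[1, 3], [2, 4]],
    # so each row of np.c_[xx.ravel(), yy.ravel()] is one grid point fed to predict()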
    Z = Z.reshape(xx.shape)  # back to 2-D: shape (44, 79) here, 44 * 79 = 3476
    # A contour can be explained simply as a curve joining all the continuous points
    # (along the boundary) having the same colour or intensity
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)  # cmap = colormap
    # alpha is the transparency, between 0 and 1; smaller means more transparent
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.brg)  # c = colour
    plt.title(title)
    plt.xlabel('Petal.Length')
    plt.ylabel('Petal.Width')
    plt.show()


df = pd.read_csv('iris.csv')
new = df[['PetalLengthCm', 'PetalWidthCm']].to_numpy()
X = new
# Encode the species strings as integers so contourf and scatter get numeric values
y = preprocessing.LabelEncoder().fit_transform(df['Species'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

clf = SVC(kernel='linear', C=1, gamma='auto')
clf.fit(x_train, y_train)
y_test_predict = clf.predict(x_test)
print(x_train[:, 0])

colormap = np.array(['g', 'r'])
plot_estimator(clf, X, y, "svm")
```
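
### Scoring the classifier numerically

The scatter plots above only let us count misclassified points by eye. A minimal sketch, assuming the same Kaggle-style `iris.csv` (with `PetalLengthCm`, `PetalWidthCm` and `Species` columns) used above: score the same petal-only SVC with `accuracy_score` and a confusion matrix, and compare a few kernels on the same 20% test split.

```
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

df = pd.read_csv('iris.csv')  # assumes the same Kaggle-style iris.csv as above
X = df[['PetalLengthCm', 'PetalWidthCm']].to_numpy()
y = preprocessing.LabelEncoder().fit_transform(df['Species'])

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

# Train one SVC per kernel and report its test accuracy and confusion matrix
for kernel in ('linear', 'rbf', 'poly'):
    clf = SVC(kernel=kernel, C=1, gamma='auto')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(kernel, 'accuracy:', accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
```

Each of these fitted classifiers can also be passed to the `plot_estimator` function above to compare their decision regions visually.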