# 3. 模型使用

## 1. 處理資料

* Impute
* 刪除資料

```
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.impute import KNNImputer
from matplotlib import pyplot
import seaborn as sns
```

```
data = pd.read_csv("Data.csv")
data.head()
```

刪除不需要欄位

```
# Drop the first (unnamed index) column left over from the CSV export.
data = data.drop(data.columns[0], axis=1)
data.head()
```

保留數值資料

```
All_num = data.select_dtypes(include=['float64', 'int'])
All_num.head()
```

尋找缺值數據

```
All_num.lactate_max[All_num.lactate_max.isna()]
```

只保留有數值之數據

```
All_shape = All_num
for n in All_shape.columns[1:]:
    # Keep only rows where column n is present.
    # FIX: use ~ for boolean negation -- unary minus on a boolean Series
    # raises TypeError in modern pandas.
    All_shape = All_shape[~All_shape[n].isna()]
All_shape.shape
print(All_shape.shape)
len(All_shape[All_shape.mortality_90d == 0])
```

去除導致樣本過少欄位

```
# leavetime has so many missing values that listwise deletion would keep
# too few samples -- drop the column before filtering rows.
All_shape = All_num.drop(["leavetime"], axis=1)
for n in All_shape.columns:
    All_shape = All_shape[~All_shape[n].isna()]
All_shape.shape
print(All_shape.shape)
len(All_shape[All_shape.mortality_90d == 0])
```

```
All_shape.lactate_max[All_shape.lactate_max.isna()]
```

KNNImpute

```
## based on KNN (k-Nearest Neighbors)
Imputer = KNNImputer(n_neighbors=2)
# Impute features only -- identifiers and targets are excluded and
# re-attached afterwards.
imput_data = All_num.drop(['icustay_id', 'mortality_90d', 'age_score', 'leavetime', 'los'], axis=1)
imput_data_name = imput_data.columns
imput_data = Imputer.fit_transform(imput_data)
imput_data = pd.DataFrame(imput_data)
imput_data.columns = imput_data_name
imput_data['mortality_90d'] = data['mortality_90d']
imput_data['los'] = data['los']
imput_data.head()
```

## 2.
Feature selection T檢定

```
# Two-sample t-test (death vs. survival) for every numeric variable.
p = []
for variable in All_num.columns[1:]:
    print(variable)
    # FIX: select the group first, then drop NaN. Indexing a dropna()'d
    # Series with a full-length boolean mask is unalignable in modern pandas.
    dead = All_num.loc[All_num.mortality_90d == 1, variable].dropna()
    alive = All_num.loc[All_num.mortality_90d == 0, variable].dropna()
    print("Death :", len(dead), "\n", "Live :", len(alive))
    # Run the test once and reuse the result instead of recomputing it.
    result = scipy.stats.ttest_ind(dead, alive)
    print(result)
    p.append(result[1])
```

```
# FIX: plt was used here before `import matplotlib.pyplot as plt`
# appeared (it was only imported in a later cell).
import matplotlib.pyplot as plt

plt.figure(figsize=(5, 5))
sns.boxplot(data=All_num, x="mortality_90d", y="los").set_title("los \n p value : 6.112005241921071e-06")
```

數據對於檢定沒有幫助

```
All_num['age_score']
All_num['age_score'].unique()
```

相似性分析

```
All_cor = All_num.drop(['icustay_id', 'age_score'], axis=1).corr()
All_cor
```

與leavetime相關性

```
dict(All_cor.leavetime)
```

Heatmap

```
top_corr_features = All_cor.index
plt.figure(figsize=(20, 20))
# plot heat map
# FIX: All_cor is already the correlation matrix -- plot it directly
# instead of taking the correlation of the correlation matrix.
g = sns.heatmap(All_cor.loc[top_corr_features, top_corr_features], annot=True, cmap="RdYlGn")
```

分類樹特徵篩選
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

```
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

model = ExtraTreesClassifier()
model.fit(All_shape.drop(["mortality_90d"], axis=1), All_shape.mortality_90d)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_,
                             index=All_shape.drop(["mortality_90d"], axis=1).columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()
```

XGB特徵篩選
https://xgboost.readthedocs.io/en/latest/python/python_api.html

```
from xgboost import XGBClassifier
from xgboost import plot_importance

model = XGBClassifier()
model.fit(All_shape.drop(["mortality_90d"], axis=1), All_shape.mortality_90d)
# feature importance
print(model.feature_importances_)
# plot
plot_importance(model)
pyplot.show()
```

去除樣本

```
## Inspect the sample distribution with PCA.
import seaborn as sns
from sklearn.decomposition import PCA

pca_data = imput_data.drop(["los"], axis=1)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"], axis=1))
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=pca_data.mortality_90d)
```

給予資料 index 才可能篩選

```
np.where(pca[:, 0] == max(pca[:, 0]))
imput_data.index = [n for n in range(imput_data.shape[0])]
imput_data.index
```

去除樣本

```
## Re-run PCA after dropping the extreme outlier (row 10224, located by
## the np.where above -- hard-coded for this data set).
import seaborn as sns
from sklearn.decomposition import PCA

pca_data = imput_data.drop(["los"], axis=1)
pca_data = pca_data.drop(10224, axis=0)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"], axis=1))
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=pca_data.mortality_90d)
```

## 3. 模型的建立

常用套件

```
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
```

ROC圖形繪製

```
def ROC(y_test, y_score):
    """Plot the ROC curve for `y_score` against `y_test` and show the AUC."""
    ## compute the false-positive and true-positive rates
    fpr, tpr, threshold = roc_curve(y_test, y_score)
    ## compute the AUC
    roc_auc = auc(fpr, tpr)
    lw = 2
    # FIX: the original called plt.figure() and then plt.figure(figsize=...),
    # leaving a stray empty figure; one figure is enough.
    plt.figure(figsize=(10, 10))
    ## FPR on the x axis, TPR on the y axis
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
```

Support vector machine
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC

```
def svm(data):
    """Train a standardized SVM on `data` (features + mortality_90d).

    Plots the ROC curve and returns (accuracy, predictions, true test labels).
    """
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test
```

XGBoost

```
def xgboost(data):
    """Train an XGBoost classifier; plots ROC, returns (accuracy, predictions, test labels)."""
    import xgboost as xgb
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.2, random_state=42)
    xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    xgb_model.fit(X_train, y_train)
    out = xgb_model.predict(X_test)
    y_pred_rt = xgb_model.predict_proba(X_test)[:, 1]
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test
```

分類樹
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=randomforestclassifier#sklearn.ensemble.RandomForestClassifier

```
def forest(data):
    """Train a random forest; plots ROC, returns (accuracy, predictions, test labels)."""
    from sklearn.ensemble import RandomForestClassifier
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    out = model.predict(X_test)
    y_pred_rt = model.predict_proba(X_test)[:, 1]
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test
```

Learning curve

```
def learn_curve(history):
    """Plot training/validation loss and accuracy from a Keras History object."""
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    # plot accuracy during training
    plt.subplot(212)
    plt.title('Accuracy')
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.show()
```

類神經網絡建立

```
def DNN(data):
    """Train a small 32-8-2 dense network; returns (train accuracy, test accuracy)."""
    import keras
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.utils import to_categorical
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.2, random_state=42)
    # One-hot encode the binary label for the 2-unit softmax output.
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    input_shape = (X_train.shape[1],)
    # Create the model
    model = Sequential()
    model.add(Dense(32, input_shape=input_shape, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=10, batch_size=5, verbose=1, validation_split=0.2)
    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)
    learn_curve(history)
    return train_acc, test_acc
```

confusion matrix

```
def confusion(pre, label):
    """Draw a confusion matrix of predictions `pre` against true labels `label`."""
    confmat = confusion_matrix(y_true=label, y_pred=pre)
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.show()
```

線性迴歸模型

```
def LSR(data):
    """Linear regression predicting `los`; returns (residuals, mean squared error)."""
    # example of training a final regression model
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d", "los"], axis=1), data.los,
        test_size=0.2, random_state=42)
    # fit final model
    model = LinearRegression()
    model.fit(np.array(X_train), y_train)
    out = model.predict(np.array(X_test))
    return y_test - out, mean_squared_error(y_test, out)
```

## 4. 使用模型

SVM

```
acc, pre, label = svm(imput_data)
acc
confusion(pre, label)
```

XGBoost

```
acc, pre, label = xgboost(imput_data)
acc
confusion(pre, label)
```

Random forest

```
# FIX: the return value was not captured, so acc/pre/label still held the
# XGBoost results from the previous cell.
acc, pre, label = forest(imput_data)
acc
confusion(pre, label)
```

類神經網路

```
DNN(imput_data)
```

## 5.
分析步驟組裝

### 5.1 去除缺值之分析

```
All_shape
```

```
def svm(data):
    """SVM on the listwise-deleted data; returns (accuracy, predictions, test labels)."""
    # FIX: these imports were missing in this redefinition -- the names were
    # only ever imported *inside* the earlier svm(), so they are not in
    # scope here and a fresh run would raise NameError.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.2, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(All_shape.drop(["icustay_id"], axis=1))
print(acc)
confusion(pre, label)
```

### 5.2 使用KNNimpute+ExtraTreesClassifier

```
imput_data.head()
```

```
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

model = ExtraTreesClassifier()
model.fit(imput_data.drop(["mortality_90d"], axis=1), imput_data.mortality_90d)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_,
                             index=imput_data.drop(["mortality_90d"], axis=1).columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()
```

```
# Keep only features with importance > 0.01; .copy() avoids pandas'
# SettingWithCopyWarning when adding the label column below.
imput_X = imput_data[imput_data.drop(["mortality_90d"], axis=1).columns[model.feature_importances_ > 0.01]].copy()
imput_X["mortality_90d"] = imput_data["mortality_90d"]
```

```
def svm(data):
    """SVM on the importance-filtered features; returns (accuracy, predictions, test labels)."""
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(imput_X)
print(acc)
confusion(pre, label)
```

### 5.3 使用KNNimpute+PCA去除樣本

```
imput_data.index = [n for n in range(imput_data.shape[0])]
imput_data.index

## Inspect the sample distribution.
import seaborn as sns
from sklearn.decomposition import PCA

pca_data = imput_data.drop(["los"], axis=1)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"], axis=1))
# FIX: "pring" -> "print" (NameError in the original).
print(len([n for n in np.where(pca[:, 0] > 10)[0]]))
# Drop every sample whose first principal component exceeds 10, then redo PCA.
pca_data = pca_data.drop([n for n in np.where(pca[:, 0] > 10)[0]], axis=0)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"], axis=1))
sns.scatterplot(x=pca[:, 0], y=pca[:, 1], hue=pca_data.mortality_90d)
```

```
def svm(data):
    """SVM on the PCA-outlier-filtered data; returns (accuracy, predictions, test labels)."""
    # FIX: imports were missing in this redefinition (see 5.1).
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(pca_data)
print(acc)
confusion(pre, label)
```

### 5.4 使用KNNimpute+XGBoost特徵篩選

```
from xgboost import XGBClassifier
from xgboost import plot_importance

model = XGBClassifier()
model.fit(imput_data.drop(["mortality_90d"], axis=1), imput_data.mortality_90d)
# feature importance
print(model.feature_importances_)
# plot
plot_importance(model)
pyplot.show()
```

篩選重要性高的資料

```
# .copy() avoids SettingWithCopyWarning when adding the label column.
imput_G = imput_data[imput_data.drop(["mortality_90d"], axis=1).columns[model.feature_importances_ > 0.01]].copy()
imput_G["mortality_90d"] = imput_data["mortality_90d"]
```

```
def svm(data):
    """SVM on the XGBoost-importance-filtered features; returns (accuracy, predictions, test labels)."""
    # FIX: imports were missing in this redefinition (see 5.1).
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(imput_G)
print(acc)
confusion(pre, label)
```

### 5.5 使用KNNimpute+T檢定特徵篩選

```
## Compute a p-value per variable between the two outcome classes.
p = []
label = imput_data.mortality_90d
imput_T = imput_data.drop(['mortality_90d'], axis=1)
for variable in imput_T:
    p.append(scipy.stats.ttest_ind(imput_T[variable][label == 0],
                                   imput_T[variable][label == 1]))
p
```
取出p值

```
pvalue = [n[1] for n in p]
```

製作訓練表

```
# Keep variables significant at p < 0.05; .copy() avoids pandas'
# SettingWithCopyWarning when adding the label column.
imput_P = imput_T[imput_T.columns[np.array(pvalue) < 0.05]].copy()
imput_P['mortality_90d'] = label
```

```
def svm(data):
    """SVM on the t-test-filtered features; returns (accuracy, predictions, test labels)."""
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(imput_P)
print(acc)
confusion(pre, label)
```

### 5.6 DNN

```
def DNN(data):
    """Train a smaller 16-8-2 dense network; returns (train accuracy, test accuracy)."""
    # FIX: these imports were missing in this redefinition -- the names were
    # only local to the earlier DNN() definition, so a fresh run would
    # raise NameError.
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.utils import to_categorical
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d"], axis=1), data.mortality_90d,
        test_size=0.1, random_state=42)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    input_shape = (X_train.shape[1],)
    # Create the model
    model = Sequential()
    model.add(Dense(16, input_shape=input_shape, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=10, batch_size=1, verbose=1, validation_split=0.2)
    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)
    learn_curve(history)
    return train_acc, test_acc
```

```
DNN(imput_data)
```

### 5.7 預測在ICU時間

```
error, mse = LSR(All_shape.drop('icustay_id', axis=1))
```

```
print(mse)
sns.boxplot(error)
```

### 5.8 預測時間,以相關性篩選

```
All_shape_c = All_shape.drop('icustay_id', axis=1)
All_cor = All_shape_c.corr()
All_cor
```

```
abs(All_cor.los).describe()
```

```
# Keep columns whose |correlation with los| exceeds the 75th percentile.
# FIX: use the '75%' label instead of describe()[6] -- integer fallback
# indexing on a labelled Series is deprecated/removed in modern pandas.
# NOTE(review): if mortality_90d does not pass this filter, LSR() below
# will fail on drop(["mortality_90d", "los"]) -- verify both columns survive.
All_shape_C = All_shape_c[All_shape_c.columns[abs(All_cor.los) > abs(All_cor.los).describe()['75%']]]
```

```
error, mse = LSR(All_shape_C)
mse
sns.boxplot(error)
```

### 5.9
Autoencoder

```
def AE(data):
    """Train a 40-20-4-20-40 autoencoder on the feature matrix.

    Returns (encoder model, X_train) so the 4-unit bottleneck activations
    can be extracted afterwards.
    """
    import keras
    from keras.models import Model
    from keras.layers import Dense, Input
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(["mortality_90d", "los"], axis=1), data.los,
        test_size=0.2, random_state=42)
    # Create the model
    input_window = Input(shape=(X_train.shape[1],))
    x = Dense(40, activation='relu')(input_window)
    x = Dense(20, activation='relu')(x)
    encoded = Dense(4, activation='relu', name="b")(x)
    # Encoder sub-model: input -> 4-unit bottleneck.
    encoder = Model(input_window, encoded)
    x = Dense(20, activation='relu')(encoded)
    x = Dense(40, activation='relu')(x)
    decoded = Dense(X_train.shape[1], activation='relu')(x)
    autoencoder = Model(input_window, decoded)
    autoencoder.summary()
    opt = keras.optimizers.Adam(learning_rate=0.01)
    autoencoder.compile(loss='mean_squared_error', optimizer=opt, metrics=['MeanSquaredError'])
    # FIX: an autoencoder is trained to reconstruct its *input*, so the
    # target must be X_train, not y_train (los) -- the decoder outputs
    # X_train.shape[1] values per sample, not a single scalar.
    history = autoencoder.fit(X_train, X_train, epochs=10, batch_size=1, verbose=1, validation_split=0.2)
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    return encoder, X_train
```

```
ae_model, X_train = AE(All_shape.drop('icustay_id', axis=1))
```

```
## Extract the bottleneck values.
ae_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['MeanSquaredError'])
result = ae_model.predict(X_train)
result
```

https://keras.io/api/