# 3. Using the Models
## 1. Data Preprocessing
* Impute missing values
* Remove rows/columns with missing data
```
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns
```
```
data = pd.read_csv("Data.csv")
data.head()
```
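Before dropping anything, it can help to get an overview of the columns (an optional check, not part of the original flow):
```
## Overview of column dtypes and non-null counts
data.info()
```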
Drop the unneeded first column
```
data = data.drop(data.columns[0],axis=1)
data.head()
```
Keep only the numeric columns
```
All_num = data.select_dtypes(include=['float64','int'])
All_num.head()
```
Find the missing values
```
All_num.lactate_max[All_num.lactate_max.isna()]
```
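Counting missing values across all numeric columns shows which variables drive the row loss in the next step (a small optional sketch):
```
## Missing-value count per numeric column, highest first
All_num.isna().sum().sort_values(ascending=False)
```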
Keep only rows with no missing values
```
All_shape = All_num
for n in All_shape.columns[1:]:
    All_shape = All_shape[~All_shape[n].isna()]
print(All_shape.shape)
len(All_shape[All_shape.mortality_90d==0])
```
Drop the column that causes too many rows to be lost
```
All_shape = All_num.drop(["leavetime"],axis=1)
for n in All_shape.columns:
    All_shape = All_shape[~All_shape[n].isna()]
print(All_shape.shape)
len(All_shape[All_shape.mortality_90d==0])
```
```
All_shape.lactate_max[All_shape.lactate_max.isna()]  ## should now return an empty Series
```
KNNImputer
```
## Impute missing values with KNN (k-Nearest Neighbors)
Imputer = KNNImputer(n_neighbors=2)
## Exclude identifiers and target-related columns from imputation
imput_data = All_num.drop(['icustay_id','mortality_90d','age_score','leavetime','los'],axis=1)
imput_data_name = imput_data.columns
imput_data = Imputer.fit_transform(imput_data)
imput_data = pd.DataFrame(imput_data)
imput_data.columns = imput_data_name
## Reattach the label and length-of-stay columns
imput_data['mortality_90d'] = data['mortality_90d']
imput_data['los'] = data['los']
imput_data.head()
```
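A quick sanity check (a minimal sketch using the `imput_data` frame built above) that no missing values remain after imputation:
```
## Every cell should now be filled; expect 0
print(imput_data.isna().sum().sum())
```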
## 2. Feature selection
T-test
```
p = []
for variable in All_num.columns[1:]:
    print(variable)
    ## Split the variable by outcome, then drop missing values
    dead = All_num[variable][All_num.mortality_90d==1].dropna()
    alive = All_num[variable][All_num.mortality_90d==0].dropna()
    print("Death :", len(dead), "\n", "Live :", len(alive))
    result = scipy.stats.ttest_ind(dead, alive)
    print(result)
    p.append(result.pvalue)
```
```
plt.figure(figsize=(5,5))
sns.boxplot(data=All_num,x="mortality_90d",y="los").set_title("los \n p value : 6.112005241921071e-06")
```
Some columns are not useful for the t-test; `age_score`, for example, takes only a few discrete values
```
All_num['age_score']
All_num['age_score'].unique()
```
Correlation analysis
```
All_cor = All_num.drop(['icustay_id','age_score'],axis=1).corr()
All_cor
```
Correlation with leavetime
```
dict(All_cor.leavetime)
```
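Sorting the same correlations by absolute value makes the strongest relationships easier to spot (an optional sketch over the `All_cor` matrix computed above):
```
## Rank features by |correlation| with leavetime
All_cor.leavetime.abs().sort_values(ascending=False).head(10)
```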
Heatmap
```
plt.figure(figsize=(20,20))
## All_cor is already a correlation matrix, so plot it directly
g = sns.heatmap(All_cor, annot=True, cmap="RdYlGn")
```
Tree-based feature selection
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
```
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(All_shape.drop(["mortality_90d"],axis=1),All_shape.mortality_90d)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=All_shape.drop(["mortality_90d"],axis=1).columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()
```
XGBoost feature selection
https://xgboost.readthedocs.io/en/latest/python/python_api.html
```
from xgboost import XGBClassifier
from xgboost import plot_importance
model = XGBClassifier()
model.fit(All_shape.drop(["mortality_90d"],axis=1),All_shape.mortality_90d)
# feature importance
print(model.feature_importances_)
# plot
plot_importance(model)
plt.show()
```
Removing outlier samples
```
## Inspect the sample distribution
import seaborn as sns
from sklearn.decomposition import PCA
pca_data = imput_data.drop(["los"],axis=1)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"],axis=1))
sns.scatterplot(x=pca[:,0],y=pca[:,1],hue = pca_data.mortality_90d)
```
Give the data a clean integer index so that samples can be filtered by position
```
## Find the index of the most extreme sample on the first principal component
np.where(pca[:,0]==max(pca[:,0]))
imput_data = imput_data.reset_index(drop=True)
imput_data.index
```
Remove the outlier sample and re-run PCA
```
## Inspect the sample distribution after removing the outlier
import seaborn as sns
from sklearn.decomposition import PCA
pca_data = imput_data.drop(["los"],axis=1)
pca_data = pca_data.drop(10224,axis=0)  ## drop the outlier index found above
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"],axis=1))
sns.scatterplot(x=pca[:,0],y=pca[:,1],hue = pca_data.mortality_90d)
```
## 3. Building the Models
Common packages
```
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
```
Plotting the ROC curve
```
def ROC(y_test,y_score):
    ## Compute false-positive and true-positive rates
    fpr, tpr, threshold = roc_curve(y_test, y_score)
    ## Compute the AUC
    roc_auc = auc(fpr, tpr)
    lw = 2
    plt.figure(figsize=(10,10))
    ## Plot FPR on the x-axis against TPR on the y-axis
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
```
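A toy check of the helper before applying it to real predictions (hypothetical random labels and scores, not part of the analysis):
```
## Toy ROC: noisy scores correlated with random 0/1 labels; AUC should be well above 0.5
rng = np.random.RandomState(0)
toy_label = rng.randint(0, 2, 200)
toy_score = toy_label + rng.normal(0, 0.8, 200)
ROC(toy_label, toy_score)
```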
Support vector machine
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html?highlight=svc#sklearn.svm.SVC
```
def svm(data):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    ## decision_function gives a continuous score for the ROC curve
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test
```
XGBoost
```
def xgboost(data):
    import xgboost as xgb
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.2, random_state=42)
    xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    xgb_model.fit(X_train, y_train)
    out = xgb_model.predict(X_test)
    ## predicted probability of the positive class, for the ROC curve
    y_pred_rt = xgb_model.predict_proba(X_test)[:,1]
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test
```
Random forest
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=randomforestclassifier#sklearn.ensemble.RandomForestClassifier
```
def forest(data):
    from sklearn.ensemble import RandomForestClassifier
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    out = model.predict(X_test)
    y_pred_rt = model.predict_proba(X_test)[:,1]
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test
```
Learning curve
```
def learn_curve(history):
    # plot loss during training
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    # plot accuracy during training
    plt.subplot(212)
    plt.title('Accuracy')
    plt.plot(history.history['accuracy'], label='train')
    plt.plot(history.history['val_accuracy'], label='test')
    plt.legend()
    plt.show()
```
Building a neural network
```
def DNN(data):
    import keras
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.utils import to_categorical
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.2, random_state=42)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    input_shape = (X_train.shape[1],)
    # Create the model
    model = Sequential()
    model.add(Dense(32, input_shape=input_shape, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=10, batch_size=5, verbose=1, validation_split=0.2)
    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)
    learn_curve(history)
    return train_acc, test_acc
```
Confusion matrix
```
def confusion(pre,label):
    confmat = confusion_matrix(y_true=label, y_pred=pre)
    fig, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i,j], va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.show()
```
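A toy check of the confusion helper (hypothetical labels, not part of the analysis):
```
## Toy 2x2 confusion matrix: one false positive out of four samples
confusion(pre=[0, 1, 1, 0], label=[0, 1, 0, 0])
```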
Linear regression model
```
def LSR(data):
    # train a final regression model to predict length of stay
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d","los"],axis=1),
                                                        data.los, test_size=0.2, random_state=42)
    # fit final model
    model = LinearRegression()
    model.fit(np.array(X_train), y_train)
    out = model.predict(np.array(X_test))
    return y_test-out, mean_squared_error(y_test, out)
```
## 4. Applying the Models
SVM
```
acc, pre, label = svm(imput_data)
print(acc)
confusion(pre,label)
```
XGBoost
```
acc, pre, label = xgboost(imput_data)
print(acc)
confusion(pre,label)
```
Random forest
```
acc, pre, label = forest(imput_data)
print(acc)
confusion(pre,label)
```
Neural network
```
DNN(imput_data)
```
## 5. Assembling the Analysis Pipeline
### 5.1 Analysis after removing missing values
```
All_shape
```
```
def svm(data):
    ## The earlier imports were local to the previous definition, so they are repeated here
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.2, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(All_shape.drop(["icustay_id"],axis=1))
print(acc)
confusion(pre,label)
```
### 5.2 Using KNNImputer + ExtraTreesClassifier
```
imput_data.head()
```
```
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(imput_data.drop(["mortality_90d"],axis=1),imput_data.mortality_90d)
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=imput_data.drop(["mortality_90d"],axis=1).columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.show()
```
```
## Keep features whose importance exceeds 0.01 (copy to avoid SettingWithCopyWarning)
imput_X = imput_data[imput_data.drop(["mortality_90d"],axis=1).columns[model.feature_importances_>0.01]].copy()
imput_X["mortality_90d"] = imput_data["mortality_90d"]
```
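It can be worth checking how many features survived the 0.01 importance threshold before retraining (a short optional check):
```
## Number of selected features plus the label column
print(imput_X.shape)
print(list(imput_X.columns))
```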
```
def svm(data):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(imput_X)
print(acc)
confusion(pre,label)
```
### 5.3 Using KNNImputer + PCA to remove outlier samples
```
imput_data = imput_data.reset_index(drop=True)
imput_data.index
## Inspect the sample distribution
import seaborn as sns
from sklearn.decomposition import PCA
pca_data = imput_data.drop(["los"],axis=1)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"],axis=1))
## Count and drop samples that are extreme on the first principal component
print(len(np.where(pca[:,0]>10)[0]))
pca_data = pca_data.drop(np.where(pca[:,0]>10)[0], axis=0)
pca = PCA(n_components=10).fit_transform(pca_data.drop(["mortality_90d"],axis=1))
sns.scatterplot(x=pca[:,0], y=pca[:,1], hue=pca_data.mortality_90d)
```
```
def svm(data):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(pca_data)
print(acc)
confusion(pre,label)
```
### 5.4 Using KNNImputer + XGBoost feature selection
```
from xgboost import XGBClassifier
from xgboost import plot_importance
model = XGBClassifier()
model.fit(imput_data.drop(["mortality_90d"],axis=1),imput_data.mortality_90d)
# feature importance
print(model.feature_importances_)
# plot
plot_importance(model)
plt.show()
```
Select the high-importance features
```
imput_G = imput_data[imput_data.drop(["mortality_90d"],axis=1).columns[model.feature_importances_>0.01]].copy()
imput_G["mortality_90d"] = imput_data["mortality_90d"]
```
```
def svm(data):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(imput_G)
print(acc)
confusion(pre,label)
```
### 5.5 Using KNNImputer + t-test feature selection
```
## Compute a p-value for each variable, split by class
p = []
label = imput_data.mortality_90d
imput_T = imput_data.drop(['mortality_90d'],axis=1)
for variable in imput_T:
    p.append(scipy.stats.ttest_ind(imput_T[variable][label==0],
                                   imput_T[variable][label==1]))
p
```
Extract the p-values
```
pvalue = [n[1] for n in p]
```
Build the training table
```
imput_P = imput_T[imput_T.columns[np.array(pvalue) < 0.05]].copy()
imput_P['mortality_90d'] = label
```
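Before training, it may help to see which variables actually passed the 0.05 threshold (a short optional check):
```
## Features whose t-test p-value is below 0.05
print(list(imput_T.columns[np.array(pvalue) < 0.05]))
```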
```
def svm(data):
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.1, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
    clf.fit(X_train, y_train)
    out = clf.predict(X_test)
    y_pred_rt = clf.decision_function(X_test)
    ROC(y_test, y_pred_rt)
    return accuracy_score(y_test, out), out, y_test

acc, pre, label = svm(imput_P)
print(acc)
confusion(pre,label)
```
### 5.6 DNN
```
def DNN(data):
    ## The Keras imports were local to the earlier definition, so they are repeated here
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.utils import to_categorical
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d"],axis=1),
                                                        data.mortality_90d, test_size=0.1, random_state=42)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    input_shape = (X_train.shape[1],)
    # Create the model
    model = Sequential()
    model.add(Dense(16, input_shape=input_shape, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=10, batch_size=1, verbose=1, validation_split=0.2)
    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)
    learn_curve(history)
    return train_acc, test_acc
```
```
DNN(imput_data)
```
### 5.7 Predicting ICU length of stay
```
error, mse = LSR(All_shape.drop('icustay_id',axis=1))
```
```
print(mse)
sns.boxplot(error)
```
### 5.8 Predicting length of stay with correlation-based feature selection
```
All_shape_c = All_shape.drop('icustay_id',axis=1)
All_cor = All_shape_c.corr()
All_cor
```
```
abs(All_cor.los).describe()
```
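`describe()` reports count, mean, std, min, 25%, 50%, 75%, and max; the filter below keeps columns whose absolute correlation with `los` exceeds the 75th percentile. An equivalent, more explicit form (a small sketch):
```
## 75th-percentile threshold on |correlation with los|
threshold = abs(All_cor.los).quantile(0.75)
print(threshold)
```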
```
## Keep columns whose |correlation with los| exceeds the 75th percentile
All_shape_C = All_shape_c[All_shape_c.columns[abs(All_cor.los) > abs(All_cor.los).describe()['75%']]]
```
```
error, mse = LSR(All_shape_C)
print(mse)
sns.boxplot(error)
```
### 5.9 Autoencoder
```
def AE(data):
    import keras
    from keras.models import Model
    from keras.layers import Dense, Input
    X_train, X_test, y_train, y_test = train_test_split(data.drop(["mortality_90d","los"],axis=1),
                                                        data.los, test_size=0.2, random_state=42)
    # Create the model: an encoder down to a 4-dimensional bottleneck, then a mirrored decoder
    input_window = Input(shape=(X_train.shape[1],))
    x = Dense(40, activation='relu')(input_window)
    x = Dense(20, activation='relu')(x)
    encoded = Dense(4, activation='relu', name="b")(x)
    encoder = Model(input_window, encoded)
    x = Dense(20, activation='relu')(encoded)
    x = Dense(40, activation='relu')(x)
    decoded = Dense(X_train.shape[1], activation='relu')(x)
    autoencoder = Model(input_window, decoded)
    autoencoder.summary()
    opt = keras.optimizers.Adam(learning_rate=0.01)
    autoencoder.compile(loss='mean_squared_error', optimizer=opt, metrics=['MeanSquaredError'])
    ## An autoencoder reconstructs its own input, so the target is X_train, not y_train
    history = autoencoder.fit(X_train, X_train, epochs=10, batch_size=1, verbose=1, validation_split=0.2)
    plt.subplot(211)
    plt.title('Loss')
    plt.plot(history.history['loss'], label='train')
    plt.plot(history.history['val_loss'], label='test')
    plt.legend()
    return encoder, X_train
```
```
ae_model, X_train = AE(All_shape.drop('icustay_id',axis=1))
```
```
## Get the bottleneck values from the trained encoder (no compile needed for predict)
result = ae_model.predict(X_train)
result
```
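The 4-dimensional bottleneck can also be inspected visually, for example by plotting its first two dimensions (an optional sketch; `result` has shape (n_samples, 4)):
```
## Scatter of the first two bottleneck dimensions
plt.figure(figsize=(5,5))
plt.scatter(result[:,0], result[:,1], s=5)
plt.xlabel('bottleneck dim 0')
plt.ylabel('bottleneck dim 1')
plt.show()
```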
Reference: https://keras.io/api/