# 2. 視覺化
## 1. 資料內容
套件下載
```
pip install seaborn
```
載入套件
```
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
```
抓取資料
```
conn = psycopg2.connect(database="database_name", user="user_name",
password="user_password", host="IP",
port="IP_port")
## 連接上資料庫,抓取資料。
## 允許python執行postgresql語法
conn = psycopg2.connect(database="mimic", user="twmimicviewer",
password="twmimicviewer@1234", host="203.145.218.182", port="5432")
## 連接上資料庫,抓取資料。
## 允許python執行postgresql語法
with conn.cursor() as cur:
## 觀察查詢筆數
print(cur.rowcount)
## SQL語法
sql = """
SELECT * FROM sepsis.angus_cohort
"""
## 執行sql語法
cur.execute(sql)
## 取得欄位名稱
name = [desc[0] for desc in cur.description]
## fetchall是將所有收尋的資料都寫入 rows 中
# rows = pd.DataFrame(cur.fetchall())
angus_cohort = pd.DataFrame(cur.fetchall(),columns=name)
print(angus_cohort.shape)
angus_cohort.head()
```

### 操作:把其他的檔案都寫入這裡面
* Python環境變數名稱 : 資料表在資料庫的名稱
1. GCS : gcs_24
2. LAB : lab_24
3. SAPSII : sapsii
4. UO : uo_24
5. Vital : vital_24
將五張表統整成一張,以icustay_id為標準
```
All = pd.merge(angus_cohort,GCS,on="icustay_id")
All = pd.merge(All,LAB,on="icustay_id")
All = pd.merge(All,SAPSII,on="icustay_id")
All = pd.merge(All,UO,on="icustay_id")
All = pd.merge(All,Vital,on="icustay_id")
All.head()
print(All.shape)
len(pd.unique(angus_cohort.gender))
```
統計是否90天內死亡人數與比例
```
## print 顯示結果
## str 轉換成字串
## len 顯示長度
## unique 統整存在列表中數據,重複的只會顯示一次
## round(數值,小數點後幾位)
print('Number of ICU stays: ' + str(len(All.icustay_id.unique())))
print('Number of Mortality: ' + str(len(All.mortality_90d[All['mortality_90d']==1])))
print('Number of non-Mortality: ' + str(len(All.mortality_90d[All['mortality_90d']==0])))
print('Percentage of Mortality: ' + str(round(100*len(All.mortality_90d[All['mortality_90d']==1]) / len(All.mortality_90d),0)) + '%')
```
找尋包含一個特定字串欄位
```
## 列表欄位
list(All.columns)
```
```
## 找出包含 hadm_id 字元
"""
for s in All.columns:
if "hadm_id" in s:
print(s)
"""
matching = [s for s in All.columns if "hadm_id" in s]
matching
```
找尋包含多個特定字串欄位
```
## 一次分別篩選包含不同的字元
"""
for s in All.columns:
if "hadm_id" in s or "subject_id" in s:
print(s)
"""
matching = [s for s in All.columns if "hadm_id" in s or "subject_id" in s]
matching
```
刪除無需使用欄位
```
All_f = All.drop(matching,axis=1)
list(All_f.columns)
```
統計各欄位資料
```
All_f.describe()
```
### 敘述統計呈現
* count 數據總量
* mean 數據平均
* std 數據標準差
* 25%, 50%, 75% 數據四分位
* max 數據最大值
## 2. 視覺化數據內容
各欄位資料類型與數量
```
## 計算各欄位的筆數
All_f.info()
```
取出float64和int64的資料
```
All_num = All_f.select_dtypes(include=['float64',"int64"])
All_num.head()
```
### Boxplot
```
fig = plt.figure(figsize=(10,10))
sns.boxplot(y="age", data=All_num)
plt.boxplot(All_num.age)
plt.show()
```
迴圈呈現不同數據boxplot
* 此範例只呈現24個欄位數據
```
fig = plt.figure(figsize=(30,24))
count = 0
for variable in All_num.columns[:24]:
count += 1
## 分割幾張圖
plt.subplot(4, 6, count)
## 畫圖,y 為欄位的名稱,data 為資料
ax = sns.boxplot(y=variable, data=All_num)
plt.show()
```
```
fig = plt.figure(figsize=(30,24))
count = 0
for variable in All_num.columns[:24]:
count += 1
plt.subplot(4, 6, count)
ax = sns.boxplot(x = 'mortality_90d', y=variable, data=All_num)
ax = sns.stripplot(x = 'mortality_90d', y=variable, data=All_num, color="orange", size=1)
plt.show()
```
### 操作:
1. 以性別為分組 (設定:All.gender)
2. 顯示前六張欄位 (All_num數據)
3. 以2,3 或 3,2 顯示圖片
4. 點的顏色設定為 yellow
### Scater plot
```
## 未分群
fig = plt.figure(figsize=(10,10))
sns.scatterplot(data=All_num, x=All_num.columns[10], y=All_num.columns[9])
fig.show()
```
```
## 分群
fig = plt.figure(figsize=(10,10))
sns.scatterplot(data=All_num, x=All_num.columns[10], y=All_num.columns[9],hue ='mortality_90d')
## 標記範圍
plt.plot(range(41),np.repeat(35,41),'r')
plt.plot(np.repeat(40,36),range(36),'r')
fig.show()
```
### Pie chart
```
## pie chart
## count
## autopct 第一位數:位置,小數點後:四捨五入,%:是否要顯示百分位
size = [len(All_num[All_num.age>50]) , len(All_num[((All_num.age > 30) | (All_num.age <=50))])
,len(All_num[All_num.age<=30])]
label = ["age<=30", "50<=age<30", "age>50"]
plt.pie(size , labels = label, autopct='%1.0f%%')
```
### 操作:以圓餅圖顯示mortality有無的比例 (All_num.mortality_90d)
### Histogram
```
## Histogram
## 不區分類別
sns.histplot(All_num[All_num.columns[9]] , color="skyblue", label=All_num.columns[9])
sns.histplot(All_num[All_num.columns[10]] , color="red", label=All_num.columns[10])
plt.xlabel("Value")
#plt.ylabel("")
plt.legend()
```
```
## Histogram
## 是否死亡 寫法1
sns.histplot( All_num[All_num.columns[9]][All_num.mortality_90d==0] , color="skyblue", label=[All_num.columns[9]+"+dead"])
sns.histplot( All_num[All_num.columns[9]][All_num.mortality_90d==1] , color="red", label=[All_num.columns[9]+"+live"])
plt.xlabel("Value")
#plt.ylabel("")
plt.legend(loc='best', bbox_to_anchor=(1, 1))
## Histogram
## 是否死亡 寫法2
fig = plt.figure(figsize=(5,5))
sns.histplot(data = All_num , x = All_num.columns[9], hue ='mortality_90d', element="step")
plt.xlabel("Value")
#plt.ylabel("")
fig.show()
```
## 3. 視覺化呈現資料以時間排列
### 視覺化各種時段數據偵測,或者藥物使用劑量
### 1. 整合chartevents、D_items、icustay
### 2. 參考網址 [https://github.com/MIT-LCP/mimic-iii-paper]
```
import time
t1 = time.time()
# !/root/anaconda3/bin/pip install psycopg2-binary
## 連線postgresql的套件
import psycopg2
## 建立資料結構的套件,使用於資料處理,如:分割、合併等等 (Python data analysis)
import pandas as pd
## 設定連接資料庫參數
conn = psycopg2.connect(database="mimic", user="twmimicviewer",
password="twmimicviewer@1234", host="203.145.218.182", port="5432")
## 連接上資料庫,抓取資料。
## 允許python執行postgresql語法
with conn.cursor() as cur:
## 觀察查詢筆數
print(cur.rowcount)
## SQL語法
sql = """
SELECT ce.icustay_id
, ce.charttime
, ce.charttime - ie.intime AS icutime
, di.label
, ce.value
, ce.valuenum
, ce.valueuom
FROM mimiciii.chartevents ce
INNER join mimiciii.d_items di
ON ce.itemid = di.itemid
INNER join mimiciii.icustays ie
ON ce.icustay_id = ie.icustay_id
WHERE ce.icustay_id = 276024
AND error != 1
ORDER BY ce.charttime
"""
## 執行sql語法
cur.execute(sql)
## 取得欄位名稱
name = [desc[0] for desc in cur.description]
## fetchall是將所有收尋的資料都寫入 rows 中
# rows = pd.DataFrame(cur.fetchall())
rows = pd.DataFrame(cur.fetchall(),columns=name)
t2 = time.time()
print('time elapsed: ' + str(round(t2-t1, 2)) + ' seconds')
```
處理時間資料
```
## dt.days,顯示天數
## dt.seconds,計算小於1天數據,以秒數顯示
# rows.icutime.dt.days
print(rows['icutime'][0])
print(rows['icutime'].dt.seconds[0])
## 23(h)*60*60+31(m)*60+53(s)
print(rows['icutime'].dt.days[0])
```
* maxdays可以選擇只使用某個時間點內的資料
```
# 去除超過某個時間後的資料
maxdays = 5;
rows = rows.loc[rows.icutime.dt.days<=maxdays]
```
時間表示資料,轉換成以小時為單位,並寫入新的欄位
```
# 將時間資料轉換成以小時為單位
rows['icutimehr'] = (rows['icutime'].dt.seconds/60/60)+(rows['icutime'].dt.days*24)
```
統計每一筆臨床數據數量
```
dict(rows['label'].value_counts())
```
計算平均值
```
# 得到三種數據的平均值
hr_mean = rows.valuenum[rows.label=='Heart Rate'].mean()
bp_mean = rows.icutimehr[rows.label=='Non Invasive Blood Pressure mean'].mean()
temp_mean = ((rows.valuenum[rows.label=='Temperature Fahrenheit']-32)/1.8).mean()
# 列出平均數值
print("Mean HR is: {0:.2f}".format(hr_mean))
print("Mean BP is: {0:.2f}".format(bp_mean))
print("Mean temp, C is: {0:.2f}".format(temp_mean))
```
敘述統計不同種類數值
```
# 得到三種數據的平均值
hr_describe = rows.valuenum[rows.label=='Heart Rate'].describe()
bp_describe = rows.icutimehr[rows.label=='Non Invasive Blood Pressure mean'].describe()
temp_describe = ((rows.valuenum[rows.label=='Temperature Fahrenheit']-32)/1.8).describe()
# 列出其餘統計數值
print("Describe HR is:\n",hr_describe,"\n")
print("Describe BP is:\n",bp_describe,"\n")
print("Describe temp is:\n",temp_describe,"\n")
```
```
hr_describe[1]
hr_describe["count"]
dict(hr_describe)["count"]
```
以時間軸,標記偵測數值
```
# 設定圖片大小 寬*高
fig = plt.figure(figsize=(22, 20))
## 設定XY軸,字大小
plt.rcParams.update({'font.size': 20})
# 設定RGB顏色
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120)]
# 因為設定的RGB值域必須在0-1
for i in range(len(tableau20)):
r, g, b = tableau20[i]
tableau20[i] = (r / 255., g / 255., b / 255.)
# 設定的周圍線顯示狀況
# 圖片位置:subplot(xyz)位置
ax = plt.subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(True)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(True)
# 畫圖
## plot(x,y,color:顏色, lw:線的寬, ls:點連線的方式, marker:點的形狀, markersize:點的大小, label:在圖示的名稱)
# Marker : https://matplotlib.org/3.3.2/api/markers_api.html#module-matplotlib.markers
plt.plot(rows.icutimehr[rows.label=='Heart Rate'],
rows.valuenum[rows.label=='Heart Rate'],
color=tableau20[0],
lw=5, ls=":",
marker='v', markersize=6, label='Heart rate')
plt.plot(rows.icutimehr[rows.label=='O2 saturation pulseoxymetry'],
rows.valuenum[rows.label=='O2 saturation pulseoxymetry'],
color=tableau20[1],
lw=2.5, ls="-",
marker='o', markersize=6, label='O2 saturation')
plt.plot(rows.icutimehr[rows.label=='Non Invasive Blood Pressure mean'],
rows.valuenum[rows.label=='Non Invasive Blood Pressure mean'],
color=tableau20[2],
lw=2.5, ls="-.",
marker='o', markersize=6, label='NIBP, mean')
plt.plot(rows.icutimehr[rows.label=='Respiratory Rate'],
rows.valuenum[rows.label=='Respiratory Rate'],
color=tableau20[3],
lw=2.5, ls="",
marker='v', markersize=6, label='Respiratory rate')
plt.plot(rows.icutimehr[rows.label=='Respiratory Rate'],
rows.valuenum[rows.label=='Respiratory Rate'],
color=tableau20[3],
lw=2.5, ls="",
marker='v', markersize=6, label='Respiratory rate')
# 圖示設定
plt.legend(loc=1,fontsize=18)
# 貼上各種文字
plt.text(40,150,"Example plot",fontsize=40)
# X軸名稱
plt.xlabel('Time after admission to the intensive care unit, hours', fontsize=22)
# Y軸名稱
plt.ylabel('Measurement, absolute value', fontsize=22)
# y軸數值顯示
plt.yticks(np.arange(0, 160, 20))
# 圖片儲存
#fig.savefig('examplepatient.pdf', bbox_inches='tight')
```
## 4. 取出病患死亡時間
取出時間資料
```
# !/root/anaconda3/bin/pip install psycopg2-binary
## 連線postgresql的套件
import psycopg2
## 建立資料結構的套件,使用於資料處理,如:分割、合併等等 (Python data analysis)
import pandas as pd
## 設定連接資料庫參數
conn = psycopg2.connect(database="mimic", user="twmimicadmin",
password="twmimicadmin@1234", host="203.145.218.182", port="5432")
## 連接上資料庫,抓取資料。
## 允許python執行postgresql語法
with conn.cursor() as cur:
time = pd.DataFrame()
## SQL語法
for n in All.icustay_id:
sql = """
SELECT
icu.icustay_id
, ma.deathtime-icu.INTIME AS leavetime
FROM mimiciii.icustays icu
JOIN mimiciii.admissions ma
ON icu.HADM_ID = ma.HADM_ID
WHERE icu.icustay_id = """+ str(n)+""" """
cur.execute(sql)
## 取得欄位名稱
#name = [desc[0] for desc in cur.description]
## fetchall是將所有收尋的資料都寫入 rows 中
out = pd.DataFrame(cur.fetchall())
time = pd.concat([time,out])
time.columns = ["icustay_id","leavetime"]
```
以icustay_id將資料合併後,輸出
```
time.leavetime = (time.leavetime.dt.seconds/60/60)+(time.leavetime.dt.days*24)
Output = pd.merge(time,All,on="icustay_id")
Output.head()
```