2. 視覺化 - HackMD

# 2. 視覺化 ## 1. 資料內容套件下載 ``` pip install seaborn ``` 載入套件 ``` import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import psycopg2 ``` 抓取資料 ``` conn = psycopg2.connect(database="database_name", user="user_name", password="user_password", host="IP", port="IP_port") ## 連接上資料庫，抓取資料。 ## 允許python執行postgresql語法 conn = psycopg2.connect(database="mimic", user="twmimicviewer", password="twmimicviewer@1234", host="203.145.218.182", port="5432") ## 連接上資料庫，抓取資料。 ## 允許python執行postgresql語法 with conn.cursor() as cur: ## 觀察查詢筆數 print(cur.rowcount) ## SQL語法 sql = """ SELECT * FROM sepsis.angus_cohort """ ## 執行sql語法 cur.execute(sql) ## 取得欄位名稱 name = [desc[0] for desc in cur.description] ## fetchall是將所有收尋的資料都寫入 rows 中 # rows = pd.DataFrame(cur.fetchall()) angus_cohort = pd.DataFrame(cur.fetchall(),columns=name) print(angus_cohort.shape) angus_cohort.head() ``` ![](https://i.imgur.com/VKm2U61.png) ### 操作：把其他的檔案都寫入這裡面 * Python環境變數名稱 : 資料表在資料庫的名稱 1. GCS : gcs_24 2. LAB : lab_24 3. SAPSII : sapsii 4. UO : uo_24 5. Vital : vital_24 將五張表統整成一張，以icustay_id為標準 ``` All = pd.merge(angus_cohort,GCS,on="icustay_id") All = pd.merge(All,LAB,on="icustay_id") All = pd.merge(All,SAPSII,on="icustay_id") All = pd.merge(All,UO,on="icustay_id") All = pd.merge(All,Vital,on="icustay_id") All.head() print(All.shape) len(pd.unique(angus_cohort.gender)) ``` 統計是否90天內死亡人數與比例 ``` ## print 顯示結果 ## str 轉換成字串 ## len 顯示長度 ## unique 統整存在列表中數據，重複的只會顯示一次 ## round(數值,小數點後幾位) print('Number of ICU stays: ' + str(len(All.icustay_id.unique()))) print('Number of Mortality: ' + str(len(All.mortality_90d[All['mortality_90d']==1]))) print('Number of non-Mortality: ' + str(len(All.mortality_90d[All['mortality_90d']==0]))) print('Percentage of Mortality: ' + str(round(100*len(All.mortality_90d[All['mortality_90d']==1]) / len(All.mortality_90d),0)) + '%') ``` 找尋包含一個特定字串欄位 ``` ## 列表欄位 list(All.columns) ``` ``` ## 找出包含 hadm_id 字元 """ for s in All.columns: if "hadm_id" in s: print(s) """ matching = [s for s in All.columns if "hadm_id" in s] matching ``` 找尋包含多個特定字串欄位 ``` ## 一次分別篩選包含不同的字元 """ for s in All.columns: if "hadm_id" in s or "subject_id" in s: print(s) """ matching = [s for s in All.columns if "hadm_id" in s or "subject_id" in s] matching ``` 刪除無需使用欄位 ``` All_f = All.drop(matching,axis=1) list(All_f.columns) ``` 統計各欄位資料 ``` All_f.describe() ``` ### 敘述統計呈現 * count 數據總量 * mean 數據平均 * std 數據標準差 * 25%, 50%, 75% 數據四分位 * max 數據最大值 ## 2. 視覺化數據內容各欄位資料類型與數量 ``` ## 計算各欄位的筆數 All_f.info() ``` 取出float64和int64的資料 ``` All_num = All_f.select_dtypes(include=['float64',"int64"]) All_num.head() ``` ### Boxplot ``` fig = plt.figure(figsize=(10,10)) sns.boxplot(y="age", data=All_num) plt.boxplot(All_num.age) plt.show() ``` 迴圈呈現不同數據boxplot * 此範例只呈現24個欄位數據 ``` fig = plt.figure(figsize=(30,24)) count = 0 for variable in All_num.columns[:24]: count += 1 ## 分割幾張圖 plt.subplot(4, 6, count) ## 畫圖，y 為欄位的名稱，data 為資料 ax = sns.boxplot(y=variable, data=All_num) plt.show() ``` ``` fig = plt.figure(figsize=(30,24)) count = 0 for variable in All_num.columns[:24]: count += 1 plt.subplot(4, 6, count) ax = sns.boxplot(x = 'mortality_90d', y=variable, data=All_num) ax = sns.stripplot(x = 'mortality_90d', y=variable, data=All_num, color="orange", size=1) plt.show() ``` ### 操作： 1. 以性別為分組 (設定：All.gender) 2. 顯示前六張欄位 (All_num數據) 3. 以2,3 或 3,2 顯示圖片 4. 點的顏色設定為 yellow ### Scater plot ``` ## 未分群 fig = plt.figure(figsize=(10,10)) sns.scatterplot(data=All_num, x=All_num.columns[10], y=All_num.columns[9]) fig.show() ``` ``` ## 分群 fig = plt.figure(figsize=(10,10)) sns.scatterplot(data=All_num, x=All_num.columns[10], y=All_num.columns[9],hue ='mortality_90d') ## 標記範圍 plt.plot(range(41),np.repeat(35,41),'r') plt.plot(np.repeat(40,36),range(36),'r') fig.show() ``` ### Pie chart ``` ## pie chart ## count ## autopct 第一位數：位置，小數點後：四捨五入，%：是否要顯示百分位 size = [len(All_num[All_num.age>50]) , len(All_num[((All_num.age > 30) | (All_num.age <=50))]) ,len(All_num[All_num.age<=30])] label = ["age<=30", "50<=age<30", "age>50"] plt.pie(size , labels = label, autopct='%1.0f%%') ``` ### 操作：以圓餅圖顯示mortality有無的比例 (All_num.mortality_90d) ### Histogram ``` ## Histogram ## 不區分類別 sns.histplot(All_num[All_num.columns[9]] , color="skyblue", label=All_num.columns[9]) sns.histplot(All_num[All_num.columns[10]] , color="red", label=All_num.columns[10]) plt.xlabel("Value") #plt.ylabel("") plt.legend() ``` ``` ## Histogram ## 是否死亡寫法1 sns.histplot( All_num[All_num.columns[9]][All_num.mortality_90d==0] , color="skyblue", label=[All_num.columns[9]+"+dead"]) sns.histplot( All_num[All_num.columns[9]][All_num.mortality_90d==1] , color="red", label=[All_num.columns[9]+"+live"]) plt.xlabel("Value") #plt.ylabel("") plt.legend(loc='best', bbox_to_anchor=(1, 1)) ## Histogram ## 是否死亡寫法2 fig = plt.figure(figsize=(5,5)) sns.histplot(data = All_num , x = All_num.columns[9], hue ='mortality_90d', element="step") plt.xlabel("Value") #plt.ylabel("") fig.show() ``` ## 3. 視覺化呈現資料以時間排列 ### 視覺化各種時段數據偵測，或者藥物使用劑量 ### 1. 整合chartevents、D_items、icustay ### 2. 參考網址 [https://github.com/MIT-LCP/mimic-iii-paper] ``` import time t1 = time.time() # !/root/anaconda3/bin/pip install psycopg2-binary ## 連線postgresql的套件 import psycopg2 ## 建立資料結構的套件，使用於資料處理，如：分割、合併等等 (Python data analysis) import pandas as pd ## 設定連接資料庫參數 conn = psycopg2.connect(database="mimic", user="twmimicviewer", password="twmimicviewer@1234", host="203.145.218.182", port="5432") ## 連接上資料庫，抓取資料。 ## 允許python執行postgresql語法 with conn.cursor() as cur: ## 觀察查詢筆數 print(cur.rowcount) ## SQL語法 sql = """ SELECT ce.icustay_id , ce.charttime , ce.charttime - ie.intime AS icutime , di.label , ce.value , ce.valuenum , ce.valueuom FROM mimiciii.chartevents ce INNER join mimiciii.d_items di ON ce.itemid = di.itemid INNER join mimiciii.icustays ie ON ce.icustay_id = ie.icustay_id WHERE ce.icustay_id = 276024 AND error != 1 ORDER BY ce.charttime """ ## 執行sql語法 cur.execute(sql) ## 取得欄位名稱 name = [desc[0] for desc in cur.description] ## fetchall是將所有收尋的資料都寫入 rows 中 # rows = pd.DataFrame(cur.fetchall()) rows = pd.DataFrame(cur.fetchall(),columns=name) t2 = time.time() print('time elapsed: ' + str(round(t2-t1, 2)) + ' seconds') ``` 處理時間資料 ``` ## dt.days，顯示天數 ## dt.seconds，計算小於1天數據，以秒數顯示 # rows.icutime.dt.days print(rows['icutime'][0]) print(rows['icutime'].dt.seconds[0]) ## 23(h)*60*60+31(m)*60+53(s) print(rows['icutime'].dt.days[0]) ``` * maxdays可以選擇只使用某個時間點內的資料 ``` # 去除超過某個時間後的資料 maxdays = 5; rows = rows.loc[rows.icutime.dt.days<=maxdays] ``` 時間表示資料，轉換成以小時為單位，並寫入新的欄位 ``` # 將時間資料轉換成以小時為單位 rows['icutimehr'] = (rows['icutime'].dt.seconds/60/60)+(rows['icutime'].dt.days*24) ``` 統計每一筆臨床數據數量 ``` dict(rows['label'].value_counts()) ``` 計算平均值 ``` # 得到三種數據的平均值 hr_mean = rows.valuenum[rows.label=='Heart Rate'].mean() bp_mean = rows.icutimehr[rows.label=='Non Invasive Blood Pressure mean'].mean() temp_mean = ((rows.valuenum[rows.label=='Temperature Fahrenheit']-32)/1.8).mean() # 列出平均數值 print("Mean HR is: {0:.2f}".format(hr_mean)) print("Mean BP is: {0:.2f}".format(bp_mean)) print("Mean temp, C is: {0:.2f}".format(temp_mean)) ``` 敘述統計不同種類數值 ``` # 得到三種數據的平均值 hr_describe = rows.valuenum[rows.label=='Heart Rate'].describe() bp_describe = rows.icutimehr[rows.label=='Non Invasive Blood Pressure mean'].describe() temp_describe = ((rows.valuenum[rows.label=='Temperature Fahrenheit']-32)/1.8).describe() # 列出其餘統計數值 print("Describe HR is:\n",hr_describe,"\n") print("Describe BP is:\n",bp_describe,"\n") print("Describe temp is:\n",temp_describe,"\n") ``` ``` hr_describe[1] hr_describe["count"] dict(hr_describe)["count"] ``` 以時間軸，標記偵測數值 ``` # 設定圖片大小寬*高 fig = plt.figure(figsize=(22, 20)) ## 設定XY軸，字大小 plt.rcParams.update({'font.size': 20}) # 設定RGB顏色 tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120)] # 因為設定的RGB值域必須在0-1 for i in range(len(tableau20)): r, g, b = tableau20[i] tableau20[i] = (r / 255., g / 255., b / 255.) # 設定的周圍線顯示狀況 # 圖片位置：subplot(xyz)位置 ax = plt.subplot(111) ax.spines["top"].set_visible(False) ax.spines["bottom"].set_visible(True) ax.spines["right"].set_visible(False) ax.spines["left"].set_visible(True) # 畫圖 ## plot(x,y,color:顏色, lw:線的寬, ls:點連線的方式, marker:點的形狀, markersize:點的大小, label:在圖示的名稱) # Marker : https://matplotlib.org/3.3.2/api/markers_api.html#module-matplotlib.markers plt.plot(rows.icutimehr[rows.label=='Heart Rate'], rows.valuenum[rows.label=='Heart Rate'], color=tableau20[0], lw=5, ls=":", marker='v', markersize=6, label='Heart rate') plt.plot(rows.icutimehr[rows.label=='O2 saturation pulseoxymetry'], rows.valuenum[rows.label=='O2 saturation pulseoxymetry'], color=tableau20[1], lw=2.5, ls="-", marker='o', markersize=6, label='O2 saturation') plt.plot(rows.icutimehr[rows.label=='Non Invasive Blood Pressure mean'], rows.valuenum[rows.label=='Non Invasive Blood Pressure mean'], color=tableau20[2], lw=2.5, ls="-.", marker='o', markersize=6, label='NIBP, mean') plt.plot(rows.icutimehr[rows.label=='Respiratory Rate'], rows.valuenum[rows.label=='Respiratory Rate'], color=tableau20[3], lw=2.5, ls="", marker='v', markersize=6, label='Respiratory rate') plt.plot(rows.icutimehr[rows.label=='Respiratory Rate'], rows.valuenum[rows.label=='Respiratory Rate'], color=tableau20[3], lw=2.5, ls="", marker='v', markersize=6, label='Respiratory rate') # 圖示設定 plt.legend(loc=1,fontsize=18) # 貼上各種文字 plt.text(40,150,"Example plot",fontsize=40) # X軸名稱 plt.xlabel('Time after admission to the intensive care unit, hours', fontsize=22) # Y軸名稱 plt.ylabel('Measurement, absolute value', fontsize=22) # y軸數值顯示 plt.yticks(np.arange(0, 160, 20)) # 圖片儲存 #fig.savefig('examplepatient.pdf', bbox_inches='tight') ``` ## 4. 取出病患死亡時間取出時間資料 ``` # !/root/anaconda3/bin/pip install psycopg2-binary ## 連線postgresql的套件 import psycopg2 ## 建立資料結構的套件，使用於資料處理，如：分割、合併等等 (Python data analysis) import pandas as pd ## 設定連接資料庫參數 conn = psycopg2.connect(database="mimic", user="twmimicadmin", password="twmimicadmin@1234", host="203.145.218.182", port="5432") ## 連接上資料庫，抓取資料。 ## 允許python執行postgresql語法 with conn.cursor() as cur: time = pd.DataFrame() ## SQL語法 for n in All.icustay_id: sql = """ SELECT icu.icustay_id , ma.deathtime-icu.INTIME AS leavetime FROM mimiciii.icustays icu JOIN mimiciii.admissions ma ON icu.HADM_ID = ma.HADM_ID WHERE icu.icustay_id = """+ str(n)+""" """ cur.execute(sql) ## 取得欄位名稱 #name = [desc[0] for desc in cur.description] ## fetchall是將所有收尋的資料都寫入 rows 中 out = pd.DataFrame(cur.fetchall()) time = pd.concat([time,out]) time.columns = ["icustay_id","leavetime"] ``` 以icustay_id將資料合併後，輸出 ``` time.leavetime = (time.leavetime.dt.seconds/60/60)+(time.leavetime.dt.days*24) Output = pd.merge(time,All,on="icustay_id") Output.head() ```