Visualization - HackMD

--- GA: UA-159972578-2 --- ###### tags: `R` `Visualization` # Visualization ![](https://i.imgur.com/cARoFsi.png) ![](https://i.imgur.com/nHqk6aX.png) Other Reference: [Rpubs](https://rpubs.com/RitaTang/Visualization) ## R Markdown Theme ```{r} date: '`r Sys.Date()`' output: rmdformats::material # readthedown ``` ```{r} date: "`r Sys.Date()`" output: prettydoc::html_pretty: number_sections: yes theme: cayman # hpstr toc: true ``` ```{r} output: html_document: highlight: pygments theme: flatly css: style.css toc: true toc_depth: 3 toc_float: collapsed: false ``` ## Correlation Plot ### 法一 ```{r} library(corrplot) dfcor = cor(df[,c(4:5,7,10:11)]) round(dfcor, 2) symnum(dfcor, abbr.colnames = F) corrplot(dfcor, type = "upper", order = "hclust", tl.col = "black", tl.srt = 0) corrplot.mixed(cor(df[,c(4:5,7,10:11)]), upper="ellipse") ``` ![](https://i.imgur.com/3bnoQWJ.png) ### 法二 ```{r} col = colorRampPalette(c("blue", "white", "red"))(20) heatmap(x = res, col = col, symm = TRUE) # symm表示是否對稱 ``` ![](https://i.imgur.com/esWmeXu.png) ### 法三 ```{r} library(PerformanceAnalytics) chart.Correlation(mydata, histogram = T, pch=19) ``` ![](https://i.imgur.com/EkvDWkk.png) ## ggplot2 套件 #### 註：Mac畫ggplot無法顯示中文解決方法 ```{r} ggplot()+ theme(text=element_text(family="蘋方-繁中黑體", size=12), plot.title=element_text(hjust = 0.5)) ``` ### 1. Line Plot ```{r} ZM = Z0 %>% mutate(month = format(as.Date(date),format="%m")) age_ratio = as.data.frame(prop.table(table(ZM$age,ZM$month))) ZM = left_join(ZM, age_ratio, by=c("age"="Var1", "month"="Var2")) ggplot(ZM, aes(x = month, y = Freq, group = age, color=age)) + geom_point() + geom_line() # 沒有做group會無法把點跟點一起連線 ``` ![](https://i.imgur.com/YzPI7iE.png) ```{r} # 熱門男生名字的『比例』 mtop = mbaby %>% group_by(year) %>% top_n(1, number) %>% pull(name) %>% unique() mbaby %>% group_by(year) %>% mutate(fraction = number / sum(number)) %>% filter(name %in% mtop) %>% ggplot(aes(x=year,y=fraction,col=name)) + geom_line(alpha=0.5) + geom_point(size=0.5) -> g ggplotly(g) ``` ![](https://i.imgur.com/QosQ6bP.png) + Label ```{r} ggplot(ex1221, aes(Discharge, Area)) + geom_point(aes(size=NO3)) + xlab("My x label") + ylab("My y label") + ggtitle("Weighted Scatterplot of Watershed Area vs. Discharge and Nitrogen Levels (PPM)") ``` ```{r} ggplot(ex1221, aes(Discharge, Area)) + geom_point(aes(size=NO3)) + labs(size= "Nitrogen", x = "My x label", y = "My y label", title = "Weighted Scatterplot of Watershed Area vs. Discharge and Nitrogen Levels (PPM)") ``` ### 2. Bar Plot + 使用時機：類別間的比例/數量比較 ![](https://i.imgur.com/bAhantl.png) + 看總數及個別數量：左 + 看比例分佈：中 + 看個別數量比較：右 #### geom_bar ``` df %>% ggplot(aes(as.factor(mon), fill = dept)) + geom_bar() + geom_text(stat = "count", aes(label = (..count..)), vjust=0.5, position=position_dodge(width=1), vjust=-0.25, size=4) + theme(legend.position='none', text = element_text(family="蘋方-繁中黑體", size=12), plot.title = element_text(size=24, hjust = 0.5)) + xlab("實習時長(月數)") + ylab("人數") + facet_wrap(~ dept, scales = "free") + ggtitle("實習時長分析-依各系") ``` ![](https://i.imgur.com/q8hK7jp.png) ``` df %>% ggplot(aes(as.factor(mon), fill = dept)) + geom_bar() + geom_text(stat = "count", aes(label = (..count..)), position=position_stack(vjust=0.5), size=4) + theme(text = element_text(family="蘋方-繁中黑體", size=12), plot.title = element_text(size=24, hjust = 0.5)) + xlab("月數") + ylab("人數") + ggtitle("實習時長分析-依月數(加總)") ``` ![](https://i.imgur.com/88T1zRz.png) ```{r} df %>% ggplot(aes(prov, fill = expr)) + geom_bar(position = 'dodge') + scale_x_discrete(labels=c("correct","not_provide","incorrect")) ``` ![](https://i.imgur.com/NWq9mX2.png) ```{r} ggplot(aes(x= artDate,y=sentiment,fill=method)) + geom_col(show.legend = FALSE) + # 黏在一起的長條圖(分顏色) scale_x_date(labels = date_format("%m/%d")) + # 日期format # 依照method切開長條圖成兩張scale一樣的圖, col數選1就是上下, 2為左右 facet_wrap(~method, ncol = 2, scales = "fixed")+ geom_text(aes(label=sentiment)) # 標數值 ``` ![](https://i.imgur.com/SiaPTQc.png) ![](https://i.imgur.com/5shsQkk.png) ```{r} word_count %>% inner_join(LIWC_ch) %>% group_by(sentiment) %>% top_n(10,wt = count) %>% # wt: Use for ordering. If not specified, defaults to the last variable in the tbl. ungroup() %>% # 先ungroup才可以重新排列 mutate(word = reorder(word, count)) %>% ggplot(aes(word, count, fill = sentiment)) + geom_col(show.legend = FALSE) + facet_wrap(~sentiment, scales = "free_y") + labs(y = "Contribution to sentiment", x = NULL) + theme(text=element_text(family="蘋方-繁中黑體", size=14))+ coord_flip() # 直轉橫 ``` ![](https://i.imgur.com/OmU5lil.png) #### geom_col ``` df %>% group_by(dept) %>% count(comDept) %>% ggplot(aes(comDept, n, fill=dept)) + geom_col() + geom_text(aes(y=n, label=n), position=position_dodge(width=1), vjust=-0.25, size=4) + theme(legend.position='none', text = element_text(family="蘋方-繁中黑體", size=12), plot.title = element_text(size=24, hjust = 0.5)) + xlab("實習部門") + ylab("人數") + facet_wrap(~ dept, scales = "free") + ggtitle("各系實習部門分析-依系所") ``` ![](https://i.imgur.com/MOsaAQX.png) ### 3. Lineplot(Ratio) + Barplot(Count) + 適合用於趨勢分析 ``` trend = function(var, col){ stat({{var}}) %>% ggplot() + geom_bar(mapping = aes(x = {{var}}, y = n), stat = "identity", fill = col) + geom_line(mapping = aes(x = {{var}}, y = ratio*554, group=1), size = 1.3, color = "#024349") + scale_y_continuous(name = "Total", sec.axis = sec_axis(~./554, name = "Turnover Rate", labels = function(b) { paste0(round(b * 100, 0), "%")})) + theme( axis.title.y = element_text(color = col), axis.title.y.right = element_text(color = "#024349")) } trend(year, "#FAD589") ``` ![](https://i.imgur.com/jkSdMUS.png) ### 4. Boxplot ![](https://i.imgur.com/gHc5BQg.png) + 上三張圖都是為了看資料的分佈 + 注重數量：hitogram + 注重比例：density(密度函數的好處就是將級距差太大的變數能放在一起看) ```{r} simps_cnt %>% inner_join(afinn) %>% group_by(episode, value) %>% summarise(cnt = sum(count)) %>% ggplot() + geom_boxplot(aes(x=episode, y=value, colour=as.factor(episode))) + ggtitle("Afinn-每集情緒差異") + scale_x_continuous(breaks=seq(1,16,1)) + # 將x軸全部的點都標出來 # 原本只有1,5,10,15 theme(text=element_text(family="蘋方-繁中黑體", size=12), plot.title=element_text(hjust = 0.5)) -> p_afinn plotly::ggplotly(p_afinn) ``` ![](https://i.imgur.com/culWpDf.png) + ggplot中的par(mfrow=c(1,2)) ```{r} plot1 = df %>% ggplot(aes(sex,scale(itr))) + geom_boxplot() + scale_x_discrete(labels=c("male","female")) plot2 = df %>% ggplot(aes(sex,scale(ip))) + geom_boxplot() + scale_x_discrete(labels=c("male","female")) gridExtra::grid.arrange(plot1, plot2, nrow=1, ncol=2) ``` ![](https://i.imgur.com/dB1gZdj.png) #### 善用melt轉換將原先三個欄位"sex","itr"和"ip"變為以下三欄位(更長): + sex=c("1","2") + variables=c("itr","ip") + value=c(1.1,1.4,2.3,...對照的值) 即能用同樣的資料集，將上面的組合圖整合為一張圖 ```{r} itr_ip = melt(df[,c(4:6)], id.var = "sex") itr_ip %>% ggplot(aes(variable,scale(value))) + geom_boxplot(aes(fill=sex)) + scale_fill_manual(values=c("#0099CC","#FF6666")) ``` ![](https://i.imgur.com/SezyQiy.png) ## Plot + 基本繪圖功能： + 類別變數的分布：直條圖 barplot() + 數量變數的分布：直方圖 hist() + 兩數量變數的關係：點狀圖 plot() + 比較各類別的數量分布：盒狀圖 boxplot() + 設定par(family="蘋方-繁中黑體") + plot(Y~X, family="蘋方-繁中黑體") ```{r} par(mfrow=c(2,2)) plot(SLID$wages~SLID$language) plot(SLID$wages~SLID$age) plot(SLID$wages~SLID$education) plot(SLID$wages~SLID$sex) ``` ![](https://i.imgur.com/KLtW0EI.png) + barplot(data type: matrix | vector) ```{r} #兩行不同顏色為一組分類的長條圖 matrix[3x2] ex_pr = as.matrix(table(df$expr,df$prov)) colnames(ex_pr) = c("correct","not_provide","incorrect") ex_pr[1,] = ex_pr[1,]/sum(ex_pr[1,]) # Scale to the same unit and check the propotion ex_pr[2,] = ex_pr[2,]/sum(ex_pr[2,]) barplot(ex_pr, beside = T, legend.text = c("exp_Y","exp_N")) ``` ![](https://i.imgur.com/l7RlrV4.png) ## Word Cloud ```{r} wordcloud2(demoFreq, size = 2, minRotation = -pi/2, maxRotation = -pi/2) # 不是直排就是橫排, 緊密排成一個橢圓 ``` ![](https://i.imgur.com/fUR8BND.jpg) ## Heatmap ``` table %>% d3heatmap(labrow = T, col=colorRamp(c('seagreen','lightyellow','red')), show_grid = F) ``` ```{r} table(weekday=format(ts,'%w'), month=format(ts,'%m')) ``` ``` month weekday 01 02 03 04 05 06 07 08 09 10 11 12 0 2110 1837 2075 2070 2168 2239 2339 2304 2352 2424 2254 2144 1 2395 1937 2200 2323 2359 2187 2457 2288 2258 2399 2323 2271 2 2317 1885 2270 2118 2222 2183 2412 2251 2142 2416 2258 2317 3 2259 2007 2242 2060 2345 2347 2408 2428 2239 2484 2182 2415 4 2334 1904 2263 2099 2402 2190 2385 2464 2320 2280 2253 2425 5 2392 2036 2443 2388 2340 2566 2459 2591 2390 2692 2475 2512 6 2240 1905 2265 2222 2199 2290 2341 2246 2359 2391 2318 2342 ``` + Reference for format: https://www.r-bloggers.com/date-formats-in-r/ ```{r} library(d3heatmap) table(format(ts,"%u"), format(ts,"%H")) %>% # %u: Week day(Monday is 1) # %H: Hours(00-23) as.data.frame.matrix %>% d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red'))) ``` ![](https://i.imgur.com/iSdluHp.png) ```{r} pacman::p_load(manipulateWidget) # install/load library color = colorRamp(c('seagreen','lightyellow','red')) # heatmap color # make 2 heatmaps for each top5 location L = lapply(top5, function(loc) { weekday = format(ts5[TOP5$LocationDescription == loc],"%u") hour = format(ts5[TOP5$LocationDescription == loc], "%H") arrest = TOP5$Arrest[TOP5$LocationDescription == loc] list(count = as.data.frame.matrix( table(weekday, hour) ), arrest = tapply(arrest, list(weekday, hour), mean)) %>% lapply(d3heatmap, F, F, col=color, show_grid=F, xaxis_height=8, yaxis_width=8) }) # plot 10 heatmaps at once combineWidgets( list=do.call(c, L), ncol=2, title = "<h4>TOP5 7x24 : 竊盜案件數量 & 破案率</h4>", footer= "<p>由上至下：(1)住宅車道、(2)加油站、(3)巷弄、(4)公用停車場、(5)街道</p>" ) ``` ![](https://i.imgur.com/8e0rYu9.png) ## PCA Plot <iframe width=100%, height=500px, src="https://hackmd.io/@ritatang242/r1lDWqiUL"> </iframe> ## Bubble plot ``` df = within(seo_ori, {bubblesize = seo_ori$avg_mon_search_volume}) g = df %>% ggplot(aes(x=company1, y=company2, col=category, label=keyword)) + geom_point(aes(size=bubblesize), alpha=0.7) + scale_size(range = c(1, 20)) + ggtitle("A Co. v.s. B Co. 的SEO排名表現") + theme(plot.title = element_text(hjust = 0.5)) ggplotly(g) ``` # Highcharter [Reference](http://jkunst.com/highcharter/) ## treemap ``` stat = function(var){ tree = df %>% group_by({{var}}) %>% summarise(n = n(), deal = sum(y)) %>% mutate(ratio = deal/n) %>% arrange(-ratio, -n, -deal) return(tree) } hchart(stat(consultant), "treemap", hcaes(x = consultant, value = n, color = ratio)) ``` ![](https://i.imgur.com/2kVWgNt.png)

Read more

統計分析方法應用於R語言

recommenderlab套件介紹

Text Mining (Preprocessing)

TraMineR套件介紹