## LDA ```{r eval=FALSE} ldas_allPosts = c() topics = c(3, 5, 10, 25, 36) for(topic in topics){ start_time <- Sys.time() lda_allPosts <- LDA(allPosts_dtm, k = topic, control = list(seed = 2020)) ldas_allPosts =c(ldas_allPosts,lda_allPosts) print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time))) save(ldas_allPosts,file = "ldas_result_allPosts") } ``` ```{r} load("ldas_result_allPosts") ``` ```{r} topics = c(3, 5, 10, 25, 36) data_frame(k = topics, perplex = map_dbl(ldas_allPosts, topicmodels::perplexity)) %>% ggplot(aes(k, perplex)) + geom_point() + geom_line() + labs(title = "Evaluating LDA topic models", subtitle = "Optimal number of topics (smaller is better)", x = "Number of topics", y = "Perplexity") ``` ```{r} new_lda_allPosts = ldas_allPosts[[3]] ## 選定topic 為10 的結果 topics_allPosts <- tidy(new_lda_allPosts, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。 ``` ```{r} remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界") top_terms_allPosts <- topics_allPosts %>% filter(!term %in% remove_words)%>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% arrange(topic, -beta) top_terms_allPosts %>% mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + scale_fill_manual(values=mycolors)+ facet_wrap(~ topic, scales = "free") + coord_flip()+ theme(text = element_text(family='STHeitiTC-Light')) ``` ## 主題 ```{r} topic_name = c('票房', '動物方城市', '冰雪奇緣', '皮克斯', '無敵破壞王', '大英雄天團', '怪獸', '腦筋急轉彎', '冰雪', '玩具總動員') ``` ```{r} tmResult <- posterior(new_lda_allPosts) doc_pro <- tmResult$topics dim(doc_pro) ``` ```{r} # get document topic proportions allPosts_data <- allPosts %>% select(artTitle, artDate, artTime, artUrl, artPoster, artCat) document_topics <- doc_pro[allPosts_data$artUrl,] document_topics_df = data.frame(document_topics) colnames(document_topics_df) = topic_name rownames(document_topics_df) = NULL news_topic = cbind(allPosts_data, document_topics_df) news_topic %>% head(10) ``` ## 查看特定主題 ```{r , eval=FALSE} news_topic %>% arrange(desc(`冰雪`)) %>% head(10) ``` ## 了解主題在時間的變化 ```{r eval=FALSE} news_topic[,c(7:16)] =sapply(news_topic[,c(7:16)] , as.numeric) news_topic %>% group_by(artDate = format(artDate,'%Y%m')) %>% summarise_if(is.numeric, sum, na.rm = TRUE) %>% melt(id.vars = "artDate")%>% ggplot( aes(x=artDate, y=value, fill=variable)) + geom_bar(stat = "identity") + ylab("value") + scale_fill_manual(values=mycolors)+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + theme(text = element_text(family='STHeitiTC-Light')) ``` ## 以比例了解主題時間變化 ```{r} news_topic %>% group_by(artDate = format(artDate,'%Y%m')) %>% summarise_if(is.numeric, sum, na.rm = TRUE) %>% melt(id.vars = "artDate")%>% group_by(artDate)%>% mutate(total_value =sum(value))%>% ggplot( aes(x=artDate, y=value/total_value, fill=variable)) + geom_bar(stat = "identity") + ylab("proportion") + scale_fill_manual(values=mycolors)+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + theme(text = element_text(family='STHeitiTC-Light')) ``` ## 網路圖 ```{r} # 選取所需欄位 allReviews <- allReviews %>% select(artUrl, cmtPoster, cmtStatus, cmtContent) ``` ## 發文者數量 ```{r} length(unique(allPosts$artPoster)) ``` ## 回覆者數量 ```{r} length(unique(allReviews$cmtPoster)) ``` ## 總共參與人數 ```{r} allPoster <- c(allPosts$artPoster, allReviews$cmtPoster) length(unique(allPoster)) ``` ### 整理所有參與人 ```{r} # 整理所有出現過得使用者 # 如果它曾發過文的話就標註他爲poster # 如果沒有發過文的話則標註他爲replyer userList <- data.frame(user=unique(allPoster)) %>% mutate(type=ifelse(user%in%allPosts$artPoster, "poster", "replyer")) userList ``` ## 建立社群網路圖 ```{r} postsReviews <- merge(x = allPosts, y = allReviews, by = "artUrl") postsReviews ``` ### 篩選欄位 ```{r} # 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位 link <- postsReviews %>% select(cmtPoster, artPoster, artUrl) link ``` ### 建立網路關係 ```{r} reviewNetwork <- graph_from_data_frame(d=link, directed=T) reviewNetwork ``` ### 網路圖 ```{r} # 畫出網路圖 plot(reviewNetwork) ``` ### 調整參數 ```{r} # 把點點的大小和線的粗細調小,並不顯示使用者賬號。 plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA) ``` # 資料篩選 ### 挑出2019-11-21當天的文章和它的回覆 ```{r} link <- postsReviews %>% filter(artDate == as.Date('2019-11-21')) %>% select(cmtPoster, artPoster, artUrl) %>% unique() link ``` ### 過濾圖中的點(v) ```{r} # 這邊要篩選link中有出現的使用者 # 因爲如果userList(igraph中graph_from_data_frame的v參數吃的那個東西)中出現了沒有在link中出現的使用者 # 也會被igraph畫上去,圖片就會變得沒有意義 # 想要看會變怎麼樣的人可以跑一下下面的code filtered_user <- userList %>% filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% arrange(desc(type)) filtered_user ``` ### 過濾使用者後 ```{r} set.seed(487) # 建立網路關係圖,因爲剛剛看的時候感覺箭頭有點礙眼, # 所以這裏我們先把關係的方向性拿掉,減少圖片中的不必要的資訊 reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F) plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA) ``` ### 加強圖像的顯示資訊(1) ```{r} set.seed(487) # 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示 V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue") plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA) ``` ### 加強圖像的顯示資訊(2) ```{r} set.seed(487) # 篩選要顯示出的使用者,以免圖形被密密麻麻的文字覆蓋 # 顯示有超過5個關聯的使用者賬號 labels <- degree(reviewNetwork) V(reviewNetwork)$label <- names(labels) V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue") plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.2, vertex.label=ifelse(degree(reviewNetwork) > 5, V(reviewNetwork)$label, NA), vertex.label.font=2) ``` ```{r} new_lda_allPosts5 = ldas_allPosts[[3]] ## 選定topic 為5 的結果 topics_allPosts5 <- tidy(new_lda_allPosts5, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。 ``` ```{r} remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界") top_terms_allPosts5 <- topics_allPosts5 %>% filter(!term %in% remove_words)%>% group_by(topic) %>% top_n(10, beta) %>% ungroup() %>% arrange(topic, -beta) top_terms_allPosts5 %>% mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) + scale_fill_manual(values=mycolors)+ facet_wrap(~ topic, scales = "free") + coord_flip()+ theme(text = element_text(family='STHeitiTC-Light')) ``` ```{r} # 使用LDA分類每篇文章的主題 animate_topics <- tidy(new_lda_allPosts5, matrix="gamma") %>% # 在tidy function中使用參數"gamma"來取得 theta矩陣。 group_by(document) %>% top_n(1, wt=gamma) animate_topics ``` ### LDA主題進行視覺化 ```{r} # 把文章資訊和主題join起來 postsReviews <- merge(x = postsReviews, y = animate_topics, by.x = "artUrl", by.y="document") postsReviews ``` ```{r} # 挑選出2019/11/21後的文章, # 篩選有在15篇以上文章回覆者, # 文章主題歸類為1(口罩預購)與3(各國疫情)者, # 欄位只取:cmtPoster(評論者), artPoster(發文者), artUrl(文章連結), topic(主題) link <- postsReviews %>% filter(artDate > as.Date('2019-11-21')) %>% group_by(cmtPoster, artUrl) %>% filter(n()>15) %>% ungroup() %>% filter(topic == 6 | topic == 8 | topic == 9 | topic == 10) %>% select(cmtPoster, artPoster, artUrl, topic) %>% unique() link ``` ```{r} # 篩選link中有出現的使用者 filtered_user <- userList %>% filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% arrange(desc(type)) filtered_user ``` ### 使用者經常參與的文章種類 ```{r} # 建立網路關係 reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F) # 依據使用者身份對點進行上色 labels <- degree(reviewNetwork) V(reviewNetwork)$label <- names(labels) V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue") # 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色 E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred") # 畫出社群網路圖 set.seed(5432) plot(reviewNetwork, vertex.size=3, edge.width=3, vertex.label.dist=1, vertex.label=ifelse(degree(reviewNetwork) > 7, V(reviewNetwork)$label, NA),vertex.label.font=2) # 加入標示 legend("bottomright", c("發文者","回文者"), pch=21, col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1) legend("topleft", c("冰雪奇緣","怪獸"), col=c("lightgreen","palevioletred"), lty=1, cex=1) ``` ### 使用者是否受到歡迎 ```{r} # PTT的回覆有三種,推文、噓文、箭頭 # 我們只要看推噓就好,因此把箭頭清掉 link <- postsReviews %>% filter(cmtStatus!="→") %>% group_by(cmtPoster, artUrl) %>% filter(n()>5) %>% ungroup() %>% select(cmtPoster, artPoster, artUrl, cmtStatus) %>% unique() # 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述 # 篩選link中有出現的使用者 filtered_user <- userList %>% filter(user%in%link$cmtPoster | user%in%link$artPoster) %>% arrange(desc(type)) # 建立網路關係 reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F) # 依據使用者身份對點進行上色 labels <- degree(reviewNetwork) V(reviewNetwork)$label <- names(labels) V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue") # 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色 E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred") # 畫出社群網路圖 set.seed(5432) plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1, vertex.label=ifelse(degree(reviewNetwork) > 5, V(reviewNetwork)$label, NA),vertex.label.font=2) # 顯示中文 op <- par(family = "STHeitiTC-Light") # 加入標示 legend("bottomright", c("發文者","回文者"), pch=21, col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1) legend("topleft", c("推","噓"), col=c("lightgreen","palevioletred"), lty=1, cex=1) ```