## LDA
```{r eval=FALSE}
ldas_allPosts = c()
topics = c(3, 5, 10, 25, 36)
for(topic in topics){
start_time <- Sys.time()
lda_allPosts <- LDA(allPosts_dtm, k = topic, control = list(seed = 2020))
ldas_allPosts =c(ldas_allPosts,lda_allPosts)
print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
save(ldas_allPosts,file = "ldas_result_allPosts")
}
```
```{r}
load("ldas_result_allPosts")
```
```{r}
topics = c(3, 5, 10, 25, 36)
data_frame(k = topics,
perplex = map_dbl(ldas_allPosts, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
```
```{r}
new_lda_allPosts = ldas_allPosts[[3]] ## 選定topic 為10 的結果
topics_allPosts <- tidy(new_lda_allPosts, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
```
```{r}
remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界")
top_terms_allPosts <- topics_allPosts %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_allPosts %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))
```
## 主題
```{r}
topic_name = c('票房', '動物方城市', '冰雪奇緣', '皮克斯', '無敵破壞王', '大英雄天團', '怪獸', '腦筋急轉彎', '冰雪', '玩具總動員')
```
```{r}
tmResult <- posterior(new_lda_allPosts)
doc_pro <- tmResult$topics
dim(doc_pro)
```
```{r}
# get document topic proportions
allPosts_data <- allPosts %>%
select(artTitle, artDate, artTime, artUrl, artPoster, artCat)
document_topics <- doc_pro[allPosts_data$artUrl,]
document_topics_df = data.frame(document_topics)
colnames(document_topics_df) = topic_name
rownames(document_topics_df) = NULL
news_topic = cbind(allPosts_data, document_topics_df)
news_topic %>% head(10)
```
## 查看特定主題
```{r , eval=FALSE}
news_topic %>%
arrange(desc(`冰雪`)) %>% head(10)
```
## 了解主題在時間的變化
```{r eval=FALSE}
news_topic[,c(7:16)] =sapply(news_topic[,c(7:16)] , as.numeric)
news_topic %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
ggplot( aes(x=artDate, y=value, fill=variable)) +
geom_bar(stat = "identity") + ylab("value") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))
```
## 以比例了解主題時間變化
```{r}
news_topic %>%
group_by(artDate = format(artDate,'%Y%m')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors)+
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(text = element_text(family='STHeitiTC-Light'))
```
## 網路圖
```{r}
# 選取所需欄位
allReviews <- allReviews %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
```
## 發文者數量
```{r}
length(unique(allPosts$artPoster))
```
## 回覆者數量
```{r}
length(unique(allReviews$cmtPoster))
```
## 總共參與人數
```{r}
allPoster <- c(allPosts$artPoster, allReviews$cmtPoster)
length(unique(allPoster))
```
### 整理所有參與人
```{r}
# 整理所有出現過得使用者
# 如果它曾發過文的話就標註他爲poster
# 如果沒有發過文的話則標註他爲replyer
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%allPosts$artPoster, "poster", "replyer"))
userList
```
## 建立社群網路圖
```{r}
postsReviews <- merge(x = allPosts, y = allReviews, by = "artUrl")
postsReviews
```
### 篩選欄位
```{r}
# 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位
link <- postsReviews %>%
select(cmtPoster, artPoster, artUrl)
link
```
### 建立網路關係
```{r}
reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
```
### 網路圖
```{r}
# 畫出網路圖
plot(reviewNetwork)
```
### 調整參數
```{r}
# 把點點的大小和線的粗細調小,並不顯示使用者賬號。
plot(reviewNetwork, vertex.size=2, edge.arrow.size=.2,vertex.label=NA)
```
# 資料篩選
### 挑出2019-11-21當天的文章和它的回覆
```{r}
link <- postsReviews %>%
filter(artDate == as.Date('2019-11-21')) %>%
select(cmtPoster, artPoster, artUrl) %>%
unique()
link
```
### 過濾圖中的點(v)
```{r}
# 這邊要篩選link中有出現的使用者
# 因爲如果userList(igraph中graph_from_data_frame的v參數吃的那個東西)中出現了沒有在link中出現的使用者
# 也會被igraph畫上去,圖片就會變得沒有意義
# 想要看會變怎麼樣的人可以跑一下下面的code
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
filtered_user
```
### 過濾使用者後
```{r}
set.seed(487)
# 建立網路關係圖,因爲剛剛看的時候感覺箭頭有點礙眼,
# 所以這裏我們先把關係的方向性拿掉,減少圖片中的不必要的資訊
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)
```
### 加強圖像的顯示資訊(1)
```{r}
set.seed(487)
# 用使用者的身份來區分點的顏色,如果有發文的話是金色的,只有回覆文章的則用淺藍色表示
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)
```
### 加強圖像的顯示資訊(2)
```{r}
set.seed(487)
# 篩選要顯示出的使用者,以免圖形被密密麻麻的文字覆蓋
# 顯示有超過5個關聯的使用者賬號
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.2,
vertex.label=ifelse(degree(reviewNetwork) > 5, V(reviewNetwork)$label, NA), vertex.label.font=2)
```
```{r}
new_lda_allPosts5 = ldas_allPosts[[3]] ## 選定topic 為5 的結果
topics_allPosts5 <- tidy(new_lda_allPosts5, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
```
```{r}
remove_words = c("冰雪奇緣", "迪士尼", "艾莎", "安娜", "無敵破壞王", "電影", "劇情", "動畫", "這部", "角色", "公主", "我們", "網路", "世界")
top_terms_allPosts5 <- topics_allPosts5 %>%
filter(!term %in% remove_words)%>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_allPosts5 %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
coord_flip()+
theme(text = element_text(family='STHeitiTC-Light'))
```
```{r}
# 使用LDA分類每篇文章的主題
animate_topics <- tidy(new_lda_allPosts5, matrix="gamma") %>% # 在tidy function中使用參數"gamma"來取得 theta矩陣。
group_by(document) %>%
top_n(1, wt=gamma)
animate_topics
```
### LDA主題進行視覺化
```{r}
# 把文章資訊和主題join起來
postsReviews <- merge(x = postsReviews, y = animate_topics, by.x = "artUrl", by.y="document")
postsReviews
```
```{r}
# 挑選出2019/11/21後的文章,
# 篩選有在15篇以上文章回覆者,
# 文章主題歸類為1(口罩預購)與3(各國疫情)者,
# 欄位只取:cmtPoster(評論者), artPoster(發文者), artUrl(文章連結), topic(主題)
link <- postsReviews %>%
filter(artDate > as.Date('2019-11-21')) %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>15) %>%
ungroup() %>%
filter(topic == 6 | topic == 8 | topic == 9 | topic == 10) %>%
select(cmtPoster, artPoster, artUrl, topic) %>%
unique()
link
```
```{r}
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
filtered_user
```
### 使用者經常參與的文章種類
```{r}
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$topic == "1", "lightgreen", "palevioletred")
# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=3, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 7, V(reviewNetwork)$label, NA),vertex.label.font=2)
# 加入標示
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("冰雪奇緣","怪獸"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
```
### 使用者是否受到歡迎
```{r}
# PTT的回覆有三種,推文、噓文、箭頭
# 我們只要看推噓就好,因此把箭頭清掉
link <- postsReviews %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>5) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述
# 篩選link中有出現的使用者
filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
vertex.label=ifelse(degree(reviewNetwork) > 5, V(reviewNetwork)$label, NA),vertex.label.font=2)
# 顯示中文
op <- par(family = "STHeitiTC-Light")
# 加入標示
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)
```