---
GA: UA-159972578-2
---
###### tags: `R` `Data Processing` `資料前處理` `Regex` `正則表達式` `文字分析` `Text Mining` `情緒分析` `Sentiment Analysis`
# Simpsons(English)_Sentiment Analysis
Reference: [Text Mining Ebook](https://www.tidytextmining.com)
See More Detail Here: [Rpubs Ver.(Completed)](https://rpubs.com/RitaTang/text_simpsons_final)
# 文本探索
## 讀取資料
```{r warning=FALSE, message=FALSE, cache=FALSE, error=FALSE}
# Load (and install if missing) all packages used in this document.
pacman::p_load(tidytext, dplyr, ggplot2, magrittr, plotly, readr, stringr, wordcloud2, wordcloud, reshape2)
# Read the full subtitle corpus and the custom stop-word list,
# each as a single character string.
simps <- read_file("./simps/all.txt")
stpw <- read_file("./simps/stpw.txt")
```
## 文字前處理
### 查看斷詞結果
```{r echo=FALSE}
# Break the stop-word file into one word per row.
stpw = tibble(text = stpw) %>%
  unnest_tokens(word, text)

# Tokenize the subtitles the same way, then drop the stop words.
# NOTE(review): `simps` is a single string from read_file(), so `line`
# is always 1 here — confirm whether per-line numbering was intended.
simps = tibble(line = seq_along(simps), text = simps) %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stpw$word)
```
### 標註集數
```{r}
# Tag every word with its episode number: marker tokens "ep1".."ep16"
# appear in the stream, and cumsum() over their positions assigns each
# following word to the current episode.
# Fix: the original pattern "^ep[1-9]|1[6]$" parsed as (^ep[1-9])|(1[6]$),
# so "ep10"-"ep15" were never matched as anchored markers and any token
# ending in "16" counted as one. Group the alternation instead.
simps_ep = simps %>% filter(nchar(word)>1) %>%
  mutate(episode = cumsum(str_detect(word, regex("^ep([1-9]|1[0-6])$"))))
```
# 情緒分析
## 情緒字典介紹
+ 英文(get_sentiments)
+ NRC (10 Category)
+ 34% (+): joy, positive, surprise, trust
+ 66% (-): anger, anticipation, disgust, fear, negative, sadness
+ Afinn (Numeric: -5~+5, mean: -0.6)
+ Bing (2 Category: 30% Positive, 70% Negative)
+ Loughran (6 Category)
+ This dictionary was developed based on analyses of <b>financial reports.</b>
+ (+): positive
+ (-): constraining, litigious, negative, superfluous, uncertainty
## 準備字典
```{r}
# Fetch the four sentiment lexicons used below.
afinn = get_sentiments("afinn")
nrc = get_sentiments("nrc")
bing = get_sentiments("bing")
loughran = get_sentiments("loughran")
# Preview each lexicon. The original `sapply(c(...), head)` was wrong:
# `c()` on data frames splices them into one flat list of COLUMNS, so it
# previewed individual columns rather than the four dictionaries. A named
# list keeps each lexicon intact.
lapply(list(afinn = afinn, nrc = nrc, bing = bing, loughran = loughran), head)
```
## 以Bing字典判斷辛普森一家台詞的情緒傾向
```{r warning=FALSE, message=FALSE, cache=FALSE, error=FALSE}
# Overall polarity of the scripts according to the Bing lexicon.
simps_ep %>%
  select(word) %>%
  inner_join(bing) %>%
  count(sentiment, name = "cnt")

# Per-episode word frequencies, sorted most-frequent first
# (result stays grouped by episode, as before).
simps_cnt = simps_ep %>%
  count(episode, word, name = "count") %>%
  group_by(episode) %>%
  arrange(desc(count))
```
```
sentiment cnt
<chr> <int>
negative 1176
positive 919
```
## 分析四種字典每集情緒
```{r tidy=FALSE, results="asis", message=FALSE}
# Distribution of AFINN sentiment scores per episode.
afinn_by_ep = simps_cnt %>%
  inner_join(afinn) %>%
  group_by(episode, value) %>%
  summarise(cnt = sum(count))
p_afinn = ggplot(afinn_by_ep,
                 aes(x = episode, y = value, colour = as.factor(episode))) +
  geom_boxplot() +
  ggtitle("Afinn-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(text = element_text(family = "蘋方-繁 中黑體", size = 12),
        plot.title = element_text(hjust = 0.5))
ggplotly(p_afinn)
```

+ 對Afinn而言,第2, 12, 15集平均值高於總體;第4, 8集低於總體
```{r tidy=FALSE, results="asis", message=FALSE}
# Per-episode counts for each of the ten NRC sentiment categories.
nrc_by_ep = simps_cnt %>%
  inner_join(nrc) %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count))
p_nrc = ggplot(nrc_by_ep, aes(x = episode, y = cnt, colour = sentiment)) +
  geom_line() +
  ggtitle("NRC-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(text = element_text(family = "蘋方-繁 中黑體", size = 12),
        plot.title = element_text(hjust = 0.5))
ggplotly(p_nrc)
```

+ 對NRC而言,第9集特別正面
```{r message=FALSE}
# Per-episode positive vs negative word counts (Bing lexicon).
bing_by_ep = simps_cnt %>%
  inner_join(bing) %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count))
p_bing = ggplot(bing_by_ep, aes(x = episode, y = cnt, colour = sentiment)) +
  geom_line() +
  ggtitle("Bing-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(text = element_text(family = "蘋方-繁 中黑體", size = 12),
        plot.title = element_text(hjust = 0.5))
ggplotly(p_bing)
```

+ 對Bing而言,差距最大的為第10集(負面),差距最小為第12集(中性)
```{r message=FALSE}
# Per-episode counts for the six Loughran-McDonald categories.
loughran_by_ep = simps_cnt %>%
  inner_join(loughran) %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count))
p_loughran = ggplot(loughran_by_ep,
                    aes(x = episode, y = cnt, colour = sentiment)) +
  geom_line() +
  ggtitle("Loughran-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(text = element_text(family = "蘋方-繁 中黑體", size = 12),
        plot.title = element_text(hjust = 0.5))
ggplotly(p_loughran)
```

+ 對Loughran而言,第14集最負面
+ 整體來說,Afinn較為中性,NRC正面>負面,其餘字典則是負面>正面
# 文字雲視覺化
## 整體字幕庫文字雲
```{r}
# Corpus-wide word frequencies, keeping only words seen more than 30 times.
tokens_count = simps_ep %>%
  count(word, name = "count") %>%
  filter(count > 30) %>%
  arrange(desc(count))
head(tokens_count, 10)

# JS color callback: very frequent words (weight > 65) in bright red,
# the rest in a muted tone.
js_color_fun = "function (word, weight) {
return (weight > 65) ? '#f02222' : '#c09292';
}"
wordcloud2(tokens_count, color = htmlwidgets::JS(js_color_fun), backgroundColor = "black")
```

## 分析第13集中的正負面詞彙
```{r}
# Word counts for episode 13 only, labelled with Bing polarity.
ep13_sentiment = simps_ep %>%
  filter(episode == 13) %>%
  count(word, name = "count") %>%
  inner_join(bing) %>%
  group_by(word, sentiment) %>%
  summarise(count = sum(count)) %>%
  filter(count > 1) # words appearing only once are too numerous — drop them

# Cast to a word x sentiment matrix and draw the comparison cloud.
ep13_sentiment %>%
  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(colors = c("gray80", "gray20"),
                   max.words = 100, family = "蘋方-繁 中黑體")
```
