如果無法轉成 DocumentTermMatrix
--> 出現 Rcpp "didn't have previous_save" 之類的 error
把Rcpp整個檔案刪掉
再 install.packages("Rcpp")
wordcloud 裡面包含5個txt檔
檔案連結點我
下載後看放哪裡,再改dir的位置哈哈。
分段講解
文本處理
# Load the text-mining package and read every file in `dir`
# into a corpus (one document per file).
library(tm)
dir <- "C:\\Users\\user\\Desktop\\wordcloud"
src <- DirSource(dir)
docs <- Corpus(src)
docs
Corpus 可以用三種方式來讀取資料
這次是用DirSource的方式,三者差別
輸出Corpus
# Write each document of a corpus back out as a plain-text file under `path`.
writeCorpus(Your_Corpus_Name,path = dir)
建立語料庫
# Build the corpus again and dump its contents for a first look.
library(tm)
dir <- "C:\\Users\\user\\Desktop\\wordcloud"
src <- DirSource(dir)
docs <- Corpus(src)
docs
inspect(docs)
可以直接呼叫docs or 用 inspect() 來查看 Corpus
建立語意資料庫
移除標點符號時,像是 don't 的 單引號也會被移除,剩下 dont
# Normalise the corpus before building the document-term matrix.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
# add two extra stop words: 'news' and 'via'
myStopwords <- c(stopwords("english"), "news", "via")
# remove 'r' and 'big' from stopwords
myStopwords <- setdiff(myStopwords, c("r", "big"))
# Remove stop words BEFORE stripping punctuation: the english stop-word
# list keeps contracted forms such as "don't", which removePunctuation
# would otherwise mangle to "dont" so the filter would miss them.
docs <- tm_map(docs, removeWords, myStopwords)
docs <- tm_map(docs, removePunctuation)
# removeWords leaves blanks behind, so collapse runs of whitespace last.
docs <- tm_map(docs, stripWhitespace)
# stem words (Porter stemmer, e.g. "village" -> "villag")
docs <- tm_map(docs, stemDocument)
writeLines(as.character(docs[[1]]))
docs[[n]] 是指語料庫中的第幾個文件,從1開始(此次文件有5個,所以是1~5)
尚未處理的writeline
處理過的writeline
# Convert the cleaned corpus into a document-term matrix and
# tally how often every term appears across all documents.
dtm <- DocumentTermMatrix(docs)
dtm
term_matrix <- as.matrix(dtm)
freq <- colSums(term_matrix)
# Sanity check: the count of tallied terms matches the dtm's term count.
length(freq) == 748
轉換後把matrix叫出來查看,注意看terms(有幾個字)
用freq來計算每個字出現的頻率
最後用length來check一下
748 = terms 個數 (以此例來說)
Result + 畫圖展示
# Rank every term by how often it occurs (highest first).
ord <- order(freq, decreasing = TRUE)
# inspect most frequent occurring terms
head(freq[ord])
# inspect least frequent occurring terms
tail(freq[ord])
# lowfreq = minimum number of occurrences for a term to be listed
findFreqTerms(dtm, lowfreq = 7)
# findAssocs(dtm, term, corlimit): terms whose occurrence pattern across
# documents correlates with `term` at >= corlimit (0..1). Note: this is
# a correlation threshold, NOT a co-occurrence probability.
findAssocs(dtm,'die',0.6)
# One row per term with its total occurrence count.
wf <- data.frame(term = names(freq), occurrences = freq)
library(ggplot2)
# Filter on the data frame's own column. The original `freq >= 7` only
# worked because the global `freq` vector happens to share wf's row order.
p <- ggplot(subset(wf, occurrences >= 7), aes(term, occurrences))
# stat = 'identity': bar height comes from `occurrences`, not row counts.
p <- p + geom_bar(stat='identity')
# 'best' is not a valid ggplot2 legend.position; this plot has no legend,
# so suppress it explicitly with 'none'.
p <- p + theme(axis.text.x = element_text(angle = 20, hjust = 1),
               plot.title = element_text(hjust = 0.5),
               legend.position = 'none')
p <- p + ggtitle('Term-occurrence histogram (freq>=7)')
p
可以在freq那邊選擇要展示的文字之出現頻率至少大於多少才可以,這邊是用 >= 7
還有要注意的點是 stat = 'identity' 要寫 --> 因為 default = 'count'
如果是 count 的話,他會計算在 dataframe 中每個字詞出現的次數 --> 也就是 1
所以會變成高度全部都是 1 的 bar 圖 --> not good
# Install wordcloud only when it is missing, instead of re-downloading
# the package on every run of the script.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)
# Fixed seed so the cloud layout is reproducible between runs.
set.seed(42)
#limit words by specifying min frequency
# brewer.pal(n, palette): pick n colours (n >= 3) from an RColorBrewer palette
wordcloud(names(freq), freq, min.freq = 5, colors = brewer.pal(6, 'Paired'))
# more brewer: https://www.datanovia.com/en/blog/the-a-z-of-rcolorbrewer-palette/
min.freq = n = 至少出現n次的字詞才會顯示在wordcloud上面
set.seed(42)只是為了能重現同一種雲
討論
有發現一些字詞的尾部被切掉了,例如 village --> villag
這是 stemDocument(Porter stemming)取詞幹造成的預期結果,可能後續還要查詢一下這方面的資訊
不過作為email分析的前置練習
我覺得還行
完整CODE
# Full pipeline: read text files into a corpus, clean them, count term
# frequencies, then draw a frequency bar chart and a word cloud.
library(tm)

dir <- "C:\\Users\\user\\Desktop\\wordcloud"
docs <- Corpus(DirSource(dir))
docs
getTransformations()
writeLines(as.character(docs[[1]]))

# --- Cleaning ---------------------------------------------------------
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeNumbers)
# add two extra stop words: 'news' and 'via'
myStopwords <- c(stopwords("english"), "news", "via")
# remove 'r' and 'big' from stopwords
# myStopwords <- setdiff(myStopwords, c("r", "big"))
# Remove stop words BEFORE stripping punctuation: the english stop-word
# list keeps contracted forms such as "don't", which removePunctuation
# would otherwise mangle to "dont" so the filter would miss them.
docs <- tm_map(docs, removeWords, myStopwords)
docs <- tm_map(docs, removePunctuation)
# removeWords leaves blanks behind, so collapse runs of whitespace last.
docs <- tm_map(docs, stripWhitespace)
# stem words (Porter stemmer, e.g. "village" -> "villag")
docs <- tm_map(docs, stemDocument)
writeLines(as.character(docs[[1]]))

# --- Term counts ------------------------------------------------------
dtm <- DocumentTermMatrix(docs)
dtm
freq <- colSums(as.matrix(dtm))
# Sanity check: the count of tallied terms matches the dtm's term count.
length(freq) == 748
ord <- order(freq, decreasing = TRUE)
#inspect most frequent occurring terms
freq[head(ord)]
#inspect least frequent occurring terms
freq[tail(ord)]
# lowfreq = minimum number of occurrences for a term to be listed
findFreqTerms(dtm, lowfreq = 7)
# findAssocs: terms whose occurrence pattern across documents correlates
# with 'die' at >= 0.6 (a correlation threshold, not a probability).
findAssocs(dtm, 'die', 0.6)

# --- Bar chart --------------------------------------------------------
wf <- data.frame(term = names(freq), occurrences = freq)
library(ggplot2)
# Filter on the data frame's own column, not the global `freq` vector
# (the latter only matched by row-order accident).
p <- ggplot(subset(wf, occurrences >= 7), aes(term, occurrences))
# stat = 'identity': bar height comes from `occurrences`, not row counts.
p <- p + geom_bar(stat='identity')
# 'best' is not a valid ggplot2 legend.position; no legend is drawn here.
p <- p + theme(axis.text.x = element_text(angle = 20, hjust = 1),
               plot.title = element_text(hjust = 0.5),
               legend.position = 'none')
p <- p + ggtitle('Term-occurrence histogram (freq>=7)')
p

# --- Word cloud -------------------------------------------------------
library(wordcloud)
#setting the same seed each time ensures consistent look across clouds
set.seed(42)
#limit words by specifying min frequency
# brewer.pal(n, palette): pick n colours (n >= 3) from an RColorBrewer palette
wordcloud(names(freq), freq, min.freq = 5, colors = brewer.pal(6, 'Paired'))
# more brewer: https://www.datanovia.com/en/blog/the-a-z-of-rcolorbrewer-palette/
reference
R
beginner
cat
tutorial