# Project of Food Forum (01)
### PTT
```r=
# ---- Load and clean the PTT post data ----
ptt_content <- read.csv("/Users/lialee/Desktop/ptt_content.csv")
# Keep only posts whose numeric area code is 1-4.
ptt_content <- ptt_content[ptt_content$area %in% 1:4, ]
# Drop duplicated posts (same content AND url).
ptt_content <- ptt_content %>%
  distinct(content, url, .keep_all = TRUE)
unique(ptt_content$city)
# Lookup table: numeric area code -> region label.
# Built with data.frame() directly; the original data.frame(cbind(a, b))
# coerced the integer codes to character via the matrix step.
follow <- data.frame(a = c("北", "中", "南", "東", "離島"), b = 1:5)
ptt_content <- merge(follow, ptt_content, by.x = "b", by.y = "area", all.x = TRUE)
ptt_content <- ptt_content[, -1]                        # drop the join key
ptt_content <- ptt_content[!is.na(ptt_content$city), ]  # keep rows with a city
# Normalise the free-text city strings to one canonical county/city each;
# city_fix holds exactly one canonical name per element of city_raw.
city_raw <- unique(ptt_content$city)
city_fix <- c("台北市","桃園市","新竹市","基隆市","新北市","宜蘭縣","桃園市","新竹縣","新竹縣","台北市",
"桃園市","台中市","雲林縣","彰化縣","苗栗縣","苗栗縣","雲林縣","南投縣","雲林縣","台中市",
"南投縣","南投縣","雲林縣","苗栗縣","彰化縣","南投縣","雲林縣","雲林縣","苗栗縣","雲林縣",
"雲林縣","雲林縣","雲林縣","南投縣","高雄市","台南市","嘉義市","屏東縣","高雄市","嘉義縣",
"台南市","台南市","台南市","嘉義縣","台南市","桃園市","嘉義市","嘉義縣","高雄市","花蓮縣",
"台東縣","台東縣")
find <- data.frame(c = city_raw, d = city_fix)
ptt_content <- merge(find, ptt_content, by.x = "c", by.y = "city", all.x = TRUE)
ptt_content <- ptt_content[, -1]
colnames(ptt_content) <- c("city","area","id","date","year","month","day","weekday","time",
"ip","title","content","push","boo","total")
sapply(ptt_content, class)
ptt_content$year <- as.numeric(ptt_content$year)
# ---- Posts per year, one line per weekday ----
ppt <- ptt_content %>%
  select(year, weekday, id) %>%
  group_by(year, weekday) %>%
  summarise(total_post = n())
ggplot() +
  geom_line(aes(x = year, y = total_post, color = weekday), data = ppt, size = 0.8) +
  # BUG FIX: the legend title said "性別" (gender) but the colour aesthetic
  # maps weekday — label it "星期" instead.
  scale_colour_manual(name = "星期",
                      values = c("Red","Orange","Yellow","Green","Dodger Blue","Navy","Blue Violet"),
                      labels = c("一","二","三","四","五","六","日")) +
  labs(x = "年度",
       y = "發文總數",
       title = "PTT美食版討論度分析",
       caption = "版權歸小鎮村民擁有") +
  scale_y_continuous(breaks = c(0, 500, 1500, 2500, 3500)) +
  theme_minimal() +
  theme(text = element_text(family = "Heiti TC Light"), legend.position = 'top',
        plot.title = element_text(color = "Royal Blue", hjust = 0.5, face = 'bold', size = 18),
        legend.title = element_text(face = "bold"),
        axis.title = element_text(color = "Steel Blue", face = "bold", size = 14))
```

```r=
# Dcard posts with a known city (reused by the Dcard section below).
dcard_post <- dcard_area[!is.na(dcard_area$City),]
unique(dcard_post$City)
# ---- Total PTT posts per region/city ----
ptt_section <- ptt_content %>%
  select(year, id, area, city) %>%
  group_by(area, city) %>%
  summarise(Total_Post = n())
# ---- Bar chart ----
ptt_section %>%
  ggplot(aes(x = city, y = Total_Post, fill = area)) +
  # BUG FIX: geom_histogram(stat = "identity") is the wrong geometry for
  # pre-aggregated counts; geom_col() is the intended bar-of-values geom.
  geom_col() +
  theme_minimal() +
  theme(text = element_text(family = "Heiti TC Light"), legend.position = 'top') +
  scale_fill_hue(h = c(60, 250)) +
  labs(x = "城市",
       y = "發文總數",
       title = "PTT美食版討論度分析",
       caption = "版權歸小鎮村民擁有", fill = "區域") +
  scale_y_continuous(breaks = seq(0, 7000, by = 500))
```
### Dcard
```r=
# ---- Total Dcard posts per region/city ----
# BUG FIX: the original factored dcard_area$Area (the unfiltered frame)
# into dcard_post — a length mismatch after the NA-City rows were dropped.
dcard_post$Area <- factor(dcard_post$Area, levels = c("北","中","南","東","離島"))
# Build the per-city summary BEFORE setting its factor levels
# (the original set d_post$City levels before d_post existed).
d_post <- dcard_post %>%
  select(Season, ID, Area, City) %>%
  group_by(Area, City) %>%
  summarise(Total_Post = n())
# Order cities roughly north-to-south for the x axis.
d_post$City <- factor(d_post$City, levels = c("台北市","新北市","基隆市","桃園市",
"新竹市","新竹縣","苗栗縣", "台中市",
"彰化縣","雲林縣","南投縣","嘉義市","嘉義縣","台南市",
"高雄市","屏東縣","宜蘭縣","花蓮縣","台東縣",
"澎湖縣","金門縣"))
# ---- Bar chart ----
d_post %>%
  ggplot(aes(x = City, y = Total_Post, fill = Area)) +
  geom_col() +  # pre-aggregated counts: geom_col, not geom_histogram
  theme_minimal() +
  theme(text = element_text(family = "Heiti TC Light"), legend.position = 'top',
        plot.title = element_text(color = "Royal Blue", hjust = 0.5, face = 'bold', size = 18),
        legend.title = element_text(face = "bold"),
        axis.title = element_text(color = "Steel Blue", face = "bold", size = 14)) +
  scale_fill_hue(h = c(60, 250)) +
  labs(x = "城市",
       y = "發文總數",
       title = "Dcard美食版討論度分析(二)",
       caption = "版權歸小鎮村民擁有", fill = "區域") +
  scale_y_continuous(breaks = seq(0, 7000, by = 500))
```

```r=
# ---- Dcard monthly post volume, one line per gender ----
dcard_area %>%
  select(Gender, ID, YnM) %>%
  filter(Gender %in% c("F","M")) %>%   # drop unclean gender values
  group_by(Gender, YnM) %>%
  summarise(Total_Post = n()) %>%
  ggplot(aes(x = YnM, y = Total_Post, color = Gender)) +
  geom_line(size = 0.8) +
  theme_minimal() +
  theme(text = element_text(family = "Heiti TC Light"), legend.position = 'top',
        plot.title = element_text(color = "Royal Blue", hjust = 0.5, face = 'bold', size = 18),
        legend.title = element_text(face = "bold"),
        axis.title = element_text(color = "Steel Blue", face = "bold", size = 14),
        axis.line = element_line(size = 1, colour = "Steel Blue")) +
  scale_colour_manual(name = "性別",
                      values = c("Tomato","Medium Sea Green"),
                      labels = c("女性","男性")) +
  labs(x = "時間",
       y = "發文總數",
       title = "Dcard美食版討論度分析",
       caption = "版權歸小鎮村民擁有") +
  scale_y_continuous(breaks = seq(0, 1500, by = 250)) +
  # BUG FIX: YnM is converted to Date in the prep section, so the x scale
  # must be scale_x_date (scale_x_datetime only accepts POSIXct), and the
  # interval string belongs in `date_breaks`, not `breaks`.
  scale_x_date(date_labels = "%Y-%m", date_breaks = "6 months")
```

```r=
# ---- Keyword frequency: male vs female articles, northern Taiwan ----
# 15 hand-picked food keywords, selected by row index from TCM.
TCM_15 <- TCM[c(4,6,11,17,29,34,35,42,45,46,48,49,50,52,56),]
TCM_15 %>%
  ggplot(aes(x = Male.North, y = Female.North, color = Keywords)) +
  geom_point() +
  theme_minimal() +
  theme(legend.position = 'none', text = element_text(family = "Heiti TC Light"),
        plot.title = element_text(color = "Royal Blue", hjust = 0.5, face = 'bold', size = 18),
        legend.title = element_text(face = "bold"),
        axis.title = element_text(color = "Steel Blue", face = "bold", size = 14)) +
  labs(x = "男性文章",
       y = "女性文章",
       title = "Dcard北台灣關鍵字頻率分析",
       caption = "版權歸小鎮村民擁有") +
  # BUG FIX: `family` is a fixed setting, not a data mapping — inside aes()
  # ggplot treats the string as an aesthetic value instead of a font name.
  geom_text(aes(label = Keywords), family = "Heiti TC Light",
            hjust = 0.5, vjust = 1.5, size = 5)
```

### MySQL Connection
```r=
# ---- One-off MySQL round trip: read a table, write one back ----
# install.packages("RMySQL")  # one-time setup; do not re-install every run
library(RMySQL)
library(DBI)
# NOTE(review): credentials are hard-coded; prefer environment variables or
# a config file for anything beyond a local demo.
host <- "192.168.35.119"
port <- 3306
dbname <- "dcad_db"   # database name
user <- "lia"         # user name
password <- "lia"     # user password
engine <- dbConnect(RMySQL::MySQL(),
host = host,
port = port,
dbname = dbname,
user = user,
password = password
)
# BUG FIX: dbReadTable() has no fileEncoding/header arguments (those belong
# to read.csv); set the connection character set on the server side instead.
dbExecute(engine, "SET NAMES utf8mb4;")
dcard_mysql <- dbReadTable(engine, name = 'dcard_comment')
View(dcard_mysql)
# Replace the table with the locally prepared dcard_test data frame
# (dcard_test is assumed to exist in the session).
dbWriteTable(engine, name = 'dcard_comment', value = dcard_test, overwrite = TRUE)
dbDisconnect(engine)
```
### Shiny & Mysql
```r=
# ---- Shiny app: look up a Dcard post by ID in MySQL ----
# install.packages("RMySQL")  # one-time setup
library(RMySQL)
library(DBI)
library(shiny)
host <- "192.168.35.119" # AWS RDS endpoint
port <- 3306
dbname <- "dcad_db"      # database name
user <- "lia"            # user name
password <- "lia"        # user password
engine <- dbConnect(RMySQL::MySQL(),
host = host,
port = port,
dbname = dbname,
user = user,
password = password
)
ui <- fluidPage(
#numericInput("nrows", "Enter the number of rows to display:", 5),
textInput("ID", "Enter your ID:", "34838"),
tableOutput("tbl")
)
server <- function(input, output, session) {
  output$tbl <- renderTable({
    # Open a fresh connection per render and always close it on exit.
    engine <- dbConnect(
      drv = RMySQL::MySQL(),
      dbname = dbname,
      host = host,
      username = user,
      password = password)
    on.exit(dbDisconnect(engine), add = TRUE)
    # dbExecute (not dbSendQuery) for statements — dbSendQuery leaves an
    # unfinished result set that should have been dbClearResult()ed.
    dbExecute(engine, "SET NAMES utf8mb4;")
    # SECURITY FIX: never paste user input into SQL (injection risk);
    # let DBI quote the value.
    query <- sqlInterpolate(engine,
                            "SELECT * FROM dcad_db.dcard_content WHERE ID = ?id;",
                            id = input$ID)
    dbGetQuery(engine, query)
    #dbGetQuery(engine, paste0("SELECT * FROM dcad_db.dcard_content Limit ", input$nrows, ";"))
  })
}
shinyApp(ui, server)
# ---- Encoding settings (run against the top-level connection) ----
# BUG FIX: these lines referenced an undefined `con`; the open connection
# in this script is `engine`.
dbExecute(engine, "SET NAMES utf8mb4;")
dbExecute(engine, "SET CHARACTER SET utf8mb4;")
dbExecute(engine, "SET character_set_connection=utf8mb4;")
dbGetQuery(engine, "SHOW VARIABLES WHERE Variable_name LIKE 'character%' OR Variable_name LIKE 'collation%';")
```
### Mysql & Dplyr
```r=
# ---- Query MySQL through dplyr (dbplyr translates the verbs to SQL) ----
library(dplyr)
library(dbplyr)
library(devtools)
# NOTE(review): install.packages() inside a script re-installs on every
# run; normally executed once and then commented out.
install.packages("pool")
library(pool)
# Pooled connection: pool hands out / reclaims DB connections as needed.
# NOTE(review): credentials are hard-coded, and the pool is never closed
# (poolClose(mysql_db)) anywhere in this snippet.
mysql_db <- dbPool(
RMySQL::MySQL(),
dbname = "dcad_db",
host = "192.168.35.119",
username = "lia",
password = "lia"
)
# Lazy table reference; head(5) becomes `LIMIT 5` on the server.
mysql_db %>%
tbl("dcard_content") %>%
head(5)
```
### Association Rule
```r=
# ---- Load the Dcard dump and derive date / season / factor columns ----
library(jsonlite)
library(jiebaR)
dcard_area <- fromJSON("/Users/lialee/Desktop/III Final Project/dcard_final.json")
# Combine year and month into one "YYYY-M" string.
# IMPROVED: vectorised paste() replaces the original hard-coded
# for(i in 1:41761) loop, so it works for any number of rows.
dcard_area$YnM <- paste(dcard_area$Year, dcard_area$Month, sep = "-")
# Parse to POSIXct, then truncate to a Date (first of the month).
library(lubridate)
dcard_area$YnM <- parse_date_time(dcard_area$YnM, orders = "Ym")
dcard_area$YnM <- as.Date(dcard_area$YnM)
# Fix a data-entry error: "東" (east) recorded where "冬" (winter) was meant.
a <- which(dcard_area$Season == "東")
dcard_area$Season[a] <- "冬"
# Season order and region order (north ... islands) as factor levels.
dcard_area$Season <- factor(dcard_area$Season, levels = c("春","夏","秋","冬"))
dcard_area$Area <- factor(dcard_area$Area, levels = c("北","中","南","東","離島"))
# Pre-computed (area, city, post count) summary exported earlier.
d_post <- read.csv("/Users/lialee/Desktop/d_post.csv")
d_post <- d_post[,-1]   # drop the exported row-number column
d_post$Area <- factor(d_post$Area, levels = c("北","中","南","東","離島"))
# BUG FIX: the City levels were originally assigned BEFORE d_post was read,
# which errors (or factors a stale object); set them after loading instead.
d_post$City <- factor(d_post$City, levels = c("台北市","新北市","基隆市","桃園市",
"新竹市","新竹縣","苗栗縣", "台中市",
"彰化縣","雲林縣","南投縣","嘉義市","嘉義縣","台南市",
"高雄市","屏東縣","宜蘭縣","花蓮縣","台東縣",
"澎湖縣","金門縣"))
# Keep content, id, gender for the text-mining steps.
dcard_content <- dcard_area[,c(2,4,5)]
dim(dcard_content)
```
```r=
# ---- Split the corpus by gender and build per-article noun sequences ----
dcard_M <- dcard_content[dcard_content$Gender == 'M',]
dcard_F <- dcard_content[dcard_content$Gender == 'F',]
# Deduplicate (important: the crawl contains repeated posts).
library(dplyr)
dcard_M_unique <- dcard_M %>%
  distinct(ID, Content, .keep_all = TRUE)
dcard_F_unique <- dcard_F %>%
  distinct(ID, Content, .keep_all = TRUE)
# Stop-word list for the segmenter.
stop_word <- '/Users/lialee/Desktop/Programming/TextMining/Data/stop.txt'
# User dictionary (food terms).
user_dic <- "/Users/lialee/Desktop/III Final Project/foodclean.csv"
# Segmenter with POS tagging enabled.
mixseg <- worker(stop_word = stop_word, user = user_dic, type = "tag")
# ---- Clean the article text ----
# NOTE(review): this section is run once per gender — keep exactly ONE of
# the next two lines active (as written the F line overwrites the M line,
# so only the female corpus is processed).
k <- dcard_M_unique$Content
k <- dcard_F_unique$Content
# Strip digits, whitespace, latin letters, '#' and decorative symbols.
k <- gsub('[0-9]+', "", k)
k <- gsub('[[:space:]]', "", k)
k <- gsub('[a-zA-Z]', "", k)
k <- gsub('#', "", k)
k <- gsub('[ ️ ︎ ﹏ ︵ ︶ ︿ ﹃ ꒳]',"",k)
k <- gsub('[︴ ︹ ︺ ꒦ ꒪ ꒫"]' ,"",k)
k <- gsub('[a-zA-Z]', "", k)
k <- gsub('[-+/.─◆○~=,「」▲:~※_★$、?│【】()()]' ,"", k)
# Pair the cleaned text with its post ID (again: keep the M OR the F line).
tb <- data.frame(k, dcard_M_unique$ID)
tb <- data.frame(k, dcard_F_unique$ID)
# Chinese word segmentation: one token list per article.
for(i in seq_len(nrow(tb))){
  w <- segment(as.vector(tb$k[i]), mixseg)
  tb$w[i] <- list(w)
}
# POS tagging (tags come back as the names() of the result).
for(i in seq_len(nrow(tb))){
  t <- names(tagging(as.vector(tb$k[i]), mixseg))
  tb$t[i] <- list(t)
}
# Keep only noun-type tokens; flatten to one comma-separated string.
library(stringr)
for(i in seq_len(nrow(tb))){
  t <- which(tb$t[[i]] %in% c('n','nr','nrt','ns','nt','nz'))
  w <- tb$w[[i]][t]
  tb$ww[i] <- paste(str_c(w, sep="|"), collapse = ",")
}
# Rename columns.
colnames(tb) <- c("Content", "ID", "所有斷詞","詞性","n")
# Split the comma-separated nouns into one column per word.
library(splitstackshape)
new <- cSplit(tb, "n", ",")
# IMPROVED: derive the number of word columns instead of hard-coding 68,
# so the same line works for both the male and the female corpus.
colnames(new) <- c("Content","ID","所有斷詞","詞性", seq_len(ncol(new) - 4))
# Drop articles with no nouns at all (first word column is NA).
library(tidyr)
new$ID <- as.factor(new$ID)
sum(complete.cases(new$`1`))
new <- new[complete.cases(new$`1`),]
# NOTE(review): these column ranges are corpus-specific hard-codes (M vs
# F); keep only the line matching the corpus segmented above.
id_word_M <- new[,c(2,5:68)]
id_word_F <- new[,c(2,5:55)]
# Double-check there are no duplicated IDs.
sum(duplicated(id_word_M$ID))
# Save the wide table.
# BUG FIX: the original wrote an undefined object `test`; the file name
# ("M.csv") indicates id_word_M was intended.
write.csv(id_word_M, "/Users/lialee/Desktop/M.csv")
# Transpose: words become rows, post IDs become columns.
t_id_word_M <- t(id_word_M)
nrow(id_word_M)
id <- id_word_M$ID
ncol(t_id_word_M)
colnames(t_id_word_M) <- id
class(t_id_word_M)
dim(t_id_word_M)
t_id_word_M <- t_id_word_M[-1,]   # drop the ID row
# Long format: one (id, word) pair per row, NAs removed.
t_id_word_M <- gather(data.frame(t_id_word_M), key = "id", value = "words", 1:ncol(t_id_word_M), na.rm = TRUE)
# data.frame() prefixed the numeric column names with "X"; strip it off.
t_id_word_M$id <- gsub("X", "", t_id_word_M$id)
# BUG FIX: the backup assignment was reversed — it overwrote the result
# with the undefined object t_M. Copy INTO the backup instead.
t_M <- t_id_word_M
# Number each word within its article (1, 2, 3, ... in original order).
counter = 1
for(i in seq_len(nrow(id_word_M))){
  t <- sum(t_id_word_M$id == id_word_M$ID[i])
  # seq_len(t) is empty when t == 0; the original 1:t would yield c(1, 0).
  for(j in seq_len(t)){
    t_id_word_M$order[counter] <- j
    counter = counter + 1
  }
}
# Write the long tables out for the sequence-mining step.
write.csv(t_id_word_M ,"/Users/lialee/Desktop/id_word_Male.csv")
write.csv(t_id_word_F ,"/Users/lialee/Desktop/id_word_Female.csv")
```
```r=
# ---- Sequential pattern mining (cSPADE) on the male word sequences ----
t_id_word_M <- read.csv("/Users/lialee/Desktop/t_id_word_M2.csv", header = TRUE, fileEncoding = "UTF-8")
# Drop generic words that dominate every article.
t_id_word_M <- t_id_word_M[!t_id_word_M$words %in% c("材料","美食","蛋糕","店","粉","版"),]
# One row per (article, position): item count + semicolon-joined words.
trans_sequence <- t_id_word_M %>%
  group_by(id, order) %>%
  summarize(
    SIZE = n(),
    words = paste(as.character(words), collapse = ';')
  )
#trans_sequence <- data.frame(lapply(trans_sequence, as.factor))
trans_sequence <- trans_sequence[order(trans_sequence$id, trans_sequence$order),]
library(arulesSequences)
# Convert to the transactions data type via an intermediate basket file.
write.table(trans_sequence, "mytxtout.txt", sep=";", row.names = FALSE, col.names = FALSE, quote = FALSE)
trans_matrix <- read_baskets("mytxtout.txt", sep = ";", info = c("sequenceID","eventID","SIZE"))
# IMPROVED: inspect(head(...)) prints only the first few transactions;
# the original head(inspect(...)) printed the ENTIRE object first.
inspect(head(trans_matrix))
# Frequent sequences with support >= 1%.
s1 <- cspade(trans_matrix, parameter = list(support = 0.01),
             control = list(verbose = TRUE))
s1.df <- as(s1, "data.frame")
summary(s1)
inspect(s1)
# Keep only the stronger patterns (support > 2%).
rules <- s1[which(s1@quality$support > 0.02)]
inspect(rules)
# Show the frequent sequences sorted by support, strongest first.
s1 <- sort(s1, by = "support", decreasing = TRUE)
inspect(s1)
```
### K-means , Cluster Descriptions and CART.tree
```r=
# ---- Build a term-class matrix (8 gender x region groups), pick K ----
library(jsonlite)
library(jiebaR)
dcard_area <- fromJSON("/Users/lialee/Desktop/III Final Project/dcard_final.json")
# Keep content, gender, and the two location columns.
dim(dcard_area)
dcard_content <- dcard_area[,c(2,4,20,21)]
# Drop rows with no Area.
dcard_content <- dcard_content[!is.na(dcard_content$Area),]
library(dplyr)
# Run this section 8 times by hand: {M,F} x {north, middle, south, east}.
# The two filters below select ONE group (here: female / east).
F_E <- dcard_content %>%
filter(Gender == "F") %>%
filter(Area == "東")
# Remove duplicated articles.
F_E <- F_E %>%
distinct(Content, .keep_all = TRUE)
stop_word <- '/Users/lialee/Desktop/Programming/TextMining/Data/stop.txt'
user_dic <- "/Users/lialee/Desktop/III Final Project/foodclean.csv"
mixseg <- worker(stop_word = stop_word, user = user_dic, type = "tag")
# ---- Word segmentation for the selected group ----
df_FE <- data.frame()
seq_doc <- NULL # Word Segmentation Results
seq_tag <- NULL # POS Tagging Results
k <- F_E$Content
# Strip digits, whitespace, latin letters, '#' and decorative symbols.
k <- gsub('[0-9]+', "", k)
k <- gsub('[[:space:]]', "", k)
k <- gsub('[a-zA-Z]', "", k)
k <- gsub('#', "", k)
k <- gsub('[ ️ ︎ ﹏ ︵ ︶ ︿ ﹃ ꒳]',"",k)
k <- gsub('[︴ ︹ ︺ ꒦ ꒪ ꒫"]' ,"",k)
k <- gsub('[a-zA-Z]', "", k)
k <- gsub('[-+/.─◆○~=,「」▲:~※_★$、?│【】()()]' ,"", k)
# Chinese word segmentation.
w <- segment(as.vector(k), mixseg)
seq_doc <- c(seq_doc, w)
# POS tagging.
t <- names(tagging(as.vector(k), mixseg))
seq_tag <- c(seq_tag , t)
seq <- data.frame(seq_doc, seq_tag)
# Keep noun-type tokens only.
seq <- seq[seq$seq_tag %in% c('n','nr','nrt','ns','nt','nz'),]
# Word-frequency table, labelled with this group's name.
seq_doc <- table(as.character(seq$seq_doc))
seq_doc <- data.frame(seq_doc, clas = 'Female East')
df_FE <- rbind(df_FE, seq_doc)
# Stack all 8 groups into one data frame.
# NOTE(review): df_area must already exist — it accumulates across the 8
# manual runs; initialise it with data.frame() before the very first run.
df_area <- rbind(df_area, df_FE)
names(df_area)[1] <- 'Keywords'
names(df_area)[2] <- 'Frequency'
names(df_area)[3] <- 'Type'
DF <- c(table(df_area$Keywords))  # per-keyword group count (document frequency)
FM <- unique(df_area$Type)        # the 8 group labels
library(reshape2)
# Term-class matrix: keyword x group, frequencies summed.
TCM <- acast(df_area, Keywords ~ Type, value.var='Frequency', fill = 0, drop = FALSE, sum)
TCB <- ifelse(TCM > 0, 1, 0)      # binary presence matrix
# Keep keywords appearing at least 500 times in total.
selectedKW <- rowSums(TCM) >= 500
TCM <- as.data.frame(TCM[selectedKW,])
TCB <- as.data.frame(TCB[selectedKW,])
DF <- DF[selectedKW]
counter <- 30641 # total number of articles
IDF <- log10(counter / DF)
cbind(rownames(TCM), IDF)
# TF-IDF-style weighting: term share within each group, times IDF.
TTF <- colSums(TCM)
TCM_IDF <- t(t(TCM) / TTF) * IDF
TCM <- data.frame(Keywords = rownames(TCM), TCM)
rownames(TCM) <- NULL
TCM_IDF <- data.frame(Keywords = rownames(TCM_IDF), TCM_IDF)
rownames(TCM_IDF) <- NULL
TCB <- data.frame(Keywords = rownames(TCB), TCB)
rownames(TCB) <- NULL
colnam <- TCM$Keywords
TCM$Keywords <- NULL
# Transpose: rows = the 8 groups, columns = keywords.
t_TCM <- as.data.frame(t(TCM))
colnames(t_TCM) <- colnam
rownames(t_TCM) <- FM
# Copy kept for the decision tree later; unrelated to the clustering.
cart_TCM <- t_TCM
cart_TCM$Type <- FM
# ---- K-means: choose K by mean silhouette width ----
library(cluster)
# Try K = 2..4 and record the mean silhouette width for each.
result <- list()
for (i in 2:4){
kmd <- kmeans(t_TCM, centers=i)
sil <- silhouette(kmd$cluster, dist(t_TCM))
result[[paste('k=',i,sep='')]] <- mean(sil[,'sil_width'])
}
result
```
K-means Result
**發現分三群(k=3)時的平均輪廓係數最高,故採用 k=3 進行分群**
```r=
$`k=2`
[1] 0.4849345
$`k=3`
[1] 0.5801996
$`k=4`
[1] 0.4641646
```
```r=
# K = 3
kmd <- kmeans(t_TCM, centers=3)
kmd$cluster
- - - - - - - - -
kmd$cluster
Male North Female North Male Middle Male South Male East Female Middle
2 3 1 2 1 2
Female South Female East
2 1
```
```r=
# 看輪廓係數
sil <- silhouette(kmd$cluster, dist(t_TCM))
mean(sil[,'sil_width'])
- - - - - - - - -
cluster neighbor sil_width
[1,] 2 1 0.2561490
[2,] 3 2 0.0000000
[3,] 1 2 0.6009884
[4,] 2 1 0.7626361
[5,] 1 2 0.7814875
[6,] 2 1 0.7626361
[7,] 2 1 0.6711010
[8,] 1 2 0.8065991
# Cluster Descriptions
kmd$centers
```
```r=
# Display Clustering Results
# Everything below renders and saves one word cloud per k-means cluster.
library(wordcloud2)
# NOTE(review): install.packages() / install_phantomjs() are one-time
# setup steps; normally commented out after the first run.
install.packages("webshot")
library(webshot)
webshot::install_phantomjs()
library("htmlwidgets")
for(i in 1:3) {
# Sum keyword frequencies over the groups assigned to cluster i.
Clus_i <- t_TCM[kmd$cluster==i,]
Clus_n <- colnames(t_TCM)
Clus_f <- colSums(Clus_i)
Word_Tab <- data.frame(Clus_n, Clus_f)
rownames(Word_Tab) <- NULL
# Drop words that never occur in this cluster.
Word_Tab <- Word_Tab[Word_Tab$Clus_f!=0,]
my_graph <- wordcloud2(Word_Tab, size = 0.5, minSize = 0, gridSize = 3,color = "random-light", backgroundColor = "white")
# wordcloud2 renders an HTML widget; save it, then screenshot it to PNG.
saveWidget(my_graph,paste0("/Users/lialee/Desktop/",i,".html"),selfcontained = F)
webshot(paste0("/Users/lialee/Desktop/",i,".html"),
paste0("/Users/lialee/Desktop/",i,".png"),
vwidth = 600, vheight=350)
}
```



```r=
# Fit an unpruned CART classification tree that predicts the group label
# (Type) from the keyword frequencies in cart_TCM, then plot it.
library(rpart)
library(rpart.plot)
# minsplit = 2, cp = 0: grow the full tree with no pruning penalty.
tree_control <- rpart.control(minsplit = 2, cp = 0)
CART.tree <- rpart(Type ~ ., data = cart_TCM, control = tree_control)
rpart.plot(CART.tree)
```

### 巧克力文章分析
```r=
# Select the articles that mention "巧克力" (chocolate).
chocolate <- dcard_area[grepl("巧克力", dcard_area$Content) == TRUE,]
stop_word <- '/Users/lialee/Desktop/Programming/TextMining/Data/stop.txt'
user_dic <- "/Users/lialee/Desktop/foodclean.csv"
mixseg <- worker(stop_word = stop_word, user = user_dic, type = "tag")
choco <- data.frame()
seq_doc <- NULL
seq_tag <- NULL
# Strip digits, whitespace, latin letters, '#' and decorative symbols.
k <- chocolate$Content
k <- gsub('[0-9]+', "", k)
k <- gsub('[[:space:]]', "", k)
k <- gsub('[a-zA-Z]', "", k)
k <- gsub('#', "", k)
k <- gsub('[ ️ ︎ ﹏ ︵ ︶ ︿ ﹃ ꒳]',"",k)
k <- gsub('[︴ ︹ ︺ ꒦ ꒪ ꒫]',"",k)
k <- gsub('[-+/.─◆○~=,「」▲:~※_★$、?│【】()()]' ,"", k)
# Pair the cleaned text with its post ID.
choco <- data.frame(k, chocolate$ID)
# Chinese word segmentation, one token list per article.
for(i in 1:nrow(choco)){
w <- segment(as.vector(choco$k[i]), mixseg)
choco$w[i] <- list(w)
}
# POS tagging (tags come back as the names() of the result).
for(i in 1:nrow(choco)){
t <- names(tagging(as.vector(choco$k[i]), mixseg))
choco$t[i] <- list(t)
}
# Keep only noun-type tokens; flatten to one comma-separated string.
library(stringr)
for(i in 1:nrow(choco)){
t <- which(choco$t[[i]][1:length(choco$t[[i]])] %in% c('n','nr','nrt','ns','nt','nz') == T)
w <- choco$w[[i]][t]
choco$ww[i] <- paste(str_c(w, sep="|"), collapse = ",")
}
# Keep only the ID and the joined nouns.
choco <- choco[,c(2,5)]
# Split the comma-separated nouns into one column per word.
library(splitstackshape)
choco_sep <- cSplit(choco, "ww", ",")
# Transpose/reshape so word frequencies can be counted.
library(tidyr)
t_choco_sep <- t(choco_sep)
dim(t_choco_sep)
id <- c()
id <- choco_sep$chocolate.ID
colnames(t_choco_sep) <- id
class(t_choco_sep)
t_choco_sep <- t_choco_sep[-1,]   # drop the ID row
# Long format: one (id, word) pair per row, NAs removed.
t_choco_sep <- gather(data.frame(t_choco_sep), key = "id", value = "words", 1:ncol(t_choco_sep), na.rm = T)
# data.frame() prefixed the numeric IDs with "X"; strip it back off.
t_choco_sep$id[1:length(t_choco_sep$id)] <- gsub("X", "",t_choco_sep$id[1:length(t_choco_sep$id)])
# Word-frequency table over all chocolate articles.
choco_count <- data.frame(table(as.character(t_choco_sep$words)))
names(choco_count)[1] <- 'Keywords'
names(choco_count)[2] <- 'Frequency'
DF <- c(table(choco_count$Keywords))
# Keep only representative words.
# NOTE(review): casting Keywords ~ Frequency spreads words over frequency
# VALUES as columns — probably Keywords ~ some class column was intended;
# verify before reusing this step.
library(reshape2)
TCM <- acast(choco_count, Keywords ~ Frequency, value.var='Frequency', fill = 0, drop = FALSE, sum)
TCB <- ifelse(TCM > 0, 1, 0)
# Keep words that occur at least 50 times.
selectedKW <- rowSums(TCM) >= 50
TCM <- as.data.frame(TCM[selectedKW,])
TCB <- as.data.frame(TCB[selectedKW,])
rownames(TCM)
```
```
> rownames(TCM)
[1] "白巧克力" "版" "冰淇淋" "餅乾" "布朗" "材料" "草莓"
[8] "茶" "蛋糕" "店" "東西" "粉" "芙" "個人"
[15] "醬" "結果" "咖啡" "烤箱" "口感" "口味" "美食"
[22] "奶油" "尼" "檸檬" "牛奶" "朋友" "巧克力" "全家"
[29] "熔岩" "乳酪" "食譜" "時間" "甜點" "味" "味道"
[36] "鮮奶油" "餡" "香蕉" "杏仁" "原味" "早餐"
```
### Prepare Data for Prediction
```r=
# ---- Build the modelling table for the chocolate-mention classifier ----
# Posts that mention "巧克力" (chocolate) ...
chocolate <- dcard_area[grepl("巧克力", dcard_area$Content) == TRUE,]
# ... and posts that do not.
not_chocolate <- dcard_area[grepl("巧克力", dcard_area$Content) == F,]
# Deduplicate both sets.
chocolate <- chocolate %>%
distinct(Content ,ID, .keep_all = TRUE)
not_chocolate <- not_chocolate %>%
distinct(Content ,ID, .keep_all = TRUE)
# Drop rows with a missing Area.
chocolate <- chocolate[complete.cases(chocolate$Area) ==T,]
not_chocolate <- not_chocolate[complete.cases(not_chocolate$Area) ==T,]
# Label chocolate posts "yes".
# NOTE(review): "tpye" is a typo for "type", but it is used consistently
# below (df_complete$tpye), so renaming it only here would break that line.
df <- data.frame(chocolate$ID, clas = 'yes')
colnames(df) <- c("id","tpye")
# Label non-chocolate posts "no".
df1 <- data.frame(not_chocolate$ID, clas = 'no')
colnames(df1) <- c("id","tpye")
df2 <- rbind(df,df1)
# Re-attach the full post attributes by ID.
df_complete <- merge(df2, dcard_area, by.x = "id", by.y = "ID", all.x = TRUE)
# Keep only the columns used for prediction (selected by position).
df_complete <- df_complete[,c(1,2,3,6,16,19,22)]
# Inspect the value domains.
sapply(df_complete,unique)
# Gender was not cleaned upstream; keep F/M only.
df_complete <- df_complete %>%
filter(Gender %in% c("F","M"))
# Move the target column to the end.
s <- df_complete$tpye
df_complete <- df_complete[,-2]
df_complete$type <- s
# ---- Encode categorical columns as integers ----
# Season -> 0..3.
replace1 <- function(x) {
x = sub("春","0",x)
x = sub("夏","1",x)
x = sub("秋","2",x)
x = sub("冬","3",x)
x = as.integer(x)
return (x)
}
# City -> 0..20 (north-to-south order).
replace2 <- function(x) {
x = sub("台北市","0",x)
x = sub("新北市","1",x)
x = sub("基隆市","2",x)
x = sub("桃園市","3",x)
x = sub("新竹市","4",x)
x = sub("新竹縣","5",x)
x = sub("苗栗縣","6",x)
x = sub("台中市","7",x)
x = sub("彰化縣","8",x)
x = sub("雲林縣","9",x)
x = sub("南投縣","10",x)
x = sub("嘉義市","11",x)
x = sub("嘉義縣","12",x)
x = sub("台南市","13",x)
x = sub("高雄市","14",x)
x = sub("屏東縣","15",x)
x = sub("宜蘭縣","16",x)
x = sub("花蓮縣","17",x)
x = sub("台東縣","18",x)
x = sub("澎湖縣","19",x)
x = sub("金門縣","20",x)
x = as.integer(x)
return (x)
}
# Gender -> M=1, F=0.
replace3 <- function(x) {
x = sub("M","1",x)
x = sub("F","0",x)
x = as.integer(x)
return (x)
}
sapply(df_complete, typeof)
df_norm <- df_complete
# Each helper is applied to the whole frame, then the one encoded column
# is pulled out.
# NOTE(review): only the City encoding (df_norm.c) is cbound back onto
# df_norm below — df_norm.s and df_norm.g are computed but never used.
df_norm.s <- as.data.frame(lapply(df_norm,replace1))
df_norm.s <- df_norm.s[,"Season"]
df_norm.g <- data.frame(lapply(df_norm,replace3))
df_norm.g <- df_norm.g[,"Gender"]
df_norm.c <- data.frame(lapply(df_norm,replace2))
df_norm.c <- df_norm.c[,"City"]
df_norm <- cbind(df_norm ,df_norm.c)
# 152 universities: map each to an arbitrary integer id.
# NOTE(review): 152 is hard-coded to the current number of distinct
# universities; it breaks if the data changes.
unique(df_norm$University)
check_uni <- data.frame(unique(df_norm$University), c(1:152))
colnames(check_uni) <- c("x1","x2")
df_ok <- merge(check_uni, df_norm, by.x = "x1", by.y = "University", all.x = TRUE)
# Drop raw text columns by position, then rename.
df_ok <- df_ok[,-c(1,4,6,7)]
colnames(df_ok) <- c("University","ID","Year","Chocolate","Season","Gender","City")
# Move the target (chocolate mention) to the end.
s <- df_ok$Chocolate
df_ok <- df_ok[,-4]
df_ok$Chocolate <- s
# Convert every column to a factor (bnlearn models require factors).
sapply(df_ok, typeof)
for(i in c(1:6)) {
df_ok[[i]] <- as.factor(df_ok[[i]])
}
choco_module <- df_ok
sapply(choco_module, typeof)
# 80/20 train/test split.
# NOTE(review): no set.seed(), so the split differs on every run.
select <- sample(1:nrow(choco_module),nrow(choco_module)*0.8)
train <- choco_module[select,]
test <- choco_module[-select,]
```
### Prediction : Naive Bayes
```r=
choco_module <- df_ok
sapply(choco_module, typeof)
# Drop the ID column (an identifier, not a predictor).
choco_module <- choco_module[,-2]
# 80/20 train/test split.
# NOTE(review): no set.seed(), so results differ between runs.
select <- sample(1:nrow(choco_module),nrow(choco_module)*0.8)
train <- choco_module[select,]
test <- choco_module[-select,]
# Build Naive Bayes Model
library(bnlearn)
# Target attribute: "Chocolate" (whether the post mentions chocolate).
nb_default <- naive.bayes(train, training="Chocolate")
plot(nb_default)
test.y_hat <- predict(nb_default, test, prob=FALSE)
test.y_hat_prob <- predict(nb_default, test, prob=TRUE)
nb_default_results <- cbind(Prediction=as.character(test.y_hat), t(attr(test.y_hat_prob,"prob")))
nb_default_results
# Model Evaluation
accuracy.nb_default <- sum(test.y_hat==test$`Chocolate`) / length(test$`Chocolate`)
accuracy.nb_default
# accuracy.nb_default 0.9569649
# NOTE(review): the confusion matrix below shows the model predicts "no"
# for every post — the high accuracy only reflects class imbalance.
agreement_KNN <- test.y_hat==test$`Chocolate`
agreement_KNN
table(test.y_hat, test$`Chocolate`, dnn=c("Prediction","Actual"))
- - - - - - - - -
Actual
Prediction yes no
yes 0 0
no 266 5915
```

```r=
# Build Bayes Net Tan Model
# Tree-augmented naive Bayes (TAN) on the same train/test split,
# same target attribute "Chocolate".
library(bnlearn)
bn_default <- tree.bayes(train, training="Chocolate")
plot(bn_default)
test.y_hat <- predict(bn_default, test, prob=FALSE)
test.y_hat_prob <- predict(bn_default, test, prob=TRUE)
bn_default_results <- cbind(Prediction=as.character(test.y_hat), t(as.matrix(attr(test.y_hat_prob,"prob"))))
bn_default_results
# Model Evaluation
# NOTE(review): this reuses the variable name accuracy.nb_default even
# though it scores the TAN model, clobbering the naive-Bayes result.
accuracy.nb_default <- sum(test.y_hat==test$`Chocolate`) / length(test$`Chocolate`)
accuracy.nb_default
# accuracy.nb_default 0.8975894
agreement_KNN <- test.y_hat==test$`Chocolate`
agreement_KNN
table(test.y_hat, test$`Chocolate`, dnn=c("Prediction","Actual"))
- - - - - - - - -
Actual
Prediction yes no
yes 7 374
no 259 5541
```

```r=
# Select posts mentioning each of three foods: matcha, ramen, cheese.
mocha <- dcard_area[grepl("抹茶", dcard_area$Content) == TRUE,]
ramen <- dcard_area[grepl("拉麵", dcard_area$Content) == TRUE,]
cheese <- dcard_area[grepl("起司", dcard_area$Content) == TRUE,]
# Deduplicate and drop missing Area (only done for cheese here).
cheese <- cheese %>%
distinct(Content ,ID, .keep_all = TRUE)
cheese <- cheese[complete.cases(cheese$Area) ==T,]
# NOTE(review): only mocha gets the gender filter; cheese/ramen do not.
mocha <- mocha %>%
filter(Gender %in% c("F","M"))
# Label the cheese posts.
df3 <- data.frame(cheese$ID, clas = 'cheese')
# NOTE(review): BUG — this renames df1 (left over from the previous
# section) instead of the freshly built df3, so df3 keeps the columns
# (cheese.ID, clas) and the rbind() below fails on mismatched names
# ("id"/"tpye" in df vs "cheese.ID"/"clas" in df3).
colnames(df1) <- c("id","type")
df <- rbind(df, df3)
# NOTE(review): df's label column is named "tpye" upstream, so df$type
# is NULL here — verify the intended column name.
unique(df$type)
# Re-attach the full post attributes by ID.
df_three <- merge(df, dcard_area, by.x = "id", by.y = "ID", all.x = TRUE)
# Keep only the columns used for prediction (selected by position).
df_three <- df_three[,c(1,2,3,6,16,19,22)]
sapply(df_three,unique)
df_norm <- df_three
# Encode Season/Gender/City with the replace1/2/3 helpers defined in the
# chocolate section.
# NOTE(review): as before, only the City encoding (.c) is cbound back on;
# df_norm.s and df_norm.g are computed but never used.
df_norm.s <- as.data.frame(lapply(df_norm,replace1))
df_norm.s <- df_norm.s[,"Season"]
df_norm.g <- data.frame(lapply(df_norm,replace3))
df_norm.g <- df_norm.g[,"Gender"]
df_norm.c <- data.frame(lapply(df_norm,replace2))
df_norm.c <- df_norm.c[,"City"]
df_norm <- cbind(df_norm ,df_norm.c)
# Universities are many-valued; map each to an arbitrary integer id.
# NOTE(review): 139 is hard-coded to the current number of distinct
# universities; it breaks if the data changes.
unique(df_norm$University)
check_uni <- data.frame(unique(df_norm$University), c(1:139))
colnames(check_uni) <- c("x1","x2")
df_ok <- merge(check_uni, df_norm, by.x = "x1", by.y = "University", all.x = TRUE)
# Drop raw text columns by position, then rename.
df_ok <- df_ok[,-c(1,5,8,7)]
colnames(df_ok) <- c("University","ID","Type","Year","Season","Gender","City")
# Move the target column (Type) to the end.
s <- df_ok$Type
df_ok <- df_ok[,-3]
df_ok$Type <- s
# Convert every column to a factor (bnlearn models require factors).
sapply(df_ok, typeof)
for(i in c(1:6)) {
df_ok[[i]] <- as.factor(df_ok[[i]])
}
three_module <- df_ok
sapply(three_module, typeof)
# Drop the ID column (identifier, not a predictor).
three_module <- three_module[,-2]
# 80/20 train/test split (no set.seed, so not reproducible).
select <- sample(1:nrow(three_module),nrow(three_module)*0.8)
train <- three_module[select,]
test <- three_module[-select,]
```
###### tags: `Food Forum` `Project`