# Project of Food Forum (01) ### PTT ```r= # Write CSV ptt_content <- read.csv("/Users/lialee/Desktop/ptt_content.csv") ptt_content <- ptt_content[ptt_content$area %in% c(1:4),] ptt_content <- ptt_content %>% distinct(content ,url, .keep_all = TRUE) unique(ptt_content$city) a <- c("北","中","南","東","離島") b <- 1:5 follow <- data.frame(cbind(a, b)) ptt_content <- merge(follow, ptt_content, by.x = "b", by.y = "area", all.x = T) ptt_content <- ptt_content[,-1] ptt_content <- ptt_content[is.na(ptt_content$city)==F,] c <- unique(ptt_content$city) c <- data.frame(c) d <- c("台北市","桃園市","新竹市","基隆市","新北市","宜蘭縣","桃園市","新竹縣","新竹縣","台北市", "桃園市","台中市","雲林縣","彰化縣","苗栗縣","苗栗縣","雲林縣","南投縣","雲林縣","台中市", "南投縣","南投縣","雲林縣","苗栗縣","彰化縣","南投縣","雲林縣","雲林縣","苗栗縣","雲林縣", "雲林縣","雲林縣","雲林縣","南投縣","高雄市","台南市","嘉義市","屏東縣","高雄市","嘉義縣", "台南市","台南市","台南市","嘉義縣","台南市","桃園市","嘉義市","嘉義縣","高雄市","花蓮縣", "台東縣","台東縣") find <- data.frame(cbind(c, d)) ptt_content <- merge(find, ptt_content, by.x = "c", by.y = "city", all.x = T) ptt_content <- ptt_content[,-1] colnames(ptt_content) <- c("city","area","id","date","year","month","day","weekday","time", "ip","title","content","push","boo","total") sapply(ptt_content, class) ptt_content$year <- as.numeric(ptt_content$year) # make graph ppt <- ptt_content %>% select(year, weekday, id) %>% group_by(year, weekday) %>% summarise(total_post = length(id)) ggplot() + geom_line(aes(x = year, y = total_post, color = weekday),data = ppt, size = 0.8) + scale_colour_manual(name = "性別", values = c("Red","Orange","Yellow","Green","Dodger Blue","Navy","Blue Violet"), labels = c("一","二","三","四","五","六","日"))+ labs(x = "年度", y = "發文總數", title = "PTT美食版討論度分析", caption = "版權歸小鎮村民擁有")+ scale_y_continuous(breaks = c(0,500,1500,2500,3500))+ theme_minimal()+ theme(text = element_text(family = "Heiti TC Light"), legend.position='top', plot.title = element_text(color = "Royal Blue",hjust = 0.5, face='bold', size =18), legend.title = element_text(face = "bold"), axis.title = element_text(color="Steel Blue", face="bold", size=14)) ``` ![](https://i.imgur.com/WbsUds7.jpg) ```r= dcard_post <- dcard_area[!is.na(dcard_area$City),] unique(dcard_post$City) # store df of total post ptt_section <- ptt_content %>% select(year, id, area, city) %>% group_by(area, city) %>% summarise(Total_Post = length(id)) # make graph ptt_section %>% ggplot(aes(x = city, y= Total_Post, fill=area)) + geom_histogram(stat = "identity")+ theme_minimal() + theme(text = element_text(family = "Heiti TC Light"), legend.position='top') + scale_fill_hue(h = c(60,250)) + labs(x = "城市", y = "發文總數", title = "PTT美食版討論度分析", caption = "版權歸小鎮村民擁有", fill = "區域")+ scale_y_continuous(breaks = c(0,500,1000,1500,2000,2500,3000,3500,4000,4500,5000,5500,6000,6500,7000)) ``` ### Dcard ```r= # set factor levels dcard_post$Area <- factor(dcard_area$Area, levels = c("北","中","南","東","離島")) d_post$City <- factor(d_post$City, levels = c("台北市","新北市","基隆市","桃園市", "新竹市","新竹縣","苗栗縣", "台中市", "彰化縣","雲林縣","南投縣","嘉義市","嘉義縣","台南市", "高雄市","屏東縣","宜蘭縣","花蓮縣","台東縣", "澎湖縣","金門縣")) # store df of total post d_post <- dcard_post %>% select(Season, ID, Area, City) %>% group_by(Area, City) %>% summarise(Total_Post = length(ID)) # make graph d_post %>% ggplot(aes(x = City, y= Total_Post, fill=Area)) + geom_histogram(stat = "identity")+ theme_minimal() + theme(text = element_text(family = "Heiti TC Light"), legend.position='top', plot.title = element_text(color = "Royal Blue",hjust = 0.5, face='bold', size =18), legend.title = element_text(face = "bold"), axis.title = element_text(color="Steel Blue", face="bold", size=14)) + scale_fill_hue(h = c(60,250)) + labs(x = "城市", y = "發文總數", title = "Dcard美食版討論度分析(二)", caption = "版權歸小鎮村民擁有", fill = "區域")+ scale_y_continuous(breaks = c(0,500,1000,1500,2000,2500,3000,3500,4000,4500,5000,5500,6000,6500,7000)) ``` ![](https://i.imgur.com/xCFBrxF.jpg) ```r= dcard_area %>% select(Gender, ID, YnM) %>% filter(Gender %in% c("F","M")) %>% group_by(Gender, YnM) %>% summarise(Total_Post = length(ID)) %>% ggplot(aes(x = YnM, y= Total_Post, color=Gender)) + geom_line(size = 0.8) + theme_minimal()+ theme(text = element_text(family = "Heiti TC Light"), legend.position='top', plot.title = element_text(color = "Royal Blue",hjust = 0.5, face='bold', size =18), legend.title = element_text(face = "bold"), axis.title = element_text(color="Steel Blue", face="bold", size=14), axis.line = element_line(size = 1, colour = "Steel Blue")) + scale_colour_manual(name = "性別", values = c("Tomato","Medium Sea Green"), labels = c("女性","男性"))+ labs(x = "時間", y = "發文總數", title = "Dcard美食版討論度分析", caption = "版權歸小鎮村民擁有")+ scale_y_continuous(breaks = c(0,250,500,750,1000,1250,1500)) + scale_x_datetime(date_labels = "%Y-%m", breaks = "6 month") ``` ![](https://i.imgur.com/qccy6AB.jpg) ```r= TCM_15 <- TCM[c(4,6,11,17,29,34,35,42,45,46,48,49,50,52,56),] TCM_15 %>% ggplot(aes(x = Male.North, y= Female.North, color = Keywords))+ geom_point() + theme_minimal()+ theme(legend.position='none',text = element_text(family = "Heiti TC Light"), plot.title = element_text(color = "Royal Blue",hjust = 0.5, face='bold', size =18), legend.title = element_text(face = "bold"), axis.title = element_text(color="Steel Blue", face="bold", size=14))+ labs(x = "男性文章", y = "女性文章", title = "Dcard北台灣關鍵字頻率分析", caption = "版權歸小鎮村民擁有")+ geom_text(aes(label = Keywords, family = "Heiti TC Light"), hjust=0.5, vjust=1.5, size = 5) ``` ![](https://i.imgur.com/t9FQPFx.jpg) ### MySQL Connection ```r= install.packages("RMySQL") library(RMySQL) library(DBI) host <- "192.168.35.119" port <- 3306 dbname <- "dcad_db" # 輸入資料庫名稱 user <- "lia" # 輸入使用者名稱 password <- "lia" # 輸入使用者密碼 engine <- dbConnect(RMySQL::MySQL(), host = host, port = port, dbname = dbname, user = user, password = password ) dcard_mysql <- dbReadTable(engine, name = 'dcard_comment', fileEncoding = "utf8", header= FALSE) View(dcard_mysql) dbWriteTable(engine, name = 'dcard_comment', value = dcard_test, overwrite = TRUE) dbDisconnect(engine) ``` ### Shiny & Mysql ```r= install.packages("RMySQL") library(RMySQL) library(DBI) library(shiny) host <- "192.168.35.119" # 輸入自己的 AWS RDS Enpoint 位址 port <- 3306 dbname <- "dcad_db" # 輸入自己設定的資料庫名稱 user <- "lia" # 輸入自己設定的使用者名稱 password <- "lia" # 輸入自己設定的使用者密碼 engine <- dbConnect(RMySQL::MySQL(), host = host, port = port, dbname = dbname, user = user, password = password ) ui <- fluidPage( #numericInput("nrows", "Enter the number of rows to display:", 5), textInput("ID", "Enter your ID:", "34838"), tableOutput("tbl") ) server <- function(input, output, session) { output$tbl <- renderTable({ engine <- dbConnect( drv = RMySQL::MySQL(), dbname = dbname, host = host, username = user, password = password) on.exit(dbDisconnect(engine), add = TRUE) dbSendQuery(engine, "SET NAMES utf8mb4;") query <- paste0("SELECT * FROM dcad_db.dcard_content WHERE ID = '", input$ID, "';") dbGetQuery(engine, query) #dbGetQuery(engine, paste0("SELECT * FROM dcad_db.dcard_content Limit ", input$nrows, ";")) }) } shinyApp(ui, server) # - - - - - 編碼設定 dbSendQuery(con, "SET NAMES utf8mb4;") dbSendQuery(con, "SET CHARACTER SET utf8mb4;") dbSendQuery(con, "SET character_set_connection=utf8mb4;") dbGetQuery(con, "SHOW VARIABLES WHERE Variable_name LIKE 'character%' OR Variable_name LIKE 'collation%';") ``` ### Mysql & Dplyr ```r= library(dplyr) library(dbplyr) library(devtools) install.packages("pool") library(pool) mysql_db <- dbPool( RMySQL::MySQL(), dbname = "dcad_db", host = "192.168.35.119", username = "lia", password = "lia" ) mysql_db %>% tbl("dcard_content") %>% head(5) ``` ### Association Rule ```r= # 資料匯入與清洗 library(jsonlite) library(jiebaR) dcard_area <- fromJSON("/Users/lialee/Desktop/III Final Project/dcard_final.json") # conbine year and month for(i in 1:41761){ a <- paste(dcard_area$Year[i], dcard_area$Month[i], sep = "-") dcard_area$YnM[i] <- a } # change type to POSIXct library(lubridate) dcard_area$YnM <- parse_date_time(dcard_area$YnM, orders = "Ym") dcard_area$YnM <- as.Date(dcard_area$YnM) # correct content error a <- which(dcard_area$Season == "東") dcard_area$Season[a] <- "冬" # set factor levels dcard_area$Season <- factor(dcard_area$Season, levels = c("春","夏","秋","冬")) # set factor levels dcard_area$Area <- factor(dcard_area$Area, levels = c("北","中","南","東","離島")) d_post$City <- factor(d_post$City, levels = c("台北市","新北市","基隆市","桃園市", "新竹市","新竹縣","苗栗縣", "台中市", "彰化縣","雲林縣","南投縣","嘉義市","嘉義縣","台南市", "高雄市","屏東縣","宜蘭縣","花蓮縣","台東縣", "澎湖縣","金門縣")) # area, city, post_number d_post <- read.csv("/Users/lialee/Desktop/d_post.csv") d_post <- d_post[,-1] d_post$Area <- factor(d_post$Area, levels = c("北","中","南","東","離島")) # select content, id, gender dcard_content <- dcard_area[,c(2,4,5)] dim(dcard_content) ``` ```r= # 資料依男女拆開 dcard_M <- dcard_content[dcard_content$Gender == 'M',] dcard_F <- dcard_content[dcard_content$Gender == 'F',] # 確認是否有重複資料(很重要) library(dplyr) dcard_M_unique <- dcard_M %>% distinct(ID, Content, .keep_all = TRUE) dcard_F_unique <- dcard_F %>% distinct(ID, Content, .keep_all = TRUE) # 設定停用詞 stop_word <- '/Users/lialee/Desktop/Programming/TextMining/Data/stop.txt' # 設定使用詞庫 user_dic <- "/Users/lialee/Desktop/III Final Project/foodclean.csv" # 設定分詞器 mixseg <- worker(stop_word = stop_word, user = user_dic, type = "tag") # - - - 男女文章清洗 k <- dcard_M_unique$Content k <- dcard_F_unique$Content k <- gsub('[0-9]+', "", k) k <- gsub('[[:space:]]', "", k) k <- gsub('[a-zA-Z]', "", k) k <- gsub('#', "", k) k <- gsub('[ ️ ︎ ﹏ ︵ ︶ ︿ ﹃ ꒳]',"",k) k <- gsub('[︴ ︹ ︺ ꒦ ꒪ ꒫"]' ,"",k) k <- gsub('[a-zA-Z]', "", k) k <- gsub('[-+/.─◆○~=,「」▲:~※_★$、?│【】()()]' ,"", k) tb <- data.frame(k, dcard_M_unique$ID) tb <- data.frame(k, dcard_F_unique$ID) # 中文分詞 for(i in 1:nrow(tb)){ w <- segment(as.vector(tb$k[i]), mixseg) tb$w[i] <- list(w) } # 詞性標注 for(i in 1:nrow(tb)){ t <- names(tagging(as.vector(tb$k[i]), mixseg)) tb$t[i] <- list(t) } # 解迴圈+只選名詞 library(stringr) for(i in 1:nrow(tb)){ t <- which(tb$t[[i]][1:length(tb$t[[i]])] %in% c('n','nr','nrt','ns','nt','nz') == T) w <- tb$w[[i]][t] tb$ww[i] <- paste(str_c(w, sep="|"), collapse = ",") } # 改欄位名稱 colnames(tb) <- c("Content", "ID", "所有斷詞","詞性","n") # 欄位切割 library(splitstackshape) new <- cSplit(tb, "n", ",") # 改欄位名稱 colnames(new) <- c("Content","ID","所有斷詞","詞性",1:68) # 將無名詞欄位刪除 library(tidyr) new$ID <- as.factor(new$ID) sum(complete.cases(new$`1`)) new <- new[complete.cases(new$`1`),] id_word_M <- new[,c(2,5:68)] id_word_F <- new[,c(2,5:55)] # 再次確認是否有重複ID sum(duplicated(id_word_M$ID) == T) # 存檔 write.csv(test,"/Users/lialee/Desktop/M.csv") # 轉向 t_id_word_M <- t(id_word_M) nrow(id_word_M) id <- c() id <- id_word_M$ID ncol(t_id_word_M) colnames(t_id_word_M) <- id class(t_id_word_M) dim(t_id_word_M) t_id_word_M <- t_id_word_M[-1,] # 轉置 t_id_word_M <- gather(data.frame(t_id_word_M), key = "id", value = "words", 1:ncol(t_id_word_M), na.rm = T) # 刪除X t_id_word_M$id[1:length(t_id_word_M$id)] <- gsub("X", "",t_id_word_M$id[1:length(t_id_word_M$id)]) t_id_word_M <- t_M # 備份 # 順序排列 counter = 1 for(i in 1:nrow(id_word_M)){ t <- sum(t_id_word_M$id == id_word_M$ID[i]) for(j in 1:t){ t_id_word_M$order[counter] <- j counter = counter + 1 } } # Write CSV write.csv(t_id_word_M ,"/Users/lialee/Desktop/id_word_Male.csv") write.csv(t_id_word_F ,"/Users/lialee/Desktop/id_word_Female.csv") ``` ```r= # 讀入附件檔案 t_id_word_M <- read.csv("/Users/lialee/Desktop/t_id_word_M2.csv", header = TRUE, fileEncoding = "UTF-8") t_id_word_M <- t_id_word_M[!t_id_word_M$words %in% c("材料","美食","蛋糕","店","粉","版"),] trans_sequence <- t_id_word_M %>% group_by(id, order) %>% summarize( SIZE = n(), words = paste(as.character(words), collapse = ';') ) #trans_sequence <- data.frame(lapply(trans_sequence, as.factor)) trans_sequence <- trans_sequence[order(trans_sequence$id, trans_sequence$order),] library(arulesSequences) # Convert to transaction matrix data type write.table(trans_sequence, "mytxtout.txt", sep=";", row.names = FALSE, col.names = FALSE, quote = FALSE) trans_matrix <- read_baskets("mytxtout.txt", sep = ";", info = c("sequenceID","eventID","SIZE")) head(inspect(trans_matrix)) # Get frequent sequences and corresponding support values s1 <- cspade(trans_matrix, parameter = list(support = 0.01), control = list(verbose = TRUE)) s1.df <- as(s1, "data.frame") summary(s1) inspect(s1) rules <- s1[which(s1@quality$support>0.02)] inspect(rules) s1 <- sort(s1,by="support",decreasing=T) inspect(s1) ``` ### K-means , Cluster Descriptions and CART.tree ```r= library(jsonlite) library(jiebaR) dcard_area <- fromJSON("/Users/lialee/Desktop/III Final Project/dcard_final.json") # select content, id, gender dim(dcard_area) dcard_content <- dcard_area[,c(2,4,20,21)] # 刪除na dcard_content <- dcard_content[!is.na(dcard_content$Area),] library(dplyr) # 重複執行男女,北中南東,共8組 F_E <- dcard_content %>% filter(Gender == "F") %>% filter(Area == "東") # 檢查是否重複 F_E <- F_E %>% distinct(Content, .keep_all = TRUE) stop_word <- '/Users/lialee/Desktop/Programming/TextMining/Data/stop.txt' user_dic <- "/Users/lialee/Desktop/III Final Project/foodclean.csv" mixseg <- worker(stop_word = stop_word, user = user_dic, type = "tag") # - - - 開始跑斷詞 df_FE <- data.frame() seq_doc <- NULL # Word Segmentation Results seq_tag <- NULL # POS Tagging Results k <- F_E$Content k <- gsub('[0-9]+', "", k) k <- gsub('[[:space:]]', "", k) k <- gsub('[a-zA-Z]', "", k) k <- gsub('#', "", k) k <- gsub('[ ️ ︎ ﹏ ︵ ︶ ︿ ﹃ ꒳]',"",k) k <- gsub('[︴ ︹ ︺ ꒦ ꒪ ꒫"]' ,"",k) k <- gsub('[a-zA-Z]', "", k) k <- gsub('[-+/.─◆○~=,「」▲:~※_★$、?│【】()()]' ,"", k) # 中文分詞 w <- segment(as.vector(k), mixseg) seq_doc <- c(seq_doc, w) # 詞性標注 t <- names(tagging(as.vector(k), mixseg)) seq_tag <- c(seq_tag , t) seq <- data.frame(seq_doc, seq_tag) seq <- seq[seq$seq_tag %in% c('n','nr','nrt','ns','nt','nz'),] seq_doc <- table(as.character(seq$seq_doc)) seq_doc <- data.frame(seq_doc, clas = 'Female East') df_FE <- rbind(df_FE, seq_doc) # 八組合成一個data frame df_area <- rbind(df_area, df_FE) names(df_area)[1] <- 'Keywords' names(df_area)[2] <- 'Frequency' names(df_area)[3] <- 'Type' DF <- c(table(df_area$Keywords)) FM <- unique(df_area$Type) library(reshape2) TCM <- acast(df_area, Keywords ~ Type, value.var='Frequency', fill = 0, drop = FALSE, sum) TCB <- ifelse(TCM > 0, 1, 0) # 共出現超過500次才選 selectedKW <- rowSums(TCM) >= 500 TCM <- as.data.frame(TCM[selectedKW,]) TCB <- as.data.frame(TCB[selectedKW,]) DF <- DF[selectedKW] counter <- 30641 #文章總篇數 IDF <- log10(counter / DF) cbind(rownames(TCM), IDF) TTF <- colSums(TCM) TCM_IDF <- t(t(TCM) / TTF) * IDF TCM <- data.frame(Keywords = rownames(TCM), TCM) rownames(TCM) <- NULL TCM_IDF <- data.frame(Keywords = rownames(TCM_IDF), TCM_IDF) rownames(TCM_IDF) <- NULL TCB <- data.frame(Keywords = rownames(TCB), TCB) rownames(TCB) <- NULL colnam <- TCM$Keywords TCM$Keywords <- NULL # 轉向 t_TCM <- as.data.frame(t(TCM)) colnames(t_TCM) <- colnam rownames(t_TCM) <- FM # 這個步驟是為了決策樹跟分類無關 cart_TCM <- t_TCM cart_TCM$Type <- FM # K-means library(cluster) # Decide K result <- list() for (i in 2:4){ kmd <- kmeans(t_TCM, centers=i) sil <- silhouette(kmd$cluster, dist(t_TCM)) result[[paste('k=',i,sep='')]] <- mean(sil[,'sil_width']) } result ``` K-means Result **發現分三群的k-means最高** ```r= $`k=2` [1] 0.4849345 $`k=3` [1] 0.5801996 $`k=4` [1] 0.4641646 ``` ```r= # K = 3 kmd <- kmeans(t_TCM, centers=3) kmd$cluster - - - - - - - - - kmd$cluster Male North Female North Male Middle Male South Male East Female Middle 2 3 1 2 1 2 Female South Female East 2 1 ``` ```r= # 看輪廓係數 sil <- silhouette(kmd$cluster, dist(t_TCM)) mean(sil[,'sil_width']) - - - - - - - - - cluster neighbor sil_width [1,] 2 1 0.2561490 [2,] 3 2 0.0000000 [3,] 1 2 0.6009884 [4,] 2 1 0.7626361 [5,] 1 2 0.7814875 [6,] 2 1 0.7626361 [7,] 2 1 0.6711010 [8,] 1 2 0.8065991 # Cluster Descriptions kmd$centers ``` ```r= # Display Clustering Results # 以下都是畫文字雲用的 library(wordcloud2) install.packages("webshot") library(webshot) webshot::install_phantomjs() library("htmlwidgets") for(i in 1:3) { Clus_i <- t_TCM[kmd$cluster==i,] Clus_n <- colnames(t_TCM) Clus_f <- colSums(Clus_i) Word_Tab <- data.frame(Clus_n, Clus_f) rownames(Word_Tab) <- NULL Word_Tab <- Word_Tab[Word_Tab$Clus_f!=0,] my_graph <- wordcloud2(Word_Tab, size = 0.5, minSize = 0, gridSize = 3,color = "random-light", backgroundColor = "white") saveWidget(my_graph,paste0("/Users/lialee/Desktop/",i,".html"),selfcontained = F) webshot(paste0("/Users/lialee/Desktop/",i,".html"), paste0("/Users/lialee/Desktop/",i,".png"), vwidth = 600, vheight=350) } ``` ![](https://i.imgur.com/z46yF3O.png) ![](https://i.imgur.com/iuZXQsG.png) ![](https://i.imgur.com/nHxkZ8V.png) ```r= # - - - - - cart_TCM 畫決策樹 library(rpart) library(rpart.plot) CART.tree <- rpart(Type ~ ., data=cart_TCM, control=rpart.control(minsplit=2, cp=0)) rpart.plot(CART.tree) ``` ![](https://i.imgur.com/oBEv3xW.jpg) ### 巧克力文章分析 ```r= # 選出有巧克力的文章 chocolate <- dcard_area[grepl("巧克力", dcard_area$Content) == TRUE,] stop_word <- '/Users/lialee/Desktop/Programming/TextMining/Data/stop.txt' user_dic <- "/Users/lialee/Desktop/foodclean.csv" mixseg <- worker(stop_word = stop_word, user = user_dic, type = "tag") choco <- data.frame() seq_doc <- NULL seq_tag <- NULL k <- chocolate$Content k <- gsub('[0-9]+', "", k) k <- gsub('[[:space:]]', "", k) k <- gsub('[a-zA-Z]', "", k) k <- gsub('#', "", k) k <- gsub('[ ️ ︎ ﹏ ︵ ︶ ︿ ﹃ ꒳]',"",k) k <- gsub('[︴ ︹ ︺ ꒦ ꒪ ꒫]',"",k) k <- gsub('[-+/.─◆○~=,「」▲:~※_★$、?│【】()()]' ,"", k) # 內文清洗完與ID合併 choco <- data.frame(k, chocolate$ID) # 中文分词 for(i in 1:nrow(choco)){ w <- segment(as.vector(choco$k[i]), mixseg) choco$w[i] <- list(w) } # 詞性標注 for(i in 1:nrow(choco)){ t <- names(tagging(as.vector(choco$k[i]), mixseg)) choco$t[i] <- list(t) } library(stringr) for(i in 1:nrow(choco)){ t <- which(choco$t[[i]][1:length(choco$t[[i]])] %in% c('n','nr','nrt','ns','nt','nz') == T) w <- choco$w[[i]][t] choco$ww[i] <- paste(str_c(w, sep="|"), collapse = ",") } # 只保留ID跟斷詞後的名詞 choco <- choco[,c(2,5)] # 把名詞用,切開 library(splitstackshape) choco_sep <- cSplit(choco, "ww", ",") # 轉向與轉置才能做詞頻統計 library(tidyr) t_choco_sep <- t(choco_sep) dim(t_choco_sep) id <- c() id <- choco_sep$chocolate.ID colnames(t_choco_sep) <- id class(t_choco_sep) t_choco_sep <- t_choco_sep[-1,] t_choco_sep <- gather(data.frame(t_choco_sep), key = "id", value = "words", 1:ncol(t_choco_sep), na.rm = T) t_choco_sep$id[1:length(t_choco_sep$id)] <- gsub("X", "",t_choco_sep$id[1:length(t_choco_sep$id)]) # 詞頻統計 choco_count <- data.frame(table(as.character(t_choco_sep$words))) names(choco_count)[1] <- 'Keywords' names(choco_count)[2] <- 'Frequency' DF <- c(table(choco_count$Keywords)) # 選出具代表性的詞 library(reshape2) TCM <- acast(choco_count, Keywords ~ Frequency, value.var='Frequency', fill = 0, drop = FALSE, sum) TCB <- ifelse(TCM > 0, 1, 0) # 至少出現50次 selectedKW <- rowSums(TCM) >= 50 TCM <- as.data.frame(TCM[selectedKW,]) TCB <- as.data.frame(TCB[selectedKW,]) rownames(TCM) ``` ``` > rownames(TCM) [1] "白巧克力" "版" "冰淇淋" "餅乾" "布朗" "材料" "草莓" [8] "茶" "蛋糕" "店" "東西" "粉" "芙" "個人" [15] "醬" "結果" "咖啡" "烤箱" "口感" "口味" "美食" [22] "奶油" "尼" "檸檬" "牛奶" "朋友" "巧克力" "全家" [29] "熔岩" "乳酪" "食譜" "時間" "甜點" "味" "味道" [36] "鮮奶油" "餡" "香蕉" "杏仁" "原味" "早餐" ``` ### Prepare Data for Prediction ```r= # 以巧克力為主題分析 # 選出有巧克力的文章 chocolate <- dcard_area[grepl("巧克力", dcard_area$Content) == TRUE,] # 選出沒有巧克力的文章 not_chocolate <- dcard_area[grepl("巧克力", dcard_area$Content) == F,] # 去重 chocolate <- chocolate %>% distinct(Content ,ID, .keep_all = TRUE) not_chocolate <- not_chocolate %>% distinct(Content ,ID, .keep_all = TRUE) # 去不完整資料 chocolate <- chocolate[complete.cases(chocolate$Area) ==T,] not_chocolate <- not_chocolate[complete.cases(not_chocolate$Area) ==T,] # chocolate文章ID給標籤yes df <- data.frame(chocolate$ID, clas = 'yes') colnames(df) <- c("id","tpye") # 沒有chocolate文章ID給標籤no df1 <- data.frame(not_chocolate$ID, clas = 'no') colnames(df1) <- c("id","tpye") df2 <- rbind(df,df1) # 還原ID欄位資料 df_complete <- merge(df2, dcard_area, by.x = "id", by.y = "ID", all.x = TRUE) # 篩選預測需使用欄位 df_complete <- df_complete[,c(1,2,3,6,16,19,22)] # 觀察資料型態 sapply(df_complete,unique) # 發現性別沒清洗 df_complete <- df_complete %>% filter(Gender %in% c("F","M")) # 把欲預測欄位放到最後 s <- df_complete$tpye df_complete <- df_complete[,-2] df_complete$type <- s # - - - - - 開始正歸化 # 季節轉數值 replace1 <- function(x) { x = sub("春","0",x) x = sub("夏","1",x) x = sub("秋","2",x) x = sub("冬","3",x) x = as.integer(x) return (x) } # 城市轉數值 replace2 <- function(x) { x = sub("台北市","0",x) x = sub("新北市","1",x) x = sub("基隆市","2",x) x = sub("桃園市","3",x) x = sub("新竹市","4",x) x = sub("新竹縣","5",x) x = sub("苗栗縣","6",x) x = sub("台中市","7",x) x = sub("彰化縣","8",x) x = sub("雲林縣","9",x) x = sub("南投縣","10",x) x = sub("嘉義市","11",x) x = sub("嘉義縣","12",x) x = sub("台南市","13",x) x = sub("高雄市","14",x) x = sub("屏東縣","15",x) x = sub("宜蘭縣","16",x) x = sub("花蓮縣","17",x) x = sub("台東縣","18",x) x = sub("澎湖縣","19",x) x = sub("金門縣","20",x) x = as.integer(x) return (x) } # 性別轉數值 replace3 <- function(x) { x = sub("M","1",x) x = sub("F","0",x) x = as.integer(x) return (x) } sapply(df_complete, typeof) df_norm <- df_complete # 不知道為何無法直接對一個表格做事 df_norm.s <- as.data.frame(lapply(df_norm,replace1)) df_norm.s <- df_norm.s[,"Season"] df_norm.g <- data.frame(lapply(df_norm,replace3)) df_norm.g <- df_norm.g[,"Gender"] df_norm.c <- data.frame(lapply(df_norm,replace2)) df_norm.c <- df_norm.c[,"City"] df_norm <- cbind(df_norm ,df_norm.c) # 學校有152所較複雜,執行正規化處理 unique(df_norm$University) check_uni <- data.frame(unique(df_norm$University), c(1:152)) colnames(check_uni) <- c("x1","x2") df_ok <- merge(check_uni, df_norm, by.x = "x1", by.y = "University", all.x = TRUE) df_ok <- df_ok[,-c(1,4,6,7)] colnames(df_ok) <- c("University","ID","Year","Chocolate","Season","Gender","City") # 把欲預測欄位:巧克力討論度,放到最後 s <- df_ok$Chocolate df_ok <- df_ok[,-4] df_ok$Chocolate <- s # 轉換成類別 sapply(df_ok, typeof) for(i in c(1:6)) { df_ok[[i]] <- as.factor(df_ok[[i]]) } choco_module <- df_ok sapply(choco_module, typeof) select <- sample(1:nrow(choco_module),nrow(choco_module)*0.8) train <- choco_module[select,] test <- choco_module[-select,] ``` ### Prediction : Naive Bayes ```r= choco_module <- df_ok sapply(choco_module, typeof) # 把ID拿掉 choco_module <- choco_module[,-2] select <- sample(1:nrow(choco_module),nrow(choco_module)*0.8) train <- choco_module[select,] test <- choco_module[-select,] # Build Naive Bayes Model library(bnlearn) #目標屬性"BROADBAND" nb_default <- naive.bayes(train, training="Chocolate") plot(nb_default) test.y_hat <- predict(nb_default, test, prob=FALSE) test.y_hat_prob <- predict(nb_default, test, prob=TRUE) nb_default_results <- cbind(Prediction=as.character(test.y_hat), t(attr(test.y_hat_prob,"prob"))) nb_default_results # Model Evaluation accuracy.nb_default <- sum(test.y_hat==test$`Chocolate`) / length(test$`Chocolate`) accuracy.nb_default # accuracy.nb_default 0.9569649 agreement_KNN <- test.y_hat==test$`Chocolate` agreement_KNN table(test.y_hat, test$`Chocolate`, dnn=c("Prediction","Actual")) - - - - - - - - - Actual Prediction yes no yes 0 0 no 266 5915 ``` ![](https://i.imgur.com/cFeSX1K.jpg) ```r= # Build Bayes Net Tan Model library(bnlearn) bn_default <- tree.bayes(train, training="Chocolate") plot(bn_default) test.y_hat <- predict(bn_default, test, prob=FALSE) test.y_hat_prob <- predict(bn_default, test, prob=TRUE) bn_default_results <- cbind(Prediction=as.character(test.y_hat), t(as.matrix(attr(test.y_hat_prob,"prob")))) bn_default_results # Model Evaluation accuracy.nb_default <- sum(test.y_hat==test$`Chocolate`) / length(test$`Chocolate`) accuracy.nb_default # accuracy.nb_default 0.8975894 agreement_KNN <- test.y_hat==test$`Chocolate` agreement_KNN table(test.y_hat, test$`Chocolate`, dnn=c("Prediction","Actual")) - - - - - - - - - Actual Prediction yes no yes 7 374 no 259 5541 ``` ![](https://i.imgur.com/U2qyysD.jpg) ```r= mocha <- dcard_area[grepl("抹茶", dcard_area$Content) == TRUE,] ramen <- dcard_area[grepl("拉麵", dcard_area$Content) == TRUE,] cheese <- dcard_area[grepl("起司", dcard_area$Content) == TRUE,] cheese <- cheese %>% distinct(Content ,ID, .keep_all = TRUE) cheese <- cheese[complete.cases(cheese$Area) ==T,] mocha <- mocha %>% filter(Gender %in% c("F","M")) df3 <- data.frame(cheese$ID, clas = 'cheese') colnames(df1) <- c("id","type") df <- rbind(df, df3) unique(df$type) df_three <- merge(df, dcard_area, by.x = "id", by.y = "ID", all.x = TRUE) df_three <- df_three[,c(1,2,3,6,16,19,22)] sapply(df_three,unique) df_norm <- df_three # 不知道為何無法直接對一個表格做事 df_norm.s <- as.data.frame(lapply(df_norm,replace1)) df_norm.s <- df_norm.s[,"Season"] df_norm.g <- data.frame(lapply(df_norm,replace3)) df_norm.g <- df_norm.g[,"Gender"] df_norm.c <- data.frame(lapply(df_norm,replace2)) df_norm.c <- df_norm.c[,"City"] df_norm <- cbind(df_norm ,df_norm.c) # 學校較複雜,另外處理 unique(df_norm$University) check_uni <- data.frame(unique(df_norm$University), c(1:139)) colnames(check_uni) <- c("x1","x2") df_ok <- merge(check_uni, df_norm, by.x = "x1", by.y = "University", all.x = TRUE) df_ok <- df_ok[,-c(1,5,8,7)] colnames(df_ok) <- c("University","ID","Type","Year","Season","Gender","City") s <- df_ok$Type df_ok <- df_ok[,-3] df_ok$Type <- s # 轉換成類別 sapply(df_ok, typeof) for(i in c(1:6)) { df_ok[[i]] <- as.factor(df_ok[[i]]) } three_module <- df_ok sapply(three_module, typeof) # 把ID拿掉 three_module <- three_module[,-2] select <- sample(1:nrow(three_module),nrow(three_module)*0.8) train <- three_module[select,] test <- three_module[-select,] ``` ###### tags: `Food Forum` `Project`