# 2018-08-07_R教學
###### tags: `資料科學自學園`

# 馬懿
觀察網頁規律:`http://news.nsysu.edu.tw/files/40-1342-2910-[頁數].php?Lang=zh-tw`

```r=
# Set up the environment.
# NOTE: rm(list = ls()) wipes every object in the current workspace.
rm(list = ls())
library(xml2)
library(rvest)
library(tmcn)
library(tm)

# Number of listing pages to crawl.
n_pages <- 216

# Return the article URLs (href attributes of ".h5 a") found on listing
# page number `x`.
html.list <- function(x) {
  url <- paste0(
    "http://news.nsysu.edu.tw/files/40-1342-2910-", x, ".php?Lang=zh-tw"
  )
  read_html(url, encoding = "UTF-8", options = "HUGE") %>%
    html_nodes(".h5 a") %>%
    html_attr("href")
}

# Text of the first node matching `css` in `page`, or NA when nothing
# matches. Indexing an empty character vector with [1] yields NA, so every
# article contributes exactly one value per column and the columns stay
# aligned.
# NOTE(review): if an article can contain several matching nodes, only the
# first one is kept -- confirm this is what is wanted.
first_text <- function(page, css) {
  html_text(html_nodes(page, css))[1]
}

# Collect the article links from every listing page.
nsysu <- unlist(lapply(seq_len(n_pages), function(i) {
  links <- html.list(i)
  cat(i)
  links
}))

# Download each article once and pull out title, date and body text.
n_articles <- length(nsysu)
title <- character(n_articles)
date <- character(n_articles)
text <- character(n_articles)
for (i in seq_along(nsysu)) {
  page <- read_html(nsysu[i])
  title[i] <- first_text(page, ".item-title")
  date[i] <- first_text(page, ".attr-val")
  text[i] <- first_text(page, ".ptcontent")
  print(i)
}

# Clean up whitespace. The newline pass is repeated on purpose: removing
# runs of tabs can bring previously separated newlines next to each other.
title <- gsub("\n", "", title)
text <- gsub("\n{2,}", "", text)
text <- gsub("\t{2,}", "", text)
text <- gsub("\n{2,}", "", text)

nsysu.news <- data.frame(
  "標題" = title,
  "日期" = date,
  "內文" = text
)
write.csv(nsysu.news, file = "nsysu_news.csv", fileEncoding = "UTF-8")
```

# Tree
> 做了一些更改,經實驗,是可以運行的(不會產生ERROR)。

```r=
# Set up the environment.
# NOTE: rm(list = ls()) wipes every object in the current workspace.
rm(list = ls())
library(xml2)
library(rvest)
library(tmcn)
library(tm)

# Define functions

# Return the article URLs (href attributes of ".h5 a") found on listing
# page number `page`.
link_fun <- function(page) {
  web <- paste0(
    "http://news.nsysu.edu.tw/files/40-1342-2910-", page, ".php?Lang=zh-tw"
  )
  read_html(web, encoding = "UTF-8", options = "HUGE") %>%
    html_nodes(".h5 a") %>%
    html_attr("href")
}

# Scrape one article page.
# Returns a matrix with the columns title, date, ctnt and link; a field
# with no matching node is filled with NA.
arti_fun <- function(link) {
  html <- read_html(link, encoding = "UTF-8", options = "HUGE")
  title <- html_nodes(html, ".item-title") %>% html_text()
  date <- html_nodes(html, ".attr-val") %>% html_text()
  ctnt <- html_nodes(html, ".ptcontent") %>% html_text()
  if (length(title) == 0) {
    title <- NA
  }
  if (length(date) == 0) {
    date <- NA
  }
  if (length(ctnt) == 0) {
    ctnt <- NA
  }
  cbind(title, date, ctnt, link)
}

# Scrape every article linked from listing page `number` and write the
# result to one CSV file, then pause 5 seconds before the next page.
# Returns NULL invisibly; called for its side effects.
save_fun <- function(number) {
  link <- link_fun(number)
  if (length(link) == 0) {
    warning("No article links found on page ", number, call. = FALSE)
    return(invisible(NULL))
  }

  # Fetch each article, logging its title and the time as progress output.
  rows <- lapply(link, function(url) {
    row <- arti_fun(url)
    message(paste(row[1, 1], Sys.time(), sep = "\t"))
    row
  })
  arti <- do.call(rbind, rows)

  # Clean up whitespace. The newline pass is repeated on purpose: removing
  # runs of tabs can bring previously separated newlines next to each other.
  arti[, 1] <- gsub("\n", "", arti[, 1])
  arti[, 3] <- gsub("\n{2,}", "", arti[, 3])
  arti[, 3] <- gsub("\t{2,}", "", arti[, 3])
  arti[, 3] <- gsub("\n{2,}", "", arti[, 3])

  # NOTE(review): the file name contains ": ", and Windows does not allow a
  # colon in file names -- confirm the target platform.
  name <- paste0("中山大學新聞網: ", number, "_", Sys.Date(), ".csv")
  write.csv(arti, file = name, fileEncoding = "UTF-8")
  Sys.sleep(5)
  invisible(NULL)
}

# Run and write to CSV
invisible(lapply(1:216, save_fun))
```

施馬(改寫function之ㄍㄛˊ在一起版)

```r=
library(xml2)
library(rvest)

# Text of the first node matching `css` in `page`, or NA when nothing
# matches (indexing an empty character vector with [1] yields NA).
# NOTE(review): if an article can contain several matching nodes, only the
# first one is kept -- confirm this is what is wanted.
first_text <- function(page, css) {
  html_text(html_nodes(page, css))[1]
}

# Scrape every article linked from listing page `x` and write the titles,
# dates and body texts to "nsysunews-<x>.csv", then pause 5 seconds.
html.list <- function(x) {
  url <- paste0(
    "http://news.nsysu.edu.tw/files/40-1342-2910-", x, ".php?Lang=zh-tw"
  )
  links <- read_html(url, encoding = "UTF-8", options = "HUGE") %>%
    html_nodes(".h5 a") %>%
    html_attr("href")

  # Local result vectors. Without them, `title`, `date` and `text` would
  # resolve to the functions of the same name in graphics/base.
  n_links <- length(links)
  title <- character(n_links)
  date <- character(n_links)
  text <- character(n_links)

  # Iterate over the links actually found instead of assuming 15 per page.
  for (i in seq_along(links)) {
    page <- read_html(links[i])
    title[i] <- first_text(page, ".item-title")
    date[i] <- first_text(page, ".attr-val")
    text[i] <- first_text(page, ".ptcontent")
  }

  title <- gsub("\n", "", title)
  text <- gsub("\n", "", text)
  text <- gsub("\t", "", text)

  nsysu.news <- cbind(title, date, text)
  filename <- paste0("nsysunews-", x, ".csv")
  write.csv(nsysu.news, file = filename, fileEncoding = "UTF-8")
  print(paste0("the page ", x, " had finished."))
  Sys.sleep(5)
}

for (i in 1:3) {
  html.list(i)
}
```