# 2018-08-07_R教學
###### tags: `資料科學自學園`
# 馬懿
觀察網頁規律:http://news.nsysu.edu.tw/files/40-1342-2910-[頁數].php?Lang=zh-tw
```r=
# 設置環境
rm(list = ls())
library(xml2)
library(rvest)
library(tmcn)
library(tm)
# Collect the article links listed on one index page of the NSYSU news site.
#   x: page number substituted into the index URL.
# Returns the "href" attribute of every node matched by the ".h5 a" selector.
html.list <- function(x) {
  page_url <- paste0(
    "http://news.nsysu.edu.tw/files/40-1342-2910-", x, ".php?Lang=zh-tw"
  )
  index_page <- read_html(page_url, encoding = "UTF-8", options = "HUGE")
  anchors <- html_nodes(index_page, ".h5 a")
  html_attr(anchors, "href")
}
# Gather the article links from index pages 1 to 216 into one character
# vector `nsysu`. The per-page results are collected in a list and flattened
# once, so `nsysu` does not have to exist beforehand and is not re-copied on
# every iteration.
nsysu <- unlist(lapply(seq_len(216), function(i) {
  links <- html.list(i)
  cat(i)  # progress indicator: prints the page number just fetched
  links
}))
# Scrape every article URL in `nsysu` and save the result as a single CSV.
# Each article page is downloaded once (not once per field), and every field
# is reduced to exactly one value per article so the three columns always
# have the same length and stay aligned row by row.

# Text of all nodes matching `css` in `page`, joined into one string;
# NA when the selector matches nothing.
field_text <- function(page, css) {
  found <- page %>% html_nodes(css) %>% html_text()
  if (length(found) == 0) {
    return(NA_character_)
  }
  paste(found, collapse = " ")
}

n_articles <- length(nsysu)
title <- rep(NA_character_, n_articles)
date <- rep(NA_character_, n_articles)
text <- rep(NA_character_, n_articles)

for (i in seq_len(n_articles)) {
  page <- read_html(nsysu[i])
  title[i] <- field_text(page, ".item-title")
  date[i] <- field_text(page, ".attr-val")
  text[i] <- field_text(page, ".ptcontent")
  print(i)  # progress indicator
}

# Clean up whitespace once, on the complete vectors.
title <- gsub("\n", "", title)
text <- gsub("\n{2,}", "", text)
text <- gsub("\t{2,}", "", text)
# Second newline pass is intentional: removing tab runs can bring separate
# newlines together into a new run.
text <- gsub("\n{2,}", "", text)

nsysu.news <- data.frame(
  "標題" = title,
  "日期" = date,
  "內文" = text,
  stringsAsFactors = FALSE
)
write.csv(nsysu.news, file = "nsysu_news.csv", fileEncoding = "UTF-8")
```
# Tree
> 做了一些更改,經實驗,是可以運行的(不會產生ERROR)。
```r=
# SETTING ENVIRONMENT
rm(list = ls())
library(xml2)
library(rvest)
library(tmcn)
library(tm)
# SET FUNCTION
# Return the article URLs listed on one index page of the news site.
#   page: page number inserted into the index URL.
# The result is the "href" attribute of every node matching ".h5 a".
link_fun <- function(page) {
  index_url <- paste0(
    "http://news.nsysu.edu.tw/files/40-1342-2910-", page, ".php?Lang=zh-tw"
  )
  index_html <- read_html(index_url, encoding = "UTF-8", options = "HUGE")
  html_attr(html_nodes(index_html, ".h5 a"), "href")
}
# Download one article and return its fields as a matrix.
#   link: URL of the article page.
# Returns cbind() of the columns "title", "date", "ctnt" and "link"; a field
# whose selector matches nothing is NA.
arti_fun <- function(link) {
  page <- read_html(link, encoding = "UTF-8", options = "HUGE")

  # Text of the nodes matching `css`, or NA when there are none.
  grab <- function(css) {
    found <- html_text(html_nodes(page, css))
    if (length(found) == 0) NA else found
  }

  # Explicit argument names keep the column names of the result fixed.
  cbind(
    title = grab(".item-title"),
    date = grab(".attr-val"),
    ctnt = grab(".ptcontent"),
    link = link
  )
}
# Scrape every article listed on one index page and write them to a CSV file.
#   number: index page number (passed to link_fun and used in the file name).
# Side effects: writes "中山大學新聞網: <number>_<date>.csv" in the working
# directory, emits one progress message per article, and sleeps 5 seconds
# before returning so consecutive calls are spaced out.
# Returns NULL invisibly.
save_fun <- function(number) {
  link <- link_fun(number)

  if (length(link) == 0) {
    warning("no article links found on page ", number, call. = FALSE)
    Sys.sleep(5)
    return(invisible(NULL))
  }

  # Collect the per-article matrices in a list and bind them once at the end,
  # instead of growing a matrix that does not exist yet inside the loop.
  rows <- vector("list", length(link))
  for (i in seq_along(link)) {
    rows[[i]] <- arti_fun(link[i])
    # Report the title of the article just fetched (first row of its matrix).
    message(paste(rows[[i]][1, 1], Sys.time(), sep = "\t"))
  }
  arti <- do.call(rbind, rows)

  # Column 1 is the title, column 3 the article body.
  arti[, 1] <- gsub("\n", "", arti[, 1])
  arti[, 3] <- gsub("\n{2,}", "", arti[, 3])
  arti[, 3] <- gsub("\t{2,}", "", arti[, 3])
  # Second newline pass is intentional: removing tab runs can bring separate
  # newlines together into a new run.
  arti[, 3] <- gsub("\n{2,}", "", arti[, 3])

  # NOTE(review): the file name contains ": ", which is not a valid file name
  # on Windows - confirm the target platform before relying on it.
  name <- paste0("中山大學新聞網: ", number, "_", Sys.Date(), ".csv")
  write.csv(arti, file = name, fileEncoding = "UTF-8")
  Sys.sleep(5)
}
# RUN AND WRITE TO CSV
# Call save_fun once for each index page from 1 to 216.
lapply(X = seq_len(216), FUN = save_fun)
```
# 施馬(改寫function之ㄍㄛˊ在一起版)
```r=
# Scrape one index page of the news site and save its articles to a CSV file.
#   x: page number substituted into the index URL and the output file name.
# Side effects: writes "nsysunews-<x>.csv" in the working directory, prints a
# completion message, and sleeps 5 seconds before returning.
html.list <- function(x) {
  index_url <- paste0(
    "http://news.nsysu.edu.tw/files/40-1342-2910-", x, ".php?Lang=zh-tw"
  )
  nsysu <- read_html(index_url, encoding = "UTF-8", options = "HUGE") %>%
    html_nodes(".h5 a") %>%
    html_attr("href")

  # Text of all nodes matching `css` in `page`, joined into one string;
  # NA when the selector matches nothing.
  field_text <- function(page, css) {
    found <- page %>% html_nodes(css) %>% html_text()
    if (length(found) == 0) {
      return(NA_character_)
    }
    paste(found, collapse = " ")
  }

  # The result vectors are local and start empty for every page, so nothing
  # from the calling environment can leak into the output. Exactly one value
  # per article keeps the three columns aligned.
  n_articles <- length(nsysu)
  title <- rep(NA_character_, n_articles)
  date <- rep(NA_character_, n_articles)
  text <- rep(NA_character_, n_articles)

  # Iterate over the links actually found rather than a fixed count, and
  # download each article once instead of once per field.
  for (i in seq_len(n_articles)) {
    page <- read_html(nsysu[i])
    title[i] <- field_text(page, ".item-title")
    date[i] <- field_text(page, ".attr-val")
    text[i] <- field_text(page, ".ptcontent")
  }

  title <- gsub("\n", "", title)
  text <- gsub("\n", "", text)
  text <- gsub("\t", "", text)

  nsysu.news <- cbind(title, date, text)
  filename <- paste0("nsysunews-", x, ".csv")
  write.csv(nsysu.news, file = filename, fileEncoding = "UTF-8")
  print(paste0("the page ", x, " had finished."))
  Sys.sleep(5)
}
# Scrape index pages 1 to 3, one html.list() call per page.
for (i in seq_len(3)) html.list(i)
```