# 2019-01-09-黨報頭條
###### tags: `資料科學自學園`
```r=
# Script setup: reset the workspace, attach the packages used below, and set
# the last listing page to crawl.
rm(list = ls()) # remove every object from the current workspace
library(xml2)
library(httr)
library(tmcn)
library(tm)
library(rvest)
library(magrittr)
# Last listing page number; read by FUN_RUN as the upper bound of its page loop.
EndNo = 12491
# Fetch one page of the search listing on data.people.com.cn and extract the
# article links plus the per-article metadata spans.
#
# The fixed query string (URL-encoded JSON) restricts results to
# dataTime.start = 2012-01-01 .. dataTime.end = 2018-11-30, ordered by
# dataTime DESC.
#
# pageNo: page number inserted into the `pageNo` query parameter.
#
# Returns list(list(links), list(spans)):
#   links - href attributes of every "h2 a" node,
#   spans - text of every ".news_sum_bottom span" node with the "时间: "
#           prefix removed.
# If the request or the parsing fails (any error or warning), the failing
# part is NA instead, and the condition is logged via message().
fun_Num <- function(pageNo) {
  # Shared handler: log the condition text with a timestamp and yield NA.
  # conditionMessage() is used because pasting the condition object itself
  # mixes its message and call into one unreadable string.
  na_on_failure <- function(cond) {
    message(paste(conditionMessage(cond), Sys.time(), sep = "\n"))
    NA
  }

  srch.page <- paste0('http://data.people.com.cn/pd/dbtbyw/s?top=1&pageNo=',pageNo,'&qs=%7B%22cId%22%3A%2263%22%2C%22cds%22%3A%5B%7B%22fld%22%3A%22dataTime.start%22%2C%22cdr%22%3A%22AND%22%2C%22hlt%22%3A%22false%22%2C%22vlr%22%3A%22AND%22%2C%22qtp%22%3A%22DEF%22%2C%22val%22%3A%222012-01-01%22%7D%2C%7B%22fld%22%3A%22dataTime.end%22%2C%22cdr%22%3A%22AND%22%2C%22hlt%22%3A%22false%22%2C%22vlr%22%3A%22AND%22%2C%22qtp%22%3A%22DEF%22%2C%22val%22%3A%222018-11-30%22%7D%5D%2C%22obs%22%3A%5B%7B%22fld%22%3A%22dataTime%22%2C%22drt%22%3A%22DESC%22%7D%5D%7D')

  page_ht <- tryCatch(
    read_html(httr::GET(srch.page, timeout(10)), encoding = "UTF-8"),
    error = na_on_failure,
    warning = na_on_failure
  )

  # When page_ht is NA these calls fail too and are mapped to NA by the
  # same handler, so a failed download yields NA links and NA spans.
  artilink <- tryCatch(
    html_nodes(page_ht, "h2 a") %>% html_attr("href"),
    error = na_on_failure,
    warning = na_on_failure
  )
  rsrcwdate <- tryCatch(
    html_nodes(page_ht, ".news_sum_bottom span") %>% html_text(),
    error = na_on_failure,
    warning = na_on_failure
  )
  rsrcwdate <- gsub("时间: ", "", rsrcwdate)

  print(paste0("finish at ", Sys.time()))
  # Random pause of 3-20 seconds between requests. One draw is enough:
  # Sys.sleep() takes a single duration.
  Sys.sleep(runif(1, 3, 20))
  list(list(artilink), list(rsrcwdate))
}
# Download one article page and extract its headline and body text.
#
# artilink: site-relative article path; it is appended to
#           'http://data.people.com.cn' to form the request URL.
#
# Returns list(title, ctnt):
#   title - text of the ".title h2" node(s), NA if none matched,
#   ctnt  - text of the "#detail-p" node(s), NA if none matched.
# Both are NA when the download or parse raised an error or a warning.
fun_arti <- function(artilink) {
  # Shared handler: log the condition text with a timestamp and yield NA.
  na_on_failure <- function(cond) {
    message(paste(conditionMessage(cond), Sys.time(), sep = "\n"))
    NA
  }

  arti_ht <- tryCatch(
    read_html(
      httr::GET(paste0('http://data.people.com.cn', artilink), timeout(10)),
      encoding = 'UTF-8', options = 'HUGE'
    ),
    error = na_on_failure,
    warning = na_on_failure
  )

  # Compare against the NA sentinel with identical(): is.na() on a parsed
  # document returns one value per list element, which makes `if` fail with
  # "the condition has length > 1" on R >= 4.2.
  if (identical(arti_ht, NA)) {
    title <- NA
    ctnt <- NA
    print("抓取網頁失敗!")
  } else {
    title <- html_nodes(arti_ht, ".title h2") %>% html_text() # headline selector
    if (length(title) == 0) title <- NA
    ctnt <- html_nodes(arti_ht, "#detail-p") %>% html_text() # body selector
    if (length(ctnt) == 0) ctnt <- NA
  }

  message(title)
  message(paste0("finish at ", Sys.time()))
  # Random pause of 3-20 seconds between requests. One draw is enough:
  # Sys.sleep() takes a single duration.
  Sys.sleep(runif(1, 3, 20))
  list(title, ctnt)
}
# Crawl listing pages startNum..endNum and write one CSV file per page.
#
# For each page number the listing is fetched with fun_Num(), every linked
# article is fetched with fun_arti(), and the rows are written to
# "dbtbyw_No_<pageNo>.csv" in the working directory with five columns:
# NewsDate, Title, Reference, Content, URL.
#
# startNum: first page number to fetch.
# endNum:   last page number to fetch; defaults to the global EndNo.
#
# Pages whose listing could not be fetched, or whose metadata does not line
# up with the links, are reported with message() and skipped.
# Returns NULL invisibly; called for its side effects.
FUN_RUN <- function(startNum, endNum = EndNo) {
  # Reduce whatever fun_arti() returned for one field to exactly one cell:
  # NA when nothing usable is present, otherwise the pieces joined by "\n".
  one_cell <- function(x) {
    x <- as.character(unlist(x))
    x <- x[!is.na(x)]
    if (length(x) == 0) NA_character_ else paste(x, collapse = "\n")
  }

  for (pageNo in startNum:endNum) {
    print(sprintf("================ 抓取 %s 的內容 ================", pageNo))
    listing <- fun_Num(pageNo)
    links <- unlist(listing[[1]])
    spans <- unlist(listing[[2]])

    # The listing is expected to carry two spans per article (source first,
    # then date). Anything else means the page failed or its layout differs.
    if (length(links) == 0 || all(is.na(links)) ||
        length(spans) != 2 * length(links)) {
      message(sprintf("page %s: listing unavailable or malformed, skipped",
                      pageNo))
      next
    }
    rsrc <- spans[c(TRUE, FALSE)]  # odd positions: source
    dates <- spans[c(FALSE, TRUE)] # even positions: date

    articles <- lapply(links, fun_arti)
    # `[[` extracts the fields. The previous `[` indexing returned the whole
    # article for field 1 and NULL for field 2, which shifted the content
    # into the Reference column.
    titles <- vapply(articles, function(a) one_cell(a[[1]]), character(1))
    contents <- vapply(articles, function(a) one_cell(a[[2]]), character(1))

    # Five-column character matrix, one row per article. Built in one step
    # so a page with a single article is still written as one row.
    E <- cbind(
      NewsDate = dates,     # date of the news item
      Title = titles,       # headline
      Reference = rsrc,     # source shown in the listing
      Content = contents,   # article body
      URL = links           # site-relative link to the article
    )

    filename <- paste0('dbtbyw', '_No_', pageNo, ".csv") # one file per page
    write.csv(E, file = filename, fileEncoding = "UTF-8")
    print(paste0("================ ","FINISH and write into ",filename,"================"))
    # Random pause of 3-20 seconds before the next listing page.
    Sys.sleep(runif(1, 3, 20))
  }
}
# Script entry point: start crawling at listing page 7871.
startNum <- 7871
FUN_RUN(startNum)
```