2019-01-09-黨報頭條

# 2019-01-09-黨報頭條

###### tags: `資料科學自學園`

```r=
# Scraper for the People's Daily database front-page listing
# (data.people.com.cn/pd/dbtbyw). For every listing page it collects the
# article links plus their source/date labels, downloads each article, and
# writes one CSV per listing page into the working directory.

# NOTE(review): clearing the global workspace from a script is discouraged;
# kept so that running this note behaves as it did before.
rm(list = ls())

library(xml2)
library(httr)
library(tmcn)
library(tm)
library(rvest)
library(magrittr)

# Last listing page to fetch (inclusive).
EndNo <- 12491

# Log a caught condition together with the current time and return NA so the
# caller can carry on with a missing value.
log_condition <- function(cond) {
  message(conditionMessage(cond), "\n", format(Sys.time()))
  NA
}

# Download and parse `url` (10 s timeout, UTF-8). Extra arguments are
# forwarded to read_html(). Returns the parsed document, or NA when the
# request or the parse raised an error or a warning.
fetch_html <- function(url, ...) {
  tryCatch(
    read_html(httr::GET(url, timeout(10)), encoding = "UTF-8", ...),
    error = log_condition,
    warning = log_condition
  )
}

# Collapse a character vector into one string (newline separated), ignoring
# NA entries; returns NA_character_ when nothing is left.
collapse_text <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(NA_character_)
  }
  paste(x, collapse = "\n")
}

# Fetch one listing page.
#
# pageNo: page number inserted into the search URL.
# Returns list(list(links), list(labels)): `links` are the href values of
# the "h2 a" nodes, `labels` the texts of the ".news_sum_bottom span" nodes
# with the "时间： " prefix removed. Either element is NA when the page could
# not be fetched or parsed.
fun_Num <- function(pageNo) {
  srch.page <- paste0(
    "http://data.people.com.cn/pd/dbtbyw/s?top=1&pageNo=",
    pageNo,
    "&qs=%7B%22cId%22%3A%2263%22%2C%22cds%22%3A%5B%7B%22fld%22%3A%22dataTime.start%22%2C%22cdr%22%3A%22AND%22%2C%22hlt%22%3A%22false%22%2C%22vlr%22%3A%22AND%22%2C%22qtp%22%3A%22DEF%22%2C%22val%22%3A%222012-01-01%22%7D%2C%7B%22fld%22%3A%22dataTime.end%22%2C%22cdr%22%3A%22AND%22%2C%22hlt%22%3A%22false%22%2C%22vlr%22%3A%22AND%22%2C%22qtp%22%3A%22DEF%22%2C%22val%22%3A%222018-11-30%22%7D%5D%2C%22obs%22%3A%5B%7B%22fld%22%3A%22dataTime%22%2C%22drt%22%3A%22DESC%22%7D%5D%7D"
  )
  page_ht <- fetch_html(srch.page)

  if (inherits(page_ht, "xml_document")) {
    artilink <- tryCatch(
      html_nodes(page_ht, "h2 a") %>% html_attr("href"),
      error = log_condition,
      warning = log_condition
    )
    rsrcwdate <- tryCatch(
      html_nodes(page_ht, ".news_sum_bottom span") %>% html_text(),
      error = log_condition,
      warning = log_condition
    )
  } else {
    # The download failed; there is nothing to select from.
    artilink <- NA
    rsrcwdate <- NA
  }
  rsrcwdate <- gsub("时间： ", "", rsrcwdate)

  print(paste0("finish at ", Sys.time()))
  # One random pause of 3-20 s between requests.
  Sys.sleep(runif(1, 3, 20))
  list(list(artilink), list(rsrcwdate))
}

# Fetch one article.
#
# artilink: site-relative link as returned by fun_Num().
# Returns list(title, content); an element is NA when the page could not be
# fetched or the corresponding node was not found.
fun_arti <- function(artilink) {
  if (isTRUE(is.na(artilink))) {
    # No link to follow: skip the request (and the pause) entirely.
    return(list(NA, NA))
  }

  arti_ht <- fetch_html(
    paste0("http://data.people.com.cn", artilink),
    options = "HUGE"
  )

  # fetch_html() yields NA on failure. Test the class rather than is.na():
  # is.na() on a parsed document returns one value per list element, which
  # is not a valid `if` condition.
  if (!inherits(arti_ht, "xml_document")) {
    title <- NA
    ctnt <- NA
    print("抓取網頁失敗！")
  } else {
    # CSS selector of the article headline.
    title <- html_nodes(arti_ht, ".title h2") %>% html_text()
    if (length(title) == 0) {
      title <- NA
    }
    ctnt <- html_nodes(arti_ht, "#detail-p") %>% html_text()
    if (length(ctnt) == 0) {
      ctnt <- NA
    }
  }

  arti <- list(title, ctnt)
  message(title)
  message(paste0("finish at ", Sys.time()))
  Sys.sleep(runif(1, 3, 20))
  arti
}

# Crawl listing pages startNum..endNum and write one CSV per page, named
# "dbtbyw_No_<page>.csv", with the columns NewsDate, Title, Reference,
# Content and URL.
#
# startNum: first listing page to fetch.
# endNum:   last listing page to fetch; defaults to the global EndNo.
FUN_RUN <- function(startNum, endNum = EndNo) {
  for (pageNo in startNum:endNum) {
    print(sprintf("================ 抓取 %s 的內容 ================", pageNo))

    listing <- fun_Num(pageNo)
    links <- listing[[1]][[1]]
    labels <- listing[[2]][[1]]

    if (length(links) == 0 || all(is.na(links))) {
      # A failed or empty listing page must not abort the whole crawl.
      message("No article links found on page ", pageNo, "; skipping.")
      next
    }

    n_arti <- length(links)
    if (length(labels) != 2 * n_arti) {
      message(
        "Page ", pageNo, ": expected ", 2 * n_arti,
        " source/date labels but found ", length(labels), "."
      )
    }
    # Labels come in pairs per article: source first, then date.
    # Positions past the end of `labels` yield NA.
    rsrc <- labels[seq(1, by = 2, length.out = n_arti)]
    news_date <- labels[seq(2, by = 2, length.out = n_arti)]

    articles <- lapply(links, fun_arti)

    # One row per article; multi-node titles/contents are joined so every
    # row has exactly five fields, each under its matching header.
    E <- data.frame(
      NewsDate = news_date,
      Title = vapply(
        articles, function(a) collapse_text(a[[1]]), character(1)
      ),
      Reference = rsrc,
      Content = vapply(
        articles, function(a) collapse_text(a[[2]]), character(1)
      ),
      URL = links,
      stringsAsFactors = FALSE
    )

    filename <- paste0("dbtbyw", "_No_", pageNo, ".csv")
    write.csv(E, file = filename, fileEncoding = "UTF-8")
    print(paste0("================ ", "FINISH and write into ", filename, "================"))
    Sys.sleep(runif(1, 3, 20))
  }
}

startNum <- 7871
FUN_RUN(startNum)
```