# Dec. 31 Before New Year
###### tags: `資料科學自學園`

```{r}
# Crawl the index pages of the "cctvnewscenter" account and collect the
# article links found on them.

# Clear objects left over from earlier runs.
# NOTE(review): rm(list = ls()) wipes the whole workspace and is generally
# discouraged in scripts; kept here to preserve the notebook's behaviour.
rm(list = ls())

# Index page URLs: 800 pages, the `start` offset advances by 12 per page.
# -------------------------
main <- paste0(
  "http://chuansong.me/account/cctvnewscenter?start=",
  12 * (seq_len(800) - 1)
)

# Environment setup ----
library(xml2)
library(rvest)
library(magrittr)
library(tmcn) # NOTE(review): no tmcn function is called in the visible code
# NOTE(review): the locale name "Chinese" is presumably platform-specific --
# confirm it resolves on the machine that runs this notebook.
Sys.setlocale("LC_ALL", "Chinese")

# Parallel back end ----
# Three worker processes; each one loads the scraping packages and receives a
# copy of `main`, because FUN.html() looks `main` up on the worker.
# NOTE(review): the visible code never calls stopCluster(cl); stop the cluster
# once no later chunk needs it.
library(parallel)
cl <- makeCluster(3)
clusterEvalQ(cl, {
  library(xml2)
  library(magrittr)
  library(rvest)
})
clusterExport(cl, "main")

# Index page scraper ----
# Fetch index page `i` (an index into `main`) and return the `href` of every
# ".question_link" node on it. Any error while fetching or parsing yields
# NULL. The function then waits 2-4 seconds before returning so consecutive
# requests are spaced out; the wait has to come before the value is returned,
# otherwise it is never executed.
FUN.html <- function(i) {
  links <- tryCatch(
    read_html(main[i]) %>%
      html_nodes(., ".question_link") %>%
      html_attr("href"),
    error = function(e) NULL
  )
  Sys.sleep(runif(1, 2, 4))
  links
}
clusterExport(cl, "FUN.html")

# Link accumulator ----
# `html` collects article links across batches. It is only initialised when
# it does not already exist as a character vector, so re-running this part
# with another page batch appends to what has been collected so far.
if (!exists("html", inherits = FALSE) || !is.character(html)) {
  html <- character(0)
}

# Scrape one batch of index pages on the cluster.
page.batch <- 301:400
html.tmp <- parLapply(cl, page.batch, FUN.html)
# NOTE(review): the result of this call is not stored; it looks like an
# interactive spot check of page 401 -- confirm it is still wanted.
FUN.html(401)
html.part <- unlist(html.tmp)
html <- c(html, html.part)
rm(html.tmp, html.part)

# Absolute article URLs ----
# paste0() is vectorised, so no cluster round trip is needed. The length
# guard matters: paste0() treats a zero-length argument as "", which would
# otherwise produce the bare host name as a bogus URL when no link was found.
url <- if (length(html) > 0L) {
  paste0("http://chuansong.me", html)
} else {
  character(0)
}
```

## 擷取內文資訊

```{r}
# Download and parse article number `x` (an index into `url`). Returns the
# parsed document, or NULL when the request or the parsing fails. Waits 2-4
# seconds after each request before returning.
FUN.art <- function(x) {
  page <- tryCatch(
    read_html(url[x], options = "HUGE"),
    error = function(e) NULL
  )
  Sys.sleep(runif(1, 2, 4))
  page
}
# clusterExport(cl, "url")
# seq_along() keeps this a no-op when `url` is empty (1:length(url) would
# visit indices 1 and 0).
art.tmp <- lapply(seq_along(url), FUN.art)

# Field extractors ----
# Each extractor reads document `x` from the global list `art.tmp` and
# returns the text of the nodes matching one CSS selector, or NULL when the
# lookup raises an error (for example because the download failed and the
# entry is NULL).

# Text of the "#activity-name" node(s).
FUN.title <- function(x) {
  tryCatch(
    html_nodes(art.tmp[[x]], "#activity-name") %>% html_text(),
    error = function(e) NULL
  )
}

# Text of the "#post-date" node(s).
FUN.date <- function(x) {
  tryCatch(
    html_nodes(art.tmp[[x]], "#post-date") %>% html_text(),
    error = function(e) NULL
  )
}

# Text of the "#img-content" node(s).
FUN.ctnt <- function(x) {
  tryCatch(
    html_nodes(art.tmp[[x]], "#img-content") %>% html_text(),
    error = function(e) NULL
  )
}

# Print the fetched documents for inspection.
art.tmp

# Parallel extraction experiment ----
# NOTE(review): parsed xml2 documents presumably wrap external pointers that
# do not survive being shipped to worker processes -- verify. The extraction
# that is actually kept (below) runs serially in the main process.
clusterExport(cl, c("art.tmp", "FUN.title", "FUN.date", "FUN.ctnt"))
# Same body as FUN.title(); used only for the single-document cluster call.
FUN.title.1 <- function(x) {
  tryCatch(
    html_nodes(art.tmp[[x]], "#activity-name") %>% html_text(),
    error = function(e) NULL
  )
}
parLapply(cl, 1, FUN.title.1)

# Serial extraction over every fetched article.
title.tmp <- lapply(seq_along(url), FUN.title)
date.tmp <- lapply(seq_along(url), FUN.date)
ctnt.tmp <- lapply(seq_along(url), FUN.ctnt)
```