# Dec. 31 Before New Year
###### tags: `資料科學自學園`
```{r}
# Crawling data
# NOTE(review): clearing the global environment is kept from the original
# workflow, but it also deletes any previously accumulated results whenever
# this chunk is re-run. Consider removing it.
rm(list = ls())
#-------------------------
# URLs of the account's listing pages 1..800. Page x starts at offset
# 12 * (x - 1). paste0() is vectorised, so the whole vector is built in one
# call instead of one lapply() iteration per page followed by unlist().
main <- paste0(
  "http://chuansong.me/account/cctvnewscenter?start=",
  12 * (seq_len(800) - 1)
)
#-------------------------
# Environment setup ----------------
library(xml2)
library(rvest)
library(magrittr)
library(tmcn)
# Set every locale category to "Chinese".
# NOTE(review): this looks like a Windows-style locale name; confirm it is
# accepted on the platform this notebook runs on.
Sys.setlocale(category = "LC_ALL", locale = "Chinese")
#---------------------------
library(parallel) # parallel processing
# Start a cluster of three worker processes.
cl <- makeCluster(spec = 3)
# Attach the scraping packages on every worker.
clusterEvalQ(
  cl = cl,
  expr = c(library(xml2), library(magrittr), library(rvest))
)
# Copy the listing-page URL vector `main` to the workers.
clusterExport(cl = cl, varlist = "main")
#---------------------------
#' Scrape the article links from one listing page.
#'
#' Reads the page at `main[i]`, selects the nodes matching ".question_link"
#' and returns their "href" attributes. `main` is looked up in the calling
#' process's global environment (it is exported to the cluster workers).
#'
#' @param i Index into the character vector `main`.
#' @param delay Length-2 numeric giving the lower and upper bound, in seconds,
#'   of the random pause taken after each request.
#' @return Character vector of "href" values, or NULL if the request or the
#'   parsing fails.
FUN.html <- function(i, delay = c(2, 4)) {
  links <- tryCatch(
    read_html(main[i]) %>%
      html_nodes(".question_link") %>%
      html_attr("href"),
    error = function(e) NULL
  )
  # The pause previously sat after return() and was never executed, so
  # requests were sent back to back. Sleep before handing the result back.
  Sys.sleep(runif(1, delay[1], delay[2]))
  links
}
# Copy the listing-page scraper to the workers.
clusterExport(cl, "FUN.html")
#----------------------------
# `html` accumulates article links across manually run batches. Start it
# empty when it does not exist yet (or is not a character vector), so the
# first batch in a clean environment does not fail on c(html, ...).
if (!exists("html", inherits = FALSE) || !is.character(html)) {
  html <- character(0)
}
#-----------------------------
# Scrape one batch of listing pages (301-400) on the cluster.
html.tmp <- parLapply(cl, 301:400, FUN.html)
# Manual probe of the next page; the result is only printed, not stored.
FUN.html(401)
html.part <- unlist(html.tmp)
html <- c(html, html.part)
rm(html.tmp, html.part)
#----------------------------
# Keep the workers' copy of `html` up to date, as the original script did.
clusterExport(cl, "html")
# Turn the relative links into absolute URLs. paste0() is vectorised, so no
# cluster round trip is needed. The length guard matters because paste0()
# treats a zero-length argument as "" and would return one bogus URL.
url <- if (length(html) > 0) {
  paste0("http://chuansong.me", html)
} else {
  character(0)
}
#-----------------------------
```
## 擷取內文資訊
```{r}
#' Download and parse one article page.
#'
#' Reads the page at `url[x]` with the "HUGE" parser option. `url` is the
#' global character vector of article URLs.
#'
#' @param x Index into `url`.
#' @param delay Length-2 numeric giving the lower and upper bound, in seconds,
#'   of the random pause taken after each request.
#' @return The value of read_html() for the page, or NULL if it fails.
FUN.art <- function(x, delay = c(2, 4)) {
  page <- tryCatch(
    read_html(url[x], options = "HUGE"),
    error = function(e) NULL
  )
  # The pause previously sat after return() and was never executed, so
  # requests were sent back to back. Sleep before handing the result back.
  Sys.sleep(runif(1, delay[1], delay[2]))
  page
}
#clusterExport(cl, "url")
# Download every article sequentially in the main process. seq_along() gives
# an empty index when `url` is empty, whereas 1:length(url) would count
# 1, 0 and trigger two pointless failing requests.
art.tmp <- lapply(seq_along(url), FUN.art)
#---------------------------------
# Text of the nodes matching "#activity-name" in the x-th parsed article of
# the global list `art.tmp`; NULL if the lookup or extraction fails.
FUN.title <- function(x) {
  tryCatch(
    {
      title_nodes <- html_nodes(art.tmp[[x]], "#activity-name")
      html_text(title_nodes)
    },
    error = function(e) NULL
  )
}
# Text of the nodes matching "#post-date" in the x-th parsed article of the
# global list `art.tmp`; NULL if the lookup or extraction fails.
FUN.date <- function(x) {
  on_failure <- function(e) NULL
  tryCatch(
    html_text(html_nodes(art.tmp[[x]], "#post-date")),
    error = on_failure
  )
}
# Text of the nodes matching "#img-content" in the x-th parsed article of the
# global list `art.tmp`; NULL if the lookup or extraction fails.
FUN.ctnt <- function(x) {
  tryCatch(
    {
      body_nodes <- html_nodes(art.tmp[[x]], "#img-content")
      html_text(body_nodes)
    },
    error = function(e) {
      NULL
    }
  )
}
# Show the downloaded documents for a quick visual check.
art.tmp
# Copy the parsed pages and the three extractors to the workers.
# NOTE(review): parsed documents may not survive being copied to another
# process; confirm before relying on the cluster for extraction.
clusterExport(
  cl = cl,
  varlist = c("art.tmp", "FUN.title", "FUN.date", "FUN.ctnt")
)
# Stand-alone copy of the title extractor, used for the one-element cluster
# probe below.
FUN.title.1 <- function(x) {
  tryCatch(
    html_text(html_nodes(art.tmp[[x]], "#activity-name")),
    error = function(e) NULL
  )
}
# Try the extractor on the cluster for the first article only.
parLapply(cl = cl, X = 1, fun = FUN.title.1)
# Extract title, date and body text of every article in the main process.
# seq_along() gives an empty index when `url` is empty, whereas
# 1:length(url) would count 1, 0 and produce two spurious NULL entries.
art.idx <- seq_along(url)
title.tmp <- lapply(art.idx, FUN.title)
date.tmp <- lapply(art.idx, FUN.date)
ctnt.tmp <- lapply(art.idx, FUN.ctnt)
```