# 0624-TITUS
###### tags: `資料科學自學園` `TITUS` `R Language` `Wechat`
```r
# Script setup: load the scraping packages and build the URL list.
#
# The workspace is deliberately NOT cleared here: the script reads the
# data frame `big3wechat` from the calling environment, so wiping the
# workspace first would delete the very input it needs.
library(tmcn)
library(xml2)
library(magrittr)
library(rvest)

if (!exists("big3wechat")) {
  stop("`big3wechat` must be loaded before running this script.",
       call. = FALSE)
}

# Article URLs to scrape, as a character vector. function_count() looks this
# global up by the name `list`, so the name is kept even though it shadows
# the base function (calls such as list(...) still resolve to the function).
list <- big3wechat$link %>% as.character()

# One-element placeholder (list(NA)); replaced by the scrape results later.
count <- NA %>% as.list()
# Count the images found in one WeChat article.
#
# Fetches the URL stored at position `no` of the global character vector
# `list`, then collects the `data-src` attribute of every <img> under
# `#js_content` and the nodes matching `#media`. Progress is reported
# through message().
#
# Args:
#   no: Index into the global `list` of article URLs.
#
# Returns:
#   A list of three unnamed elements:
#     [[1]] number of `data-src` values plus number of `#media` nodes,
#     [[2]] the `data-src` values (NA if extraction failed),
#     [[3]] the `#media` node set (NA if extraction failed).
function_count <- function(no) {
  # Evaluate one extraction step; on error, log it and fall back to NA.
  # The log line comes first and NA is the handler's last value, so the
  # message is actually emitted and the sentinel is actually returned.
  extract_or_na <- function(expr) {
    tryCatch(
      expr,
      error = function(err) {
        sprintf("Got Error at No. %d", no) %>%
          paste(conditionMessage(err), sep = "\n") %>%
          message()
        NA
      }
    )
  }

  # Number of items found. A lone atomic NA is the failure sentinel (or a
  # single image without `data-src`) and counts as zero. Testing the length
  # first keeps the condition scalar; `if (is.na(x))` on a longer vector is
  # an error in current R.
  count_found <- function(found) {
    if (is.atomic(found) && length(found) == 1L && is.na(found)) {
      return(0L)
    }
    length(found)
  }

  url <- list[no]
  html <- read_html(url, encoding = "UTF-8")
  sprintf("================No. %d web ================", no) %>%
    paste(url, sep = "\n") %>%
    message()

  sprintf("Catch img from '#js_content' ") %>% message()
  img_src <- extract_or_na(
    html_nodes(html, "#js_content") %>%
      html_nodes("img") %>%
      html_attr("data-src")
  )
  message(img_src)

  sprintf("Catch img from '#media' ") %>% message()
  media_nodes <- extract_or_na(html_nodes(html, "#media"))
  message(media_nodes)

  ln1 <- count_found(img_src)
  sprintf("Catch img from '#js_content', length = %d ", ln1) %>% message()
  ln2 <- count_found(media_nodes)
  sprintf("Catch img from '#media', length = %d ", ln2) %>% message()

  list(ln1 + ln2, img_src, media_nodes)
}
library(parallel)
detectCores()

# Start three workers and give them the packages and globals that
# function_count() relies on.
cl <- makeCluster(3)
clusterEvalQ(cl, library(xml2))
clusterEvalQ(cl, library(magrittr))
clusterEvalQ(cl, library(rvest))
clusterEvalQ(cl, library(tmcn))
clusterExport(cl, c("list", "count", "function_count"))

# Scrape every article. seq_along() yields an empty sequence for an empty
# URL list, where 1:length(list) would iterate over c(1, 0).
# NOTE(review): this runs sequentially in the main session, not on `cl`.
# function_count() returns xml2 node sets, which presumably do not survive
# being sent back from worker processes -- confirm before switching to
# parLapply(). The workers are shut down once the scrape finishes or fails
# so they are not left running.
count <- tryCatch(
  lapply(seq_along(list), function_count),
  finally = stopCluster(cl)
)

# html_nodes(html,"#js_view_source") %>% html_attr("href")
# html_nodes(html,"#js_view_source") %>% message()
# write(list,file= "list")

# First element of each result is the per-article total.
countno <- vapply(count, function(entry) entry[[1]], numeric(1))

# Positions of the articles whose total is anything other than exactly 1.
uncheck <- which(countno != 1, arr.ind = TRUE)
#----------------------------
# Resume pass: re-scrape from the last position present in `count` through
# the end of the URL list.
#
# The per-article work is delegated to function_count() instead of being
# repeated inline; the inline copy referenced `html1` / `html2`, which were
# never assigned in the loop.
seed <- length(count)

# Never start below 1, and skip the loop entirely when there is nothing to
# scrape (a:b counts downwards when a > b).
resume_from <- max(seed, 1L)
if (resume_from <= length(list)) {
  for (no in resume_from:length(list)) {
    # `[[<-` stores the whole three-element result in one slot; `[<-` would
    # keep only its first element and emit a length-mismatch warning.
    count[[no]] <- function_count(no)
  }
}

## TESTWEB
# Single article URL kept for manual testing.
urlll <- "https://mp.weixin.qq.com/s?__biz=MjM5MjAxNDM4MA==&mid=2666200188&idx=1&sn=b5a4d2371d4335ec9ee3c3dd84041613&chksm=bdb2953f8ac51c29debf6f175b1023bfae52e060084b01508a534d2c16c777f3a433ad81b5e3&scene=38#wechat_redirect"
```