# 0624-TITUS
###### tags: `資料科學自學園` `TITUS` `R Language` `Wechat`

```r
library(tmcn)
library(xml2)
library(magrittr)
library(rvest)
library(parallel)

# NOTE: the former `rm(list = ls())` was dropped. It wiped the workspace,
# including `big3wechat`, which the very next statement reads.
stopifnot(exists("big3wechat"))

# Article URLs to scrape. The name `list` shadows base::list() as a variable
# (function calls to list() still resolve correctly); it is kept because
# function_count() and clusterExport() below refer to it by this name.
list <- as.character(big3wechat$link)
count <- as.list(NA)

#' Count the image/media nodes found on one article page.
#'
#' Reads the URL stored at `list[no]` (a global character vector), collects
#' the `data-src` attribute of every `img` under `#js_content` and the nodes
#' matching `#media`, and logs progress with message().
#'
#' @param no Index into the global `list` of URLs.
#' @param verbose If TRUE (default), also print the extracted `data-src`
#'   values and the `#media` nodes.
#' @return A list of three elements: the total count (`#js_content` images
#'   plus `#media` nodes), the `data-src` character vector, and the `#media`
#'   nodeset. If the page cannot be read or parsed, the count is 0 and the
#'   other two elements are NA, so one bad URL does not abort a whole run.
function_count <- function(no, verbose = TRUE) {
  # Helpers are nested so that exporting `function_count` alone is enough.

  # Log a caught condition together with the index being processed.
  log_error <- function(e) {
    message(sprintf("Got Error at No. %d", no), "\n", conditionMessage(e))
  }

  # Number of items found; an empty result or the single-NA error sentinel
  # counts as 0. Written so the `if` condition is always length one.
  count_found <- function(x) {
    is_sentinel <- length(x) == 1L && is.atomic(x) && is.na(x)
    if (is_sentinel) 0L else length(x)
  }

  # Print one item per line.
  dump <- function(x) {
    message(paste(as.character(x), collapse = "\n"))
  }

  url <- list[no]
  message(sprintf("================No. %d web ================", no), "\n", url)

  html <- tryCatch(
    read_html(url, encoding = "UTF-8"),
    error = function(e) {
      log_error(e)
      NULL
    }
  )
  if (is.null(html)) {
    return(list(0L, NA_character_, NA))
  }

  message("Catch img from '#js_content' ")
  html1 <- tryCatch(
    html %>%
      html_nodes("#js_content") %>%
      html_nodes("img") %>%
      html_attr("data-src"),
    error = function(e) {
      log_error(e)
      NA_character_
    }
  )
  if (verbose) {
    dump(html1)
  }

  message("Catch img from '#media' ")
  html2 <- tryCatch(
    html_nodes(html, "#media"),
    error = function(e) {
      # Log first, then return the sentinel (the log used to be unreachable).
      log_error(e)
      NA
    }
  )
  if (verbose) {
    dump(html2)
  }

  ln1 <- count_found(html1)
  message(sprintf("Catch img from '#js_content', length = %d ", ln1))
  ln2 <- count_found(html2)
  message(sprintf("Catch img from '#media', length = %d ", ln2))

  list(ln1 + ln2, html1, html2)
}

detectCores()
cl <- makeCluster(3)
clusterEvalQ(cl, library(xml2))
clusterEvalQ(cl, library(magrittr))
clusterEvalQ(cl, library(rvest))
clusterEvalQ(cl, library(tmcn))
clusterExport(cl, "list")
clusterExport(cl, "count")
clusterExport(cl, "function_count")

# The scrape itself runs sequentially through lapply(); the workers above are
# not used by it.
# NOTE(review): switching to parLapply() needs care. message() output from
# workers is not shown in the main session, and the returned xml nodesets are
# presumably external pointers that do not survive transfer between
# processes. Confirm before parallelising.
count <- lapply(seq_along(list), function_count)

# Release the worker processes so they are not leaked.
stopCluster(cl)

# html_nodes(html,"#js_view_source") %>% html_attr("href")
# html_nodes(html,"#js_view_source") %>% message()
# write(list,file= "list")

# Total count per page, and the pages whose total is not exactly 1.
countno <- vapply(count, function(entry) entry[[1]], numeric(1))
uncheck <- which(countno != 1)

#----------------------------
# Resume pass: re-scrape from the last index present in `count` to the end of
# `list`, reusing function_count() without the verbose dumps.
seed <- length(count)
resume_idx <- seq_along(list)
resume_idx <- resume_idx[resume_idx >= seed]
for (no in resume_idx) {
  # `[[<-` stores the whole 3-element result in slot `no`.
  count[[no]] <- function_count(no, verbose = FALSE)
}

## TESTWEB
urlll <- "https://mp.weixin.qq.com/s?__biz=MjM5MjAxNDM4MA==&mid=2666200188&idx=1&sn=b5a4d2371d4335ec9ee3c3dd84041613&chksm=bdb2953f8ac51c29debf6f175b1023bfae52e060084b01508a534d2c16c777f3a433ad81b5e3&scene=38#wechat_redirect"
```