# 0512_coding

###### tags: `資料科學自學園` `R Language`

[datalink](https://github.com/TreeZi/MIE03HKRUD03/blob/master/GameId)

```{r}
# Crawl the critic reviews of every game page listed in `GameId`, sending the
# requests through a rotating pool of free HTTP proxies.
#
# Globals produced by this chunk:
#   GameId             character vector of page URLs (one per game)
#   proxy_list         the most recently scraped proxy candidates
#   checked_proxy_list candidates that passed the connectivity check
#   ct                 list with one scraped matrix (or NA) per GameId entry
#   error_gameip       list of c(index, url) for pages that were given up on
#   data.critic        empty template data frame for later post-processing

# rm(list = ls())

# ---- Configuration ----------------------------------------------------------
k <- 1                        # first index to crawl (change to resume a range)
max_index <- 50000            # last index to crawl (change per range)
max_attempts <- 100           # consecutive failures before a page is skipped
pool_refresh_every <- 1000    # rebuild the proxy pool every N finished pages
snapshot_every <- 5000        # save the workspace every N finished pages

# ---- Read the data ----------------------------------------------------------
# The path must be adjusted to the local machine.
GameId <- read.table("D:/CrawlingData_Jan/GameId", quote = "\"",
                     comment.char = "")
GameId <- as.character(GameId[, 1])

# ---- Environment ------------------------------------------------------------
# Install only what is missing instead of reinstalling on every run.
required_pkgs <- c("xml2", "rvest", "magrittr", "httr")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(xml2)
library(rvest)
library(magrittr)
library(httr)

# ---- Proxy pool helpers -----------------------------------------------------

# Scrape the first `n` address/port pairs from free-proxy-list.net.
# Returns a data frame with character columns `address` and `port`.
# NOTE(review): relies on the table id "proxylisttable" still being present on
# the site; if the markup changed this returns zero rows.
fetch_proxy_candidates <- function(n = 20L) {
  listing <- read_html("https://free-proxy-list.net/")
  column_text <- function(col) {
    html_text(html_nodes(
      listing,
      xpath = sprintf('//*[@id="proxylisttable"]/tbody/tr/td[%s]', col)
    ))
  }
  address <- column_text(1)
  port <- column_text(2)
  keep <- seq_len(min(n, length(address), length(port)))
  data.frame(address = address[keep], port = port[keep],
             stringsAsFactors = FALSE)
}

# Check one proxy by asking httpbin.org which IP it sees.
# strict = TRUE  : the reply body must contain the proxy address itself.
# strict = FALSE : any well-formed reply (containing "origin") is accepted.
# Returns a single TRUE/FALSE; every failure (bad port, timeout, refused
# connection) yields FALSE.
proxy_works <- function(address, port, strict = TRUE, timeout_sec = 200) {
  port_number <- suppressWarnings(as.numeric(port))
  if (is.na(port_number)) {
    return(FALSE)
  }
  body <- tryCatch(
    {
      response <- GET("https://httpbin.org/ip",
                      use_proxy(address, port_number),
                      timeout(timeout_sec))
      # Inspect the body text, not the response object: matching against the
      # object gives a multi-element logical and can hit the address in the
      # request metadata or in an error message.
      content(response, as = "text", encoding = "UTF-8")
    },
    error = function(e) NA_character_
  )
  needle <- if (strict) address else "origin"
  isTRUE(grepl(needle, body, fixed = TRUE))
}

# Keep only the candidates that pass `proxy_works()`.
filter_working_proxies <- function(candidates, strict = TRUE) {
  if (nrow(candidates) == 0) {
    return(candidates)
  }
  is_alive <- vapply(
    seq_len(nrow(candidates)),
    function(i) {
      proxy_works(candidates$address[i], candidates$port[i], strict = strict)
    },
    logical(1)
  )
  alive <- candidates[is_alive, , drop = FALSE]
  rownames(alive) <- NULL
  alive
}

# Download `url` through the given proxy and parse it.
# Returns the parsed document, or NULL when the request or the parse fails.
fetch_game_page <- function(url, address, port) {
  response <- tryCatch(
    GET(url, use_proxy(address, as.numeric(port)), timeout(30)),
    error = function(e) NULL
  )
  tryCatch(read_html(response), error = function(e) NULL)
}

# Extract the review fields from a parsed game page and bind them into a
# character matrix (columns k, gn, pf, cs, cd, cr and the unnamed page URL).
# cbind() ignores zero-length vectors, so a page without title and platform
# and without a ".split" node collapses to a 1 x 5 matrix; the caller uses
# that length of 5 as its "nothing scraped" signal.
parse_game_page <- function(page, k, game_url) {
  gn <- html_nodes(page, ".page-title a") %>% html_text()
  pf <- html_nodes(page, ".core-platform b") %>% html_text()
  cs <- html_nodes(page, ".criticsite") %>% html_text()
  cd <- html_nodes(page, ".cdate") %>% html_text()
  cr <- html_nodes(page, ".criticrating") %>% html_text()

  metaC <- tryCatch(
    html_nodes(page, ".split") %>% html_text(),
    error = function(e) NULL
  )
  if (length(metaC) != 0) {
    metaC_link <- html_nodes(page, ".review_link a") %>% html_attr("href")
    metaC <- strsplit(metaC[2], split = "\r\n\t\t\t\t\t", fixed = TRUE) %>%
      unlist()
    metaC <- metaC[3]
    cs <- c(cs, "MetaCritic")
    cd <- c(cd, metaC_link)
    cr <- c(cr, metaC)
  } else {
    cs <- NA
    cd <- NA
    cr <- NA
  }
  # as.character() keeps the last column unnamed, as in the original layout.
  cbind(k, gn, pf, cs, cd, cr, as.character(game_url))
}

# ---- Build the initial proxy pool -------------------------------------------
proxy_list <- fetch_proxy_candidates()
checked_proxy_list <- filter_working_proxies(proxy_list)

# ---- Crawl ------------------------------------------------------------------
ct <- rep(list(NA), length(GameId))
last_index <- min(max_index, length(GameId))  # never index past the data
q <- 0                   # consecutive failed attempts for the current page
w <- 1                   # next free slot in error_gameip
error_gameip <- NULL
X <- NULL
need_new_proxy <- TRUE   # pick a proxy before the first request

repeat {
  if (k > last_index) break
  sprintf("number: %s", k) %>% print()

  if (need_new_proxy) {
    if (nrow(checked_proxy_list) == 0) {
      # The pool ran dry: try once to rebuild it with the lenient check.
      rebuilt <- tryCatch(
        filter_working_proxies(fetch_proxy_candidates(), strict = FALSE),
        error = function(e) checked_proxy_list
      )
      checked_proxy_list <- rebuilt
    }
    if (nrow(checked_proxy_list) == 0) {
      stop("No working proxy is available; stopped at index ", k, ".",
           call. = FALSE)
    }
    proxy_number <- sample.int(nrow(checked_proxy_list), 1)
    print("Change proxy_number")
    need_new_proxy <- FALSE
  }

  print(paste0("Start at ", Sys.time()))
  X <- fetch_game_page(GameId[k],
                       checked_proxy_list[proxy_number, 1],
                       checked_proxy_list[proxy_number, 2])
  print(checked_proxy_list[proxy_number, 1])
  print(X)

  advanced <- FALSE      # TRUE once k has moved on during this iteration
  skipped <- FALSE

  if (length(X) > 0) {
    ct_current <- parse_game_page(X, k, GameId[k])
    if (length(ct_current) != 5) {
      ct[k] <- list(ct_current)
      sprintf('No. %s is done', k) %>% print()
      print(paste0("End at ", Sys.time()))
      k <- k + 1
      q <- 0
      advanced <- TRUE
    } else {
      # The page loaded but held no data: back off, then retry with another
      # proxy. Counting it keeps a permanently empty page from looping forever.
      Sys.sleep(60)
      sprintf('No. %s is fail', k) %>% print()
      print(paste0("Error at ", date()))
      q <- q + 1
      need_new_proxy <- TRUE
    }
  } else {
    q <- q + 1
    need_new_proxy <- TRUE
  }

  # Give up on a page after too many consecutive failures and record it.
  if (q >= max_attempts) {
    error_gameip[w] <- list(c(k, GameId[k]))
    k <- k + 1
    w <- w + 1
    q <- 0
    advanced <- TRUE
    skipped <- TRUE
  }

  # Rebuild the proxy pool after a skip (lenient check, the pool is probably
  # dead) or after every `pool_refresh_every` finished pages (strict check).
  # Only done when k has just advanced, so a page stuck on a multiple of the
  # interval does not trigger a rebuild on every retry.
  if (skipped || (advanced && k %% pool_refresh_every == 0)) {
    fresh_candidates <- tryCatch(
      fetch_proxy_candidates(),
      error = function(e) {
        warning("Could not download the proxy list: ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
    if (!is.null(fresh_candidates)) {
      proxy_list <- fresh_candidates
      fresh_pool <- filter_working_proxies(proxy_list, strict = !skipped)
      # Keep the old pool when nothing in the new list passes the check.
      if (nrow(fresh_pool) > 0) {
        checked_proxy_list <- fresh_pool
        need_new_proxy <- TRUE   # old row indices no longer apply
      }
    }
    if (!skipped) {
      print('重新抓取Proxy Pool')
    }
  }

  # Periodic workspace snapshot (adjust file name and folder per range).
  if (advanced && k %% snapshot_every == 0) {
    p <- sprintf("0505_thijs_1-to-%s.RData", k)
    saved <- tryCatch(
      {
        save.image(sprintf("C:/Users/CCCM/Desktop/ttt-0511/%s", p))
        TRUE
      },
      error = function(e) {
        warning("Snapshot failed: ", conditionMessage(e), call. = FALSE)
        FALSE
      }
    )
    if (saved) {
      print('save data')
    }
  }
}

# ---- Other ------------------------------------------------------------------
data.critic <- data.frame(GameID = NA,
                          Platform = NA,
                          CriticSite = NA,
                          CriticDate = NA,
                          CriticRank = NA,
                          GameLink = NA)
```