# 0512_coding
###### tags: `資料科學自學園` `R Language`
[datalink](https://github.com/TreeZi/MIE03HKRUD03/blob/master/GameId)
```{r}
#rm(list=ls())
# Index of the first entry of GameId to process.
k <- 1
# Load the input file and keep its first column as a character vector.
# The entries are later passed to GET() as URLs.
# NOTE: the path is machine-specific and has to be adjusted.
GameId <- as.character(
  read.table(
    file = "D:/CrawlingData_Jan/GameId",
    quote = "\"",
    comment.char = ""
  )[, 1]
)
# Environment setup ----
# Install only the packages that are not available yet, then attach them.
# Reinstalling unconditionally on every run is slow, needs network access and
# fails in non-interactive sessions that have no CRAN mirror configured.
required_packages <- c("xml2", "rvest", "magrittr", "httr")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(xml2)
library(rvest)
library(magrittr)
library(httr)
# Proxy pool ----
# proxy_list         : every candidate scraped from free-proxy-list.net
# checked_proxy_list : the candidates that actually relayed a test request
# Both hold character columns "address" and "port"; the port is converted
# with as.numeric() at the point where it is handed to use_proxy().
proxy_list <- data.frame(
  address = character(0),
  port = character(0),
  stringsAsFactors = FALSE
)
checked_proxy_list <- proxy_list

## Scrape the first 20 table rows and validate each proxy
ipdata <- read_html("https://free-proxy-list.net/")
for (i in seq_len(20)) {
  current_proxy <- html_text(html_nodes(ipdata, xpath = sprintf('//*[@id="proxylisttable"]/tbody/tr[%s]/td[1]', i)))
  current_port <- html_text(html_nodes(ipdata, xpath = sprintf('//*[@id="proxylisttable"]/tbody/tr[%s]/td[2]', i)))

  # The table can have fewer rows (or a changed layout); an empty match
  # would make the row assignment below fail, so skip it instead.
  if (length(current_proxy) != 1 || length(current_port) != 1) {
    next
  }
  proxy_list[nrow(proxy_list) + 1, ] <- c(current_proxy, current_port)

  # Ask httpbin for the caller's IP through the proxy and look for the proxy
  # address in the response *body*. Matching against the response object
  # itself (or against an error message, which may mention the proxy address)
  # gives wrong answers and a length > 1 condition.
  reply_text <- tryCatch(
    content(
      GET(
        url = "https://httpbin.org/ip",
        use_proxy(current_proxy, as.numeric(current_port)),
        timeout(200)
      ),
      as = "text",
      encoding = "UTF-8"
    ),
    error = function(e) ""
  )
  # isTRUE() guards against an empty body (zero-length grepl result).
  ip_ok <- isTRUE(grepl(current_proxy, reply_text, fixed = TRUE))
  if (ip_ok) {
    checked_proxy_list[nrow(checked_proxy_list) + 1, ] <-
      c(current_proxy, current_port)
  }
}
## Rows are appended only for verified proxies, so there are no empty or
## placeholder rows left to remove afterwards.
# Result container: one slot per entry of GameId, NA until that entry has
# been scraped. rep() also yields an empty list for empty input, whereas
# 1:length(GameId) would iterate over c(1, 0) and create two bogus slots.
ct <- rep(list(NA), length(GameId))
# Scrape the pages ----
# Fetch GameId[k] one entry at a time through a randomly chosen verified
# proxy, parse the review fields and store them in ct[k].
#
# Loop state:
#   k            index of the entry currently being scraped
#   q            consecutive failed attempts for the current k
#   w            next free slot in error_gameip
#   X            parsed page of the last attempt; NULL after a failed attempt
#   proxy_number row of checked_proxy_list in use; NA forces a new draw

# Scrape free-proxy-list.net and return the proxies that relay a request.
#
# current_pool: data frame (address, port) returned unchanged when the proxy
#               site cannot be read or no scraped proxy passes the check.
# n_rows:       number of table rows to try.
# Returns a data frame with character columns "address" and "port".
refresh_proxy_pool <- function(current_pool, n_rows = 20) {
  page <- tryCatch(
    read_html("https://free-proxy-list.net/"),
    error = function(e) NULL
  )
  if (is.null(page)) {
    return(current_pool)
  }
  fresh <- data.frame(
    address = character(0),
    port = character(0),
    stringsAsFactors = FALSE
  )
  for (i in seq_len(n_rows)) {
    address <- html_text(html_nodes(page, xpath = sprintf('//*[@id="proxylisttable"]/tbody/tr[%s]/td[1]', i)))
    port <- html_text(html_nodes(page, xpath = sprintf('//*[@id="proxylisttable"]/tbody/tr[%s]/td[2]', i)))
    if (length(address) != 1 || length(port) != 1) {
      next
    }
    # A proxy counts as working when httpbin's response body mentions its
    # address; any request error counts as "not working".
    reply_text <- tryCatch(
      content(
        GET(
          url = "https://httpbin.org/ip",
          use_proxy(address, as.numeric(port)),
          timeout(200)
        ),
        as = "text",
        encoding = "UTF-8"
      ),
      error = function(e) ""
    )
    if (isTRUE(grepl(address, reply_text, fixed = TRUE))) {
      fresh[nrow(fresh) + 1, ] <- c(address, port)
    }
  }
  if (nrow(fresh) == 0) current_pool else fresh
}

# Change the range here: first index to scrape.
k <- 1
q <- 0
w <- 1
error_gameip <- NULL
# Initialised up front so the loop also works when k does not start at 1.
X <- NULL
proxy_number <- NA
# Change the range here: last index to scrape (never past the end of GameId).
last_k <- min(50000, length(GameId))

repeat {
  if (k > last_k) break
  sprintf("number: %s", k) %>% print()
  k_at_start <- k

  # A failed previous attempt counts against this entry and forces a new proxy.
  if (length(X) == 0) {
    q <- q + 1
    proxy_number <- NA
  }
  if (is.na(proxy_number) || proxy_number > nrow(checked_proxy_list)) {
    if (nrow(checked_proxy_list) == 0) {
      checked_proxy_list <- refresh_proxy_pool(checked_proxy_list)
    }
    if (nrow(checked_proxy_list) == 0) {
      # Without any proxy every request would fail and every remaining entry
      # would be logged as an error, so stop and keep what has been collected.
      message("No working proxy available; stopping at k = ", k)
      break
    }
    proxy_number <- sample.int(nrow(checked_proxy_list), 1)
    print("Change proxy_number")
  }

  print(paste0("Start at ", Sys.time()))
  X <- tryCatch(
    GET(
      GameId[k],
      use_proxy(
        checked_proxy_list[proxy_number, 1],
        as.numeric(checked_proxy_list[proxy_number, 2])
      ),
      timeout(30)
    ),
    error = function(e) NULL
  )
  print(checked_proxy_list[proxy_number, 1])
  # read_html() fails on NULL, which keeps X NULL for a failed request.
  X <- tryCatch(read_html(X), error = function(e) NULL)
  print(X)

  if (length(X) > 0) {
    gn <- html_nodes(X, '.page-title a') %>% html_text()
    pf <- html_nodes(X, '.core-platform b') %>% html_text()
    cs <- html_nodes(X, '.criticsite') %>% html_text()
    cd <- html_nodes(X, '.cdate') %>% html_text()
    cr <- html_nodes(X, '.criticrating') %>% html_text()
    metaC <- tryCatch(
      html_nodes(X, '.split') %>% html_text(),
      error = function(e) NULL
    )
    if (length(metaC) != 0) {
      # Append the MetaCritic entry: the score is the third piece of the
      # second ".split" node, the link goes into the date column.
      metaC_link <- html_nodes(X, '.review_link a') %>% html_attr('href')
      metaC <- strsplit(metaC[2], split = '\r\n\t\t\t\t\t', fixed = TRUE) %>%
        unlist()
      metaC <- metaC[3]
      cs <- c(cs, "MetaCritic")
      cd <- c(cd, metaC_link)
      cr <- c(cr, metaC)
    } else {
      cs <- NA
      cd <- NA
      cr <- NA
    }

    ct_current <- cbind(k, gn, pf, cs, cd, cr, GameId[k])
    # cbind() drops the zero-length gn/pf columns, so exactly 5 cells means
    # the page contained none of the expected fields.
    if (length(ct_current) != 5) {
      ct[k] <- list(ct_current)
      sprintf('No. %s is done', k) %>% print()
      print(paste0("End at ", Sys.time()))
      k <- k + 1
      q <- 0
    } else {
      Sys.sleep(60)
      sprintf('No. %s is fail', k) %>% print()
      print(paste0("Error at ", date()))
      # Count this as a failed attempt and switch proxy; otherwise the same
      # proxy would be retried forever on a page that never parses.
      X <- NULL
    }
  }

  # After 100 consecutive failures: log the entry, skip it and rebuild the
  # proxy pool. ">=" plus the unconditional reset make sure a stuck entry
  # can always be skipped again later.
  pool_refreshed <- FALSE
  if (q >= 100) {
    error_gameip[w] <- list(c(k, GameId[k]))
    k <- k + 1
    w <- w + 1
    q <- 0
    checked_proxy_list <- refresh_proxy_pool(checked_proxy_list)
    proxy_number <- NA
    pool_refreshed <- TRUE
  }

  # Periodic work runs only in the iteration in which k moved on, so a
  # failing entry at a multiple of 1000/5000 does not repeat it on every retry.
  if (k != k_at_start) {
    # Rebuild the proxy pool every 1000 entries.
    if (k %% 1000 == 0 && !pool_refreshed) {
      checked_proxy_list <- refresh_proxy_pool(checked_proxy_list)
      proxy_number <- NA
      print('重新抓取Proxy Pool')
    }
    # Checkpoint the workspace every 5000 entries.
    if (k %% 5000 == 0) {
      p <- sprintf("0505_thijs_1-to-%s.RData", k)
      # Change the range here: checkpoint directory is machine-specific.
      saved <- tryCatch(
        {
          save.image(sprintf("C:/Users/CCCM/Desktop/ttt-0511/%s", p))
          TRUE
        },
        error = function(e) {
          # A bad checkpoint path must not abort a long scrape.
          warning("Checkpoint failed: ", conditionMessage(e), call. = FALSE)
          FALSE
        }
      )
      if (saved) {
        print('save data')
      }
    }
  }
}
# Miscellaneous ----
# One-row data frame whose six columns are all NA.
data.critic <- do.call(
  data.frame,
  setNames(
    rep(list(NA), 6L),
    c("GameID", "Platform", "CriticSite", "CriticDate", "CriticRank", "GameLink")
  )
)
```