# Python coding
###### tags: `資料科學自學園` `Python`
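
The script below scrapes basic profile data (date of birth, nationality, position, current club, contract dates, and so on) for a block of Transfermarkt players, starting from the player id in `seed`. To reduce the chance of being blocked, each request goes out with a randomly chosen browser user agent and through a proxy drawn from a pool scraped off free-proxy-list.net, and everything collected so far is periodically checkpointed to a CSV file.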
```python
import csv
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from lxml.html import fromstring

seed = 330001  # first Transfermarkt player id in this scrape batch

user_agent_list = [
#Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# Internet Explorer
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

# GET PROXY LIST
def get_proxies():
    """Refill the global proxy pool from free-proxy-list.net (HTTPS-capable entries only)."""
    global proxies
    print("Getting proxies")
    proxies = []
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    for i in parser.xpath('//tbody/tr')[:300]:
        # keep only rows whose "Https" column (td[7]) says "yes"
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            # grab the IP and its corresponding port
            proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
            proxies.append(proxy)
    print("Number of proxies: {}".format(len(proxies)))

def make_request():
    """Fetch one player profile through a random proxy / user agent into the global soup."""
    global player_id
    global user_agent_list
    global proxies
    global soup
    global request_success
    print("Scraping player: {}".format(player_id))
    try:
        current_user_agent = random.choice(user_agent_list)
        headers = {'User-Agent': current_user_agent,
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                   'Accept-Encoding': 'none',
                   'Accept-Language': 'en-US,en;q=0.8'}
        current_proxy = random.choice(proxies)
        print("Using proxy: {}".format(current_proxy))
        https_proxy = "https://{}".format(current_proxy)
        proxyDict = {"https": https_proxy}
        site = 'https://www.transfermarkt.com/transfermarkt/profil/spieler/{}'.format(player_id)
        result = requests.get(site, proxies=proxyDict, headers=headers, timeout=5)
        sauce = result.content
        soup = BeautifulSoup(sauce, 'html.parser')
        if len(soup) > 0:
            request_success = 1
    except Exception as e:
        print("--------error-----------")
        print(e)
        request_success = 0
        # drop the failing proxy so it is not picked again
        proxies.remove(current_proxy)
        print("Proxy error - removing proxy")
        print("Proxies left: {}".format(len(proxies)))
        print("------------------------")

def scrape_info():
    """Parse the player's info table out of the global soup and append one row to big_info_list."""
    global player_id
    global player_name
    global soup
    global info_success
    global big_info_list
    player_info_dob = ""
    player_info_pob = ""
    player_info_age = ""
    player_info_height = ""
    player_info_nat = ""
    player_info_nat1 = ""
    player_info_nat2 = ""
    player_info_pos = ""
    player_info_foot = ""
    player_info_cclub = ""
    player_info_since = ""
    player_info_until = ""
    data = []
    data1 = []
    info_list = []
    try:
        # the profile facts live in a <table class="auflistung">: <th> labels, <td> values
        table = soup.find('table', attrs={'class': 'auflistung'})
        rows = table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele])  # get rid of empty values
        for header in rows:
            heads = header.find_all('th')
            heads = [ele.text.strip() for ele in heads]
            data1.append([ele for ele in heads if ele])  # get rid of empty labels
    except Exception as e:
        print("--------error-----------")
        print(e)
        info_success = 0
        print("No data in soup")
        print("------------------------")
    try:
        for i in range(len(data1)):
            if data1[i][0] == "Date of birth:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_dob = data[i][0]
            if data1[i][0] == "Place of birth:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_pob = data[i][0]
            if data1[i][0] == "Age:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_age = data[i][0]
            if data1[i][0] == "Height:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_height = data[i][0]
            if data1[i][0] == "Nationality:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_nat = data[i][0]
            if data1[i][0] == "Position:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_pos = data[i][0]
            if data1[i][0] == "Foot:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_foot = data[i][0]
            if data1[i][0] == "Current club:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_cclub = data[i][0]
            if data1[i][0] == "In the team since:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_since = data[i][0]
            if data1[i][0] == "Contract until:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_until = data[i][0]
        # dual nationals are separated by two non-breaking spaces on the page
        split_nat = player_info_nat.split('\xa0\xa0')
        if len(split_nat) > 2:
            player_info_nat1 = split_nat[1]
            player_info_nat2 = split_nat[2]
        elif len(split_nat) > 0:
            player_info_nat1 = player_info_nat.replace('\xa0\xa0', '')
            player_info_nat2 = ""
        else:
            player_info_nat1 = ""
            player_info_nat2 = ""
        # fall back to "NA" for anything the page did not provide
        if player_info_dob == "":
            player_info_dob = "NA"
        if player_info_pob == "":
            player_info_pob = "NA"
        if player_info_age == "":
            player_info_age = "NA"
        if player_info_height == "":
            player_info_height = "NA"
        if player_info_nat1 == "":
            player_info_nat1 = "NA"
        if player_info_nat2 == "":
            player_info_nat2 = "NA"
        if player_info_pos == "":
            player_info_pos = "NA"
        if player_info_foot == "":
            player_info_foot = "NA"
        if player_info_cclub == "":
            player_info_cclub = "NA"
        if player_info_since == "":
            player_info_since = "NA"
        if player_info_until == "":
            player_info_until = "NA"
    except Exception as e:
        print("--------error-----------")
        print(e)
        print("no data in data1")
        info_success = 0
    try:
        # the player's name is the URL slug in links like .../player-name/profil/...
        patFinder = re.compile(r'https?://www\.transfermarkt\.com/(.*?)/profil')
        findPat = re.findall(patFinder, str(soup))
        if len(findPat) > 0:
            player_name = findPat[0]
    except Exception as e:
        print("--------error-----------")
        print(e)
        print("No player name")
    try:
        print(player_name)
        info_list.append(player_id)
        # keep ASCII only, mirroring the original decode('ascii', 'ignore') intent
        info_list.append(player_name.encode('ascii', 'ignore').decode('ascii'))
        for field in (player_info_dob, player_info_pob, player_info_age, player_info_height,
                      player_info_nat1, player_info_nat2, player_info_pos, player_info_foot,
                      player_info_cclub, player_info_since, player_info_until):
            info_list.append(field.encode('ascii', 'ignore').decode('ascii'))
        if len(info_list) == 13:
            big_info_list.append(info_list)
            info_success = 1
    except Exception as e:
        print("--------error-----------")
        print(e)
        print("no data in info_list")
        info_success = 0

def write_csv():
    """Rewrite everything collected so far to a CSV named after the seed range."""
    global big_info_list
    global seed
    with open('transfermarket_player_data_{}-{}.csv'.format(seed, seed + 9999), 'w', newline='') as myFile:
        writer = csv.writer(myFile)
        writer.writerow(["player_id",
                         "player_name",
                         "player_dob",
                         "player_pob",
                         "player_age",
                         "player_height",
                         "player_nat1",
                         "player_nat2",
                         "player_pos",
                         "player_foot",
                         "player_cclub",
                         "player_since",
                         "player_until"])
        for record in big_info_list:
            writer.writerow(record)

# ---- main scrape loop ----
count = 1
proxies = []
big_info_list = []
request_success = 0
info_success = 0
player_name = ""
start = time.time()
for i in range(seed, seed + 100):
    player_id = i
    player_name = ""
    count += 1
    request_success = 0
    info_success = 0
    # top up the proxy pool whenever it runs low
    if len(proxies) < 5:
        get_proxies()
    while request_success == 0:
        make_request()
    while info_success == 0:
        scrape_info()
        if info_success == 0:
            # parse failed: fetch the page again with a fresh proxy / user agent before retrying
            request_success = 0
            while request_success == 0:
                make_request()
    # checkpoint the CSV every 10 players
    if count > 10:
        write_csv()
        count = 1
write_csv()  # final flush for any players scraped since the last checkpoint
end = time.time()
print(end - start)
```
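
Once a run finishes, the resulting CSV loads straight into pandas for analysis. A minimal sketch, assuming pandas is installed and the default `seed = 330001` was used (so `write_csv()` produced `transfermarket_player_data_330001-340000.csv`):

```python
import pandas as pd

# filename follows the pattern used in write_csv() with seed = 330001
df = pd.read_csv('transfermarket_player_data_330001-340000.csv')

print(df.shape)                                                  # rows scraped x 13 columns
print(df[['player_name', 'player_pos', 'player_nat1']].head())
print(df['player_pos'].value_counts())                           # rough position breakdown
```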