# Python coding

###### tags: `資料科學自學園` `Python`

The script below scrapes basic profile data for a range of Transfermarkt player IDs. It rotates random user agents and free proxies from free-proxy-list.net, parses each player's profile table with BeautifulSoup, and flushes the accumulated records to a CSV file every ten players.

```python
import requests
from lxml import html
from lxml.html import fromstring
import bs4 as bs
from bs4 import BeautifulSoup
import re
import random
import csv
import time

# First player id of this batch; also used in the output file name
seed = 330001

user_agent_list = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer / Trident
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]


## GET PROXY LIST
def get_proxies():
    global proxies
    print("Getting proxies")
    proxies = []
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    for i in parser.xpath('//tbody/tr')[:300]:
        # Keep only proxies whose "Https" column says "yes"
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            # Grabbing IP and corresponding PORT
            proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
            proxies.append(proxy)
    print("Number of proxies: {}".format(len(proxies)))


def make_request():
    global player_id
    global user_agent_list
    global proxies
    global soup
    global request_success
    print("Scraping player: {}".format(player_id))
    try:
        current_user_agent = random.choice(user_agent_list)
        headers = {'User-Agent': current_user_agent,
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                   'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                   'Accept-Encoding': 'none',
                   'Accept-Language': 'en-US,en;q=0.8'}
        current_proxy = random.choice(proxies)
        print("Using proxy: {}".format(current_proxy))
        https_proxy = "https://{}".format(current_proxy)
        proxyDict = {"https": https_proxy}
        site = 'https://www.transfermarkt.com/transfermarkt/profil/spieler/{}'.format(player_id)
        result = requests.get(site, proxies=proxyDict, headers=headers, timeout=5)
        sauce = result.content
        soup = BeautifulSoup(sauce, 'html.parser')
        if len(soup) > 0:
            request_success = 1
    except Exception as e:
        print("--------error-----------")
        print(e)
        request_success = 0
        # Drop the failing proxy so it is not picked again
        proxies.remove(current_proxy)
        print("Proxy error - removing proxy")
        print("Proxies left: {}".format(len(proxies)))
        print("------------------------")


def scrape_info():
    global player_id
    global player_name
    global soup
    global info_success
    global big_info_list

    player_info_name = ""
    player_info_dob = ""
    player_info_pob = ""
    player_info_age = ""
    player_info_height = ""
    player_info_nat = ""
    player_info_nat1 = ""
    player_info_nat2 = ""
    player_info_pos = ""
    player_info_foot = ""
    player_info_cclub = ""
    player_info_since = ""
    player_info_until = ""
    data = []
    data1 = []
    info_list = []
    table = []
    rows = []
    headers = []

    try:
        table = soup.find('table', attrs={'class': 'auflistung'})
        rows = table.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            data.append([ele for ele in cols if ele])  # Get rid of empty values
        headers = table.find_all('tr')
        for header in headers:
            heads = header.find_all('th')
            heads = [ele.text.strip() for ele in heads]
            data1.append([ele for ele in heads if ele])  # Get rid of empty values
    except Exception as e:
        print("--------error-----------")
        print(e)
        info_success = 0
        print("No data in soup")
        print("------------------------")

    try:
        # Match each row label (data1) against its value (data), skipping placeholders
        for i in range(len(data1)):
            if data1[i][0] == "Date of birth:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_dob = data[i][0]
            if data1[i][0] == "Place of birth:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_pob = data[i][0]
            if data1[i][0] == "Age:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_age = data[i][0]
            if data1[i][0] == "Height:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_height = data[i][0]
            if data1[i][0] == "Nationality:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_nat = data[i][0]
            if data1[i][0] == "Position:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_pos = data[i][0]
            if data1[i][0] == "Foot:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_foot = data[i][0]
            if data1[i][0] == "Current club:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_cclub = data[i][0]
            if data1[i][0] == "In the team since:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_since = data[i][0]
            if data1[i][0] == "Contract until:" and data[i][0] != "N/A" and data[i][0] != "-":
                player_info_until = data[i][0]

        # Dual nationalities are separated by a pair of non-breaking spaces
        split_nat = player_info_nat.split('\xa0\xa0')
        if len(split_nat) > 2:
            player_info_nat1 = split_nat[1]
            player_info_nat2 = split_nat[2]
        elif len(split_nat) > 0:
            player_info_nat1 = player_info_nat.replace('\xa0\xa0', '')
            player_info_nat2 = ""
        else:
            player_info_nat1 = ""
            player_info_nat2 = ""

        # Fill any field that is still empty with "NA"
        if player_info_dob == "":
            player_info_dob = "NA"
        if player_info_pob == "":
            player_info_pob = "NA"
        if player_info_age == "":
            player_info_age = "NA"
        if player_info_height == "":
            player_info_height = "NA"
        if player_info_nat1 == "":
            player_info_nat1 = "NA"
        if player_info_nat2 == "":
            player_info_nat2 = "NA"
        if player_info_pos == "":
            player_info_pos = "NA"
        if player_info_foot == "":
            player_info_foot = "NA"
        if player_info_cclub == "":
            player_info_cclub = "NA"
        if player_info_since == "":
            player_info_since = "NA"
        if player_info_until == "":
            player_info_until = "NA"
    except Exception as e:
        print("--------error-----------")
        print(e)
        print("no data in data1")
        info_success = 0

    try:
        # Pull the player name out of the profile URL embedded in the page;
        # accept either http or https links
        patFinder = re.compile(r'https?://www\.transfermarkt\.com/(.*?)/profil')
        findPat = re.findall(patFinder, str(soup))
        if len(findPat) > 0:
            player_name = findPat[0]
    except Exception as e:
        print("--------error-----------")
        print(e)
        print("No player name")

    try:
        print(player_name)
        info_list.append(player_id)
        info_list.append(player_name)
        info_list.append(player_info_dob)
        info_list.append(player_info_pob)
        info_list.append(player_info_age)
        info_list.append(player_info_height)
        info_list.append(player_info_nat1)
        info_list.append(player_info_nat2)
        info_list.append(player_info_pos)
        info_list.append(player_info_foot)
        info_list.append(player_info_cclub)
        info_list.append(player_info_since)
        info_list.append(player_info_until)
        if len(info_list) == 13:
            big_info_list.append(info_list)
            info_success = 1
    except Exception as e:
        print("--------error-----------")
        print(e)
        print("no data in info_list")
        info_success = 0


def write_csv():
    global big_info_list
    global seed
    # Rewrites the whole accumulated list every time it is called
    with open('transfermarket_player_data_{}-{}.csv'.format(seed, seed + 9999), 'w', newline='') as myFile:
        writer = csv.writer(myFile)
        writer.writerow(["player_id", "player_name", "player_dob", "player_pob",
                         "player_age", "player_height", "player_nat1", "player_nat2",
                         "player_pos", "player_foot", "player_cclub", "player_since",
                         "player_until"])
        for record in big_info_list:
            writer.writerow(record)


count = 1
proxies = []
big_info_list = []
request_success = 0
info_success = 0
player_name = ""

start = time.time()

for i in range(seed, seed + 100):
    player_id = i
    player_name = ""
    count += 1
    request_success = 0
    info_success = 0
    if len(proxies) < 5:
        get_proxies()
    # Re-fetch through a new random proxy whenever parsing fails,
    # instead of looping forever over the same soup
    while info_success == 0:
        request_success = 0
        while request_success == 0:
            make_request()
        scrape_info()
    # Flush to CSV every ten players
    if count > 10:
        write_csv()
        count = 1

write_csv()  # final flush in case the id range is not a multiple of the batch size
end = time.time()
print(end - start)
```
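
As a quick sanity check of the parsing step, the sketch below applies the same `find_all('th')` / `find_all('td')` pattern that `scrape_info()` uses, but against a hand-written table fragment, so it runs offline without proxies. The sample HTML is an assumption made up for illustration; only the `auflistung` class name and the row labels come from the script above, and Transfermarkt's real markup may differ.

```python
from bs4 import BeautifulSoup

# Hand-written fragment shaped like the profile table the scraper looks for;
# invented for illustration, the real page's markup may differ.
sample_html = """
<table class="auflistung">
  <tr><th>Date of birth:</th><td>Jan 1, 1990</td></tr>
  <tr><th>Height:</th><td>1,85 m</td></tr>
  <tr><th>Position:</th><td>Centre-Back</td></tr>
</table>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
table = soup.find('table', attrs={'class': 'auflistung'})

# Pair each row's <th> label with its <td> value, which is what
# scrape_info() does through its parallel data1 / data lists.
pairs = {}
for row in table.find_all('tr'):
    heads = [th.text.strip() for th in row.find_all('th')]
    cols = [td.text.strip() for td in row.find_all('td')]
    if heads and cols:
        pairs[heads[0]] = cols[0]

print(pairs)
# {'Date of birth:': 'Jan 1, 1990', 'Height:': '1,85 m', 'Position:': 'Centre-Back'}
```

Note also that `write_csv()` rewrites the entire `big_info_list` on every call, so an interrupted run still keeps everything collected up to the last ten-player flush.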