# Python scraper
###### tags: `notes`
https://github.com/REMitchell/python-scraping
### Python virtual environment
```
$ virtualenv scrapingEnv
$ cd scrapingEnv
$ source bin/activate
```
## WEB CRAWLER
```python
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    try:
        html = urlopen(url)
    # the server could not find the page (e.g. 404 Page Not Found)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "lxml")
        title = bsObj.body.h1
    # server not found (e.g. offline, mistyped URL) or tag missing => None object,
    # so the attribute access raises AttributeError
    except AttributeError as e:
        return None
    return title

title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
```
___
```python
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re # regular expression
html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html, "html.parser")
# findAll(tag, attr, recursive, text, limit, keywords)
nameList = bs.findAll('span', {'class': 'green'})
for name in nameList:
    print(name.get_text())  # .get_text() strips out the tags

for child in bs.find('table', {'id': 'giftList'}).children:  # vs. descendants
    print(child)  # prints tr, th, td ...
# for sibling in bs.table.tr.next_siblings:  # skips the header row itself
# .previous_siblings, .parent

images = bs.find_all('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])
# bs.find_all(lambda tag: len(tag.attrs) == 2)
```
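The `findAll` signature noted above also takes `text`/`string`, `limit`, and keyword arguments; a short illustrative sketch against the same page (the exact matches are examples, not from the notes):
```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html, 'html.parser')
# match on text content rather than on tags (string= is the modern name for text=)
princeList = bs.find_all(string='the prince')
print(len(princeList))
# keyword arguments filter on attributes directly; limit caps the number of matches
greenList = bs.find_all(class_='green', limit=3)
titleList = bs.find_all(id='title')
```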
___
```python
# Wikipedia article URLs contain no colon; file-upload and talk pages do
# Python's default recursion limit is 1000
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')
```
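Since the 1000-frame recursion limit noted above is easy to hit on a link-dense site, the same crawl can be written iteratively with an explicit to-visit list (a sketch of that variant, not book code):
```python
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
toVisit = ['']   # start at the Wikipedia front page, like getLinks('')
while toVisit:
    pageUrl = toVisit.pop()
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        href = link.attrs.get('href')
        if href and href not in pages:
            print(href)
            pages.add(href)
            toVisit.append(href)   # queue instead of recursing
```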
```python
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re

# urlparse(startingPage).scheme => https
# urlparse(startingPage).netloc => www.w3schools.com

# Retrieves a list of all internal links found on a page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    # Finds all links that begin with a "/"
    for link in bs.find_all('a', href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith('/'):
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# Retrieves a list of all external links found on a page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    # Finds all links that start with "http" that do
    # not contain the current URL
    for link in bs.find_all('a', href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme, urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)

allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')
```
```
$ scrapy startproject projectName
```
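After `startproject`, a spider module goes under the project's `spiders/` directory; a minimal illustrative sketch (class name and selectors are just examples), run with `scrapy crawl article` from inside the project:
```python
import scrapy

class ArticleSpider(scrapy.Spider):
    name = 'article'
    start_urls = ['http://en.wikipedia.org/wiki/Python_(programming_language)']

    def parse(self, response):
        # CSS selectors work directly on the response object
        title = response.css('h1::text').extract_first()
        print('Title is: {}'.format(title))
```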
___
* GET : what the browser does when you navigate to a site through the address bar
* POST : sends data to a backend program on the server
* PUT : updates an object or resource (mostly replaced by POST in practice)
* DELETE : deletes an object (all four verbs are sketched with `requests` right after this list)
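A minimal sketch of the four verbs with the `requests` library (httpbin.org is just a convenient echo service used for illustration, not from the notes):
```python
import requests

r = requests.get('http://httpbin.org/get')                      # fetch a resource
r = requests.post('http://httpbin.org/post', data={'k': 'v'})   # send form data to a backend program
r = requests.put('http://httpbin.org/put', data={'k': 'v'})     # update a resource
r = requests.delete('http://httpbin.org/delete')                # delete a resource
print(r.status_code)
```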
```python
import json
jsonString = '''{ ...... }'''
jsonObj = json.loads(jsonString)
print(jsonObj.get("..."))
```
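A concrete version of the same pattern (the JSON string below is only an illustrative example):
```python
import json

jsonString = '{"arrayOfNums": [{"number": 0}, {"number": 1}, {"number": 2}], "arrayOfFruits": [{"fruit": "apple"}, {"fruit": "banana"}]}'
jsonObj = json.loads(jsonString)
print(jsonObj.get('arrayOfNums'))                   # list of dicts
print(jsonObj.get('arrayOfNums')[1].get('number'))  # 1
print(jsonObj.get('arrayOfFruits')[1].get('fruit')) # banana
```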
___
## STORE DATA
* store only a reference (URL) to the media file -> hotlinking
* store the media file itself
```python
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
# urllib.request.urlretrieve downloads a file from a remote URL

downloadDirectory = 'downloaded'
baseUrl = 'http://pythonscraping.com'

def getAbsoluteURL(baseUrl, source):
    if source.startswith('http://www.'):
        url = 'http://{}'.format(source[11:])
    elif source.startswith('http://'):
        url = source
    elif source.startswith('www.'):
        url = 'http://{}'.format(source[4:])
    else:
        url = '{}/{}'.format(baseUrl, source)
    if baseUrl not in url:
        return None
    return url

def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    path = absoluteUrl.replace('www.', '')
    path = path.replace(baseUrl, '')
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path

html = urlopen('http://www.pythonscraping.com')
bs = BeautifulSoup(html, 'html.parser')
downloadList = bs.findAll(src=True)  # every tag with a src attribute
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download['src'])
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
```
```python
import pymysql  # third-party package: pip install pymysql
# For full Unicode support, ALTER the database / table / VARCHAR column
# character set to utf8mb4 (collation utf8mb4_unicode_ci)
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock',
                       user='root', passwd='root', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute('USE scraping')
cur.execute('SELECT * FROM ... WHERE ...')
print(cur.fetchone())
try:
    cur.execute("INSERT INTO ... VALUES ...")
    cur.connection.commit()
finally:
    cur.close()
    conn.close()
```
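The SQL above is elided; a minimal sketch of a parameterized insert (the `pages` table and its columns are hypothetical), letting pymysql escape the values:
```python
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='root',
                       db='scraping', charset='utf8')
cur = conn.cursor()
try:
    # %s placeholders: pymysql quotes/escapes the values, avoiding SQL injection
    cur.execute('INSERT INTO pages (title, content) VALUES (%s, %s)',
                ('Page Title', 'Some scraped text'))
    conn.commit()
    cur.execute('SELECT * FROM pages WHERE title = %s', ('Page Title',))
    print(cur.fetchone())
finally:
    cur.close()
    conn.close()
```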
## MAIL
```python
import smtplib
from email.mime.text import MIMEText
msg = MIMEText('The body of the email is here')
msg['Subject'] = 'An Email Alert'
msg['From'] = 'ryan@pythonscraping.com'
msg['To'] = 'webmaster@pythonscraping.com'
s = smtplib.SMTP('localhost')
s.send_message(msg)
s.quit()
```
___
## ENCODING
* ASCII : '0' + 7 bits per character
* UTF-8 : a character takes at least 8 bits and at most 4 bytes
    * '0' + 7 bits : encodes the same characters as ASCII
    * '1' + ....   : characters beyond the ASCII range
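A tiny sketch of the point above: ASCII characters keep a single byte under UTF-8, while other characters take more:
```python
text = 'café'
encoded = text.encode('utf-8')
print(encoded)                            # b'caf\xc3\xa9' -> 'é' takes two bytes
print(len(text), len(encoded))            # 4 characters, 5 bytes
print(encoded.decode('ascii', 'ignore'))  # 'caf' -> the non-ASCII bytes are dropped
```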
```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)")
bs = BeautifulSoup(html, "html.parser")
content = bs.find("div", {"id": "mw-content-text"}).get_text()
content = bytes(content, "UTF-8")
content = content.decode("UTF-8")
print(content)  # check the page's <meta charset="utf-8"> to know which encoding to decode with
```
## CSV
```python
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://en.wikipedia.org/wiki/Comparison_of_text_editors')
bs = BeautifulSoup(html, 'html.parser')
# The main comparison table is currently the first table on the page
table = bs.findAll('table',{'class':'wikitable'})[0]
rows = table.findAll('tr')
csvFile = open('editors.csv', 'wt+', newline='')  # newline='' keeps csv from writing extra \r on Windows
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
```
```python
import csv
from urllib.request import urlopen
from io import StringIO
data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('ascii', 'ignore')
dataFile = StringIO(data)
# csv.reader yields every row, header row included
csvReader = csv.reader(dataFile)
for row in csvReader:
    print(row)

# DictReader uses the header row as field names instead of returning it as data;
# rewind the StringIO first, because the reader above already consumed it
dataFile.seek(0)
dictReader = csv.DictReader(dataFile)
print(dictReader.fieldnames)
for row in dictReader:
    print(row)
```
___
## CLEAN DATA
https://ithelp.ithome.com.tw/articles/10222163
```python
import re
import string

# content: the page text scraped earlier (e.g. the Wikipedia text above)
content = re.sub(r'\n|\[\d+\]', ' ', content)   # drop newlines and citation marks like [123]
# equivalent, step by step:
# content = re.sub('\n+', ' ', content)
# content = re.sub(r'\[[0-9]*\]', ' ', content)
# content = re.sub(' +', ' ', content)
content = bytes(content, 'UTF-8')               # round-trip through bytes ...
content = content.decode('ascii', 'ignore')     # ... to strip non-ASCII / escape characters
content = content.split(' ')
words = []
for item in content:
    item = item.strip(string.punctuation + string.whitespace)
    if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
        words.append(item)
```
```python
from collections import OrderedDict
from collections import Counter
import operator

# ngrams: a dict mapping each n-gram to its count (built elsewhere)
sortedNGrams = sorted(ngrams.items(), key=operator.itemgetter(1), reverse=True)
# OrderedDict(sortedNGrams) keeps that order; Counter(ngrams).most_common() is equivalent
```
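A minimal sketch of where such an `ngrams` dict can come from, assuming the cleaned `words` list built in the block above:
```python
from collections import Counter

def getNgrams(words, n=2):
    # consecutive n-word windows over the cleaned word list
    return [' '.join(words[i:i+n]) for i in range(len(words) - n + 1)]

ngrams = Counter(getNgrams(words, 2))
print(ngrams.most_common(10))   # the ten most frequent 2-grams
```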
## FORMS
* `<form enctype=" ... ">`
    * application/x-www-form-urlencoded : the default; all characters are encoded before sending
    * multipart/form-data : characters are not encoded; required for forms that upload files
    * text/plain : spaces become "+", but special characters are not encoded
```python
import requests
# where the data is sent -> <form method="POST" action=" ... ">
# text fields to fill in -> <input type="text" name=" ... ">
params = {'firstname': 'Ryan', 'lastname': 'Mitchell'}
r = requests.post("http://pythonscraping.com/pages/processing.php", data=params)
print(r.text)
# file-upload field -> <input type="file" name=" ... ">
files = {'uploadFile': open('files/Python-logo.png', 'rb')}  # open the file in binary mode
r = requests.post('http://pythonscraping.com/pages/processing2.php', files=files)
print(r.text)
```
```python
import requests
params = {'username': 'Ryan', 'password': 'password'}
r = requests.post('http://pythonscraping.com/pages/cookies/welcome.php', data=params)
print('Cookie is set to:')
print(r.cookies.get_dict())
r = requests.get('http://pythonscraping.com/pages/cookies/profile.php', cookies=r.cookies)
print(r.text)
```
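The same flow with a `requests.Session`, which carries the cookies along automatically (a sketch of the session variant):
```python
import requests

session = requests.Session()
params = {'username': 'Ryan', 'password': 'password'}
s = session.post('http://pythonscraping.com/pages/cookies/welcome.php', data=params)
print('Cookie is set to:')
print(session.cookies.get_dict())
s = session.get('http://pythonscraping.com/pages/cookies/profile.php')
print(s.text)
```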
```python
import requests
from requests.auth import AuthBase
from requests.auth import HTTPBasicAuth
auth = HTTPBasicAuth('ryan', 'password')
r = requests.post(url='http://pythonscraping.com/pages/auth/login.php', auth=auth)
print(r.text)
```
___
## JAVASCRIPT
* Ajax : a technique for sending data to or receiving data from a web server without requesting a whole new page
* DHTML : page content that changes as client-side scripts modify the DOM
* Selenium + PhantomJS (a headless browser); the examples below use headless Chrome instead
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path='drivers/chromedriver', options=chrome_options)
driver.get('http://pythonscraping.com/pages/javascript/ajaxDemo.html')
try:
    # wait (up to 10 s) until the Ajax call has added the element with id="loadedButton"
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'loadedButton')))
finally:
    print(driver.find_element_by_id('content').text)
    driver.close()

# the rendered page source can also be handed to BeautifulSoup:
# pageSource = driver.page_source
# bsObj = BeautifulSoup(pageSource, 'html.parser')
# print(bsObj.find(id="content").get_text())

# selector style:
# driver.find_element_by_id('content').text
# locator style:
# driver.find_element(By.ID, 'content').text
# By strategies: ID, CLASS_NAME, CSS_SELECTOR, LINK_TEXT, PARTIAL_LINK_TEXT, NAME, TAG_NAME, XPATH
```
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import StaleElementReferenceException
import time
# poll one DOM element to detect when the page has been replaced (e.g. after a redirect)
def waitForLoad(driver):
    elem = driver.find_element_by_tag_name("html")
    count = 0
    while True:
        count += 1
        if count > 20:
            print("Timing out after 10 seconds and returning")
            return
        time.sleep(.5)
        try:
            # once the redirect replaces the DOM, the old element reference goes stale
            elem == driver.find_element_by_tag_name("html")
        except StaleElementReferenceException:
            return

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path='drivers/chromedriver', options=chrome_options)
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
waitForLoad(driver)
print(driver.page_source)
driver.close()
```
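The hand-rolled loop above can also be expressed with Selenium's built-in expected condition; a sketch using `EC.staleness_of`:
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(executable_path='drivers/chromedriver', options=chrome_options)
driver.get("http://pythonscraping.com/pages/javascript/redirectDemo1.html")
# block until the original <html> element goes stale, i.e. the redirect has replaced the page
WebDriverWait(driver, 10).until(
    EC.staleness_of(driver.find_element_by_tag_name('html')))
print(driver.page_source)
driver.close()
```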