---
disqus: hackmd
---
PTT home sale crawler
===
以 Python 編寫一個會爬取 PTT Home Sale 版中的房屋資訊的爬蟲,並存成json格式的檔案。
## 撰寫爬蟲程式
撰寫一個爬蟲程式,使用 requests, BeautifulSoup 與 Regular Expression 套件擷取並解析 PTT Home Sale 版的房屋資訊,最後把資料存成 json 格式的資料檔案。
## Colab demo :
https://colab.research.google.com/drive/1IWgxppdIZc7Lt2fZtGeJGHQ3ugdtHrlV?usp=sharing
## 準備套件
```python=
import requests
import re
import urllib3
import json
from bs4 import BeautifulSoup
urllib3.disable_warnings()
```
## 網站域名
```python=
# Base URL of the PTT site; relative hrefs scraped from index pages are appended to it.
allowed_domains='https://www.ptt.cc'
```
## 宣告 List
```python=
# Global accumulators filled by save_data_json(): one record per matching post.
buy=[]   # posts whose bracketed title tag contains "買" (buy)
sell=[]  # posts whose bracketed title tag contains "賣" (sell)
```
## 網站請求函數
```python=
def get_web_page(url):
    """Fetch *url* and return (BeautifulSoup, url); (None, None) when url is falsy.

    SSL verification is disabled (verify=False); the matching urllib3
    warnings are silenced at import time.
    """
    if not url:
        return None, None
    response = requests.get(url=url, verify=False)
    return BeautifulSoup(response.text, 'lxml'), url
```
## 網頁翻頁函數
```python=
def next_page(soup):
    """Return the absolute URL of the previous index page ("‹ 上頁"),
    or None when the pagination button is disabled (oldest page reached)."""
    paging = soup.find('div', {'class': 'btn-group btn-group-paging'})
    # A disabled "‹ 上頁" anchor means there is no older page to visit.
    if paging.find('a', 'btn wide disabled', string="‹ 上頁"):
        return None
    return allowed_domains + paging.find("a", string="‹ 上頁").get('href')
```
## 擷取賣方頁面的房屋資訊函數
```python=
def data_extraction_sell(string):
    """Parse a "sell" post body and return a dict of house attributes.

    *string* is the article text with whitespace already stripped, so each
    field appears as "<name>:<value>".  Every key is always present in the
    returned dict; fields not found are recorded as "record not found" so
    the output schema is stable across posts.
    """
    # Numeric fields accept plain integers as well as decimals.  The
    # original pattern (\d+(\.\d+)) required a fractional part, so values
    # like "開價:1000" were silently reported as missing.
    number = r"\d+(?:\.\d+)?"
    fields = {
        '房屋類型': r".+",
        '建物權狀': number,
        '附屬建物': number,
        '共同使用': number,
        '車位': number,
        '建物權狀是否包含車位坪數': r".+",
        '土地權狀': number,
        '格局': r".+",
        '開價': number,
    }
    data = {}
    for key, pattern in fields.items():
        match = re.search(key + ":(" + pattern + ")", string)
        data[key] = match.group(1) if match else "record not found"
    return data
```
## 擷取買方頁面的房屋資訊函數
```python=
def data_extraction_buy(string):
    """Parse a "buy" post body and return a dict of requested-house attributes.

    *string* is the article text with whitespace already stripped, so each
    field appears as "<name>:<value>".  Every key is always present in the
    returned dict; fields not found are recorded as "record not found".
    (The original code omitted the 需求時間 key entirely when absent,
    making the schema inconsistent — fixed here.)
    """
    # Numeric fields accept plain integers as well as decimals; the original
    # pattern (\d+(\.\d+)) required a fractional part and missed values
    # like "房屋屋齡:10".
    number = r"\d+(?:\.\d+)?"
    fields = {
        '房屋地點': r".+",
        '房屋屋齡': number,
        '房屋類型': r".+",
        '房屋坪數': number,
        '停車位': number,
        '車位種類': r".+",
        '房屋格局': r".+",
        '是否加蓋': r".+",
        '房屋座向': r".+",
        '買價': number,
        '需求時間': r".+",
        '聯絡方式': r".+",
        '其他聯絡': r".+",
    }
    data = {}
    for key, pattern in fields.items():
        match = re.search(key + ":(" + pattern + ")", string)
        data[key] = match.group(1) if match else "record not found"
    return data
```
## 時間解析函數
```python=
def find_date(date):
    """Split a PTT timestamp such as "Sun Feb 14 21:00:00 2021".

    Returns a 4-tuple (weekday, "Mon DD", "HH:MM:SS", year).

    Bug fix: the original ignored its parameter and read the global
    ``Date`` instead (it only worked because the caller happened to set
    that global first).  This version uses the argument it is given, which
    is backward compatible since the caller passes the same value.
    """
    first = date.find(' ')
    second = date.find(' ', first + 1)
    third = date.find(' ', second + 1)
    fourth = date.find(' ', third + 1)
    day = date[:first]
    month_day = date[first + 1:third]   # month + day, e.g. "Feb 14"
    time = date[third + 1:fourth]
    year = date[fourth + 1:]
    return day, month_day, time, year
```
## 儲存Json格式的函數
```python=
def save_data_json(soup,title_name):
    """Classify one article into the global ``buy``/``sell`` lists.

    *title_name* is the bracketed title tag, e.g. "賣/台北/...": the text
    between the first two '/' is taken as the county.  The article timestamp
    is read from the page metadata; pages without it are skipped silently
    (best effort, as in the original).
    """
    global buy
    global sell
    global day,date,time,year,Date
    start_quote=title_name.find('/')
    end_quote=title_name.find('/', start_quote+1)
    county=title_name[start_quote+1:end_quote]
    try:
        # The fourth .article-meta-value on a PTT article page is the date.
        Date=soup.select('.article-meta-value')[3].text
    except (AttributeError, IndexError):
        # Narrowed from a bare except: AttributeError covers soup=None,
        # IndexError covers pages missing the metadata block.
        pass
    else:
        day,date,time,year=find_date(Date)
        # Drop all horizontal whitespace so fields parse as "<name>:<value>".
        string=re.sub(r"[ \u3000\f\r\t\v]","",soup.text)
        if "買" in title_name:
            buy.append([county,year+' '+date,time,data_extraction_buy(string)])
        elif "賣" in title_name:
            sell.append([county,year+' '+date,time,data_extraction_sell(string)])
```
## 擷取網頁的名稱將其分類
```python=
# Running counter of matching posts across all pages (printed for progress).
j=1
def name_filter(soup):
    """Scan one index page, print matching titles, and crawl each article.

    A post qualifies when its bracketed title tag (e.g. "[賣/台北/...]")
    contains "買" or "賣".  Deleted posts keep a title but have no <a>
    link; the original crashed on them with AttributeError — they are now
    skipped.
    """
    global j
    for entry in soup.select('.r-ent'):
        title_name=entry.select('.title')[0].text
        if '[' in title_name and ']' in title_name:
            start_quote = title_name.find('[')
            end_quote = title_name.find(']', start_quote+1)
            title_name = title_name[start_quote+1:end_quote]
            if (any(c in title_name for c in ["買","賣"])):
                anchor=entry.find('a')
                if anchor is None:
                    # Deleted post: no link to follow.
                    continue
                print(j,'.',title_name)
                j=j+1
                content,url=get_web_page(allowed_domains+anchor.get('href'))
                save_data_json(content,title_name)
```
## 運行程式
```python=
# Crawl from the newest index page backwards until next_page() finds no
# older page (get_web_page then yields url=None and the loop ends).
(soup, url) = get_web_page(url='https://www.ptt.cc/bbs/home-sale/index.html')
k = 1
while url:
    print('Page:', k, ':', url, '\n')
    k += 1
    name_filter(soup)
    (soup, url) = get_web_page(next_page(soup))
```
## 輸出資料
```python=
# Dump the collected sell records: county, "year month day", then the field
# dict.  Index 2 (the time-of-day) is intentionally not printed.
for record in sell:
    print(record[0], record[1], '\n\n', record[3], '\n')
```
## 把資料儲存為json檔案
```python=
def file(data):
    """Serialize *data* to the file "test" as human-readable UTF-8 JSON.

    NOTE(review): the name shadows the ``file`` builtin; kept unchanged for
    backward compatibility with existing callers.
    """
    with open("test", mode="w", encoding="utf-8") as fp:
        # ensure_ascii=False keeps the Chinese keys/values readable instead
        # of \uXXXX escapes; indent pretty-prints one field per line.
        fp.write(json.dumps(data, ensure_ascii=False, indent=2))
file(buy)  # choose which list to dump: buy / sell
```
## 小結
在這個小節中我們簡介如何以 Python 的 BeautifulSoup 與 Regular Expression 套件擷取網頁的房屋資料。使用 BeautifulSoup 解析網頁的 HTML 並且運用 Regular Expression 擷取網頁特定關鍵字的資料。最後把所擷取到的資料存成 json 格式的檔案。