PPT home sale crawler

--- disqus: hackmd --- PPT home sale crawler === 以 Python 編寫一個會爬取 PTT Home Sale 版中的房屋資訊的爬蟲，並存成json格式的檔案。 ## 撰寫爬蟲程式撰寫一個爬蟲程式，使用 requests, BeautifulSoup 與 Regular Expression 套件擷取並解析 PTT Home Sale 版的房屋資訊，最後把資料存成 json 格式的資料檔案。 ## Colab demo : https://colab.research.google.com/drive/1IWgxppdIZc7Lt2fZtGeJGHQ3ugdtHrlV?usp=sharing ## 準備套件 ```python= import requests import re import urllib3 import json from bs4 import BeautifulSoup urllib3.disable_warnings() ``` ## 網站域名 ```python= allowed_domains='https://www.ptt.cc' ``` ## 宣告 List ```python= buy=[] sell=[] ``` ## 網站請求函數 ```python= def get_web_page(url): if url: res=requests.get(url=url,verify=False) soup=BeautifulSoup(res.text,'lxml') return (soup,url) else: return None,None ``` ## 網頁翻頁函數 ```python= def next_page(soup): if(soup.find('div',{'class':'btn-group btn-group-paging'}).find('a','btn wide disabled',string="‹ 上頁")): return None else: result=soup.find('div',{'class':'btn-group btn-group-paging'}).find("a",string="‹ 上頁").get('href') link=allowed_domains+result return link ``` ## 擷取賣方頁面的房屋資訊函數 ```python= def data_extraction_sell(string): i=1 data={} if re.search(r"房屋類型：(.+)",string)!=None: data['房屋類型']=re.search(r"房屋類型：(.+)",string).group(i) else: data['房屋類型']="record not found" if re.search(r"建物權狀：(\d+(\.\d+))",string)!=None: data['建物權狀']=re.search(r"建物權狀：(\d+(\.\d+))",string).group(i) else: data['建物權狀']="record not found" if re.search(r"附屬建物：(\d+(\.\d+))",string)!=None: data['附屬建物']=re.search(r"附屬建物：(\d+(\.\d+))",string).group(i) else: data['附屬建物']="record not found" if re.search(r"共同使用：(\d+(\.\d+))",string)!=None: data['共同使用']=re.search(r"共同使用：(\d+(\.\d+))",string).group(i) else: data['共同使用']="record not found" if re.search(r"車位：(\d+(\.\d+))",string)!=None: data['車位']=re.search(r"車位：(\d+(\.\d+))",string).group(i) else: data['車位']="record not found" if re.search(r"建物權狀是否包含車位坪數：(.+)",string)!=None: data['建物權狀是否包含車位坪數']=re.search(r"建物權狀是否包含車位坪數：(.+)",string).group(i) else: data['建物權狀是否包含車位坪數']="record not found" if re.search(r"土地權狀：(\d+(\.\d+))",string)!=None: data['土地權狀']=re.search(r"土地權狀：(\d+(\.\d+))",string).group(i) else: data['土地權狀']="record not found" if re.search(r"格局：(.+)",string)!=None: data['格局']=re.search(r"格局：(.+)",string).group(i) else: data['格局']="record not found" if re.search(r"開價：(\d+(\.\d+))",string)!=None: data['開價']=re.search(r"開價：(\d+(\.\d+))",string).group(i) else: data['開價']="record not found" return data ``` ## 擷取買方頁面的房屋資訊函數 ```python= def data_extraction_buy(string): i=1 data={} if re.search(r"房屋地點：(.+)",string)!=None: data['房屋地點']=re.search(r"房屋地點：(.+)",string).group(i) else: data['房屋地點']="record not found" if re.search(r"房屋屋齡：(\d+(\.\d+))",string)!=None: data['房屋屋齡']=re.search(r"房屋屋齡：(\d+(\.\d+))",string).group(i) else: data['房屋屋齡']="record not found" if re.search(r"房屋類型：(.+)",string)!=None: data['房屋類型']=re.search(r"房屋類型：(.+)",string).group(i) else: data['房屋類型']="record not found" if re.search(r"房屋坪數：(\d+(\.\d+))",string)!=None: data['房屋坪數']=re.search(r"房屋坪數：(\d+(\.\d+))",string).group(i) else: data['房屋坪數']="record not found" if re.search(r"停車位：(\d+(\.\d+))",string)!=None: data['停車位']=re.search(r"停車位：(\d+(\.\d+))",string).group(i) else: data['停車位']="record not found" if re.search(r"車位種類：(.+)",string)!=None: data['車位種類']=re.search(r"車位種類：(.+)",string).group(i) else: data['車位種類']="record not found" if re.search(r"房屋格局：(.+)",string)!=None: data['房屋格局']=re.search(r"房屋格局：(.+)",string).group(i) else: data['房屋格局']="record not found" if re.search(r"是否加蓋：(.+)",string)!=None: data['是否加蓋']=re.search(r"是否加蓋：(.+)",string).group(i) else: data['是否加蓋']="record not found" if re.search(r"房屋座向：(.+)",string)!=None: data['房屋座向']=re.search(r"房屋座向：(.+)",string).group(i) else: data['房屋座向']="record not found" if re.search(r"買價：(\d+(\.\d+))",string)!=None: data['買價']=re.search(r"買價：(\d+(\.\d+))",string).group(i) else: data['買價']="record not found" if re.search(r"需求時間：(.+)",string)!=None: data['需求時間']=re.search(r"需求時間：(.+)",string).group(i) if re.search(r"聯絡方式：(.+)",string)!=None: data['聯絡方式']=re.search(r"聯絡方式：(.+)",string).group(i) else: data['聯絡方式']="record not found" if re.search(r"其他聯絡：(.+)",string)!=None: data['其他聯絡']=re.search(r"其他聯絡：(.+)",string).group(i) else: data['其他聯絡']="record not found" return data ``` ## 時間解析函數 ```python= def find_date(date): first=Date.find(' ') second=Date.find(' ',first+1) third=Date.find(' ',second+1) fourth=Date.find(' ',third+1) day=Date[:first] date=Date[first+1:third] time=Date[third+1:fourth] year=Date[fourth+1:] return day,date,time,year ``` ## 儲存Json格式的函數 ```python= def save_data_json(soup,title_name): global buy global sell global day,date,time,year,Date start_quote=title_name.find('/') end_quote=title_name.find('/', start_quote+1) county=title_name[start_quote+1:end_quote] try: Date=soup.select('.article-meta-value')[3].text except: pass else: day,date,time,year=find_date(Date) string=re.sub(r"[ \u3000\f\r\t\v]","",soup.text) if "買" in title_name: data=data_extraction_buy(string) buy.append([county,year+' '+date,time,data]) elif "賣" in title_name: data=data_extraction_sell(string) sell.append([county,year+' '+date,time,data]) ``` ## 擷取網頁的名稱將其分類 ```python= j=1 def name_filter(soup): global j for entry in soup.select('.r-ent'): title_name=entry.select('.title')[0].text if '[' in title_name and ']'in title_name: start_quote = title_name.find('[') end_quote = title_name.find(']', start_quote+1) title_name = title_name[start_quote+1:end_quote] if (any(c in title_name for c in ["買","賣"])): print(j,'.',title_name) j=j+1 link=allowed_domains+entry.find('a').get('href') content,url=get_web_page(link) save_data_json(content,title_name) ``` ## 運行程式 ```python= (soup,url)=get_web_page(url='https://www.ptt.cc/bbs/home-sale/index.html') k=1 while True: if url: print('Page:',k,':',url,'\n') k=k+1 name_filter(soup) link=next_page(soup) (soup,url)=get_web_page(link) else: break ``` ## 輸出資料 ```python= for entry in sell: print(entry[0],entry[1],'\n\n',entry[3],'\n') ``` ## 把資料儲存為json檔案 ```python= def file(data): with open("test",mode="w",encoding="utf-8")as file: json_string=json.dumps(data) file.write(json_string) file(buy) #自行設定 buy / sell ``` ## 小結在這個小節中我們簡介如何以 Python 的 BeautifulSoup 與 Regular Expression 套件擷取網頁的房屋資料。使用 BeautifulSoup 解析網頁的 HTML 並且運用 Regular Expression 擷取網頁特定關鍵字的資料。最後把所擷取到的資料存成 json 格式的檔案。