# PTT Crawler
## Main tools
- requests (`pip3 install requests`)
- BeautifulSoup (`pip3 install beautifulsoup4`)
- lxml (`pip3 install lxml`)
## Setup and execution
### Environment setup
It is recommended to use virtualenv to manage the Python environment:
``` sh
pip3 install virtualenv
which python3
virtualenv -p /usr/bin/python3 Envpython3  # pick the Python version and create the environment
source Envpython3/bin/activate
```
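With the virtualenv activated, the tools listed above can be installed into it (same packages as in the tools list):
``` sh
pip3 install requests beautifulsoup4 lxml
```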
### Starting the crawler
```python=
import requests
from bs4 import BeautifulSoup

homepage = requests.get('https://www.ptt.cc/bbs/hotboards.html')
# print(homepage.text)
# with open('a.text', 'a') as f:
#     f.write(homepage.text)

soup = BeautifulSoup(homepage.text, 'lxml')
# print(soup)
# with open('b.text', 'a') as f:
#     f.write(soup.text)

# find() / select_one() return only the first matching tag
posts11 = soup.find('a', class_='board')
posts12 = soup.select_one('a.board')
print(posts11)
print('---')
print(posts12)

# find_all() / select() return every matching tag
posts1 = soup.find_all('a', class_='board')
# posts1 = soup.select('a.board')
for p in posts1:
    h_name = p.find('div', class_='board-name')
    # print(h_name)
    h_page = p.select('span')
    # print(h_page)
    h_classname = p.select('div.board-class')
    # print(h_classname)
    h_title = p.select('div.board-title')
    # print(h_title)
    h_url = 'https://www.ptt.cc' + p['href']
    print(h_url)

    # fetch the board's article list
    article = requests.get(
        url=h_url,
        cookies={'over18': 'yes'}  # PTT over-18 confirmation
    )
    board_soup = BeautifulSoup(article.text, 'lxml')  # new soup for the board page
    r_ent = board_soup.select('div.r-ent')[0].text
    a_url = board_soup.select('div.title > a')[0]['href']
    a_title = board_soup.select('div.title')[0].text
    a_author = board_soup.select('div.author')[0].text
    a_date = board_soup.select('div.date')[0].text
    print(a_title)
    print('https://www.ptt.cc/' + a_url)
```
### Parser comparison: lxml vs. html5lib vs. html.parser

```python
import requests
from bs4 import BeautifulSoup
import time

start = time.time()
homepage = requests.get('https://www.ptt.cc/bbs/hotboards.html')
# print(homepage.text)

# accumulated time per variant
a = 0  # raw text, no parsing
b = 0  # lxml
c = 0  # html5lib
d = 0  # html.parser
for i in range(1, 100):
    start = time.time()  # restart the clock for the raw-write (no parsing) case
    with open('nosoup.text', 'a') as f:
        f.write(homepage.text)
    end = time.time()
    elapsed = end - start
    a = a + elapsed
    print("Time taken nosoup: ", elapsed, "seconds.")

    start = time.time()
    soup = BeautifulSoup(homepage.text, 'lxml')
    # print(soup)
    with open('lxml.text', 'a') as f:
        f.write(soup.text)
    end = time.time()
    elapsed = end - start
    b = b + elapsed
    print("Time taken lxml.text: ", elapsed, "seconds.")

    start = time.time()
    h5soup = BeautifulSoup(homepage.text, 'html5lib')
    # print(h5soup)
    with open('html5lib.text', 'a') as f:
        f.write(h5soup.text)
    end = time.time()
    elapsed = end - start
    c = c + elapsed
    print("Time taken html5lib: ", elapsed, "seconds.")

    start = time.time()
    parsersoup = BeautifulSoup(homepage.text, 'html.parser')
    # print(parsersoup)
    with open('htmlparser.text', 'a') as f:
        f.write(parsersoup.text)
    end = time.time()
    elapsed = end - start
    d = d + elapsed
    print("Time taken htmlparser: ", elapsed, "seconds.")

print("Time taken nosoup: ", a, "seconds.")
print("Time taken lxml: ", b, "seconds.")
print("Time taken html5lib: ", c, "seconds.")
print("Time taken htmlparser: ", d, "seconds.")
```

The rest of these notes use the lxml parser:

```python
soup = BeautifulSoup(homepage.text, 'lxml')
```


Extract each hot board's name, page count, class, title, and URL:

```python
posts1 = soup.find_all('a', class_='board')
# posts1 = soup.select('a.board')
for p in posts1:
    h_name = p.find('div', class_='board-name')
    h_page = p.select('span')
    h_classname = p.select('div.board-class')
    h_title = p.select('div.board-title')
    h_url = 'https://www.ptt.cc' + p['href']
```

### Next page: crawling each board's article links and info
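A minimal sketch of walking a board's index pages, reusing the `div.r-ent` / `div.title` / `div.author` / `div.date` selectors used elsewhere in these notes; the `div.btn-group-paging` selector for the「‹ 上頁」(previous page) button is an assumption about PTT's markup:

```python
import requests
from bs4 import BeautifulSoup

board_url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
for _ in range(5):  # e.g. 5 pages, as in Assignment 2
    page = requests.get(board_url, cookies={'over18': 'yes'})
    soup = BeautifulSoup(page.text, 'lxml')

    # one div.r-ent per article row on the index page
    for ent in soup.select('div.r-ent'):
        link = ent.select_one('div.title > a')
        if link is None:      # deleted articles have no <a> tag
            continue
        title = link.text
        author = ent.select_one('div.author').text
        date = ent.select_one('div.date').text
        article_url = 'https://www.ptt.cc' + link['href']
        print(title, author, date, article_url)

    # assumed markup: the second paging button is「‹ 上頁」, i.e. the next (older) index page
    prev_link = soup.select('div.btn-group-paging a')[1]['href']
    board_url = 'https://www.ptt.cc' + prev_link
```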

### Handling pages that need a click

PTT boards marked 18+ first show an age-confirmation page that normally needs a click before the article list loads.

### Finding the parameter to pass in

Watching what the "yes, I am over 18" click sends (e.g. in the browser's developer tools) shows that it just sets an `over18` cookie, so passing that cookie with the request skips the confirmation page:
```python
article = requests.get(
    url=h_url,
    cookies={'over18': 'yes'}  # PTT over-18 confirmation
)
```
#### If you have questions, come and ask!
### Assignment 1
Print each hot board's name, page count, class, and title, for all boards, e.g.:
`Gossiping 15907 綜合 ◎【八卦】紅.橙.黃.綠.藍.紫`

### Assignment 2
Crawl 5 pages of a board's article list and print each article's title, author, and date, e.g.:
`去高雄找女網友 要帶多少錢 seabox 5/22`

### Assignment 3
Grab just the article body. `split` can be used to cut strings, e.g. with `a = '123.345'`, `a.split('.')[0]` gives `'123'`. There are also many other string methods such as `replace`, `strip`, etc. (see the sketch below).
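A minimal sketch of trimming an article body, using the same `split` on the `※ 發信站` signature marker that the full script below uses; the sample string is made up:

```python
# made-up sample of an article body with PTT's signature block at the end
body = '這是文章內文...\n※ 發信站: 批踢踢實業坊(ptt.cc)'

content = body.split('※ 發信站')[0]   # keep everything before the signature block
content = content.replace('\n', ' ')   # flatten newlines into spaces
content = content.strip()              # trim leading/trailing whitespace
print(content)
```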

###### The first one to finish gets a drink!!
## Python classes and functions
### dict format
```
dictname = {
    "key": value
}
```
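A quick sketch of the dict operations used later in these notes (key lookup, and iterating with `.items()` as the CSV-writing code does); the values here are made up:

```python
dictname = {
    "key": "value",
    "another_key": "another value",
}

print(dictname["key"])               # look up a single value by key

for key, value in dictname.items():  # iterate over all key/value pairs
    print(key, value)
```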
### Assignments 1-3 + saving to a CSV file
```python=
import requests
import csv
import codecs
from bs4 import BeautifulSoup

class ppt_spider():
    start_urls = ['https://www.ptt.cc/bbs/hotboards.html']

    def __init__(self):
        self.h_number = 0
        self.c_number = 0
        self.header = []

    def start_requests(self):
        # write the CSV header once, then crawl every start URL
        header = ['firstpage', 'username', 'classname', 'category_pages', 'category_title', 'category_url', 'a_title', 'a_author', 'a_date', 'note_url', 'note']
        with open('ptt.csv', 'a', encoding='utf-8') as f:
            csv_write = csv.DictWriter(f, fieldnames=header, delimiter=';')
            csv_write.writeheader()
        for url in self.start_urls:
            request_url = requests.get(url)
            self.home_page(request_url)

    def home_page(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        posts1 = soup.find_all('a', class_='board')
        for p in posts1:
            self.h_number = self.h_number + 1
            username = p.find('div', class_='board-name').text
            # print(username)
            category_pages = p.select('span')[0].text
            # print(category_pages)
            classname = p.select('div.board-class')[0].text
            # print(classname)
            category_title = p.select('div.board-title')[0].text
            # print(category_title)
            category_url = 'https://www.ptt.cc' + p['href']
            # print(self.h_number, ' : ', category_url)
            request_url = requests.get(
                url=category_url,
                cookies={'over18': 'yes'}  # PTT over-18 confirmation
            )
            meta1 = {
                "firstpage": str(response.url),
                "username": str(username),
                "classname": str(classname),
                "category_pages": str(category_pages),
                "category_title": str(category_title),
                "category_url": str(category_url)
            }
            self.category_page(request_url, meta1)

    def category_page(self, response, meta1):
        self.c_number = self.c_number + 1
        soup = BeautifulSoup(response.text, 'lxml')
        r_ent = soup.select('div.r-ent')[0].text
        a_url = soup.select('div.title > a')[0]['href']
        a_title = soup.select('div.title')[0].text.replace('\n', '').replace('\t', '')
        if '刪除' in a_title:
            # deleted article: keep the deletion notice instead of a real URL
            a_url = a_title
            note = a_title
        else:
            note = ''
        a_author = soup.select('div.author')[0].text
        # print(a_author)
        a_date = soup.select('div.date')[0].text
        # print(a_date)
        note_url = 'https://www.ptt.cc/' + a_url
        # print(a_title)
        # print(self.c_number, ' : ', note_url)
        meta2 = {
            "a_title": str(a_title),
            "a_author": str(a_author),
            "a_date": str(a_date),
            "note_url": str(note_url),
            "note": str(note)
        }
        meta_12 = dict(meta1.items() | meta2.items())
        request_url = requests.get(
            url=note_url,
            cookies={'over18': 'yes'}  # PTT over-18 confirmation
        )
        self.note(request_url, meta_12)

    def note(self, response, meta_12):
        soup = BeautifulSoup(response.text, 'lxml')
        checkpage = soup.title.text
        if '404 Not Found' in checkpage:
            pass
        else:
            meta_12["note"] = soup.select('#main-content')[0].text.split('※ 發信站')[0].replace('\n', ' ')
        header = ['firstpage', 'username', 'classname', 'category_pages', 'category_title', 'category_url', 'a_title', 'a_author', 'a_date', 'note_url', 'note']
        with open('ptt.csv', 'a', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=header, delimiter=';')
            # only keep the keys that are defined as CSV columns
            meta_12 = {key: value for key, value in meta_12.items() if key in header}
            writer.writerow(meta_12)

a = ppt_spider()
a.start_requests()
```
### Writing a dict to a CSV file
###### Define `fieldnames` (the column names) from the dict's keys
```python=
header = ['firstpage', 'username', 'classname', 'category_pages', 'category_title', 'category_url', 'a_title', 'a_author', 'a_date', 'note_url', 'note']
with open('ptt.csv', 'a', encoding='utf-8') as f:
    csv_write = csv.DictWriter(f, fieldnames=header, delimiter=';')
    csv_write.writeheader()
    meta_12 = {key: value for key, value in meta_12.items() if key in header}
    csv_write.writerow(meta_12)
```
Two equivalent ways to keep only the keys that are CSV columns before writing the row:
```python=
# dict-comprehension version
meta_12 = {key: value for key, value in meta_12.items() if key in header}
writer.writerow(meta_12)

# explicit-loop version: build the filtered dict first, then write one row
filtered = {}
for key, value in meta_12.items():
    if key in header:
        filtered[key] = value
writer.writerow(filtered)
```
## Crawling tools
- python3
- scrapy (`pip3 install scrapy`)
### Scrapy getting started
``` sh
scrapy startproject Newjob   # project name
cd Newjob/ && scrapy genspider ppt www.ptt.cc
```
#### Parameter explanation
`name = 'ppt'`: `scrapy crawl` uses `name` to find the spider, e.g. `scrapy crawl ppt`.

#### Define a scrapy Item with the keys of the data to output
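A minimal sketch of what `ppt_crawler/items.py` could look like, assuming one `scrapy.Field()` per key used by the spider below (the path and class name come from the `from ppt_crawler.items import PptCrawlerItem` import):

```python
import scrapy

class PptCrawlerItem(scrapy.Item):
    # one Field per key the spider fills in
    firstpage = scrapy.Field()
    username = scrapy.Field()
    classname = scrapy.Field()
    category_pages = scrapy.Field()
    category_title = scrapy.Field()
    category_url = scrapy.Field()
    a_title = scrapy.Field()
    a_author = scrapy.Field()
    a_date = scrapy.Field()
    note_url = scrapy.Field()
    note = scrapy.Field()
```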

Run `scrapy crawl ppt -o name.csv` to crawl and export the items to `name.csv`.
### Assignment 4
http://quotes.toscrape.com/ (a starting-point sketch appears after the PTT spider below)
```python=
import scrapy
from bs4 import BeautifulSoup
from ppt_crawler.items import PptCrawlerItem

class PptSpider(scrapy.Spider):
    name = 'ppt'
    allowed_domains = ['www.ptt.cc']
    start_urls = ['https://www.ptt.cc/bbs/hotboards.html']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.h_number = 0
        self.c_number = 0

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.home_page)
            # request_url = requests.get(url)
            # self.home_page(request_url)

    def home_page(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        posts1 = soup.find_all('a', class_='board')
        for p in posts1:
            self.h_number = self.h_number + 1
            username = p.find('div', class_='board-name').text
            # print(username)
            category_pages = p.select('span')[0].text
            # print(category_pages)
            classname = p.select('div.board-class')[0].text
            # print(classname)
            category_title = p.select('div.board-title')[0].text
            # print(category_title)
            category_url = 'https://www.ptt.cc' + p['href']
            # print(self.h_number, ' : ', category_url)
            # request_url = requests.get(
            #     url=category_url,
            #     cookies={'over18': 'yes'}  # PTT over-18 confirmation
            # )
            meta = {
                "firstpage": str(response.url),
                "username": str(username),
                "classname": str(classname),
                "category_pages": str(category_pages),
                "category_title": str(category_title),
                "category_url": str(category_url)
            }
            # the over18 cookie skips PTT's age-confirmation page
            yield scrapy.Request(category_url, cookies={'over18': 'yes'},
                                 callback=self.category_page, meta=meta)
            # self.category_page(request_url, meta1)

    def category_page(self, response):
        meta1 = response.meta
        self.c_number = self.c_number + 1
        soup = BeautifulSoup(response.text, 'lxml')
        r_ent = soup.select('div.r-ent')[0].text
        a_url = soup.select('div.title > a')[0]['href']
        a_title = soup.select('div.title')[0].text.replace('\n', '').replace('\t', '')
        if '刪除' in a_title:
            # deleted article: keep the deletion notice instead of a real URL
            a_url = a_title
            note = a_title
        else:
            note = ''
        a_author = soup.select('div.author')[0].text
        # print(a_author)
        a_date = soup.select('div.date')[0].text
        # print(a_date)
        note_url = 'https://www.ptt.cc/' + a_url
        # print(a_title)
        # print(self.c_number, ' : ', note_url)
        meta2 = {
            "a_title": str(a_title),
            "a_author": str(a_author),
            "a_date": str(a_date),
            "note_url": str(note_url),
            "note": str(note)
        }
        meta = dict(meta1.items() | meta2.items())
        # print(meta)
        # request_url = requests.get(
        #     url=note_url,
        #     cookies={'over18': 'yes'}  # PTT over-18 confirmation
        # )
        # self.note(request_url, meta_12)
        yield scrapy.Request(note_url, cookies={'over18': 'yes'}, callback=self.note, meta=meta)

    def note(self, response):
        meta = response.meta
        soup = BeautifulSoup(response.text, 'lxml')
        checkpage = soup.title.text
        if '404 Not Found' in checkpage:
            pass
        else:
            meta["note"] = soup.select('#main-content')[0].text.split('※ 發信站')[0].replace('\n', ' ')
        item = PptCrawlerItem()
        item['firstpage'] = meta['firstpage']
        item['username'] = meta['username']
        item['classname'] = meta['classname']
        item['category_pages'] = meta['category_pages']
        item['category_title'] = meta['category_title']
        item['category_url'] = meta['category_url']
        item['a_title'] = meta['a_title']
        item['a_author'] = meta['a_author']
        item['a_date'] = meta['a_date']
        item['note_url'] = meta['note_url']
        item['note'] = meta['note']
        return item
```
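For Assignment 4, a minimal starting-point sketch in the same style, assuming the usual quotes.toscrape.com markup (`div.quote`, `span.text`, `small.author`, and a `li.next` pagination link); it yields plain dicts instead of a scrapy Item:

```python
import scrapy
from bs4 import BeautifulSoup

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        # each quote sits in a div.quote with its text and author inside
        for q in soup.select('div.quote'):
            yield {
                'text': q.select_one('span.text').text,
                'author': q.select_one('small.author').text,
            }
        # follow the "Next" button, if present
        next_link = soup.select_one('li.next > a')
        if next_link is not None:
            yield response.follow(next_link['href'], callback=self.parse)
```

Run it the same way: `scrapy crawl quotes -o quotes.csv`.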