1. Final Project Topic
2. Final Project Basic Goals
2.1 Basic Implementation
2.2 Advanced Implementation
First, import the required packages.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
Set up the article crawl. Because the feed keeps loading more items, we use webdriver to scroll the page down automatically.
N = 500 # number of articles to crawl
# open the browser and navigate to the target page
browser = webdriver.Chrome(executable_path='../chromedriver')
browser.get("https://www.cupoy.com/newsfeed/topicgrp/tech_tw")
start_time=time.time()
count = 0
articles_info = []
print('目前文章數: ', end='')
while count < N:
    html_source = browser.page_source
    soup = BeautifulSoup(html_source, 'html5lib')
    target = soup.find_all('a', class_='sc-jxGEyO')
    dummy = 0  # number of duplicate articles to subtract
    for d in target:
        article = {}
        article['title'] = d['title']
        article['url'] = d['href']
        article['origin'] = '/'.join(d['href'].split('/', 3)[:-1])  # home page of the news source site
        if article not in articles_info:  # skip duplicates picked up because of the scrolling delay
            articles_info.append(article)
        else:
            dummy += 1  # duplicate article, subtract it from the count
    count = count + len(target) - dummy  # corrected article count
    print(count, end=' ')
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # scroll down every five seconds until the URLs of 500 articles are collected
browser.quit()
if len(articles_info) > N:  # keep only the first 500 entries
    articles_info = articles_info[:N]
end_time = time.time()
print('take time:', end_time-start_time, 's')
Check the 500 collected articles and their titles:
for i, a in enumerate(articles_info, start=1):
    print(i, ' ', a['title'])
# convert to a DataFrame and save it
import os
os.getcwd()
df = pd.DataFrame(articles_info)
df.to_csv('C:/Users/vincentLee1231995/OneDrive/Documents/Personal/Crawling-in-60Days/Homework/final project/news_info.csv', index=False)
# read the csv back in
import pandas as pd
news_info = pd.read_csv('news_info.csv')
news_info.head()
Next, we crawl the content of each article. We start with a single-threaded crawler (the multi-threaded version is covered in the "Problem solving" section at the end).
Wrap the code into a function:
content = []  # holds the final content
def analysis(url_list, content):
    import requests
    from bs4 import BeautifulSoup
    import time
    import re
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'}
    for url in url_list:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        single_content = []  # content of one article after cleaning
        paragraphs = []      # content of one article as crawled
        if soup.find('p'):
            paragraphs.append(soup.find_all('p'))
        if soup.find('section'):
            paragraphs.append(soup.find_all('section'))
        for pars in paragraphs:  # pars is the set of <p> tags, then the set of <section> tags
            for par in pars:
                text = re.sub(r'[\W]+', ' ', par.text)  # replace special characters (\n, \t, ...) with spaces
                single_content.append(text)
        content.append(single_content)
After defining the function, call it with the arguments:
target = news_info['url'] # target URLs
print(len(target))
import time
start=time.time()
analysis(target, content) # call the function
end=time.time()
print('Done!', end-start, 's')
Store the results in a new list and add it as a new column to the original DataFrame:
news = []
for i in content:
    news.append(i)
news_info['content'] = news
news_info.to_csv('C:/Users/vincentLee1231995/OneDrive/Documents/Personal/Crawling-in-60Days/Homework/final project/news_contents.csv', index=False)
Next, use the new DataFrame to tidy the data and run some basic statistics.
data = pd.read_csv('./news_contents.csv')
data.head()
Count articles by source site:
origin_state = data['origin'].value_counts().reset_index()
origin_state.columns=['origin', 'count']
origin_state
Some source sites appear only a handful of times, so group them as 'Others':
def mapping_new_origin(origin, count):
    if count >= 5:
        return origin
    else:
        return 'Others'

new_origin = []
for i in range(len(origin_state)):
    new_origin.append(mapping_new_origin(origin_state.iloc[i, 0], origin_state.iloc[i, 1]))
origin_state['new_origin'] = new_origin # store the regrouped result as a new column
origin_state
Recount with the new grouping and save the result as a new DataFrame:
new_origin_state = origin_state.groupby(by='new_origin').sum().reset_index().sort_values(by='count', ascending=False).reset_index(drop=True)
new_origin_state
new_origin_state.to_csv('C:/Users/vincentLee1231995/OneDrive/Documents/Personal/Crawling-in-60Days/Homework/final project/new_origin_state.csv', index=False)
Next, draw a pie chart to see the distribution more clearly:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(25,10))
plt.pie(new_origin_state['count'],
        labels=new_origin_state['new_origin'],
        autopct='%0.1f%%')
#plt.rcParams['font.sans-serif']=['FangSong']
plt.legend()
plt.title('新聞來源網站分布')
plt.show()
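One note on the chart above: the title is a Chinese string, and matplotlib's default font has no CJK glyphs, so it may render as empty boxes (which is likely what the commented-out rcParams line was meant to address). A minimal sketch of one common fix, assuming the Microsoft JhengHei font (the msjh.ttc used later for the word cloud) is installed:

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei']  # a font family that contains CJK glyphs
plt.rcParams['axes.unicode_minus'] = False                # keep the minus sign rendering correctly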
Next, segment the article text into words and draw a word cloud based on the keywords.
import pandas as pd
data = pd.read_csv('./news_contents.csv')
data.head()
news = list(data['content'].values)
print(len(news), type(news))
Install and import the related packages:
!pip install jieba
import jieba
import jieba.analyse
Prepare the stop word list:
stopwords = []
with open('cn_stopwords.txt', 'r', encoding='utf-8') as f:  # a stop word list downloaded from the web
    for data in f.readlines():
        data = data.strip()
        stopwords.append(data)
len(stopwords)
Segment the text with jieba:
print('Jobs just begin!')
remained_news = []
startTime = time.time()
for n in news:
    seg = jieba.cut(n, cut_all=False)  # segment in default mode, remove stop words, then recombine into an article
    try:
        remained_news.append(''.join(list(filter(lambda a: a not in stopwords and a != '\n', seg))))
    except Exception as e:
        print(e)              # a 'nan' entry will raise an error here
        print(news.index(n))  # print the index to double-check whether that article's content is problematic
endTime = time.time()
print('Take time: ', endTime-startTime, 's')
print('All jobs Done!')
Keyword extraction:
keywords = []
startTime = time.time()
for n in remained_news:
    keywords.append(jieba.analyse.extract_tags(n, topK=20, withWeight=False))
for j in keywords:
    print(j, '\n')
endTime = time.time()
print('Take time: ', endTime-startTime, 's')
print('All jobs Done!')
Segmentation and keyword extraction give us two text sources. Let's build a word cloud from the former first; as before, wrap the code into a function:
def plt_wordcloud(content):
    from wordcloud import WordCloud
    import jieba
    import matplotlib.pyplot as plt
    from PIL import Image
    import numpy as np
    %matplotlib inline
    words = jieba.cut(content, cut_all=False)
    all_words = ''  # the text that will be passed to WordCloud, i.e. the target corpus
    for word in words:
        all_words += ' ' + word
    wcloud = WordCloud(width=500,
                       height=500,
                       background_color='white',
                       mask=None,
                       min_font_size=8,
                       font_path='C:/Windows/Fonts/kaiu.ttf').generate(all_words)
    plt.figure(figsize=(20, 10), facecolor=None)
    plt.imshow(wcloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
Convert the data, then call the function with it:
allnews = ''
for n in remained_news:
    allnews += n
plt_wordcloud(allnews)
Looking at the result: the word cloud renders, but its content does not seem closely related to the "tech" topic, so let's rebuild it from the extracted keywords instead.
keyword_text = ''
for lst in keywords:
    for item in lst:
        keyword_text += ' ' + item
print(len(keyword_text))
Wrap a simpler function this time:
def wordCloud(text):
    from wordcloud import WordCloud    # imported here because the earlier imports were local to plt_wordcloud
    import matplotlib.pyplot as plt
    wcloud = WordCloud(width=500, height=500, background_color='white', mask=None,
                       min_font_size=8, font_path='C:/Windows/Fonts/msjh.ttc').generate(text)
    plt.figure(figsize=(20, 10), facecolor=None)
    plt.imshow(wcloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()
Call the function with the keyword text:
wordCloud(keyword_text)
Problem solving
1. ChromeDriver version compatibility issue:
selenium raises: Message: This version of ChromeDriver only supports Chrome version xx
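The driver at '../chromedriver' has to match the installed Chrome version, so one fix is simply to download the matching ChromeDriver release. A minimal sketch of another commonly used fix (not necessarily how it was solved here), assuming the webdriver-manager package and the selenium 3 style API used above:

# pip install webdriver-manager
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# download (and cache) a ChromeDriver that matches the installed Chrome,
# then pass its path to webdriver.Chrome as before
browser = webdriver.Chrome(ChromeDriverManager().install())
browser.get("https://www.cupoy.com/newsfeed/topicgrp/tech_tw")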
2. Collecting the content with a single-threaded crawler takes quite a long time, so let's try a multi-threaded version.
The content-crawling function needs a few adjustments compared to the single-threaded one:
- Add a parameter num that numbers the chunk of the loop each thread is responsible for.
- A requests.exceptions.ConnectionError came up; add time.sleep(5) in the function to avoid hitting the site too frequently. For details see: https://blog.csdn.net/wancongconga/article/details/111030335
content = []
def analysis(url_list, content, num, start, end):
    import requests
    from bs4 import BeautifulSoup
    import time
    import re
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38'}
    for url in url_list[start:end]:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        single_content = []
        paragraphs = []
        if soup.find('p'):
            paragraphs.append(soup.find_all('p'))
        if soup.find('section'):
            paragraphs.append(soup.find_all('section'))
        for pars in paragraphs:  # pars is the set of <p> tags, then the set of <section> tags
            for par in pars:
                text = re.sub(r'[\W]+', ' ', par.text)  # replace special characters (\n, \t, ...) with spaces
                single_content.append(text)
        content.append(single_content)
        time.sleep(5)  # pause between requests to avoid hitting the site too frequently
Set the target URLs:
N=500
n_thread=10
target = news_info['url']
Set up the threads:
# create the worker threads (n_thread = 10)
import threading
threads = []
startTime = time.time()
for i in range(n_thread):
    start = int(N/n_thread)*i       # start and end indices this job is responsible for
    end = int(N/n_thread)*(i+1)
    threads.append(threading.Thread(target=analysis, args=(target, content, i, start, end)))
    threads[i].start()
# wait for all worker threads to finish
for i in range(n_thread):
    threads[i].join()
endTime = time.time()
print("Done.")
print('Take time: ', endTime-startTime, 's')
This saves roughly 130 seconds compared to the single-threaded run.
3. Jieba package, supplementary notes (a short sketch of both points follows the list)
- Jieba's three segmentation modes:
  - default (precise) mode
  - full mode
  - search-engine mode
- Word cloud shape
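A minimal sketch of both points, assuming jieba and wordcloud are installed as above; the example sentence and the mask file name 'cloud_shape.png' are placeholders for illustration only:

import jieba
import numpy as np
from PIL import Image
from wordcloud import WordCloud

sentence = '我來到台北的科技公司上班'   # placeholder example sentence

# default (precise) mode: the most likely segmentation, used throughout this project
print(list(jieba.cut(sentence, cut_all=False)))
# full mode: every word that can be formed from the characters
print(list(jieba.cut(sentence, cut_all=True)))
# search-engine mode: long words are split further, useful for building indexes
print(list(jieba.cut_for_search(sentence)))

# word cloud shape: pass a black-on-white image as the mask instead of mask=None
mask = np.array(Image.open('cloud_shape.png'))   # placeholder image file
wcloud = WordCloud(background_color='white', mask=mask,
                   font_path='C:/Windows/Fonts/msjh.ttc').generate(' '.join(jieba.cut(sentence)))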
1. Personal GitHub link: Github
2. Display name in the 100-day marathon: Vincent_1231995