Reference website:
pip3 install requests
pip3 install beautifulsoup4
import requests
url = 'https://www.reddit.com/'
response = requests.get(url)
print(response)
#<Response [200]>
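Printing the Response object only shows the status line; a minimal sketch that checks the code explicitly (plain requests API, nothing site-specific):
import requests
url = 'https://www.reddit.com/'
response = requests.get(url)
print(response.status_code)   # the numeric code on its own, e.g. 200
response.raise_for_status()   # raises an exception for 4xx/5xx responses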
Let's try another website: CommonWealth Magazine (天下雜誌)
import requests
import urllib.request
url = 'https://www.cw.com.tw/today'
# Press F12 -> Network -> inspect a GET request to copy the User-Agent header
fake_browser = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
request = urllib.request.Request(url, headers=fake_browser)
response = urllib.request.urlopen(request)
print(request, response.getcode())
#<urllib.request.Request object at 0x104aa9b20> 200
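The same fake-browser headers can be passed to requests directly, which is usually shorter than going through urllib; a minimal sketch reusing the header dictionary above:
import requests
fake_browser = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
response = requests.get('https://www.cw.com.tw/today', headers=fake_browser)
print(response.status_code)  # 200 once the fake browser header is accepted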
import requests
url = 'https://www.gamer.com.tw/'
response = requests.request('get', url)
file_name = 'gamer.html'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(response.text)
# f = open(file_name, 'w', encoding='utf-8')
# f.write(response.text)
print('Success!')
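If the saved HTML comes out garbled, requests may have guessed the wrong character set from the response headers; a small sketch (standard requests attributes) that re-detects the encoding from the page body before saving:
response = requests.get(url)
response.encoding = response.apparent_encoding  # re-detect the encoding from the body
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(response.text)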
import requests
import urllib.request
url = 'https://www.cw.com.tw/today'
fake_browser = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
request = urllib.request.Request(url, headers=fake_browser)
response = urllib.request.urlopen(request)
file_name = 'CommonWealth_Magazine.html'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(response.read().decode('utf-8'))
import requests
# the BeautifulSoup package
from bs4 import BeautifulSoup
url = 'https://www.gamer.com.tw/'
response = requests.request('get', url)
# Convert the HTML text into a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')
# Now we can use it to search the page content
title = soup.find('title').text
print(title)
import requests
import urllib.request
from bs4 import BeautifulSoup
url = 'https://www.cw.com.tw/today'
fake_browser = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
req = urllib.request.Request(url, headers = fake_browser)
response = urllib.request.urlopen(req)
# Convert the HTML text into a BeautifulSoup object
soup = BeautifulSoup(response.read().decode('utf-8'), 'html.parser')
# Now we can use it to search the page content
title = soup.find('title').text
print(title)
#今日最新-天下雜誌
#1
import requests
url = 'https://www.businessweekly.com.tw/newlist.aspx'
response = requests.get(url)
print(response)
#<Response [200]>
#2
import requests
url = 'https://www.businessweekly.com.tw/newlist.aspx'
response = requests.get(url)
file_name = 'news.html'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(response.text)
print('Success!')
#Success!
#3
import requests
from bs4 import BeautifulSoup
url = 'https://www.businessweekly.com.tw/newlist.aspx'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find('title').text
print(title)
#今日最新文章 - 商業周刊 - 商周.com
import requests
from bs4 import BeautifulSoup
url = 'https://www.gamer.com.tw/'
response = requests.request('get', url)
soup = BeautifulSoup(response.text, 'html.parser')
# Alternatively, use a CSS selector
side_titles = soup.select('.BA-left li a')
for title in side_titles:
    print(title.text)
import requests
import urllib.request
from bs4 import BeautifulSoup
url = 'https://www.cw.com.tw/today'
fake_browser = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
req = urllib.request.Request(url, headers = fake_browser)
response = urllib.request.urlopen(req)
# Convert the HTML text into a BeautifulSoup object
soup = BeautifulSoup(response.read().decode('utf-8'), 'html.parser')
# Alternatively, use a CSS selector
side_titles = soup.select('#item1 > div:nth-child(1) > section:nth-child(3) > div.caption > p')
for title in side_titles:
    print(title.text)
import requests
import urllib.request
from bs4 import BeautifulSoup
import re
url = 'https://www.cw.com.tw/today'
fake_browser = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
}
request = urllib.request.Request(url, headers=fake_browser)
response = urllib.request.urlopen(request)
# Convert the HTML text into a BeautifulSoup object
soup = BeautifulSoup(response.read().decode('utf-8'), 'html.parser')
# Alternatively, use a regular expression
response_crawling = soup.find_all('a', href=re.compile('article'))
for a in response_crawling:
    print(a.text)  # prints the text only, without the HTML tags
    print(a)       # prints the element together with its HTML tags
...
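The same result set also carries the link targets; a short sketch that reads the href attribute of each matched tag:
for a in response_crawling:
    print(a.get('href'))  # the article URL from the href attribute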
file_name = 'Lab2_text.txt'
# Method 1: response_crawling is a list of tags, so join their texts before writing
f = open(file_name, 'w', encoding='utf-8')
f.write('\n'.join(a.text for a in response_crawling))
f.close()
# Method 2: the with-block closes the file automatically
with open(file_name, 'w', encoding='utf-8') as f:
    f.write('\n'.join(a.text for a in response_crawling))
import requests
from bs4 import BeautifulSoup
url = 'https://www.businessweekly.com.tw/newlist.aspx'
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
side_titles = soup.select('#Scroll-panel-all a')
for title in side_titles:
    print(title.text)
import requests
from bs4 import BeautifulSoup
url = 'https://www.businessweekly.com.tw/newlist.aspx'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
side_titles = soup.select('#Scroll-panel-all a')
file_name = 'Lab.txt'
file = open(file_name, 'w', encoding = 'utf8')
for title in side_titles:
    file.write(title.text + '\n')
    print(title.text)
file.close()
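The open/write/close pattern above works, but a with-block closes the file even if an exception is raised; a minimal rewrite of the same loop:
with open('Lab.txt', 'w', encoding='utf8') as file:
    for title in side_titles:
        file.write(title.text + '\n')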
# Method 1
import requests
from bs4 import BeautifulSoup
url = 'https://www.ck101.org/293/293983/51812965.html'
response = requests.get(url)  # headers could be added here to avoid being blocked
#print(response)
soup = BeautifulSoup(response.content, 'html.parser')
title = soup.find('title').text
items = soup.select('.yuedu_zhengwen')
file_name = './novel.txt'
file = open(file_name, 'w', encoding = 'utf8')
file.write(title + '\n' + '\n')
for i in items:
    file.write(i.text + '\n')
    print(i.text + '\n')
file.close()
# Method 2
import requests
from bs4 import BeautifulSoup
import os
word_first = 2965 #word_last = 4330 # 4330-2965+1=1366
url = 'https://www.ck101.org/293/293983/5181'+ str(word_first) +'.html'
response = requests.get(url)  # headers could be added here to avoid being blocked
print(response)
soup = BeautifulSoup(response.content, 'html.parser')
items = soup.select('.yuedu_zhengwen')
items_string = str(items).replace('<br/>','').replace('</div>','').replace('[<div','').replace('class="yuedu_zhengwen"','').replace('id="content">','')
items_string_split = items_string.split()
print(items_string_split)
folder_path ='./novel/'
if not os.path.exists(folder_path):  # check whether the folder exists
    os.makedirs(folder_path)  # create the folder
file_name = './novel/Lab.txt'
file = open(file_name, 'w', encoding = 'utf8')
for items_string in items_string_split:
    file.write(items_string + '\n')
    #print(items_string + '\n')
file.close()
print('Done!')
import requests
from bs4 import BeautifulSoup
url = 'https://www.ck101.org/293/293983/51812965.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
title = soup.find('title').text
items = soup.select('.yuedu_zhengwen')
file_name = './Lab.txt'
file = open(file_name, 'w', encoding = 'utf8')
file.write(title + '\n' + '\n')
for i in items:
    # Remove unwanted filler text such as: 小÷說◎網 】,♂小÷說◎網 】,
    i = str(i).replace('小÷說◎網 】,♂小÷說◎網 】,','').replace('<br/>','').replace('<div class="yuedu_zhengwen" id="content">','').replace('</div>','')
    file.write(i + '\n')
    print(i + '\n')
file.close()
import requests
from bs4 import BeautifulSoup
import os
url = 'https://www.ck101.org/293/293983/51812965.html'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
title = soup.find('title').text
items = soup.select('.yuedu_zhengwen')
# Check whether the folder exists
folder_path ='./novel/'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)  # create the folder
# Create Lab.txt inside the novel folder
file_name = './novel/Lab.txt'
file = open(file_name, 'w', encoding = 'utf8')
file.write(title + '\n' + '\n')
for i in items:
    i = str(i).replace('小÷說◎網 】,♂小÷說◎網 】,','').replace('<br/>','').replace('<div class="yuedu_zhengwen" id="content">','').replace('</div>','')
    file.write(i + '\n')
    print(i + '\n')
file.close()
# Method 1
import requests
from bs4 import BeautifulSoup
import os
index = 0
# Check whether the folder exists
folder_path ='./novel/'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)  # create the folder
def get_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('title').text
    items = soup.select('.yuedu_zhengwen')
    file_write(items, title)
def file_write(items, title):
    global index
    file_name = './novel/Lab' + str(index + 1) + '.txt'
    f = open(file_name, 'w', encoding='utf-8')
    f.write(title + '\n' + '\n')
    for i in items:
        i = str(i).replace('小÷說◎網 】,♂小÷說◎網 】,','').replace('<br/>','').replace('<div class="yuedu_zhengwen" id="content">','').replace('</div>','')
        f.write(i + '\n')
        #print(i + '\n')
    f.close()  # close file
    index += 1
    print('Done!')
# Automatically crawl and save multiple chapters
url = ['https://www.ck101.org/293/293983/5181{}.html'.format(str(i)) for i in range(2965,4330)]
for u in url:
    get_content(u)
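Requesting over a thousand chapters back to back can strain the site; a small courtesy sketch that keeps the same loop but pauses one second between requests:
import time
for u in url:
    get_content(u)
    time.sleep(1)  # wait one second between chapters to be polite to the server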
# Method 2
import requests
import urllib.request
from bs4 import BeautifulSoup
import os
index = 0
url = ['https://www.ck101.org/293/293983/5181{}.html'.format(str(i)) for i in range(2965,4330)]
def get_content(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    items = soup.select('.yuedu_zhengwen')
    items_string = str(items).replace('<br/>','').replace('</div>]','').replace('[<div','').replace('class="yuedu_zhengwen"','').replace('id="content">','')
    items_string_split = items_string.split()
    print(items_string_split)
    file_write(items_string_split, items)
def file_write(items_string_split, items):
    global index
    a = ''
    for items_string in items_string_split:
        a = a + items_string + '\n'
    print(a)
    novel_name = './novel' + str(index + 1) + '.txt'
    with open(novel_name, 'w', encoding='utf-8') as f:  # write the assembled chapter text
        f.write(a)
    index += 1
    print('Done!')
for titles in url:
    get_content(titles)
import requests
from bs4 import BeautifulSoup
import os
url = 'https://www.google.com/search?rlz=1C2CAFB_enTW617TW617&biw=1600&bih=762&tbm=isch&sa=1&ei=Z3BUXLqNOZmk-QbW_KaYDw&q=%E7%8B%97&oq=%E7%8B%97&gs_l=img.3..0l10.18328.18868..20040...0.0..0.52.143.3......1....1..gws-wiz-img.......0i24.5zgXwVAqY4U'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
items = soup.find_all('img')
folder_path ='./photo/'
if not os.path.exists(folder_path):  # check whether the folder exists
    os.makedirs(folder_path)  # create the folder
photolimit = 10
for index, item in enumerate(items):
    if item and index < photolimit:
        # use 'get' to read the photo link, then request the image itself
        html = requests.get(item.get('src'))
        img_name = folder_path + str(index + 1) + '.png'
        with open(img_name, 'wb') as f:  # write the image data as bytes
            f.write(html.content)
            f.flush()
        print('第 %d 張' % (index + 1))
print('Done')
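Depending on how Google serves the result page, some img tags may hold a base64 data URI or a relative path rather than a full URL, which would break requests.get; a defensive sketch of the same loop (the guard is an assumption, not part of the original):
for index, item in enumerate(items):
    src = item.get('src')
    # only fetch real http(s) URLs; skip data: URIs and tags without src
    if src and src.startswith('http') and index < photolimit:
        with open(folder_path + str(index + 1) + '.png', 'wb') as f:
            f.write(requests.get(src).content)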
import requests
import urllib.request
from bs4 import BeautifulSoup
import os
import time
word = input('Input key word: ')
url = ('https://www.google.com/search?rlz=1C2CAFB_enTW617TW617&biw=1600&bih=762&tbm=isch&sa=1&ei=n3JUXIWIJNatoAT87a-4Cw&q='
       + word + '&oq=' + word + '&gs_l=img.3..35i39l2j0l8.40071.45943..46702...1.0..2.56.625.13......3....1..gws-wiz-img.....0..0i24.9fotvswIauk')
photolimit = 10
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)  # use headers to avoid being blocked
soup = BeautifulSoup(response.content, 'html.parser')
items = soup.find_all('img')
folder_path ='./photo/'
if not os.path.exists(folder_path):  # check whether the folder exists
    os.makedirs(folder_path)  # create the folder
for index, item in enumerate(items):
    if item and index < photolimit:
        # use 'get' to read the photo link, then request the image itself
        html = requests.get(item.get('src'))
        img_name = folder_path + str(index + 1) + '.png'
        with open(img_name, 'wb') as file:  # write the image data as bytes
            file.write(html.content)
            file.flush()
        print('第 %d 張' % (index + 1))
        time.sleep(1)
print('Done')
Hint:
import requests
from bs4 import BeautifulSoup
import os
#import time
word = input("關鍵字:")
url = 'https://www.shutterstock.com/search?search_source=base_landing_page&language=zh-Hant&searchterm='+ word +'&image_type=all'
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
all_img = soup.find_all('img')
folder_path ='./photo_sheep/'
if not os.path.exists(folder_path):  # check whether the folder exists
    os.makedirs(folder_path)  # create the folder
for index, img in enumerate(all_img):
    if img:
        html = requests.get(img.get('src'))
        img_name = folder_path + str(index + 1) + '.png'
        with open(img_name, 'wb') as f:
            f.write(html.content)
            f.flush()
        print(index + 1)
        #time.sleep(1)
print('done ~ ')
import pandas
''' Method 1 '''
import pandas
pandas.set_option('display.max_columns', 200)
pandas.set_option('display.max_rows', 200)
url = 'https://course.ttu.edu.tw/u9/main/listcourse.php'
table_data = pandas.read_html(url)
file_name = "crawl_table_byPandas.txt"
file = open(file_name, 'w',encoding = 'utf8')
file.write(str(table_data))
print("-- File Writing Ending --")
file.close()
# Summary statistics and correlation coefficients
for data in table_data:
    print(data.describe())
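read_html returns a list of DataFrames, so each table can also be saved on its own; a minimal sketch using to_csv (the file names here are only placeholders):
for i, data in enumerate(table_data):
    data.to_csv('course_table_{}.csv'.format(i + 1), index=False, encoding='utf8')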
''' Method 2 '''
import pandas
# Read the html of the table
table = pandas.read_html("https://course.ttu.edu.tw/u9/main/listcourse.php")
# Use the first row as the column headers
# table.columns = table.iloc[0]
# table.reindex(table.index.drop(1))
file_name = 'crawled_table.txt'
file = open(file_name, 'w',encoding = 'utf8')
for i in range(len(table)):
    file.write(str(table[i]))
print("-- File Writing Ending --")
file.close()
import requests
from bs4 import BeautifulSoup
response = requests.get("https://course.ttu.edu.tw/u9/main/listcourse.php")
soup = BeautifulSoup(response.text, "lxml")
tag = ".mistab"
with open('./crawl_table_byCSS.txt', 'w', encoding = 'utf8') as f:
    for course in soup.select(tag):
        print(course.get_text())
        f.write(course.get_text())
        '''
        results = course.get_text()
        print(results)
        f.write(results + '\n')  # f is the file object opened for the txt output
        '''
print("-- File Writing Ending --")
#http://www.chiehfuchan.com/%E7%B0%A1%E5%96%AE%E5%88%A9%E7%94%A8-python-%E5%A5%97%E4%BB%B6-speechrecognition-%E9%80%B2%E8%A1%8C%E8%AA%9E%E9%9F%B3%E8%BE%A8%E8%AD%98/
#https://ithelp.ithome.com.tw/articles/10196577
#https://zhuanlan.zhihu.com/p/50677236
import speech_recognition as sr
r = sr.Recognizer()
with sr.AudioFile("C:\\Users\\pcsh1\\Documents\\錄音\\2.wav") as source:  # file.wav
    r.adjust_for_ambient_noise(source)  # compensate for ambient noise
    audio = r.listen(source)  # audio = r.record(source, duration=100)
en_simplechinese = r.recognize_google(audio, language = 'zh-TW ; en-US')
print(en_simplechinese)
#with sr.Microphone() as source:
#    audio = r.listen(source)
#============================================================
### Convert to Traditional Chinese ###
from hanziconv import HanziConv
tra_chinese = HanziConv.toTraditional(en_simplechinese)
print(tra_chinese)
#============================================================
### jieba word segmentation ###
import jieba
import jieba.analyse
f = open('test.txt','w',encoding='utf8')  # 'w' clears the file first (use 'a' to append)
f.write(tra_chinese)  # write to the file
f.close()  # close the file
f = open('test.txt','r',encoding='utf8')
article = f.read()
tags = jieba.analyse.extract_tags(article,10)
print('最重要字詞',tags)
f.close()
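extract_tags only returns the top keywords; if the full segmentation is wanted, a short sketch with jieba.lcut plus the weighted variant of extract_tags (both are standard jieba calls):
words = jieba.lcut(article)  # cut the whole text into a list of words
print(words)
weighted_tags = jieba.analyse.extract_tags(article, topK=10, withWeight=True)
print(weighted_tags)  # (keyword, TF-IDF weight) pairs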
#============================================================
'''
### jieba word segmentation ###
import jieba
import jieba.analyse
f = open('test.txt','r',encoding='utf8')
article = f.read()
tags = jieba.analyse.extract_tags(article,100)
print('最重要字詞',tags)
'''
import requests
from bs4 import BeautifulSoup as b
payload = {'mail_id': 'your_student_id', 'mail_pwd': 'your_password'}  # fill in your own login credentials
rs = requests.session()
res = rs.post('http://stu.fju.edu.tw/stusql/SingleSignOn/StuScore/SSO_stu_login.asp', data = payload)
res2 = rs.get('http://stu.fju.edu.tw/stusql/SingleSignOn/StuScore/stu_scoreter.asp')
#print(res2.content)
soup = b(res2.content, "html.parser")
all_td1 = soup.find_all('td', {'align': 'left', 'valign': None})
list1 = []
for obj in all_td1:
    list1.append(obj.contents[0])
    #print(obj)
for obj in list1:
    print(obj.string)
print("===============")
all_td2 = soup.find_all('td', {'align': 'center', 'valign': None})
list2 = []
for obj in all_td2:
    list2.append(obj.contents[0])
    #print(obj)
for obj in list2:
    print(obj.string)
print("===============")
all_td3 = soup.find_all('td', {'align': 'right', 'valign': None})
list3 = []
for obj in all_td3:
    list3.append(obj.contents[0])
    #print(obj)
for obj in list3:
    print(obj.string)
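Assuming the left-, center-, and right-aligned cells line up row by row (a guess about the page layout, not confirmed here), a small sketch can stitch the three lists back into one line per course:
for left, center, right in zip(list1, list2, list3):
    print(left.string, center.string, right.string)  # illustrative only; the column meanings depend on the actual table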