Python
https://chat.whatsapp.com/FHuNWUGCpUo4rLUxd4VFUm
Course videos, announcements, and Python programming questions can all be posted in the group~~ :)
| # | Topic | Content |
|---|---|---|
| Week 1 | Python packages | Package installation, data retrieval |
| Week 2 | Syntax introduction | HTML essentials every web scraper needs |
| Week 3 | Parsing and scraping HTML | Parsing and scraping HTML, hands-on practice |
| Week 4 | Filling out and submitting forms | Filling out and submitting forms |
| Week 5 | Fetching dynamic HTML data | Fetching dynamic HTML data |
| Week 6 | Hands-on practice | Hands-on practice |
Co-location (主機代管): you own the server hardware and place it in a provider's data center, which supplies rack space, power, and network connectivity.
Web hosting (網頁寄存服務): you rent space on a server that the provider owns and maintains to serve your website, without managing the hardware yourself.
requirements.txt
requests
beautifulsoup4
lxml
pandas
openpyxl
webdriver_manager
selenium
pip3 install -r requirements.txt
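After installing, a quick way to confirm everything is available is to import each package (a minimal check; note that beautifulsoup4 is imported as bs4, and webdriver_manager keeps the underscore in its import name):
# sanity check: import every package listed in requirements.txt
import requests
import bs4  # installed as beautifulsoup4
import lxml
import pandas
import openpyxl
import webdriver_manager
import selenium
print('requests', requests.__version__)
print('pandas', pandas.__version__)
print('selenium', selenium.__version__)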
import requests
url = 'http://tw.yahoo.com.'
r = requests.get(url)
print(r)
print(r.status_code) # status code
# print(r.text) # response body as text
print(type(r.content)) # response body as bytes (binary)
| Status code | Meaning |
|---|---|
| 100–199 | Informational responses |
| 200–299 | Successful responses |
| 300–399 | Redirects |
| 400–499 | Client errors |
| 500–599 | Server errors |
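A small illustration of acting on those ranges in code (httpbin.org is used here only as a convenient test service; note that requests follows 3xx redirects by default):
import requests
for url in ['https://httpbin.org/status/200', 'https://httpbin.org/status/404']:
    r = requests.get(url)
    if 200 <= r.status_code < 300:
        print(url, '-> success')
    elif 300 <= r.status_code < 400:
        print(url, '-> redirect')
    elif 400 <= r.status_code < 500:
        print(url, '-> client error')
    else:
        print(url, '-> server error or other')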
import requests
url = 'http://aaa.24ht.com.tw'
htmlfile = requests.get(url)
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
# AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 \
# Safari/537.36'
# }
# htmlfile = requests.get(url, headers=headers)
htmlfile.encoding = 'utf8'
htmlfile.raise_for_status()
print(htmlfile.text)
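raise_for_status() raises requests.exceptions.HTTPError when the server answers with a 4xx or 5xx code, so wrapping it in try/except keeps the script from crashing on a bad page (a sketch reusing the same demo URL):
import requests
url = 'http://aaa.24ht.com.tw'
try:
    htmlfile = requests.get(url, timeout=10)
    htmlfile.raise_for_status()  # raises HTTPError on 4xx/5xx responses
except requests.exceptions.RequestException as e:
    print('Request failed:', e)
else:
    htmlfile.encoding = 'utf8'
    print(htmlfile.text[:200])  # print only the first 200 characters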
import requests
url = 'https://www.yahoo.co.jp'
htmlfile = requests.get(url)
file_name = 'webpage.html'
with open(file_name, 'w', encoding='utf-8') as file_obj:
    file_obj.write(htmlfile.text)
import requests
url = 'https://www.google.com.tw/search?q={0}+food&oq={0}'
keyword = input('Enter a keyword to search for: ')
htmlfile = requests.get(url.format(keyword))
file_name = 'webpage.html'
with open(file_name, 'w', encoding='utf-8') as file_obj:
    file_obj.write(htmlfile.text)
import requests
search = ['cat', 'dog', 'bird']
for q in search:
    url = f'''https://www.google.com/search?q={q}&sca_esv=593758061&sxsrf=AM9HkKkGcvF9KBLy4RikxvKwsPpIiOtCDw%3A1703595822109&ei=Ls-KZeGcBpzd2roPgYce&ved=0ahUKEwjh68T-lK2DAxWcrlYBHYGDBwAQ4dUDCBA&uact=5&oq={q}&gs_lp=Egxnd3Mtd2l6LXNlcnAiA2NhdDIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzIKEAAYRxjWBBiwAzINEAAYgAQYigUYQxiwAzINEAAYgAQYigUYQxiwAzINEAAYgAQYigUYQxiwAzINEAAYgAQYigUYQxiwAzIOEAAY5AIY1gQYsAPYAQEyDhAAGOQCGNYEGLAD2AEBMg4QABjkAhjWBBiwA9gBATITEC4YgAQYigUYQxjIAxiwA9gBAjITEC4YgAQYigUYQxjIAxiwA9gBAjITEC4YgAQYigUYQxjIAxiwA9gBAjITEC4YgAQYigUYQxjIAxiwA9gBAkjRBVAAWABwAXgBkAEAmAEAoAEAqgEAuAEDyAEA4gMEGAAgQYgGAZAGE7oGBggBEAEYCboGBggCEAEYCA&sclient=gws-wiz-serp'''
    htmlfile = requests.get(url)
    file_name = f'{q}.html'
    with open(file_name, 'w', encoding='utf-8') as file_obj:
        file_obj.write(htmlfile.text)
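Instead of pasting the whole query string into the URL by hand, requests can build it from a params dictionary and take care of URL-encoding the keyword; a minimal sketch of the same cat/dog/bird loop:
import requests
search = ['cat', 'dog', 'bird']
for q in search:
    # requests URL-encodes the parameters and appends them as ?q=...
    htmlfile = requests.get('https://www.google.com/search', params={'q': q})
    print(htmlfile.url)  # the final URL that was actually requested
    with open(f'{q}_params.html', 'w', encoding='utf-8') as file_obj:
        file_obj.write(htmlfile.text)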
import requests
url = 'https://www.taiwandns.com/wp-content/plugins/post2pdf-converter/post2pdf-converter-pdf-maker.php?id=4720&file=id&font=droidsansfallback&monospaced=droidsansfallback&fontsize=13&subsetting=0&ratio=1.35&header=1&title=1&wrap_title=0&logo=1&logo_file=logo.png&logo_width=60&footer=1&filters=1&shortcode=parse&ffamily=0'
htmlfile = requests.get(url)
file_name = 'webpage.pdf'
with open(file_name, 'wb') as file_obj:
    for content in htmlfile.iter_content(1024):
        size = file_obj.write(content)
        print(size)
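For large files, passing stream=True keeps the response from being loaded into memory all at once, and iter_content then reads it chunk by chunk (a sketch; python.org's logo is used only as an example file):
import requests
url = 'https://www.python.org/static/img/python-logo.png'  # example file to download
r = requests.get(url, stream=True)  # stream=True defers downloading the body until it is read
with open('python-logo.png', 'wb') as file_obj:
    for chunk in r.iter_content(chunk_size=1024):
        file_obj.write(chunk)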
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.ptt.cc'
template_url = base_url + '/bbs/{0}/index.html'
resp = requests.get(
    url=template_url.format('Gossiping'),
    cookies={'over18': '1'}  # set the over18 cookie
)
# print(resp.text)
soup = BeautifulSoup(resp.text, "lxml")
print(soup.prettify())  # pretty-print the fetched HTML so it is easier to read
print(soup.title)
print(soup.title.string)
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://www.ptt.cc'
template_url = base_url + '/bbs/{0}/index.html'
resp = requests.get(
    url=template_url.format('Gossiping'),
    cookies={'over18': '1'}  # set the over18 cookie
)
# print(resp.text)
soup = BeautifulSoup(resp.text, "lxml")
# urls = soup.find_all('a')
# print(urls)
# titles = soup.find_all("div", {"class": "title"})
# print(titles)
# titles = soup.find_all("div", {"class": "title"})
# for title in titles:
#     url = title.find('a')
#     print(url)
# titles = soup.find_all("div", {"class": "title"})
# for title in titles:
#     url = title.find('a')
#     print(url.get('href'))  # get the hyperlink URL
urls = []
titles = soup.find_all("div", {"class": "title"})
for title in titles:
    url = title.find('a')
    urls.append(base_url + url.get('href'))
# print(urls)
df = pd.DataFrame(urls)
df.to_excel('ptt.xlsx')
import requests
from bs4 import BeautifulSoup
base_url = 'https://www.ptt.cc'
template_url = base_url + '/bbs/{0}/index.html'
groups = {
    '八卦板': 'Gossiping',
    '電影板': 'Movie'
}
def get_dom(group):
    resp = requests.get(
        url=template_url.format(groups[group]),
        cookies={'over18': '1'}  # set the over18 cookie
    )
    if resp.status_code != 200:
        print('Invalid URL:', resp.url)
        return None
    else:
        return resp.text
def get_title(dom):
    titles = []
    soup = BeautifulSoup(dom, "lxml")
    div_result = soup.find_all('div', 'r-ent')
    for div in div_result:
        res = div.find('div', 'title').find('a')
        if res:
            titles.append({
                'title': res.string,
                'url': base_url + res['href']
            })
    return titles
if __name__ == '__main__':
    dom = get_dom('電影板')
    if dom:
        titles = get_title(dom)
        for t in titles:
            print(t['url'], t['title'])
import re  # needed for the re.compile examples below
item = soup.find(id='id_name')  # get the element whose id is id_name
items = soup.find_all("a", href="url_text")  # get <a> tags whose href is exactly url_text
items = soup.find_all(href=re.compile("re_text"))  # match href against a regular expression
items = soup.find_all(href=re.compile("re_text"), id="id_name")
items = soup.find_all(attrs={"data-foo": "value"})
items = soup.find_all("a", class_="title")
items = soup.find_all(class_=re.compile("^bold"))
items = soup.find_all("a", string="string_context")  # match elements by their text content
items = soup.find_all("a", string=re.compile("^re_text"))
item = soup.find(id="url_text").find_parents("div")  # walk up and collect matching parent <div> tags
item = soup.find(id="url_text").find_previous_siblings("a")  # preceding sibling nodes at the same level
item = soup.find(id="url_text").find_next_siblings("a")  # following sibling nodes at the same level
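A self-contained demo of the search methods above, run against a small in-memory HTML string (the tags, ids, and hrefs are made up for illustration):
import re
from bs4 import BeautifulSoup
html = '''
<div id="id_name">
  <a class="title" href="/post/1">First post</a>
  <a class="bold-link" href="/post/2">Second post</a>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find(id='id_name'))  # look up an element by id
print(soup.find_all('a', class_='title'))  # <a> tags with class "title"
print(soup.find_all(href=re.compile('^/post')))  # href matched by a regular expression
print(soup.find_all(class_=re.compile('^bold')))  # class starting with "bold"
print(soup.find('a').find_next_siblings('a'))  # sibling <a> tags after the first one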
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://www.ptt.cc'
template_url = base_url + '/bbs/{0}/index.html'
resp = requests.get(
    url=template_url.format('Gossiping'),
    cookies={'over18': '1'}  # set the over18 cookie
)
soup = BeautifulSoup(resp.text, "lxml")
items = soup.select(".title > a")
print(items)
for i in items:
    print(base_url + i.get('href'))
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
yahoo_url = 'https://tw.stock.yahoo.com/q/q?s='
def convert_string(x):
    return x.string
keyword = input('Enter a stock ticker symbol to look up: ')
resp = requests.get(yahoo_url + keyword)
soup = BeautifulSoup(resp.text, 'lxml')
table_result = soup.find("table", width="750", border="2")
trs = table_result.find_all("tr")
ths = trs[0].find_all("th")
ths = map(convert_string, ths)
tds = trs[1].find_all("td")
tds = map(convert_string, tds)
stock_info = dict(zip(ths, tds))
print(stock_info)
df = pd.DataFrame(stock_info, index=[0])
df.to_excel('stock_info.xlsx')
import requests
payload = {
    'username': 'amos',
    'password': 'python',
}
r = requests.post('http://128.199.172.46:9000/login', data=payload)
print(r.status_code)  # status code
print(r.headers['content-type'])  # content type of the response
print(r.encoding)  # text encoding
print(r.text)  # response body
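When later requests need the logged-in state, a requests.Session keeps the cookies returned by the login response; this is a sketch against the same demo endpoint, and the follow-up path /user is only a hypothetical example:
import requests
payload = {
    'username': 'amos',
    'password': 'python',
}
with requests.Session() as s:
    r = s.post('http://128.199.172.46:9000/login', data=payload)
    print(r.status_code)
    print(s.cookies.get_dict())  # cookies from the login response stay on the session
    # any further request made through `s` sends those cookies automatically,
    # e.g. s.get('http://128.199.172.46:9000/user')  # hypothetical member-only page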
import requests
# import pandas as pd
import os, time, logging
logging.basicConfig(level=logging.INFO)  # show the logging.info() progress messages on the console
base_url = 'https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=csv&date={0}&stockNo={1}'
base_path = 'c:/users/install/Desktop/證交所/'
stock_nos = ['1102']  # stock ticker codes
start_date = '20200101'
end_date = '20200301'
limit_times = 20
times = 1
sleep_time = 60  # seconds
def createFolder(directory):
    '''
    Create the folder if it does not already exist.
    '''
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: failed to create folder: ' + directory)
# def get_months(start_date, end_date):
#     '''
#     Get the first day of each month in the given period (the end date itself is excluded).
#     '''
#     dates = pd.date_range(start_date, end_date, freq='1M') - pd.offsets.MonthBegin(1)
#     return [date.strftime("%Y%m%d") for date in dates]
# stock_dates = get_months(start_date, end_date)
stock_dates = ['20200101', '20200201']
createFolder(base_path)
try:
    for stock_date in stock_dates:
        for stock_no in stock_nos:
            if times == limit_times:
                logging.info('sleep {} sec'.format(sleep_time))
                time.sleep(sleep_time)  # pause so the site does not block us for requesting too often
                times = 0
            url = base_url.format(stock_date, stock_no)
            r = requests.get(url)
            if r.status_code == 200:
                with open(base_path + stock_no + '_' + stock_date + '.csv', 'w', encoding='cp950') as f:
                    f.write(r.text)  # save this stock's trading data for the month
                logging.info('{} {} is ok'.format(stock_no, stock_date))
            times += 1
except Exception as e:
    logging.error('{} {}: {}'.format(stock_no, stock_date, e))
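To work with the files afterwards, the saved CSVs can be read back into pandas (a sketch: skiprows=1 and on_bad_lines='skip' are assumptions about the TWSE CSV layout and need pandas 1.3+; adjust them against the real files):
import glob
import pandas as pd
base_path = 'c:/users/install/Desktop/證交所/'  # same folder used above
frames = []
for path in glob.glob(base_path + '*.csv'):
    # skiprows=1 skips the title line; on_bad_lines skips the trailing note rows
    frames.append(pd.read_csv(path, encoding='cp950', skiprows=1, on_bad_lines='skip'))
all_data = pd.concat(frames, ignore_index=True)
print(all_data.head())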
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())  # download and load the matching Chrome driver automatically
urls = ['http://tw.yahoo.com', 'http://www.google.com']  # URLs to visit
for url in urls:
    driver.get(url)  # tell Chrome to open the URL
    time.sleep(2)  # pause for 2 seconds
import logging
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a browser window
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
urls = []
url = 'https://pubmed.ncbi.nlm.nih.gov/?term=%28%28%28SARS-COV-2%29+OR+%28COVID-19%29%29+OR+%28Coronavirus%29%29+OR+%282019-ncov%29&size=200&page={}'
driver.get(url.format(1))
page_num = int(driver.find_element_by_css_selector('div.results-amount > span.value').text.replace(',', ''))//200 + 1
print("Total Page: ", page_num) # 計算總頁數
for p in range(1, 1+1): # 改成range(1, page_num+1)可以抓全部頁面資料
driver.get(url.format(p))
print('==>', url.format(p))
elems = driver.find_elements_by_css_selector('a.docsum-title') # 取得指定元素
for e in elems:
print(e.get_attribute('href')) # 取得該元素內href屬性內容
import logging
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
urls = []
url = 'https://tw.appledaily.com/home/'
driver.get(url)
elms = driver.find_elements_by_css_selector('div.stories-container span.desktop-blurb')
for el in elms:
    print(el.get_attribute('href'))
Supplementary notes
# original data
b = [
    {'name': 1},
    {'name': 2},
    {'name': 3},
    {'name': 4},
    {'name': 5}
]
print(b)
# take each value out one by one and print it
for i in b:
    print(i['name'])
# extract the values into a new list with a list comprehension
c = [i['name'] for i in b]
print(c)
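The same comprehension syntax can also filter with a condition, and a dict comprehension builds key/value pairs the same way (a small extension of the example above, reusing the list b):
# keep only the even values while extracting them
evens = [i['name'] for i in b if i['name'] % 2 == 0]
print(evens)  # [2, 4]
# build a dict mapping each value to its square
squares = {i['name']: i['name'] ** 2 for i in b}
print(squares)  # {1: 1, 2: 4, 3: 9, 4: 16, 5: 25}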
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
urls = []
url = 'https://pubmed.ncbi.nlm.nih.gov/?term=%28%28%28SARS-COV-2%29+OR+%28COVID-19%29%29+OR+%28Coronavirus%29%29+OR+%282019-ncov%29&size=200&page={}'
driver.get(url.format(1))
page_num = int(driver.find_element_by_css_selector('div.results-amount > span.value').text.replace(',', ''))//200 + 1
print("Total Page: ", page_num)
for p in range(1, 2+1):  # change to range(1, page_num+1) to fetch every page
    driver.get(url.format(p))
    print('==>', url.format(p))
    elems = driver.find_elements_by_css_selector('article.full-docsum > .docsum-wrap > .docsum-content > .docsum-title')
    urls = urls + [elem.get_attribute('href') for elem in elems]
df = pd.DataFrame(urls)
df.to_excel('covid-19_papers_urls_20200716.xlsx')
import os
import logging
import requests
import bs4
import lxml
import pandas as pd
logging.basicConfig(level=logging.INFO, filename='pubmed_data.log', format='%(asctime)s - %(levelname)s: %(message)s')
columns_name = ['url', 'source', 'title', 'authors', 'abstract', 'keywords', 'pmid']
df = pd.DataFrame(columns = columns_name)
urls = [
    # 'https://pubmed.ncbi.nlm.nih.gov/15288785/',
    # 'https://pubmed.ncbi.nlm.nih.gov/32667047/'
    # 'https://www.ncbi.nlm.nih.gov/pubmed/32139620',
    # 'https://www.ncbi.nlm.nih.gov/pubmed/32192285',
    # 'https://pubmed.ncbi.nlm.nih.gov/32310612/',
    'https://pubmed.ncbi.nlm.nih.gov/32064855/',
    'https://pubmed.ncbi.nlm.nih.gov/32369103/',
    'https://pubmed.ncbi.nlm.nih.gov/32157862/'
]
for i, url in enumerate(urls):
    logging.info('{0}: {1}'.format(str(i), url))
    one_record = {}
    try:
        r = requests.get(url)
        s = bs4.BeautifulSoup(r.text, 'lxml')
        one_record['url'] = url
        if s.select('.article-citation') != []:
            one_record['source'] = s.select('.article-citation')[0].text.replace('\n', '')
        else:
            one_record['source'] = s.select('.book-citation')[0].text.replace('\n', '')
        one_record['title'] = s.select('.heading-title')[0].text.replace('\n', '')
        if s.select('.authors') != []:
            one_record['authors'] = s.select('.authors')[0].text.replace('\n', '')
        if s.select('.abstract-content') != []:
            one_record['abstract'] = s.select('.abstract-content')[0].text.replace('\n', '')
        if s.find_all(class_="sub-title", string='\n Keywords:\n ') != []:
            one_record['keywords'] = s.find_all(class_="sub-title", string='\n Keywords:\n ')[0].find_parents("p")[0].text.replace('\n', '')
        one_record['pmid'] = s.select('.current-id')[0].text
        df = pd.concat([df, pd.DataFrame([one_record])], ignore_index=True)  # DataFrame.append was removed in newer pandas
    except Exception as e:
        logging.error(e)
        logging.debug(one_record)
df.to_excel('pubmed_data_test.xlsx')
import os
import logging
import requests
import bs4
import lxml
import sqlite3
import pandas as pd
from model import DB
logging.basicConfig(level=logging.INFO, filename='pubmed_data.log', format='%(asctime)s - %(levelname)s: %(message)s')
db = DB()
conn = sqlite3.connect('pubmed.sqlite3')
columns_name = ['url', 'source', 'title', 'authors', 'abstract', 'keywords', 'pmid']
urls = pd.read_excel(r'covid-19_papers_urls_20200716.xlsx')[0]
for i, url in enumerate(urls):
    logging.info('{0}: {1}'.format(str(i), url))
    one_record = {}
    # df = pd.DataFrame(columns=columns_name)
    try:
        r = requests.get(url)
        s = bs4.BeautifulSoup(r.text, 'lxml')
        one_record['url'] = url
        if s.select('.article-citation') != []:
            one_record['source'] = s.select('.article-citation')[0].text.replace('\n', '')
        else:
            one_record['source'] = s.select('.book-citation')[0].text.replace('\n', '')
        one_record['title'] = s.select('.heading-title')[0].text.replace('\n', '')
        if s.select('.authors') != []:
            one_record['authors'] = s.select('.authors')[0].text.replace('\n', '')
        if s.select('.abstract-content') != []:
            one_record['abstract'] = s.select('.abstract-content')[0].text.replace('\n', '')
        if s.find_all(class_="sub-title", string='\n Keywords:\n ') != []:
            one_record['keywords'] = s.find_all(class_="sub-title", string='\n Keywords:\n ')[0].find_parents("p")[0].text.replace('\n', '')
        one_record['pmid'] = s.select('.current-id')[0].text
        df = pd.DataFrame([one_record])
        df.to_sql('articles', conn, if_exists='append')  # append this article's row; if_exists='replace' would keep only the last record
        # db.df2sql(df, 'pubmed_data', columns_name, 8)
    except Exception as e:
        logging.error(e)
        logging.debug(one_record)
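The rows written to pubmed.sqlite3 can be read back with pandas for later analysis (a minimal sketch assuming the 'articles' table written above):
import sqlite3
import pandas as pd
conn = sqlite3.connect('pubmed.sqlite3')
df = pd.read_sql_query('SELECT * FROM articles', conn)  # load the whole table into a DataFrame
print(df.shape)
print(df[['pmid', 'title']].head())
conn.close()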
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.types import VARCHAR
import pandas as pd
class DB:
    def __init__(self):
        self.username = 'root'
        self.password = 'test'
        self.host = 'localhost'
        self.port = '3306'
        self.db = 'pmap'
        self.engine = create_engine(f"mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{self.db}")
        self.con = self.engine.connect()
    # save a DataFrame into the database
    def df2sql(self, df, table_name, columns_name, columns_len=None):
        df.to_sql(table_name, self.con, if_exists='append')
    # read a table from the database into a DataFrame
    def sql2df(self, table_name, columns=None, dtype=None):
        return pd.read_sql(table_name, con=self.con, columns=columns)
        # return pd.read_sql('select * from playerlist2', con=self.con, columns=columns)
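A minimal usage sketch of the DB class above (it assumes a reachable MySQL server with the credentials set in __init__; the table name demo_table is made up):
import pandas as pd
db = DB()
df = pd.DataFrame([
    {'name': 'amos', 'score': 90},
    {'name': 'mary', 'score': 85},
])
db.df2sql(df, 'demo_table', columns_name=list(df.columns))  # write the DataFrame into MySQL
print(db.sql2df('demo_table'))  # read the table back as a DataFrame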
'''
Strip the leading publication-type words and the text from "Actions" onward.
'''
import logging
import pandas as pd
logging.basicConfig(level=logging.INFO, filename='pubmed_data.log', format='%(asctime)s - %(levelname)s: %(message)s')
df_pubtypes = pd.read_excel(r'pubmed_data_20200716.xlsx', sheet_name='pubmed types', usecols=['Publication Type'])
# for index, row in df_pubtypes.iterrows():
#     print(row[0])
df_pubmed_data = pd.read_excel(r'pubmed_data_20200716.xlsx')
df_pubmed_data['source_title'] = df_pubmed_data['source']
for index, row in df_pubtypes.iterrows():
    df_pubmed_data['source_title'] = df_pubmed_data['source_title'].str.replace(row[0]+' ', '')
df_pubmed_data['source_title'] = df_pubmed_data['source_title'].str.extract(r'(.+)Actions')
df_pubmed_data.drop(df_pubmed_data.columns[[0,1]], axis=1, inplace=True)
df_pubmed_data.to_excel('new.xlsx')
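A toy illustration of the two string operations used above, looping str.replace to strip known prefixes and str.extract with a capture group to keep only the text before "Actions" (the sample rows and prefixes are made up):
import pandas as pd
sample = pd.DataFrame({'source': [
    'Review Lancet. 2020 Mar. Actions trailing menu text',
    'Case Reports BMJ. 2020 Apr. Actions more menu text',
]})
prefixes = ['Review', 'Case Reports']  # stand-ins for the publication types read from Excel
cleaned = sample['source']
for p in prefixes:
    cleaned = cleaned.str.replace(p + ' ', '', regex=False)  # strip the leading publication type
cleaned = cleaned.str.extract(r'(.+)Actions')  # keep only the text before "Actions"
print(cleaned)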
import time
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
timeout = 50
url = 'https://www.facebook.com/'
options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
driver.get(url)
email = driver.find_element_by_id('email')
password = driver.find_element_by_id('pass')
button = driver.find_element_by_name('login')
email.send_keys('wootu.test@gmail.com')
password.send_keys('aaaaaaaaaa')
button.click()
try:
    element_present = EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-pagelet="root"]'))
    friends = WebDriverWait(driver, timeout).until(element_present)
    friends = friends.find_element_by_css_selector('div[data-visualcompletion="ignore-dynamic"]')
    friends = friends.find_elements_by_css_selector('span')
    for f in friends:
        print(f.text)
except TimeoutException:
    print("Timed out waiting for page to load")
driver.close()