Vincent55 楊竣鴻
ref: https://developer.mozilla.org/en-US/docs/Web/HTTP/Overview
methods | 功能 | 參數 |
---|---|---|
GET | 取得資料 | ?key1=value1&key2=value2 |
POST | 上傳資料 | key1=value1&key2=value2 |
DELETE | 刪除資料 | |
PATCH | 更新資料 | |
chrome://dino/
Runner.instance_.setSpeed(1000000)
<head> <link rel="stylesheet" type="text/css" href="mycss.css"> <style> body{ background:#fff; color:#777; } h1{ font-weight:bold; font-style:italic; font-family:sans-serif; color:green; } a { color: #0077cc; text-decoration: none; } .container { border: 2px solid #ccc; padding: 20px; border-radius: 10px; } </style> </head>
https://jupiter.challenges.picoctf.org/problem/9670/
Hint: 使用開發人員工具,檢查 HTML CSS JavaScript
Requests is a simple, yet elegant, HTTP library.
pip install requests
# Example: send a GET request with Requests and inspect the Response object.
import requests

# Store the URL that the GET request will be sent to.
url = 'https://example.com/'

# Send a GET request to url and keep the returned Response object in res.
res = requests.get(url)
print(type(res), res)
# Output: <class 'requests.models.Response'> <Response [200]>
# Example: send a GET request with Requests and print the response body.
import requests

# Store the URL that the GET request will be sent to.
url = 'https://example.com/'

# Send a GET request to url and keep the returned Response object in res.
res = requests.get(url)
print(res.text)

# Expected output (kept as a bare string so it has no runtime effect):
'''
<!doctype html>
<html>
<head>
    <title>Example Domain</title>
    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.</p>
    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
'''
https://csie.ncku.camp/
政府資料開放平台 https://data.gov.tw/
{
"firstName": "John",
"lastName": "Smith",
"sex": "male",
"age": 25,
"ismarry": false,
"child": null,
"address": {
"streetAddress": "21 2nd Street",
"city": "New York",
"state": "NY",
"postalCode": "10021"
},
"phoneNumber": [
{
"type": "home",
"number": "212 555-1234"
},
{
"type": "fax",
"number": "646 555-4567"
}
]
}
response.json()
xxx.json
.json 檔案
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, indent=2, sort_keys=True, ensure_ascii=False)
.json 檔案取得 JSON
with open('result.json', 'r') as result_fd:
    result_file = json.load(result_fd)
    print(result_file)
# Example: fetch a JSON document, read a nested field, and save it to disk.
import requests
import json

# Store the URL that the GET request will be sent to.
url = 'http://dev.vincent55.tw/json'

# Send a GET request to url, parse the Response body as JSON, and keep the
# parsed result in res.
res = requests.get(url).json()
print(type(res), res)
print(res['slideshow']['slides'][1]['title'])

# Save the parsed response to result.json.
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(res, f, indent=2, sort_keys=True, ensure_ascii=False)

# Expected output (kept as a bare string so it has no runtime effect):
'''
<class 'dict'> {'slideshow': {'author': 'Yours Truly', 'date': 'date of publication', 'slides': [{'title': 'Wake up to WonderWidgets!', 'type': 'all'}, {'items': ['Why <em>WonderWidgets</em> are great', 'Who <em>buys</em> WonderWidgets'], 'title': 'Overview', 'type': 'all'}], 'title': 'Sample Slide Show'}}
Overview
'''
https://api.thecatapi.com/v1/images/search
result.json 裡面
result.json 不會被覆蓋
pip install beautifulsoup4
# Example: parse an HTML string with BeautifulSoup and pretty-print the tree.
from bs4 import BeautifulSoup

html_text = """ <html><head></head><body><h1>Hello, World!</h1></body></html> """

# Parse with the built-in "html.parser" backend.
soup = BeautifulSoup(html_text, "html.parser")
print(soup.prettify())
<html>
 <head>
 </head>
 <body>
  <h1>
   Hello, World!
  </h1>
 </body>
</html>
soup.find('p', id='myid', class_='myclass')
# Example: download a page and read its <title> and <li> elements.
from bs4 import BeautifulSoup
import requests

# NOTE(review): `url` is not defined anywhere in this snippet; it must be set
# to the target page before this line runs.
# Hand the response text (the HTML) to BeautifulSoup and parse it with
# html.parser.
soup = BeautifulSoup(requests.get(url).text, "html.parser")

# Print the page title.
print(soup.title.getText())

# Print the text of the first <li> element found.
print(soup.li.getText())

# Print the text of the first <li> element found (same effect).
print(soup.find('li').getText())

# Find every <li> element and print each one's text.
lis = soup.find_all('li')
for li in lis:
    print(li.getText())
# Example: fetch a PTT board index page and print every article row.
from bs4 import BeautifulSoup
import requests

url = "https://www.ptt.cc/bbs/Baseball/index.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

# Print every element carrying the "r-ent" class.
for article in soup.find_all(class_="r-ent"):
    print(article)
# Example: scrape title/author/link from a PTT board index page and save the
# list to result.json.
from bs4 import BeautifulSoup
import requests
import json

url = "https://www.ptt.cc/bbs/Baseball/index.html"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")

results = []
for article in soup.find_all(class_="r-ent"):
    title_tag = article.find(class_="title")
    # A row whose title has no <a> child (presumably a deleted post; verify
    # against the live page) would make `.a['href']` fail with TypeError, so
    # record the link as None (JSON null) in that case.
    anchor = title_tag.a
    results.append({
        "title": title_tag.text.strip(),
        "author": article.find(class_="author").text.strip(),
        "link": anchor['href'] if anchor is not None else None,
    })

# Save the collected rows to result.json.
with open('result.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, sort_keys=True, ensure_ascii=False)
https://www.ptt.cc/bbs/Baseball/index.html
Hint: 可將目前的爬蟲寫成一個 function,並且每次都用爬蟲去取得上一頁的連結
pip install selenium webdriver-manager
# Example: start Chrome through Selenium, keep it open briefly, then shut down.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import time

# webdriver-manager's install() supplies the chromedriver path for the Service.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    time.sleep(10)  # keep the browser open for 10 seconds
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()
# Example: open python.org in Chrome, type a query into the search box, and
# submit it.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
try:
    driver.get("http://www.python.org")       # navigate to the page
    elem = driver.find_element(By.NAME, "q")  # locate the element named "q"
    elem.clear()                              # empty the field
    elem.send_keys("nckucsie")                # simulate the user typing a string
    elem.send_keys(Keys.RETURN)               # simulate the user pressing Enter
    time.sleep(5)                             # wait 5 seconds to view the result
finally:
    # quit() ends the whole WebDriver session even if a step above raised;
    # close() would only close the current window.
    driver.quit()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
https://forum.gamer.com.tw/
Hint: 結合我們剛的兩個案例,先自動捲動後再取得文章
pip install <wheel file>.whl
Node.js® is a JavaScript runtime built on Chrome's V8 JavaScript engine.
V8 engine: https://chromium.googlesource.com/v8/v8