網路爬蟲

Vincent55 楊竣鴻

網頁基本原理與複習

網路爬蟲 ／ Vincent55 楊竣鴻 ／ 2023-07-11 09:20 - 11:20、2023-07-11 14:50 - 16:50
上課前先解釋如何使用這份簡報：可以使用編輯模式查看 Note 與範例。

	<head>
		<!-- External stylesheet, followed by inline rules for this page -->
		<link rel="stylesheet" type="text/css" href="mycss.css">
		<style>
			/* Page defaults: white background, grey text */
			body {
				background: #fff;
				color: #777;
			}

			/* Headings: bold italic sans-serif in green */
			h1 {
				font-weight: bold;
				font-style: italic;
				font-family: sans-serif;
				color: green;
			}

			/* Links: coloured, without the underline */
			a {
				color: #0077cc;
				text-decoration: none;
			}

			/* Boxed area: light border, inner padding, rounded corners */
			.container {
				border: 2px solid #ccc;
				padding: 20px;
				border-radius: 10px;
			}
		</style>
	</head>

	import requests

	# Store the address the GET request will be sent to.
	url = 'https://example.com/'
	# Send the GET request and keep the Response object in res.
	# A timeout is passed because requests otherwise waits without limit
	# for a server that never answers.
	res = requests.get(url, timeout=10)

	print(type(res), res)

	# Output: <class 'requests.models.Response'> <Response [200]>

	import requests

	# Store the address the GET request will be sent to.
	url = 'https://example.com/'
	# Send the GET request and keep the Response object in res.
	# A timeout is passed because requests otherwise waits without limit
	# for a server that never answers.
	res = requests.get(url, timeout=10)
	# res.text is the response body decoded to a str.
	print(res.text)
	'''
	<!doctype html>
	<html>
	<head>
	<title>Example Domain</title>

	<meta charset="utf-8" />
	<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1" />
	<style type="text/css">
	body {
	background-color: #f0f0f2;
	margin: 0;
	padding: 0;
	font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;

	}
	div {
	width: 600px;
	margin: 5em auto;
	padding: 2em;
	background-color: #fdfdff;
	border-radius: 0.5em;
	box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
	}
	a:link, a:visited {
	color: #38488f;
	text-decoration: none;
	}
	@media (max-width: 700px) {
	div {
	margin: 0 auto;
	width: auto;
	}
	}
	</style>
	</head>

	<body>
	<div>
	<h1>Example Domain</h1>
	<p>This domain is for use in illustrative examples in documents. You may use this
	domain in literature without prior coordination or asking for permission.</p>
	<p><a href="https://www.iana.org/domains/example">More information...</a></p>
	</div>
	</body>
	</html>
	'''

	import json

	import requests

	# Store the address the GET request will be sent to.
	url = 'http://dev.vincent55.tw/json'
	# Send the GET request and parse the response body as JSON into res.
	# A timeout is passed because requests otherwise waits without limit
	# for a server that never answers.
	res = requests.get(url, timeout=10).json()
	print(type(res), res)
	print(res['slideshow']['slides'][1]['title'])

	# Save the parsed data to result.json. ensure_ascii=False writes
	# non-ASCII characters as-is instead of \u escapes.
	with open('result.json', 'w', encoding='utf-8') as f:
		json.dump(res, f, indent=2, sort_keys=True, ensure_ascii=False)

	'''
	<class 'dict'> {'slideshow': {'author': 'Yours Truly', 'date': 'date of publication', 'slides': [{'title': 'Wake up to WonderWidgets!', 'type': 'all'}, {'items': ['Why <em>WonderWidgets</em> are great', 'Who <em>buys</em> WonderWidgets'], 'title': 'Overview', 'type': 'all'}], 'title': 'Sample Slide Show'}}
	Overview
	'''

	from bs4 import BeautifulSoup

	# A small HTML document to parse.
	html_text = """
	<html><head></head><body><h1>Hello, World!</h1></body></html>
	"""
	# Parse the text with html.parser, then print the tree as re-indented
	# markup produced by prettify().
	soup = BeautifulSoup(html_text, "html.parser")
	pretty = soup.prettify()
	print(pretty)

	from bs4 import BeautifulSoup
	import requests

	# NOTE(review): url is not assigned in this snippet; it must be defined
	# before this code runs and should point at an HTML page.
	# Download the page and hand its HTML text to BeautifulSoup, parsed with
	# html.parser. A timeout is passed because requests otherwise waits
	# without limit for a server that never answers.
	soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")

	# Print the page title.
	print(soup.title.getText())

	# Print the text of the first <li> element found.
	print(soup.li.getText())

	# Print the text of the first <li> element found (same effect).
	print(soup.find('li').getText())

	# Print the text of every <li> element.
	lis = soup.find_all('li')
	for li in lis:
		print(li.getText())

	from bs4 import BeautifulSoup
	import requests

	# Page to download and parse.
	url = "https://www.ptt.cc/bbs/Baseball/index.html"
	# A timeout is passed because requests otherwise waits without limit
	# for a server that never answers.
	r = requests.get(url, timeout=10)
	soup = BeautifulSoup(r.text, "html.parser")
	# Print every element whose class is "r-ent".
	for article in soup.find_all(class_="r-ent"):
		print(article)

	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service as ChromeService
	from webdriver_manager.chrome import ChromeDriverManager
	import time

	# Start Chrome through a Service built from the path returned by
	# ChromeDriverManager().install().
	driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
	try:
		# Keep the browser open for ten seconds so it can be observed.
		time.sleep(10)
	finally:
		# quit() ends the whole WebDriver session (browser and driver
		# process); close() only closes the current window. The finally
		# clause runs it even if the wait is interrupted.
		driver.quit()

	<span data-position="NaN" data-size="14">#儲存到 json file</span><br>
	<span data-position="NaN" data-size="53">with open('result.json', 'w', encoding='utf-8') as f:</span><br>
	<span data-position="NaN" data-size="61">json.dump(res, f, indent=2, sort_keys=True, ensure_ascii=False)</span><br>
	<span data-position="NaN" data-size="12">#將 driver 關閉</span><br>
	<span data-position="NaN" data-size="14">driver.close()</span><br>

	<span data-position="NaN" data-size="14">#sleep 五秒，觀察變化</span><br>
	<span data-position="NaN" data-size="13">time.sleep(5)</span><br>
	<span data-position="NaN" data-size="12">#將 driver 關閉</span><br>
	<span data-position="NaN" data-size="14">driver.close()</span><br>