專題練習(一)

# 專題練習(一)

url = https://technews.tw/

![](https://hackmd.io/_uploads/HyaGYPDUn.png)

## python爬蟲程式

```python
import json

import requests
from bs4 import BeautifulSoup

# requests waits indefinitely unless a timeout is given, so always pass one.
REQUEST_TIMEOUT = 10  # seconds

# Fetch the front page.
url = 'https://technews.tw/'
response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of silently parsing an error page
# and writing an empty output.json.
response.raise_for_status()

# Parse the HTML with Beautiful Soup.
soup = BeautifulSoup(response.text, 'html.parser')

# Each <li class="block2014"> is one category block on the front page.
info_items = soup.find_all('li', class_='block2014')

items = []
for item in info_items:
    # Category name, headline title and headline link of this block.
    category = item.find('div', class_='cat01').text.strip()
    sum_title = item.find('div', class_='sum_title').text.strip()
    sum_title_url = item.find('div', class_='img').a['href'].strip()

    # Secondary articles listed under the headline.
    spotlist = []
    for element in item.find_all('li', class_='spotlist'):
        title = element.a.text.strip()
        # Use a separate name so the front-page `url` above is not overwritten.
        spot_url = element.a['href'].strip()
        spotlist.append({"title": title, "url": spot_url})

    items.append({
        "category": category,
        "sum_title": sum_title,
        "sum_title_url": sum_title_url,
        "spotlist": spotlist,
    })

# ensure_ascii=False keeps the Chinese text readable in the JSON file.
with open('output.json', 'w', encoding='utf-8') as file:
    json.dump(items, file, ensure_ascii=False, indent=4)

print("爬取完成並保存為output.json文件")
```

## 結果

![](https://hackmd.io/_uploads/HkKl6wvIh.png)

## json檔

![](https://hackmd.io/_uploads/S1b5aDw8h.png)

## 1-2專題練習

![](https://hackmd.io/_uploads/SyNbdYwI2.png)

```python
import json
import re

import requests
from bs4 import BeautifulSoup

# requests waits indefinitely unless a timeout is given, so always pass one.
REQUEST_TIMEOUT = 10  # seconds


def save_text_as_file(filename, content):
    """Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file."""
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)


def safe_filename_part(text):
    """Return ``text`` with characters that are invalid in file names replaced by '_'.

    Category names and titles come straight from the scraped page, so they may
    contain path separators or other characters that make ``open()`` fail or
    write to an unintended location.
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', text)


# Send a GET request for the front page.
url = 'https://technews.tw/'
response = requests.get(url=url, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

# Parse the HTML with Beautiful Soup.
soup = BeautifulSoup(response.text, 'html.parser')

# Each <li class="block2014"> is one category block on the front page.
info_items = soup.find_all('li', class_='block2014')

items = []
for item in info_items:
    # Extract category, sum_title and sum_title_url.
    category = item.find('div', class_='cat01').text.strip()
    sum_title = item.find('div', class_='sum_title').text.strip()
    sum_title_url = item.find('div', class_='img').a['href'].strip()

    # File name for the headline article: category + first 4 characters of the title.
    sum_title_filename = (
        f'sum_{safe_filename_part(category)}_{safe_filename_part(sum_title[:4])}.txt'
    )

    # Download the headline article and save the raw response text.
    sum_title_response = requests.get(url=sum_title_url, timeout=REQUEST_TIMEOUT)
    save_text_as_file(sum_title_filename, sum_title_response.text)

    # Secondary articles listed under the headline.
    spotlist = []
    for element in item.find_all('li', class_='spotlist'):
        # Extract title and link; a separate name keeps the front-page `url` intact.
        title = element.a.text.strip()
        spot_url = element.a['href'].strip()

        # File name for this article: category + first 4 characters of the title.
        spotlist_filename = (
            f'spot_{safe_filename_part(category)}_{safe_filename_part(title[:4])}.txt'
        )

        # Download the article and parse it.
        spotlist_response = requests.get(url=spot_url, timeout=REQUEST_TIMEOUT)
        spotlist_soup = BeautifulSoup(spotlist_response.text, 'html.parser')

        # Join the text of every <div class="indent"> into one string.
        spotlist_divs = spotlist_soup.find_all('div', class_='indent')
        spotlist_content = '\n'.join(div.text.strip() for div in spotlist_divs)

        save_text_as_file(spotlist_filename, spotlist_content)

        spotlist.append({"title": title, "url": spot_url})

    # Record this category block.
    items.append({
        "category": category,
        "sum_title": sum_title,
        "sum_title_url": sum_title_url,
        "spotlist": spotlist,
    })

# Save the collected items as JSON; ensure_ascii=False keeps Chinese text readable.
with open('output.json', 'w', encoding='utf-8') as file:
    json.dump(items, file, ensure_ascii=False, indent=4)

# Report completion.
print("爬取完成並保存為output.json文件")
```

## 執行結果

![](https://hackmd.io/_uploads/H1gXFFwIh.png)

![](https://hackmd.io/_uploads/B1kVsKD8h.png)