0901_Human_Python

# 0901_Human_Python
# coding: utf-8
#
# ###### tags: `資料科學自學園` `Python` `人大數據`
# - [time=Sat, Sep 1, 2018 6:52 AM]
"""Scrape the NSYSU news listing pages and save each page's articles as CSV.

For every listing page the script collects the article links, downloads each
article, concatenates the text of its ``ptcontent`` elements, and writes one
``NsysuNews-<page>.csv`` file with the columns date, title, url and content.
"""
import csv
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Listing URL template; ``{}`` is replaced by the page number.
LISTING_URL = 'http://news.nsysu.edu.tw/files/40-1342-2910-{}.php?Lang=zh-tw'
FIRST_PAGE = 1
LAST_PAGE = 271  # the original loop was range(1, 272)
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the crawl
ARTICLE_DELAY = 5  # seconds to pause after each article request
CSV_COLUMNS = ['date', 'title', 'url', 'content']


def fetch_soup(url):
    """Download *url* and return it as a parsed document.

    Returns ``None`` (after printing the reason) when the request raises or
    the server answers with a non-OK status, so callers can skip that page
    instead of parsing an error document.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("Request for {} failed: {}".format(url, err))
        return None
    if response.status_code != requests.codes.ok:
        print("Request for {} returned HTTP {}".format(
            url, response.status_code))
        return None
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def parse_link(link, page_url):
    """Build the title/url/date record for one listing anchor.

    Returns ``None`` when the anchor has no ``href``, because there is no
    article to download in that case.
    """
    href = link.get('href')
    if not href:
        return None
    # The original searched for class 'date ' (with a trailing space), which
    # does not match a class named 'date', and stored the repr of the result
    # list. Store the text of the matching elements instead.
    # NOTE(review): this only looks inside the anchor itself; if the site puts
    # the date next to the link rather than inside it, widen the search.
    date_tags = link.find_all(class_='date')
    return {
        'title': link.get('title') or link.get_text(strip=True),
        # Resolve relative links against the listing page they came from.
        'url': urljoin(page_url, href),
        'date': ' '.join(tag.get_text(strip=True) for tag in date_tags),
    }


def extract_content(article):
    """Return the text of all ``ptcontent`` elements merged into one string."""
    return ''.join(par.text for par in article.find_all(class_="ptcontent"))


def save_rows(rows, filename):
    """Write *rows* (dicts keyed by ``CSV_COLUMNS``) to *filename* as CSV."""
    # Explicit UTF-8 so the Chinese text does not depend on the OS locale.
    with open(filename, 'w', newline='', encoding='utf-8') as save:
        writer = csv.DictWriter(save, fieldnames=CSV_COLUMNS)
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Crawl every listing page and write one CSV file per page.

    A listing page that cannot be downloaded is skipped and no CSV is written
    for it. An article that cannot be downloaded is still recorded, with an
    empty ``content`` field, so its listing metadata is not lost.
    """
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        page_url = LISTING_URL.format(page)
        listing = fetch_soup(page_url)
        if listing is None:
            continue
        print("Get No. {} page's web resource".format(page))

        links = listing.select('#Dyn_2_2 .h5 a')
        print(links)

        rows = []
        for link in links:
            row = parse_link(link, page_url)
            if row is None:
                continue
            print(row)

            article = fetch_soup(row['url'])
            if article is None:
                row['content'] = ''
            else:
                print("Get {} 's content".format(row['title']))
                # Collect the article body and merge its paragraphs.
                row['content'] = extract_content(article)
                print("Catch the content")
            rows.append(row)
            # Pause between article requests to avoid hammering the server.
            time.sleep(ARTICLE_DELAY)

        print("Catch all the content on the No.{} .".format(page))
        save_rows(rows, "NsysuNews-{}.csv".format(page))


if __name__ == "__main__":
    main()