**0826 Python Class Notes**
startswith
endswith
find/"findall→字串不能"
rfind
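A minimal sketch (the sample string is made up) of how these string methods behave:
s = "hello.py"
print(s.startswith("hello"))   # True
print(s.endswith(".py"))       # True
print(s.find("l"))             # 2 -- index of the first match, or -1 if not found
print(s.rfind("l"))            # 3 -- index of the last match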
# ***Creating, Reading and Writing Files***
import os
mydir = 'testch14'
# Create the folder if mydir does not exist
if os.path.exists(mydir):
    print("%s already exists" % mydir)
else:
    os.mkdir(mydir)
    print("Folder %s created successfully" % mydir)
date = input('Enter the date: ')
event = input('Enter the event: ')
description = input('Enter your notes: ')
fn = "testch14/note.txt"
with open(fn, 'w') as file_obj:
    file_obj.write(date + '\n')
    file_obj.write(event + '\n')
    file_obj.write(description)
with open(fn) as file_Obj:        # open with the default mode='r'; returns the file object file_Obj
    obj_list = file_Obj.read()    # read() returns the entire file contents as one string
print(obj_list)
1. %s string, %d integer, %f floating-point number
2. File modes: r read / w write (overwrites existing content) / a append / r+ / w+ / a+
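A small sketch of the 'a' (append) mode and of reading a file back line by line, reusing the note.txt file created above:
fn = "testch14/note.txt"
with open(fn, 'a') as file_obj:      # 'a' appends to the end instead of overwriting
    file_obj.write('\nextra note')
with open(fn) as file_obj:           # default mode is 'r'
    for line in file_obj:            # iterating a file object yields one line at a time
        print(line.rstrip())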
# ***Environment Verification Commands***
#Import some packages to use
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#To see our directory
import os
import random
import gc #Garbage collector for cleaning deleted data from memory
* Setting up the environment
In Anaconda, go to Environments, left-click the play icon next to the environment, choose the first option "Open Terminal", and enter the commands below,
copying and pasting them one at a time.
pip install tensorflow
pip install keras
pip install opencv-python
pip install pandas
pip install Pillow
pip install seaborn
pip install -U scikit-learn scipy matplotlib
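To double-check that the installs succeeded, a quick sketch that prints each package's version (these packages all expose a __version__ attribute):
import tensorflow, keras, cv2, numpy, pandas, sklearn
for pkg in (tensorflow, keras, cv2, numpy, pandas, sklearn):
    print(pkg.__name__, pkg.__version__)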
# **Image Training**
First, unzip the dog/cat dataset files.
train_dir = 'C:/Users/USER/train'
test_dir = 'C:/Users/USER/test1'
train_dogs = ['C:/Users/USER/train/{}'.format(i) for i in os.listdir(train_dir) if 'dog' in i] #get dog images
train_cats = ['C:/Users/USER/train/{}'.format(i) for i in os.listdir(train_dir) if 'cat' in i] #get cat images
test_imgs = ['C:/Users/USER/test1/{}'.format(i) for i in os.listdir(test_dir)] # os.listdir() lists every image file in the unzipped folder; above we kept only the names containing 'dog' or 'cat'
train_imgs = train_dogs[:2000] + train_cats[:2000] # slice the dataset and use 2000 in each class
random.shuffle(train_imgs) # shuffle it randomly
del train_dogs
del train_cats
gc.collect() #collect garbage to save memory
*# PS: always double-check the paths*
----------------------------------------以上
# Program 1
nrows = 150
ncolumns = 150
channels = 3 #change to 1 if you want to use grayscale image
#A function to read and process the images to an acceptable format for our model
def read_and_process_image(list_of_images):
    """
    Returns two arrays:
        X is an array of resized images
        y is an array of labels
    """
    X = [] # images
    y = [] # labels
    for image in list_of_images:
        X.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR), (nrows,ncolumns), interpolation=cv2.INTER_CUBIC)) #Read the image
        #get the labels
        if 'dog' in image:
            y.append(1)
        elif 'cat' in image:
            y.append(0)
    return X, y
--------------------------------------以上
# Program 2
X, y = read_and_process_image(train_imgs)
# X is now a list of arrays of pixel values and y is the list of labels. Let's preview the first image in X.
X[0]
----------------------------------------以上
# Program 3
# 1 and 0 stand for dog and cat respectively. We plot the first 5 arrays in X. We cannot use matplotlib.image's mpimg module to draw the images in X,
# because these are pixel arrays rather than the original jpg files, so we use the imshow() command instead.
#Lets view some of the pics
plt.figure(figsize=(20,10))
columns = 5
for i in range(columns):
    plt.subplot(5 // columns + 1, columns, i + 1)   # integer division so subplot() receives whole numbers
    plt.imshow(X[i])
----------------------------------------以上
# Program 4
import seaborn as sns
del train_imgs
gc.collect()
#Convert list to numpy array
X = np.array(X)
y = np.array(y)
#Let's plot the labels to be sure we have just two classes
sns.countplot(y)
plt.title('Labels for Cats and Dogs')
----------------------------------------以上
# Program 5
# Check the shape of the data. Always check and confirm the shape (dimensions) of your data -- this is very important.
print("Shape of train images is:", X.shape)
print("Shape of labels is:", y.shape)
----------------------------------------以上
# Program 6
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=2)
print("Shape of train images is:", X_train.shape)
print("Shape of validation images is:", X_val.shape)
print("Shape of labels is:", y_train.shape)
print("Shape of labels is:", y_val.shape)
----------------------------------------以上
# Program 7
del X
del y
gc.collect()
#get the length of the train and validation data
ntrain = len(X_train)
nval = len(X_val)
#We will use a batch size of 32. Note: the batch size should be a power of 2. ***4, 8, 16, 32, 64...***
batch_size = 32
----------------------------------------以上
# Program 8
from keras import layers
from keras import models
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.2)) #Dropout for regularization
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid')) #Sigmoid function at the end because we have just two classes

----------------------------------------以上
# Program 9
model.summary()
----------------------------------------以上
# Program 10
sgd = optimizers.SGD(lr=0.01, decay=1e-4, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['acc'])
----------------------------------------以上
# Program 11
train_datagen = ImageDataGenerator(rescale=1./255, #Scale the image between 0 and 1
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,)
val_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale
----------------------------------------以上
# Program 12
train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)
val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)
----------------------------------------以上
# Program 13
history =model.fit_generator(train_generator,
steps_per_epoch=ntrain // batch_size,
epochs=64,
validation_data=val_generator,
validation_steps=nval // batch_size)
----------------------------------------以上
# Program 14
# Save our model using the simple Keras calls shown below, so we can reuse it at any time instead of training again every time we rerun the notebook.
#Save the model
model.save_weights('model_weights.h5')
model.save('model_keras.h5')
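To reuse the saved model later without retraining, a minimal sketch of loading it back (assuming the model_keras.h5 file written above exists):
from keras.models import load_model
model = load_model('model_keras.h5')   # restores both the architecture and the trained weights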
#Let's plot the training and validation curves
#get the details from the history object
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
#Train and validation accuracy
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and Validation accuracy')
plt.legend()
plt.figure()
#Train and validation loss
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
#Now let's predict on the first 10 images of the test set
X_test, y_test = read_and_process_image(test_imgs[0:10]) #Y_test in this case will be empty.
x = np.array(X_test)
test_datagen = ImageDataGenerator(rescale=1./255)
# Program 15
# Create a list to hold the labels we are about to generate.
# Set the figure size for the images we want to plot.
# Here we call the .predict() method on the trained model to make a prediction for each image supplied by the ImageDataGenerator.
# The pred variable holds the model's confidence -- the probability that the current image is a dog.
# Since we labelled dogs as 1, a high probability (above the 0.5 threshold) means the model is confident the image is a dog; otherwise it is a cat.
# So we simply write an if-else statement: if the probability is greater than 0.5 we append the string 'dog', otherwise 'cat', to text_labels.
# We do this so we can add a title to each image when plotting.
# We add a subplot so that we can draw several images in one figure.
# The predicted class is added as the title of each image plot.
i = 0
text_labels = []
plt.figure(figsize=(30,20))
for batch in test_datagen.flow(x, batch_size=1):
    pred = model.predict(batch)
    if pred > 0.5:
        text_labels.append('dog')
    else:
        text_labels.append('cat')
    plt.subplot(5 // columns + 1, columns, i + 1)   # integer division so subplot() receives whole numbers
    plt.title('This is a ' + text_labels[i])
    imgplot = plt.imshow(batch[0])
    i += 1
    if i % 10 == 0:
        break
plt.show()
# VS Code
Localization: in the Extensions panel, search for "Language", install the Traditional Chinese language pack, then restart VS Code.
Open your working folder and run any file to confirm that everything works.
# Identifying Different Vehicles (Labeling)

Collect the images.
File formats: png and jpg
Results
Downgrade the version if needed: open a terminal and enter pip install scipy==1.0.0
# Web Scraping 8/30
**Building a Search Page**
"<HTML>" "</HTML>"
"HEAD"跳轉 列新css
"TITLE" "/TITLE"標題
"BODY"內文
"form"表單
"h1~h6"標題之字大小
"p"段落
"br"換行
"table"表格
"tr"表格列
"td"表格行
"input"輸入內文
"div"標籤
自適應網頁 RWD
"ul"項目符號
* Regular expression syntax (for the re module):
#\d one digit
#\D one non-digit
#\w one alphanumeric character
#\W one non-alphanumeric character
#\s one whitespace character
#\S one non-whitespace character
#\b a word boundary (between \w and \W, in either order)
#\B a non-word boundary
#a | b a or b
#. any character except \n
#^ start of the source string
#$ end of the source string
#prev ? zero or one prev
#prev * zero or more prev, as many as possible
#prev *? zero or more prev, as few as possible
#prev + one or more prev, as many as possible
#prev +? one or more prev, as few as possible
#prev {m} exactly m consecutive prev
#prev {m , n} m to n consecutive prev, as many as possible
#prev {m , n}? m to n consecutive prev, as few as possible
#[ abc ] a or b or c
#[ ^abc ] not (a or b or c)
#prev (?= next) prev, only if it is followed by next
#prev (?! next) prev, only if it is not followed by next
#(?<= prev) next matches next only if it is preceded by prev
#(?<! prev) next matches next only if it is not preceded by prev
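A short sketch (the sample text is invented) exercising a few of these patterns with the re module:
import re
text = "order A12, order B7, order A300"
print(re.findall(r'\d+', text))            # ['12', '7', '300']
print(re.findall(r'A\d{2,3}', text))       # ['A12', 'A300']
print(re.search(r'^order', text).group())  # 'order' -- anchored to the start of the string
print(re.sub(r'\s+', ' ', 'too   many   spaces'))  # 'too many spaces'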
# Fetching Web Page Content
from urllib.request import urlopen
html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())   # use html.read() to get the page's HTML content
1. Setting up the environment
In Anaconda, go to Environments, left-click the play icon, choose the first option "Open Terminal", and copy and paste the command below.
pip install beautifulsoup4
2. Program
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read())
print(bsObj.h1)   # the h1 content extracted from the page: HTML → body → h1
bsObj.h1
--------------------------------------以上
* Finding a string (the title) on a page
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title == None:
print("Title could not be found")
else:
print(title)
-------------------------------------以上
find -- returns the first matching item on the page
findAll -- returns every matching item on the page

t = bsObj.findAll({"h1","h2"})
t
--------------------------------------以上
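A minimal sketch, reusing the page1.html URL from above, showing the difference in what the two calls return:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
print(bsObj.find("h1"))      # the first matching tag (or None if there is no match)
print(bsObj.findAll("h1"))   # a list of every matching tag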
type(XXXX) → shows a value's data type in Python
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)
# with the BeautifulSoup object we can use findAll to extract only the text wrapped in <span> tags whose class is "green" or "red"
nameList = bsObj.findAll("span", {"class":{"green","red"}})
print(type(nameList))
for name in nameList:
    print(name.get_text())
----------------------以上
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('https://zh.wikipedia.org/wiki/%E5%87%AF%E6%96%87%C2%B7%E8%B4%9D%E8%82%AF').read().decode('utf-8')
bsObj = BeautifulSoup(html)
# with the BeautifulSoup object we can use findAll to extract only the hyperlinks
nameList = bsObj.findAll("a")
# nameList = bsObj.findAll({"a":"href"})   # alternative form; the notes say both ways work
print(type(nameList))
for name in nameList:
    print(name.get_text())
------------------------------------------------以上
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("https://morvanzhou.github.io/static/scraping/list.html")
# with the BeautifulSoup object we can use findAll to extract the <li> elements whose class is "month"
bsObj = BeautifulSoup(html)
nameList = bsObj.findAll("li",{"class":"month"})   # find the months
print(type(nameList))
for name in nameList:
    print(name.get_text())
---------------------------------------------------以上
import bs4,requests
url = "https://www.taiwanlottery.com.tw/"
html = requests.get(url)
print("網頁下載中...")
html.raise_for_status() #驗證網頁是否下在成功
print("網頁200正常.........")
objSoup = bs4.BeautifulSoup(html.text, "html.parser") #建立BeautifulSoup物件
#print(type(objSoup))
dataTag = objSoup.select(" .contents_box02") #尋找class是.contents_box02
print("串列長度",len(dataTag))
#print(dataTag)
for i in range(len(dataTag)): #列出含.contents_box02的串列長度
print(dataTag[i])
---------------------------------------------------以上
import bs4,requests
url = "https://www.taiwanlottery.com.tw/"
html = requests.get(url)
print("網頁下載中...")
html.raise_for_status() #驗證網頁是否下在成功
print("網頁200正常.........")
objSoup = bs4.BeautifulSoup(html.text, "html.parser") #建立BeautifulSoup物件
#print(type(objSoup))
dataTag = objSoup.select(" .contents_box02") #尋找class是.contents_box02
print("串列長度",len(dataTag))
#print(dataTag)
for i in range(len(dataTag)): #列出含.contents_box02的串列長度
print(dataTag[i])
#找尋開出順序與大小順序的球
balls = dataTag[2].find_all('div', {'class':'ball_tx ball_yellow'})
balls1 = dataTag[0].find_all('div', {'class':'ball_tx ball_green'})
print("開出順序 : ", end='')
for i in range(6): # 前6球是開出順序
print(balls[i].text, end=' ')
print("\n大小順序 : ", end='')
for i in range(6,len(balls)): # 第7球以後是大小順序
print(balls[i].text, end=' ')
#找出第二區的紅球
redball = dataTag[2].find_all('div', {'class':'ball_red'})
print("\n第二區 :", redball[0].text)
print("開出順序 : ", end='')
for i in range(6): # 前6球是開出順序
print(balls1[i].text, end=' ')
print("\n大小順序 : ", end='')
for i in range(6,len(balls)): # 第7球以後是大小順序
print(balls1[i].text, end=' ')
#找出第二區的紅球
redball = dataTag[0].find_all('div', {'class':'ball_red'})
print("\n第二區 :", redball[0].text)
------------------------------以上
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time
Confirm that the libraries are installed.
----------------------------------以上
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
print(response.text[:500])
html_soup = BeautifulSoup(response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
type(html_soup)
print(type(movie_containers))
print(len(movie_containers))
first_movie = movie_containers[0]   # the first movie's container
print("Movie title : ", first_movie.h3.a.text)
first_mscore = first_movie.find('span', class_ = 'metascore favorable')
print(type(first_mscore))
first_mscore = int(first_mscore.text)
print("Metascore rating :", first_mscore)
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')
first_year = first_year.text
print("Release year : ", first_year[1:5])
first_imdb = float(first_movie.strong.text)
print("IMDB rating : ", first_imdb)
first_votes = first_movie.find('span', attrs = {'name':'nv'})
print(first_votes.text)
print("Number of votes : " + first_votes['data-value'])   #int
To select by a span's class attribute, pass class_; to select by its name attribute, pass attrs.
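A small sketch (the HTML fragment is invented) showing both lookup styles side by side:
from bs4 import BeautifulSoup
snippet = '<span class="metascore favorable">81</span><span name="nv" data-value="12345">12,345</span>'
soup = BeautifulSoup(snippet, 'html.parser')
print(soup.find('span', class_='metascore favorable').text)    # 81
print(soup.find('span', attrs={'name': 'nv'})['data-value'])   # 12345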
--------------------------------------以上
# Aggregating the Scraped Page Data
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Extract data from individual movie container
for container in movie_containers:
    # extract the data points of interest only when the container has a Metascore:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))
        print(" ")
        print("Movie title : ", name)
        print("Release year : ", year[1:5])
        print("IMDB rating : ", imdb)
        print("Number of votes : ", vote)   #int
        print("Metascore rating :", m_score)
# Finally, export the collected data to a CSV file (which can be opened in Excel)
test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes})
print(test_df.info())
test_df.to_csv('movie.ratings.csv')
----------------------------------------------------以上
# VS Code: Scraping and Downloading Images
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = 'https://www.ptt.cc'
# function that fetches a web page
def get_web_page(url):
    time.sleep(0.5)   # pause 0.5 s before each request so the PTT site does not flag us as an aggressive crawler
    resp = requests.get(
        url=url,
        cookies={'over18': '1'}
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text
def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html.parser')
    # get the link to the previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']
    articles = []   # store the collected article data
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        if d.find('div', 'date').string.strip() == date:   # the post date matches
            # get the push (upvote) count
            push_count = 0
            if d.find('div', 'nrec').string:
                try:
                    push_count = int(d.find('div', 'nrec').string)   # convert the string to a number
                except ValueError:   # if the conversion fails, do nothing; push_count stays 0
                    pass
            # get the article link and title
            if d.find('a'):   # a hyperlink exists, so the article still exists (was not deleted)
                href = d.find('a')['href']
                title = d.find('a').string
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count
                })
    return articles, prev_url   # return this page's articles and the previous page's URL
def parse(dom):
    soup = BeautifulSoup(dom, 'html.parser')
    links = soup.find(id='main-content').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
            img_urls.append(link['href'])
    return img_urls
def save(img_urls, title):
    if img_urls:
        try:
            dname = title.strip()   # strip() removes leading and trailing whitespace from the string
            os.makedirs(dname)
            for img_url in img_urls:
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'
                fname = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, os.path.join(dname, fname))
        except Exception as e:
            print(e)
if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
    if current_page:
        articles = []   # all of today's articles
        date = time.strftime("%m/%d").lstrip('0')   # today's date, with the leading '0' stripped to match PTT's format
        current_articles, prev_url = get_articles(current_page, date)   # today's articles on the current page
        while current_articles:   # if the current page has articles from today, add them and go back one page to look for more
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, date)
        # the article list is ready; now visit each article and read its images
        for article in articles:
            print('Processing', article)
            page = get_web_page(PTT_URL + article['href'])
            if page:
                img_urls = parse(page)
                save(img_urls, article['title'])
                article['num_image'] = len(img_urls)
        # save the article metadata
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
***Open VS Code: create a folder on the desktop → load the folder into VS Code → create a file named XXXX.py inside it → paste in the code above.***

The over18 cookie set to '1' in get_web_page() is the age-restriction permission needed to enter PTT; it corresponds to entering the code 1 on the site's age-confirmation page.

------------------------

Finally, the board name Beauty in PTT_URL + '/bbs/Beauty/index.html' (in the __main__ block) can be changed freely: look up a board's English name on the PTT site, fill it in, and the script will download every image file from that board.
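If you want to make the board configurable, a minimal sketch (BOARD is a made-up variable name):
BOARD = 'Beauty'   # change to any board's English name
current_page = get_web_page(PTT_URL + '/bbs/{}/index.html'.format(BOARD))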

What the output looks like when the script runs (screenshot not included in these notes).
