**0826 Python class notes**

String methods: `startswith`, `endswith`, `find`, `rfind` (note: `findall` is a regex/BeautifulSoup method, not a string method, so it cannot be called on a string).

# ***Creating / reading / writing files***

```python
import os

mydir = 'testch14'
# Create the folder if mydir does not exist yet
if os.path.exists(mydir):
    print("%s already exists" % mydir)
else:
    os.mkdir(mydir)
    print("Created the folder %s successfully" % mydir)

date = input('Enter the date: ')
event = input('Enter the event: ')
description = input('Enter your notes: ')

fn = "testch14/note.txt"
with open(fn, 'w') as file_obj:
    file_obj.write(date + '\n')
    file_obj.write(event + '\n')
    file_obj.write(description)

with open(fn) as file_Obj:        # open with the default mode='r'; returns the file object file_Obj
    obj_list = file_Obj.read()    # read() returns the whole file content as one string
    print(obj_list)
```

1. Format codes: `%s` string, `%d` integer, `%f` float
2. File modes: `r` read / `w` write (**overwrites** existing content) / `a` append / `r+` / `w+` / `a+`

# ***Environment check commands***

```python
# Import some packages to use
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline   # Jupyter notebook magic

# To see our directory
import os
import random
import gc   # Garbage collector for cleaning deleted data from memory
```

* To set up the environment: in ANACONDA go to Environments, left-click the play icon, pick the first item "Open Terminal", then copy and paste the commands below one at a time:

```
pip install tensorflow
pip install keras
pip install opencv-python
pip install pandas
pip install Pillow
pip install seaborn
pip install -U scikit-learn scipy matplotlib
```

# **Image training**

First unzip the dog/cat files.

```python
train_dir = 'C:/Users/USER/train'
test_dir = 'C:/Users/USER/test1'

# os.listdir() lists every image extracted from the training zip file;
# the comprehensions keep the files whose names contain 'dog' or 'cat'.
train_dogs = ['C:/Users/USER/train/{}'.format(i) for i in os.listdir(train_dir) if 'dog' in i]  # get dog images
train_cats = ['C:/Users/USER/train/{}'.format(i) for i in os.listdir(train_dir) if 'cat' in i]  # get cat images
test_imgs = ['C:/Users/USER/test1/{}'.format(i) for i in os.listdir(test_dir)]

train_imgs = train_dogs[:2000] + train_cats[:2000]   # slice the dataset and use 2000 in each class
random.shuffle(train_imgs)                           # shuffle it randomly

del train_dogs
del train_cats
gc.collect()   # collect garbage to save memory
```

*PS: always double-check the paths.*

----------------------------------------

# Program 1

```python
nrows = 150
ncolumns = 150
channels = 3   # change to 1 if you want to use grayscale images

# A function to read and process the images to an acceptable format for our model
def read_and_process_image(list_of_images):
    """
    Returns two arrays:
        X is an array of resized images
        y is an array of labels
    """
    X = []   # images
    y = []   # labels

    for image in list_of_images:
        X.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR),
                            (nrows, ncolumns),
                            interpolation=cv2.INTER_CUBIC))   # read the image
        # get the labels
        if 'dog' in image:
            y.append(1)
        elif 'cat' in image:
            y.append(0)

    return X, y
```

--------------------------------------

# Program 2

```python
X, y = read_and_process_image(train_imgs)
# X is now a list of image pixel arrays and y is the list of labels.
# Preview the first image in X:
X[0]
```

----------------------------------------

# Program 3

The labels 1 and 0 stand for dog and cat respectively. To draw the first 5 arrays in X we cannot use the `mpimg` module from matplotlib.image, because these are pixel arrays rather than the original jpg files, so the `imshow()` command is used instead.

```python
# Let's view some of the pics
plt.figure(figsize=(20, 10))
columns = 5
for i in range(columns):
    plt.subplot(5 // columns + 1, columns, i + 1)   # integer division so subplot() receives an int
    plt.imshow(X[i])
```

----------------------------------------

# Program 4

```python
import seaborn as sns

del train_imgs
gc.collect()

# Convert the lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Let's plot the labels to be sure we have just two classes
sns.countplot(y)
plt.title('Labels for Cats and Dogs')
```

----------------------------------------

# Program 5

```python
# Check the shape of the data. Always check and confirm the shape (dimensions) of your data; this is very important.
print("Shape of train images is:", X.shape)
print("Shape of labels is:", y.shape)
```

----------------------------------------

# Program 6

```python
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=2)

print("Shape of train images is:", X_train.shape)
print("Shape of validation images is:", X_val.shape)
print("Shape of train labels is:", y_train.shape)
print("Shape of validation labels is:", y_val.shape)
```

----------------------------------------
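A quick optional sanity check (not in the original notes): after the split it can be reassuring to confirm that both classes are still present in similar numbers. This assumes the `y_train` / `y_val` arrays produced by Program 6 above.

```python
import numpy as np

# Count how many 0 (cat) and 1 (dog) labels ended up in each split
print("train label counts (cat, dog):", np.bincount(y_train))
print("val   label counts (cat, dog):", np.bincount(y_val))
```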
print("Shape of validation images is:", X_val.shape) print("Shape of labels is:", y_train.shape) print("Shape of labels is:", y_val.shape) ----------------------------------------以上 # 程式七 del X del y gc.collect() #get the length of the train and validation data ntrain = len(X_train) nval = len(X_val) #We will use a batch size of 32. Note: batch size should be a factor of 2.***4,8,16,32,64...*** batch_size = 32 ----------------------------------------以上 # 程式八 from keras import layers from keras import models from keras import optimizers from keras.preprocessing.image import ImageDataGenerator from keras.preprocessing.image import img_to_array, load_img model = models.Sequential() model.add(layers.Conv2D(32, (3, 3), activation='relu',input_shape=(150, 150, 3))) model.add(layers.MaxPooling2D((2, 2))) model.add(layers.Conv2D(64, (3, 3), activation='relu')) model.add(layers.MaxPooling2D((2, 2))) model.add(layers.Conv2D(128, (3, 3), activation='relu')) model.add(layers.MaxPooling2D((2, 2))) model.add(layers.Conv2D(128, (3, 3), activation='relu')) model.add(layers.MaxPooling2D((2, 2))) model.add(layers.Flatten()) model.add(layers.Dropout(0.2)) #Dropout for regularization model.add(layers.Dense(512, activation='relu')) model.add(layers.Dense(1, activation='sigmoid')) #Sigmoid function at the end because we have just two classes ![](https://i.imgur.com/1z4QfIE.jpg) ----------------------------------------以上 # 程式九 model.summary() ----------------------------------------以上 # 程式十 sgd = optimizers.SGD(lr=0.01, decay=1e-4, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['acc']) ----------------------------------------以上 # 程式十一 train_datagen = ImageDataGenerator(rescale=1./255, #Scale the image between 0 and 1 rotation_range=40, width_shift_range=0.2, height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, horizontal_flip=True,) val_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale ----------------------------------------以上 # 程式十二 train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size) val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size) ----------------------------------------以上 # 程式十三 history =model.fit_generator(train_generator, steps_per_epoch=ntrain // batch_size, epochs=64, validation_data=val_generator, validation_steps=nval // batch_size) ----------------------------------------以上 # 程式十四 /# 保存我們的模型,使用下面顯示的簡單Keras功能,這樣我們可以隨時重複使用它,而不是在重新運行我們的筆記本時再次訓練。 /# #Save the model model.save_weights('model_wieghts.h5') model.save('model_keras.h5') /#lets plot the train and val curve #get the details form the history object acc = history.history['acc'] val_acc = history.history['val_acc'] loss = history.history['loss'] val_loss = history.history['val_loss'] epochs = range(1, len(acc) + 1) #Train and validation accuracy plt.plot(epochs, acc, 'b', label='Training accurarcy') plt.plot(epochs, val_acc, 'r', label='Validation accurarcy') plt.title('Training and Validation accurarcy') plt.legend() plt.figure() #Train and validation loss plt.plot(epochs, loss, 'b', label='Training loss') plt.plot(epochs, val_loss, 'r', label='Validation loss') plt.title('Training and Validation loss') plt.legend() plt.show() /#Now lets predict on the first 10 Images of the test set X_test, y_test = read_and_process_image(test_imgs[0:10]) #Y_test in this case will be empty. 
# Program 15

Create a list to hold the labels we are about to generate and set the figure size for the images we want to plot. For each image supplied by the ImageDataGenerator we call `.predict()` on the trained model. The `pred` value is the model's probability that the current image is a dog. Since dogs were labelled 1, a probability above 0.5 means the model is fairly confident the image is a dog; otherwise it is a cat. A simple if-else therefore appends the string `'dog'` to `text_labels` when the probability is greater than 0.5 and `'cat'` otherwise; we do this so each image can be given a title when it is plotted. A subplot is added so several images can be drawn, and the predicted class is used as the title of each one.

```python
i = 0
text_labels = []
plt.figure(figsize=(30, 20))

for batch in test_datagen.flow(x, batch_size=1):
    pred = model.predict(batch)
    if pred > 0.5:
        text_labels.append('dog')
    else:
        text_labels.append('cat')
    plt.subplot(5 // columns + 1, columns, i + 1)   # integer division so subplot() receives an int
    plt.title('This is a ' + text_labels[i])
    imgplot = plt.imshow(batch[0])
    i += 1
    if i % 10 == 0:
        break
plt.show()
```

# VS Code localization and extensions

In the Extensions panel, search for "Language", install the Traditional Chinese language pack and restart. Then open your working folder and run any file to confirm everything works.

![](https://i.imgur.com/L7W9nre.png)

# Recognizing different vehicles (labelling)

![](https://i.imgur.com/IvHt1NW.png)

Capture the images:

![](https://i.imgur.com/BvRUb9k.jpg)

Files: png and jpg.

Result:

![](https://i.imgur.com/PXW3e9b.jpg)

To downgrade the version, open a terminal and run `pip install scipy==1.0.0`.

# Web scraping 8/30

**Building a search page (basic HTML tags)**

* `<html>` ... `</html>`: the whole document
* `<head>`: header area (redirects, linked CSS)
* `<title>` ... `</title>`: page title
* `<body>`: the page content
* `<form>`: form
* `<h1>` to `<h6>`: heading text sizes
* `<p>`: paragraph
* `<br>`: line break
* `<table>`: table, `<tr>`: table row, `<td>`: table cell
* `<input>`: input field
* `<div>`: generic block tag, used for responsive web design (RWD)
* `<ul>`: bulleted list

Regular-expression cheat sheet:

* `\d` one digit
* `\D` one non-digit
* `\w` one word (alphanumeric) character
* `\W` one non-word character
* `\s` one whitespace character
* `\S` one non-whitespace character
* `\b` a word boundary (between `\w` and `\W`, in either order)
* `\B` a non-word-boundary
* `a|b` a or b
* `.` any character except `\n`
* `^` start of the source string
* `$` end of the source string
* `prev ?` zero or one prev
* `prev *` zero or more prev, as many as possible (greedy)
* `prev *?` zero or more prev, as few as possible (lazy)
* `prev +` one or more prev, as many as possible (greedy)
* `prev +?` one or more prev, as few as possible (lazy)
* `prev {m}` exactly m consecutive prev
* `prev {m,n}` m to n consecutive prev, as many as possible (greedy)
* `prev {m,n}?` m to n consecutive prev, as few as possible (lazy)
* `[abc]` a or b or c
* `[^abc]` not (a or b or c)
* `prev (?=next)` prev, only if it is followed by next (lookahead)
* `prev (?!next)` prev, only if it is not followed by next (negative lookahead)
* `(?<=prev) next` next, only if it is preceded by prev (lookbehind)
* `(?<!prev) next` next, only if it is not preceded by prev (negative lookbehind)
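A short illustration of a few of the patterns above using Python's `re` module; the sample string and patterns here are invented purely for demonstration:

```python
import re

text = "Order A12 shipped on 2023-08-26, order B7 pending."

print(re.findall(r'\d+', text))                      # runs of digits: ['12', '2023', '08', '26', '7']
print(re.findall(r'[A-Z]\d+', text))                 # capital letter followed by digits: ['A12', 'B7']
print(re.findall(r'\w+(?=\spending)', text))         # a word only if followed by ' pending': ['B7']
print(re.sub(r'\s+', ' ', 'too   many    spaces'))   # collapse repeated whitespace
```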
# Fetching the content of a web page

```python
from urllib.request import urlopen

html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())   # use html.read() to get the HTML content of the page
```

1. Set up the environment: in ANACONDA go to Environments, left-click the play icon, pick the first item "Open Terminal", and run:

   ```
   pip install beautifulsoup4
   ```

2. Program:

   ```python
   from urllib.request import urlopen
   from bs4 import BeautifulSoup

   html = urlopen("http://pythonscraping.com/pages/page1.html")
   bsObj = BeautifulSoup(html.read())
   print(bsObj.h1)   # the h1 extracted from the page: HTML -> body -> h1 -> bsObj.h1
   ```

--------------------------------------

*Finding a string (the title) in a web page*

```python
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title

title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title == None:
    print("Title could not be found")
else:
    print(title)
```

-------------------------------------

`find` returns the first matching item on the page; `findAll` returns all of them.

![](https://i.imgur.com/ExROkHX.jpg)

```python
t = bsObj.findAll({"h1", "h2"})
t
```

---------------------------

`type(XXXX)` shows a value's data type in Python.

```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)
# Through the BeautifulSoup object we can use findAll to extract only the text
# inside <span class="green"> (and <span class="red">)
nameList = bsObj.findAll("span", {"class": {"green", "red"}})
print(type(nameList))
for name in nameList:
    print(name.get_text())
```

----------------------

```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://zh.wikipedia.org/wiki/%E5%87%AF%E6%96%87%C2%B7%E8%B4%9D%E8%82%AF').read().decode('utf-8')
bsObj = BeautifulSoup(html)
# Through the BeautifulSoup object we can use findAll to extract only the hyperlinks
nameList = bsObj.findAll("a")
# nameList = bsObj.findAll({"a": "href"})   # both forms work
print(type(nameList))
for name in nameList:
    print(name.get_text())
```

------------------------------------------------

```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://morvanzhou.github.io/static/scraping/list.html")
# Through the BeautifulSoup object we can use findAll to pick out specific list items
bsObj = BeautifulSoup(html)
nameList = bsObj.findAll("li", {"class": "month"})   # find the months
print(type(nameList))
for name in nameList:
    print(name.get_text())
```

---------------------------------------------------

```python
import bs4, requests

url = "https://www.taiwanlottery.com.tw/"
html = requests.get(url)
print("Downloading the page...")
html.raise_for_status()            # verify that the page downloaded successfully
print("Page returned 200 OK.........")

objSoup = bs4.BeautifulSoup(html.text, "html.parser")   # build the BeautifulSoup object
# print(type(objSoup))

dataTag = objSoup.select(".contents_box02")   # find the elements whose class is contents_box02
print("List length:", len(dataTag))
# print(dataTag)
for i in range(len(dataTag)):      # list every element that has class contents_box02
    print(dataTag[i])
```

---------------------------------------------------

```python
import bs4, requests

url = "https://www.taiwanlottery.com.tw/"
html = requests.get(url)
print("Downloading the page...")
html.raise_for_status()            # verify that the page downloaded successfully
print("Page returned 200 OK.........")

objSoup = bs4.BeautifulSoup(html.text, "html.parser")   # build the BeautifulSoup object
# print(type(objSoup))

dataTag = objSoup.select(".contents_box02")   # find the elements whose class is contents_box02
print("List length:", len(dataTag))
# print(dataTag)
for i in range(len(dataTag)):      # list every element that has class contents_box02
    print(dataTag[i])

# Find the balls in drawn order and in sorted (small-to-large) order
balls = dataTag[2].find_all('div', {'class': 'ball_tx ball_yellow'})
balls1 = dataTag[0].find_all('div', {'class': 'ball_tx ball_green'})

print("Drawn order : ", end='')
for i in range(6):                    # the first 6 balls are in drawn order
    print(balls[i].text, end=' ')

print("\nSorted order : ", end='')
for i in range(6, len(balls)):        # balls from the 7th onward are in sorted order
    print(balls[i].text, end=' ')

# Find the red ball of the second section
redball = dataTag[2].find_all('div', {'class': 'ball_red'})
print("\nSecond section :", redball[0].text)

print("Drawn order : ", end='')
for i in range(6):                    # the first 6 balls are in drawn order
    print(balls1[i].text, end=' ')

print("\nSorted order : ", end='')
for i in range(6, len(balls1)):       # balls from the 7th onward are in sorted order
    print(balls1[i].text, end=' ')

# Find the red ball of the second section
redball = dataTag[0].find_all('div', {'class': 'ball_red'})
print("\nSecond section :", redball[0].text)
```

------------------------------
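The lottery programs above use `objSoup.select(".contents_box02")` while the ball extraction uses `find_all` with a class filter. A tiny self-contained example (with an invented HTML snippet, not the lottery page) showing that the two lookups are interchangeable for class-based searches:

```python
from bs4 import BeautifulSoup

# Invented document used only to illustrate the two equivalent lookups
doc = '<div class="contents_box02">A</div><div class="contents_box02">B</div><div class="other">C</div>'
soup = BeautifulSoup(doc, "html.parser")

by_select   = soup.select(".contents_box02")                    # CSS-selector style
by_find_all = soup.find_all("div", class_="contents_box02")     # keyword-argument style

print([t.text for t in by_select])     # ['A', 'B']
print([t.text for t in by_find_all])   # ['A', 'B']
```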
```python
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time
```

Confirm these libraries are installed.

----------------------------------

```python
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
print(response.text[:500])

html_soup = BeautifulSoup(response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
type(html_soup)
print(type(movie_containers))
print(len(movie_containers))

first_movie = movie_containers[0]                 # the first movie's information
print("Movie title : ", first_movie.h3.a.text)

first_mscore = first_movie.find('span', class_='metascore favorable')
print(type(first_mscore))
first_mscore = int(first_mscore.text)
print("Metascore rating :", first_mscore)

first_year = first_movie.h3.find('span', class_='lister-item-year text-muted unbold')
first_year = first_year.text
print("Release year : ", first_year[1:5])

first_imdb = float(first_movie.strong.text)
print("IMDB rating : ", first_imdb)

first_votes = first_movie.find('span', attrs={'name': 'nv'})
print(first_votes.text)
print("Vote count : " + first_votes['data-value'])   # the attribute is a string; convert with int() if needed
```

To match a `span` on its `class`, use the `class_` keyword; to match it on its `name` attribute, use `attrs`.

--------------------------------------

# Consolidating the scraped page information

```python
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from each individual movie container
for container in movie_containers:
    # Only extract the data points of interest if the container has a Metascore:
    if container.find('div', class_='ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_='metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs={'name': 'nv'})['data-value']
        votes.append(int(vote))

        print(" ")
        print("Movie title : ", name)
        print("Release year : ", year[1:5])
        print("IMDB rating : ", imdb)
        print("Vote count : ", vote)
        print("Metascore rating :", m_score)

# Finally, export the collected information to a CSV file (openable in Excel)
test_df = pd.DataFrame({'movie': names,
                        'year': years,
                        'imdb': imdb_ratings,
                        'metascore': metascores,
                        'votes': votes})
print(test_df.info())
test_df.to_csv('movie.ratings.csv')
```

----------------------------------------------------
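As the printouts above show, the scraped `year` values still contain parentheses such as `(2017)`. An optional follow-up sketch of how the saved CSV could be loaded back and tidied with pandas; the file name matches the `to_csv` call above, while the cleaning steps themselves are only a suggestion:

```python
import pandas as pd

df = pd.read_csv('movie.ratings.csv', index_col=0)

# Keep only the 4-digit year inside the parentheses and convert it to an integer column
df['year'] = df['year'].str.extract(r'(\d{4})', expand=False).astype(int)

# Show the highest-rated movies first
print(df.sort_values('imdb', ascending=False).head())
```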
# VS Code: crawling and downloading images

```python
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json

PTT_URL = 'https://www.ptt.cc'

# Function that fetches a web page
def get_web_page(url):
    time.sleep(0.5)   # pause 0.5 s before each request so PTT does not flag us as an abusive crawler
    resp = requests.get(
        url=url,
        cookies={'over18': '1'}
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text


def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html.parser')

    # Get the link to the previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    articles = []   # store the articles collected from this page
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        if d.find('div', 'date').string.strip() == date:   # the post date matches
            # Get the push (upvote) count
            push_count = 0
            if d.find('div', 'nrec').string:
                try:
                    push_count = int(d.find('div', 'nrec').string)   # convert the string to a number
                except ValueError:
                    # If the conversion fails, do nothing; push_count stays 0
                    pass
            # Get the article link and title
            if d.find('a'):   # the hyperlink exists, so the article has not been deleted
                href = d.find('a')['href']
                title = d.find('a').string
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count
                })
    return articles, prev_url   # return this page's articles and the previous-page link


def parse(dom):
    soup = BeautifulSoup(dom, 'html.parser')
    links = soup.find(id='main-content').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
            img_urls.append(link['href'])
    return img_urls


def save(img_urls, title):
    if img_urls:
        try:
            dname = title.strip()   # strip() removes leading/trailing whitespace from the title
            os.makedirs(dname)
            for img_url in img_urls:
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'
                fname = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, os.path.join(dname, fname))
        except Exception as e:
            print(e)


if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
    if current_page:
        articles = []   # all of today's articles
        date = time.strftime("%m/%d").lstrip('0')   # today's date, leading '0' removed to match PTT's format
        current_articles, prev_url = get_articles(current_page, date)   # today's articles on the current page
        while current_articles:   # while the current page has articles from today, collect them and keep walking back
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, date)

        # The article list is ready; visit each article and collect its images
        for article in articles:
            print('Processing', article)
            page = get_web_page(PTT_URL + article['href'])
            if page:
                img_urls = parse(page)
                save(img_urls, article['title'])
                article['num_image'] = len(img_urls)

        # Save the article information
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
```

***Open VS Code: create a folder on the desktop, load that folder in VS Code, create a file named XXXX.py inside it, and paste the code above (red box in the figure below).***

![](https://i.imgur.com/tDXCjRC.png)

The red box in the figure above marks the adult-content (over-18) check required to enter PTT; the value 1 shown in the pink box is the code sent for that permission.

![](https://i.imgur.com/osUbFza.png)

------------------------

![](https://i.imgur.com/78DbgyO.jpg)

Finally, the board name `Beauty` on line 86 of the program (the `/bbs/Beauty/index.html` URL in the `__main__` block) can be changed to any other board. The yellow box explains how to look up each board's English name on the site; once it is filled in, the script downloads every image from that board (blue box).

![](https://i.imgur.com/gW1yPsH.png)

What it looks like after running:

![](https://i.imgur.com/49D97K9.png)
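Since the board name `Beauty` can be swapped for any other board, one optional variation (not part of the class code) is to read the board name from the command line in the `__main__` block instead of hard-coding it:

```python
# Hypothetical tweak to the __main__ block: run as `python XXXX.py Gossiping`;
# falls back to 'Beauty' when no argument is given.
import sys

board = sys.argv[1] if len(sys.argv) > 1 else 'Beauty'
current_page = get_web_page(PTT_URL + '/bbs/{}/index.html'.format(board))
```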