**0826 Python Class Notes**
startswith
endswith
find/"findall→字串不能"
rfind
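A minimal sketch (the sample string is made up) of how these string methods behave:
s = "hello.py"
print(s.startswith("hello"))   # True
print(s.endswith(".py"))       # True
print(s.find("l"))             # 2 -- index of the first match, or -1 if not found
print(s.rfind("l"))            # 3 -- index of the last match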
# ***Creating, Reading and Writing Files***
import os
mydir = 'testch14'
# Create the folder if mydir does not exist
if os.path.exists(mydir):
    print("%s already exists" % mydir)
else:
    os.mkdir(mydir)
    print("Folder %s created successfully" % mydir)
date = input('Enter the date: ')
event = input('Enter the event: ')
description = input('Enter your notes: ')
fn = "testch14/note.txt"
with open(fn, 'w') as file_obj:
    file_obj.write(date + '\n')
    file_obj.write(event + '\n')
    file_obj.write(description)
with open(fn) as file_Obj:        # open with the default mode='r'; returns the file object file_Obj
    obj_list = file_Obj.read()    # read() returns the entire file contents as one string
print(obj_list)
1. %s string, %d integer, %f floating-point number
2. File modes: r read / w write (overwrites existing content) / a append / r+ / w+ / a+
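A small sketch of the 'a' (append) mode and of reading a file back line by line, reusing the note.txt file created above:
fn = "testch14/note.txt"
with open(fn, 'a') as file_obj:      # 'a' appends to the end instead of overwriting
    file_obj.write('\nextra note')
with open(fn) as file_obj:           # default mode is 'r'
    for line in file_obj:            # iterating a file object yields one line at a time
        print(line.rstrip())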
# ***Environment Verification Commands***
#Import some packages to use
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#To see our directory
import os
import random
import gc #Garbage collector for cleaning deleted data from memory
* Setting up the environment
In Anaconda, go to Environments, left-click the play icon next to the environment, choose the first option "Open Terminal", and enter the commands below,
copying and pasting them one at a time.
pip install tensorflow
pip install keras
pip install opencv-python
pip install pandas
pip install Pillow
pip install seaborn
pip install -U scikit-learn scipy matplotlib
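To double-check that the installs succeeded, a quick sketch that prints each package's version (these packages all expose a __version__ attribute):
import tensorflow, keras, cv2, numpy, pandas, sklearn
for pkg in (tensorflow, keras, cv2, numpy, pandas, sklearn):
    print(pkg.__name__, pkg.__version__)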
# **Image Training**
First, unzip the dog/cat dataset files.
train_dir = 'C:/Users/USER/train'
test_dir = 'C:/Users/USER/test1'
train_dogs = ['C:/Users/USER/train/{}'.format(i) for i in os.listdir(train_dir) if 'dog' in i] #get dog images
train_cats = ['C:/Users/USER/train/{}'.format(i) for i in os.listdir(train_dir) if 'cat' in i] #get cat images
test_imgs = ['C:/Users/USER/test1/{}'.format(i) for i in os.listdir(test_dir)] # os.listdir() lists every image file in the unzipped folder; above we kept only the names containing 'dog' or 'cat'
train_imgs = train_dogs[:2000] + train_cats[:2000] # slice the dataset and use 2000 in each class
random.shuffle(train_imgs) # shuffle it randomly
del train_dogs
del train_cats
gc.collect() #collect garbage to save memory
*# PS: always double-check the paths*
----------------------------------------以上
# Program 1
nrows = 150
ncolumns = 150
channels = 3 #change to 1 if you want to use grayscale image
#A function to read and process the images to an acceptable format for our model
def read_and_process_image(list_of_images):
    """
    Returns two arrays:
        X is an array of resized images
        y is an array of labels
    """
    X = [] # images
    y = [] # labels
    for image in list_of_images:
        X.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR), (nrows,ncolumns), interpolation=cv2.INTER_CUBIC)) #Read the image
        #get the labels
        if 'dog' in image:
            y.append(1)
        elif 'cat' in image:
            y.append(0)
    return X, y
--------------------------------------以上
# Program 2
X, y = read_and_process_image(train_imgs)
# X is now a list of arrays of pixel values and y is the list of labels. Let's preview the first image in X.
X[0]
----------------------------------------以上
# Program 3
# 1 and 0 stand for dog and cat respectively. We plot the first 5 arrays in X. We cannot use matplotlib.image's mpimg module to draw the images in X,
# because these are pixel arrays rather than the original jpg files, so we use the imshow() command instead.
#Lets view some of the pics
plt.figure(figsize=(20,10))
columns = 5
for i in range(columns):
    plt.subplot(5 // columns + 1, columns, i + 1)   # integer division so subplot() receives whole numbers
    plt.imshow(X[i])
----------------------------------------以上
# Program 4
import seaborn as sns
del train_imgs
gc.collect()
#Convert list to numpy array
X = np.array(X)
y = np.array(y)
#Let's plot the labels to be sure we have just two classes
sns.countplot(y)
plt.title('Labels for Cats and Dogs')
----------------------------------------以上
# Program 5
# Check the shape of the data. Always check and confirm the shape (dimensions) of your data -- this is very important.
print("Shape of train images is:", X.shape)
print("Shape of labels is:", y.shape)
----------------------------------------以上
# Program 6
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=2)
print("Shape of train images is:", X_train.shape)
print("Shape of validation images is:", X_val.shape)
print("Shape of labels is:", y_train.shape)
print("Shape of labels is:", y_val.shape)
----------------------------------------以上
# Program 7
del X
del y
gc.collect()
#get the length of the train and validation data
ntrain = len(X_train)
nval = len(X_val)
#We will use a batch size of 32. Note: the batch size should be a power of 2. ***4, 8, 16, 32, 64...***
batch_size = 32
----------------------------------------以上
# Program 8
from keras import layers
from keras import models
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu',input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dropout(0.2)) #Dropout for regularization
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid')) #Sigmoid function at the end because we have just two classes

----------------------------------------以上
# Program 9
model.summary()
----------------------------------------以上
# Program 10
sgd = optimizers.SGD(lr=0.01, decay=1e-4, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['acc'])
----------------------------------------以上
# Program 11
train_datagen = ImageDataGenerator(rescale=1./255, #Scale the image between 0 and 1
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,)
val_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale
----------------------------------------以上
# Program 12
train_generator = train_datagen.flow(X_train, y_train, batch_size=batch_size)
val_generator = val_datagen.flow(X_val, y_val, batch_size=batch_size)
----------------------------------------以上
# Program 13
history =model.fit_generator(train_generator,
steps_per_epoch=ntrain // batch_size,
epochs=64,
validation_data=val_generator,
validation_steps=nval // batch_size)
----------------------------------------以上
# Program 14
# Save our model using the simple Keras calls shown below, so we can reuse it at any time instead of training again every time we rerun the notebook.
#Save the model
model.save_weights('model_weights.h5')
model.save('model_keras.h5')
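To reuse the saved model later without retraining, a minimal sketch of loading it back (assuming the model_keras.h5 file written above exists):
from keras.models import load_model
model = load_model('model_keras.h5')   # restores both the architecture and the trained weights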
#Let's plot the training and validation curves
#get the details from the history object
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
#Train and validation accuracy
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and Validation accuracy')
plt.legend()
plt.figure()
#Train and validation loss
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
#Now let's predict on the first 10 images of the test set
X_test, y_test = read_and_process_image(test_imgs[0:10]) #Y_test in this case will be empty.
x = np.array(X_test)
test_datagen = ImageDataGenerator(rescale=1./255)
# Program 15
# Create a list to hold the labels we are about to generate.
# Set the figure size for the images we want to plot.
# Here we call the .predict() method on the trained model to make a prediction for each image supplied by the ImageDataGenerator.
# The pred variable holds the model's confidence -- the probability that the current image is a dog.
# Since we labelled dogs as 1, a high probability (above the 0.5 threshold) means the model is confident the image is a dog; otherwise it is a cat.
# So we simply write an if-else statement: if the probability is greater than 0.5 we append the string 'dog', otherwise 'cat', to text_labels.
# We do this so we can add a title to each image when plotting.
# We add a subplot so that we can draw several images in one figure.
# The predicted class is added as the title of each image plot.
i = 0
text_labels = []
plt.figure(figsize=(30,20))
for batch in test_datagen.flow(x, batch_size=1):
    pred = model.predict(batch)
    if pred > 0.5:
        text_labels.append('dog')
    else:
        text_labels.append('cat')
    plt.subplot(5 // columns + 1, columns, i + 1)   # integer division so subplot() receives whole numbers
    plt.title('This is a ' + text_labels[i])
    imgplot = plt.imshow(batch[0])
    i += 1
    if i % 10 == 0:
        break
plt.show()
# VS Code
Localization: in the Extensions panel, search for "Language", install the Traditional Chinese language pack, then restart VS Code.
Open your working folder and run any file to confirm that everything works.
# Identifying Different Vehicles (Labeling)

Collect the images.
File formats: png and jpg
Results
Downgrade the version if needed: open a terminal and enter pip install scipy==1.0.0
# Web Scraping 8/30
**Building a Search Page**
"<HTML>" "</HTML>"
"HEAD"跳轉 列新css
"TITLE" "/TITLE"標題
"BODY"內文
"form"表單
"h1~h6"標題之字大小
"p"段落
"br"換行
"table"表格
"tr"表格列
"td"表格行
"input"輸入內文
"div"標籤
自適應網頁 RWD
"ul"項目符號
* Regular expression syntax (for the re module):
#\d one digit
#\D one non-digit
#\w one alphanumeric character
#\W one non-alphanumeric character
#\s one whitespace character
#\S one non-whitespace character
#\b a word boundary (between \w and \W, in either order)
#\B a non-word boundary
#a | b a or b
#. any character except \n
#^ start of the source string
#$ end of the source string
#prev ? zero or one prev
#prev * zero or more prev, as many as possible
#prev *? zero or more prev, as few as possible
#prev + one or more prev, as many as possible
#prev +? one or more prev, as few as possible
#prev {m} exactly m consecutive prev
#prev {m , n} m to n consecutive prev, as many as possible
#prev {m , n}? m to n consecutive prev, as few as possible
#[ abc ] a or b or c
#[ ^abc ] not (a or b or c)
#prev (?= next) prev, only if it is followed by next
#prev (?! next) prev, only if it is not followed by next
#(?<= prev) next matches next only if it is preceded by prev
#(?<! prev) next matches next only if it is not preceded by prev
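A short sketch (the sample text is invented) exercising a few of these patterns with the re module:
import re
text = "order A12, order B7, order A300"
print(re.findall(r'\d+', text))            # ['12', '7', '300']
print(re.findall(r'A\d{2,3}', text))       # ['A12', 'A300']
print(re.search(r'^order', text).group())  # 'order' -- anchored to the start of the string
print(re.sub(r'\s+', ' ', 'too   many   spaces'))  # 'too many spaces'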
# Fetching Web Page Content
from urllib.request import urlopen
html = urlopen("http://pythonscraping.com/pages/page1.html")
print(html.read())   # use html.read() to get the page's HTML content
1. Setting up the environment
In Anaconda, go to Environments, left-click the play icon, choose the first option "Open Terminal", and copy and paste the command below.
pip install beautifulsoup4
2. Program
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read())
print(bsObj.h1)   # the h1 content extracted from the page: HTML → body → h1
bsObj.h1
--------------------------------------以上
* Finding a string (the title) on a page
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title == None:
print("Title could not be found")
else:
print(title)
-------------------------------------以上
find -- returns the first matching item on the page
findAll -- returns every matching item on the page

t = bsObj.findAll({"h1","h2"})
t
--------------------------------------以上
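A minimal sketch, reusing the page1.html URL from above, showing the difference in what the two calls return:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
bsObj = BeautifulSoup(html.read(), "html.parser")
print(bsObj.find("h1"))      # the first matching tag (or None if there is no match)
print(bsObj.findAll("h1"))   # a list of every matching tag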
type(XXXX) → shows a value's data type in Python
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html)
# with the BeautifulSoup object we can use findAll to extract only the text wrapped in <span> tags whose class is "green" or "red"
nameList = bsObj.findAll("span", {"class":{"green","red"}})
print(type(nameList))
for name in nameList:
    print(name.get_text())
----------------------以上
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('https://zh.wikipedia.org/wiki/%E5%87%AF%E6%96%87%C2%B7%E8%B4%9D%E8%82%AF').read().decode('utf-8')
bsObj = BeautifulSoup(html)
# with the BeautifulSoup object we can use findAll to extract only the hyperlinks
nameList = bsObj.findAll("a")
# nameList = bsObj.findAll({"a":"href"})   # alternative form; the notes say both ways work
print(type(nameList))
for name in nameList:
    print(name.get_text())
------------------------------------------------以上
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("https://morvanzhou.github.io/static/scraping/list.html")
# with the BeautifulSoup object we can use findAll to extract the <li> elements whose class is "month"
bsObj = BeautifulSoup(html)
nameList = bsObj.findAll("li",{"class":"month"})   # find the months
print(type(nameList))
for name in nameList:
    print(name.get_text())
---------------------------------------------------以上
import bs4,requests
url = "https://www.taiwanlottery.com.tw/"
html = requests.get(url)
print("網頁下載中...")
html.raise_for_status() #驗證網頁是否下在成功
print("網頁200正常.........")
objSoup = bs4.BeautifulSoup(html.text, "html.parser") #建立BeautifulSoup物件
#print(type(objSoup))
dataTag = objSoup.select(" .contents_box02") #尋找class是.contents_box02
print("串列長度",len(dataTag))
#print(dataTag)
for i in range(len(dataTag)): #列出含.contents_box02的串列長度
print(dataTag[i])
---------------------------------------------------以上
import bs4,requests
url = "https://www.taiwanlottery.com.tw/"
html = requests.get(url)
print("網頁下載中...")
html.raise_for_status() #驗證網頁是否下在成功
print("網頁200正常.........")
objSoup = bs4.BeautifulSoup(html.text, "html.parser") #建立BeautifulSoup物件
#print(type(objSoup))
dataTag = objSoup.select(" .contents_box02") #尋找class是.contents_box02
print("串列長度",len(dataTag))
#print(dataTag)
for i in range(len(dataTag)): #列出含.contents_box02的串列長度
print(dataTag[i])
#找尋開出順序與大小順序的球
balls = dataTag[2].find_all('div', {'class':'ball_tx ball_yellow'})
balls1 = dataTag[0].find_all('div', {'class':'ball_tx ball_green'})
print("開出順序 : ", end='')
for i in range(6): # 前6球是開出順序
print(balls[i].text, end=' ')
print("\n大小順序 : ", end='')
for i in range(6,len(balls)): # 第7球以後是大小順序
print(balls[i].text, end=' ')
#找出第二區的紅球
redball = dataTag[2].find_all('div', {'class':'ball_red'})
print("\n第二區 :", redball[0].text)
print("開出順序 : ", end='')
for i in range(6): # 前6球是開出順序
print(balls1[i].text, end=' ')
print("\n大小順序 : ", end='')
for i in range(6,len(balls)): # 第7球以後是大小順序
print(balls1[i].text, end=' ')
#找出第二區的紅球
redball = dataTag[0].find_all('div', {'class':'ball_red'})
print("\n第二區 :", redball[0].text)
------------------------------以上
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time
Confirm that the libraries are installed.
----------------------------------以上
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
response = get(url)
print(response.text[:500])
html_soup = BeautifulSoup(response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
type(html_soup)
print(type(movie_containers))
print(len(movie_containers))
first_movie = movie_containers[0]   # the first movie's container
print("Movie title : ", first_movie.h3.a.text)
first_mscore = first_movie.find('span', class_ = 'metascore favorable')
print(type(first_mscore))
first_mscore = int(first_mscore.text)
print("Metascore rating :", first_mscore)
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')
first_year = first_year.text
print("Release year : ", first_year[1:5])
first_imdb = float(first_movie.strong.text)
print("IMDB rating : ", first_imdb)
first_votes = first_movie.find('span', attrs = {'name':'nv'})
print(first_votes.text)
print("Number of votes : " + first_votes['data-value'])   #int
To select by a span's class attribute, pass class_; to select by its name attribute, pass attrs.
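A small sketch (the HTML fragment is invented) showing both lookup styles side by side:
from bs4 import BeautifulSoup
snippet = '<span class="metascore favorable">81</span><span name="nv" data-value="12345">12,345</span>'
soup = BeautifulSoup(snippet, 'html.parser')
print(soup.find('span', class_='metascore favorable').text)    # 81
print(soup.find('span', attrs={'name': 'nv'})['data-value'])   # 12345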
--------------------------------------以上
# Aggregating the Scraped Page Data
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Extract data from individual movie container
for container in movie_containers:
    # extract the data points of interest only when the container has a Metascore:
    if container.find('div', class_ = 'ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))
        print(" ")
        print("Movie title : ", name)
        print("Release year : ", year[1:5])
        print("IMDB rating : ", imdb)
        print("Number of votes : ", vote)   #int
        print("Metascore rating :", m_score)
# Finally, export the collected data to a CSV file (which can be opened in Excel)
test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes})
print(test_df.info())
test_df.to_csv('movie.ratings.csv')
----------------------------------------------------以上
# VS Code: Scraping and Downloading Images
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = 'https://www.ptt.cc'
# function that fetches a web page
def get_web_page(url):
    time.sleep(0.5)   # pause 0.5 s before each request so the PTT site does not flag us as an aggressive crawler
    resp = requests.get(
        url=url,
        cookies={'over18': '1'}
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    else:
        return resp.text
def get_articles(dom, date):
    soup = BeautifulSoup(dom, 'html.parser')
    # get the link to the previous page
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']
    articles = []   # store the collected article data
    divs = soup.find_all('div', 'r-ent')
    for d in divs:
        if d.find('div', 'date').string.strip() == date:   # the post date matches
            # get the push (upvote) count
            push_count = 0
            if d.find('div', 'nrec').string:
                try:
                    push_count = int(d.find('div', 'nrec').string)   # convert the string to a number
                except ValueError:   # if the conversion fails, do nothing; push_count stays 0
                    pass
            # get the article link and title
            if d.find('a'):   # a hyperlink exists, so the article still exists (was not deleted)
                href = d.find('a')['href']
                title = d.find('a').string
                articles.append({
                    'title': title,
                    'href': href,
                    'push_count': push_count
                })
    return articles, prev_url   # return this page's articles and the previous page's URL
def parse(dom):
    soup = BeautifulSoup(dom, 'html.parser')
    links = soup.find(id='main-content').find_all('a')
    img_urls = []
    for link in links:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', link['href']):
            img_urls.append(link['href'])
    return img_urls
def save(img_urls, title):
    if img_urls:
        try:
            dname = title.strip()   # strip() removes leading and trailing whitespace from the string
            os.makedirs(dname)
            for img_url in img_urls:
                if img_url.split('//')[1].startswith('m.'):
                    img_url = img_url.replace('//m.', '//i.')
                if not img_url.split('//')[1].startswith('i.'):
                    img_url = img_url.split('//')[0] + '//i.' + img_url.split('//')[1]
                if not img_url.endswith('.jpg'):
                    img_url += '.jpg'
                fname = img_url.split('/')[-1]
                urllib.request.urlretrieve(img_url, os.path.join(dname, fname))
        except Exception as e:
            print(e)
if __name__ == '__main__':
    current_page = get_web_page(PTT_URL + '/bbs/Beauty/index.html')
    if current_page:
        articles = []   # all of today's articles
        date = time.strftime("%m/%d").lstrip('0')   # today's date, with the leading '0' stripped to match PTT's format
        current_articles, prev_url = get_articles(current_page, date)   # today's articles on the current page
        while current_articles:   # if the current page has articles from today, add them and go back one page to look for more
            articles += current_articles
            current_page = get_web_page(PTT_URL + prev_url)
            current_articles, prev_url = get_articles(current_page, date)
        # the article list is ready; now visit each article and read its images
        for article in articles:
            print('Processing', article)
            page = get_web_page(PTT_URL + article['href'])
            if page:
                img_urls = parse(page)
                save(img_urls, article['title'])
                article['num_image'] = len(img_urls)
        # save the article metadata
        with open('data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)
***Open VS Code: create a folder on the desktop → load the folder into VS Code → create a file named XXXX.py inside it → paste in the code above.***

The over18 cookie set to '1' in get_web_page() is the age-restriction permission needed to enter PTT; it corresponds to entering the code 1 on the site's age-confirmation page.

------------------------

Finally, the board name Beauty in PTT_URL + '/bbs/Beauty/index.html' (in the __main__ block) can be changed freely: look up a board's English name on the PTT site, fill it in, and the script will download every image file from that board.
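If you want to make the board configurable, a minimal sketch (BOARD is a made-up variable name):
BOARD = 'Beauty'   # change to any board's English name
current_page = get_web_page(PTT_URL + '/bbs/{}/index.html'.format(BOARD))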

What the output looks like when the script runs (screenshot not included in these notes).
