numpy & pandas 筆記 (1)

# numpy & pandas 筆記 (1) **本篇資料來源為莫煩 python:** https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/ 安裝： `pip3 install numpy` `pip3 install pandas` ## numpy array - 基本用法 array宣告, dimension, shape, size ```python=1 import numpy as np array = np.array([[1,2,3], [4,5,6]]) print(array) print('number of dim:',array.ndim) print('shape',array.shape) print('size:',array.size) ''' output [[1 2 3] [4 5 6]] ('number of dim:', 2) ('shape', (2, 3)) ('size:', 6) ''' ``` - 定義資料格式 ```python=1 # 使用 dtype 定義資料格式 (註: np.int 與 np.float 自 NumPy 1.20 起棄用、1.24 起移除，新版請改用 int / float 或 np.int64 / np.float64) a = np.array([2,33,4],dtype=np.int) print(a) # [ 2 33 4] print(a.dtype) # int64 a = np.array([2,33,4],dtype=np.int32) print(a.dtype) # int32 a = np.array([2,33,4],dtype=np.float) print(a.dtype) # float64 a = np.array([2,33,4],dtype=np.float32) print(a.dtype) # float32 ``` - array 的各種宣告方法 ```python=1 # 宣告全部是 0 的 array zeros = np.zeros( (3,4) ) print(zeros) '''output [[ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.]] ''' # 宣告全部是 1 的 array 同樣可以使用 dtype 定義資料型態 ones = np.ones( (4,3), dtype=np.int16 ) print(ones) '''output [[1 1 1] [1 1 1] [1 1 1] [1 1 1]] ''' # 宣告 empty 會得到一個未初始化的 array (內容是記憶體中的殘值，常常接近 0，但不保證) empty = np.empty((3,4)) print(empty) ''' output [[ 0.00000000e+000 4.94065646e-324 9.88131292e-324 1.48219694e-323] [ 1.97626258e-323 2.47032823e-323 2.96439388e-323 3.45845952e-323] [ 3.95252517e-323 4.44659081e-323 4.94065646e-323 5.43472210e-323]] ''' # 宣告有序陣列 a = np.arange(10,20,2) print(a) # [10 12 14 16 18] # 宣告 0 ~ 11 的陣列，形狀為 3x4 a = np.arange(12).reshape( (3,4) ) print(a) '''output [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] ''' # 宣告一個等分的線段 array a = np.linspace(1,10,5) # 起點 1 終點 10，等距取 5 個點 (含起點與終點) print(a) # [ 1. 3.25 5.5 7.75 10. ] # 宣告等分的線段 array 並且排列成 2x3 a = np.linspace(1,10,6).reshape(2,3) # 起點 1 終點 10，等距取 6 個點 (含起點與終點) print(a) '''output [[ 1. 2.8 4.6] [ 6.4 8.2 10. 
]] ''' ``` ## numpy 矩陣 (array) 運算 :::info 矩陣當中，到底行，列的方向是什麼？英文: - row: 橫的 - column: 直的中文: - 台灣: 直行橫列 - row: 列 - column: 行 - 大陸: 直列橫行 - row: 行 - column: 列參考資料: https://tw.answers.yahoo.com/question/index?qid=20051014000014KK01201 ::: 由於中文實在是太混亂了，以下我只要解釋到行列相關的一律採用英文 row, column - 對於矩陣中所有元素運算 ```python=1 # coding=utf-8 import numpy as np a = np.array([10,20,30,40]) b = np.arange(4) print(a,b) # [10 20 30 40] [0 1 2 3] # 對於每個元素取平方 c = b**2 print(c) # [0 1 4 9] s = 10*np.sin(a) print(s) # [-5.44021111 9.12945251 -9.88031624 7.4511316 ] # 比較運算 print(b) # [0 1 2 3] print(b<3) # [ True True True False] print(b==3) # [False False False True] ``` - 矩陣與矩陣之間的運算 ```python=1 # coding=utf-8 import numpy as np a = np.array([[1,1], [0,1]]) b = np.arange(4).reshape((2,2)) print(a) # [[1 1] # [0 1]] print(b) #[[0 1] # [2 3]] # 逐一相乘 print(a*b) # [[0 1] # [0 3]] # 矩陣乘法 (以下兩種寫法是相等的) print(np.dot(a,b)) print(a.dot(b)) #[[2 4] # [2 3]] ``` - 找矩陣當中的總和，最大值，最小值 ```python=1 # coding=utf-8 import numpy as np # 宣告隨機生成的 array a = np.random.random((2,4)) print(a) # [[ 0.44842423 0.78949907 0.8370462 0.44611223] # [ 0.14324946 0.26730942 0.45553012 0.20880903]] # 求矩陣元素總和 print(np.sum(a)) # 3.59597976167 # 求矩陣元素最小值 print(np.min(a)) # 0.14324946148 # 求矩陣元素最大值 print(np.max(a)) # 0.837046196628 ``` - 使用 axis 針對矩陣中的行或列找總和，最大值，最小值 ```python=1 # coding=utf-8 import numpy as np # 使用 axis 針對矩陣中的 row, column 做運算 a = np.array([[1,2,3], [9,0,6]]) # 求矩陣 row 元素總和 print(np.sum(a,axis=1)) # [ 6 15] # 1 + 2 + 3 = 6 # 9 + 0 + 6 = 15 # 求矩陣 column 元素最小值 print(np.min(a,axis=0)) # [1 0 3] # min(1,9) = 1 # min(2,0) = 0 # min(3,6) = 3 # 求矩陣 row 元素最大值 print(np.max(a,axis=1)) # [3 9] # max(1,2,3) = 3 # max(9,0,6) = 9 ``` - 計算矩陣的平均值與中位數 ```python=1 #coding=utf-8 import numpy as np a = np.arange(2,14).reshape(3,4) print(a) # [[ 2 3 4 5] # [ 6 7 8 9] # [10 11 12 13]] # 平均值 print(np.mean(a)) # 7.5 print(a.mean()) # 7.5 print(np.average(a)) # 7.5 # 搭配 axis 可以得到： # axis = 0: 針對逐 column 的平均值 print(np.mean(a,axis=0)) # [ 6. 7. 8. 
9.] ''' mean of (2,6,10) = 6 mean of (3,7,11) = 7 mean of (4,8,12) = 8 mean of (5,9,13) = 9 ''' # axis = 1: 針對逐 row 的平均值 print(np.mean(a,axis=1)) # [ 3.5 7.5 11.5] ''' mean of (2,3,4,5) = 3.5 mean of (6,7,8,9) = 7.5 mean of (10,11,12,13) = 11.5 ''' # 中位數 print(np.median(a)) # 7.5 ``` - 逐步累加 (cumsum) 與逐步差 (diff) ```python=1 #coding=utf-8 import numpy as np # 逐步累加 (a 沿用上一段範例的 3x4 矩陣: np.arange(2,14).reshape(3,4)) print(np.cumsum(a)) # [ 2 5 9 14 20 27 35 44 54 65 77 90] # 2 # 2 + 3 = 5 # 2 + 3 + 4 = 9 # 依此類推 # 逐步累差 print(np.diff(a)) ''' [[1 1 1] [1 1 1] [1 1 1]] ''' ``` - 矩陣的排序 sort 與 transpose ```python=1 #coding=utf-8 import numpy as np a = np.random.random((3,4)) print(a) ''' [[ 0.99821757 0.22533492 0.90690435 0.40504797] [ 0.73937987 0.776222 0.60485253 0.44401583] [ 0.33139018 0.14091604 0.81036999 0.74671883]] ''' print(np.sort(a)) ''' 效果：逐 row 排序 (每個 row 內由小到大) [[ 0.22533492 0.40504797 0.90690435 0.99821757] [ 0.44401583 0.60485253 0.73937987 0.776222 ] [ 0.14091604 0.33139018 0.74671883 0.81036999]] ''' # 矩陣的 transpose print(np.transpose(a)) ''' [[ 0.99821757 0.73937987 0.33139018] [ 0.22533492 0.776222 0.14091604] [ 0.90690435 0.60485253 0.81036999] [ 0.40504797 0.44401583 0.74671883]] ''' ``` - clip 功能簡介，可以拿來篩選過濾用 ```python=1 #coding=utf-8 import numpy as np a = np.arange(2,14).reshape(3,4) print(a) # [[ 2 3 4 5] # [ 6 7 8 9] # [10 11 12 13]] # clip 功能 print(np.clip(a,5,9)) # 意思：所有小於等於 5 的數都用 5 代替; 所有大於等於 9 的數都用 9 代替 ''' [[5 5 5 5] [6 7 8 9] [9 9 9 9]] ''' ``` ## numpy array 的 index (索引) - 找出矩陣當中最大最小值所在的位置 index (索引值): argmin, argmax ```python=1 #coding=utf-8 import numpy as np a = np.arange(2,14).reshape(3,4) print(a) # [[ 2 3 4 5] # [ 6 7 8 9] # [10 11 12 13]] # 最小值的索引 print(np.min(a)) # 2 最小值 print(np.argmin(a)) # 0 最小值的索引 # 最大值的索引 print(np.max(a)) # 13 最大值 print(np.argmax(a)) # 11 最大值的索引 ``` - **Numpy Array Indexing** ```python=1 #coding=utf-8 import numpy as np a = np.arange(3,15).reshape(3,4) print(a) ''' [[ 3 4 5 6] [ 7 8 9 10] [11 12 13 14]] ''' # 以下兩種寫法相等 print(a[2][1]) # 12 print(a[2,1]) # 12 # 指定印出使用 : 設定範圍 
print(a[:,:]) ''' [[ 3 4 5 6] [ 7 8 9 10] [11 12 13 14]] ''' print(a[1:,:]) ''' [[ 7 8 9 10] [11 12 13 14]] ''' print(a[0:2,:]) ''' [[ 3 4 5 6] [ 7 8 9 10]] ''' print(a[:,1:]) ''' [[ 4 5 6] [ 8 9 10] [12 13 14]] ''' print(a[:,1:3]) ''' [[ 4 5] [ 8 9] [12 13]] ''' ``` - **使用 for 與 transpose** ```python=44 # for 預設是可以迭代每一 row (橫向) for row in a: print(row) ''' [3 4 5 6] [ 7 8 9 10] [11 12 13 14] ''' # 使用 transpose 搭配 for 迭代印出每一 column (直向) for column in a.T: print(column) ''' [ 3 7 11] [ 4 8 12] [ 5 9 13] [ 6 10 14] ''' ``` - **使用 flatten 與 flat 把二維 array 攤平** ```python=62 # 使用 flatten (flat 是迭代版本) print(a.flatten()) # [ 3 4 5 6 7 8 9 10 11 12 13 14] # 使用 for loop 搭配 flat 將矩陣的每一個元素逐一印出 for item in a.flat: print(item) ''' 3 4 5 6 7 8 9 10 11 12 13 14 ''' ``` ## numpy array 的 merge (合併) - **使用 vertical stack : vstack 垂直合併兩個 array** ```python=1 # coding=utf-8 import numpy as np A = np.array([1,1,1]) B = np.array([2,2,2]) # vertical stack 垂直方向上下合併 C = np.vstack((A,B)) print(A.shape) # (3,) print(B.shape) # (3,) print(C.shape) # (2, 3) print(C) ''' [[1 1 1] [2 2 2]] ''' ``` - **使用 horizontal stack : hstack 水平合併兩個 array** ```python=1 # coding=utf-8 import numpy as np A = np.array([1,1,1]) B = np.array([2,2,2]) # horizontal stack 水平方向左右合併 D = np.hstack((A,B)) print(D.shape) # (6,) print(D) # [1 1 1 2 2 2] ``` - **使用 newaxis 做 array 的轉置與合併應用** ```python=1 # coding=utf-8 import numpy as np A = np.array([1,1,1]) print(A) # [1 1 1] print(A.transpose()) # [1,1,1] 無法使用 transpose 把一個橫向的數列變成縱向的數列 # 要做到這件事，要使用 new axis 增加維度 # 在 row 上面增加維度 print(A[np.newaxis,:].shape) # (1, 3) print(A[np.newaxis,:]) # [[1 1 1]] # 在 col 上面增加維度 print(A[:,np.newaxis].shape) # (3, 1) print(A[:,np.newaxis]) ''' [[1] [1] [1]] ''' # 應用：把兩個橫向的陣列，轉為縱向的，然後合併 A = np.array([1,1,1])[:,np.newaxis] # 橫向陣列轉縱向矩陣 B = np.array([2,2,2])[:,np.newaxis] # 橫向陣列轉縱向矩陣 C = np.vstack((A,B)) # 垂直合併 D = np.hstack((A,B)) # 水平合併 print(A.shape) # (3,1) print(B.shape) # (3,1) print(C.shape) # (6,1) print(C) ''' [[1] [1] [1] [2] [2] 
[2]] ''' print(D.shape) # (3,2) print(D) ''' [[1 2] [1 2] [1 2]] ''' ``` ***補充說明：陣列與矩陣轉換*** ```python=1 # coding=utf-8 import numpy as np A = np.array([1,1,1]) B = np.array([2,2,2]) print(A.shape) # (3,) print(A) # [1 1 1] print(B.shape) # (3,) print(B) # [2 2 2] A = np.array([1,1,1])[np.newaxis,:] # 橫向陣列轉橫向矩陣 B = np.array([2,2,2])[np.newaxis,:] # 橫向陣列轉橫向矩陣 print(A.shape) # (1,3) print(A) # [[1 1 1]] print(B.shape) # (1,3) print(B) # [[2 2 2]] ``` - **使用 concatenate 做多個 Array 合併** ```python=1 # coding=utf-8 import numpy as np A = np.array([1,1,1]) B = np.array([2,2,2]) # 使用 concatenate 進行合併 E = np.concatenate((A,B,B,A)) print(E) # [1 1 1 2 2 2 2 2 2 1 1 1] F = np.concatenate((A,B,B,A), axis=0) # 合併 print(F) # [1 1 1 2 2 2 2 2 2 1 1 1] ``` ## numpy array 分割 - **使用 split 做等量分割** ```python=1 #coding=utf-8 import numpy as np A = np.arange(12).reshape(3,4) print(A) ''' [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] ''' # 縱向分割 split(輸入矩陣, 要分割的數量, axis=1 就是對 col 方向的分割) print(np.split(A, 2, axis=1)) ''' [array([[0, 1], [4, 5], [8, 9]]), array([[ 2, 3], [ 6, 7], [10, 11]])] ''' # 橫向分割 split(輸入矩陣, 要分割的數量, axis=0 就是 row 方向的分割) print(np.split(A, 3, axis=0)) ''' [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])] ''' ``` - **使用 array_split 做不等量分割** ```python=1 #coding=utf-8 import numpy as np A = np.arange(12).reshape(3,4) print(A) ''' [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] ''' # p.s. 
split 分割只能分成等量分割，例如 3x4 col 方向的只能分成 2 份或 4 份，不可分成 3 份 # 所以下面這行會報錯 print(np.split(A, 3, axis=1)) # ValueError: array split does not result in an equal division # 如果要做不等量的分割的話要使用 array_split() print(np.array_split(A, 3, axis=1)) ''' col 分成了 2,1,1 這樣的份數 [array([[0, 1], [4, 5], [8, 9]]), array([[ 2], [ 6], [10]]), array([[ 3], [ 7], [11]])] ''' ``` - **使用 vsplit 與 hsplit 分割** ```python=1 #coding=utf-8 import numpy as np A = np.arange(12).reshape(3,4) print(A) ''' [[ 0 1 2 3] [ 4 5 6 7] [ 8 9 10 11]] ''' # 使用 vertical split (vsplit) 來做 axis=0 方向分割 print(np.vsplit(A, 3)) ''' [array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])] ''' # 使用 horizontal split (hsplit) 來做 axis=1 方向分割 print(np.hsplit(A, 2)) ''' [array([[0, 1], [4, 5], [8, 9]]), array([[ 2, 3], [ 6, 7], [10, 11]])] ''' ``` ## numpy array 複製 copy 與 deep copy - **copy by assignment** ```python=1 #coding=utf-8 import numpy as np a = np.arange(4) print(a) # [0 1 2 3] b = a c = a d = b a[0] = 11 # 修改 a print(b) # [11 1 2 3] 也會動到 b print(b is a) # True 因為 b = a 時 b 就是 a print(c) # [11 1 2 3] 同理 c 也會變 print(d) # [11 1 2 3] 因為 d 就是 b, b 就是 a ，所以改 a 也會動到 d d[1:3]=[22,33] # 所以修改 d 也會動到其他三個變數 print(a) # [11 22 33 3] print(b) # [11 22 33 3] print(c) # [11 22 33 3] ``` - **deep copy** ```python=1 #coding=utf-8 import numpy as np # 如果只想修改 a 但是不想要動到其他變數，那就要使用所謂的 deep copy a = np.arange(4) b = a.copy() # deep copy print(a) # [0 1 2 3] print(b) # [0 1 2 3] a[3] = 44 print(a) # [ 0 1 2 44] print(b) # [0 1 2 3] ```

Syntax	Example	Reference
# Header	Header	基本排版
- Unordered List	Unordered List
1. Ordered List	Ordered List
- [ ] Todo List	Todo List
> Blockquote	Blockquote
Bold font	Bold font
Italics font	Italics font
~~Strikethrough~~	~~Strikethrough~~
19^th^	19^th
H~2~O	H₂O
++Inserted text++	Inserted text
==Marked text==	Marked text
[link text](https:// "title")	Link
![image alt](https:// "title")	Image
`Code`	`Code`	在筆記中貼入程式碼
```javascript var i = 0; ```	`var i = 0;`
:smile:		Emoji list
{%youtube youtube_id %}	Externals
$L^aT_eX$	L^aT_eX
:::info This is a alert area. :::	This is a alert area.