# r08944039_hw2 :::success DataScience homework 2 Link : https://docs.google.com/presentation/d/1C-RXTnb-aOZWSszq1ndzqc1xAx6Nnay19S0JgI1zW-s/edit#slide=id.g6508903b32_0_0 ::: # Problem 1: (Extract data and handling of CSV files and RDB) 20% ### Q : 用什麼指令將 dataframe 寫進 MySQL Database? ```python engine =create_engine('mysql+mysqlconnector://usr:pwd@localhost/'+database_name) df.to_sql(name=database_name, con=engine, if_exists = 'replace', index=False) ``` ### Q : ```df.head()```? ![](https://i.imgur.com/zes8C2g.png) ### Q : ```df.describe()```? ![](https://i.imgur.com/hfqxxiq.png) #### Source Code ```python import requests import pandas as pd import io from sqlalchemy import create_engine def getRawData(date): stock_url = "https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=csv&date=" + date + "&stockNo=2330" response = requests.get(stock_url) # first(include title) if(date == "20190601"): data = response.text.split("各日成交資訊\"")[1][2:].split("\"說明:\"")[0][:-3] # last(remove last) elif(date == "20191001"): data = response.text.split("成交筆數\",")[1][2:].split("\"108/10/14\"")[0][:-3] # middle else: data = response.text.split("成交筆數\",")[1][2:].split("\"說明:\"")[0][:-3] return data def getData(): dates = ["20190601","20190701","20190801","20190901","20191001"] overallData = "" for date in dates: rawData = getRawData(date) overallData += rawData overallData += ",\n" return overallData def getDf(overallData): df = pd.read_csv(io.StringIO(overallData)) df = df.loc[:, ~df.columns.str.contains('^Unnamed')] for column in df: if column in ["成交股數", "成交金額", "成交筆數"]: i = 0 for each in df[column]: df[column][i] = df[column][i].replace(',','') i += 1 return df def writeDb(df, database_name): engine = create_engine('mysql+mysqlconnector://root:ndjkfnRWV23@localhost/'+database_name) df.to_sql(name=database_name, con=engine, if_exists = 'replace', index=False) return engine def readDb(database_name, engine): data = pd.read_sql('SELECT * FROM '+database_name, engine) return data def main(): database_name = "database_test_3" cleanData = getData() df = getDf(cleanData) print(df) engine = writeDb(df, database_name) data = readDb(database_name, engine) if __name__== "__main__": main() ``` ---------------------------------- # Problem 2: (Basic calculation) 20% ## A: From your python program issue SQL statement to find out: ### Total and average trade amount in this time period(成交金額) ### Total and average trade volume in this period (交易股數) ![](https://i.imgur.com/hdYcFWe.png) ### Python Code #### Highlight ```python def sumDb(database_name, engine, column): return pd.read_sql('SELECT SUM('+ column +') FROM '+database_name, engine) def avgDb(database_name, engine, column): return pd.read_sql('SELECT AVG('+ column +') FROM '+database_name, engine) sumMoney = sumDb(database_name, engine, "成交金額") sumStock = sumDb(database_name, engine, "成交股數") avgMoney = avgDb(database_name, engine, "成交金額") avgStock = avgDb(database_name, engine, "成交股數") ``` #### Full Code ```python import requests import pandas as pd import io from sqlalchemy import create_engine def getRawData(date): stock_url = "https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=csv&date=" + date + "&stockNo=2330" response = requests.get(stock_url) # first(include title) if(date == "20190601"): data = response.text.split("各日成交資訊\"")[1][2:].split("\"說明:\"")[0][:-3] # last(remove last) elif(date == "20191001"): data = response.text.split("成交筆數\",")[1][2:].split("\"108/10/14\"")[0][:-3] # middle else: data = response.text.split("成交筆數\",")[1][2:].split("\"說明:\"")[0][:-3] return data def getData(): dates = ["20190601","20190701","20190801","20190801","20191001"] overallData = "" for date in dates: rawData = getRawData(date) overallData += rawData overallData += ",\n" return overallData def getDf(overallData): df = pd.read_csv(io.StringIO(overallData)) df = df.loc[:, ~df.columns.str.contains('^Unnamed')] for column in df: if column in ["成交股數", "成交金額", "成交筆數"]: i = 0 for each in df[column]: df[column][i] = df[column][i].replace(',','') i += 1 return df def writeDb(df, database_name): engine = create_engine('mysql+mysqlconnector://root:ndjkfnRWV23@localhost/'+database_name) df.to_sql(name=database_name, con=engine, if_exists = 'replace', index=False) return engine def readDb(database_name, engine): return pd.read_sql('SELECT * FROM '+database_name, engine) def sumDb(database_name, engine, column): return pd.read_sql('SELECT SUM('+ column +') FROM '+database_name, engine) def avgDb(database_name, engine, column): return pd.read_sql('SELECT AVG('+ column +') FROM '+database_name, engine) def main(): database_name = "database_test_3" cleanData = getData() df = getDf(cleanData) engine = writeDb(df, database_name) data = readDb(database_name, engine) sumMoney = sumDb(database_name, engine, "成交金額") sumStock = sumDb(database_name, engine, "成交股數") avgMoney = avgDb(database_name, engine, "成交金額") avgStock = avgDb(database_name, engine, "成交股數") print("Total 成交金額 = ", sumMoney) print("Toal 成交股數 = ", sumStock) print("Average 成交金額= ", avgMoney) print("Average 成交股數 = ", avgStock) if __name__== "__main__": main() ``` ## B: Use pandas to calculate ### 5-day moving average of daily closing price. ==__```df['收盤價'].rolling(5).mean()```__== +-----------+-----------+--------------------+ | 日期 | 收盤價 | MA5 | +-----------+-----------+--------------------+ | 108/06/03 | 238 | 238 | | 108/06/04 | 233 | 235.5 | | 108/06/05 | 235 | 235.33333333333334 | | 108/06/06 | 232 | 234.5 | | 108/06/10 | 240 | 235.6 | | 108/06/11 | 244.5 | 236.9 | | 108/06/12 | 246 | 239.5 | | 108/06/13 | 240 | 240.5 | | 108/06/14 | 236 | 241.3 | | 108/06/17 | 233 | 239.9 | | 108/06/18 | 235.5 | 238.1 | | 108/06/19 | 244 | 237.7 | | 108/06/20 | 245 | 238.7 | | 108/06/21 | 248.5 | 241.2 | | 108/06/24 | 241 | 242.8 | | 108/06/25 | 238.5 | 243.4 | | 108/06/26 | 234.5 | 241.5 | | 108/06/27 | 240.5 | 240.6 | | 108/06/28 | 239 | 238.7 | | 108/07/01 | 248.5 | 240.2 | | 108/07/02 | 249 | 242.3 | | 108/07/03 | 242.5 | 243.9 | | 108/07/04 | 244 | 244.6 | | 108/07/05 | 243 | 245.4 | | 108/07/08 | 242.5 | 244.2 | | 108/07/09 | 242 | 242.8 | | 108/07/10 | 247 | 243.7 | | 108/07/11 | 250 | 244.9 | | 108/07/12 | 250.5 | 246.4 | | 108/07/15 | 254.5 | 248.8 | | 108/07/16 | 256 | 251.6 | | 108/07/17 | 252 | 252.6 | | 108/07/18 | 254 | 253.4 | | 108/07/19 | 259 | 255.1 | | 108/07/22 | 264 | 257 | | 108/07/23 | 264 | 258.6 | | 108/07/24 | 265 | 261.2 | | 108/07/25 | 265 | 263.4 | | 108/07/26 | 261 | 263.8 | | 108/07/29 | 261 | 263.2 | | 108/07/30 | 260 | 262.4 | | 108/07/31 | 259.5 | 261.3 | | 108/08/01 | 256.5 | 259.6 | | 108/08/02 | 251.5 | 257.7 | | 108/08/05 | 246.5 | 254.8 | | 108/08/06 | 248.5 | 252.5 | | 108/08/07 | 248 | 250.2 | | 108/08/08 | 253.5 | 249.6 | | 108/08/12 | 251 | 249.5 | | 108/08/13 | 246.5 | 249.5 | | 108/08/14 | 249.5 | 249.7 | | 108/08/15 | 248 | 249.7 | | 108/08/16 | 250 | 249 | | 108/08/19 | 252 | 249.2 | | 108/08/20 | 254.5 | 250.8 | | 108/08/21 | 254.5 | 251.8 | | 108/08/22 | 254 | 253 | | 108/08/23 | 254 | 253.8 | | 108/08/26 | 248.5 | 253.1 | | 108/08/27 | 250 | 252.2 | | 108/08/28 | 252 | 251.7 | | 108/08/29 | 254 | 251.7 | | 108/08/30 | 259 | 252.7 | | 108/09/02 | 257.5 | 254.5 | | 108/09/03 | 254 | 255.3 | | 108/09/04 | 257.5 | 256.4 | | 108/09/05 | 263 | 258.2 | | 108/09/06 | 263.5 | 259.1 | | 108/09/09 | 265 | 260.6 | | 108/09/10 | 261.5 | 262.1 | | 108/09/11 | 263 | 263.2 | | 108/09/12 | 262.5 | 263.1 | | 108/09/16 | 265.5 | 263.5 | | 108/09/17 | 265 | 263.5 | | 108/09/18 | 267 | 264.6 | | 108/09/19 | 265 | 265 | | 108/09/20 | 264 | 265.3 | | 108/09/23 | 264 | 265 | | 108/09/24 | 265 | 265 | | 108/09/25 | 266 | 264.8 | | 108/09/26 | 268 | 265.4 | | 108/09/27 | 272 | 267 | | 108/10/01 | 280 | 270.2 | | 108/10/02 | 279.5 | 273.1 | | 108/10/03 | 276.5 | 275.2 | | 108/10/04 | 276.5 | 276.9 | | 108/10/07 | 278 | 278.1 | | 108/10/08 | 286.5 | 279.4 | | 108/10/09 | 282 | 279.9 | +-----------+-----------+--------------------+ >Note : SQL command >SELECT 日期, 收盤價, AVG(收盤價) OVER (ORDER BY 日期 ASC ROWS 4 PRECEDING) AS MA5 FROM database_test_3; ### 5-day moving average of daily trading volume. ==__```df['成交股數'].rolling(5).mean()```__== +-----------+--------------+--------------------+ | 日期 | 成交股數 | MA5 | +-----------+--------------+--------------------+ | 108/06/03 | 36687092.0 | 36687092 | | 108/06/04 | 24443428 | 30565260 | | 108/06/05 | 35901584 | 32344034.666666668 | | 108/06/06 | 34651731 | 32920958.75 | | 108/06/10 | 35521888 | 33441144.6 | | 108/06/11 | 34691670 | 33042060.2 | | 108/06/12 | 30409556 | 34235285.8 | | 108/06/13 | 33731724 | 33801313.8 | | 108/06/14 | 35403556 | 33951678.8 | | 108/06/17 | 51692012 | 37185703.6 | | 108/06/18 | 28798087 | 36006987 | | 108/06/19 | 50234372 | 39971950.2 | | 108/06/20 | 39717291 | 41169063.6 | | 108/06/21 | 62136304 | 46515613.2 | | 108/06/24 | 43460507 | 44869312.2 | | 108/06/25 | 29735283 | 45056751.4 | | 108/06/26 | 28770041 | 40763885.2 | | 108/06/27 | 42007834 | 41221993.8 | | 108/06/28 | 28085212 | 34411775.4 | | 108/07/01 | 66370875 | 38993849 | | 108/07/02 | 26832202 | 38413232.8 | | 108/07/03 | 29549774 | 38569179.4 | | 108/07/04 | 18691240 | 33905860.6 | | 108/07/05 | 27317074 | 33752233 | | 108/07/08 | 29190909 | 26316239.8 | | 108/07/09 | 16538987 | 24257596.8 | | 108/07/10 | 23812131 | 23110068.2 | | 108/07/11 | 31296289 | 25631078 | | 108/07/12 | 23868432 | 24941349.6 | | 108/07/15 | 34264300 | 25956027.8 | | 108/07/16 | 26141249 | 27876480.2 | | 108/07/17 | 34346648 | 29983383.6 | | 108/07/18 | 20908575 | 27905840.8 | | 108/07/19 | 59879798 | 35108114 | | 108/07/22 | 31863639 | 34627981.8 | | 108/07/23 | 29273223 | 35254376.6 | | 108/07/24 | 24511039 | 33287254.8 | | 108/07/25 | 30698545 | 35245248.8 | | 108/07/26 | 20768071 | 27422903.4 | | 108/07/29 | 18473320 | 24744839.6 | | 108/07/30 | 22895612 | 23469317.4 | | 108/07/31 | 36365601 | 25840229.8 | | 108/08/01 | 30107526 | 25722026 | | 108/08/02 | 54165644 | 32401540.6 | | 108/08/05 | 52835721 | 39274020.8 | | 108/08/06 | 58916879 | 46478274.2 | | 108/08/07 | 29435460 | 45092246 | | 108/08/08 | 30140266 | 45098794 | | 108/08/12 | 24732603 | 39212185.8 | | 108/08/13 | 25045107 | 33654063 | | 108/08/14 | 36336703 | 29138027.8 | | 108/08/15 | 28277412 | 28906418.2 | | 108/08/16 | 25696715 | 28017708 | | 108/08/19 | 25098514 | 28090890.2 | | 108/08/20 | 20620494 | 27205967.6 | | 108/08/21 | 20269296 | 23992486.2 | | 108/08/22 | 22136840 | 22764371.8 | | 108/08/23 | 14772877 | 20579604.2 | | 108/08/26 | 31740620 | 21908025.4 | | 108/08/27 | 53664951 | 28516916.8 | | 108/08/28 | 15280882 | 27519234 | | 108/08/29 | 20085767 | 27109019.4 | | 108/08/30 | 35299201 | 31214284.2 | | 108/09/02 | 14776854 | 27821531 | | 108/09/03 | 26086495 | 22305839.8 | | 108/09/04 | 23697733 | 23989210 | | 108/09/05 | 49041728 | 29780402.2 | | 108/09/06 | 26609515 | 28042465 | | 108/09/09 | 17317833 | 28550660.8 | | 108/09/10 | 30019866 | 29337335 | | 108/09/11 | 36266015 | 31850991.4 | | 108/09/12 | 38792293 | 29801104.4 | | 108/09/16 | 38913966 | 32261994.6 | | 108/09/17 | 30069844 | 34812396.8 | | 108/09/18 | 51360759 | 39080575.4 | | 108/09/19 | 27165567 | 37260485.8 | | 108/09/20 | 44555865 | 38413200.2 | | 108/09/23 | 14220208 | 33474448.6 | | 108/09/24 | 24304943 | 32321468.4 | | 108/09/25 | 23244163 | 26698149.2 | | 108/09/26 | 34128103 | 28090656.4 | | 108/09/27 | 44242817 | 28028046.8 | | 108/10/01 | 75248890 | 40233783.2 | | 108/10/02 | 30572953 | 41487385.2 | | 108/10/03 | 35874864 | 44013525.4 | | 108/10/04 | 35814613 | 44350827.4 | | 108/10/07 | 17750230 | 39052310 | | 108/10/08 | 39868551 | 31976242.2 | | 108/10/09 | 34895456 | 32840742.8 | +-----------+--------------+--------------------+ >Note: SQL command >SELECT 日期, 成交股數, AVG(成交股數) OVER (ORDER BY 日期 ASC ROWS 4 PRECEDING) AS MA5 FROM database_test_3; # Problem 3: (Simple vector calculation) 20% Find the biggest daily change. ## a. 以"當日收盤"和"前一天收盤"來比較 日期 : 108/07/01 變化的數值 : 9.5 當日最高價 : 250.0 當日成交股數 : 66370875 ![](https://i.imgur.com/cZPZ8U8.png) ## b.以"當日收盤"和"前一天收盤"來比較 日期 : 108/06/28 變化的數值 : 3.9749% 當日最高價 : 241.5 當日成交股數 : 28085212 ![](https://i.imgur.com/5KtFjku.png) # Problem 4: (Outlier) 20% Special days ## 結果 108/08/06 6.008272 108/10/01 5.245829 108/08/05 3.260889 108/07/01 3.169937 108/06/11 2.997232 ## 算法 ```python def normalization(df): df['成交股數'] = (df['成交股數'] - df['成交股數'].mean())/(df['成交股數'].std()) df['最高低價差'] = df['最高價'] - df['最低價'] df['最高低價差'] = (df['最高低價差'] - df['最高低價差'].mean())/(df['最高低價差'].std()) df['收盤價差'] = df['收盤價'] - df['開盤價'] df['收盤價差'] = (df['收盤價差'] - df['收盤價差'].mean())/(df['收盤價差'].std()) def returnTop5Outlier(df): df['成交股數_誤差'] = (df['成交股數'])**2 df['最高低價差_誤差'] = (df['最高低價差'])**2 df['收盤價差_誤差'] = (df['收盤價差'])**2 df['誤差'] = (df['成交股數_誤差'] + df['最高低價差_誤差'] + df['收盤價差_誤差'])**0.5 normalization(df) returnTop5Outlier(df) data = df['誤差'].astype(str).astype(float) data.nlargest() ``` # Problem 5: (Most profit) 20% ## TOP 5 獲益最高的:買入日期、賣出日其、獲益 Top 1 : 108/06/06, 108/10/08, 54.5(=286.5 - 232.0) Top 2 : 108/06/04, 108/10/08, 53.5(=286.5 - 233.0) Top 3 : 108/06/26, 108/10/08, 52.0(=286.5 - 234.5) Top 4 : 108/06/05, 108/10/08, 51.5(=286.5 - 235.0) Top 5 : 108/06/18, 108/10/08, 51.0(=286.5 - 235.5) ## 演算法 用一個時間複雜度為 $O(n^2)$ 的函數,對於每一天作為買的日子,然後往後跟比自己大的每一天相減,如果有更大的差異,那就更新profit。並且加了一個限制條件。就是在update最大profit的時候,不能夠超出一個上限。 這個 __上限__ 對於找 Top 1 來說,是float的MAX VALUE 這個 __上限__ 對於找 Top 2 來說,是 Top 1 的 max profit 這個 __上限__ 對於找 Top 3 來說,是 Top 2 的 max profit 這個 __上限__ 對於找 Top 4 來說,是 Top 3 的 max profit 這個 __上限__ 對於找 Top 5 來說,是 Top 4 的 max profit ## Code ```python import sys def findMaxProfit(data, MAXPROFIT): length = len(data) maxProfit = sys.float_info.min buyDay = -1 sellDay = -1 for i in range(length-1): for j in range(i+1,length): profit = data[j]-data[i] if profit > maxProfit and profit < MAXPROFIT: buyDay = i sellDay = j maxProfit = profit return buyDay, sellDay, maxProfit def findTop5ProfitDay(data): length = len(data) MAX = sys.float_info.max top5profitBuy = [None] * 5 top5profitSell = [None] * 5 top5profit = [None] * 5 top5profitBuy[0], top5profitSell[0], top5profit[0] = findMaxProfit(data,MAX) for i in range(1,5): top5profitBuy[i], top5profitSell[i], top5profit[i] = findMaxProfit(data,top5profit[i-1]) return top5profitBuy, top5profitSell, top5profit findTop5ProfitDay(df['收盤價']) ```