# r08944039_hw2
:::success
DataScience homework 2
Link : https://docs.google.com/presentation/d/1C-RXTnb-aOZWSszq1ndzqc1xAx6Nnay19S0JgI1zW-s/edit#slide=id.g6508903b32_0_0
:::
# Problem 1: (Extract data and handling of CSV files and RDB) 20%
### Q : 用什麼指令將 dataframe 寫進 MySQL Database?
```python
engine =create_engine('mysql+mysqlconnector://usr:pwd@localhost/'+database_name)
df.to_sql(name=database_name, con=engine, if_exists = 'replace', index=False)
```
### Q : ```df.head()```?

### Q : ```df.describe()```?

#### Source Code
```python
import requests
import pandas as pd
import io
from sqlalchemy import create_engine
def getRawData(date):
stock_url = "https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=csv&date=" + date + "&stockNo=2330"
response = requests.get(stock_url)
# first(include title)
if(date == "20190601"):
data = response.text.split("各日成交資訊\"")[1][2:].split("\"說明:\"")[0][:-3]
# last(remove last)
elif(date == "20191001"):
data = response.text.split("成交筆數\",")[1][2:].split("\"108/10/14\"")[0][:-3]
# middle
else:
data = response.text.split("成交筆數\",")[1][2:].split("\"說明:\"")[0][:-3]
return data
def getData():
dates = ["20190601","20190701","20190801","20190901","20191001"]
overallData = ""
for date in dates:
rawData = getRawData(date)
overallData += rawData
overallData += ",\n"
return overallData
def getDf(overallData):
df = pd.read_csv(io.StringIO(overallData))
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
for column in df:
if column in ["成交股數", "成交金額", "成交筆數"]:
i = 0
for each in df[column]:
df[column][i] = df[column][i].replace(',','')
i += 1
return df
def writeDb(df, database_name):
engine = create_engine('mysql+mysqlconnector://root:ndjkfnRWV23@localhost/'+database_name)
df.to_sql(name=database_name, con=engine, if_exists = 'replace', index=False)
return engine
def readDb(database_name, engine):
data = pd.read_sql('SELECT * FROM '+database_name, engine)
return data
def main():
database_name = "database_test_3"
cleanData = getData()
df = getDf(cleanData)
print(df)
engine = writeDb(df, database_name)
data = readDb(database_name, engine)
if __name__== "__main__":
main()
```
----------------------------------
# Problem 2: (Basic calculation) 20%
## A: From your python program issue SQL statement to find out:
### Total and average trade amount in this time period(成交金額)
### Total and average trade volume in this period (交易股數)

### Python Code
#### Highlight
```python
def sumDb(database_name, engine, column):
return pd.read_sql('SELECT SUM('+ column +') FROM '+database_name, engine)
def avgDb(database_name, engine, column):
return pd.read_sql('SELECT AVG('+ column +') FROM '+database_name, engine)
sumMoney = sumDb(database_name, engine, "成交金額")
sumStock = sumDb(database_name, engine, "成交股數")
avgMoney = avgDb(database_name, engine, "成交金額")
avgStock = avgDb(database_name, engine, "成交股數")
```
#### Full Code
```python
import requests
import pandas as pd
import io
from sqlalchemy import create_engine
def getRawData(date):
stock_url = "https://www.twse.com.tw/exchangeReport/STOCK_DAY?response=csv&date=" + date + "&stockNo=2330"
response = requests.get(stock_url)
# first(include title)
if(date == "20190601"):
data = response.text.split("各日成交資訊\"")[1][2:].split("\"說明:\"")[0][:-3]
# last(remove last)
elif(date == "20191001"):
data = response.text.split("成交筆數\",")[1][2:].split("\"108/10/14\"")[0][:-3]
# middle
else:
data = response.text.split("成交筆數\",")[1][2:].split("\"說明:\"")[0][:-3]
return data
def getData():
dates = ["20190601","20190701","20190801","20190801","20191001"]
overallData = ""
for date in dates:
rawData = getRawData(date)
overallData += rawData
overallData += ",\n"
return overallData
def getDf(overallData):
df = pd.read_csv(io.StringIO(overallData))
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
for column in df:
if column in ["成交股數", "成交金額", "成交筆數"]:
i = 0
for each in df[column]:
df[column][i] = df[column][i].replace(',','')
i += 1
return df
def writeDb(df, database_name):
engine = create_engine('mysql+mysqlconnector://root:ndjkfnRWV23@localhost/'+database_name)
df.to_sql(name=database_name, con=engine, if_exists = 'replace', index=False)
return engine
def readDb(database_name, engine):
return pd.read_sql('SELECT * FROM '+database_name, engine)
def sumDb(database_name, engine, column):
return pd.read_sql('SELECT SUM('+ column +') FROM '+database_name, engine)
def avgDb(database_name, engine, column):
return pd.read_sql('SELECT AVG('+ column +') FROM '+database_name, engine)
def main():
database_name = "database_test_3"
cleanData = getData()
df = getDf(cleanData)
engine = writeDb(df, database_name)
data = readDb(database_name, engine)
sumMoney = sumDb(database_name, engine, "成交金額")
sumStock = sumDb(database_name, engine, "成交股數")
avgMoney = avgDb(database_name, engine, "成交金額")
avgStock = avgDb(database_name, engine, "成交股數")
print("Total 成交金額 = ", sumMoney)
print("Toal 成交股數 = ", sumStock)
print("Average 成交金額= ", avgMoney)
print("Average 成交股數 = ", avgStock)
if __name__== "__main__":
main()
```
## B: Use pandas to calculate
### 5-day moving average of daily closing price.
==__```df['收盤價'].rolling(5).mean()```__==
+-----------+-----------+--------------------+
| 日期 | 收盤價 | MA5 |
+-----------+-----------+--------------------+
| 108/06/03 | 238 | 238 |
| 108/06/04 | 233 | 235.5 |
| 108/06/05 | 235 | 235.33333333333334 |
| 108/06/06 | 232 | 234.5 |
| 108/06/10 | 240 | 235.6 |
| 108/06/11 | 244.5 | 236.9 |
| 108/06/12 | 246 | 239.5 |
| 108/06/13 | 240 | 240.5 |
| 108/06/14 | 236 | 241.3 |
| 108/06/17 | 233 | 239.9 |
| 108/06/18 | 235.5 | 238.1 |
| 108/06/19 | 244 | 237.7 |
| 108/06/20 | 245 | 238.7 |
| 108/06/21 | 248.5 | 241.2 |
| 108/06/24 | 241 | 242.8 |
| 108/06/25 | 238.5 | 243.4 |
| 108/06/26 | 234.5 | 241.5 |
| 108/06/27 | 240.5 | 240.6 |
| 108/06/28 | 239 | 238.7 |
| 108/07/01 | 248.5 | 240.2 |
| 108/07/02 | 249 | 242.3 |
| 108/07/03 | 242.5 | 243.9 |
| 108/07/04 | 244 | 244.6 |
| 108/07/05 | 243 | 245.4 |
| 108/07/08 | 242.5 | 244.2 |
| 108/07/09 | 242 | 242.8 |
| 108/07/10 | 247 | 243.7 |
| 108/07/11 | 250 | 244.9 |
| 108/07/12 | 250.5 | 246.4 |
| 108/07/15 | 254.5 | 248.8 |
| 108/07/16 | 256 | 251.6 |
| 108/07/17 | 252 | 252.6 |
| 108/07/18 | 254 | 253.4 |
| 108/07/19 | 259 | 255.1 |
| 108/07/22 | 264 | 257 |
| 108/07/23 | 264 | 258.6 |
| 108/07/24 | 265 | 261.2 |
| 108/07/25 | 265 | 263.4 |
| 108/07/26 | 261 | 263.8 |
| 108/07/29 | 261 | 263.2 |
| 108/07/30 | 260 | 262.4 |
| 108/07/31 | 259.5 | 261.3 |
| 108/08/01 | 256.5 | 259.6 |
| 108/08/02 | 251.5 | 257.7 |
| 108/08/05 | 246.5 | 254.8 |
| 108/08/06 | 248.5 | 252.5 |
| 108/08/07 | 248 | 250.2 |
| 108/08/08 | 253.5 | 249.6 |
| 108/08/12 | 251 | 249.5 |
| 108/08/13 | 246.5 | 249.5 |
| 108/08/14 | 249.5 | 249.7 |
| 108/08/15 | 248 | 249.7 |
| 108/08/16 | 250 | 249 |
| 108/08/19 | 252 | 249.2 |
| 108/08/20 | 254.5 | 250.8 |
| 108/08/21 | 254.5 | 251.8 |
| 108/08/22 | 254 | 253 |
| 108/08/23 | 254 | 253.8 |
| 108/08/26 | 248.5 | 253.1 |
| 108/08/27 | 250 | 252.2 |
| 108/08/28 | 252 | 251.7 |
| 108/08/29 | 254 | 251.7 |
| 108/08/30 | 259 | 252.7 |
| 108/09/02 | 257.5 | 254.5 |
| 108/09/03 | 254 | 255.3 |
| 108/09/04 | 257.5 | 256.4 |
| 108/09/05 | 263 | 258.2 |
| 108/09/06 | 263.5 | 259.1 |
| 108/09/09 | 265 | 260.6 |
| 108/09/10 | 261.5 | 262.1 |
| 108/09/11 | 263 | 263.2 |
| 108/09/12 | 262.5 | 263.1 |
| 108/09/16 | 265.5 | 263.5 |
| 108/09/17 | 265 | 263.5 |
| 108/09/18 | 267 | 264.6 |
| 108/09/19 | 265 | 265 |
| 108/09/20 | 264 | 265.3 |
| 108/09/23 | 264 | 265 |
| 108/09/24 | 265 | 265 |
| 108/09/25 | 266 | 264.8 |
| 108/09/26 | 268 | 265.4 |
| 108/09/27 | 272 | 267 |
| 108/10/01 | 280 | 270.2 |
| 108/10/02 | 279.5 | 273.1 |
| 108/10/03 | 276.5 | 275.2 |
| 108/10/04 | 276.5 | 276.9 |
| 108/10/07 | 278 | 278.1 |
| 108/10/08 | 286.5 | 279.4 |
| 108/10/09 | 282 | 279.9 |
+-----------+-----------+--------------------+
>Note : SQL command
>SELECT 日期,
收盤價,
AVG(收盤價) OVER (ORDER BY 日期 ASC ROWS 4 PRECEDING) AS MA5
FROM database_test_3;
### 5-day moving average of daily trading volume.
==__```df['成交股數'].rolling(5).mean()```__==
+-----------+--------------+--------------------+
| 日期 | 成交股數 | MA5 |
+-----------+--------------+--------------------+
| 108/06/03 | 36687092.0 | 36687092 |
| 108/06/04 | 24443428 | 30565260 |
| 108/06/05 | 35901584 | 32344034.666666668 |
| 108/06/06 | 34651731 | 32920958.75 |
| 108/06/10 | 35521888 | 33441144.6 |
| 108/06/11 | 34691670 | 33042060.2 |
| 108/06/12 | 30409556 | 34235285.8 |
| 108/06/13 | 33731724 | 33801313.8 |
| 108/06/14 | 35403556 | 33951678.8 |
| 108/06/17 | 51692012 | 37185703.6 |
| 108/06/18 | 28798087 | 36006987 |
| 108/06/19 | 50234372 | 39971950.2 |
| 108/06/20 | 39717291 | 41169063.6 |
| 108/06/21 | 62136304 | 46515613.2 |
| 108/06/24 | 43460507 | 44869312.2 |
| 108/06/25 | 29735283 | 45056751.4 |
| 108/06/26 | 28770041 | 40763885.2 |
| 108/06/27 | 42007834 | 41221993.8 |
| 108/06/28 | 28085212 | 34411775.4 |
| 108/07/01 | 66370875 | 38993849 |
| 108/07/02 | 26832202 | 38413232.8 |
| 108/07/03 | 29549774 | 38569179.4 |
| 108/07/04 | 18691240 | 33905860.6 |
| 108/07/05 | 27317074 | 33752233 |
| 108/07/08 | 29190909 | 26316239.8 |
| 108/07/09 | 16538987 | 24257596.8 |
| 108/07/10 | 23812131 | 23110068.2 |
| 108/07/11 | 31296289 | 25631078 |
| 108/07/12 | 23868432 | 24941349.6 |
| 108/07/15 | 34264300 | 25956027.8 |
| 108/07/16 | 26141249 | 27876480.2 |
| 108/07/17 | 34346648 | 29983383.6 |
| 108/07/18 | 20908575 | 27905840.8 |
| 108/07/19 | 59879798 | 35108114 |
| 108/07/22 | 31863639 | 34627981.8 |
| 108/07/23 | 29273223 | 35254376.6 |
| 108/07/24 | 24511039 | 33287254.8 |
| 108/07/25 | 30698545 | 35245248.8 |
| 108/07/26 | 20768071 | 27422903.4 |
| 108/07/29 | 18473320 | 24744839.6 |
| 108/07/30 | 22895612 | 23469317.4 |
| 108/07/31 | 36365601 | 25840229.8 |
| 108/08/01 | 30107526 | 25722026 |
| 108/08/02 | 54165644 | 32401540.6 |
| 108/08/05 | 52835721 | 39274020.8 |
| 108/08/06 | 58916879 | 46478274.2 |
| 108/08/07 | 29435460 | 45092246 |
| 108/08/08 | 30140266 | 45098794 |
| 108/08/12 | 24732603 | 39212185.8 |
| 108/08/13 | 25045107 | 33654063 |
| 108/08/14 | 36336703 | 29138027.8 |
| 108/08/15 | 28277412 | 28906418.2 |
| 108/08/16 | 25696715 | 28017708 |
| 108/08/19 | 25098514 | 28090890.2 |
| 108/08/20 | 20620494 | 27205967.6 |
| 108/08/21 | 20269296 | 23992486.2 |
| 108/08/22 | 22136840 | 22764371.8 |
| 108/08/23 | 14772877 | 20579604.2 |
| 108/08/26 | 31740620 | 21908025.4 |
| 108/08/27 | 53664951 | 28516916.8 |
| 108/08/28 | 15280882 | 27519234 |
| 108/08/29 | 20085767 | 27109019.4 |
| 108/08/30 | 35299201 | 31214284.2 |
| 108/09/02 | 14776854 | 27821531 |
| 108/09/03 | 26086495 | 22305839.8 |
| 108/09/04 | 23697733 | 23989210 |
| 108/09/05 | 49041728 | 29780402.2 |
| 108/09/06 | 26609515 | 28042465 |
| 108/09/09 | 17317833 | 28550660.8 |
| 108/09/10 | 30019866 | 29337335 |
| 108/09/11 | 36266015 | 31850991.4 |
| 108/09/12 | 38792293 | 29801104.4 |
| 108/09/16 | 38913966 | 32261994.6 |
| 108/09/17 | 30069844 | 34812396.8 |
| 108/09/18 | 51360759 | 39080575.4 |
| 108/09/19 | 27165567 | 37260485.8 |
| 108/09/20 | 44555865 | 38413200.2 |
| 108/09/23 | 14220208 | 33474448.6 |
| 108/09/24 | 24304943 | 32321468.4 |
| 108/09/25 | 23244163 | 26698149.2 |
| 108/09/26 | 34128103 | 28090656.4 |
| 108/09/27 | 44242817 | 28028046.8 |
| 108/10/01 | 75248890 | 40233783.2 |
| 108/10/02 | 30572953 | 41487385.2 |
| 108/10/03 | 35874864 | 44013525.4 |
| 108/10/04 | 35814613 | 44350827.4 |
| 108/10/07 | 17750230 | 39052310 |
| 108/10/08 | 39868551 | 31976242.2 |
| 108/10/09 | 34895456 | 32840742.8 |
+-----------+--------------+--------------------+
>Note: SQL command
>SELECT 日期,
成交股數,
AVG(成交股數) OVER (ORDER BY 日期 ASC ROWS 4 PRECEDING) AS MA5
FROM database_test_3;
# Problem 3: (Simple vector calculation) 20% Find the biggest daily change.
## a. 以"當日收盤"和"前一天收盤"來比較
日期 : 108/07/01
變化的數值 : 9.5
當日最高價 : 250.0
當日成交股數 : 66370875

## b.以"當日收盤"和"前一天收盤"來比較
日期 : 108/06/28
變化的數值 : 3.9749%
當日最高價 : 241.5
當日成交股數 : 28085212

# Problem 4: (Outlier) 20% Special days
## 結果
108/08/06 6.008272
108/10/01 5.245829
108/08/05 3.260889
108/07/01 3.169937
108/06/11 2.997232
## 算法
```python
def normalization(df):
df['成交股數'] = (df['成交股數'] - df['成交股數'].mean())/(df['成交股數'].std())
df['最高低價差'] = df['最高價'] - df['最低價']
df['最高低價差'] = (df['最高低價差'] - df['最高低價差'].mean())/(df['最高低價差'].std())
df['收盤價差'] = df['收盤價'] - df['開盤價']
df['收盤價差'] = (df['收盤價差'] - df['收盤價差'].mean())/(df['收盤價差'].std())
def returnTop5Outlier(df):
df['成交股數_誤差'] = (df['成交股數'])**2
df['最高低價差_誤差'] = (df['最高低價差'])**2
df['收盤價差_誤差'] = (df['收盤價差'])**2
df['誤差'] = (df['成交股數_誤差'] + df['最高低價差_誤差'] + df['收盤價差_誤差'])**0.5
normalization(df)
returnTop5Outlier(df)
data = df['誤差'].astype(str).astype(float)
data.nlargest()
```
# Problem 5: (Most profit) 20%
## TOP 5 獲益最高的:買入日期、賣出日其、獲益
Top 1 : 108/06/06, 108/10/08, 54.5(=286.5 - 232.0)
Top 2 : 108/06/04, 108/10/08, 53.5(=286.5 - 233.0)
Top 3 : 108/06/26, 108/10/08, 52.0(=286.5 - 234.5)
Top 4 : 108/06/05, 108/10/08, 51.5(=286.5 - 235.0)
Top 5 : 108/06/18, 108/10/08, 51.0(=286.5 - 235.5)
## 演算法
用一個時間複雜度為 $O(n^2)$ 的函數,對於每一天作為買的日子,然後往後跟比自己大的每一天相減,如果有更大的差異,那就更新profit。並且加了一個限制條件。就是在update最大profit的時候,不能夠超出一個上限。
這個 __上限__ 對於找 Top 1 來說,是float的MAX VALUE
這個 __上限__ 對於找 Top 2 來說,是 Top 1 的 max profit
這個 __上限__ 對於找 Top 3 來說,是 Top 2 的 max profit
這個 __上限__ 對於找 Top 4 來說,是 Top 3 的 max profit
這個 __上限__ 對於找 Top 5 來說,是 Top 4 的 max profit
## Code
```python
import sys
def findMaxProfit(data, MAXPROFIT):
length = len(data)
maxProfit = sys.float_info.min
buyDay = -1
sellDay = -1
for i in range(length-1):
for j in range(i+1,length):
profit = data[j]-data[i]
if profit > maxProfit and profit < MAXPROFIT:
buyDay = i
sellDay = j
maxProfit = profit
return buyDay, sellDay, maxProfit
def findTop5ProfitDay(data):
length = len(data)
MAX = sys.float_info.max
top5profitBuy = [None] * 5
top5profitSell = [None] * 5
top5profit = [None] * 5
top5profitBuy[0], top5profitSell[0], top5profit[0] = findMaxProfit(data,MAX)
for i in range(1,5):
top5profitBuy[i], top5profitSell[i], top5profit[i] = findMaxProfit(data,top5profit[i-1])
return top5profitBuy, top5profitSell, top5profit
findTop5ProfitDay(df['收盤價'])
```