# notebook57d5a8afce
###### tags: `Code`
# 環境設定
```python
import time
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LayerNormalization
```
```python
# Root folder of the Kaggle "G-Research Crypto Forecasting" competition data.
data_folder = "../input/g-research-crypto-forecasting"
```
# 讀取資料
## 讀取訓練資料
- 注意讀取此處,尖峰時,記憶體可能吃掉 7G
- 讀完後會回到 2G
```python
# Minute-level training data for all assets; large (several GB peak while parsing).
crypto_df = pd.read_csv(data_folder+'/train.csv')
```
## 訓練集的資訊
```python
# Display the training data (Jupyter renders the frame).
crypto_df
```
```python
# Column dtypes, non-null counts and memory usage of the training frame.
crypto_df.info()
```
## 讀取加密貨幣資訊
```python
# Per-asset metadata table (one row per Asset_ID).
asset_details = pd.read_csv(data_folder + '/asset_details.csv')
```
```python
# Display the asset metadata table.
asset_details
```
## 建立加密貨幣映射表
```python
# Map each Asset_ID (0..13) to its ticker symbol for easy reference.
_symbol_names = [
    'BNB', 'BTC', 'BCH', 'ADA', 'DOGE', 'EOS.IO', 'ETH',
    'ETC', 'IOTA', 'LTC', 'MKR', 'XMR', 'XLM', 'TRON',
]
asset_symbols_map = dict(enumerate(_symbol_names))

# Attach a human-readable Symbol column to both frames.
crypto_df['Symbol'] = crypto_df['Asset_ID'].map(asset_symbols_map)
asset_details['Symbol'] = asset_details['Asset_ID'].map(asset_symbols_map)
```
```python
# Verify the new Symbol column on the first few rows.
crypto_df.head(5)
```
```python
# Verify the Symbol column on the asset metadata table.
asset_details
```
# 資料處理
## 處理缺失值
### 分出各個加密貨幣資料
- `crypto` 是個**字典**將會存放各個加密貨幣的資料。
- 使用方法 `crypto['symbol_name']`
- `asset_symbols_map` 可以從此處得到 `symbol_name`
```python
# Split the combined frame into one per-asset frame, keyed by ticker symbol
# and indexed by timestamp. Usage: crypto['BTC'], crypto['ETH'], ...
crypto = {
    symbol: crypto_df[crypto_df["Asset_ID"] == asset_id].set_index("timestamp")
    for asset_id, symbol in asset_symbols_map.items()
}
```
### 檢視各資料的時間範圍
```python
# Print each asset's first and last timestamp, rendered as datetimes.
for symbol in asset_symbols_map.values():
    index = crypto[symbol].index
    beg = index[0].astype('datetime64[s]')
    end = index[-1].astype('datetime64[s]')
    print(symbol + ' data goes from ', beg, 'to ', end)
```
### 選取特定時間範圍的資料
- 用 `reindex( method='pad')` 填補 Timestamp 之間的空缺值
- 用 `.fillna(method='ffill')` 填補 feature 的空缺值 (ex: Target)
- 會透過 `info()` 印出每筆資料的狀況
- 會透過 `.value_counts()` 觀看每筆資料的缺失 timestamp 的狀況
- 會透過 `.isna().sum()` 觀看每筆資料的缺失值狀況
- 丟掉訓練時不必要的欄位 `Asset_ID`、`Symbol`
```python
# Auxiliary function: parse a 'dd/mm/YYYY' string into a POSIX timestamp.
# NOTE(review): time.mktime interprets the date in the LOCAL timezone —
# confirm the notebook runs in UTC before reusing this elsewhere.
totimestamp = lambda s: np.int32(time.mktime(datetime.strptime(s, "%d/%m/%Y").timetuple()))
start_time = totimestamp('01/01/2021')
end_time = 1632181440  # totimestamp('01/09/2021'); kept for reference, currently unused

crypto_min = dict()
for i, symbol in asset_symbols_map.items():
    print('['+str(i)+']-'+symbol+'=============================================')
    # Restrict to rows from start_time onward.
    df = crypto[symbol].loc[start_time:]
    # Rebuild a contiguous 1-minute (60 s) index; method='pad' forward-fills
    # rows for timestamps missing from the raw data.
    df = df.reindex(range(df.index[0], df.index[-1] + 60, 60), method='pad')
    # Forward-fill remaining NaNs in the feature columns (e.g. Target).
    # fillna(method='ffill') is deprecated since pandas 2.1 — use .ffill().
    df = df.ffill()
    # Drop columns not used for training. drop(label, 1) with a positional
    # axis argument was removed in pandas 2.0 — use the columns= keyword.
    df = df.drop(columns=['Asset_ID', 'Symbol'])
    crypto_min[symbol] = df
    print('**Info**')
    print(df.info())
    print('\n**Timestamp**')
    # Distribution of gaps between consecutive timestamps (should all be 60).
    print((df.index[1:] - df.index[:-1]).value_counts().head())
    print('\n**Missing Value**')
    print(df.isna().sum())
    print('\n\n')
```
## 生成訓練資料
### 重新整合成一個 timestamp 有 14 個 加密貨幣的資料
```python
# Stack every asset's per-minute matrix into one array:
# shape (n_assets, n_timestamps, n_columns).
per_asset = [crypto_min[symbol].to_numpy() for symbol in asset_symbols_map.values()]
x = np.array(per_asset)
print(x.shape)
```
```python
# Features only: drop the last column (Target) from every asset.
x_feature = x[..., :-1]
print(x_feature.shape)
```
```python
# Target side of the training pairs.
# NOTE(review): this keeps ALL columns (features + Target), so the model is
# trained to predict the full next-step row; the commented alternative below
# would keep only the Target column.
#x_target = x[:,:,-1][:,:,np.newaxis]
x_target = x[:,:,:]
print(x_target.shape)
```
```python
# 重新調整
x_feature = np.concatenate(x_feature,axis=1)
print(x_feature.shape)
x_target = np.concatenate(x_target,axis=1)
print(x_target.shape)
```
```python
# Standardize features and targets (zero mean, unit variance per column).
# The fitted scalers are kept so test rows can be transformed and model
# outputs inverse-transformed later.
x_ft_scaler = StandardScaler()
x_feature = x_ft_scaler.fit_transform(x_feature)
x_tg_scaler = StandardScaler()
x_target = x_tg_scaler.fit_transform(x_target)
```
```python
# Build sliding windows: each sample is n_steps consecutive minutes of
# features; its label is the (scaled) full row at the window's last minute.
n_steps = 10
batch_size = x_feature.shape[0] - n_steps + 1
x_dim = x_feature.shape[1]
y_dim = x_target.shape[1]
x_train = np.stack([x_feature[j:j + n_steps] for j in range(batch_size)])
y_train = x_target[n_steps - 1:].copy()
print("batch_size : ", batch_size)
print("x_dim : ", x_dim)
print("y_dim : ", y_dim)
print("x_train.shape : ", x_train.shape)
print("y_train.shape : ", y_train.shape)
```
# 模型訓練
```python
# Show the final training-array shapes before fitting the model.
print(x_train.shape)
print(y_train.shape)
```
```python
# Two stacked LSTM layers feeding a linear head that predicts the full
# (scaled) per-minute row for all assets at once.
model = keras.models.Sequential()
model.add(keras.layers.LSTM(600, return_sequences=True, input_shape=[None, x_dim]))
model.add(keras.layers.LSTM(600))
model.add(keras.layers.Dense(y_dim))

# Stop once validation loss fails to improve for 2 epochs and roll back to
# the best weights seen so far.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    mode='auto',
    restore_best_weights=True,
)
model.compile(loss="mse", optimizer="adam")
history = model.fit(x_train, y_train, epochs=100, callbacks=[callback], validation_split=0.01)
```
# API 提供預測
```python
# Kaggle competition API: iter_test yields (test_df, sample_prediction_df)
# pairs; predictions are submitted back through env.predict().
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
```
```python
# Rolling prediction loop over the competition's test iterator. A window of
# the last n_steps minutes (seeded with the final training window) is kept
# and each incoming test minute is appended to it.
x_train_n = x_train[-1:, :]  # last training window, shape (1, n_steps, x_dim)
for (test_df, sample_prediction_df) in iter_test:
    test_len = test_df.shape[0]
    try:
        # fillna(method=...) is deprecated since pandas 2.1 — use ffill/bfill.
        test_df.ffill(inplace=True)
        test_df.bfill(inplace=True)
        # Map Asset_ID -> row_id so predictions can be matched back.
        row_id = test_df[['Asset_ID', 'row_id']].set_index('Asset_ID').row_id.to_dict()
        # Order the assets consistently with training (sorted by Asset_ID)
        # and flatten into one scaled feature row of shape (1, 1, x_dim).
        test = test_df.sort_values(by=['Asset_ID'])[['Count','Open','High','Low','Close','Volume','VWAP']].to_numpy()
        test = test.reshape((1, -1))
        test = x_ft_scaler.transform(test).reshape((1, 1, -1))
        # Slide the window: append the new minute, drop the oldest, so the
        # buffer stays at n_steps rows and memory does not grow.
        x_train_n = np.concatenate([x_train_n, test], axis=1)[:, 1:, :]
        result = model.predict(x_train_n)
        # Undo the scaling; the last of each asset's 8 columns is its Target.
        res_tg = x_tg_scaler.inverse_transform(result).reshape((-1, 8))[:, -1].tolist()
        # NOTE(review): res_tg is indexed by Asset_ID, which assumes every
        # asset (IDs 0..13) appears in each test batch — confirm with the API.
        output = ['%0.8f' % res_tg[asset_id] for asset_id in row_id]
        sample_prediction_df['Target'] = output
        print("output : ", output)
    except Exception:
        # A bare except would also swallow KeyboardInterrupt/SystemExit.
        # Fall back to a constant guess so the submission loop never dies.
        sample_prediction_df['Target'] = [-0.003] * test_len
    env.predict(sample_prediction_df)
```