# lstm-m2o-0106-10step-final
###### tags: `Code`
# Score

# G-Research Crypto Forecasting
Use your ML expertise to predict real crypto market data
# Environment Setup
```python
import time
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
```
## Training Data Setup
```python
#data_folder = ''
data_folder = '../input/g-research-crypto-forecasting/'
crypto_df = pd.read_csv(data_folder+'train.csv')
```
## Build the Asset Symbol Map
```python
# Adding Crypto symbols to the dataframes for easy reference
asset_symbols_map = {
    0: 'BNB',
    1: 'BTC',
    2: 'BCH',
    3: 'ADA',
    4: 'DOGE',
    5: 'EOS.IO',
    6: 'ETH',
    7: 'ETC',
    8: 'IOTA',
    9: 'LTC',
    10: 'MKR',
    11: 'XMR',
    12: 'XLM',
    13: 'TRON'
}
```
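The map above is hard-coded; the competition also ships `asset_details.csv`, so an equivalent map can be built from the file itself. A minimal sketch, assuming the file's `Asset_ID` and `Asset_Name` columns (note the names there are full names such as `Binance Coin`, not the short tickers used above):
```python
# Build an Asset_ID -> name map from the competition's asset_details.csv.
# asset_details.csv stores full names ('Binance Coin', 'Bitcoin', ...),
# not the short symbols hard-coded above.
asset_details = pd.read_csv(data_folder + 'asset_details.csv')
asset_names_map = dict(zip(asset_details.Asset_ID, asset_details.Asset_Name))
```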
# Utility Functions
## find_range
- Find the overall time range covered by the data and the asset with the longest history
```python
def find_range(df):
    # Find the overall time range and the asset with the longest history
    max_len = 0
    max_index = -1
    min_start_time = -1
    max_finish_time = -1
    for i in asset_symbols_map:
        count = (df["Asset_ID"] == i).sum()
        _st = int(df[df["Asset_ID"] == i].iloc[0]['timestamp'])
        _ft = int(df[df["Asset_ID"] == i].iloc[-1]['timestamp'])
        if min_start_time == -1:
            min_start_time = _st
        if min_start_time > _st:
            min_start_time = _st
        if max_finish_time < _ft:
            max_finish_time = _ft
        if count >= max_len:
            max_len = count
            max_index = i
    return min_start_time, max_finish_time, max_len, max_index
```
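For a quick sanity check, the helper can be run directly on the raw frame (assuming it is sorted by `timestamp`, as `train.csv` is):
```python
# Inspect the overall time range and the longest series in the raw data.
min_st, max_ft, max_len, max_idx = find_range(crypto_df)
print('start :', datetime.fromtimestamp(min_st))
print('finish:', datetime.fromtimestamp(max_ft))
print('longest series:', asset_symbols_map[max_idx], '-', max_len, 'rows')
```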
## fill_missing_value
- Fill missing values & optionally keep only the tail of the data
```python
def fill_missing_value(pdData, crypto_map, tail_num=None, selectime=None, showinfo=False):
    # Force every asset onto the same full-length 60-second grid
    # (spanning the longest series) so that every asset has a value at
    # every timestamp and all series share the same length.
    crypto_min = dict()
    pdData = pdData.sort_values(by=['timestamp'])
    min_start_time, max_finish_time, max_len, max_index = find_range(pdData.reset_index())
    for i in crypto_map:
        crypto_min[crypto_map[i]] = pdData[pdData["Asset_ID"] == i].set_index(['timestamp'])
        crypto_min[crypto_map[i]] = crypto_min[crypto_map[i]].reindex(range(min_start_time, max_finish_time + 60, 60), method='pad')
        crypto_min[crypto_map[i]] = crypto_min[crypto_map[i]].ffill().bfill()
        if showinfo:
            print('[' + str(i) + ']-' + crypto_map[i] + '=============================================')
            print('**Info**')
            print(crypto_min[crypto_map[i]].info())
            print('\n**Timestamp**')
            print((crypto_min[crypto_map[i]].index[1:] - crypto_min[crypto_map[i]].index[:-1]).value_counts().head())
            print('\n**Missing Value**')
            print(crypto_min[crypto_map[i]].isna().sum())
            print('\n\n')
    # Optionally limit the output to the last tail_num rows of each asset
    if tail_num is not None:
        if selectime is None:
            for i in crypto_map:
                crypto_min[crypto_map[i]] = crypto_min[crypto_map[i]].tail(tail_num)
        else:
            for i in crypto_map:
                crypto_min[crypto_map[i]] = crypto_min[crypto_map[i]].loc[selectime - 60 * (tail_num - 1): selectime + 60]
    # Merge all assets back into a single dataframe
    crypto_min_new = pd.concat([crypto_min[crypto_map[i]] for i in crypto_map])
    return crypto_min_new.reset_index()
```
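The core of the gap filling is `reindex` onto a complete 60-second grid with `method='pad'`, which fills each missing minute with the last observed row. A toy illustration of that behaviour:
```python
# Toy series sampled at t = 0, 60, 180 with a missing minute at t = 120.
toy = pd.DataFrame({'Close': [1.0, 2.0, 3.0]}, index=[0, 60, 180])
full = toy.reindex(range(0, 240, 60), method='pad')
print(full)
# t = 120 receives the last known row (Close = 2.0), which is exactly
# how missing minutes are padded in fill_missing_value above.
```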
## get_features
- Generate additional features
```python
def upper_shadow(df):
    return df['High'] - np.maximum(df['Close'], df['Open'])

def lower_shadow(df):
    return np.minimum(df['Close'], df['Open']) - df['Low']

def get_features(df, row=False):
    df_feat = df.drop(['Target'], axis=1)
    df_feat['spread'] = df_feat['High'] - df_feat['Low']
    df_feat['mean_trade'] = df_feat['Volume'] / df_feat['Count']
    df_feat['log_price_change'] = np.log(df_feat['Close'] / df_feat['Open'])
    df_feat['upper_Shadow'] = upper_shadow(df_feat)
    df_feat['lower_Shadow'] = lower_shadow(df_feat)
    df_feat["high_div_low"] = df_feat["High"] / df_feat["Low"]
    df_feat['trade'] = df_feat['Close'] - df_feat['Open']
    df_feat['Target'] = df['Target']
    return df_feat
```
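The shadow features follow candlestick terminology: the upper shadow is the distance from the high down to the top of the candle body, the lower shadow from the bottom of the body down to the low. A toy candle makes the arithmetic concrete:
```python
# Toy candle: Open=10, Close=12, High=13, Low=9.
candle = pd.DataFrame({'Open': [10.0], 'Close': [12.0], 'High': [13.0], 'Low': [9.0]})
print(upper_shadow(candle).iloc[0])  # 13 - max(12, 10) = 1.0
print(lower_shadow(candle).iloc[0])  # min(12, 10) - 9  = 1.0
```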
## make_training_data
- Build the arrays used for model training
```python
def make_training_data(pdData):
    crypto_min = dict()
    for i in asset_symbols_map:
        crypto_min[asset_symbols_map[i]] = pdData[pdData["Asset_ID"]==i].sort_values(by=['timestamp']).set_index(['timestamp']).drop('Asset_ID', axis=1)
        # Alternative: keep only the raw columns
        # crypto_min[asset_symbols_map[i]] = pdData[pdData["Asset_ID"]==i].sort_values(by=['timestamp']).set_index(['timestamp'])
        # crypto_min[asset_symbols_map[i]] = crypto_min[asset_symbols_map[i]][['Count','Open','High','Low','Close','Volume','VWAP','Target']]
    arrays = list()
    for i in asset_symbols_map:
        arrays.append(crypto_min[asset_symbols_map[i]].to_numpy())
    x = np.array(arrays)
    # Features: everything except the Target column
    x_feature = x[:, :, :-1]
    # Target side: keep all columns so the block can be inverse-scaled later
    y_target = x[:, :, :]
    # Stack all 14 assets side by side along the column axis
    x_feature = np.concatenate(x_feature, axis=1)
    y_target = np.concatenate(y_target, axis=1)
    return x_feature, y_target
```
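For bookkeeping: after `get_features`, each asset carries 14 feature columns (7 raw columns once `Asset_ID` is dropped, plus the 7 engineered ones) and a `Target`, so stacking all 14 assets side by side yields 14 × 14 = 196 feature columns and 14 × 15 = 210 target-side columns, matching the shapes printed in the preprocessing step below. A sketch of the check, runnable once that step has produced `crypto_min`:
```python
# Sanity-check the stacked layout: 14 assets, 14 features (+1 Target) each.
x_chk, y_chk = make_training_data(crypto_min)
assert x_chk.shape[1] == 14 * 14  # 196 feature columns
assert y_chk.shape[1] == 14 * 15  # 210 columns including Target
```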
## time_series
- Build sliding-window time-series samples
```python
def time_series(x, y, n_steps):
    # Slice the data into sliding windows of length n_steps
    batch_size = x.shape[0] - n_steps + 1
    x_dim = x.shape[1]
    y_dim = y.shape[1]
    x_ = np.zeros((batch_size, n_steps, x_dim))
    y_ = np.zeros((batch_size, y_dim))
    for j in range(batch_size):
        x_[j] = x[j:j+n_steps, :]
        y_[j] = y[j+n_steps-1, :]
    return x_, y_, x_dim, y_dim
```
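A small illustration of the sliding-window layout: with `n_steps = 3`, each sample holds three consecutive rows of `x`, and its label is the `y` row aligned with the window's last step:
```python
# Toy data: 5 timesteps, 2 x-features, 1 y-column.
x_demo = np.arange(10, dtype=float).reshape(5, 2)
y_demo = np.arange(5, dtype=float).reshape(5, 1)
x_w, y_w, xd, yd = time_series(x_demo, y_demo, n_steps=3)
print(x_w.shape, y_w.shape)  # (3, 3, 2) (3, 1)
print(y_w.ravel())           # [2. 3. 4.] -> the label at each window's last row
```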
# Data Preprocessing & Model Training
## Training Data Range
```python
# Select the time range
# Auxiliary function: from date string to timestamp
totimestamp = lambda s: np.int32(time.mktime(datetime.strptime(s, "%d/%m/%Y").timetuple()))
# Start and end times
start_time = totimestamp('01/06/2021')
end_time = 1632181440  # totimestamp('01/09/2021')
# Time-series window length
n_steps = 10
```
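One caveat: `time.mktime` interprets the parsed date in the machine's local timezone, so `start_time` shifts with the environment (on a UTC machine, `totimestamp('01/06/2021')` is 1622505600). A timezone-independent variant, for reference:
```python
# Timezone-independent alternative: pin the parsed date to UTC.
totimestamp_utc = lambda s: int(pd.Timestamp(datetime.strptime(s, "%d/%m/%Y"), tz='UTC').timestamp())
```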
## Data Preprocessing
- Fill missing values
- Generate additional features
- Build the training data
- Standardize
- Build the time-series format
```python
stime = time.time()
# Fill missing values
crypto_min = fill_missing_value(pdData=crypto_df.set_index(['timestamp']).loc[start_time:].reset_index(), crypto_map=asset_symbols_map, showinfo=False)
# Generate additional features
crypto_min = get_features(crypto_min)
# Build the training data
x_train, y_train = make_training_data(crypto_min)
print(x_train.shape, y_train.shape)
# Standardize
## fit
x_scaler = StandardScaler().fit(x_train)
y_scaler = StandardScaler().fit(y_train)
## transform
x_train = x_scaler.transform(x_train)
y_train = y_scaler.transform(y_train)
# Build the time-series format
x_train, y_train, x_dim, y_dim = time_series(x_train, y_train, n_steps)
print(x_train.shape, y_train.shape)
print('Cost time:', time.time() - stime)
```
(161280, 196) (161280, 210)
(161271, 10, 196) (161271, 210)
Cost time: 8.154935598373413
## Build the Model
- Sequential model
1. LSTM `600`
2. dropout `0.2`
3. LSTM `600`
4. dropout `0.2`
5. Dense
- EarlyStopping
- Loss: `mse`
- Optimizer: `adam`
- Epoch: `100`
- Validation_split: `0.01`
```python
stime = time.time()
model = keras.models.Sequential([
    keras.layers.LSTM(600, return_sequences=True,
                      input_shape=[None, x_dim]),
    keras.layers.Dropout(0.2),
    keras.layers.LSTM(600),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(y_dim)
])
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    mode='auto',
    restore_best_weights=True
)
model.compile(loss="mse", optimizer="adam")
history = model.fit(x_train, y_train, epochs=100, callbacks=[callback], validation_split=0.01)
print('Cost time:', time.time() - stime)
```
Epoch 1/100
4990/4990 [==============================] - 42s 8ms/step - loss: 0.2516 - val_loss: 0.3045
Epoch 2/100
4990/4990 [==============================] - 37s 7ms/step - loss: 0.2088 - val_loss: 0.2786
Epoch 3/100
4990/4990 [==============================] - 38s 8ms/step - loss: 0.1956 - val_loss: 0.2781
Epoch 4/100
4990/4990 [==============================] - 37s 7ms/step - loss: 0.1919 - val_loss: 0.2896
Epoch 5/100
4990/4990 [==============================] - 38s 8ms/step - loss: 0.1834 - val_loss: 0.3059
Cost time: 198.7042691707611
# Running the Test API
## Test Setup
- Set up the test environment
- Pull the last n_steps rows out of the training data
    - so incoming test rows can be appended and gap-filled
    - so the time-series windows for testing can be built
```python
# Set up the test environment
import gresearch_crypto
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
```
```python
# Keep only the last n_steps rows per asset
crypto_min_tail = fill_missing_value(crypto_min, asset_symbols_map, n_steps)
```
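After this call every asset should contribute exactly `n_steps` rows on the shared minute grid, which a quick check can confirm (assuming each asset has at least `n_steps` rows of history):
```python
# Each of the 14 assets keeps only its last n_steps rows.
print(len(crypto_min_tail))  # expected: 14 * n_steps = 140
print(crypto_min_tail['Asset_ID'].value_counts().unique())  # expected: [10]
```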
## Run the Test
- Each test batch is appended to the retained training tail and gap-filled again
```python
start_time = time.time()
end_time = time.time()
n_i = 1
for (test_dfs, pred_dfs) in iter_test:
    test_dfs = test_dfs.sort_values(by=['timestamp'])
    test_dfs['Target'] = 0
    row_id = test_dfs[['Asset_ID','row_id']].set_index('Asset_ID')['row_id'].to_dict()
    test_dfs = get_features(test_dfs)
    selecttime = int(crypto_min_tail.iloc[-1]['timestamp']) + 60
    test_dfs['timestamp'] = selecttime
    crypto_min_tail = pd.concat([crypto_min_tail, test_dfs.drop(['row_id'], axis=1)], ignore_index=True).drop_duplicates()
    crypto_min_tail = fill_missing_value(crypto_min_tail, asset_symbols_map, n_steps, selecttime)
    x_test, y_test = make_training_data(crypto_min_tail)
    x_test = x_scaler.transform(x_test)
    x_test, _, _, _ = time_series(x_test, y_test, n_steps)
    result = model.predict(x_test)
    res_tg = y_scaler.inverse_transform(result).reshape((14, -1))[:, -1]
    res_tg = res_tg.tolist()
    output = list()
    for i, j in enumerate(row_id):
        output.append(float('%0.8f' % (res_tg[j])))
    pred_dfs['Target'] = output
    env.predict(pred_dfs)
    print('Iter: ' + str(n_i) + ', cost time: ' + str(time.time() - end_time), end='\r')
    n_i += 1
    end_time = time.time()
print('\nEnd time: ', time.time() - start_time)
```
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Iter: 4, cost time: 0.10308551788330078
End time: 1.037280559539795
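For reference on the unpacking inside the loop: `model.predict` returns one row of 210 scaled columns; after `inverse_transform`, `reshape((14, -1))` recovers the 14 × 15 per-asset layout, and `[:, -1]` extracts each asset's `Target` column. A minimal sketch of that step, assuming `x_test` holds exactly one window:
```python
# One prediction row = 14 asset blocks of 15 columns each.
pred_row = y_scaler.inverse_transform(model.predict(x_test))  # shape (1, 210)
per_asset = pred_row.reshape((14, -1))                        # shape (14, 15)
targets = per_asset[:, -1]                                    # one Target per asset
```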
## Local API Simulator
```python
# import time
# import numpy as np
# import pandas as pd
# from numpy import dtype

# asset_details = pd.read_csv(data_folder+'asset_details.csv')
# id_2_weight = dict(zip(asset_details.Asset_ID, asset_details.Weight))

# dtypes = {'timestamp': np.int64, 'Asset_ID': np.int8,
#           'Count': np.int32, 'Open': np.float64,
#           'High': np.float64, 'Low': np.float64,
#           'Close': np.float64, 'Volume': np.float64,
#           'VWAP': np.float64, 'Target': np.float64}

# def datestring_to_timestamp(ts):
#     return int(pd.Timestamp(ts).timestamp())

# def read_csv_slice(file_path=data_folder+'train.csv', dtypes=dtypes, use_window=None):
#     df = pd.read_csv(file_path, dtype=dtypes)
#     if use_window is not None:
#         df = df[(df.timestamp >= use_window[0]) & (df.timestamp < use_window[1])]
#     return df

# def weighted_correlation(a, b, weights):
#     w = np.ravel(weights)
#     a = np.ravel(a)
#     b = np.ravel(b)
#     sum_w = np.sum(w)
#     mean_a = np.sum(a * w) / sum_w
#     mean_b = np.sum(b * w) / sum_w
#     var_a = np.sum(w * np.square(a - mean_a)) / sum_w
#     var_b = np.sum(w * np.square(b - mean_b)) / sum_w
#     cov = np.sum((a * b * w)) / np.sum(w) - mean_a * mean_b
#     corr = cov / np.sqrt(var_a * var_b)
#     return corr

# # start = datestring_to_timestamp('2021-06-13-00-00')
# # end = datestring_to_timestamp('2021-09-22-01-00')
# start = 1623542400
# end = 1623542620
# file_path = data_folder+'supplemental_train.csv'
# train_df = read_csv_slice(file_path, use_window=[start, end])

# class API:
#     def __init__(self, df):
#         df = df.astype(dtypes)
#         df['row_id'] = df.index
#         dfg = df.groupby('timestamp')
#         self.data_iter = dfg.__iter__()
#         self.init_num_times = len(dfg)
#         self.next_calls = 0
#         self.pred_calls = 0
#         self.predictions = []
#         self.targets = []
#         print("This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set. ;)")
#     def __iter__(self):
#         return self
#     def __len__(self):
#         return self.init_num_times - self.next_calls
#     def __next__(self):
#         assert self.pred_calls == self.next_calls, "You must call `predict()` successfully before you can get the next batch of data."
#         timestamp, df = next(self.data_iter)
#         self.next_calls += 1
#         data_df = df.drop(columns=['Target'])
#         true_df = df.drop(columns=['timestamp','Count','Open','High','Low','Close','Volume','VWAP'])
#         true_df = true_df[['row_id', 'Target', 'Asset_ID']]
#         self.targets.append(true_df)
#         pred_df = true_df.drop(columns=['Asset_ID'])
#         pred_df['Target'] = 0.
#         return data_df, pred_df
#     def predict(self, pred_df):
#         assert self.pred_calls == self.next_calls - 1, "You must get the next batch of data from the API before making a new prediction."
#         assert pred_df.columns.to_list() == ['row_id', 'Target'], "Prediction dataframe should have columns `row_id` and `Target`."
#         pred_df = pred_df.astype({'row_id': dtype('int64'), 'Target': dtype('float64')})
#         self.predictions.append(pred_df)
#         self.pred_calls += 1
#     def score(self, id_2_weight=id_2_weight):
#         pred_df = pd.concat(self.predictions).rename(columns={'Target':'Prediction'})
#         true_df = pd.concat(self.targets)
#         scoring_df = pd.merge(true_df, pred_df, on='row_id', how='left')
#         scoring_df['Weight'] = scoring_df.Asset_ID.map(id_2_weight)
#         scoring_df = scoring_df[scoring_df.Target.isna()==False]
#         if scoring_df.Prediction.var(ddof=0) < 1e-10:
#             score = -1
#         else:
#             score = weighted_correlation(scoring_df.Prediction, scoring_df.Target, scoring_df.Weight)
#         return scoring_df, score
```
```python
# # Keep only the last n_steps rows per asset
# crypto_min_tail = fill_missing_value(crypto_min, asset_symbols_map, n_steps)
```
```python
# api = API(train_df)
# print()
# start_time = time.time()
# end_time = time.time()
# n_i = 1
# for (test_dfs, pred_dfs) in api:
#     test_dfs = test_dfs.sort_values(by=['timestamp'])
#     test_dfs['Target'] = 0
#     row_id = test_dfs[['Asset_ID','row_id']].set_index('Asset_ID')['row_id'].to_dict()
#     test_dfs = get_features(test_dfs)
#     selecttime = int(crypto_min_tail.iloc[-1]['timestamp']) + 60
#     test_dfs['timestamp'] = selecttime
#     crypto_min_tail = pd.concat([crypto_min_tail, test_dfs.drop(['row_id'], axis=1)], ignore_index=True).drop_duplicates()
#     crypto_min_tail = fill_missing_value(crypto_min_tail, asset_symbols_map, n_steps, selecttime)
#     x_test, y_test = make_training_data(crypto_min_tail)
#     # Standardize
#     ## transform
#     x_test = x_scaler.transform(x_test)
#     y_test = y_scaler.transform(y_test)
#     x_test, y_test, _, _ = time_series(x_test, y_test, n_steps)
#     result = model.predict(x_test)
#     res_tg = y_scaler.inverse_transform(result).reshape((14, -1))[:, -1]
#     res_tg = res_tg.tolist()
#     output = list()
#     for i, j in enumerate(row_id):
#         output.append(float('%0.8f' % (res_tg[j])))
#     pred_dfs['Target'] = output
#     api.predict(pred_dfs)
#     print('Iter: ' + str(n_i) + ', cost time: ' + str(time.time() - end_time), end='\r')
#     n_i += 1
#     end_time = time.time()
# df, score = api.score()
# print()
# print(f"Your LB score is {round(score, 4)}")
```