---
title: 'R語言預測股價之LSTM範例'
disqus: hackmd
---
R語言預測股價之LSTM範例
===
[TOC]
## 流程圖
```flow
st=>start: 股價資料
op=>parallel: 切割訓練集、驗證集、測試集
para=>parallel: parallel tasks
op2=>operation: 轉換為LSTM陣列
op3=>operation: 模型訓練
io_traindata=>inputoutput: 訓練、驗證集
io_testdata=>inputoutput: 測試集
io_model=>inputoutput: Keras Model
op_predict=>operation: 模型預測
e=>end
st->op
op(path1)->io_traindata->op2->op3->io_model->op_predict
op(path2, right, botto)->io_testdata->op_predict
```
## 載入所需套件
``` R=
rm(list=ls());gc()
library(keras) # 深度學習Keras套件
library(tidyquant) # 抓取股價資料套件
library(dplyr) # 資料篩選套件
library(TTR) # 計算股票技術指標套件
library(caret) # 資料預處理套件
library(parallel) # 平行運算套件
library(abind) # list轉換成data.frame所需套件
library(tidyr) # 轉換長寬資料套件
library(ggplot2) # 資料視覺化套件
```
## 抓取股價資料
首先抓取Yahoo Finance的股價資料,此處以台積電為例
``` R=
download.stockCode <- c("2330.TW")
stockData <- tq_get(x = download.stockCode, get = "stock.prices", from = "2008-01-01")
summary(stockData)
```
## 模型建立
### 定義訓練集、驗證集、測試集時間範圍
預測股票報酬/股價切割時間相當重要,本範例將使用移動窗口(Rolling Window),
於每年中的1月1日與7月1日進行模型訓練。
訓練樣本時間長度為5年,
驗證樣本時間長度為6個月,
測試集樣本時間長度為6個月。
```mermaid
gantt
title Rolling Window
section 模型1
訓練期(5年) :a1, 2009-07-01, 2014-07-01
驗證期(半年) :a2, 2014-07-01, 2015-01-01
測試期(半年) :a3, 2015-01-01, 2015-06-30
section 模型2
訓練期(5年) :a1, 2010-01-01, 2015-01-01
驗證期(半年) :a2, 2015-01-01, 2015-06-30
測試期(半年) :a3, 2015-07-01, 2015-12-31
```
``` R=
# 訓練樣本時間長度
train_timeSpan <- lubridate::years(5)
# 驗證樣本時間長度
validation_timeSpan <- months(6)
# 模型重訓練日期
predictDateList <- lapply(paste0(c(2015:max(year(stockData$date)))) ,
function(x)paste0(x, c("0101", "0701"))) %>%
unlist() %>%
ymd() %>%
sort()
```
### 定義單一樣本時間長度
由於LSTM之單一樣本維度定義為$dim(timeSteps, featureNums)$,
從Yahoo Finance抓取之股價資料為日資料,
而LSTM Model單一樣本中列資料與列資料間隔時間只要是固定時間即可,
不必一定要間隔一日,也可以是二日、三日。
本例將以近5日的調整後盤價作為單一樣本
``` R=
# 樣本總時間長度
lookback <- 5
# 樣本列間時間長度
step <- 1
# 時間步長
timeStepNums <- floor(lookback / step)
```
建立資料轉換函數與模型建立函數
```R=
# convert data.frame to LSTM array
to_lstm_array <- function(start_ind, data.frame, lookback = lookback, timeStepNums = timeStepNums){
# 特徵數量
featureNums <- ncol(data.frame)
x <- data.frame[seq(from = (start_ind-lookback), to = start_ind, length.out = timeStepNums),] %>%
as.matrix() %>%
array(dim=c(1, timeStepNums , featureNums))
return(x)
}
# 模型建立函數
build_lstm_model <- function(timeStepNums = timeStepNums, featureNums = featureNums){
model <- keras_model_sequential()
model %>%
layer_lstm(units=10, input_shape=c(timeStepNums, featureNums), return_sequences=T) %>%
layer_dropout(0.5) %>%
layer_lstm(units=32, input_shape=c(timeStepNums, featureNums), return_sequences=F) %>%
layer_dropout(0.5) %>%
layer_dense(units=10, activation="relu") %>%
layer_dense(units=1, activation="relu")
model %>% compile(
loss="mse",
optimizer="adam",
metrics="mse"
)
return(model)
}
```
## 資料預處理
在預處理上主要分為兩項步驟
1.建立特徵、標籤
2.資料正規化(normalization)
### 1.建立模型特徵、預測目標
``` R=
stockData <- stockData %>%
na.omit() %>%
arrange(date) %>%
select(date, close = adjusted) %>%
mutate(Y_Label = lead(close, 1),
Y_outDate = lead(date, 1)) %>%
na.omit()
```
為簡化特徵,本範例單純以調整後收盤價作為特徵預測隔日調整後收盤價,
並且記錄標籤日期預防使用未來資料。
### 2.資料正規化(normalization)
本文使用Caret套件中的preProcess函數,針對訓練集進行標準化,
並使用訓練集建立之模型預測將驗證集、測試集進行正規化。
另外建立逆正規化函數,以便將預測值還原。
```R=
# 逆向標準化函數
unPreProc <- function(preProc, data){
stopifnot(class(preProc) == "preProcess")
stopifnot("data.frame" %in% class(data))
for(i in names(preProc$mean)){
tmp <- data[, i] * preProc$std[[i]] + preProc$mean[[i]]
data[, i] <- tmp
}
return(data)
}
```
## 模型訓練
每半年重新訓練模型
``` R=
# 依日期進行模型訓練
dailyList <- stockData$date %>%
unique() %>%
sort()
allInfo <- tibble()
ind <- 1
for(ind in 1:length(predictDateList)){
cat(paste0("目前預測日期: ", predictDateList[ind], ", 目前進度: ", ind, " / ", length(predictDateList) ) , "\n")
# 測試期期間
testData.StartDate <- dailyList[max(which(dailyList <= predictDateList[ind])) - lookback]
testData.endDate <- as.Date(ifelse(ind != length(predictDateList), predictDateList[ind+1], max(stockData$date)))
# 驗證期期間
validData.endDate <- predictDateList[ind]
validData.StartDate <- dailyList[max(which(dailyList <= validData.endDate - validation_timeSpan)) - lookback]
# 訓練期期間
trainData.endDate <- predictDateList[ind] - validation_timeSpan
trainData.StartDate <- dailyList[max(which(dailyList <= trainData.endDate - train_timeSpan)) - lookback]
# 切割資料
testInfo <- stockData %>%
filter(date >= testData.StartDate, date < testData.endDate) %>%
select(c(date, Y_outDate, Y_Label))
X_test <- stockData %>%
filter(date >= testData.StartDate, date < testData.endDate) %>%
select(-c(date, Y_outDate, Y_Label))
Y_test <- stockData %>%
filter(date >= testData.StartDate, date < testData.endDate) %>%
select(Y_Label)
X_val <- stockData %>%
filter(date >= validData.StartDate, Y_outDate < validData.endDate) %>%
select(-c(date, Y_outDate, Y_Label))
Y_val <- stockData %>%
filter(date >= validData.StartDate, Y_outDate < validData.endDate) %>%
select(Y_Label)
X_train <- stockData %>%
filter(date >= trainData.StartDate, Y_outDate < trainData.endDate) %>%
select(-c(date, Y_outDate, Y_Label))
Y_train <- stockData %>%
filter(date >= trainData.StartDate, Y_outDate < trainData.endDate) %>%
select(Y_Label)
# 將資料特徵進行標準化
# 使用訓練集樣本建立各特徵標準化模型
# center: x - mean(x)
# scale: x / std(x)
preProcValues <- preProcess(X_train, method = c("center", "scale"))
X_train <- predict(preProcValues, X_train)
X_val <- predict(preProcValues, X_val)
X_test <- predict(preProcValues, X_test)
# unPreProc(preProcValues, X_train)
preProcY <- preProcess(Y_train, method = c("center", "scale"))
Y_train <- predict(preProcY, Y_train)
Y_val <- predict(preProcY, Y_val)
# 將資料結構轉換為LSTM陣列
# 使用CPU執行緒數目
cut_Cluster<- detectCores()-6
cl <- makeCluster(cut_Cluster)
# 匯入使用變數給每個執行緒
# clusterExport(cl, c("X_train","Y_train"))
# 匯入套件給每個執行緒
clusterEvalQ(cl, c(library(dplyr)))
# 執行平行運算
output_train <- parLapply(cl, (lookback+1):nrow(X_train), to_lstm_array, X_train, lookback, timeStepNums)
output_val <- parLapply(cl, (lookback+1):nrow(X_val), to_lstm_array, X_val, lookback, timeStepNums)
output_test <- parLapply(cl, (lookback+1):nrow(X_test), to_lstm_array, X_test, lookback, timeStepNums)
# 關閉平行運算
stopCluster(cl)
# 合併樣本集
X_train <- do.call("abind", c(output_train, along=1))
Y_train <- Y_train[(lookback+1):nrow(Y_train),] %>%
as.matrix()
X_val <- do.call("abind", c(output_val, along=1))
Y_val <- Y_val[(lookback+1):nrow(Y_val),] %>%
as.matrix()
X_test <- do.call("abind", c(output_test, along=1))
Y_test <- Y_test[(lookback+1):nrow(Y_test),] %>%
as.matrix()
testInfo <- testInfo[(lookback+1):nrow(testInfo),]
# 建立模型
# 樣本維度
cat("樣本集資料:", dim(X_train), "\n")
model <- build_lstm_model(timeStepNums = dim(X_train)[2], featureNums = dim(X_train)[3])
# 檢查模型
summary(model)
# 開始訓練
batch_size <- 50
epochs <- 50
patience <- 20
trainHistory <- model %>% fit(
x=X_train,
y=Y_train,
validation_data = list(X_val, Y_val),
batch_size=batch_size,
epochs=epochs,
shuffle=T,
callbacks = c(callback_early_stopping(monitor = "val_loss", patience = patience, mode = "auto"))
)
# final epoch
final_epoch <- trainHistory$metrics$loss %>%
length() %>%
max()
final_epoch <- ifelse(final_epoch != epochs, epochs - patience, epochs)
# 合併訓練集與驗證集,使用最佳epoch訓練模型
model <- build_lstm_model(timeStepNums = dim(X_train)[2], featureNums = dim(X_train)[3])
# 開始訓練
trainHistory <- model %>% fit(
x = abind(X_train, X_val , along = 1),
y = abind(Y_train, Y_val , along = 1),
validation_data = list(X_val, Y_val),
batch_size = batch_size,
epochs = final_epoch,
shuffle = T
)
# 儲存模型
model_fileName <- paste0("stockPrediction_LSTM_", predictDateList[ind], ".h5")
model %>% save_model_hdf5(model_fileName)
# 預測測試期資料
Y_predict <- model %>%
predict(X_test) %>%
as.data.frame() %>%
rename_at(vars(colnames(.)), ~ colnames(Y_train))
# 還原Y預測值
Y_predict <- unPreProc(preProcY, Y_predict)
# 合併測試期資料與預測值
testInfo <- testInfo %>%
bind_cols(Y_predict)
# 儲存測試期資料
allInfo <- allInfo %>%
bind_rows(testInfo)
}
```
## 資料視覺化
``` R=
# 資料視覺化
plotData = allInfo %>%
rename(Y_predict = Y_Label1,
Y_real = Y_Label) %>%
gather(category, adjclose, Y_real, Y_predict)
plotData$category = factor(plotData$category , levels = c("Y_real" , "Y_predict"))
ggplot(data = plotData ,
aes(x = Y_outDate, y = round(adjclose, 2) , color= category))+
geom_line(aes(color = category ), size = 1.2)+
labs(title = paste0("調整後收盤價實際與預測走勢圖")) +
ylab("調整後收盤價")+
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, size = 16),
plot.subtitle = element_text(hjust = 0.5, size = 16),
text = element_text(family = "BL", size = 14),
legend.position="bottom")
```
###### tags: `R` `LSTM` `Keras` `Stock_Prediction`